Merge remote-tracking branch 'upstream/18.2' into master

Change-Id: Ib3754c06dcf97605e48fc886bba6a5f46b47eb3d
diff --git a/.travis.yml b/.travis.yml
index e0d6a82..4df406b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -18,11 +18,13 @@
     - LIBPCIACCESS_VERSION=libpciaccess-0.13.4
     - LIBDRM_VERSION=libdrm-2.4.74
     - XCBPROTO_VERSION=xcb-proto-1.13
+    - RANDRPROTO_VERSION=randrproto-1.3.0
+    - LIBXRANDR_VERSION=libXrandr-1.3.0
     - LIBXCB_VERSION=libxcb-1.13
     - LIBXSHMFENCE_VERSION=libxshmfence-1.2
     - LIBVDPAU_VERSION=libvdpau-1.1
     - LIBVA_VERSION=libva-1.7.0
-    - LIBWAYLAND_VERSION=wayland-1.11.1
+    - LIBWAYLAND_VERSION=wayland-1.15.0
     - WAYLAND_PROTOCOLS_VERSION=wayland-protocols-1.8
     - PKG_CONFIG_PATH=$HOME/prefix/lib/pkgconfig:$HOME/prefix/share/pkgconfig
     - LD_LIBRARY_PATH="$HOME/prefix/lib:$LD_LIBRARY_PATH"
@@ -33,27 +35,28 @@
     - env:
         - LABEL="meson Vulkan"
         - BUILD=meson
-        - MESON_OPTIONS="-Ddri-drivers= -Dgallium-drivers="
-        - LLVM_VERSION=4.0
+        - MESON_OPTIONS="-Ddri-drivers=[] -Dgallium-drivers=[]"
+        - LLVM_VERSION=5.0
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
       addons:
         apt:
           sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
           packages:
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
             # Common
             - xz-utils
             - libexpat1-dev
+            - libx11-xcb-dev
             - libelf-dev
             - python3-pip
     - env:
         - LABEL="meson loaders/classic DRI"
         - BUILD=meson
-        - MESON_OPTIONS="-Dvulkan-drivers= -Dgallium-drivers="
+        - MESON_OPTIONS="-Dvulkan-drivers=[] -Dgallium-drivers=[]"
       addons:
         apt:
           packages:
@@ -92,7 +95,7 @@
         - BUILD=make
         - MAKEFLAGS="-j4"
         - MAKE_CHECK_COMMAND="true"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         - OVERRIDE_CC="gcc-4.8"
         - OVERRIDE_CXX="g++-4.8"
@@ -105,15 +108,14 @@
       addons:
         apt:
           sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
           packages:
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -123,7 +125,7 @@
         - BUILD=make
         - MAKEFLAGS="-j4"
         - MAKE_CHECK_COMMAND="true"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
@@ -134,15 +136,14 @@
       addons:
         apt:
           sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
           packages:
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -159,7 +160,7 @@
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
         - GALLIUM_ST="--enable-dri --disable-opencl --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
-        - GALLIUM_DRIVERS="i915,nouveau,pl111,r300,r600,freedreno,svga,swrast,vc4,virgl,etnaviv,imx"
+        - GALLIUM_DRIVERS="i915,nouveau,pl111,r300,r600,freedreno,svga,swrast,v3d,vc4,virgl,etnaviv,imx"
         - VULKAN_DRIVERS=""
         - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
@@ -174,7 +175,6 @@
             - llvm-3.9-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -213,7 +213,6 @@
             - libclang-3.9-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -231,7 +230,7 @@
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
         - DRI_DRIVERS=""
         - GALLIUM_ST="--disable-dri --enable-opencl --enable-opencl-icd --enable-llvm --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
-        - GALLIUM_DRIVERS="r600,radeonsi"
+        - GALLIUM_DRIVERS="r600"
         - VULKAN_DRIVERS=""
         - LIBUNWIND_FLAGS="--enable-libunwind"
       addons:
@@ -249,7 +248,6 @@
             - libclang-4.0-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -285,7 +283,38 @@
             - libclang-5.0-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
+            - libexpat1-dev
+            - libx11-xcb-dev
+            - libelf-dev
+            - libunwind8-dev
+    - env:
+        # NOTE: Analogous to SWR above, building Clover is quite slow.
+        - LABEL="make Gallium ST Clover LLVM-6.0"
+        - BUILD=make
+        - MAKEFLAGS="-j4"
+        - MAKE_CHECK_COMMAND="true"
+        - LLVM_VERSION=6.0
+        - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
+        - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl"
+        - DRI_DRIVERS=""
+        - GALLIUM_ST="--disable-dri --enable-opencl --enable-opencl-icd --enable-llvm --disable-xa --disable-nine --disable-xvmc --disable-vdpau --disable-va --disable-omx-bellagio --disable-gallium-osmesa"
+        - GALLIUM_DRIVERS="r600,radeonsi"
+        - VULKAN_DRIVERS=""
+        - LIBUNWIND_FLAGS="--enable-libunwind"
+      addons:
+        apt:
+          sources:
+            - llvm-toolchain-trusty-6.0
+            # llvm-6 depends on gcc-4.9 which is not in main repo
+            - ubuntu-toolchain-r-test
+          packages:
+            - libclc-dev
+            # From sources above
+            - llvm-6.0-dev
+            - clang-6.0
+            - libclang-6.0-dev
+            # Common
+            - xz-utils
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -321,7 +350,6 @@
             - libedit-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -331,7 +359,7 @@
         - BUILD=make
         - MAKEFLAGS="-j4"
         - MAKE_CHECK_COMMAND="make -C src/gtest check && make -C src/intel check"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         - DRI_LOADERS="--disable-glx --disable-gbm --disable-egl --with-platforms=x11,wayland"
         - DRI_DRIVERS=""
@@ -342,15 +370,14 @@
       addons:
         apt:
           sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
           packages:
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
             # Common
             - xz-utils
-            - x11proto-xf86vidmode-dev
             - libexpat1-dev
             - libx11-xcb-dev
             - libelf-dev
@@ -365,7 +392,6 @@
       addons:
         apt:
           packages:
-            - scons
             # Common
             - xz-utils
             - x11proto-xf86vidmode-dev
@@ -384,7 +410,6 @@
       addons:
         apt:
           packages:
-            - scons
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             - llvm-3.3-dev
@@ -399,7 +424,7 @@
         - BUILD=scons
         - SCONSFLAGS="-j4"
         - SCONS_TARGET="swr=1"
-        - LLVM_VERSION=4.0
+        - LLVM_VERSION=5.0
         - LLVM_CONFIG="llvm-config-${LLVM_VERSION}"
         # Keep it symmetrical to the make build. There's no actual SWR, yet.
         - SCONS_CHECK_COMMAND="true"
@@ -408,13 +433,12 @@
       addons:
         apt:
           sources:
-            - llvm-toolchain-trusty-4.0
+            - llvm-toolchain-trusty-5.0
           packages:
-            - scons
             # LLVM packaging is broken and misses these dependencies
             - libedit-dev
             # From sources above
-            - llvm-4.0-dev
+            - llvm-5.0-dev
             # Common
             - xz-utils
             - x11proto-xf86vidmode-dev
@@ -467,6 +491,11 @@
       pip3 install --user "meson<0.45.0";
     fi
 
+  # Install a more modern scons from pip.
+  - if test "x$BUILD" = xscons; then
+      pip2 install --user "scons>=2.4";
+    fi
+
   # Since libdrm gets updated in configure.ac regularly, try to pick up the
   # latest version from there.
   - for line in `grep "^LIBDRM.*_REQUIRED=" configure.ac`; do
@@ -510,6 +539,14 @@
       tar -jxvf $LIBDRM_VERSION.tar.bz2
       (cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix --enable-vc4 --enable-freedreno --enable-etnaviv-experimental-api && make install)
 
+      wget $XORG_RELEASES/proto/$RANDRPROTO_VERSION.tar.bz2
+      tar -jxvf $RANDRPROTO_VERSION.tar.bz2
+      (cd $RANDRPROTO_VERSION && ./configure --prefix=$HOME/prefix && make install)
+
+      wget $XORG_RELEASES/lib/$LIBXRANDR_VERSION.tar.bz2
+      tar -jxvf $LIBXRANDR_VERSION.tar.bz2
+      (cd $LIBXRANDR_VERSION && ./configure --prefix=$HOME/prefix && make install)
+
       wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2
       tar -jxvf $LIBXSHMFENCE_VERSION.tar.bz2
       (cd $LIBXSHMFENCE_VERSION && ./configure --prefix=$HOME/prefix && make install)
@@ -541,13 +578,34 @@
            "#ifndef _LINUX_MEMFD_H" \
            "#define _LINUX_MEMFD_H" \
            "" \
-           "#define __NR_memfd_create 319" \
-           "#define SYS_memfd_create __NR_memfd_create" \
-           "" \
            "#define MFD_CLOEXEC             0x0001U" \
            "#define MFD_ALLOW_SEALING       0x0002U" \
            "" \
            "#endif /* _LINUX_MEMFD_H */" > linux/memfd.h
+
+      # Generate this header, including the missing SYS_memfd_create
+      # macro, which is not provided by the header in the Travis
+      # instance
+      mkdir -p sys
+      printf "%s\n" \
+           "#ifndef _SYSCALL_H" \
+           "#define _SYSCALL_H      1" \
+           "" \
+           "#include <asm/unistd.h>" \
+           "" \
+           "#ifndef _LIBC" \
+           "# include <bits/syscall.h>" \
+           "#endif" \
+           "" \
+           "#ifndef __NR_memfd_create" \
+           "# define __NR_memfd_create 319 /* Taken from <asm/unistd_64.h> */" \
+           "#endif" \
+           "" \
+           "#ifndef SYS_memfd_create" \
+           "# define SYS_memfd_create __NR_memfd_create" \
+           "#endif" \
+           "" \
+           "#endif" > sys/syscall.h
     fi
 
 script:
@@ -558,7 +616,9 @@
 
       export CFLAGS="$CFLAGS -isystem`pwd`";
 
-      ./autogen.sh --enable-debug
+      mkdir build &&
+      cd build &&
+      ../autogen.sh --enable-debug
         $LIBUNWIND_FLAGS
         $DRI_LOADERS
         --with-dri-drivers=$DRI_DRIVERS
diff --git a/Android.common.mk b/Android.common.mk
index 97d5728..397dc03 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -74,6 +74,7 @@
 	-DHAVE_ENDIAN_H \
 	-DHAVE_ZLIB \
 	-DMAJOR_IN_SYSMACROS \
+	-DVK_USE_PLATFORM_ANDROID_KHR \
 	-fvisibility=hidden \
 	-Wno-sign-compare
 
diff --git a/Makefile.am b/Makefile.am
index 86d7e7f..9e27db0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -77,6 +77,7 @@
 	include/drm-uapi/drm_mode.h \
 	include/drm-uapi/i915_drm.h \
 	include/drm-uapi/tegra_drm.h \
+	include/drm-uapi/v3d_drm.h \
 	include/drm-uapi/vc4_drm.h \
 	include/D3D9 \
 	include/GL/wglext.h \
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..a91e308
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,79 @@
+`Mesa <https://mesa3d.org>`_ - The 3D Graphics Library
+======================================================
+
+
+Source
+------
+
+This repository lives at https://gitlab.freedesktop.org/mesa/mesa.
+Other repositories are likely forks, and code found there is not supported.
+
+
+Build status
+------------
+
+Travis:
+
+.. image:: https://travis-ci.org/mesa3d/mesa.svg?branch=master
+    :target: https://travis-ci.org/mesa3d/mesa
+
+Appveyor:
+
+.. image:: https://img.shields.io/appveyor/ci/mesa3d/mesa.svg
+    :target: https://ci.appveyor.com/project/mesa3d/mesa
+
+Coverity:
+
+.. image:: https://scan.coverity.com/projects/139/badge.svg?flat=1
+    :target: https://scan.coverity.com/projects/mesa
+
+
+Build & install
+---------------
+
+You can find more information in our documentation (`docs/install.html
+<https://mesa3d.org/install.html>`_), but the recommended way is to use
+Meson (`docs/meson.html <https://mesa3d.org/meson.html>`_):
+
+.. code-block:: sh
+
+  $ mkdir build
+  $ cd build
+  $ meson ..
+  $ sudo ninja install
+
+
+Support
+-------
+
+Many Mesa devs hang out on IRC; if you're not sure which channel is
+appropriate, you should ask your question on `Freenode's #dri-devel
+<irc://chat.freenode.net#dri-devel>`_, someone will redirect you if
+necessary.
+Remember that not everyone is in the same timezone as you, so it might
+take a while before someone qualified sees your question.
+To figure out who you're talking to, or which nick to ping for your
+question, check out `Who's Who on IRC
+<https://dri.freedesktop.org/wiki/WhosWho/>`_.
+
+The next best option is to ask your question in an email to the
+mailing lists: `mesa-dev\@lists.freedesktop.org
+<https://lists.freedesktop.org/mailman/listinfo/mesa-dev>`_
+
+
+Bug reports
+-----------
+
+If you think something isn't working properly, please file a bug report
+(`docs/bugs.html <https://mesa3d.org/bugs.html>`_).
+
+
+Contributing
+------------
+
+Contributions are welcome, and step-by-step instructions can be found in our
+documentation (`docs/submittingpatches.html
+<https://mesa3d.org/submittingpatches.html>`_).
+
+Note that Mesa uses email mailing-lists for patch submission, review and
+discussion.
diff --git a/REVIEWERS b/REVIEWERS
index 638b8b7..beece0a 100644
--- a/REVIEWERS
+++ b/REVIEWERS
@@ -116,6 +116,7 @@
 R: Dylan Baker <dylan@pnwbakers.com>
 R: Eric Engestrom <eric@engestrom.ch>
 F: */meson.build
+F: meson.build
 F: meson_options.txt
 
 ANDROID EGL SUPPORT
diff --git a/SConstruct b/SConstruct
index a59a8ea..6e034fb 100644
--- a/SConstruct
+++ b/SConstruct
@@ -28,6 +28,12 @@
 import common
 
 #######################################################################
+# Minimal scons version
+
+EnsureSConsVersion(2, 4)
+
+
+#######################################################################
 # Configuration options
 
 opts = Variables('config.py')
diff --git a/VERSION b/VERSION
index 616b61c..9ad62d7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-18.1.9
+18.2.6
diff --git a/appveyor.yml b/appveyor.yml
index bd33e2e..86440f0 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -35,13 +35,13 @@
 
 cache:
 - win_flex_bison-2.5.9.zip
-- llvm-3.3.1-msvc2015-mtd.7z
+- llvm-5.0.1-msvc2015-mtd.7z
 
 os: Visual Studio 2015
 
 environment:
   WINFLEXBISON_ARCHIVE: win_flex_bison-2.5.9.zip
-  LLVM_ARCHIVE: llvm-3.3.1-msvc2015-mtd.7z
+  LLVM_ARCHIVE: llvm-5.0.1-msvc2015-mtd.7z
 
 install:
 # Check pip
diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore
index 6e098d1..fe5abdf 100644
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -1,129 +1,33 @@
-d89f58a6b8436b59dcf3b896c0ccddabed3f78fd
-a7d0c53ab89ca86b705014925214101f5bc4187f
-
-# These patches are ignored becuase Jason provided a version rebased on the 18.1
-# branch that was pulled instead
-#
-778e2881a04817e8c10c7a400bf1e37414420194
-3b54dd87f707a0fa40a1555bee64aeb06a381c27
-eeae4851494c16d2a6591550bfa6ef77d887ebe3
-a26693493570a9d0f0fba1be617e01ee7bfff4db
-0e7f3febf7e739c075a139ae641d65a0618752f3
-
-# This has a warning that it fixes more than one commit, but isn't needed in
-# 18.1
-#
-a1220e73116bad74f39c1792a0b0cf0e4e5031db
-
-# This doesn't apply and isn't necessary since
-# 1cc2e0cc6b47bd5efbf2af266405060785085e6b isn't in the 18.1 branch
-#
-587e712eda95c31d88ea9d20e59ad0ae59afef4f
-
-# This requires too many previous patch, and Marek (the author) decided to
-# to drop it from stable
-# 
-cac7ab1192eefdd8d8b3f25053fb006b5c330eb8
-
-# This patch is excluded since it requires additional patches to be pulled,
-# and is mainly aimed at developers, who rarely (if ever) work in the
-# stable branch
-#
-a2f5292c82ad07731d633b36a663e46adc181db9
-
-# This patch required manual backport, which was provided as
-# 3953467ee7851792c1d4b1c9435499545a7da9fc
-#
-4a67ce886a7b3def5f66c1aedf9e5436d157a03c
-
-# This patch required manual backport, which was provided as
-# 31677c5aa867e457cd06ae25150be2155e8da3c6
-#
-1f616a840eac02241c585d28e9dac8f19a297f39
-
-# Jason de-nominated this because it "a) shouldn't be needed and b) is horribly
-# broken"
-#
-11712b9ca17e4e1a819dcb7d020e19c6da77bc90
-
-# None of these are tagged for 18.1, they're for 18.2, but get-pick-list
-# still finds them for some reason
-#
-1c7a2433b270afb65f044d0cf49cb67715f50b5b
-0f79b2015bc0c44a8ed470684b6789f0e2e6aa6c
-ccbe33af5b086f4b488ac7ca8a8a45ebc9ac189c
-3f9cb2eb05152f4f0269e97893a16f23261f095b
-f2c0d310d6efe560de8192ab468ba02d50c9ac1e
-50a8713d4f90a6c70a23f9f5871420371df283a7
-1561e4984eb03d6946d19b820b83a96bbbd83b98
-66e12451ac4e4e1c05a48b2cd2b0d3186f779f20
-73b342c7a52a93d283799800824311639f372de0
-71d5b2fbf83061a1319141d26942771e8c75ff2b
-011a811652c74dcc9f56506ebb6075e4bdfe6ef9
-f3a78a9da01218df0067b24b52204a4e5f01bc69
-f9e8456c39136aa41f85f82758a00e5aa2aab334
-0aacb5eab6120aa1410966d23101e16eea3fbcd7
-a4a104fc81e93555899050efac23c3cd6ba762ab
-24ee53231da84a1be5ec08abebe8a2ff6aa019ca
-4c43ec461de4f122d5d6566361d064c816e4ef69
-743e11c10b180247488ae0cc24900560e0a74e2b
-4ffb575da59fd3aece02734ca4fd3212d5002d55
-8c048af5890d43578ca41eb9dcfa60cb9cc3fc9c
-c92a463d2341dd7893dd8b54775930ed9be72ac0
-ea1e50cc166ae855f9fa91ca6a4f944123298e4e
-f73f748323ef5a421ffd8fa0f02afd9627e31023
-d4e52281aa9c1acc92619736da8b67d8c02ce380
-a5f35aa742c3f1e2fae6a6c2fb53f92822f0cb70
-f6e09db2e613c215257b80f40957d580165b5ddf
-d4bf954fe61ec231be2bfa5e059f0fb7f6150bd1
-abdf396cbeaec2bfe9da2fd773d42fa3022ca8b5
-b9f6521157ab55073eec528cacc1f3b567e49503
-aa3020592964344c7032396d159e4ab2df743587
-063264db5be2941746fa58f164cdc803362753a9
-748f4cce183007587a6688ef25ad5f9dbea5c33c
-9de062ef207c6062d1fabb70209f4bbc9dc4732d
-7d1d1208c2b38890fe065b6431ef2e3b7166bae4
-0796c3934ebfe3448acf2d63f478f51c08e33046
-864c780566b8782c4fc69b4337db768223717bd8
-
-# These have more than one fixes tag and generate a warning
-#
-24839663a40257e0468406d72c48d431b5ae2bd4
-6ff1c479968819b93c46d24bd898e89ce14ac401
-ac0856ae4100a05dcd1fd932d9fd10200f8f7a7c
-c9f54486959716762e6818dabb0a73a8cd46df67
-
-# This patch requires patches that would require python 3, the thing they're
-# fixing isn't really even a bug, it's more of a style choice, so I'm not going
-# to pull them.
-#
-48820ed8da0ad50d51a58f26e156d82b685492e2
-
-# This patch doesn't apply and either needs to be backported or can just be
-# ignored
-#
-1e40f6948310be07abb2d0198e6602769892cdac
-
-# This patch isn't necessary, since the patch it fixes is not present in 18.1
-#
-a72dbc461bdb7714656e62cd8f4b00a404c2e6e0
-
-# This requires a much more significant patch not present in 18.1
-#
-4dc244eb447b1fa4e39d67a58328ed774395c901
-
-# This patches were dropped since they only fix developer tools, which aren't
-# built by default and should be of no use to end users or distros
-#
-97fcccb25ed5f55139c03ebc1c71742f0f25f683
-4aec44c0d9c4c0649c362199fac97efe0a3b38a4
-
-# This patch was reverted on master shortly after merging.
-#
-90819abb56f6b1a0cd4946b13b6caf24fb46e500
-
-# These were supreceeded by patches backported to 18.1
-#
-3341429d74099b436c3824164837eebd47029ded
-9158e0bd82ffdad4baf46221bccbbb3fe4764c11
-cc3b99bb48769ccd018b781338b548306af5046b
+# fixes:  This commit has more than one Fixes tag but the commit it
+#         addresses didn't land in branch.
+6ff1c479968819b93c46d24bd898e89ce14ac401 autotools: don't ship the git_sha1.h generated in git in the tarballs
+# pick:   This commit addresses a regression introduced by previous
+#         commit fa9e6c235da, which didn't make it for 18.2.
+a72dbc461bdb7714656e62cd8f4b00a404c2e6e0 mesa: allow GL_UNSIGNED_BYTE type for SNORM reads
+# fixes:  This commit has more than one Fixes tag but the commit it
+#         addresses didn't land in branch.
+c9f54486959716762e6818dabb0a73a8cd46df67 radeonsi: fix regression in indirect input swizzles.
+# extra:  Just some comments update.
+2ad9917e187c1e9dbb053d3c98aa0e39fa374059 anv/blorp: Fix a comment as per Nanley's review feedback
+# fixes:  This commit was immediately reverted by commit 2dce1175c1c.
+4aec44c0d9c4c0649c362199fac97efe0a3b38a4 i965/tools: 32bit compilation with meson
+# pick:   This commit was reverted by commit 95bb7d82ca8.
+90819abb56f6b1a0cd4946b13b6caf24fb46e500 radv: fix descriptor pool allocation size
+# pick:   There is a specific patch for stable branch for this commit.
+0d495bec25bd7584de4e988c2b4528c1996bc1d0 radeonsi: NaN should pass kill_if
+# pick:   This commit reverts 0fa9e6d7b30 which did not land in branch.
+aa02d7e8781c25ee18b6da97606300808c84973a Revert "anv/skylake: disable ForceThreadDispatchEnable"
+# pick:   Explicit 18.3 only nominations.
+b1b2dd06a7b777e862b525302b15bcaf407d3648 radv: add missing TFB queries support to CmdCopyQueryPoolsResults()
+e0c7114eb3c19d4c2653f661698a6baa3bc9bedf st/mesa: disable L3 thread pinning
+b5f213bb1dcde22949dffe9d3a431fecd5d0f33b radv: binding streamout buffers doesn't change context regs
+9367514524f70faad99c721bac92339c8ff8bad9 radeonsi: fix video APIs on Raven2
+ea9f95e2a67eca90bb84eea24e7b4b804b3b1345 radeonsi: go back to using bottom-of-pipe for beginning of TIME_ELAPSED
+# fixes:  This commit was reverted by commit 5f312e95f87.
+a9031bf9b55602d93cccef6c926e2179c23205b4 i965/batch: avoid reverting batch buffer if saved state is an empty
+# extra:  intel/aub_viewer is not present in branch
+ac324a6809c09c54d3b0bfdb00e5e62987ec4ad8 intel/aub_viewer: fix dynamic state printing
+0db898cef2f5a455138e5845689c075aadba1c1f intel/aub_viewer: Print blend states properly
+# fixes: This commit requires commits 854202f70e6 and 84bc5738401 which did not
+#        land in branch.
+c120dbfe4d18240315ecec9b43a61aeb9ab239ac mesa/main: fix incorrect depth-error
diff --git a/bin/bugzilla_mesa.sh b/bin/bugzilla_mesa.sh
index a8f5305..9095bc9 100755
--- a/bin/bugzilla_mesa.sh
+++ b/bin/bugzilla_mesa.sh
@@ -23,7 +23,7 @@
 echo ""
 
 # extract fdo urls from commit log
-git log $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after |\
+git log --pretty=medium $* | grep 'bugs.freedesktop.org/show_bug' | sed -e $trim_before | sort -n -u | sed -e $use_after |\
 while read url
 do
 	id=$(echo $url | cut -d'=' -f2)
diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh
index 9e9a39e..9f9cbc4 100755
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -7,21 +7,92 @@
 # $ bin/get-pick-list.sh
 # $ bin/get-pick-list.sh > picklist
 # $ bin/get-pick-list.sh | tee picklist
+#
+# The output is as follows:
+# [nomination_type] commit_sha commit summary
+
+is_stable_nomination()
+{
+	git show --summary "$1" | grep -q -i -o "CC:.*mesa-stable"
+}
+
+is_typod_nomination()
+{
+	git show --summary "$1" | grep -q -i -o "CC:.*mesa-dev"
+}
+
+# Helper to handle various typos of the fixes tag. $1 is the commit and $2 the
+# tag pattern (normalised to "fixes:" within); succeeds if a tagged sha is in branch.
+is_sha_nomination()
+{
+	fixes=`git show --pretty=medium -s $1 | tr -d "\n" | \
+		sed -e 's/'"$2"'/\nfixes:/Ig' | \
+		grep -Eo 'fixes:[a-f0-9]{8,40}'`
+
+	fixes_count=`echo "$fixes" | grep 'fixes:' | wc -l`
+	if test $fixes_count -eq 0; then
+		return 1
+	fi
+	while test $fixes_count -gt 0; do
+		# Treat only the current line
+		id=`echo "$fixes" | tail -n $fixes_count | head -n 1 | cut -d : -f 2`
+		fixes_count=$(($fixes_count-1))
+
+		# Skip the entry if no id could be extracted.
+		# Whether $id names a real commit (and not some junk) is
+		# validated implicitly by the lookups below.
+		if test "x$id" = x; then
+			continue
+		fi
+
+		# Check if the offending commit is in branch.
+
+		# Be that cherry-picked ...
+		# ... or landed before the branchpoint.
+		if grep -q ^$id already_picked ||
+		   grep -q ^$id already_landed ; then
+			return 0
+		fi
+	done
+	return 1
+}
+
+# Succeeds when the commit carries a fixes tag whose sha is in branch.
+# Tries the "fixes:" spelling (optional whitespace after the colon) first,
+# then the colon-less "fixes " spelling.
+is_fixes_nomination()
+{
+	is_sha_nomination "$1" "fixes:[[:space:]]*" && return 0
+	is_sha_nomination "$1" "fixes[[:space:]]\+"
+}
+
+is_brokenby_nomination()
+{
+	is_sha_nomination "$1" "broken by"
+}
+
+is_revert_nomination()
+{
+	is_sha_nomination "$1" "This reverts commit "
+}
 
 # Use the last branchpoint as our limit for the search
 latest_branchpoint=`git merge-base origin/master HEAD`
 
-# Grep for commits with "cherry picked from commit" in the commit message.
+# List all the commits between day 1 and the branch point...
+git log --reverse --pretty=%H $latest_branchpoint > already_landed
+
+# ... and the ones cherry-picked.
 git log --reverse --pretty=medium --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
 	grep "cherry picked from commit" |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
 
-# Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable' $latest_branchpoint..origin/master |\
+# Grep for potential candidates
+git log --reverse --pretty=%H -i --grep='^CC:.*mesa-stable\|^CC:.*mesa-dev\|\<fixes\>\|\<broken by\>\|This reverts commit' $latest_branchpoint..origin/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
-	if [ -f bin/.cherry-ignore ] ; then
+	if test -f bin/.cherry-ignore; then
 		if grep -q ^$sha bin/.cherry-ignore ; then
 			continue
 		fi
@@ -32,7 +103,23 @@
 		continue
 	fi
 
-	git log -n1 --pretty=oneline $sha | cat
+	if is_stable_nomination "$sha"; then
+		tag=stable
+	elif is_typod_nomination "$sha"; then
+		tag=typod
+	elif is_fixes_nomination "$sha"; then
+		tag=fixes
+	elif is_brokenby_nomination "$sha"; then
+		tag=brokenby
+	elif is_revert_nomination "$sha"; then
+		tag=revert
+	else
+		continue
+	fi
+
+	printf "[ %8s ] " "$tag"
+	git --no-pager show --summary --oneline $sha
 done
 
 rm -f already_picked
+rm -f already_landed
diff --git a/bin/get-typod-pick-list.sh b/bin/get-typod-pick-list.sh
deleted file mode 100755
index eb4181d..0000000
--- a/bin/get-typod-pick-list.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/sh
-
-# Script for generating a list of candidates which have typos in the nomination line
-#
-# Usage examples:
-#
-# $ bin/get-typod-pick-list.sh
-# $ bin/get-typod-pick-list.sh > picklist
-# $ bin/get-typod-pick-list.sh | tee picklist
-
-# NB:
-# This script intentionally _never_ checks for specific version tag
-# Should we consider folding it with the original get-pick-list.sh
-
-# Use the last branchpoint as our limit for the search
-latest_branchpoint=`git merge-base origin/master HEAD`
-
-# Grep for commits with "cherry picked from commit" in the commit message.
-git log --reverse --grep="cherry picked from commit" $latest_branchpoint..HEAD |\
-	grep "cherry picked from commit" |\
-	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
-
-# Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^CC:.*mesa-dev' $latest_branchpoint..origin/master |\
-while read sha
-do
-	# Check to see whether the patch is on the ignore list.
-	if [ -f bin/.cherry-ignore ] ; then
-		if grep -q ^$sha bin/.cherry-ignore ; then
-			continue
-		fi
-	fi
-
-	# Check to see if it has already been picked over.
-	if grep -q ^$sha already_picked ; then
-		continue
-	fi
-
-	git log -n1 --pretty=oneline $sha | cat
-done
-
-rm -f already_picked
diff --git a/bin/install_megadrivers.py b/bin/install_megadrivers.py
index 4ee98d0..551e385 100755
--- a/bin/install_megadrivers.py
+++ b/bin/install_megadrivers.py
@@ -48,23 +48,23 @@
         os.makedirs(to)
     shutil.copy(args.megadriver, master)
 
-    for each in args.drivers:
-        driver = os.path.join(to, each)
+    for driver in args.drivers:
+        abs_driver = os.path.join(to, driver)
 
-        if os.path.lexists(driver):
-            os.unlink(driver)
-        print('installing {} to {}'.format(args.megadriver, driver))
-        os.link(master, driver)
+        if os.path.lexists(abs_driver):
+            os.unlink(abs_driver)
+        print('installing {} to {}'.format(args.megadriver, abs_driver))
+        os.link(master, abs_driver)
 
         try:
             ret = os.getcwd()
             os.chdir(to)
 
-            name, ext = os.path.splitext(each)
+            name, ext = os.path.splitext(driver)
             while ext != '.so':
                 if os.path.lexists(name):
                     os.unlink(name)
-                os.symlink(each, name)
+                os.symlink(driver, name)
                 name, ext = os.path.splitext(name)
         finally:
             os.chdir(ret)
diff --git a/common.py b/common.py
index 24a7e8a..25bf86a 100644
--- a/common.py
+++ b/common.py
@@ -86,7 +86,7 @@
         from SCons.Options.EnumOption import EnumOption
     opts.Add(EnumOption('build', 'build type', 'debug',
                         allowed_values=('debug', 'checked', 'profile',
-                                        'release', 'opt')))
+                                        'release')))
     opts.Add(BoolOption('verbose', 'verbose output', 'no'))
     opts.Add(EnumOption('machine', 'use machine-specific assembly code',
                         default_machine,
@@ -107,9 +107,6 @@
     opts.Add(BoolOption('debug', 'DEPRECATED: debug build', 'yes'))
     opts.Add(BoolOption('profile', 'DEPRECATED: profile build', 'no'))
     opts.Add(BoolOption('quiet', 'DEPRECATED: profile build', 'yes'))
-    opts.Add(BoolOption('texture_float',
-                        'enable floating-point textures and renderbuffers',
-                        'no'))
     opts.Add(BoolOption('swr', 'Build OpenSWR', 'no'))
     if host_platform == 'windows':
         opts.Add('MSVC_VERSION', 'Microsoft Visual C/C++ version')
diff --git a/configure.ac b/configure.ac
index 14f1af2..b4c2ff1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -78,8 +78,9 @@
 LIBDRM_INTEL_REQUIRED=2.4.75
 LIBDRM_NVVIEUX_REQUIRED=2.4.66
 LIBDRM_NOUVEAU_REQUIRED=2.4.66
-LIBDRM_FREEDRENO_REQUIRED=2.4.91
+LIBDRM_FREEDRENO_REQUIRED=2.4.92
 LIBDRM_ETNAVIV_REQUIRED=2.4.89
+LIBDRM_VC4_REQUIRED=2.4.89
 
 dnl Versions for external dependencies
 DRI2PROTO_REQUIRED=2.8
@@ -89,6 +90,7 @@
 LIBVA_REQUIRED=0.39.0
 VDPAU_REQUIRED=1.1
 WAYLAND_REQUIRED=1.11
+WAYLAND_EGL_BACKEND_REQUIRED=3
 WAYLAND_PROTOCOLS_REQUIRED=1.8
 XCB_REQUIRED=1.9.3
 XCBDRI2_REQUIRED=1.8
@@ -106,9 +108,9 @@
 LLVM_REQUIRED_GALLIUM=3.3.0
 LLVM_REQUIRED_OPENCL=3.9.0
 LLVM_REQUIRED_R600=3.9.0
-LLVM_REQUIRED_RADEONSI=4.0.0
-LLVM_REQUIRED_RADV=4.0.0
-LLVM_REQUIRED_SWR=4.0.0
+LLVM_REQUIRED_RADEONSI=5.0.0
+LLVM_REQUIRED_RADV=5.0.0
+LLVM_REQUIRED_SWR=5.0.0
 
 dnl Check for progs
 AC_PROG_CPP
@@ -119,6 +121,7 @@
 AC_ARG_VAR([CXX11_CXXFLAGS], [Compiler flag to enable C++11 support (only needed if not
                               enabled by default and different  from -std=c++11)])
 AM_PROG_CC_C_O
+AC_PROG_GREP
 AC_PROG_NM
 AM_PROG_AS
 AX_CHECK_GNU_MAKE
@@ -292,6 +295,12 @@
 
 AM_CONDITIONAL(HAVE_ANDROID, test "x$android" = xyes)
 
+# Toggle Werror since at some point clang started treating unknown -W
+# flags as warnings, succeeding with the build, yet issuing an annoying
+# warning.
+save_CFLAGS="$CFLAGS"
+export CFLAGS="$CFLAGS -Werror"
+
 dnl
 dnl Check compiler flags
 dnl
@@ -299,10 +308,18 @@
 AX_CHECK_COMPILE_FLAG([-Werror=implicit-function-declaration], [CFLAGS="$CFLAGS -Werror=implicit-function-declaration"])
 AX_CHECK_COMPILE_FLAG([-Werror=missing-prototypes],            [CFLAGS="$CFLAGS -Werror=missing-prototypes"])
 AX_CHECK_COMPILE_FLAG([-Wmissing-prototypes],                  [CFLAGS="$CFLAGS -Wmissing-prototypes"])
+dnl Dylan Baker: gcc and clang always accept -Wno-*, hence check for the original warning, then set the no-* flag
+AX_CHECK_COMPILE_FLAG([-Wmissing-field-initializers],          [CFLAGS="$CFLAGS -Wno-missing-field-initializers"])
 AX_CHECK_COMPILE_FLAG([-fno-math-errno],                       [CFLAGS="$CFLAGS -fno-math-errno"])
+
 AX_CHECK_COMPILE_FLAG([-fno-trapping-math],                    [CFLAGS="$CFLAGS -fno-trapping-math"])
 AX_CHECK_COMPILE_FLAG([-fvisibility=hidden],                   [VISIBILITY_CFLAGS="-fvisibility=hidden"])
 
+CFLAGS="$save_CFLAGS"
+
+# Toggle Werror since at some point clang started treating unknown -W
+# flags as warnings, succeeding with the build, yet issuing an annoying
+# warning.
 dnl
 dnl Check C++ compiler flags
 dnl
@@ -311,6 +328,7 @@
 AX_CHECK_COMPILE_FLAG([-fno-math-errno],                       [CXXFLAGS="$CXXFLAGS -fno-math-errno"])
 AX_CHECK_COMPILE_FLAG([-fno-trapping-math],                    [CXXFLAGS="$CXXFLAGS -fno-trapping-math"])
 AX_CHECK_COMPILE_FLAG([-fvisibility=hidden],                   [VISIBILITY_CXXFLAGS="-fvisibility=hidden"])
+AX_CHECK_COMPILE_FLAG([-Wmissing-field-initializers],          [CXXFLAGS="$CXXFLAGS -Wno-missing-field-initializers"])
 AC_LANG_POP([C++])
 
 # Flags to help ensure that certain portions of the code -- and only those
@@ -761,21 +779,6 @@
 AC_SUBST([LIB_EXT])
 
 dnl
-dnl potentially-infringing-but-nobody-knows-for-sure stuff
-dnl
-AC_ARG_ENABLE([texture-float],
-    [AS_HELP_STRING([--enable-texture-float],
-        [enable floating-point textures and renderbuffers @<:@default=disabled@:>@])],
-    [enable_texture_float="$enableval"],
-    [enable_texture_float=no]
-)
-if test "x$enable_texture_float" = xyes; then
-    AC_MSG_WARN([Floating-point textures enabled.])
-    AC_MSG_WARN([Please consult docs/patents.txt with your lawyer before building Mesa.])
-    DEFINES="$DEFINES -DTEXTURE_FLOAT_ENABLED"
-fi
-
-dnl
 dnl Arch/platform-specific settings
 dnl
 AC_ARG_ENABLE([asm],
@@ -1374,7 +1377,7 @@
 AC_ARG_WITH([gallium-drivers],
     [AS_HELP_STRING([--with-gallium-drivers@<:@=DIRS...@:>@],
         [comma delimited Gallium drivers list, e.g.
-        "i915,nouveau,r300,r600,radeonsi,freedreno,pl111,svga,swrast,swr,tegra,vc4,vc5,virgl,etnaviv,imx"
+        "i915,nouveau,r300,r600,radeonsi,freedreno,pl111,svga,swrast,swr,tegra,v3d,vc4,virgl,etnaviv,imx"
         @<:@default=r300,r600,svga,swrast@:>@])],
     [with_gallium_drivers="$withval"],
     [with_gallium_drivers="$GALLIUM_DRIVERS_DEFAULT"])
@@ -1423,6 +1426,7 @@
                                         "x$enable_gles1" = xyes -o \
                                         "x$enable_gles2" = xyes)
 AM_CONDITIONAL(NEED_KHRPLATFORM, test "x$enable_egl" = xyes -o \
+                                      "x$enable_opengl" = xyes -o \
                                       "x$enable_gles1" = xyes -o \
                                       "x$enable_gles2" = xyes)
 
@@ -1589,6 +1593,7 @@
 AM_CONDITIONAL(HAVE_LMSENSORS, test "x$enable_lmsensors" = xyes )
 AM_CONDITIONAL(HAVE_GALLIUM_EXTRA_HUD, test "x$enable_gallium_extra_hud" = xyes )
 AM_CONDITIONAL(HAVE_WINDOWSDRI, test "x$enable_dri" = xyes -a "x$dri_platform" = xwindows )
+AM_CONDITIONAL(HAVE_XLEASE, test "x$have_xlease" = xyes )
 
 AC_ARG_ENABLE([shared-glapi],
     [AS_HELP_STRING([--enable-shared-glapi],
@@ -1702,11 +1707,7 @@
         fi
     fi
 
-    # add xf86vidmode if available
-    PKG_CHECK_MODULES([XF86VIDMODE], [xxf86vm], HAVE_XF86VIDMODE=yes, HAVE_XF86VIDMODE=no)
-    if test "$HAVE_XF86VIDMODE" = yes ; then
-        dri_modules="$dri_modules xxf86vm"
-    fi
+    dri_modules="$dri_modules xxf86vm"
 
     PKG_CHECK_MODULES([DRIGL], [$dri_modules])
     GL_PC_REQ_PRIV="$GL_PC_REQ_PRIV $dri_modules"
@@ -1719,10 +1720,6 @@
     ;;
 esac
 
-# This is outside the case (above) so that it is invoked even for non-GLX
-# builds.
-AM_CONDITIONAL(HAVE_XF86VIDMODE, test "x$HAVE_XF86VIDMODE" = xyes)
-
 GLESv1_CM_LIB_DEPS="$LIBDRM_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS"
 GLESv1_CM_PC_LIB_PRIV="-lm $PTHREAD_LIBS $DLOPEN_LIBS"
 GLESv2_LIB_DEPS="$LIBDRM_LIBS -lm $PTHREAD_LIBS $DLOPEN_LIBS"
@@ -1739,8 +1736,6 @@
 AC_SUBST([GLESv2_LIB_DEPS])
 AC_SUBST([GLESv2_PC_LIB_PRIV])
 
-AC_SUBST([HAVE_XF86VIDMODE])
-
 dnl
 dnl More GLX setup
 dnl
@@ -1823,6 +1818,9 @@
         PKG_CHECK_MODULES([WAYLAND_CLIENT], [wayland-client >= $WAYLAND_REQUIRED])
         PKG_CHECK_MODULES([WAYLAND_SERVER], [wayland-server >= $WAYLAND_REQUIRED])
         PKG_CHECK_MODULES([WAYLAND_PROTOCOLS], [wayland-protocols >= $WAYLAND_PROTOCOLS_REQUIRED])
+        if test "x$enable_egl" = xyes; then
+          PKG_CHECK_MODULES([WAYLAND_EGL], [wayland-egl-backend >= $WAYLAND_EGL_BACKEND_REQUIRED])
+        fi
         WAYLAND_PROTOCOLS_DATADIR=`$PKG_CONFIG --variable=pkgdatadir wayland-protocols`
 
         PKG_CHECK_MODULES([WAYLAND_SCANNER], [wayland-scanner],
@@ -1855,6 +1853,9 @@
 
     android)
         PKG_CHECK_MODULES([ANDROID], [cutils hardware sync])
+        if test -n "$with_gallium_drivers"; then
+            PKG_CHECK_MODULES([BACKTRACE], [backtrace])
+        fi
         DEFINES="$DEFINES -DHAVE_ANDROID_PLATFORM"
         ;;
 
@@ -1890,12 +1891,45 @@
     fi
 fi
 
+
+if echo "$platforms" | grep -q 'x11' && echo "$platforms" | grep -q 'drm'; then
+    have_xlease=yes
+else
+    have_xlease=no
+fi
+
+if test x"$have_xlease" = xyes; then
+    randr_modules="x11-xcb xcb-randr"
+    PKG_CHECK_MODULES([XCB_RANDR], [$randr_modules])
+    xlib_randr_modules="xrandr"
+    PKG_CHECK_MODULES([XLIB_RANDR], [$xlib_randr_modules])
+fi
+
 AM_CONDITIONAL(HAVE_PLATFORM_X11, echo "$platforms" | grep -q 'x11')
 AM_CONDITIONAL(HAVE_PLATFORM_WAYLAND, echo "$platforms" | grep -q 'wayland')
 AM_CONDITIONAL(HAVE_PLATFORM_DRM, echo "$platforms" | grep -q 'drm')
 AM_CONDITIONAL(HAVE_PLATFORM_SURFACELESS, echo "$platforms" | grep -q 'surfaceless')
 AM_CONDITIONAL(HAVE_PLATFORM_ANDROID, echo "$platforms" | grep -q 'android')
 
+AC_ARG_ENABLE(xlib-lease,
+    [AS_HELP_STRING([--enable-xlib-lease],
+                    [enable VK_acquire_xlib_display using X leases])],
+    [enable_xlib_lease=$enableval], [enable_xlib_lease=auto])
+case "x$enable_xlib_lease" in
+xyes)
+    ;;
+xno)
+    ;;
+*)
+    if echo "$platforms" | grep -q 'x11' && echo "$platforms" | grep -q 'drm'; then
+        enable_xlib_lease=yes
+    else
+        enable_xlib_lease=no
+    fi
+esac
+
+AM_CONDITIONAL(HAVE_XLIB_LEASE, test "x$enable_xlib_lease" = xyes)
+
 dnl
 dnl More DRI setup
 dnl
@@ -2746,20 +2780,20 @@
             ;;
         xvc4)
             HAVE_GALLIUM_VC4=yes
-            require_libdrm "vc4"
+            PKG_CHECK_MODULES([VC4], [libdrm >= $LIBDRM_VC4_REQUIRED])
 
             PKG_CHECK_MODULES([SIMPENROSE], [simpenrose],
                               [USE_VC4_SIMULATOR=yes;
                                DEFINES="$DEFINES -DUSE_VC4_SIMULATOR"],
                               [USE_VC4_SIMULATOR=no])
             ;;
-        xvc5)
-            HAVE_GALLIUM_VC5=yes
+        xv3d)
+            HAVE_GALLIUM_V3D=yes
 
-            PKG_CHECK_MODULES([VC5_SIMULATOR], [v3dv3],
-                              [USE_VC5_SIMULATOR=yes;
-                               DEFINES="$DEFINES -DUSE_VC5_SIMULATOR"],
-                              [AC_MSG_ERROR([vc5 requires the simulator])])
+            PKG_CHECK_MODULES([V3D_SIMULATOR], [v3dv3],
+                              [USE_V3D_SIMULATOR=yes;
+                               DEFINES="$DEFINES -DUSE_V3D_SIMULATOR"],
+                              [USE_V3D_SIMULATOR=no])
             ;;
         xpl111)
             HAVE_GALLIUM_PL111=yes
@@ -2779,8 +2813,9 @@
 fi
 
 # XXX: Keep in sync with LLVM_REQUIRED_SWR
-AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x4.0.0 -a \
-                                              "x$LLVM_VERSION" != x4.0.1)
+AM_CONDITIONAL(SWR_INVALID_LLVM_VERSION, test "x$LLVM_VERSION" != x5.0.0 -a \
+                                              "x$LLVM_VERSION" != x5.0.1 -a \
+                                              "x$LLVM_VERSION" != x5.0.2)
 
 if test "x$enable_llvm" = "xyes" -a "$with_gallium_drivers"; then
     llvm_require_version $LLVM_REQUIRED_GALLIUM "gallium"
@@ -2911,8 +2946,8 @@
 AM_CONDITIONAL(HAVE_GALLIUM_SWRAST, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes -o \
                                          "x$HAVE_GALLIUM_LLVMPIPE" = xyes -o \
                                          "x$HAVE_GALLIUM_SWR" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_V3D, test "x$HAVE_GALLIUM_V3D" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
-AM_CONDITIONAL(HAVE_GALLIUM_VC5, test "x$HAVE_GALLIUM_VC5" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
 
 AM_CONDITIONAL(HAVE_GALLIUM_STATIC_TARGETS, test "x$enable_shared_pipe_drivers" = xno)
@@ -2940,7 +2975,7 @@
                                       "x$HAVE_RADEON_VULKAN" = xyes)
 
 AM_CONDITIONAL(HAVE_BROADCOM_DRIVERS, test "x$HAVE_GALLIUM_VC4" = xyes -o \
-                                      "x$HAVE_GALLIUM_VC5" = xyes)
+                                      "x$HAVE_GALLIUM_V3D" = xyes)
 
 AM_CONDITIONAL(HAVE_INTEL_DRIVERS, test "x$HAVE_INTEL_VULKAN" = xyes -o \
                                         "x$HAVE_I965_DRI" = xyes)
@@ -2951,8 +2986,8 @@
 AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$enable_glx" = xgallium-xlib)
 AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_LLVM, test "x$enable_llvm" = xyes)
+AM_CONDITIONAL(USE_V3D_SIMULATOR, test x$USE_V3D_SIMULATOR = xyes)
 AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
-AM_CONDITIONAL(USE_VC5_SIMULATOR, test x$USE_VC5_SIMULATOR = xyes)
 
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
@@ -2988,7 +3023,7 @@
 AC_SUBST([XVMC_MINOR], 0)
 
 AC_SUBST([XA_MAJOR], 2)
-AC_SUBST([XA_MINOR], 3)
+AC_SUBST([XA_MINOR], 4)
 AC_SUBST([XA_PATCH], 0)
 AC_SUBST([XA_VERSION], "$XA_MAJOR.$XA_MINOR.$XA_PATCH")
 
@@ -3037,8 +3072,6 @@
                  src/egl/Makefile
                  src/egl/main/egl.pc
                  src/egl/wayland/wayland-drm/Makefile
-                 src/egl/wayland/wayland-egl/Makefile
-                 src/egl/wayland/wayland-egl/wayland-egl.pc
                  src/gallium/Makefile
                  src/gallium/auxiliary/Makefile
                  src/gallium/auxiliary/pipe-loader/Makefile
@@ -3056,8 +3089,8 @@
                  src/gallium/drivers/tegra/Makefile
                  src/gallium/drivers/etnaviv/Makefile
                  src/gallium/drivers/imx/Makefile
+                 src/gallium/drivers/v3d/Makefile
                  src/gallium/drivers/vc4/Makefile
-                 src/gallium/drivers/vc5/Makefile
                  src/gallium/drivers/virgl/Makefile
                  src/gallium/state_trackers/clover/Makefile
                  src/gallium/state_trackers/dri/Makefile
@@ -3104,8 +3137,8 @@
                  src/gallium/winsys/sw/wrapper/Makefile
                  src/gallium/winsys/sw/xlib/Makefile
                  src/gallium/winsys/tegra/drm/Makefile
+                 src/gallium/winsys/v3d/drm/Makefile
                  src/gallium/winsys/vc4/drm/Makefile
-                 src/gallium/winsys/vc5/drm/Makefile
                  src/gallium/winsys/virgl/drm/Makefile
                  src/gallium/winsys/virgl/vtest/Makefile
                  src/gbm/Makefile
@@ -3140,7 +3173,9 @@
                  src/mesa/state_tracker/tests/Makefile
                  src/util/Makefile
                  src/util/tests/hash_table/Makefile
+                 src/util/tests/set/Makefile
                  src/util/tests/string_buffer/Makefile
+                 src/util/tests/vma/Makefile
                  src/util/xmlpool/Makefile
                  src/vulkan/Makefile])
 
diff --git a/docs/codingstyle.html b/docs/codingstyle.html
index 7e9f470..34acae2 100644
--- a/docs/codingstyle.html
+++ b/docs/codingstyle.html
@@ -83,7 +83,7 @@
     *     "An INVALID_OPERATION error is generated for any of the following
     *     conditions:
     *
-    *     * <length> is zero."
+    *     * &lt;length&gt; is zero."
     *
     * Additionally, page 94 of the PDF of the OpenGL 4.5 core spec
     * (30.10.2014) also says this, so it's no longer allowed for desktop GL,
@@ -94,7 +94,7 @@
 <pre>
    /**
     * Create and initialize a new buffer object.  Called via the
-    * ctx->Driver.CreateObject() driver callback function.
+    * ctx-&gt;Driver.CreateObject() driver callback function.
     * \param  name  integer name of the object
     * \param  type  one of GL_FOO, GL_BAR, etc.
     * \return  pointer to new object or NULL if error
diff --git a/docs/egl.html b/docs/egl.html
index 3d8a85b..2bc8f23 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -168,6 +168,7 @@
 <p>This driver can share DRI drivers with <code>libGL</code>.</p>
 
 </dd>
+</dl>
 
 <h2>Packaging</h2>
 
diff --git a/docs/faq.html b/docs/faq.html
index 1f2fd66..6270a07 100644
--- a/docs/faq.html
+++ b/docs/faq.html
@@ -16,7 +16,7 @@
 
 <center>
 <h1>Mesa Frequently Asked Questions</h1>
-Last updated: 9 October 2012
+Last updated: 19 September 2018
 </center>
 
 <br>
@@ -373,18 +373,16 @@
 
 <h2>4.3 Why isn't GL_EXT_texture_compression_s3tc implemented in Mesa?</h2>
 <p>
-The <a href="http://oss.sgi.com/projects/ogl-sample/registry/EXT/texture_compression_s3tc.txt">specification for the extension</a>
-indicates that there are intellectual property (IP) and/or patent issues
-to be dealt with.
-</p>
-<p>We've been unsuccessful in getting a response from S3 (or whoever owns
-the IP nowadays) to indicate whether or not an open source project can
-implement the extension (specifically the compression/decompression
-algorithms).
+Oh but it is! Prior to 2nd October 2017, the Mesa project did not include s3tc
+support due to intellectual property (IP) and/or patent issues around the s3tc
+algorithm.
 </p>
 <p>
-In the mean time, a 3rd party <a href="https://dri.freedesktop.org/wiki/S3TC">
-plug-in library</a> is available.
+As of Mesa 17.3.0, Mesa now officially supports s3tc, as the patent has expired.
+</p>
+<p>
+In versions prior to this, a 3rd party <a href="https://dri.freedesktop.org/wiki/S3TC">
+plug-in library</a> was required.
 </p>
 
 </div>
diff --git a/docs/favicon.ico b/docs/favicon.ico
new file mode 100644
index 0000000..e7e244a
--- /dev/null
+++ b/docs/favicon.ico
Binary files differ
diff --git a/docs/favicon.png b/docs/favicon.png
new file mode 100644
index 0000000..19898ef
--- /dev/null
+++ b/docs/favicon.png
Binary files differ
diff --git a/docs/features.txt b/docs/features.txt
index b1eb9e9..3761447 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -36,7 +36,7 @@
 Feature                                                 Status
 ------------------------------------------------------- ------------------------
 
-GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.0, GLSL 1.30 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl
 
   glBindFragDataLocation, glGetFragDataLocation         DONE
   GL_NV_conditional_render (Conditional rendering)      DONE ()
@@ -63,12 +63,12 @@
   glVertexAttribI commands                              DONE
   Depth format cube textures                            DONE ()
   GLX_ARB_create_context (GLX 1.4 is required)          DONE
-  Multisample anti-aliasing                             DONE (freedreno (*), llvmpipe (*), softpipe (*), swr (*))
+  Multisample anti-aliasing                             DONE (freedreno/a5xx, freedreno (*), llvmpipe (*), softpipe (*), swr (*))
 
-(*) freedreno, llvmpipe, softpipe, and swr have fake Multisample anti-aliasing support
+(*) freedreno (a2xx-a4xx), llvmpipe, softpipe, and swr have fake Multisample anti-aliasing support
 
 
-GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.1, GLSL 1.40 --- all DONE: freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl
 
   Forward compatible context support/deprecations       DONE ()
   GL_ARB_draw_instanced (Instanced drawing)             DONE ()
@@ -81,7 +81,7 @@
   GL_EXT_texture_snorm (Signed normalized textures)     DONE ()
 
 
-GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr
+GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, swr, virgl
 
   Core/compatibility profiles                           DONE
   Geometry shaders                                      DONE ()
@@ -90,13 +90,13 @@
   GL_ARB_fragment_coord_conventions (Frag shader coord) DONE (freedreno)
   GL_ARB_provoking_vertex (Provoking vertex)            DONE (freedreno)
   GL_ARB_seamless_cube_map (Seamless cubemaps)          DONE (freedreno)
-  GL_ARB_texture_multisample (Multisample textures)     DONE ()
+  GL_ARB_texture_multisample (Multisample textures)     DONE (freedreno/a5xx)
   GL_ARB_depth_clamp (Frag depth clamp)                 DONE (freedreno)
   GL_ARB_sync (Fence objects)                           DONE (freedreno)
   GLX_ARB_create_context_profile                        DONE
 
 
-GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
+GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe, virgl
 
   GL_ARB_blend_func_extended                            DONE (freedreno/a3xx, swr)
   GL_ARB_explicit_attrib_location                       DONE (all drivers that support GLSL)
@@ -110,18 +110,18 @@
   GL_ARB_vertex_type_2_10_10_10_rev                     DONE (freedreno, swr)
 
 
-GL 4.0, GLSL 4.00 --- all DONE: i965/gen7+, nvc0, r600, radeonsi
+GL 4.0, GLSL 4.00 --- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl
 
   GL_ARB_draw_buffers_blend                             DONE (freedreno, i965/gen6+, nv50, llvmpipe, softpipe, swr)
   GL_ARB_draw_indirect                                  DONE (freedreno, i965/gen7+, llvmpipe, softpipe, swr)
   GL_ARB_gpu_shader5                                    DONE (i965/gen7+)
   - 'precise' qualifier                                 DONE
   - Dynamically uniform sampler array indices           DONE (softpipe)
-  - Dynamically uniform UBO array indices               DONE ()
+  - Dynamically uniform UBO array indices               DONE (freedreno)
   - Implicit signed -> unsigned conversions             DONE
   - Fused multiply-add                                  DONE ()
-  - Packing/bitfield/conversion functions               DONE (softpipe)
-  - Enhanced textureGather                              DONE (softpipe)
+  - Packing/bitfield/conversion functions               DONE (freedreno, softpipe)
+  - Enhanced textureGather                              DONE (freedreno, softpipe)
   - Geometry shader instancing                          DONE (llvmpipe, softpipe)
   - Geometry shader multiple streams                    DONE ()
   - Enhanced per-sample shading                         DONE ()
@@ -139,7 +139,7 @@
   GL_ARB_transform_feedback3                            DONE (i965/gen7+, llvmpipe, softpipe, swr)
 
 
-GL 4.1, GLSL 4.10 --- all DONE: i965/gen7+, nvc0, r600, radeonsi
+GL 4.1, GLSL 4.10 --- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl
 
   GL_ARB_ES2_compatibility                              DONE (freedreno, i965, nv50, llvmpipe, softpipe, swr)
   GL_ARB_get_program_binary                             DONE (0 or 1 binary formats)
@@ -149,7 +149,7 @@
   GL_ARB_viewport_array                                 DONE (i965, nv50, llvmpipe, softpipe)
 
 
-GL 4.2, GLSL 4.20 -- all DONE: i965/gen7+, nvc0, r600, radeonsi
+GL 4.2, GLSL 4.20 -- all DONE: i965/gen7+, nvc0, r600, radeonsi, virgl
 
   GL_ARB_texture_compression_bptc                       DONE (freedreno, i965)
   GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
@@ -165,7 +165,7 @@
   GL_ARB_map_buffer_alignment                           DONE (all drivers)
 
 
-GL 4.3, GLSL 4.30 -- all DONE: i965/gen8+, nvc0, r600, radeonsi
+GL 4.3, GLSL 4.30 -- all DONE: i965/gen8+, nvc0, r600, radeonsi, virgl
 
   GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
   GL_ARB_ES3_compatibility                              DONE (all drivers that support GLSL 3.30)
@@ -205,20 +205,20 @@
   - input/output block locations                        DONE
   GL_ARB_multi_bind                                     DONE (all drivers)
   GL_ARB_query_buffer_object                            DONE (i965/hsw+)
-  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_texture_stencil8                               DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, llvmpipe, softpipe, swr)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_texture_stencil8                               DONE (freedreno, i965/hsw+, nv50, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, llvmpipe, softpipe, swr, virgl)
 
 GL 4.5, GLSL 4.50 -- all DONE: nvc0, radeonsi
 
-  GL_ARB_ES3_1_compatibility                            DONE (i965/hsw+, r600)
+  GL_ARB_ES3_1_compatibility                            DONE (i965/hsw+, r600, virgl)
   GL_ARB_clip_control                                   DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_conditional_render_inverted                    DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_cull_distance                                  DONE (i965, nv50, r600, llvmpipe, softpipe, swr)
-  GL_ARB_derivative_control                             DONE (i965, nv50, r600)
+  GL_ARB_conditional_render_inverted                    DONE (freedreno, i965, nv50, r600, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_cull_distance                                  DONE (i965, nv50, r600, llvmpipe, softpipe, swr, virgl)
+  GL_ARB_derivative_control                             DONE (i965, nv50, r600, virgl)
   GL_ARB_direct_state_access                            DONE (all drivers)
   GL_ARB_get_texture_sub_image                          DONE (all drivers)
-  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, r600)
+  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, r600, virgl)
   GL_ARB_texture_barrier                                DONE (freedreno, i965, nv50, r600)
   GL_KHR_context_flush_control                          DONE (all - but needs GLX/EGL extension to be useful)
   GL_KHR_robustness                                     DONE (i965)
@@ -229,19 +229,19 @@
   GL_ARB_gl_spirv                                       in progress (Nicolai Hähnle, Ian Romanick)
   GL_ARB_indirect_parameters                            DONE (i965/gen7+, nvc0, radeonsi)
   GL_ARB_pipeline_statistics_query                      DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe, swr)
-  GL_ARB_polygon_offset_clamp                           DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, swr)
-  GL_ARB_shader_atomic_counter_ops                      DONE (freedreno/a5xx, i965/gen7+, nvc0, r600, radeonsi, softpipe)
+  GL_ARB_polygon_offset_clamp                           DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, llvmpipe, swr, virgl)
+  GL_ARB_shader_atomic_counter_ops                      DONE (freedreno/a5xx, i965/gen7+, nvc0, r600, radeonsi, softpipe, virgl)
   GL_ARB_shader_draw_parameters                         DONE (i965, nvc0, radeonsi)
   GL_ARB_shader_group_vote                              DONE (i965, nvc0, radeonsi)
   GL_ARB_spirv_extensions                               in progress (Nicolai Hähnle, Ian Romanick)
   GL_ARB_texture_filter_anisotropic                     DONE (freedreno, i965, nv50, nvc0, r600, radeonsi, softpipe (*), llvmpipe (*))
-  GL_ARB_transform_feedback_overflow_query              DONE (i965/gen6+, nvc0, radeonsi, llvmpipe, softpipe)
+  GL_ARB_transform_feedback_overflow_query              DONE (i965/gen6+, nvc0, radeonsi, llvmpipe, softpipe, virgl)
   GL_KHR_no_error                                       DONE (all drivers)
 
 (*) softpipe and llvmpipe advertise 16x anisotropy but simply ignore the setting
 
 These are the extensions cherry-picked to make GLES 3.1
-GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi
+GLES3.1, GLSL ES 3.1 -- all DONE: i965/hsw+, nvc0, r600, radeonsi, virgl
 
   GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
   GL_ARB_compute_shader                                 DONE (freedreno/a5xx, i965/gen7+, softpipe)
@@ -256,11 +256,11 @@
   GL_ARB_shading_language_packing                       DONE (all drivers)
   GL_ARB_separate_shader_objects                        DONE (all drivers)
   GL_ARB_stencil_texturing                              DONE (freedreno, nv50, llvmpipe, softpipe, swr)
-  GL_ARB_texture_multisample (Multisample textures)     DONE (i965/gen7+, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (freedreno/a5xx, i965/gen7+, nv50, llvmpipe, softpipe)
   GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
   GL_ARB_vertex_attrib_binding                          DONE (all drivers)
-  GS5 Enhanced textureGather                            DONE (freedreno, i965/gen7+,)
-  GS5 Packing/bitfield/conversion functions             DONE (i965/gen6+)
+  GS5 Enhanced textureGather                            DONE (freedreno, i965/gen7+)
+  GS5 Packing/bitfield/conversion functions             DONE (freedreno/a5xx, i965/gen6+)
   GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
   Additional functionality not covered above:
@@ -269,28 +269,28 @@
       glGetBooleani_v - restrict to GLES enums
       gl_HelperInvocation support                       DONE (i965, r600)
 
-GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+
+GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+, radeonsi, virgl
 
   GL_EXT_color_buffer_float                             DONE (all drivers)
-  GL_KHR_blend_equation_advanced                        DONE (i965, nvc0, radeonsi)
+  GL_KHR_blend_equation_advanced                        DONE (i965, nvc0)
   GL_KHR_debug                                          DONE (all drivers)
-  GL_KHR_robustness                                     DONE (i965, nvc0, radeonsi)
+  GL_KHR_robustness                                     DONE (i965, nvc0)
   GL_KHR_texture_compression_astc_ldr                   DONE (freedreno, i965/gen9+)
   GL_OES_copy_image                                     DONE (all drivers)
   GL_OES_draw_buffers_indexed                           DONE (all drivers that support GL_ARB_draw_buffers_blend)
   GL_OES_draw_elements_base_vertex                      DONE (all drivers)
-  GL_OES_geometry_shader                                DONE (i965/hsw+, nvc0, radeonsi)
+  GL_OES_geometry_shader                                DONE (i965/hsw+, nvc0)
   GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
-  GL_OES_primitive_bounding_box                         DONE (i965/gen7+, nvc0, radeonsi)
-  GL_OES_sample_shading                                 DONE (i965, nvc0, r600, radeonsi)
-  GL_OES_sample_variables                               DONE (i965, nvc0, r600, radeonsi)
+  GL_OES_primitive_bounding_box                         DONE (i965/gen7+, nvc0)
+  GL_OES_sample_shading                                 DONE (i965, nvc0, r600)
+  GL_OES_sample_variables                               DONE (i965, nvc0, r600)
   GL_OES_shader_image_atomic                            DONE (all drivers that support GL_ARB_shader_image_load_store)
   GL_OES_shader_io_blocks                               DONE (All drivers that support GLES 3.1)
-  GL_OES_shader_multisample_interpolation               DONE (i965, nvc0, r600, radeonsi)
+  GL_OES_shader_multisample_interpolation               DONE (i965, nvc0, r600)
   GL_OES_tessellation_shader                            DONE (all drivers that support GL_ARB_tessellation_shader)
   GL_OES_texture_border_clamp                           DONE (all drivers)
-  GL_OES_texture_buffer                                 DONE (i965, nvc0, radeonsi)
-  GL_OES_texture_cube_map_array                         DONE (i965/hsw+, nvc0, radeonsi)
+  GL_OES_texture_buffer                                 DONE (freedreno, i965, nvc0)
+  GL_OES_texture_cube_map_array                         DONE (i965/hsw+, nvc0)
   GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
   GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)
 
@@ -299,17 +299,17 @@
   GL_ARB_bindless_texture                               DONE (nvc0, radeonsi)
   GL_ARB_cl_event                                       not started
   GL_ARB_compute_variable_group_size                    DONE (nvc0, radeonsi)
-  GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+)
-  GL_ARB_fragment_shader_interlock                      not started
+  GL_ARB_ES3_2_compatibility                            DONE (i965/gen8+, radeonsi, virgl)
+  GL_ARB_fragment_shader_interlock                      DONE (i965)
   GL_ARB_gpu_shader_int64                               DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe)
   GL_ARB_parallel_shader_compile                        not started, but Chia-I Wu did some related work in 2014
-  GL_ARB_post_depth_coverage                            DONE (i965)
+  GL_ARB_post_depth_coverage                            DONE (i965, nvc0)
   GL_ARB_robustness_isolation                           not started
-  GL_ARB_sample_locations                               not started
-  GL_ARB_seamless_cubemap_per_texture                   DONE (i965, nvc0, radeonsi, r600, softpipe, swr)
+  GL_ARB_sample_locations                               DONE (nvc0)
+  GL_ARB_seamless_cubemap_per_texture                   DONE (freedreno, i965, nvc0, radeonsi, r600, softpipe, swr, virgl)
   GL_ARB_shader_ballot                                  DONE (i965/gen8+, nvc0, radeonsi)
   GL_ARB_shader_clock                                   DONE (i965/gen7+, nv50, nvc0, r600, radeonsi)
-  GL_ARB_shader_stencil_export                          DONE (i965/gen9+, r600, radeonsi, softpipe, llvmpipe, swr)
+  GL_ARB_shader_stencil_export                          DONE (i965/gen9+, r600, radeonsi, softpipe, llvmpipe, swr, virgl)
   GL_ARB_shader_viewport_layer_array                    DONE (i965/gen6+, nvc0, radeonsi)
   GL_ARB_sparse_buffer                                  DONE (radeonsi/CIK+)
   GL_ARB_sparse_texture                                 not started
@@ -322,12 +322,14 @@
   GL_EXT_semaphore                                      DONE (radeonsi)
   GL_EXT_semaphore_fd                                   DONE (radeonsi)
   GL_EXT_semaphore_win32                                not started
+  GL_EXT_texture_norm16                                 DONE (i965, r600, radeonsi, nvc0)
   GL_KHR_blend_equation_advanced_coherent               DONE (i965/gen9+)
   GL_KHR_texture_compression_astc_hdr                   DONE (i965/bxt)
   GL_KHR_texture_compression_astc_sliced_3d             DONE (i965/gen9+)
   GL_OES_depth_texture_cube_map                         DONE (all drivers that support GLSL 1.30+)
   GL_OES_EGL_image                                      DONE (all drivers)
-  GL_OES_EGL_image_external_essl3                       not started
+  GL_OES_EGL_image_external                             DONE (all drivers)
+  GL_OES_EGL_image_external_essl3                       DONE (all drivers)
   GL_OES_required_internalformat                        DONE (all drivers)
   GL_OES_surfaceless_context                            DONE (all drivers)
   GL_OES_texture_compression_astc                       DONE (core only)
@@ -335,7 +337,7 @@
   GL_OES_texture_float_linear                           DONE (freedreno, i965, r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe)
   GL_OES_texture_half_float                             DONE (freedreno, i965, r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe)
   GL_OES_texture_half_float_linear                      DONE (freedreno, i965, r300, r600, radeonsi, nv30, nv50, nvc0, softpipe, llvmpipe)
-  GL_OES_texture_view                                   not started - based on GL_ARB_texture_view
+  GL_OES_texture_view                                   DONE (i965/gen8+)
   GL_OES_viewport_array                                 DONE (i965, nvc0, radeonsi)
   GLX_ARB_context_flush_control                         not started
   GLX_ARB_robustness_application_isolation              not started
@@ -352,39 +354,55 @@
 
 Vulkan 1.0 -- all DONE: anv, radv
 
-Khronos extensions that are not part of any Vulkan version:
+Vulkan 1.1 -- all DONE: anv, radv
+
   VK_KHR_16bit_storage                                  in progress (Alejandro)
-  VK_KHR_android_surface                                not started
+  VK_KHR_bind_memory2                                   DONE (anv, radv)
   VK_KHR_dedicated_allocation                           DONE (anv, radv)
   VK_KHR_descriptor_update_template                     DONE (anv, radv)
-  VK_KHR_display                                        not started
-  VK_KHR_display_swapchain                              not started
-  VK_KHR_external_fence                                 not started
-  VK_KHR_external_fence_capabilities                    not started
-  VK_KHR_external_fence_fd                              not started
-  VK_KHR_external_fence_win32                           not started
+  VK_KHR_device_group                                   not started
+  VK_KHR_device_group_creation                          not started
+  VK_KHR_external_fence                                 DONE (anv, radv)
+  VK_KHR_external_fence_capabilities                    DONE (anv, radv)
   VK_KHR_external_memory                                DONE (anv, radv)
   VK_KHR_external_memory_capabilities                   DONE (anv, radv)
-  VK_KHR_external_memory_fd                             DONE (anv, radv)
-  VK_KHR_external_memory_win32                          not started
-  VK_KHR_external_semaphore                             DONE (radv)
-  VK_KHR_external_semaphore_capabilities                DONE (radv)
-  VK_KHR_external_semaphore_fd                          DONE (radv)
-  VK_KHR_external_semaphore_win32                       not started
+  VK_KHR_external_semaphore                             DONE (anv, radv)
+  VK_KHR_external_semaphore_capabilities                DONE (anv, radv)
   VK_KHR_get_memory_requirements2                       DONE (anv, radv)
   VK_KHR_get_physical_device_properties2                DONE (anv, radv)
-  VK_KHR_get_surface_capabilities2                      DONE (anv)
-  VK_KHR_incremental_present                            DONE (anv, radv)
   VK_KHR_maintenance1                                   DONE (anv, radv)
+  VK_KHR_maintenance2                                   DONE (anv, radv)
+  VK_KHR_maintenance3                                   DONE (anv, radv)
+  VK_KHR_multiview                                      DONE (anv, radv)
+  VK_KHR_relaxed_block_layout                           DONE (anv, radv)
+  VK_KHR_sampler_ycbcr_conversion                       DONE (anv)
+  VK_KHR_shader_draw_parameters                         DONE (anv, radv)
+  VK_KHR_storage_buffer_storage_class                   DONE (anv, radv)
+  VK_KHR_variable_pointers                              DONE (anv, radv)
+
+Khronos extensions that are not part of any Vulkan version:
+  VK_KHR_8bit_storage                                   DONE (anv)
+  VK_KHR_android_surface                                not started
+  VK_KHR_create_renderpass2                             DONE (anv, radv)
+  VK_KHR_display                                        DONE (anv, radv)
+  VK_KHR_display_swapchain                              DONE (anv, radv)
+  VK_KHR_draw_indirect_count                            DONE (radv)
+  VK_KHR_external_fence_fd                              DONE (anv, radv)
+  VK_KHR_external_fence_win32                           not started
+  VK_KHR_external_memory_fd                             DONE (anv, radv)
+  VK_KHR_external_memory_win32                          not started
+  VK_KHR_external_semaphore_fd                          DONE (anv, radv)
+  VK_KHR_external_semaphore_win32                       not started
+  VK_KHR_get_display_properties2                        DONE (anv, radv)
+  VK_KHR_get_surface_capabilities2                      DONE (anv, radv)
+  VK_KHR_image_format_list                              DONE (anv, radv)
+  VK_KHR_incremental_present                            DONE (anv, radv)
   VK_KHR_mir_surface                                    not started
   VK_KHR_push_descriptor                                DONE (anv, radv)
   VK_KHR_sampler_mirror_clamp_to_edge                   DONE (anv, radv)
-  VK_KHR_shader_draw_parameters                         DONE (anv, radv)
   VK_KHR_shared_presentable_image                       not started
-  VK_KHR_storage_buffer_storage_class                   DONE (anv, radv)
   VK_KHR_surface                                        DONE (anv, radv)
   VK_KHR_swapchain                                      DONE (anv, radv)
-  VK_KHR_variable_pointers                              DONE (anv, radv)
   VK_KHR_wayland_surface                                DONE (anv, radv)
   VK_KHR_win32_keyed_mutex                              not started
   VK_KHR_win32_surface                                  not started
diff --git a/docs/index.html b/docs/index.html
index 2383d98..f0a153a 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,71 @@
 
 <h1>News</h1>
 
+<h2>July 27, 2018</h2>
+<p>
+<a href="relnotes/18.1.5.html">Mesa 18.1.5</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>July 13, 2018</h2>
+<p>
+<a href="relnotes/18.1.4.html">Mesa 18.1.4</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 29, 2018</h2>
+<p>
+<a href="relnotes/18.1.3.html">Mesa 18.1.3</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 15, 2018</h2>
+<p>
+<a href="relnotes/18.1.2.html">Mesa 18.1.2</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 3, 2018</h2>
+<p>
+<a href="relnotes/18.0.5.html">Mesa 18.0.5</a> is released.
+This is a bug-fix release.
+<br>
+NOTE: It is anticipated that 18.0.5 will be the final release in the
+18.0 series. Users of 18.0 are encouraged to migrate to the 18.1
+series in order to obtain future fixes.
+</p>
+
+<h2>June 1, 2018</h2>
+<p>
+<a href="relnotes/18.1.1.html">Mesa 18.1.1</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 18, 2018</h2>
+<p>
+<a href="relnotes/18.1.0.html">Mesa 18.1.0</a> is released.  This is a
+new development release.  See the release notes for more information
+about the release.
+</p>
+
+<h2>May 17, 2018</h2>
+<p>
+<a href="relnotes/18.0.4.html">Mesa 18.0.4</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 7, 2018</h2>
+<p>
+<a href="relnotes/18.0.3.html">Mesa 18.0.3</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>April 28, 2018</h2>
+<p>
+<a href="relnotes/18.0.2.html">Mesa 18.0.2</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>April 18, 2018</h2>
 <p>
 <a href="relnotes/18.0.1.html">Mesa 18.0.1</a> is released.
diff --git a/docs/meson.html b/docs/meson.html
index b887758..29907a6 100644
--- a/docs/meson.html
+++ b/docs/meson.html
@@ -24,10 +24,7 @@
 <p>The meson build is tested on on Linux, macOS, Cygwin and Haiku, it should
 work on FreeBSD, DragonflyBSD, NetBSD, and OpenBSD.</p>
 
-<p><strong>Mesa requires Meson >= 0.42.0 to build in general.</strong>
-
-Additionaly, to build the Clover OpenCL state tracker or the OpenSWR driver
-meson 0.44.0 or greater is required.
+<p><strong>Mesa requires Meson >= 0.44.1 to build.</strong>
 
 Some older versions of meson do not check that they are too old and will error
 out in odd ways.
@@ -36,7 +33,7 @@
 <p>
 The meson program is used to configure the source directory and generates
 either a ninja build file or Visual Studio® build files. The latter must
-be enabled via the --backend switch, as ninja is the default backend on all
+be enabled via the <code>--backend</code> switch, as ninja is the default backend on all
 operating systems. Meson only supports out-of-tree builds, and must be passed a
 directory to put built and generated sources into. We'll call that directory
 "build" for examples.
@@ -52,7 +49,7 @@
 your meson global arguments and project arguments, along with their defaults
 and your local settings.
 
-Moes does not currently support listing options before configure a build
+Meson does not currently support listing options before configuring a build
 directory, but this feature is being discussed upstream.
 </p>
 
@@ -63,7 +60,7 @@
 <p>
 With additional arguments <code>meson configure</code> is used to change
 options on already configured build directory. All options passed to this
-command are in the form -D "command"="value".
+command are in the form <code>-D "command"="value"</code>.
 </p>
 
 <pre>
@@ -71,6 +68,14 @@
 </pre>
 
 <p>
+Note that options taking lists (such as <code>platforms</code>) are
+<a href="http://mesonbuild.com/Build-options.html#using-build-options">a bit
+more complicated</a>, but the simplest form compatible with Mesa options
+is to use a comma to separate values (<code>-D platforms=drm,wayland</code>)
+and brackets to represent an empty list (<code>-D platforms=[]</code>).
+</p>
+
+<p>
 Once you've run the initial <code>meson</code> command successfully you can use
 your configured backend to build the project. With ninja, the -C option can be
 be used to point at a directory to build.
@@ -85,13 +90,14 @@
 depending on the options you have chosen. Later, if you want to rebuild for a
 different configuration, you should run <code>ninja clean</code> before
 changing the configuration, or create a new out of tree build directory for
-each configuration you want to build.
-
-http://mesonbuild.com/Using-multiple-build-directories.html
+each configuration you want to build
+<a href="http://mesonbuild.com/Using-multiple-build-directories.html">as
+recommended in the documentation</a>.
 </p>
 
+<dl>
 <dt><code>Environment Variables</code></dt>
-<dd><p>Meson supports the standard CC and CXX envrionment variables for
+<dd><p>Meson supports the standard CC and CXX environment variables for
 changing the default compiler, and CFLAGS, CXXFLAGS, and LDFLAGS for setting
 options to the compiler and linker.
 
@@ -102,9 +108,9 @@
 These arguments are consumed and stored by meson when it is initialized or
 re-initialized. Therefore passing them to meson configure will not do anything,
 and passing them to ninja will only do something if ninja decides to
-re-initialze meson, for example, if a meson.build file has been changed.
+re-initialize meson, for example, if a meson.build file has been changed.
 Changing these variables will not cause all targets to be rebuilt, so running
-ninja clean is recomended when changing CFLAGS or CXXFLAGS. meson will never
+ninja clean is recommended when changing CFLAGS or CXXFLAGS. Meson will never
 change compiler in a configured build directory.
 </p>
 
@@ -116,14 +122,13 @@
     CFLAGS=-Wno-typedef-redefinition ninja -C build-clang
 </pre>
 
-<p>Meson also honors DESTDIR for installs</p>
+<p>Meson also honors <code>DESTDIR</code> for installs</p>
 </dd>
 
 
-<dl>
 <dt><code>LLVM</code></dt>
 <dd><p>Meson includes upstream logic to wrap llvm-config using it's standard
-dependncy interface. It will search $PATH (or %PATH% on windows) for
+dependency interface. It will search <code>$PATH</code> (or <code>%PATH%</code> on Windows) for
 llvm-config, so using an LLVM from a non-standard path is as easy as
 <code>PATH=/path/with/llvm-config:$PATH meson build</code>.
 </p></dd>
@@ -146,7 +151,7 @@
 the <code>meson</code> than to <code>meson configure</code>. These options are
 passed as --option=foo to <code>meson</code>, but -Doption=foo to <code>meson
 configure</code>. Mesa defined options are always passed as -Doption=foo.
-<p>
+</p>
 
 <p>For those coming from autotools be aware of the following:</p>
 
@@ -155,13 +160,13 @@
 <dd><p>This option will set the compiler debug/optimisation levels to aid
 debugging the Mesa libraries.</p>
 
-<p>Note that in meson this defaults to "debugoptimized", and  not setting it to
-"release" will yield non-optimal performance and binary size. Not using "debug"
-may interfer with debbugging as some code and validation will be optimized
-away.
+<p>Note that in meson this defaults to <code>debugoptimized</code>, and
+not setting it to <code>release</code> will yield non-optimal
+performance and binary size. Not using <code>debug</code> may interfere
+with debugging as some code and validation will be optimized away.
 </p>
 
-<p> For those wishing to pass their own optimization flags, use the "plain"
+<p> For those wishing to pass their own optimization flags, use the <code>plain</code>
 buildtype, which causes meson to inject no additional compiler arguments, only
 those in the C/CXXFLAGS and those that mesa itself defines.</p>
 </dd>
@@ -169,10 +174,14 @@
 
 <dl>
 <dt><code>-Db_ndebug</code></dt>
-<dd><p>This option controls assertions in meson projects. When set to false
+<dd><p>This option controls assertions in meson projects. When set to <code>false</code>
 (the default) assertions are enabled, when set to true they are disabled. This
 is unrelated to the <code>buildtype</code>; setting the latter to
 <code>release</code> will not turn off assertions.
 </p>
 </dd>
 </dl>
+
+</div>
+</body>
+</html>
diff --git a/docs/patents.txt b/docs/patents.txt
deleted file mode 100644
index 91c5757..0000000
--- a/docs/patents.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-ARB_texture_float:
-
-    Silicon Graphics, Inc. owns US Patent #6,650,327, issued November 18,
-    2003 [1].
-
-    SGI believes this patent contains necessary IP for graphics systems
-    implementing floating point rasterization and floating point
-    framebuffer capabilities described in ARB_texture_float extension, and
-    will discuss licensing on RAND terms, on an individual basis with
-    companies wishing to use this IP in the context of conformant OpenGL
-    implementations [2].
-
-    The source code to implement ARB_texture_float extension is included
-    and can be toggled on at compile time, for those who purchased a
-    license from SGI, or are in a country where the patent does not apply,
-    etc.
-
-    The software is provided "as is", without warranty of any kind, express
-    or implied, including but not limited to the warranties of
-    merchantability, fitness for a particular purpose and noninfringement.
-    In no event shall the authors or copyright holders be liable for any
-    claim, damages or other liability, whether in an action of contract,
-    tort or otherwise, arising from, out of or in connection with the
-    software or the use or other dealings in the software.
-
-    You should contact a lawyer or SGI's legal department if you want to
-    enable this extension.
-
-
-[1] https://patents.google.com/patent/US6650327B1
-[2] https://www.opengl.org/registry/specs/ARB/texture_float.txt
diff --git a/docs/precompiled.html b/docs/precompiled.html
index d1f4ace..97dc1ff 100644
--- a/docs/precompiled.html
+++ b/docs/precompiled.html
@@ -24,10 +24,12 @@
 has to use unofficial channels.
 <br>
 There are some general directions:
+<ul>
 <li>Debian/Ubuntu based distros - PPA: xorg-edgers, oibaf and padoka</li>
 <li>Fedora - Corp: erp and che</li>
 <li>OpenSuse/SLES - OBS: X11:XOrg and pontostroy:X11</li>
 <li>Gentoo/Archlinux - officially provided/supported</li>
+</ul>
 </p>
 
 </div>
diff --git a/docs/release-calendar.html b/docs/release-calendar.html
index f2b8fc4..c086b90 100644
--- a/docs/release-calendar.html
+++ b/docs/release-calendar.html
@@ -37,95 +37,47 @@
 <th>Release</th>
 <th>Release manager</th>
 <th>Notes</th>
-<tr>
-<td rowspan="3">18.0</td>
-<td>2018-04-20</td>
-<td>18.0.2</td>
-<td>Juan A. Suarez Romero</td>
-<td></td>
 </tr>
 <tr>
-<td>2018-05-04</td>
-<td>18.0.3</td>
-<td>Juan A. Suarez Romero</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-05-18</td>
-<td>18.0.4</td>
-<td>Juan A. Suarez Romero</td>
-<td>Last planned 18.0.x release</td>
-</tr>
-<tr>
-<td rowspan="8">18.1</td>
-<td>2018-04-20</td>
-<td>18.1.0rc1</td>
+<td rowspan="3">18.1</td>
+<td>2018-08-10</td>
+<td>18.1.6</td>
 <td>Dylan Baker</td>
 <td></td>
 </tr>
 <tr>
-<td>2018-04-27</td>
-<td>18.1.0rc2</td>
+<td>2018-08-24</td>
+<td>18.1.7</td>
 <td>Dylan Baker</td>
 <td></td>
 </tr>
 <tr>
-<td>2018-05-04</td>
-<td>18.1.0rc3</td>
+<td>2018-09-07</td>
+<td>18.1.8</td>
 <td>Dylan Baker</td>
-<td></td>
-</tr>
-<tr>
-<td>2018-05-11</td>
-<td>18.1.0rc4</td>
-<td>Dylan Baker</td>
-<td>Last planned RC/Final release</td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.1</td>
-<td>Emil Velikov</td>
-<td></td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.2</td>
-<td>Emil Velikov</td>
-<td></td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.3</td>
-<td>Emil Velikov</td>
-<td></td>
-</tr>
-<tr>
-<td>TBD</td>
-<td>18.1.4</td>
-<td>Emil Velikov</td>
-<td>Last planned RC/Final release</td>
+<td>Last planned 18.1.x release</td>
 </tr>
 <tr>
 <td rowspan="4">18.2</td>
-<td>2018-07-20</td>
+<td>2018-08-01</td>
 <td>18.2.0rc1</td>
 <td>Andres Gomez</td>
 <td></td>
 </tr>
 <tr>
-<td>2018-07-27</td>
+<td>2018-08-08</td>
 <td>18.2.0rc2</td>
 <td>Andres Gomez</td>
 <td></td>
 </tr>
 <tr>
-<td>2018-08-03</td>
+<td>2018-08-15</td>
 <td>18.2.0rc3</td>
 <td>Andres Gomez</td>
 <td></td>
 </tr>
 <tr>
-<td>2018-08-10</td>
+<td>2018-08-22</td>
 <td>18.2.0rc4</td>
 <td>Andres Gomez</td>
 <td>Last planned RC/Final release</td>
diff --git a/docs/releasing.html b/docs/releasing.html
index a022d0c..14315e7 100644
--- a/docs/releasing.html
+++ b/docs/releasing.html
@@ -54,8 +54,8 @@
 <h1 id="schedule">Release schedule</h1>
 
 <p>
-Releases should happen on Fridays. Delays can occur although those should be keep
-to a minimum.
+Releases should happen on Wednesdays. Delays can occur although those
+should be kept to a minimum.
 <br>
 See our <a href="release-calendar.html" target="_parent">calendar</a> for the
 date and other details for individual releases.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 6bd1a1a..9d4a262 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,7 +21,16 @@
 </p>
 
 <ul>
+<li><a href="relnotes/18.1.5.html">18.1.5 release notes</a>
+<li><a href="relnotes/18.1.4.html">18.1.4 release notes</a>
+<li><a href="relnotes/18.1.3.html">18.1.3 release notes</a>
+<li><a href="relnotes/18.1.2.html">18.1.2 release notes</a>
+<li><a href="relnotes/18.0.5.html">18.0.5 release notes</a>
+<li><a href="relnotes/18.1.1.html">18.1.1 release notes</a>
 <li><a href="relnotes/18.1.0.html">18.1.0 release notes</a>
+<li><a href="relnotes/18.0.4.html">18.0.4 release notes</a>
+<li><a href="relnotes/18.0.3.html">18.0.3 release notes</a>
+<li><a href="relnotes/18.0.2.html">18.0.2 release notes</a>
 <li><a href="relnotes/18.0.1.html">18.0.1 release notes</a>
 <li><a href="relnotes/17.3.9.html">17.3.9 release notes</a>
 <li><a href="relnotes/17.3.8.html">17.3.8 release notes</a>
diff --git a/docs/relnotes/18.0.0.html b/docs/relnotes/18.0.0.html
index 6fa6370..9b7951b 100644
--- a/docs/relnotes/18.0.0.html
+++ b/docs/relnotes/18.0.0.html
@@ -48,8 +48,8 @@
 <li>Disk shader cache support for i965 when MESA_GLSL_CACHE_DISABLE environment variable is set to "0" or "false"</li>
 <li>GL_ARB_shader_atomic_counters and GL_ARB_shader_atomic_counter_ops on r600/evergreen+</li>
 <li>GL_ARB_shader_image_load_store and GL_ARB_shader_image_size on r600/evergreen+</li>
-<li>GL_ARB_shader_storage_buffer_object on r600/evergreen+<li>
-<li>GL_ARB_compute_shader on r600/evergreen+<li>
+<li>GL_ARB_shader_storage_buffer_object on r600/evergreen+</li>
+<li>GL_ARB_compute_shader on r600/evergreen+</li>
 <li>GL_ARB_cull_distance on r600/evergreen+</li>
 <li>GL_ARB_enhanced_layouts on r600/evergreen+</li>
 <li>GL_ARB_bindless_texture on nvc0/kepler</li>
diff --git a/docs/relnotes/18.1.6.html b/docs/relnotes/18.1.6.html
deleted file mode 100644
index 1e92548..0000000
--- a/docs/relnotes/18.1.6.html
+++ /dev/null
@@ -1,188 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html lang="en">
-<head>
-  <meta http-equiv="content-type" content="text/html; charset=utf-8">
-  <title>Mesa Release Notes</title>
-  <link rel="stylesheet" type="text/css" href="../mesa.css">
-</head>
-<body>
-
-<div class="header">
-  <h1>The Mesa 3D Graphics Library</h1>
-</div>
-
-<iframe src="../contents.html"></iframe>
-<div class="content">
-
-<h1>Mesa 18.1.6 Release Notes / August 13 2018</h1>
-
-<p>
-Mesa 18.1.6 is a bug fix release which fixes bugs found since the 18.1.5 release.
-</p>
-<p>
-Mesa 18.1.6 implements the OpenGL 4.5 API, but the version reported by
-glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
-glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
-4.5 is <strong>only</strong> available if requested at context creation.
-Compatibility contexts may report a lower version depending on each driver.
-</p>
-
-
-<h2>SHA256 checksums</h2>
-<pre>
-580e03328ffefe1fd43b19ab7669f20d931601a1c0a4c0f8b9c65d6e81a06df3  mesa-18.1.6.tar.gz
-bb7ce759069801804fcfb8152da3457f76cd7b4e0096e4870ff5adcb5c894289  mesa-18.1.6.tar.xz
-</pre>
-
-
-<h2>New features</h2>
-
-<p>None</p>
-
-<h2>Bug fixes</h2>
-<ul>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=13728">Bug 13728</a> - [G965] Some objects in Neverwinter Nights Linux version not displayed correctly</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98699">Bug 98699</a> - &quot;float[a+++4 ? 1:1] f;&quot; crashes glsl_compiler</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99730">Bug 99730</a> - Metro Redux game(s) needs override for midshader extension declaration</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106382">Bug 106382</a> - Shader cache breaks INTEL_DEBUG=shader_time</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107117">Bug 107117</a> - mesa-18.1: regression with TFP on intel with modesettings and glamor acceleration</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107212">Bug 107212</a> - Dual-Core CPU E5500 / G45: RetroArch with reicast core results in corrupted graphics</li>
-
-</ul>
-
-<h2>Changes</h2>
-<p>Adam Jackson (1):</p>
-<ul>
-  <li>glx: GLX_MESA_multithread_makecurrent is direct-only</li>
-</ul>
-
-<p>Andres Gomez (3):</p>
-<ul>
-  <li>ddebug: use util_snprintf() in dd_get_debug_filename_and_mkdir</li>
-  <li>gallium/aux/util: use util_snprintf() in test_texture_barrier</li>
-  <li>glsl: use util_snprintf()</li>
-</ul>
-
-<p>Christian Gmeiner (1):</p>
-<ul>
-  <li>etnaviv: fix typo in query names</li>
-</ul>
-
-<p>Dave Airlie (1):</p>
-<ul>
-  <li>r600: reduce num compute threads to 1024.</li>
-</ul>
-
-<p>Dylan Baker (6):</p>
-<ul>
-  <li>docs: Add sha-256 sums for 18.1.5</li>
-  <li>nir/meson: fix c vs cpp args for nir test</li>
-  <li>gallium: fix ddebug on windows</li>
-  <li>cherry-ignore: add patches that get-pick-list is finding in error</li>
-  <li>cherry-ignore: Add some additional patches that are for 18.2</li>
-  <li>bump version to 18.1.6</li>
-</ul>
-
-<p>Emil Velikov (5):</p>
-<ul>
-  <li>swr: don't export swr_create_screen_internal</li>
-  <li>automake: require shared glapi when using DRI based libGL</li>
-  <li>autotools: error out when using the broken --with-{gl, osmesa}-lib-name</li>
-  <li>autotools: error out when building with mangling and glvnd</li>
-  <li>autotools: use correct gl.pc LIBS when using glvnd</li>
-</ul>
-
-<p>Eric Anholt (4):</p>
-<ul>
-  <li>vc4: Fix a leak of the no-vertex-elements workaround BO.</li>
-  <li>vc4: Respect a sampler view's first_layer field.</li>
-  <li>vc4: Ignore samplers for finding uniform offsets.</li>
-  <li>egl: Fix leak of X11 pixmaps backing pbuffers in DRI3.</li>
-</ul>
-
-<p>Gert Wollny (1):</p>
-<ul>
-  <li>meson, install_megadrivers: Also remove stale symlinks</li>
-</ul>
-
-<p>Jan Vesely (2):</p>
-<ul>
-  <li>clover: Reduce wait_count in abort path.</li>
-  <li>clover: Don't extend illegal integer types.</li>
-</ul>
-
-<p>Jason Ekstrand (2):</p>
-<ul>
-  <li>nir: Take if uses into account in ssa_def_components_read</li>
-  <li>i965/fs: Flag all slots of a flat input as flat</li>
-</ul>
-
-<p>Jon Turney (1):</p>
-<ul>
-  <li>meson: use correct keyword to fix a meson warning</li>
-</ul>
-
-<p>Jordan Justen (2):</p>
-<ul>
-  <li>i965, anv: Use INTEL_DEBUG for disk_cache driver flags</li>
-  <li>i965: Disable shader cache with INTEL_DEBUG=shader_time</li>
-</ul>
-
-<p>Juan A. Suarez Romero (2):</p>
-<ul>
-  <li>wayland/egl: update surface size on window resize</li>
-  <li>wayland/egl: initialize window surface size to window size</li>
-</ul>
-
-<p>Karol Herbst (2):</p>
-<ul>
-  <li>nir/lower_int64: mark all metadata as dirty</li>
-  <li>nvc0/ir: return 0 in imageLoad on incomplete textures</li>
-</ul>
-
-<p>Kenneth Graunke (1):</p>
-<ul>
-  <li>intel: Fix SIMD16 unaligned payload GRF reads on Gen4-5.</li>
-</ul>
-
-<p>Marek Olšák (1):</p>
-<ul>
-  <li>ac/surface: fix MSAA corruption on Vega due to FMASK tile swizzle</li>
-</ul>
-
-<p>Mauro Rossi (2):</p>
-<ul>
-  <li>radv: generate entrypoints for VK_ANDROID_native_buffer</li>
-  <li>radv: move vk_format_table.c to generated sources</li>
-</ul>
-
-<p>Olivier Fourdan (1):</p>
-<ul>
-  <li>dri3: For 1.2, use root window instead of pixmap drawable</li>
-</ul>
-
-<p>Tapani Pälli (1):</p>
-<ul>
-  <li>glsl: handle error case with ast_post_inc, ast_post_dec</li>
-</ul>
-
-<p>Vlad Golovkin (1):</p>
-<ul>
-  <li>swr: Remove unnecessary memset call</li>
-</ul>
-
-<p>vadym.shovkoplias (1):</p>
-<ul>
-  <li>drirc: Allow extension midshader for Metro Redux</li>
-</ul>
-
-</div>
-</body>
-</html>
diff --git a/docs/relnotes/18.1.7.html b/docs/relnotes/18.1.7.html
deleted file mode 100644
index 9f4baac..0000000
--- a/docs/relnotes/18.1.7.html
+++ /dev/null
@@ -1,104 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html lang="en">
-<head>
-  <meta http-equiv="content-type" content="text/html; charset=utf-8">
-  <title>Mesa Release Notes</title>
-  <link rel="stylesheet" type="text/css" href="../mesa.css">
-</head>
-<body>
-
-<div class="header">
-  <h1>The Mesa 3D Graphics Library</h1>
-</div>
-
-<iframe src="../contents.html"></iframe>
-<div class="content">
-
-<h1>Mesa 18.1.7 Release Notes / August 24 2018</h1>
-
-<p>
-Mesa 18.1.7 is a bug fix release which fixes bugs found since the 18.1.6 release.
-</p>
-<p>
-Mesa 18.1.7 implements the OpenGL 4.5 API, but the version reported by
-glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
-glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
-4.5 is <strong>only</strong> available if requested at context creation.
-Compatibility contexts may report a lower version depending on each driver.
-</p>
-
-
-<h2>SHA256 checksums</h2>
-<pre>
-0c3c240bcd1352d179e65993214f9d55a399beac852c3ab4433e8df9b6c51c83  mesa-18.1.7.tar.gz
-655e3b32ce3bdddd5e6e8768596e5d4bdef82d0dd37067c324cc4b2daa207306  mesa-18.1.7.tar.xz
-</pre>
-
-
-<h2>New features</h2>
-
-<p>None</p>
-
-<h2>Bug fixes</h2>
-<ul>
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105975">Bug 105975</a> - i965 always reports 0 viewport subpixel bits</li>
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107098">Bug 107098</a> - Segfault after munmap(kms_sw_dt-&gt;ro_mapped)</li>
-</ul>
-
-<h2>Changes</h2>
-<p>Alexander Tsoy (1):</p>
-<ul>
-  <li>meson: fix build for egl platform_x11 without dri3 and gbm</li>
-</ul>
-
-<p>Bas Nieuwenhuizen (1):</p>
-<ul>
-  <li>radv: Fix missing Android platform define.</li>
-</ul>
-
-<p>Danylo Piliaiev (1):</p>
-<ul>
-  <li>i965: Advertise 8 bits subpixel precision for viewport bounds on gen6+</li>
-</ul>
-
-<p>Dave Airlie (1):</p>
-<ul>
-  <li>r600/eg: rework atomic counter emission with flushes</li>
-</ul>
-
-<p>Dylan Baker (7):</p>
-<ul>
-  <li>docs: Add sha256 sums for 18.1.6</li>
-  <li>cherry-ignore: Add additional 18.2 only patches</li>
-  <li>cherry-ignore: Add more 18.2 patches</li>
-  <li>cherry-ignore: Add more 18.2 patches</li>
-  <li>cherry-ignore: Add a couple of patches with &gt; 1 fixes tags</li>
-  <li>cherry-ignore: more 18.2 patches</li>
-  <li>bump version for 18.1.7 release</li>
-</ul>
-
-<p>Jason Ekstrand (2):</p>
-<ul>
-  <li>intel: Switch the order of the 2x MSAA sample positions</li>
-  <li>anv/lower_ycbcr: Use the binding array size for bounds checks</li>
-</ul>
-
-<p>Ray Strode (1):</p>
-<ul>
-  <li>gallium/winsys/kms: don't unmap what wasn't mapped</li>
-</ul>
-
-<p>Samuel Pitoiset (1):</p>
-<ul>
-  <li>radv/winsys: fix creating the BO list for virtual buffers</li>
-</ul>
-
-<p>Timothy Arceri (1):</p>
-<ul>
-  <li>radv: add Doom workaround</li>
-</ul>
-
-</div>
-</body>
-</html>
diff --git a/docs/relnotes/18.1.8.html b/docs/relnotes/18.1.8.html
deleted file mode 100644
index 90b46f2..0000000
--- a/docs/relnotes/18.1.8.html
+++ /dev/null
@@ -1,180 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html lang="en">
-<head>
-  <meta http-equiv="content-type" content="text/html; charset=utf-8">
-  <title>Mesa Release Notes</title>
-  <link rel="stylesheet" type="text/css" href="../mesa.css">
-</head>
-<body>
-
-<div class="header">
-  <h1>The Mesa 3D Graphics Library</h1>
-</div>
-
-<iframe src="../contents.html"></iframe>
-<div class="content">
-
-<h1>Mesa 18.1.8 Release Notes / September 7 2018</h1>
-
-<p>
-Mesa 18.1.8 is a bug fix release which fixes bugs found since the 18.1.7 release.
-</p>
-<p>
-Mesa 18.1.8 implements the OpenGL 4.5 API, but the version reported by
-glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
-glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
-4.5 is <strong>only</strong> available if requested at context creation.
-Compatibility contexts may report a lower version depending on each driver.
-</p>
-
-
-<h2>SHA256 checksums</h2>
-<pre>
-8ec62f215dd1bb3910987f9941c6fc31632a0874e618815cf1e8e29445c86e0a  mesa-18.1.8.tar.gz
-bd1be67fe9c73b517765264ac28911c84144682d28dbff140e1c2deb2f44c21b  mesa-18.1.8.tar.xz
-</pre>
-
-
-<h2>New features</h2>
-<p>None</p>
-
-<h2>Bug fixes</h2>
-<ul>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93355">Bug 93355</a> - [BXT,SKLGT4e] intermittent ext_framebuffer_multisample.accuracy fails</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101247">Bug 101247</a> - Mesa fails to link GLSL programs with unused output blocks</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104809">Bug 104809</a> - anv: DOOM 2016 and Wolfenstein II:The New Colossus crash due to not having depthBoundsTest</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105904">Bug 105904</a> - Needed to delete mesa shader cache after driver upgrade for 32 bit wine vulkan programs to work.</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106738">Bug 106738</a> - No test for miptrees with DRI modifiers</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106865">Bug 106865</a> - [GLK] piglit.spec.ext_framebuffer_multisample.accuracy stencil tests fail</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107359">Bug 107359</a> - [Regression] [bisected] [OpenGL CTS] [SKL,BDW] KHR-GL46.texture_barrier*-texels, GTF-GL46.gtf21.GL2FixedTests.buffer_corners.buffer_corners, and GTF-GL46.gtf21.GL2FixedTests.stencil_plane_corners.stencil_plane_corners fail with some configuration</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107477">Bug 107477</a> - [DXVK] Setting high shader quality in GTA V results in LLVM error</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107579">Bug 107579</a> - [SNB] The graphic corruption when we reuse the GS compiled and used for TFB when statebuffer contain magic trash in the unused space</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107601">Bug 107601</a> - Rise of the Tomb Raider Segmentation Fault when the game starts</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107760">Bug 107760</a> - GPU Hang when Playing DiRT 3 Complete Edition using Steam Play with DXVK</li>
-
-</ul>
-
-<h2>Changes</h2>
-<p>Andrii Simiklit (1):</p>
-<ul>
-  <li>i965/gen6/xfb: handle case where transform feedback is not active</li>
-</ul>
-
-<p>Bas Nieuwenhuizen (3):</p>
-<ul>
-  <li>radv: Add missing checks in radv_get_image_format_properties.</li>
-  <li>radv: Fix CMASK dimensions.</li>
-  <li>radv: Use a lower max offchip buffer count.</li>
-</ul>
-
-<p>Christian Gmeiner (1):</p>
-<ul>
-  <li>tegra: fix memory leak</li>
-</ul>
-
-<p>Daniel Stone (1):</p>
-<ul>
-  <li>st/dri: Don't expose sRGB formats to clients</li>
-</ul>
-
-<p>Dave Airlie (1):</p>
-<ul>
-  <li>ac/radeonsi: fix CIK copy max size</li>
-</ul>
-
-<p>Dylan Baker (10):</p>
-<ul>
-  <li>docs: Add mesa 18.1.7 notes</li>
-  <li>cherry-ignore: add a patch</li>
-  <li>cherry-ignore: Add more 18.2 only patches</li>
-  <li>meson: Actually load translation files</li>
-  <li>cherry-ignore: Add more 18.2 patches</li>
-  <li>cherry-ignore: Add additional patch</li>
-  <li>cherry-ignore: Add patch that doesn't apply to 18.1</li>
-  <li>cherry-ignore: Add a couple of two fixes warning patches</li>
-  <li>cherry-ignore: Add patch that needs more significant patches to function</li>
-  <li>Bump version to 18.1.8</li>
-</ul>
-
-<p>Emil Velikov (1):</p>
-<ul>
-  <li>docs: update required mako version</li>
-</ul>
-
-<p>Grazvydas Ignotas (1):</p>
-<ul>
-  <li>radv: place pointer length into cache uuid</li>
-</ul>
-
-<p>Gurchetan Singh (2):</p>
-<ul>
-  <li>meson: fix egl build for surfaceless</li>
-  <li>meson: fix egl build for android</li>
-</ul>
-
-<p>Ian Romanick (2):</p>
-<ul>
-  <li>i965/vec4: Clamp indirect tes input array reads with 0x0fffffff</li>
-  <li>i965/vec4: Correctly handle uniform sources in generate_tes_add_indirect_urb_offset</li>
-</ul>
-
-<p>Jason Ekstrand (5):</p>
-<ul>
-  <li>anv: Fill holes in the VF VUE to zero</li>
-  <li>nir/algebraic: Be more careful converting ushr to extract_u8/16</li>
-  <li>egl/dri2: Add a helper for the number of planes for a FOURCC format</li>
-  <li>egl/dri2: Guard against invalid fourcc formats</li>
-  <li>anv/blorp: Do more flushing around HiZ clears</li>
-</ul>
-
-<p>Juan A. Suarez Romero (1):</p>
-<ul>
-  <li>egl/wayland: do not leak wl_buffer when it is locked</li>
-</ul>
-
-<p>Lionel Landwerlin (1):</p>
-<ul>
-  <li>anv: blorp: support multiple aspect blits</li>
-</ul>
-
-<p>Marek Olšák (1):</p>
-<ul>
-  <li>glapi: actually implement GL_EXT_robustness for GLES</li>
-</ul>
-
-<p>Nanley Chery (7):</p>
-<ul>
-  <li>intel/isl: Avoid tiling some 16K-wide render targets</li>
-  <li>i965: Make blt_pitch public</li>
-  <li>i965/miptree: Drop an if case from retile_as_linear</li>
-  <li>i965/miptree: Use the correct BLT pitch</li>
-  <li>i965/miptree: Use miptree_map in map_blit functions</li>
-  <li>i965/miptree: Fix can_blit_slice()</li>
-  <li>i965/gen7_urb: Re-emit PUSH_CONSTANT_ALLOC on some gen9</li>
-</ul>
-
-<p>Samuel Pitoiset (1):</p>
-<ul>
-  <li>radv: fix passing clip/cull distances from VS to PS</li>
-</ul>
-
-<p>vadym.shovkoplias (1):</p>
-<ul>
-  <li>glsl/linker: Allow unused in blocks which are not declated on previous stage</li>
-</ul>
-
-</div>
-</body>
-</html>
diff --git a/docs/relnotes/18.1.9.html b/docs/relnotes/18.1.9.html
deleted file mode 100644
index e141adf..0000000
--- a/docs/relnotes/18.1.9.html
+++ /dev/null
@@ -1,177 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
-<html lang="en">
-<head>
-  <meta http-equiv="content-type" content="text/html; charset=utf-8">
-  <title>Mesa Release Notes</title>
-  <link rel="stylesheet" type="text/css" href="../mesa.css">
-</head>
-<body>
-
-<div class="header">
-  <h1>The Mesa 3D Graphics Library</h1>
-</div>
-
-<iframe src="../contents.html"></iframe>
-<div class="content">
-
-<h1>Mesa 18.1.8 Release Notes / September 24 2018</h1>
-
-<p>
-Mesa 18.1.9 is a bug fix release which fixes bugs found since the 18.1.8 release.
-</p>
-<p>
-Mesa 18.1.9 implements the OpenGL 4.5 API, but the version reported by
-glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
-glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
-Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
-4.5 is <strong>only</strong> available if requested at context creation.
-Compatibility contexts may report a lower version depending on each driver.
-</p>
-
-
-<h2>SHA256 checksums</h2>
-<pre>
-TBD
-</pre>
-
-
-<h2>New features</h2>
-<p>None</p>
-
-<h2>Bug fixes</h2>
-<ul>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103241">Bug 103241</a> - Anv crashes when using 64-bit vertex inputs</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104926">Bug 104926</a> - swrast: Mesa 17.3.3 produces:  HW cursor for format 875713089 not supported</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107280">Bug 107280</a> - [DXVK] Batman: Arkham City with tessellation enabled hangs on SKL GT4</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107772">Bug 107772</a> - Mesa preprocessor matches if(def)s &amp; endifs incorrectly</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107779">Bug 107779</a> - Access violation with some games</li>
-
-<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107810">Bug 107810</a> - The 'va_end' call is missed after 'va_copy' in 'util_vsnprintf' function under windows</li>
-
-</ul>
-
-<h2>Changes</h2>
-<p>Andrii Simiklit (4):</p>
-<ul>
-  <li>apple/glx/log: added missing va_end() after va_copy()</li>
-  <li>mesa/util: don't use the same 'va_list' instance twice</li>
-  <li>mesa/util: don't ignore NULL returned from 'malloc'</li>
-  <li>mesa/util: add missing va_end() after va_copy()</li>
-</ul>
-
-<p>Bas Nieuwenhuizen (4):</p>
-<ul>
-  <li>radv: Use build ID if available for cache UUID.</li>
-  <li>radv: Only allow 16 user SGPRs for compute on GFX9+.</li>
-  <li>radv: Set the user SGPR MSB for Vega.</li>
-  <li>radv: Fix driver UUID SHA1 init.</li>
-</ul>
-
-<p>Christopher Egert (1):</p>
-<ul>
-  <li>radeon: fix ColorMask</li>
-</ul>
-
-<p>Dave Airlie (1):</p>
-<ul>
-  <li>virgl: don't send a shader create with no data. (v2)</li>
-</ul>
-
-<p>Dylan Baker (10):</p>
-<ul>
-  <li>docs/relnotes: Add sha256 sums for mesa 18.1.8</li>
-  <li>cherry-ignore: Add additional 18.2 patch</li>
-  <li>meson: Print a message about why a libdrm version was selected</li>
-  <li>cherry-ignore: add another 18.2 patch</li>
-  <li>cherry-ignore: Add patches that don't apply cleanly and are for developer tools</li>
-  <li>cherry-ignore: Add more 18.2 patches</li>
-  <li>cherry-ignore: add 18.2 patchs</li>
-  <li>cherry-ignore: add a patch that was reverted on master</li>
-  <li>cherry-ignore: one final update</li>
-  <li>Bump version to 18.1.9</li>
-</ul>
-
-<p>Erik Faye-Lund (2):</p>
-<ul>
-  <li>winsys/virgl: avoid unintended behavior</li>
-  <li>virgl: adjust strides when mapping temp-resources</li>
-</ul>
-
-<p>Gert Wollny (1):</p>
-<ul>
-  <li>winsys/virgl: correct resource and handle allocation (v2)</li>
-</ul>
-
-<p>Jason Ekstrand (6):</p>
-<ul>
-  <li>anv/pipeline: Only consider double elements which actually exist</li>
-  <li>i965: Workaround the gen9 hw astc5x5 sampler bug</li>
-  <li>anv: Re-emit vertex buffers when the pipeline changes</li>
-  <li>anv: Disable the vertex cache when tessellating on SKL GT4</li>
-  <li>anv: Clamp scissors to the framebuffer boundary</li>
-  <li>anv/query: Write both dwords in emit_zero_queries</li>
-</ul>
-
-<p>Josh Pieper (1):</p>
-<ul>
-  <li>st/mesa: Validate the result of pipe_transfer_map in make_texture (v2)</li>
-</ul>
-
-<p>Kenneth Feng (1):</p>
-<ul>
-  <li>amd: Add Picasso device id</li>
-</ul>
-
-<p>Marek Olšák (4):</p>
-<ul>
-  <li>st/mesa: help fix stencil border color for GL_DEPTH_STENCIL textures</li>
-  <li>radeonsi: fix HTILE for NPOT textures with mipmapping on SI/CI</li>
-  <li>r600: fix HTILE for NPOT textures with mipmapping</li>
-  <li>radeonsi: fix printing a BO list into ddebug reports</li>
-</ul>
-
-<p>Mathias Fröhlich (1):</p>
-<ul>
-  <li>tnl: Fix green gun regression in xonotic.</li>
-</ul>
-
-<p>Mauro Rossi (3):</p>
-<ul>
-  <li>android: broadcom/genxml: fix collision with intel/genxml header-gen macro</li>
-  <li>android: broadcom/cle: add gallium include path</li>
-  <li>android: broadcom/cle: export the broadcom top level path headers</li>
-</ul>
-
-<p>Michal Srb (1):</p>
-<ul>
-  <li>st/dri: don't set queryDmaBufFormats/queryDmaBufModifiers if the driver does not implement it</li>
-</ul>
-
-<p>Michel Dänzer (1):</p>
-<ul>
-  <li>loader/dri3: Only wait for back buffer fences in dri3_get_buffer</li>
-</ul>
-
-<p>Pierre Moreau (1):</p>
-<ul>
-  <li>nvir: Always split 64-bit IMAD/IMUL operations</li>
-</ul>
-
-<p>Sergii Romantsov (1):</p>
-<ul>
-  <li>intel: compiler option msse2 and mstackrealign</li>
-</ul>
-
-<p>Timothy Arceri (1):</p>
-<ul>
-  <li>glsl: fixer lexer for unreachable defines</li>
-</ul>
-
-</div>
-</body>
-</html>
diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html
new file mode 100644
index 0000000..968312c
--- /dev/null
+++ b/docs/relnotes/18.2.0.html
@@ -0,0 +1,284 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.0 Release Notes / September 7, 2018</h1>
+
+<p>
+Mesa 18.2.0 is a new development release. People who are concerned
+with stability and reliability should stick with a previous release or
+wait for Mesa 18.2.1.
+</p>
+<p>
+Mesa 18.2.0 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+<p>
+libwayland-egl is now distributed by Wayland (since 1.15,
+<a href="https://lists.freedesktop.org/archives/wayland-devel/2018-April/037767.html">see announcement</a>),
+and has been removed from Mesa in this release. Make sure you're using
+an up-to-date version of Wayland to keep the functionality.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+b9e6bb3eb7660b0726ba28405ffa0cb77de619e925b910b72f4d7a85c0098596  mesa-18.2.0.tar.gz
+22452bdffff8e11bf4284278155a9f77cb28d6d73a12c507f1490732d0d9ddce  mesa-18.2.0.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>OpenGL 4.3 on virgl</li>
+<li>OpenGL 4.4 Compatibility profile on radeonsi</li>
+<li>OpenGL ES 3.2 on radeonsi and virgl</li>
+<li>GL_ARB_ES3_2_compatibility on radeonsi</li>
+<li>GL_ARB_fragment_shader_interlock on i965</li>
+<li>GL_ARB_sample_locations and GL_NV_sample_locations on nvc0 (GM200+)</li>
+<li>GL_ANDROID_extension_pack_es31a on radeonsi</li>
+<li>GL_KHR_texture_compression_astc_ldr on radeonsi</li>
+<li>GL_NV_conservative_raster and GL_NV_conservative_raster_dilate on nvc0 (GM200+)</li>
+<li>GL_NV_conservative_raster_pre_snap_triangles on nvc0 (GP102+)</li>
+<li>multisampled images on nvc0 (GM107+) (now supported on GF100+)</li>
+</ul>
+
+<h2>Bug fixes</h2>
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=13728">Bug 13728</a> - [G965] Some objects in Neverwinter Nights Linux version not displayed correctly</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=61761">Bug 61761</a> - glPolygonOffsetEXT, OFFSET_BIAS incorrectly set to a huge number</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=65422">Bug 65422</a> - Rename api_validate.[ch] to draw_validate.[ch]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=78097">Bug 78097</a> - glUniform1ui and friends not supported by display lists</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91808">Bug 91808</a> - trine1 misrender r600g</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=93355">Bug 93355</a> - [BXT,SKLGT4e] intermittent ext_framebuffer_multisample.accuracy fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95009">Bug 95009</a> - [SNB] amd_shader_trinary_minmax.execution.built-in-functions.gs-mid3-ivec2-ivec2-ivec2 intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=95012">Bug 95012</a> - [SNB] glsl-1_50.execution.built-in-functions.gs-op tests intermittent</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98699">Bug 98699</a> - &quot;float[a+++4 ? 1:1] f;&quot; crashes glsl_compiler</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99116">Bug 99116</a> - Wine DirectDraw programs showing only a blackscreen when using Mesa Gallium drivers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99730">Bug 99730</a> - Metro Redux game(s) needs override for midshader extension declaration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100177">Bug 100177</a> - [GM206] Misrendering in XCOM Ennemy Within</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=100430">Bug 100430</a> - [radv] graphical glitches on dolphin emulator</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=101247">Bug 101247</a> - Mesa fails to link GLSL programs with unused output blocks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102390">Bug 102390</a> - centroid interpolation causes broken attribute values</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=102678">Bug 102678</a> - gl_BaseVertex should always be zero when the draw command has no &lt;basevertex&gt; parameter</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103274">Bug 103274</a> - BRW allocates too much heap memory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104388">Bug 104388</a> - [snb] GPU HANG: ecode 6:0:0x85fffff8 in fgfs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104626">Bug 104626</a> - broadcom/vc5: double compare</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104809">Bug 104809</a> - anv: DOOM 2016 and Wolfenstein II:The New Colossus crash due to not having depthBoundsTest</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105351">Bug 105351</a> - [Gen6+] piglit's arb_shader_image_load_store-host-mem-barrier fails with a glGetTexSubImage fallback path</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105374">Bug 105374</a> - texture3d, a SaschaWillems demo, assert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105396">Bug 105396</a> - tc compatible htile sets depth of htiles of discarded fragments to 1.0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105399">Bug 105399</a> - [snb] GPU hang: after geometry shader emits no geometry, the program hangs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105497">Bug 105497</a> - shader-db crashes on 72 core system after ast_type_qualifier bitset change</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105613">Bug 105613</a> - Compute shader locks up within nested &quot;for&quot; loop</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105731">Bug 105731</a> - linker error &quot;fragment shader input ... has no matching output in the previous stage&quot; when previous stage's output declaration in a separate shader object</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105904">Bug 105904</a> - Needed to delete mesa shader cache after driver upgrade for 32 bit wine vulkan programs to work.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105975">Bug 105975</a> - i965 always reports 0 viewport subpixel bits</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106090">Bug 106090</a> - Compiling compute shader crashes RADV</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106133">Bug 106133</a> - make check &quot;OSError: [Errno 24] Too many open files&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106163">Bug 106163</a> - r600/sb: optimizer tries to schedule access to different array elements in one instruction group</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106174">Bug 106174</a> - vulkan dota2 broken (segfaulting), found bug commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106180">Bug 106180</a> - [bisected] radv vulkan smoke test black screen (Add support for DRI3 v1.2)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106232">Bug 106232</a> - LLVM unit tests have error in random number handling</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106243">Bug 106243</a> - [kbl] GPU HANG: 9:0:0x85dffffb, in Cinnamon</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106315">Bug 106315</a> - The witness + dxvk suffers flickering garbage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106331">Bug 106331</a> - radv doesnt support VK_FORMAT_R32G32B32_SFLOAT</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106382">Bug 106382</a> - Shader cache breaks INTEL_DEBUG=shader_time</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106393">Bug 106393</a> - glsl-fs-shader-stencil-export hangs forever</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106450">Bug 106450</a> - glGetIntegerv return wrong value in some cases</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106462">Bug 106462</a> - piglit.spec.arb_vertex_array_bgra.get regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106479">Bug 106479</a> - NDEBUG not defined for libamdgpu_addrlib</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106480">Bug 106480</a> - A2B10G10R10_SNORM vertex attribute doesn't work.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106499">Bug 106499</a> - [regression, bisected] Several games crash on start</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106504">Bug 106504</a> - vulkan SPIR-V parsing failed at ../src/compiler/spirv/vtn_cfg.c:381</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106511">Bug 106511</a> - radv: MSAA broken on SI (assertion failure in vkCreateImage)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106587">Bug 106587</a> - Dota2 is very dark when using vulkan render on a Intel &lt;&lt; AMD prime setup</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106594">Bug 106594</a> - [regression,apitrace,bisected] Prison Architect rendered unplayable by multicoloured flickering triangles and overlayed triangles when performing certain actions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106619">Bug 106619</a> - [OpenCL][llvm-svn]build failure  addPassesToEmitFile candidate expects 6 arguments, 3 provided</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106629">Bug 106629</a> - [SNB,IVB,HSW,BDW] dEQP-EGL.functional.image.create.gles2_cubemap_negative_z_rgb_read_pixels</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106642">Bug 106642</a> - X server crashes in i965 on desktop startup when DRI3 v1.2 / modifier support is enabled</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106643">Bug 106643</a> - double free when exporting a temporarily imported semaphore</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106673">Bug 106673</a> - [bisected] Steam is unusable since commit 5c33e8c7</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106687">Bug 106687</a> - radv: Fast color clears use incorrect format</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106708">Bug 106708</a> - [SKL/KBL/GLK] 2-3% performance drop in SynMark DrvState and 5-9% drop on SynMark Multithread</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106748">Bug 106748</a> - st/mesa: use PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY broke qemu -display sdl,gl=on</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106756">Bug 106756</a> - Wine 3.9 crashes with DXVK on Just Cause 3 and Quantum Break on VEGA but works ON POLARIS</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106774">Bug 106774</a> - GLSL IR copy propagates loads of SSBOs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106776">Bug 106776</a> - vma_random unrecognized command line option &quot;-std=c++11&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106778">Bug 106778</a> - Files missing from tarball - intel_sanitize_gpu.*</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106779">Bug 106779</a> - Files missing from tarball - u_debug_stack_android.cpp</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106784">Bug 106784</a> - 18.1.1 autotools build fail without mako</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106801">Bug 106801</a> - vma_random_test.cpp:239:18: error: non-constant-expression cannot be narrowed from type 'unsigned long' to 'uint_fast32_t' (aka 'unsigned int') in initializer list [-Wc++11-narrowing]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106810">Bug 106810</a> - ProgramBinary does not switch program correctly when using transform feedback</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106823">Bug 106823</a> - Failed to recongnize keyword of shader code</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106830">Bug 106830</a> - [bisected] 32 bit tests (deqp, piglit, glcts, vulkancts) crashing on all platforms</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106861">Bug 106861</a> - fatal error: wayland-egl-backend.h: No such file or directory compilation terminated.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106865">Bug 106865</a> - [GLK] piglit.spec.ext_framebuffer_multisample.accuracy stencil tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106903">Bug 106903</a> - radv: Fragment shader output goes to wrong attachments when render targets are sparse</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106906">Bug 106906</a> - Failed to recongnize keyword “sampler2DRect” and &quot;sampler2DRectShadow&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106907">Bug 106907</a> - Correct Transform Feedback Varyings information is expected after using ProgramBinary</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106912">Bug 106912</a> - radv: 16-bit depth buffer causes artifacts in Shadow Warrior 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106928">Bug 106928</a> - When starting a match Rocket League crashes on &quot;Go&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106941">Bug 106941</a> - Intel ANV vulkan driver exposing version 1.1.0 which is incorrect</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106986">Bug 106986</a> - glGetQueryiv error when querying number of result bits for GL_ANY_SAMPLES_PASSED_CONSERVATIVE</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=106997">Bug 106997</a> - [Regression]. Dying light game is crashing on latest mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107098">Bug 107098</a> - Segfault after munmap(kms_sw_dt-&gt;ro_mapped)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107117">Bug 107117</a> - mesa-18.1: regression with TFP on intel with modesettings and glamor acceleration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107190">Bug 107190</a> - Got seg fault on snb when use INTEL_DEBUG=bat</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107193">Bug 107193</a> - piglit.spec.arb_compute_shader.linker.bug-93840 fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107212">Bug 107212</a> - Dual-Core CPU E5500 / G45: RetroArch with reicast core results in corrupted graphics</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107223">Bug 107223</a> - [GEN9+] 50% perf drop in SynMark Fill* tests (E2E RBC gets disabled?)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107248">Bug 107248</a> - [G45 ILK G965] Texture handling broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107275">Bug 107275</a> - NIR segfaults after spirv-opt</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107276">Bug 107276</a> - radv: OpBitfieldUExtract returns incorrect result when count is zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107295">Bug 107295</a> - Access violation on glDrawArrays with count &gt;= 2048</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107305">Bug 107305</a> - glsl/opt_copy_propagation_elements.cpp:72:9: error: delegating constructors are permitted only in C++11</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107312">Bug 107312</a> - Mesa-git RPM build fails after commit 8cacf38f527d42e41441ef8c25d95d4b2f4e8602</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107359">Bug 107359</a> - [Regression] [bisected] [OpenGL CTS] [SKL,BDW] KHR-GL46.texture_barrier*-texels, GTF-GL46.gtf21.GL2FixedTests.buffer_corners.buffer_corners, and GTF-GL46.gtf21.GL2FixedTests.stencil_plane_corners.stencil_plane_corners fail with some configuration</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107366">Bug 107366</a> - NIR verification crashes on piglit tests</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107423">Bug 107423</a> - vc4 build failure: &quot;v3d_decoder.c:893: undefined reference to `clif_lookup_bo'&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107443">Bug 107443</a> - Build error on arm64: v3d_decoder.c:837:17: error: format not a string literal and no format arguments [-Werror=format-security]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107460">Bug 107460</a> - radv: OpControlBarrier does not always work correctly (bisected)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107477">Bug 107477</a> - [DXVK] Setting high shader quality in GTA V results in LLVM error</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107510">Bug 107510</a> - [GEN8+] up to 10% perf drop on several 3D benchmarks</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107544">Bug 107544</a> - intel/decoder: out of bounds group_iter</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107550">Bug 107550</a> - &quot;0[2]&quot; as function parameter hits assert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107579">Bug 107579</a> - [SNB] The graphic corruption when we reuse the GS compiled and used for TFB when statebuffer contain magic trash in the unused space</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107601">Bug 107601</a> - Rise of the Tomb Raider Segmentation Fault when the game starts</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107610">Bug 107610</a> - Dolphin emulator mis-renders shadow overlay in Super Mario Sunshine</li>
+
+</ul>
+
+<h2>Changes</h2>
+
+<ul>
+<li>Removed GL_EXT_polygon_offset; applications should use glPolygonOffset instead.</li>
+<li>Removed libwayland-egl, now part of Wayland</li>
+</ul>
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.1.html b/docs/relnotes/18.2.1.html
new file mode 100644
index 0000000..23fb8f4
--- /dev/null
+++ b/docs/relnotes/18.2.1.html
@@ -0,0 +1,227 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.1 Release Notes / September 21, 2018</h1>
+
+<p>
+Mesa 18.2.1 is a bug fix release which fixes bugs found since the 18.2.0 release.
+</p>
+<p>
+Mesa 18.2.1 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+SHA256: 45419ccbe1bf9a2e15ffe71ced34615002e1b42c24b917fbe2b2f58ab1970562  mesa-18.2.1.tar.gz
+SHA256: 9636dc6f3d188abdcca02da97cedd73640d9035224efd5db724187d062c81056  mesa-18.2.1.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=103241">Bug 103241</a> - Anv crashes when using 64-bit vertex inputs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107280">Bug 107280</a> - [DXVK] Batman: Arkham City with tessellation enabled hangs on SKL GT4</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107772">Bug 107772</a> - Mesa preprocessor matches if(def)s &amp; endifs incorrectly</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107779">Bug 107779</a> - Access violation with some games</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107810">Bug 107810</a> - The 'va_end' call is missed after 'va_copy' in 'util_vsnprintf' function under windows</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107832">Bug 107832</a> - Gallium picking A16L16 formats when emulating INTENSITY16 conflicts with mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107843">Bug 107843</a> - 32bit Mesa build failes with meson.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107879">Bug 107879</a> - crash happens when link program</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107891">Bug 107891</a> - [wine, regression, bisected] RAGE, Wolfenstein The New Order hangs in menu</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andres Gomez (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.2.0</li>
+  <li>Revert "Revert "glsl: skip stringification in preprocessor if in unreachable branch""</li>
+  <li>cherry-ignore: i965/tools: 32bit compilation with meson</li>
+</ul>
+
+<p>Andrii Simiklit (4):</p>
+<ul>
+  <li>apple/glx/log: added missing va_end() after va_copy()</li>
+  <li>mesa/util: don't use the same 'va_list' instance twice</li>
+  <li>mesa/util: don't ignore NULL returned from 'malloc'</li>
+  <li>mesa/util: add missing va_end() after va_copy()</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (5):</p>
+<ul>
+  <li>radv: Support v3 of VK_EXT_vertex_attribute_divisor.</li>
+  <li>radv: Set the user SGPR MSB for Vega.</li>
+  <li>radv: Only allow 16 user SGPRs for compute on GFX9+.</li>
+  <li>radv: Use build ID if available for cache UUID.</li>
+  <li>radv: Fix driver UUID SHA1 init.</li>
+</ul>
+
+<p>Christopher Egert (1):</p>
+<ul>
+  <li>radeon: fix ColorMask</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>virgl: don't send a shader create with no data. (v2)</li>
+</ul>
+
+<p>Dylan Baker (1):</p>
+<ul>
+  <li>meson: Print a message about why a libdrm version was selected</li>
+</ul>
+
+<p>Eric Anholt (2):</p>
+<ul>
+  <li>v3d: Fix SRC_ALPHA_SATURATE blending for RTs without alpha.</li>
+  <li>v3d: Fix setup of the VCM cache size.</li>
+</ul>
+
+<p>Erik Faye-Lund (2):</p>
+<ul>
+  <li>winsys/virgl: avoid unintended behavior</li>
+  <li>virgl: adjust strides when mapping temp-resources</li>
+</ul>
+
+<p>Fritz Koenig (2):</p>
+<ul>
+  <li>mesa: Additional FlipY applications</li>
+  <li>mesa: FramebufferParameteri parameter checking</li>
+</ul>
+
+<p>Gert Wollny (2):</p>
+<ul>
+  <li>winsys/virgl: correct resource and handle allocation (v2)</li>
+  <li>mesa/texture: Also check for LA texture when querying intensity component size</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>i965/fs: Don't propagate conditional modifiers from integer compares to adds</li>
+</ul>
+
+<p>Jason Ekstrand (11):</p>
+<ul>
+  <li>anv/pipeline: Only consider double elements which actually exist</li>
+  <li>i965: Workaround the gen9 hw astc5x5 sampler bug</li>
+  <li>anv: Re-emit vertex buffers when the pipeline changes</li>
+  <li>anv: Disable the vertex cache when tessellating on SKL GT4</li>
+  <li>anv: Clamp scissors to the framebuffer boundary</li>
+  <li>vulkan: Update the XML and headers to 1.1.84</li>
+  <li>anv: Support v3 of VK_EXT_vertex_attribute_divisor</li>
+  <li>anv/query: Write both dwords in emit_zero_queries</li>
+  <li>nir: Add a small pass to rematerialize derefs per-block</li>
+  <li>nir/loop_unroll: Re-materialize derefs in use blocks before unrolling</li>
+  <li>nir/opt_if: Re-materialize derefs in use blocks before peeling loops</li>
+</ul>
+
+<p>Josh Pieper (1):</p>
+<ul>
+  <li>st/mesa: Validate the result of pipe_transfer_map in make_texture (v2)</li>
+</ul>
+
+<p>Juan A. Suarez Romero (2):</p>
+<ul>
+  <li>cherry-ignore: radv: fix descriptor pool allocation size</li>
+  <li>Update version to 18.2.1</li>
+</ul>
+
+<p>Kenneth Feng (1):</p>
+<ul>
+  <li>amd: Add Picasso device id</li>
+</ul>
+
+<p>Marek Olšák (5):</p>
+<ul>
+  <li>radeonsi: fix HTILE for NPOT textures with mipmapping on SI/CI</li>
+  <li>winsys/radeon: fix CMASK fast clear for NPOT textures with mipmapping on SI/CI</li>
+  <li>r600: fix HTILE for NPOT textures with mipmapping</li>
+  <li>radeonsi: fix printing a BO list into ddebug reports</li>
+  <li>ac: revert new LLVM 7.0 behavior for fdiv</li>
+</ul>
+
+<p>Mathias Fröhlich (1):</p>
+<ul>
+  <li>tnl: Fix green gun regression in xonotic.</li>
+</ul>
+
+<p>Mauro Rossi (3):</p>
+<ul>
+  <li>android: broadcom/genxml: fix collision with intel/genxml header-gen macro</li>
+  <li>android: broadcom/cle: add gallium include path</li>
+  <li>android: broadcom/cle: export the broadcom top level path headers</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>loader/dri3: Only wait for back buffer fences in dri3_get_buffer</li>
+</ul>
+
+<p>Pierre Moreau (1):</p>
+<ul>
+  <li>nvir: Always split 64-bit IMAD/IMUL operations</li>
+</ul>
+
+<p>Samuel Pitoiset (7):</p>
+<ul>
+  <li>radv: fix function names for VK_EXT_conditional_rendering</li>
+  <li>radv: fix VK_EXT_conditional_rendering visibility</li>
+  <li>radv: bump the maximum number of arguments to 64</li>
+  <li>radv: handle loc-&gt;indirect correctly for the first descriptor</li>
+  <li>radv: fix GPU hangs with 32-bit indirect descriptors</li>
+  <li>radv: fix flushing indirect descriptors</li>
+  <li>radv: fix setting global locations for indirect descriptors</li>
+</ul>
+
+<p>Sergii Romantsov (3):</p>
+<ul>
+  <li>intel: compiler option msse2 and mstackrealign</li>
+  <li>i965/tools: 32bit compilation with meson</li>
+  <li>mesa/meson: 32bit xmlconfig linkage</li>
+</ul>
+
+<p>Timothy Arceri (2):</p>
+<ul>
+  <li>glsl: fixer lexer for unreachable defines</li>
+  <li>Revert "radeonsi: avoid syncing the driver thread in si_fence_finish"</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.2.html b/docs/relnotes/18.2.2.html
new file mode 100644
index 0000000..9793c03
--- /dev/null
+++ b/docs/relnotes/18.2.2.html
@@ -0,0 +1,155 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.2 Release Notes / October 5, 2018</h1>
+
+<p>
+Mesa 18.2.2 is a bug fix release which fixes bugs found since the 18.2.1 release.
+</p>
+<p>
+Mesa 18.2.2 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+SHA256: c51711168971957037cc7e3e19e8abe1ec6eeab9cf236d419a1e7728a41cac8a  mesa-18.2.2.tar.gz
+SHA256: c3ba82b12a89d3d9fed2bdd96b4702dbb7ab675034650a8b1b718320daf073c4  mesa-18.2.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104602">Bug 104602</a> - [apitrace] Graphical artifacts in Civilization VI on RX Vega</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=104926">Bug 104926</a> - swrast: Mesa 17.3.3 produces:  HW cursor for format 875713089 not supported</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107276">Bug 107276</a> - radv: OpBitfieldUExtract returns incorrect result when count is zero</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107786">Bug 107786</a> - [DXVK] MSAA reflections are broken in GTA V</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108024">Bug 108024</a> - [Debian Stretch]Fail to build because &quot;xcb_randr_lease_t&quot;</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>pci_ids: add new polaris pci id</li>
+</ul>
+
+<p>Andres Rodriguez (1):</p>
+<ul>
+  <li>radv: only emit ZPASS_DONE for timestamp queries on gfx queues</li>
+</ul>
+
+<p>Axel Davy (3):</p>
+<ul>
+  <li>st/nine: Clamp RCP when 0*inf!=0</li>
+  <li>st/nine: Avoid redundant SetCursorPos calls</li>
+  <li>st/nine: Increase maximum number of temp registers</li>
+</ul>
+
+<p>Dylan Baker (1):</p>
+<ul>
+  <li>meson: Don't compile pipe loader with dri support when not using dri</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>vc4: Fix sin(0.0) and cos(0.0) accuracy to fix SDL rendering rotation.</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>vulkan/wsi/display: check if wsi_swapchain_init() succeeded</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>anv,radv: Implement vkAcquireNextImage2</li>
+</ul>
+
+<p>Juan A. Suarez Romero (2):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.2.1</li>
+  <li>Update version to 18.2.2</li>
+</ul>
+
+<p>Leo Liu (1):</p>
+<ul>
+  <li>radeon/uvd: use bitstream coded number for symbols of Huffman tables</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>glsl_to_tgsi: invert gl_SamplePosition.y for the default framebuffer</li>
+  <li>radeonsi: NaN should pass kill_if</li>
+</ul>
+
+<p>Maxime (1):</p>
+<ul>
+  <li>vulkan: Disable randr lease for libxcb &lt; 1.13</li>
+</ul>
+
+<p>Michal Srb (1):</p>
+<ul>
+  <li>st/dri: don't set queryDmaBufFormats/queryDmaBufModifiers if the driver does not implement it</li>
+</ul>
+
+<p>Rhys Perry (2):</p>
+<ul>
+  <li>nvc0: Update counter reading shaders to new NVC0_CB_AUX_MP_INFO</li>
+  <li>nvc0: fix bindless multisampled images on Maxwell+</li>
+</ul>
+
+<p>Samuel Iglesias Gonsálvez (1):</p>
+<ul>
+  <li>anv: Add support for protected memory properties on anv_GetPhysicalDeviceProperties2()</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: use the resolve compute path if dest uses multiple layers</li>
+</ul>
+
+<p>Stuart Young (1):</p>
+<ul>
+  <li>docs: Update FAQ with respect to s3tc support</li>
+</ul>
+
+<p>Timothy Arceri (1):</p>
+<ul>
+  <li>radeonsi: add a workaround for bitfield_extract when count is 0</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.3.html b/docs/relnotes/18.2.3.html
new file mode 100644
index 0000000..596a0a1
--- /dev/null
+++ b/docs/relnotes/18.2.3.html
@@ -0,0 +1,167 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.3 Release Notes / October 19, 2018</h1>
+
+<p>
+Mesa 18.2.3 is a bug fix release which fixes bugs found since the 18.2.2 release.
+</p>
+<p>
+Mesa 18.2.3 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+0e13e2342eae74d8848df23595c4bb4b2f8874c9e1213b8466b1fbfa7ef99375  mesa-18.2.3.tar.gz
+e2bf83c17e1abdecb1ee81af22652e27e9aa38f963e95e60f34275cc0376304f  mesa-18.2.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=99507">Bug 99507</a> - Corrupted frame contents with Vulkan version of DOTA2, Talos Principle and Sascha Willems' demos when they're run Vsynched in fullscreen</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107857">Bug 107857</a> - GPU hang - GS_EMIT without shader outputs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107926">Bug 107926</a> - [anv] Rise of the Tomb Raider always misrendering, segfault and gpu hang.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108012">Bug 108012</a> - Compiler crashes on access of non-existent member incremental operations</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyuan Zhang (1):</p>
+<ul>
+  <li>st/va: use provided sizes and coords for vlVaGetImage</li>
+</ul>
+
+<p>Dave Airlie (1):</p>
+<ul>
+  <li>anv: add missing unlock in error path.</li>
+</ul>
+
+<p>Dylan Baker (1):</p>
+<ul>
+  <li>meson: Don't allow building EGL on Windows or MacOS</li>
+</ul>
+
+<p>Emil Velikov (5):</p>
+<ul>
+  <li>st/nine: do not double-close the fd on teardown</li>
+  <li>egl: make eglSwapInterval a no-op for !window surfaces</li>
+  <li>egl: make eglSwapBuffers* a no-op for !window surfaces</li>
+  <li>vl/dri3: do full teardown on screen_destroy</li>
+  <li>Revert "mesa: remove unnecessary 'sort by year' for the GL extensions"</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>radv: add missing meson c++ visibility arguments</li>
+</ul>
+
+<p>Fritz Koenig (1):</p>
+<ul>
+  <li>i965: Replace checks for rb-&gt;Name with FlipY (v2)</li>
+</ul>
+
+<p>Gert Wollny (1):</p>
+<ul>
+  <li>virgl, vtest: Correct the transfer size calculation</li>
+</ul>
+
+<p>Ilia Mirkin (4):</p>
+<ul>
+  <li>glsl: fix array assignments of a swizzled vector</li>
+  <li>nv50,nvc0: mark RGBX_UINT formats as renderable</li>
+  <li>nv50,nvc0: guard against zero-size blits</li>
+  <li>nvc0: fix blitting red to srgb8_alpha</li>
+</ul>
+
+<p>Jason Ekstrand (7):</p>
+<ul>
+  <li>nir/cf: Remove phi sources if needed in nir_handle_add_jump</li>
+  <li>anv: Use separate MOCS settings for external BOs</li>
+  <li>intel/fs: Fix a typo in need_matching_subreg_offset</li>
+  <li>nir/from_ssa: Don't rewrite derefs destinations to registers</li>
+  <li>anv/batch_chain: Don't start a new BO just for BATCH_BUFFER_START</li>
+  <li>nir/alu_to_scalar: Use ssa_for_alu_src in hand-rolled expansions</li>
+  <li>intel: Don't propagate conditional modifiers if a UD source is negated</li>
+</ul>
+
+<p>Juan A. Suarez Romero (2):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.2.2</li>
+  <li>Update version to 18.2.3</li>
+</ul>
+
+<p>Józef Kucia (1):</p>
+<ul>
+  <li>radeonsi: avoid sending GS_EMIT in shaders without outputs</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>drirc: add a workaround for ARMA 3</li>
+</ul>
+
+<p>Samuel Pitoiset (1):</p>
+<ul>
+  <li>radv: add a workaround for a VGT hang with prim restart and strips</li>
+</ul>
+
+<p>Tapani Pälli (1):</p>
+<ul>
+  <li>glsl: do not attempt assignment if operand type not parsed correctly</li>
+</ul>
+
+<p>Timothy Arceri (11):</p>
+<ul>
+  <li>glsl: ignore trailing whitespace when define redefined</li>
+  <li>util: disable cache if we have no build-id and timestamp is zero</li>
+  <li>util: rename timestamp param in disk_cache_create()</li>
+  <li>util: add disk_cache_get_function_identifier()</li>
+  <li>radeonsi: use build-id when available for disk cache</li>
+  <li>nouveau: use build-id when available for disk cache</li>
+  <li>r600: use build-id when available for disk cache</li>
+  <li>mesa/st: add force_compat_profile option to driconfig</li>
+  <li>util: use force_compat_profile for Wolfenstein The Old Blood</li>
+  <li>util: better handle program names from wine</li>
+  <li>util: add drirc workarounds for RAGE</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>r600/sb: Fix constant-logical-operand warning.</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.4.html b/docs/relnotes/18.2.4.html
new file mode 100644
index 0000000..5da4362
--- /dev/null
+++ b/docs/relnotes/18.2.4.html
@@ -0,0 +1,154 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.4 Release Notes / October 31, 2018</h1>
+
+<p>
+Mesa 18.2.4 is a bug fix release which fixes bugs found since the 18.2.3 release.
+</p>
+<p>
+Mesa 18.2.4 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+968bfe78605e9397ddf244933b1fa62edb8429fc55aaec2ae7e20bb1c82abdea  mesa-18.2.4.tar.gz
+621d1aebb57876d5b6a5d2dcf4eb7e0620e650c6fe5cf3655c65e243adc9cb4e  mesa-18.2.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107865">Bug 107865</a> - swr fail to build with llvm-libs 6.0.1</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108272">Bug 108272</a> - [polaris10] opencl-mesa: Anything using OpenCL segfaults, XFX Radeon RX 580</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108524">Bug 108524</a> - [RADV]  GPU lockup on event synchronization</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Smith (2):</p>
+<ul>
+  <li>ac/nir: Use context-specific LLVM types</li>
+  <li>anv: Fix sanitization of stencil state when the depth test is disabled</li>
+</ul>
+
+<p>Alok Hota (2):</p>
+<ul>
+  <li>swr/rast: ignore CreateElementUnorderedAtomicMemCpy</li>
+  <li>swr/rast: fix intrinsic/function for LLVM 7 compatibility</li>
+</ul>
+
+<p>Andres Rodriguez (1):</p>
+<ul>
+  <li>radv: fix check for perftest options size</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (1):</p>
+<ul>
+  <li>radv: Emit enqueued pipeline barriers on event write.</li>
+</ul>
+
+<p>Connor Abbott (2):</p>
+<ul>
+  <li>ac: Introduce ac_build_expand()</li>
+  <li>ac: Fix loading a dvec3 from an SSBO</li>
+</ul>
+
+<p>David McFarland (1):</p>
+<ul>
+  <li>util: Change remaining uint32 cache ids to sha1</li>
+</ul>
+
+<p>Dylan Baker (1):</p>
+<ul>
+  <li>meson: don't require libelf for r600 without LLVM</li>
+</ul>
+
+<p>Elie Tournier (1):</p>
+<ul>
+  <li>gallium: Correctly handle no config context creation</li>
+</ul>
+
+<p>Eric Engestrom (1):</p>
+<ul>
+  <li>radv: s/abs/fabsf/ for floats</li>
+</ul>
+
+<p>Jan Vesely (1):</p>
+<ul>
+  <li>radeonsi: Bump number of allowed global buffers to 32</li>
+</ul>
+
+<p>Jason Ekstrand (3):</p>
+<ul>
+  <li>spirv: Use the right bit-size for spec constant ops</li>
+  <li>blorp: Emit a dummy 3DSTATE_WM prior to 3DSTATE_WM_HZ_OP</li>
+  <li>anv: Flag semaphore BOs as external</li>
+</ul>
+
+<p>Juan A. Suarez Romero (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.2.3</li>
+  <li>cherry-ignore: Revert "anv/skylake: disable ForceThreadDispatchEnable"</li>
+  <li>Update version to 18.2.4</li>
+</ul>
+
+<p>Liviu Prodea (1):</p>
+<ul>
+  <li>scons: Put to rest zombie texture_float build option.</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>radeonsi: fix a VGT hang with primitive restart on Polaris10 and later</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>loader/dri3: Also wait for front buffer fence if we triggered it</li>
+</ul>
+
+<p>Nanley Chery (1):</p>
+<ul>
+  <li>intel/blorp: Define the clear value bounds for HiZ clears</li>
+</ul>
+
+<p>Rob Clark (2):</p>
+<ul>
+  <li>freedreno: fix inorder rendering case</li>
+  <li>freedreno: don't flush when new and old pfb is identical</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.5.html b/docs/relnotes/18.2.5.html
new file mode 100644
index 0000000..ac4690f
--- /dev/null
+++ b/docs/relnotes/18.2.5.html
@@ -0,0 +1,172 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.5 Release Notes / November 15, 2018</h1>
+
+<p>
+Mesa 18.2.5 is a bug fix release which fixes bugs found since the 18.2.4 release.
+</p>
+<p>
+Mesa 18.2.5 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+dddc28928b6f4083a0d5120b58c1c8e2dc189ab5c14299c08a386607fdbbdce7  mesa-18.2.5.tar.gz
+b12c32872832e5353155e1e8026e1f1ab75bba9dc5b178d712045684d26c2b73  mesa-18.2.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=105731">Bug 105731</a> - linker error &quot;fragment shader input ... has no matching output in the previous stage&quot; when previous stage's output declaration in a separate shader object</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107511">Bug 107511</a> - KHR/khrplatform.h not always installed when needed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107626">Bug 107626</a> - [SNB] The graphical corruption and GPU hang occur sometimes on the piglit test &quot;arb_texture_multisample-large-float-texture&quot; with parameter --fp16</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108082">Bug 108082</a> - warning: unknown warning option '-Wno-format-truncation' [-Wunknown-warning-option]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108560">Bug 108560</a> - Mesa 32 is built without sse</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andre Heider (1):</p>
+<ul>
+  <li>st/nine: fix stack corruption due to ABI mismatch</li>
+</ul>
+
+<p>Andrii Simiklit (1):</p>
+<ul>
+  <li>i965/batch: don't ignore the 'brw_new_batch' call for a 'new batch'</li>
+</ul>
+
+<p>Dylan Baker (2):</p>
+<ul>
+  <li>meson: link gallium nine with pthreads</li>
+  <li>meson: fix libatomic tests</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>egl/glvnd: correctly report errors when vendor cannot be found</li>
+  <li>m4: add Werror when checking for compiler flags</li>
+</ul>
+
+<p>Eric Engestrom (6):</p>
+<ul>
+  <li>svga: add missing meson build dependency</li>
+  <li>clover: add missing meson build dependency</li>
+  <li>wsi/wayland: use proper VkResult type</li>
+  <li>wsi/wayland: only finish() a successfully init()ed display</li>
+  <li>configure: install KHR/khrplatform.h when needed</li>
+  <li>meson: install KHR/khrplatform.h when needed</li>
+</ul>
+
+<p>Gert Wollny (1):</p>
+<ul>
+  <li>virgl/vtest-winsys: Use virgl version of bind flags</li>
+</ul>
+
+<p>Jonathan Gray (1):</p>
+<ul>
+  <li>intel/tools: include stdarg.h in error2aub</li>
+</ul>
+
+<p>Juan A. Suarez Romero (4):</p>
+<ul>
+  <li>docs: add sha256 checksums for 18.2.4</li>
+  <li>cherry-ignore: add explicit 18.3 only nominations</li>
+  <li>cherry-ignore: i965/batch: avoid reverting batch buffer if saved state is an empty</li>
+  <li>Update version to 18.2.5</li>
+</ul>
+
+<p>Lionel Landwerlin (1):</p>
+<ul>
+  <li>anv/android: mark gralloc allocated BOs as external</li>
+</ul>
+
+<p>Marek Olšák (3):</p>
+<ul>
+  <li>ac: fix ac_build_fdiv for f64</li>
+  <li>st/va: fix incorrect use of resource_destroy</li>
+  <li>include: update GL &amp; GLES headers (v2)</li>
+</ul>
+
+<p>Matt Turner (2):</p>
+<ul>
+  <li>util/ralloc: Switch from DEBUG to NDEBUG</li>
+  <li>util/ralloc: Make sizeof(linear_header) a multiple of 8</li>
+</ul>
+
+<p>Olivier Fourdan (1):</p>
+<ul>
+  <li>wayland/egl: Resize EGL surface on update buffer for swrast</li>
+</ul>
+
+<p>Rhys Perry (1):</p>
+<ul>
+  <li>glsl_to_tgsi: don't create 64-bit integer MAD/FMA</li>
+</ul>
+
+<p>Samuel Pitoiset (2):</p>
+<ul>
+  <li>radv: disable conditional rendering for vkCmdCopyQueryPoolResults()</li>
+  <li>radv: only expose VK_SUBGROUP_FEATURE_ARITHMETIC_BIT for VI+</li>
+</ul>
+
+<p>Sergii Romantsov (1):</p>
+<ul>
+  <li>autotools: library-dependency when no sse and 32-bit</li>
+</ul>
+
+<p>Timothy Arceri (4):</p>
+<ul>
+  <li>st/mesa: calculate buffer size correctly for packed uniforms</li>
+  <li>st/glsl_to_nir: fix next_stage gathering</li>
+  <li>nir: add glsl_type_is_integer() helper</li>
+  <li>nir: don't pack varyings ints with floats unless flat</li>
+</ul>
+
+<p>Vadym Shovkoplias (1):</p>
+<ul>
+  <li>glsl/linker: Fix out variables linking during single stage</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>r600/sb: Fix constant logical operand in assert.</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/18.2.6.html b/docs/relnotes/18.2.6.html
new file mode 100644
index 0000000..576514d
--- /dev/null
+++ b/docs/relnotes/18.2.6.html
@@ -0,0 +1,179 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 18.2.6 Release Notes / November 28, 2018</h1>
+
+<p>
+Mesa 18.2.6 is a bug fix release which fixes bugs found since the 18.2.5 release.
+</p>
+<p>
+Mesa 18.2.6 implements the OpenGL 4.5 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.5.  OpenGL
+4.5 is <strong>only</strong> available if requested at context creation.
+Compatibility contexts may report a lower version depending on each driver.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+e0ea1236dbc6c412b02e1b5d7f838072525971a6630246fa82ae4466a6d8a587  mesa-18.2.6.tar.gz
+9ebafa4f8249df0c718e93b9ca155e3593a1239af303aa2a8b0f2056a7efdc12  mesa-18.2.6.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107626">Bug 107626</a> - [SNB] The graphical corruption and GPU hang occur sometimes on the piglit test &quot;arb_texture_multisample-large-float-texture&quot; with parameter --fp16</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=107856">Bug 107856</a> - i965 incorrectly calculates the number of layers for texture views (assert)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108630">Bug 108630</a> - [G965] piglit.spec.!opengl 1_2.tex3d-maxsize spins forever</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108713">Bug 108713</a> - Gallium: use after free with transform feedback</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=108829">Bug 108829</a> - [meson] libglapi exports internal API</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andrii Simiklit (1):</p>
+<ul>
+  <li>i965/batch: avoid reverting batch buffer if saved state is an empty</li>
+</ul>
+
+<p>Bas Nieuwenhuizen (1):</p>
+<ul>
+  <li>radv: Fix opaque metadata descriptor last layer.</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>scons/svga: remove opt from the list of valid build types</li>
+</ul>
+
+<p>Danylo Piliaiev (1):</p>
+<ul>
+  <li>i965: Fix calculation of layers array length for isl_view</li>
+</ul>
+
+<p>Dylan Baker (2):</p>
+<ul>
+  <li>meson: Don't set -Wall</li>
+  <li>meson: Don't force libva to required from auto</li>
+</ul>
+
+<p>Emil Velikov (13):</p>
+<ul>
+  <li>bin/get-pick-list.sh: simplify git oneline printing</li>
+  <li>bin/get-pick-list.sh: prefix output with "[stable] "</li>
+  <li>bin/get-pick-list.sh: handle "typod" usecase.</li>
+  <li>bin/get-pick-list.sh: handle the fixes tag</li>
+  <li>bin/get-pick-list.sh: tweak the commit sha matching pattern</li>
+  <li>bin/get-pick-list.sh: flesh out is_sha_nomination</li>
+  <li>bin/get-pick-list.sh: handle fixes tag with missing colon</li>
+  <li>bin/get-pick-list.sh: handle unofficial "broken by" tag</li>
+  <li>bin/get-pick-list.sh: use test instead of [ ]</li>
+  <li>bin/get-pick-list.sh: handle reverts prior to the branchpoint</li>
+  <li>travis: drop unneeded x11proto-xf86vidmode-dev</li>
+  <li>glx: make xf86vidmode mandatory for direct rendering</li>
+  <li>travis: adding missing x11-xcb for meson+vulkan</li>
+</ul>
+
+<p>Eric Anholt (1):</p>
+<ul>
+  <li>vc4: Make sure we make ro scanout resources for create_with_modifiers.</li>
+</ul>
+
+<p>Eric Engestrom (5):</p>
+<ul>
+  <li>meson: only run vulkan's meson.build when building vulkan</li>
+  <li>gbm: remove unnecessary meson include</li>
+  <li>meson: fix wayland-less builds</li>
+  <li>egl: add missing glvnd entrypoint for EGL_ANDROID_blob_cache</li>
+  <li>glapi: add missing visibility args</li>
+</ul>
+
+<p>Erik Faye-Lund (1):</p>
+<ul>
+  <li>mesa/main: remove bogus error for zero-sized images</li>
+</ul>
+
+<p>Gert Wollny (3):</p>
+<ul>
+  <li>mesa: Reference count shaders that are used by transform feedback objects</li>
+  <li>r600: clean up the GS ring buffers when the context is destroyed</li>
+  <li>glsl: free or reuse memory allocated for TF varying</li>
+</ul>
+
+<p>Jason Ekstrand (2):</p>
+<ul>
+  <li>nir/lower_alu_to_scalar: Don't try to lower unpack_32_2x16</li>
+  <li>anv: Put robust buffer access in the pipeline hash</li>
+</ul>
+
+<p>Juan A. Suarez Romero (6):</p>
+<ul>
+  <li>cherry-ignore: add explicit 18.3 only nominations</li>
+  <li>cherry-ignore: intel/aub_viewer: fix dynamic state printing</li>
+  <li>cherry-ignore: intel/aub_viewer: Print blend states properly</li>
+  <li>cherry-ignore: mesa/main: fix incorrect depth-error</li>
+  <li>docs: add sha256 checksums for 18.2.5</li>
+  <li>Update version to 18.2.6</li>
+</ul>
+
+<p>Karol Herbst (1):</p>
+<ul>
+  <li>nir/spirv: cast shift operand to u32</li>
+</ul>
+
+<p>Kenneth Graunke (1):</p>
+<ul>
+  <li>i965: Add PCI IDs for new Amberlake parts that are Coffeelake based</li>
+</ul>
+
+<p>Lionel Landwerlin (1):</p>
+<ul>
+  <li>egl/dri: fix error value with unknown drm format</li>
+</ul>
+
+<p>Marek Olšák (2):</p>
+<ul>
+  <li>winsys/amdgpu: fix a buffer leak in amdgpu_bo_from_handle</li>
+  <li>winsys/amdgpu: fix a device handle leak in amdgpu_winsys_create</li>
+</ul>
+
+<p>Rodrigo Vivi (4):</p>
+<ul>
+  <li>i965: Add a new CFL PCI ID.</li>
+  <li>intel: aubinator: Adding missed platforms to the error message.</li>
+  <li>intel: Introducing Amber Lake platform</li>
+  <li>intel: Introducing Whiskey Lake platform</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/specs/MESA_framebuffer_flip_y.txt b/docs/specs/MESA_framebuffer_flip_y.txt
new file mode 100644
index 0000000..697ab4e
--- /dev/null
+++ b/docs/specs/MESA_framebuffer_flip_y.txt
@@ -0,0 +1,81 @@
+Name
+
+    MESA_framebuffer_flip_y
+
+Name Strings
+
+    GL_MESA_framebuffer_flip_y
+
+Contact
+
+    Fritz Koenig <frkoenig@google.com>
+
+Contributors
+
+    Fritz Koenig, Google
+    Kristian Høgsberg, Google
+    Chad Versace, Google
+
+Status
+
+    Proposal
+
+Version
+
+    Version 1, June 7, 2018
+
+Number
+
+    302
+
+Dependencies
+
+    OpenGL ES 3.1 is required, for FramebufferParameteri.
+
+Overview
+
+    This extension defines a new framebuffer parameter,
+    GL_FRAMEBUFFER_FLIP_Y_MESA, that changes the behavior of the reads and
+    writes to the framebuffer attachment points. When GL_FRAMEBUFFER_FLIP_Y_MESA
+    is GL_TRUE, render commands and pixel transfer operations access the
+    backing store of each attachment point with a y-inverted coordinate
+    system. This y-inversion is relative to the coordinate system set when
+    GL_FRAMEBUFFER_FLIP_Y_MESA is GL_FALSE.
+
+    Access through TexSubImage2D and similar calls will notice the effect of
+    the flip when they are not attached to framebuffer objects because
+    GL_FRAMEBUFFER_FLIP_Y_MESA is associated with the framebuffer object and
+    not the attachment points.
+
+IP Status
+
+    None
+
+Issues
+
+    None
+
+New Procedures and Functions
+
+    None
+
+New Types
+
+    None
+
+New Tokens
+
+    Accepted by the <pname> argument of FramebufferParameteri and
+    GetFramebufferParameteriv:
+
+        GL_FRAMEBUFFER_FLIP_Y_MESA                      0x8BBB
+
+Errors
+
+    An INVALID_OPERATION error is generated by GetFramebufferParameteriv if the
+    default framebuffer is bound to <target> and <pname> is FRAMEBUFFER_FLIP_Y_MESA.
+
+Revision History
+
+    Version 1, June 7, 2018
+        Initial draft (Fritz Koenig)
diff --git a/docs/specs/enums.txt b/docs/specs/enums.txt
index bf3ca9c..e1b95ec 100644
--- a/docs/specs/enums.txt
+++ b/docs/specs/enums.txt
@@ -71,6 +71,9 @@
 	GL_TILE_RASTER_ORDER_INCREASING_X_MESA	0x8BB9
 	GL_TILE_RASTER_ORDER_INCREASING_Y_MESA	0x8BBA
 
+GL_MESA_framebuffer_flip_y
+	GL_FRAMEBUFFER_FLIP_Y_MESA           0x8BBB
+
 EGL_MESA_drm_image
         EGL_DRM_BUFFER_FORMAT_MESA		0x31D0
         EGL_DRM_BUFFER_USE_MESA			0x31D1
diff --git a/docs/submittingpatches.html b/docs/submittingpatches.html
index ba09aa4..e5350bd 100644
--- a/docs/submittingpatches.html
+++ b/docs/submittingpatches.html
@@ -36,7 +36,7 @@
 perhaps, in very trivial cases.)
 <li>Code patches should follow Mesa
 <a href="codingstyle.html" target="_parent">coding conventions</a>.
-<li>Whenever possible, patches should only effect individual Mesa/Gallium
+<li>Whenever possible, patches should only affect individual Mesa/Gallium
 components.
 <li>Patches should never introduce build breaks and should be bisectable (see
 <code>git bisect</code>.)
@@ -122,9 +122,9 @@
 <pre>
     $ scripts/get_reviewer.pl --help # to get the help screen
     $ scripts/get_reviewer.pl -f src/egl/drivers/dri2/platform_android.c
-    Rob Herring <robh@kernel.org> (reviewer:ANDROID EGL SUPPORT,added_lines:188/700=27%,removed_lines:58/283=20%)
-    Tomasz Figa <tfiga@chromium.org> (reviewer:ANDROID EGL SUPPORT,authored:12/41=29%,added_lines:308/700=44%,removed_lines:115/283=41%)
-    Emil Velikov <emil.l.velikov@gmail.com> (authored:13/41=32%,removed_lines:76/283=27%)
+    Rob Herring &lt;robh@kernel.org&gt; (reviewer:ANDROID EGL SUPPORT,added_lines:188/700=27%,removed_lines:58/283=20%)
+    Tomasz Figa &lt;tfiga@chromium.org&gt; (reviewer:ANDROID EGL SUPPORT,authored:12/41=29%,added_lines:308/700=44%,removed_lines:115/283=41%)
+    Emil Velikov &lt;emil.l.velikov@gmail.com&gt; (authored:13/41=32%,removed_lines:76/283=27%)
 </pre>
 </ul>
 
diff --git a/docs/utilities.html b/docs/utilities.html
index c141abe..222e734 100644
--- a/docs/utilities.html
+++ b/docs/utilities.html
@@ -31,7 +31,7 @@
   <dd>is a very useful tool for tracking down
   memory-related problems in your code.</dd>
 
-  <dt><a href="https://scan.coverity.com/projects/mesa">Coverity</a><dt>
+  <dt><a href="https://scan.coverity.com/projects/mesa">Coverity</a></dt>
   <dd>provides static code analysis of Mesa.  If you create an account
   you can see the results and try to fix outstanding issues.</dd>
 </dl>
diff --git a/docs/viewperf.html b/docs/viewperf.html
index 0eb51a5..ed89ee2 100644
--- a/docs/viewperf.html
+++ b/docs/viewperf.html
@@ -18,8 +18,8 @@
 
 <p>
 This page lists known issues with
-<a href="https://www.spec.org/gwpg/gpc.static/vp11info.html" target="_main">SPEC Viewperf 11</a>
-and <a href="https://www.spec.org/gwpg/gpc.static/vp12info.html" target="_main">SPEC Viewperf 12</a>
+<a href="https://www.spec.org/gwpg/gpc.static/vp11info.html">SPEC Viewperf 11</a>
+and <a href="https://www.spec.org/gwpg/gpc.static/vp12info.html">SPEC Viewperf 12</a>
 when running on Mesa-based drivers.
 </p>
 
@@ -66,13 +66,10 @@
 
 <p>
 These tests use features of the
-<a href="https://www.opengl.org/registry/specs/NV/fragment_program2.txt"
-target="_main">
-GL_NV_fragment_program2</a> and
-<a href="https://www.opengl.org/registry/specs/NV/vertex_program3.txt"
-target="_main">
-GL_NV_vertex_program3</a> extensions without checking if the driver supports
-them.
+<a href="https://www.opengl.org/registry/specs/NV/fragment_program2.txt">GL_NV_fragment_program2</a>
+and
+<a href="https://www.opengl.org/registry/specs/NV/vertex_program3.txt">GL_NV_vertex_program3</a>
+extensions without checking if the driver supports them.
 </p>
 <p>
 When Mesa tries to compile the vertex/fragment programs it generates errors
@@ -86,8 +83,8 @@
 
 <p>
 These tests depend on the
-<a href="https://www.opengl.org/registry/specs/NV/primitive_restart.txt"
-target="_main">GL_NV_primitive_restart</a> extension.
+<a href="https://www.opengl.org/registry/specs/NV/primitive_restart.txt">GL_NV_primitive_restart</a>
+extension.
 </p>
 
 <p>
@@ -124,7 +121,7 @@
 
 <p>
 A trace captured with
-<a href="https://github.com/apitrace/apitrace" target="_main">API trace</a>
+<a href="https://github.com/apitrace/apitrace">API trace</a>
 shows this sequences of calls like this:
 
 <pre>
diff --git a/include/GL/glcorearb.h b/include/GL/glcorearb.h
index a78bbb6..3cf945c 100644
--- a/include/GL/glcorearb.h
+++ b/include/GL/glcorearb.h
@@ -1,12 +1,12 @@
-#ifndef __glcorearb_h_
-#define __glcorearb_h_ 1
+#ifndef __gl_glcorearb_h_
+#define __gl_glcorearb_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -306,7 +306,7 @@
 typedef void (APIENTRYP PFNGLGETTEXLEVELPARAMETERFVPROC) (GLenum target, GLint level, GLenum pname, GLfloat *params);
 typedef void (APIENTRYP PFNGLGETTEXLEVELPARAMETERIVPROC) (GLenum target, GLint level, GLenum pname, GLint *params);
 typedef GLboolean (APIENTRYP PFNGLISENABLEDPROC) (GLenum cap);
-typedef void (APIENTRYP PFNGLDEPTHRANGEPROC) (GLdouble near, GLdouble far);
+typedef void (APIENTRYP PFNGLDEPTHRANGEPROC) (GLdouble n, GLdouble f);
 typedef void (APIENTRYP PFNGLVIEWPORTPROC) (GLint x, GLint y, GLsizei width, GLsizei height);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glCullFace (GLenum mode);
@@ -355,7 +355,7 @@
 GLAPI void APIENTRY glGetTexLevelParameterfv (GLenum target, GLint level, GLenum pname, GLfloat *params);
 GLAPI void APIENTRY glGetTexLevelParameteriv (GLenum target, GLint level, GLenum pname, GLint *params);
 GLAPI GLboolean APIENTRY glIsEnabled (GLenum cap);
-GLAPI void APIENTRY glDepthRange (GLdouble near, GLdouble far);
+GLAPI void APIENTRY glDepthRange (GLdouble n, GLdouble f);
 GLAPI void APIENTRY glViewport (GLint x, GLint y, GLsizei width, GLsizei height);
 #endif
 #endif /* GL_VERSION_1_0 */
@@ -613,9 +613,9 @@
 
 #ifndef GL_VERSION_1_5
 #define GL_VERSION_1_5 1
-#include <stddef.h>
-typedef ptrdiff_t GLsizeiptr;
-typedef ptrdiff_t GLintptr;
+#include <KHR/khrplatform.h>
+typedef khronos_ssize_t GLsizeiptr;
+typedef khronos_intptr_t GLintptr;
 #define GL_BUFFER_SIZE                    0x8764
 #define GL_BUFFER_USAGE                   0x8765
 #define GL_QUERY_COUNTER_BITS             0x8864
@@ -3958,6 +3958,22 @@
 #define GL_KHR_texture_compression_astc_sliced_3d 1
 #endif /* GL_KHR_texture_compression_astc_sliced_3d */
 
+#ifndef GL_AMD_framebuffer_multisample_advanced
+#define GL_AMD_framebuffer_multisample_advanced 1
+#define GL_RENDERBUFFER_STORAGE_SAMPLES_AMD 0x91B2
+#define GL_MAX_COLOR_FRAMEBUFFER_SAMPLES_AMD 0x91B3
+#define GL_MAX_COLOR_FRAMEBUFFER_STORAGE_SAMPLES_AMD 0x91B4
+#define GL_MAX_DEPTH_STENCIL_FRAMEBUFFER_SAMPLES_AMD 0x91B5
+#define GL_NUM_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B6
+#define GL_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B7
+typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glRenderbufferStorageMultisampleAdvancedAMD (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glNamedRenderbufferStorageMultisampleAdvancedAMD (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#endif
+#endif /* GL_AMD_framebuffer_multisample_advanced */
+
 #ifndef GL_AMD_performance_monitor
 #define GL_AMD_performance_monitor 1
 #define GL_COUNTER_TYPE_AMD               0x8BC0
@@ -4001,6 +4017,17 @@
 #define GL_RGB_RAW_422_APPLE              0x8A51
 #endif /* GL_APPLE_rgb_422 */
 
+#ifndef GL_EXT_EGL_image_storage
+#define GL_EXT_EGL_image_storage 1
+typedef void *GLeglImageOES;
+typedef void (APIENTRYP PFNGLEGLIMAGETARGETTEXSTORAGEEXTPROC) (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+typedef void (APIENTRYP PFNGLEGLIMAGETARGETTEXTURESTORAGEEXTPROC) (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glEGLImageTargetTexStorageEXT (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+GLAPI void APIENTRY glEGLImageTargetTextureStorageEXT (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#endif
+#endif /* GL_EXT_EGL_image_storage */
+
 #ifndef GL_EXT_debug_label
 #define GL_EXT_debug_label 1
 #define GL_PROGRAM_PIPELINE_OBJECT_EXT    0x8A4F
@@ -4598,6 +4625,19 @@
 #endif
 #endif /* GL_EXT_separate_shader_objects */
 
+#ifndef GL_EXT_shader_framebuffer_fetch
+#define GL_EXT_shader_framebuffer_fetch 1
+#define GL_FRAGMENT_SHADER_DISCARDS_SAMPLES_EXT 0x8A52
+#endif /* GL_EXT_shader_framebuffer_fetch */
+
+#ifndef GL_EXT_shader_framebuffer_fetch_non_coherent
+#define GL_EXT_shader_framebuffer_fetch_non_coherent 1
+typedef void (APIENTRYP PFNGLFRAMEBUFFERFETCHBARRIEREXTPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferFetchBarrierEXT (void);
+#endif
+#endif /* GL_EXT_shader_framebuffer_fetch_non_coherent */
+
 #ifndef GL_EXT_shader_integer_mix
 #define GL_EXT_shader_integer_mix 1
 #endif /* GL_EXT_shader_integer_mix */
@@ -4612,6 +4652,8 @@
 
 #ifndef GL_EXT_texture_filter_minmax
 #define GL_EXT_texture_filter_minmax 1
+#define GL_TEXTURE_REDUCTION_MODE_EXT     0x9366
+#define GL_WEIGHTED_AVERAGE_EXT           0x9367
 #endif /* GL_EXT_texture_filter_minmax */
 
 #ifndef GL_EXT_texture_sRGB_decode
@@ -4635,6 +4677,11 @@
 #endif
 #endif /* GL_EXT_window_rectangles */
 
+#ifndef GL_INTEL_blackhole_render
+#define GL_INTEL_blackhole_render 1
+#define GL_BLACKHOLE_RENDER_INTEL         0x83FC
+#endif /* GL_INTEL_blackhole_render */
+
 #ifndef GL_INTEL_conservative_rasterization
 #define GL_INTEL_conservative_rasterization 1
 #define GL_CONSERVATIVE_RASTERIZATION_INTEL 0x83FE
@@ -4677,7 +4724,7 @@
 typedef void (APIENTRYP PFNGLGETFIRSTPERFQUERYIDINTELPROC) (GLuint *queryId);
 typedef void (APIENTRYP PFNGLGETNEXTPERFQUERYIDINTELPROC) (GLuint queryId, GLuint *nextQueryId);
 typedef void (APIENTRYP PFNGLGETPERFCOUNTERINFOINTELPROC) (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-typedef void (APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+typedef void (APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 typedef void (APIENTRYP PFNGLGETPERFQUERYIDBYNAMEINTELPROC) (GLchar *queryName, GLuint *queryId);
 typedef void (APIENTRYP PFNGLGETPERFQUERYINFOINTELPROC) (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #ifdef GL_GLEXT_PROTOTYPES
@@ -4688,7 +4735,7 @@
 GLAPI void APIENTRY glGetFirstPerfQueryIdINTEL (GLuint *queryId);
 GLAPI void APIENTRY glGetNextPerfQueryIdINTEL (GLuint queryId, GLuint *nextQueryId);
 GLAPI void APIENTRY glGetPerfCounterInfoINTEL (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-GLAPI void APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+GLAPI void APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 GLAPI void APIENTRY glGetPerfQueryIdByNameINTEL (GLchar *queryName, GLuint *queryId);
 GLAPI void APIENTRY glGetPerfQueryInfoINTEL (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #endif
@@ -4923,6 +4970,11 @@
 #endif
 #endif /* GL_NV_conservative_raster_dilate */
 
+#ifndef GL_NV_conservative_raster_pre_snap
+#define GL_NV_conservative_raster_pre_snap 1
+#define GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_NV 0x9550
+#endif /* GL_NV_conservative_raster_pre_snap */
+
 #ifndef GL_NV_conservative_raster_pre_snap_triangles
 #define GL_NV_conservative_raster_pre_snap_triangles 1
 #define GL_CONSERVATIVE_RASTER_MODE_NV    0x954D
@@ -4934,6 +4986,10 @@
 #endif
 #endif /* GL_NV_conservative_raster_pre_snap_triangles */
 
+#ifndef GL_NV_conservative_raster_underestimation
+#define GL_NV_conservative_raster_underestimation 1
+#endif /* GL_NV_conservative_raster_underestimation */
+
 #ifndef GL_NV_draw_vulkan_image
 #define GL_NV_draw_vulkan_image 1
 typedef void (APIENTRY  *GLVULKANPROCNV)(void);
diff --git a/include/GL/glext.h b/include/GL/glext.h
index 75fd1f6..181df28 100644
--- a/include/GL/glext.h
+++ b/include/GL/glext.h
@@ -1,12 +1,12 @@
-#ifndef __glext_h_
-#define __glext_h_ 1
+#ifndef __gl_glext_h_
+#define __gl_glext_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -51,7 +51,7 @@
 #define GLAPI extern
 #endif
 
-#define GL_GLEXT_VERSION 20171010
+#define GL_GLEXT_VERSION 20180725
 
 /* Generated C header for:
  * API: gl
@@ -464,9 +464,9 @@
 
 #ifndef GL_VERSION_1_5
 #define GL_VERSION_1_5 1
-#include <stddef.h>
-typedef ptrdiff_t GLsizeiptr;
-typedef ptrdiff_t GLintptr;
+#include <KHR/khrplatform.h>
+typedef khronos_ssize_t GLsizeiptr;
+typedef khronos_intptr_t GLintptr;
 #define GL_BUFFER_SIZE                    0x8764
 #define GL_BUFFER_USAGE                   0x8765
 #define GL_QUERY_COUNTER_BITS             0x8864
@@ -4718,6 +4718,7 @@
 
 #ifndef GL_ARB_vertex_buffer_object
 #define GL_ARB_vertex_buffer_object 1
+#include <stddef.h>
 typedef ptrdiff_t GLsizeiptrARB;
 typedef ptrdiff_t GLintptrARB;
 #define GL_BUFFER_SIZE_ARB                0x8764
@@ -5445,6 +5446,22 @@
 #endif
 #endif /* GL_AMD_draw_buffers_blend */
 
+#ifndef GL_AMD_framebuffer_multisample_advanced
+#define GL_AMD_framebuffer_multisample_advanced 1
+#define GL_RENDERBUFFER_STORAGE_SAMPLES_AMD 0x91B2
+#define GL_MAX_COLOR_FRAMEBUFFER_SAMPLES_AMD 0x91B3
+#define GL_MAX_COLOR_FRAMEBUFFER_STORAGE_SAMPLES_AMD 0x91B4
+#define GL_MAX_DEPTH_STENCIL_FRAMEBUFFER_SAMPLES_AMD 0x91B5
+#define GL_NUM_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B6
+#define GL_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B7
+typedef void (APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glRenderbufferStorageMultisampleAdvancedAMD (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+GLAPI void APIENTRY glNamedRenderbufferStorageMultisampleAdvancedAMD (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#endif
+#endif /* GL_AMD_framebuffer_multisample_advanced */
+
 #ifndef GL_AMD_framebuffer_sample_positions
 #define GL_AMD_framebuffer_sample_positions 1
 #define GL_SUBSAMPLE_DISTANCE_AMD         0x883F
@@ -5709,6 +5726,10 @@
 #define GL_AMD_shader_explicit_vertex_parameter 1
 #endif /* GL_AMD_shader_explicit_vertex_parameter */
 
+#ifndef GL_AMD_shader_gpu_shader_half_float_fetch
+#define GL_AMD_shader_gpu_shader_half_float_fetch 1
+#endif /* GL_AMD_shader_gpu_shader_half_float_fetch */
+
 #ifndef GL_AMD_shader_image_load_store_lod
 #define GL_AMD_shader_image_load_store_lod 1
 #endif /* GL_AMD_shader_image_load_store_lod */
@@ -6456,6 +6477,17 @@
 #define GL_422_REV_AVERAGE_EXT            0x80CF
 #endif /* GL_EXT_422_pixels */
 
+#ifndef GL_EXT_EGL_image_storage
+#define GL_EXT_EGL_image_storage 1
+typedef void *GLeglImageOES;
+typedef void (APIENTRYP PFNGLEGLIMAGETARGETTEXSTORAGEEXTPROC) (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+typedef void (APIENTRYP PFNGLEGLIMAGETARGETTEXTURESTORAGEEXTPROC) (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glEGLImageTargetTexStorageEXT (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+GLAPI void APIENTRY glEGLImageTargetTextureStorageEXT (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#endif
+#endif /* GL_EXT_EGL_image_storage */
+
 #ifndef GL_EXT_abgr
 #define GL_EXT_abgr 1
 #define GL_ABGR_EXT                       0x8000
@@ -7994,6 +8026,8 @@
 #define GL_LAYOUT_SHADER_READ_ONLY_EXT    0x9591
 #define GL_LAYOUT_TRANSFER_SRC_EXT        0x9592
 #define GL_LAYOUT_TRANSFER_DST_EXT        0x9593
+#define GL_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_EXT 0x9530
+#define GL_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_EXT 0x9531
 typedef void (APIENTRYP PFNGLGENSEMAPHORESEXTPROC) (GLsizei n, GLuint *semaphores);
 typedef void (APIENTRYP PFNGLDELETESEMAPHORESEXTPROC) (GLsizei n, const GLuint *semaphores);
 typedef GLboolean (APIENTRYP PFNGLISSEMAPHOREEXTPROC) (GLuint semaphore);
@@ -8052,6 +8086,19 @@
 #define GL_SEPARATE_SPECULAR_COLOR_EXT    0x81FA
 #endif /* GL_EXT_separate_specular_color */
 
+#ifndef GL_EXT_shader_framebuffer_fetch
+#define GL_EXT_shader_framebuffer_fetch 1
+#define GL_FRAGMENT_SHADER_DISCARDS_SAMPLES_EXT 0x8A52
+#endif /* GL_EXT_shader_framebuffer_fetch */
+
+#ifndef GL_EXT_shader_framebuffer_fetch_non_coherent
+#define GL_EXT_shader_framebuffer_fetch_non_coherent 1
+typedef void (APIENTRYP PFNGLFRAMEBUFFERFETCHBARRIEREXTPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GLAPI void APIENTRY glFramebufferFetchBarrierEXT (void);
+#endif
+#endif /* GL_EXT_shader_framebuffer_fetch_non_coherent */
+
 #ifndef GL_EXT_shader_image_load_formatted
 #define GL_EXT_shader_image_load_formatted 1
 #endif /* GL_EXT_shader_image_load_formatted */
@@ -8352,6 +8399,8 @@
 
 #ifndef GL_EXT_texture_filter_minmax
 #define GL_EXT_texture_filter_minmax 1
+#define GL_TEXTURE_REDUCTION_MODE_EXT     0x9366
+#define GL_WEIGHTED_AVERAGE_EXT           0x9367
 #endif /* GL_EXT_texture_filter_minmax */
 
 #ifndef GL_EXT_texture_integer
@@ -9099,6 +9148,11 @@
 #define GL_INTERLACE_READ_INGR            0x8568
 #endif /* GL_INGR_interlace_read */
 
+#ifndef GL_INTEL_blackhole_render
+#define GL_INTEL_blackhole_render 1
+#define GL_BLACKHOLE_RENDER_INTEL         0x83FC
+#endif /* GL_INTEL_blackhole_render */
+
 #ifndef GL_INTEL_conservative_rasterization
 #define GL_INTEL_conservative_rasterization 1
 #define GL_CONSERVATIVE_RASTERIZATION_INTEL 0x83FE
@@ -9180,7 +9234,7 @@
 typedef void (APIENTRYP PFNGLGETFIRSTPERFQUERYIDINTELPROC) (GLuint *queryId);
 typedef void (APIENTRYP PFNGLGETNEXTPERFQUERYIDINTELPROC) (GLuint queryId, GLuint *nextQueryId);
 typedef void (APIENTRYP PFNGLGETPERFCOUNTERINFOINTELPROC) (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-typedef void (APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+typedef void (APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 typedef void (APIENTRYP PFNGLGETPERFQUERYIDBYNAMEINTELPROC) (GLchar *queryName, GLuint *queryId);
 typedef void (APIENTRYP PFNGLGETPERFQUERYINFOINTELPROC) (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #ifdef GL_GLEXT_PROTOTYPES
@@ -9191,7 +9245,7 @@
 GLAPI void APIENTRY glGetFirstPerfQueryIdINTEL (GLuint *queryId);
 GLAPI void APIENTRY glGetNextPerfQueryIdINTEL (GLuint queryId, GLuint *nextQueryId);
 GLAPI void APIENTRY glGetPerfCounterInfoINTEL (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-GLAPI void APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+GLAPI void APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 GLAPI void APIENTRY glGetPerfQueryIdByNameINTEL (GLchar *queryName, GLuint *queryId);
 GLAPI void APIENTRY glGetPerfQueryInfoINTEL (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #endif
@@ -9583,6 +9637,11 @@
 #endif
 #endif /* GL_NV_conservative_raster_dilate */
 
+#ifndef GL_NV_conservative_raster_pre_snap
+#define GL_NV_conservative_raster_pre_snap 1
+#define GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_NV 0x9550
+#endif /* GL_NV_conservative_raster_pre_snap */
+
 #ifndef GL_NV_conservative_raster_pre_snap_triangles
 #define GL_NV_conservative_raster_pre_snap_triangles 1
 #define GL_CONSERVATIVE_RASTER_MODE_NV    0x954D
@@ -9594,6 +9653,10 @@
 #endif
 #endif /* GL_NV_conservative_raster_pre_snap_triangles */
 
+#ifndef GL_NV_conservative_raster_underestimation
+#define GL_NV_conservative_raster_underestimation 1
+#endif /* GL_NV_conservative_raster_underestimation */
+
 #ifndef GL_NV_copy_depth_to_color
 #define GL_NV_copy_depth_to_color 1
 #define GL_DEPTH_STENCIL_TO_RGBA_NV       0x886E
@@ -9902,7 +9965,7 @@
 #define GL_PER_GPU_STORAGE_NV             0x9548
 #define GL_MULTICAST_PROGRAMMABLE_SAMPLE_LOCATION_NV 0x9549
 typedef void (APIENTRYP PFNGLRENDERGPUMASKNVPROC) (GLbitfield mask);
-typedef void (APIENTRYP PFNGLMULTICASTBUFFERSUBDATANVPROC) (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const GLvoid *data);
+typedef void (APIENTRYP PFNGLMULTICASTBUFFERSUBDATANVPROC) (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
 typedef void (APIENTRYP PFNGLMULTICASTCOPYBUFFERSUBDATANVPROC) (GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
 typedef void (APIENTRYP PFNGLMULTICASTCOPYIMAGESUBDATANVPROC) (GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
 typedef void (APIENTRYP PFNGLMULTICASTBLITFRAMEBUFFERNVPROC) (GLuint srcGpu, GLuint dstGpu, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
@@ -9915,7 +9978,7 @@
 typedef void (APIENTRYP PFNGLMULTICASTGETQUERYOBJECTUI64VNVPROC) (GLuint gpu, GLuint id, GLenum pname, GLuint64 *params);
 #ifdef GL_GLEXT_PROTOTYPES
 GLAPI void APIENTRY glRenderGpuMaskNV (GLbitfield mask);
-GLAPI void APIENTRY glMulticastBufferSubDataNV (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const GLvoid *data);
+GLAPI void APIENTRY glMulticastBufferSubDataNV (GLbitfield gpuMask, GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data);
 GLAPI void APIENTRY glMulticastCopyBufferSubDataNV (GLuint readGpu, GLbitfield writeGpuMask, GLuint readBuffer, GLuint writeBuffer, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
 GLAPI void APIENTRY glMulticastCopyImageSubDataNV (GLuint srcGpu, GLbitfield dstGpuMask, GLuint srcName, GLenum srcTarget, GLint srcLevel, GLint srcX, GLint srcY, GLint srcZ, GLuint dstName, GLenum dstTarget, GLint dstLevel, GLint dstX, GLint dstY, GLint dstZ, GLsizei srcWidth, GLsizei srcHeight, GLsizei srcDepth);
 GLAPI void APIENTRY glMulticastBlitFramebufferNV (GLuint srcGpu, GLuint dstGpu, GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1, GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1, GLbitfield mask, GLenum filter);
diff --git a/include/GL/glxext.h b/include/GL/glxext.h
index 0f60a38..4c984ef 100644
--- a/include/GL/glxext.h
+++ b/include/GL/glxext.h
@@ -1,12 +1,12 @@
-#ifndef __glxext_h_
-#define __glxext_h_ 1
+#ifndef __glx_glxext_h_
+#define __glx_glxext_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -34,7 +34,7 @@
 **   https://github.com/KhronosGroup/OpenGL-Registry
 */
 
-#define GLX_GLXEXT_VERSION 20170728
+#define GLX_GLXEXT_VERSION 20180525
 
 /* Generated C header for:
  * API: glx
@@ -325,6 +325,10 @@
 #define GLX_VENDOR_NAMES_EXT              0x20F6
 #endif /* GLX_EXT_libglvnd */
 
+#ifndef GLX_EXT_no_config_context
+#define GLX_EXT_no_config_context 1
+#endif /* GLX_EXT_no_config_context */
+
 #ifndef GLX_EXT_stereo_tree
 #define GLX_EXT_stereo_tree 1
 typedef struct {
@@ -503,6 +507,16 @@
 #endif
 #endif /* GLX_MESA_set_3dfx_mode */
 
+#ifndef GLX_MESA_swap_control
+#define GLX_MESA_swap_control 1
+typedef int ( *PFNGLXGETSWAPINTERVALMESAPROC) (void);
+typedef int ( *PFNGLXSWAPINTERVALMESAPROC) (unsigned int interval);
+#ifdef GLX_GLXEXT_PROTOTYPES
+int glXGetSwapIntervalMESA (void);
+int glXSwapIntervalMESA (unsigned int interval);
+#endif
+#endif /* GLX_MESA_swap_control */
+
 #ifndef GLX_NV_copy_buffer
 #define GLX_NV_copy_buffer 1
 typedef void ( *PFNGLXCOPYBUFFERSUBDATANVPROC) (Display *dpy, GLXContext readCtx, GLXContext writeCtx, GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size);
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index 4f4795c..c32cdd3 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -589,7 +589,7 @@
  * SWRast Loader extension.
  */
 #define __DRI_SWRAST_LOADER "DRI_SWRastLoader"
-#define __DRI_SWRAST_LOADER_VERSION 3
+#define __DRI_SWRAST_LOADER_VERSION 4
 struct __DRIswrastLoaderExtensionRec {
     __DRIextension base;
 
@@ -631,6 +631,24 @@
    void (*getImage2)(__DRIdrawable *readable,
 		     int x, int y, int width, int height, int stride,
 		     char *data, void *loaderPrivate);
+
+    /**
+     * Put shm image to drawable
+     *
+     * \since 4
+     */
+    void (*putImageShm)(__DRIdrawable *drawable, int op,
+                        int x, int y, int width, int height, int stride,
+                        int shmid, char *shmaddr, unsigned offset,
+                        void *loaderPrivate);
+    /**
+     * Get shm image from readable
+     *
+     * \since 4
+     */
+    void (*getImageShm)(__DRIdrawable *readable,
+                        int x, int y, int width, int height,
+                        int shmid, void *loaderPrivate);
 };
 
 /**
@@ -1253,6 +1271,7 @@
 #define __DRI_IMAGE_FORMAT_YUYV         0x100f
 #define __DRI_IMAGE_FORMAT_XBGR2101010  0x1010
 #define __DRI_IMAGE_FORMAT_ABGR2101010  0x1011
+#define __DRI_IMAGE_FORMAT_SABGR8       0x1012
 
 #define __DRI_IMAGE_USE_SHARE		0x0001
 #define __DRI_IMAGE_USE_SCANOUT		0x0002
@@ -1289,6 +1308,7 @@
 #define __DRI_IMAGE_FOURCC_ABGR8888	0x34324241
 #define __DRI_IMAGE_FOURCC_XBGR8888	0x34324258
 #define __DRI_IMAGE_FOURCC_SARGB8888	0x83324258
+#define __DRI_IMAGE_FOURCC_SABGR8888	0x84324258
 #define __DRI_IMAGE_FOURCC_ARGB2101010	0x30335241
 #define __DRI_IMAGE_FOURCC_XRGB2101010	0x30335258
 #define __DRI_IMAGE_FOURCC_ABGR2101010	0x30334241
diff --git a/include/GLES2/gl2.h b/include/GLES2/gl2.h
index 8ba907c..b4051e5 100644
--- a/include/GLES2/gl2.h
+++ b/include/GLES2/gl2.h
@@ -1,12 +1,12 @@
-#ifndef __gl2_h_
-#define __gl2_h_ 1
+#ifndef __gles2_gl2_h_
+#define __gles2_gl2_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -44,7 +44,7 @@
 #define GL_GLES_PROTOTYPES 1
 #endif
 
-/* Generated on date 20170606 */
+/* Generated on date 20180725 */
 
 /* Generated C header for:
  * API: gles2
diff --git a/include/GLES2/gl2ext.h b/include/GLES2/gl2ext.h
index a7d19a1..559173d 100644
--- a/include/GLES2/gl2ext.h
+++ b/include/GLES2/gl2ext.h
@@ -1,12 +1,12 @@
-#ifndef __gl2ext_h_
-#define __gl2ext_h_ 1
+#ifndef __gles2_gl2ext_h_
+#define __gles2_gl2ext_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -38,7 +38,7 @@
 #define GL_APIENTRYP GL_APIENTRY*
 #endif
 
-/* Generated on date 20170804 */
+/* Generated on date 20180725 */
 
 /* Generated C header for:
  * API: gles2
@@ -159,6 +159,16 @@
 #define GL_CONTEXT_FLAG_NO_ERROR_BIT_KHR  0x00000008
 #endif /* GL_KHR_no_error */
 
+#ifndef GL_KHR_parallel_shader_compile
+#define GL_KHR_parallel_shader_compile 1
+#define GL_MAX_SHADER_COMPILER_THREADS_KHR 0x91B0
+#define GL_COMPLETION_STATUS_KHR          0x91B1
+typedef void (GL_APIENTRYP PFNGLMAXSHADERCOMPILERTHREADSKHRPROC) (GLuint count);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glMaxShaderCompilerThreadsKHR (GLuint count);
+#endif
+#endif /* GL_KHR_parallel_shader_compile */
+
 #ifndef GL_KHR_robust_buffer_access_behavior
 #define GL_KHR_robust_buffer_access_behavior 1
 #endif /* GL_KHR_robust_buffer_access_behavior */
@@ -791,6 +801,22 @@
 #define GL_ATC_RGBA_INTERPOLATED_ALPHA_AMD 0x87EE
 #endif /* GL_AMD_compressed_ATC_texture */
 
+#ifndef GL_AMD_framebuffer_multisample_advanced
+#define GL_AMD_framebuffer_multisample_advanced 1
+#define GL_RENDERBUFFER_STORAGE_SAMPLES_AMD 0x91B2
+#define GL_MAX_COLOR_FRAMEBUFFER_SAMPLES_AMD 0x91B3
+#define GL_MAX_COLOR_FRAMEBUFFER_STORAGE_SAMPLES_AMD 0x91B4
+#define GL_MAX_DEPTH_STENCIL_FRAMEBUFFER_SAMPLES_AMD 0x91B5
+#define GL_NUM_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B6
+#define GL_SUPPORTED_MULTISAMPLE_MODES_AMD 0x91B7
+typedef void (GL_APIENTRYP PFNGLRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+typedef void (GL_APIENTRYP PFNGLNAMEDRENDERBUFFERSTORAGEMULTISAMPLEADVANCEDAMDPROC) (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glRenderbufferStorageMultisampleAdvancedAMD (GLenum target, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+GL_APICALL void GL_APIENTRY glNamedRenderbufferStorageMultisampleAdvancedAMD (GLuint renderbuffer, GLsizei samples, GLsizei storageSamples, GLenum internalformat, GLsizei width, GLsizei height);
+#endif
+#endif /* GL_AMD_framebuffer_multisample_advanced */
+
 #ifndef GL_AMD_performance_monitor
 #define GL_AMD_performance_monitor 1
 #define GL_COUNTER_TYPE_AMD               0x8BC0
@@ -1055,6 +1081,16 @@
 #define GL_EXT_EGL_image_array 1
 #endif /* GL_EXT_EGL_image_array */
 
+#ifndef GL_EXT_EGL_image_storage
+#define GL_EXT_EGL_image_storage 1
+typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETTEXSTORAGEEXTPROC) (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+typedef void (GL_APIENTRYP PFNGLEGLIMAGETARGETTEXTURESTORAGEEXTPROC) (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glEGLImageTargetTexStorageEXT (GLenum target, GLeglImageOES image, const GLint* attrib_list);
+GL_APICALL void GL_APIENTRY glEGLImageTargetTextureStorageEXT (GLuint texture, GLeglImageOES image, const GLint* attrib_list);
+#endif
+#endif /* GL_EXT_EGL_image_storage */
+
 #ifndef GL_EXT_YUV_target
 #define GL_EXT_YUV_target 1
 #define GL_SAMPLER_EXTERNAL_2D_Y2Y_EXT    0x8BE7
@@ -1126,6 +1162,20 @@
 #endif
 #endif /* GL_EXT_clear_texture */
 
+#ifndef GL_EXT_clip_control
+#define GL_EXT_clip_control 1
+#define GL_LOWER_LEFT_EXT                 0x8CA1
+#define GL_UPPER_LEFT_EXT                 0x8CA2
+#define GL_NEGATIVE_ONE_TO_ONE_EXT        0x935E
+#define GL_ZERO_TO_ONE_EXT                0x935F
+#define GL_CLIP_ORIGIN_EXT                0x935C
+#define GL_CLIP_DEPTH_MODE_EXT            0x935D
+typedef void (GL_APIENTRYP PFNGLCLIPCONTROLEXTPROC) (GLenum origin, GLenum depth);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glClipControlEXT (GLenum origin, GLenum depth);
+#endif
+#endif /* GL_EXT_clip_control */
+
 #ifndef GL_EXT_clip_cull_distance
 #define GL_EXT_clip_cull_distance 1
 #define GL_MAX_CLIP_DISTANCES_EXT         0x0D32
@@ -1680,6 +1730,8 @@
 #define GL_LAYOUT_SHADER_READ_ONLY_EXT    0x9591
 #define GL_LAYOUT_TRANSFER_SRC_EXT        0x9592
 #define GL_LAYOUT_TRANSFER_DST_EXT        0x9593
+#define GL_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_EXT 0x9530
+#define GL_LAYOUT_DEPTH_ATTACHMENT_STENCIL_READ_ONLY_EXT 0x9531
 typedef void (GL_APIENTRYP PFNGLGENSEMAPHORESEXTPROC) (GLsizei n, GLuint *semaphores);
 typedef void (GL_APIENTRYP PFNGLDELETESEMAPHORESEXTPROC) (GLsizei n, const GLuint *semaphores);
 typedef GLboolean (GL_APIENTRYP PFNGLISSEMAPHOREEXTPROC) (GLuint semaphore);
@@ -1823,6 +1875,14 @@
 #define GL_FRAGMENT_SHADER_DISCARDS_SAMPLES_EXT 0x8A52
 #endif /* GL_EXT_shader_framebuffer_fetch */
 
+#ifndef GL_EXT_shader_framebuffer_fetch_non_coherent
+#define GL_EXT_shader_framebuffer_fetch_non_coherent 1
+typedef void (GL_APIENTRYP PFNGLFRAMEBUFFERFETCHBARRIEREXTPROC) (void);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glFramebufferFetchBarrierEXT (void);
+#endif
+#endif /* GL_EXT_shader_framebuffer_fetch_non_coherent */
+
 #ifndef GL_EXT_shader_group_vote
 #define GL_EXT_shader_group_vote 1
 #endif /* GL_EXT_shader_group_vote */
@@ -2067,12 +2127,24 @@
 
 #ifndef GL_EXT_texture_filter_minmax
 #define GL_EXT_texture_filter_minmax 1
+#define GL_TEXTURE_REDUCTION_MODE_EXT     0x9366
+#define GL_WEIGHTED_AVERAGE_EXT           0x9367
 #endif /* GL_EXT_texture_filter_minmax */
 
 #ifndef GL_EXT_texture_format_BGRA8888
 #define GL_EXT_texture_format_BGRA8888 1
 #endif /* GL_EXT_texture_format_BGRA8888 */
 
+#ifndef GL_EXT_texture_format_sRGB_override
+#define GL_EXT_texture_format_sRGB_override 1
+#define GL_TEXTURE_FORMAT_SRGB_OVERRIDE_EXT 0x8FBF
+#endif /* GL_EXT_texture_format_sRGB_override */
+
+#ifndef GL_EXT_texture_mirror_clamp_to_edge
+#define GL_EXT_texture_mirror_clamp_to_edge 1
+#define GL_MIRROR_CLAMP_TO_EDGE_EXT       0x8743
+#endif /* GL_EXT_texture_mirror_clamp_to_edge */
+
 #ifndef GL_EXT_texture_norm16
 #define GL_EXT_texture_norm16 1
 #define GL_R16_EXT                        0x822A
@@ -2275,6 +2347,11 @@
 #define GL_CUBIC_MIPMAP_LINEAR_IMG        0x913B
 #endif /* GL_IMG_texture_filter_cubic */
 
+#ifndef GL_INTEL_blackhole_render
+#define GL_INTEL_blackhole_render 1
+#define GL_BLACKHOLE_RENDER_INTEL         0x83FC
+#endif /* GL_INTEL_blackhole_render */
+
 #ifndef GL_INTEL_conservative_rasterization
 #define GL_INTEL_conservative_rasterization 1
 #define GL_CONSERVATIVE_RASTERIZATION_INTEL 0x83FE
@@ -2317,7 +2394,7 @@
 typedef void (GL_APIENTRYP PFNGLGETFIRSTPERFQUERYIDINTELPROC) (GLuint *queryId);
 typedef void (GL_APIENTRYP PFNGLGETNEXTPERFQUERYIDINTELPROC) (GLuint queryId, GLuint *nextQueryId);
 typedef void (GL_APIENTRYP PFNGLGETPERFCOUNTERINFOINTELPROC) (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-typedef void (GL_APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+typedef void (GL_APIENTRYP PFNGLGETPERFQUERYDATAINTELPROC) (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 typedef void (GL_APIENTRYP PFNGLGETPERFQUERYIDBYNAMEINTELPROC) (GLchar *queryName, GLuint *queryId);
 typedef void (GL_APIENTRYP PFNGLGETPERFQUERYINFOINTELPROC) (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #ifdef GL_GLEXT_PROTOTYPES
@@ -2328,12 +2405,17 @@
 GL_APICALL void GL_APIENTRY glGetFirstPerfQueryIdINTEL (GLuint *queryId);
 GL_APICALL void GL_APIENTRY glGetNextPerfQueryIdINTEL (GLuint queryId, GLuint *nextQueryId);
 GL_APICALL void GL_APIENTRY glGetPerfCounterInfoINTEL (GLuint queryId, GLuint counterId, GLuint counterNameLength, GLchar *counterName, GLuint counterDescLength, GLchar *counterDesc, GLuint *counterOffset, GLuint *counterDataSize, GLuint *counterTypeEnum, GLuint *counterDataTypeEnum, GLuint64 *rawCounterMaxValue);
-GL_APICALL void GL_APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, GLvoid *data, GLuint *bytesWritten);
+GL_APICALL void GL_APIENTRY glGetPerfQueryDataINTEL (GLuint queryHandle, GLuint flags, GLsizei dataSize, void *data, GLuint *bytesWritten);
 GL_APICALL void GL_APIENTRY glGetPerfQueryIdByNameINTEL (GLchar *queryName, GLuint *queryId);
 GL_APICALL void GL_APIENTRY glGetPerfQueryInfoINTEL (GLuint queryId, GLuint queryNameLength, GLchar *queryName, GLuint *dataSize, GLuint *noCounters, GLuint *noInstances, GLuint *capsMask);
 #endif
 #endif /* GL_INTEL_performance_query */
 
+#ifndef GL_MESA_framebuffer_flip_y
+#define GL_MESA_framebuffer_flip_y 1
+#define GL_FRAMEBUFFER_FLIP_Y_MESA        0x8BBB
+#endif /* GL_MESA_framebuffer_flip_y */
+
 #ifndef GL_MESA_program_binary_formats
 #define GL_MESA_program_binary_formats 1
 #define GL_PROGRAM_BINARY_FORMAT_MESA     0x875F
@@ -2449,6 +2531,17 @@
 #define GL_FACTOR_MAX_AMD                 0x901D
 #endif /* GL_NV_blend_minmax_factor */
 
+#ifndef GL_NV_clip_space_w_scaling
+#define GL_NV_clip_space_w_scaling 1
+#define GL_VIEWPORT_POSITION_W_SCALE_NV   0x937C
+#define GL_VIEWPORT_POSITION_W_SCALE_X_COEFF_NV 0x937D
+#define GL_VIEWPORT_POSITION_W_SCALE_Y_COEFF_NV 0x937E
+typedef void (GL_APIENTRYP PFNGLVIEWPORTPOSITIONWSCALENVPROC) (GLuint index, GLfloat xcoeff, GLfloat ycoeff);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glViewportPositionWScaleNV (GLuint index, GLfloat xcoeff, GLfloat ycoeff);
+#endif
+#endif /* GL_NV_clip_space_w_scaling */
+
 #ifndef GL_NV_conditional_render
 #define GL_NV_conditional_render 1
 #define GL_QUERY_WAIT_NV                  0x8E13
@@ -2475,6 +2568,11 @@
 #endif
 #endif /* GL_NV_conservative_raster */
 
+#ifndef GL_NV_conservative_raster_pre_snap
+#define GL_NV_conservative_raster_pre_snap 1
+#define GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_NV 0x9550
+#endif /* GL_NV_conservative_raster_pre_snap */
+
 #ifndef GL_NV_conservative_raster_pre_snap_triangles
 #define GL_NV_conservative_raster_pre_snap_triangles 1
 #define GL_CONSERVATIVE_RASTER_MODE_NV    0x954D
@@ -2846,6 +2944,7 @@
 
 #ifndef GL_NV_path_rendering
 #define GL_NV_path_rendering 1
+typedef double GLdouble;
 #define GL_PATH_FORMAT_SVG_NV             0x9070
 #define GL_PATH_FORMAT_PS_NV              0x9071
 #define GL_STANDARD_FONT_NAME_NV          0x9072
@@ -3056,6 +3155,25 @@
 typedef GLenum (GL_APIENTRYP PFNGLPATHMEMORYGLYPHINDEXARRAYNVPROC) (GLuint firstPathName, GLenum fontTarget, GLsizeiptr fontSize, const void *fontData, GLsizei faceIndex, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
 typedef void (GL_APIENTRYP PFNGLPROGRAMPATHFRAGMENTINPUTGENNVPROC) (GLuint program, GLint location, GLenum genMode, GLint components, const GLfloat *coeffs);
 typedef void (GL_APIENTRYP PFNGLGETPROGRAMRESOURCEFVNVPROC) (GLuint program, GLenum programInterface, GLuint index, GLsizei propCount, const GLenum *props, GLsizei bufSize, GLsizei *length, GLfloat *params);
+typedef void (GL_APIENTRYP PFNGLMATRIXFRUSTUMEXTPROC) (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+typedef void (GL_APIENTRYP PFNGLMATRIXLOADIDENTITYEXTPROC) (GLenum mode);
+typedef void (GL_APIENTRYP PFNGLMATRIXLOADTRANSPOSEFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXLOADTRANSPOSEDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXLOADFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXLOADDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXMULTTRANSPOSEFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXMULTTRANSPOSEDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXMULTFEXTPROC) (GLenum mode, const GLfloat *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXMULTDEXTPROC) (GLenum mode, const GLdouble *m);
+typedef void (GL_APIENTRYP PFNGLMATRIXORTHOEXTPROC) (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+typedef void (GL_APIENTRYP PFNGLMATRIXPOPEXTPROC) (GLenum mode);
+typedef void (GL_APIENTRYP PFNGLMATRIXPUSHEXTPROC) (GLenum mode);
+typedef void (GL_APIENTRYP PFNGLMATRIXROTATEFEXTPROC) (GLenum mode, GLfloat angle, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GL_APIENTRYP PFNGLMATRIXROTATEDEXTPROC) (GLenum mode, GLdouble angle, GLdouble x, GLdouble y, GLdouble z);
+typedef void (GL_APIENTRYP PFNGLMATRIXSCALEFEXTPROC) (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GL_APIENTRYP PFNGLMATRIXSCALEDEXTPROC) (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+typedef void (GL_APIENTRYP PFNGLMATRIXTRANSLATEFEXTPROC) (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+typedef void (GL_APIENTRYP PFNGLMATRIXTRANSLATEDEXTPROC) (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
 #ifdef GL_GLEXT_PROTOTYPES
 GL_APICALL GLuint GL_APIENTRY glGenPathsNV (GLsizei range);
 GL_APICALL void GL_APIENTRY glDeletePathsNV (GLuint path, GLsizei range);
@@ -3114,6 +3232,25 @@
 GL_APICALL GLenum GL_APIENTRY glPathMemoryGlyphIndexArrayNV (GLuint firstPathName, GLenum fontTarget, GLsizeiptr fontSize, const void *fontData, GLsizei faceIndex, GLuint firstGlyphIndex, GLsizei numGlyphs, GLuint pathParameterTemplate, GLfloat emScale);
 GL_APICALL void GL_APIENTRY glProgramPathFragmentInputGenNV (GLuint program, GLint location, GLenum genMode, GLint components, const GLfloat *coeffs);
 GL_APICALL void GL_APIENTRY glGetProgramResourcefvNV (GLuint program, GLenum programInterface, GLuint index, GLsizei propCount, const GLenum *props, GLsizei bufSize, GLsizei *length, GLfloat *params);
+GL_APICALL void GL_APIENTRY glMatrixFrustumEXT (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+GL_APICALL void GL_APIENTRY glMatrixLoadIdentityEXT (GLenum mode);
+GL_APICALL void GL_APIENTRY glMatrixLoadTransposefEXT (GLenum mode, const GLfloat *m);
+GL_APICALL void GL_APIENTRY glMatrixLoadTransposedEXT (GLenum mode, const GLdouble *m);
+GL_APICALL void GL_APIENTRY glMatrixLoadfEXT (GLenum mode, const GLfloat *m);
+GL_APICALL void GL_APIENTRY glMatrixLoaddEXT (GLenum mode, const GLdouble *m);
+GL_APICALL void GL_APIENTRY glMatrixMultTransposefEXT (GLenum mode, const GLfloat *m);
+GL_APICALL void GL_APIENTRY glMatrixMultTransposedEXT (GLenum mode, const GLdouble *m);
+GL_APICALL void GL_APIENTRY glMatrixMultfEXT (GLenum mode, const GLfloat *m);
+GL_APICALL void GL_APIENTRY glMatrixMultdEXT (GLenum mode, const GLdouble *m);
+GL_APICALL void GL_APIENTRY glMatrixOrthoEXT (GLenum mode, GLdouble left, GLdouble right, GLdouble bottom, GLdouble top, GLdouble zNear, GLdouble zFar);
+GL_APICALL void GL_APIENTRY glMatrixPopEXT (GLenum mode);
+GL_APICALL void GL_APIENTRY glMatrixPushEXT (GLenum mode);
+GL_APICALL void GL_APIENTRY glMatrixRotatefEXT (GLenum mode, GLfloat angle, GLfloat x, GLfloat y, GLfloat z);
+GL_APICALL void GL_APIENTRY glMatrixRotatedEXT (GLenum mode, GLdouble angle, GLdouble x, GLdouble y, GLdouble z);
+GL_APICALL void GL_APIENTRY glMatrixScalefEXT (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+GL_APICALL void GL_APIENTRY glMatrixScaledEXT (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
+GL_APICALL void GL_APIENTRY glMatrixTranslatefEXT (GLenum mode, GLfloat x, GLfloat y, GLfloat z);
+GL_APICALL void GL_APIENTRY glMatrixTranslatedEXT (GLenum mode, GLdouble x, GLdouble y, GLdouble z);
 #endif
 #endif /* GL_NV_path_rendering */
 
@@ -3225,6 +3362,10 @@
 #define GL_SAMPLER_CUBE_SHADOW_NV         0x8DC5
 #endif /* GL_NV_shadow_samplers_cube */
 
+#ifndef GL_NV_stereo_view_rendering
+#define GL_NV_stereo_view_rendering 1
+#endif /* GL_NV_stereo_view_rendering */
+
 #ifndef GL_NV_texture_border_clamp
 #define GL_NV_texture_border_clamp 1
 #define GL_TEXTURE_BORDER_COLOR_NV        0x1004
@@ -3427,6 +3568,19 @@
 #endif
 #endif /* GL_QCOM_shader_framebuffer_fetch_noncoherent */
 
+#ifndef GL_QCOM_texture_foveated
+#define GL_QCOM_texture_foveated 1
+#define GL_TEXTURE_FOVEATED_FEATURE_BITS_QCOM 0x8BFB
+#define GL_TEXTURE_FOVEATED_MIN_PIXEL_DENSITY_QCOM 0x8BFC
+#define GL_TEXTURE_FOVEATED_FEATURE_QUERY_QCOM 0x8BFD
+#define GL_TEXTURE_FOVEATED_NUM_FOCAL_POINTS_QUERY_QCOM 0x8BFE
+#define GL_FRAMEBUFFER_INCOMPLETE_FOVEATION_QCOM 0x8BFF
+typedef void (GL_APIENTRYP PFNGLTEXTUREFOVEATIONPARAMETERSQCOMPROC) (GLuint texture, GLuint layer, GLuint focalPoint, GLfloat focalX, GLfloat focalY, GLfloat gainX, GLfloat gainY, GLfloat foveaArea);
+#ifdef GL_GLEXT_PROTOTYPES
+GL_APICALL void GL_APIENTRY glTextureFoveationParametersQCOM (GLuint texture, GLuint layer, GLuint focalPoint, GLfloat focalX, GLfloat focalY, GLfloat gainX, GLfloat gainY, GLfloat foveaArea);
+#endif
+#endif /* GL_QCOM_texture_foveated */
+
 #ifndef GL_QCOM_tiled_rendering
 #define GL_QCOM_tiled_rendering 1
 #define GL_COLOR_BUFFER_BIT0_QCOM         0x00000001
diff --git a/include/GLES3/gl3.h b/include/GLES3/gl3.h
index 71e72b4..532bbbd 100644
--- a/include/GLES3/gl3.h
+++ b/include/GLES3/gl3.h
@@ -1,12 +1,12 @@
-#ifndef __gl3_h_
-#define __gl3_h_ 1
+#ifndef __gles2_gl3_h_
+#define __gles2_gl3_h_ 1
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013-2017 The Khronos Group Inc.
+** Copyright (c) 2013-2018 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -44,7 +44,7 @@
 #define GL_GLES_PROTOTYPES 1
 #endif
 
-/* Generated on date 20170606 */
+/* Generated on date 20180725 */
 
 /* Generated C header for:
  * API: gles2
diff --git a/include/drm-uapi/drm_fourcc.h b/include/drm-uapi/drm_fourcc.h
index e04613d..d5e5235 100644
--- a/include/drm-uapi/drm_fourcc.h
+++ b/include/drm-uapi/drm_fourcc.h
@@ -385,6 +385,23 @@
 	fourcc_mod_code(NVIDIA, 0x15)
 
 /*
+ * Some Broadcom modifiers take parameters, for example the number of
+ * vertical lines in the image. Reserve the lower 8 bits for modifier
+ * type, and the next 48 bits for parameters. Top 8 bits are the
+ * vendor code.
+ */
+#define __fourcc_mod_broadcom_param_shift 8	/* parameter field starts at bit 8 */
+#define __fourcc_mod_broadcom_param_bits 48	/* width of the parameter field */
+#define fourcc_mod_broadcom_code(val, params) \
+	fourcc_mod_code(BROADCOM, ((((__u64)(params)) << __fourcc_mod_broadcom_param_shift) | (val)))
+#define fourcc_mod_broadcom_param(m) \
+	((int)(((m) >> __fourcc_mod_broadcom_param_shift) &	\
+	       ((1ULL << __fourcc_mod_broadcom_param_bits) - 1)))
+#define fourcc_mod_broadcom_mod(m) \
+	((m) & ~(((1ULL << __fourcc_mod_broadcom_param_bits) - 1) <<	\
+		 __fourcc_mod_broadcom_param_shift))
+
+/*
  * Broadcom VC4 "T" format
  *
  * This is the primary layout that the V3D GPU can texture from (it
@@ -405,6 +422,69 @@
  */
 #define DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED fourcc_mod_code(BROADCOM, 1)
 
+/*
+ * Broadcom SAND format
+ *
+ * This is the native format that the H.264 codec block uses.  For VC4
+ * HVS, it is only valid for H.264 (NV12/21) and RGBA modes.
+ *
+ * The image can be considered to be split into columns, and the
+ * columns are placed consecutively into memory.  The width of those
+ * columns can be either 32, 64, 128, or 256 pixels, but in practice
+ * only 128 pixel columns are used.
+ *
+ * The pitch between the start of each column is set to optimally
+ * switch between SDRAM banks. This is passed as the number of lines
+ * of column width in the modifier (we can't use the stride value due
+ * to various core checks that look at it, so you should set the
+ * stride to width*cpp).
+ *
+ * Note that the column height for this format modifier is the same
+ * for all of the planes, assuming that each column contains both Y
+ * and UV.  Some SAND-using hardware stores UV in a separate tiled
+ * image from Y to reduce the column height, which is not supported
+ * with these modifiers.
+ */
+
+#define DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(v) \
+	fourcc_mod_broadcom_code(2, v)
+#define DRM_FORMAT_MOD_BROADCOM_SAND64_COL_HEIGHT(v) \
+	fourcc_mod_broadcom_code(3, v)
+#define DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(v) \
+	fourcc_mod_broadcom_code(4, v)
+#define DRM_FORMAT_MOD_BROADCOM_SAND256_COL_HEIGHT(v) \
+	fourcc_mod_broadcom_code(5, v)
+
+#define DRM_FORMAT_MOD_BROADCOM_SAND32 \
+	DRM_FORMAT_MOD_BROADCOM_SAND32_COL_HEIGHT(0)
+#define DRM_FORMAT_MOD_BROADCOM_SAND64 \
+	DRM_FORMAT_MOD_BROADCOM_SAND64_COL_HEIGHT(0)
+#define DRM_FORMAT_MOD_BROADCOM_SAND128 \
+	DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(0)
+#define DRM_FORMAT_MOD_BROADCOM_SAND256 \
+	DRM_FORMAT_MOD_BROADCOM_SAND256_COL_HEIGHT(0)
+
+/* Broadcom UIF format
+ *
+ * This is the common format for the current Broadcom multimedia
+ * blocks, including V3D 3.x and newer, newer video codecs, and
+ * displays.
+ *
+ * The image consists of utiles (64b blocks), UIF blocks (2x2 utiles),
+ * and macroblocks (4x4 UIF blocks).  Those 4x4 UIF block groups are
+ * stored in columns, with padding between the columns to ensure that
+ * moving from one column to the next doesn't hit the same SDRAM page
+ * bank.
+ *
+ * To calculate the padding, it is assumed that each hardware block
+ * and the software driving it knows the platform's SDRAM page size,
+ * number of banks, and XOR address, and that it's identical between
+ * all blocks using the format.  This tiling modifier will use XOR as
+ * necessary to reduce the padding.  If a hardware block can't do XOR,
+ * the assumption is that a no-XOR tiling modifier will be created.
+ */
+#define DRM_FORMAT_MOD_BROADCOM_UIF fourcc_mod_code(BROADCOM, 6)
+
 #if defined(__cplusplus)
 }
 #endif
diff --git a/src/gallium/drivers/vc5/vc5_drm.h b/include/drm-uapi/v3d_drm.h
similarity index 67%
rename from src/gallium/drivers/vc5/vc5_drm.h
rename to include/drm-uapi/v3d_drm.h
index 184863d..7b66277 100644
--- a/src/gallium/drivers/vc5/vc5_drm.h
+++ b/include/drm-uapi/v3d_drm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014-2017 Broadcom
+ * Copyright © 2014-2018 Broadcom
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -21,8 +21,8 @@
  * IN THE SOFTWARE.
  */
 
-#ifndef _VC5_DRM_H_
-#define _VC5_DRM_H_
+#ifndef _V3D_DRM_H_
+#define _V3D_DRM_H_
 
 #include "drm.h"
 
@@ -30,28 +30,28 @@
 extern "C" {
 #endif
 
-#define DRM_VC5_SUBMIT_CL                         0x00
-#define DRM_VC5_WAIT_BO                           0x01
-#define DRM_VC5_CREATE_BO                         0x02
-#define DRM_VC5_MMAP_BO                           0x03
-#define DRM_VC5_GET_PARAM                         0x04
-#define DRM_VC5_GET_BO_OFFSET                     0x05
+#define DRM_V3D_SUBMIT_CL                         0x00
+#define DRM_V3D_WAIT_BO                           0x01
+#define DRM_V3D_CREATE_BO                         0x02
+#define DRM_V3D_MMAP_BO                           0x03
+#define DRM_V3D_GET_PARAM                         0x04
+#define DRM_V3D_GET_BO_OFFSET                     0x05
 
-#define DRM_IOCTL_VC5_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_SUBMIT_CL, struct drm_vc5_submit_cl)
-#define DRM_IOCTL_VC5_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_WAIT_BO, struct drm_vc5_wait_bo)
-#define DRM_IOCTL_VC5_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_CREATE_BO, struct drm_vc5_create_bo)
-#define DRM_IOCTL_VC5_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_MMAP_BO, struct drm_vc5_mmap_bo)
-#define DRM_IOCTL_VC5_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_GET_PARAM, struct drm_vc5_get_param)
-#define DRM_IOCTL_VC5_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_VC5_GET_BO_OFFSET, struct drm_vc5_get_bo_offset)
+#define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
+#define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
+#define DRM_IOCTL_V3D_CREATE_BO           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_CREATE_BO, struct drm_v3d_create_bo)
+#define DRM_IOCTL_V3D_MMAP_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_MMAP_BO, struct drm_v3d_mmap_bo)
+#define DRM_IOCTL_V3D_GET_PARAM           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_PARAM, struct drm_v3d_get_param)
+#define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
 
 /**
- * struct drm_vc5_submit_cl - ioctl argument for submitting commands to the 3D
+ * struct drm_v3d_submit_cl - ioctl argument for submitting commands to the 3D
  * engine.
  *
  * This asks the kernel to have the GPU execute an optional binner
  * command list, and a render command list.
  */
-struct drm_vc5_submit_cl {
+struct drm_v3d_submit_cl {
 	/* Pointer to the binner command list.
 	 *
 	 * This is the first set of commands executed, which runs the
@@ -101,29 +101,32 @@
 
 	/* Number of BO handles passed in (size is that times 4). */
 	__u32 bo_handle_count;
+
+	/* Pad, must be zero-filled. */
+	__u32 pad;
 };
 
 /**
- * struct drm_vc5_wait_bo - ioctl argument for waiting for
- * completion of the last DRM_VC5_SUBMIT_CL on a BO.
+ * struct drm_v3d_wait_bo - ioctl argument for waiting for
+ * completion of the last DRM_V3D_SUBMIT_CL on a BO.
  *
  * This is useful for cases where multiple processes might be
  * rendering to a BO and you want to wait for all rendering to be
  * completed.
  */
-struct drm_vc5_wait_bo {
+struct drm_v3d_wait_bo {
 	__u32 handle;
 	__u32 pad;
 	__u64 timeout_ns;
 };
 
 /**
- * struct drm_vc5_create_bo - ioctl argument for creating VC5 BOs.
+ * struct drm_v3d_create_bo - ioctl argument for creating V3D BOs.
  *
  * There are currently no values for the flags argument, but it may be
  * used in a future extension.
  */
-struct drm_vc5_create_bo {
+struct drm_v3d_create_bo {
 	__u32 size;
 	__u32 flags;
 	/** Returned GEM handle for the BO. */
@@ -140,7 +143,7 @@
 };
 
 /**
- * struct drm_vc5_mmap_bo - ioctl argument for mapping VC5 BOs.
+ * struct drm_v3d_mmap_bo - ioctl argument for mapping V3D BOs.
  *
  * This doesn't actually perform an mmap.  Instead, it returns the
  * offset you need to use in an mmap on the DRM device node.  This
@@ -150,7 +153,7 @@
  * There are currently no values for the flags argument, but it may be
  * used in a future extension.
  */
-struct drm_vc5_mmap_bo {
+struct drm_v3d_mmap_bo {
 	/** Handle for the object being mapped. */
 	__u32 handle;
 	__u32 flags;
@@ -158,17 +161,17 @@
 	__u64 offset;
 };
 
-enum drm_vc5_param {
-	DRM_VC5_PARAM_V3D_UIFCFG,
-	DRM_VC5_PARAM_V3D_HUB_IDENT1,
-	DRM_VC5_PARAM_V3D_HUB_IDENT2,
-	DRM_VC5_PARAM_V3D_HUB_IDENT3,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT0,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT1,
-	DRM_VC5_PARAM_V3D_CORE0_IDENT2,
+enum drm_v3d_param {
+	DRM_V3D_PARAM_V3D_UIFCFG,
+	DRM_V3D_PARAM_V3D_HUB_IDENT1,
+	DRM_V3D_PARAM_V3D_HUB_IDENT2,
+	DRM_V3D_PARAM_V3D_HUB_IDENT3,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT0,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+	DRM_V3D_PARAM_V3D_CORE0_IDENT2,
 };
 
-struct drm_vc5_get_param {
+struct drm_v3d_get_param {
 	__u32 param;
 	__u32 pad;
 	__u64 value;
@@ -176,10 +179,10 @@
 
 /**
  * Returns the offset for the BO in the V3D address space for this DRM fd.
- * This is the same value returned by drm_vc5_create_bo, if that was called
+ * This is the same value returned by drm_v3d_create_bo, if that was called
  * from this DRM fd.
  */
-struct drm_vc5_get_bo_offset {
+struct drm_v3d_get_bo_offset {
 	__u32 handle;
 	__u32 offset;
 };
@@ -188,4 +191,4 @@
 }
 #endif
 
-#endif /* _VC5_DRM_H_ */
+#endif /* _V3D_DRM_H_ */
diff --git a/include/drm-uapi/vc4_drm.h b/include/drm-uapi/vc4_drm.h
index 4117117..31f50de 100644
--- a/include/drm-uapi/vc4_drm.h
+++ b/include/drm-uapi/vc4_drm.h
@@ -183,10 +183,17 @@
 	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
 	__u32 perfmonid;
 
-	/* Unused field to align this struct on 64 bits. Must be set to 0.
-	 * If one ever needs to add an u32 field to this struct, this field
-	 * can be used.
+	/* Syncobj handle to wait on. If set, processing of this render job
+	 * will not start until the syncobj is signaled. 0 means ignore.
 	 */
+	__u32 in_sync;
+
+	/* Syncobj handle to export fence to. If set, the fence in the syncobj
+	 * will be replaced with a fence that signals upon completion of this
+	 * render job. 0 means ignore.
+	 */
+	__u32 out_sync;
+
 	__u32 pad2;
 };
 
diff --git a/include/meson.build b/include/meson.build
index b4555ea..081c1bc 100644
--- a/include/meson.build
+++ b/include/meson.build
@@ -43,7 +43,7 @@
   )
 endif
 
-if with_gles1 or with_gles2 or with_egl
+if with_gles1 or with_gles2 or with_opengl or with_egl
   install_headers('KHR/khrplatform.h', subdir : 'KHR')
 endif
 
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index c740a50..7201562 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -163,27 +163,31 @@
 CHIPSET(0x5926, kbl_gt3, "Intel(R) Iris Plus Graphics 640 (Kaby Lake GT3e)")
 CHIPSET(0x5927, kbl_gt3, "Intel(R) Iris Plus Graphics 650 (Kaby Lake GT3e)")
 CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
+CHIPSET(0x591C, kbl_gt2, "Intel(R) Amber Lake (Kabylake) GT2")
+CHIPSET(0x87C0, kbl_gt2, "Intel(R) Amber Lake (Kabylake) GT2")
+CHIPSET(0x87CA, cfl_gt2, "Intel(R) Amber Lake (Coffeelake) GT2")
 CHIPSET(0x3184, glk,     "Intel(R) UHD Graphics 605 (Geminilake)")
 CHIPSET(0x3185, glk_2x6, "Intel(R) UHD Graphics 600 (Geminilake 2x6)")
 CHIPSET(0x3E90, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E93, cfl_gt1, "Intel(R) UHD Graphics 610 (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E99, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
-CHIPSET(0x3EA1, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
-CHIPSET(0x3EA4, cfl_gt1, "Intel(R) HD Graphics (Coffeelake 2x6 GT1)")
 CHIPSET(0x3E91, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E92, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E96, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
+CHIPSET(0x3E98, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E9A, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E9B, cfl_gt2, "Intel(R) UHD Graphics 630 (Coffeelake 3x8 GT2)")
 CHIPSET(0x3E94, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
-CHIPSET(0x3EA0, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
-CHIPSET(0x3EA3, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
 CHIPSET(0x3EA9, cfl_gt2, "Intel(R) HD Graphics (Coffeelake 3x8 GT2)")
-CHIPSET(0x3EA2, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
 CHIPSET(0x3EA5, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
 CHIPSET(0x3EA6, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
 CHIPSET(0x3EA7, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
 CHIPSET(0x3EA8, cfl_gt3, "Intel(R) HD Graphics (Coffeelake 3x8 GT3)")
+CHIPSET(0x3EA1, cfl_gt1, "Intel(R) HD Graphics (Whiskey Lake 2x6 GT1)")
+CHIPSET(0x3EA4, cfl_gt1, "Intel(R) HD Graphics (Whiskey Lake 3x8 GT1)")
+CHIPSET(0x3EA0, cfl_gt2, "Intel(R) HD Graphics (Whiskey Lake 3x8 GT2)")
+CHIPSET(0x3EA3, cfl_gt2, "Intel(R) HD Graphics (Whiskey Lake 3x8 GT2)")
+CHIPSET(0x3EA2, cfl_gt3, "Intel(R) HD Graphics (Whiskey Lake 3x8 GT3)")
 CHIPSET(0x5A49, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)")
 CHIPSET(0x5A4A, cnl_2x8, "Intel(R) HD Graphics (Cannonlake 2x8 GT0.5)")
 CHIPSET(0x5A41, cnl_3x8, "Intel(R) HD Graphics (Cannonlake 3x8 GT1)")
diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index f810e2b..35ea355 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -204,6 +204,7 @@
 CHIPSET(0x67CF, POLARIS10)
 CHIPSET(0x67D0, POLARIS10)
 CHIPSET(0x67DF, POLARIS10)
+CHIPSET(0x6FDF, POLARIS10)
 
 CHIPSET(0x98E4, STONEY)
 
@@ -235,5 +236,12 @@
 CHIPSET(0x69A3, VEGA12)
 CHIPSET(0x69AF, VEGA12)
 
+CHIPSET(0x66A0, VEGA20)
+CHIPSET(0x66A1, VEGA20)
+CHIPSET(0x66A2, VEGA20)
+CHIPSET(0x66A3, VEGA20)
+CHIPSET(0x66A7, VEGA20)
+CHIPSET(0x66AF, VEGA20)
+
 CHIPSET(0x15DD, RAVEN)
 CHIPSET(0x15D8, RAVEN)
diff --git a/include/vulkan/vk_icd.h b/include/vulkan/vk_icd.h
index 7b54fb5..b935fa1 100644
--- a/include/vulkan/vk_icd.h
+++ b/include/vulkan/vk_icd.h
@@ -24,13 +24,34 @@
 #define VKICD_H
 
 #include "vulkan.h"
+#include <stdbool.h>
 
-/*
- * Loader-ICD version negotiation API
- */
-#define CURRENT_LOADER_ICD_INTERFACE_VERSION 3
+// Loader-ICD version negotiation API.  Versions add the following features:
+//   Version 0 - Initial.  Doesn't support vk_icdGetInstanceProcAddr
+//               or vk_icdNegotiateLoaderICDInterfaceVersion.
+//   Version 1 - Add support for vk_icdGetInstanceProcAddr.
+//   Version 2 - Add Loader/ICD Interface version negotiation
+//               via vk_icdNegotiateLoaderICDInterfaceVersion.
+//   Version 3 - Add ICD creation/destruction of KHR_surface objects.
+//   Version 4 - Add unknown physical device extension querying via
+//               vk_icdGetPhysicalDeviceProcAddr.
+//   Version 5 - Tells ICDs that the loader is now paying attention to the
+//               application version of Vulkan passed into the ApplicationInfo
+//               structure during vkCreateInstance.  This will tell the ICD
+//               that if the loader is older, it should automatically fail a
+//               call for any API version > 1.0.  Otherwise, the loader will
+//               manually determine if it can support the expected version.
+#define CURRENT_LOADER_ICD_INTERFACE_VERSION 5
 #define MIN_SUPPORTED_LOADER_ICD_INTERFACE_VERSION 0
-typedef VkResult (VKAPI_PTR *PFN_vkNegotiateLoaderICDInterfaceVersion)(uint32_t *pVersion);
+#define MIN_PHYS_DEV_EXTENSION_ICD_INTERFACE_VERSION 4
+typedef VkResult(VKAPI_PTR *PFN_vkNegotiateLoaderICDInterfaceVersion)(uint32_t *pVersion);
+
+// This is defined in vk_layer.h which will be found by the loader, but if an ICD is building against this
+// file directly, it won't be found.
+#ifndef PFN_GetPhysicalDeviceProcAddr
+typedef PFN_vkVoidFunction(VKAPI_PTR *PFN_GetPhysicalDeviceProcAddr)(VkInstance instance, const char *pName);
+#endif
+
 /*
  * The ICD must reserve space for a pointer for the loader's dispatch
  * table, at the start of <each object>.
@@ -64,6 +85,9 @@
     VK_ICD_WSI_PLATFORM_WIN32,
     VK_ICD_WSI_PLATFORM_XCB,
     VK_ICD_WSI_PLATFORM_XLIB,
+    VK_ICD_WSI_PLATFORM_ANDROID,
+    VK_ICD_WSI_PLATFORM_MACOS,
+    VK_ICD_WSI_PLATFORM_IOS,
     VK_ICD_WSI_PLATFORM_DISPLAY
 } VkIcdWsiPlatform;
 
@@ -77,7 +101,7 @@
     MirConnection *connection;
     MirSurface *mirSurface;
 } VkIcdSurfaceMir;
-#endif // VK_USE_PLATFORM_MIR_KHR
+#endif  // VK_USE_PLATFORM_MIR_KHR
 
 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
 typedef struct {
@@ -85,7 +109,7 @@
     struct wl_display *display;
     struct wl_surface *surface;
 } VkIcdSurfaceWayland;
-#endif // VK_USE_PLATFORM_WAYLAND_KHR
+#endif  // VK_USE_PLATFORM_WAYLAND_KHR
 
 #ifdef VK_USE_PLATFORM_WIN32_KHR
 typedef struct {
@@ -93,7 +117,7 @@
     HINSTANCE hinstance;
     HWND hwnd;
 } VkIcdSurfaceWin32;
-#endif // VK_USE_PLATFORM_WIN32_KHR
+#endif  // VK_USE_PLATFORM_WIN32_KHR
 
 #ifdef VK_USE_PLATFORM_XCB_KHR
 typedef struct {
@@ -101,7 +125,7 @@
     xcb_connection_t *connection;
     xcb_window_t window;
 } VkIcdSurfaceXcb;
-#endif // VK_USE_PLATFORM_XCB_KHR
+#endif  // VK_USE_PLATFORM_XCB_KHR
 
 #ifdef VK_USE_PLATFORM_XLIB_KHR
 typedef struct {
@@ -109,13 +133,28 @@
     Display *dpy;
     Window window;
 } VkIcdSurfaceXlib;
-#endif // VK_USE_PLATFORM_XLIB_KHR
+#endif  // VK_USE_PLATFORM_XLIB_KHR
 
 #ifdef VK_USE_PLATFORM_ANDROID_KHR
 typedef struct {
-    ANativeWindow* window;
+    VkIcdSurfaceBase base;
+    struct ANativeWindow *window;
 } VkIcdSurfaceAndroid;
-#endif //VK_USE_PLATFORM_ANDROID_KHR
+#endif  // VK_USE_PLATFORM_ANDROID_KHR
+
+#ifdef VK_USE_PLATFORM_MACOS_MVK
+typedef struct {
+    VkIcdSurfaceBase base;
+    const void *pView;
+} VkIcdSurfaceMacOS;
+#endif  // VK_USE_PLATFORM_MACOS_MVK
+
+#ifdef VK_USE_PLATFORM_IOS_MVK
+typedef struct {
+    VkIcdSurfaceBase base;
+    const void *pView;
+} VkIcdSurfaceIOS;
+#endif  // VK_USE_PLATFORM_IOS_MVK
 
 typedef struct {
     VkIcdSurfaceBase base;
@@ -128,4 +167,4 @@
     VkExtent2D imageExtent;
 } VkIcdSurfaceDisplay;
 
-#endif // VKICD_H
+#endif  // VKICD_H
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4
index 51df0c0..0fdca90 100644
--- a/m4/ax_check_compile_flag.m4
+++ b/m4/ax_check_compile_flag.m4
@@ -55,6 +55,11 @@
 #   modified version of the Autoconf Macro, you may extend this special
 #   exception to the GPL to apply to your modified version as well.
 
+# Emil:
+# Toggle Werror since at some point clang started treating unknown -W
+# flags as warnings, succeeding with the build, yet issuing an annoying
+# warning.
+
 #serial 3
 
 AC_DEFUN([AX_CHECK_COMPILE_FLAG],
@@ -62,7 +67,7 @@
 AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
 AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
   ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
-  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
+  _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1 -Werror"
   AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
     [AS_VAR_SET(CACHEVAR,[yes])],
     [AS_VAR_SET(CACHEVAR,[no])])
diff --git a/meson.build b/meson.build
index 5dafcff..cce31cd 100644
--- a/meson.build
+++ b/meson.build
@@ -25,10 +25,13 @@
     [find_program('python', 'python2', 'python3'), 'bin/meson_get_version.py']
   ).stdout(),
   license : 'MIT',
-  meson_version : '>= 0.42',
+  meson_version : '>= 0.44.1',
   default_options : ['buildtype=debugoptimized', 'c_std=c99', 'cpp_std=c++11']
 )
 
+cc = meson.get_compiler('c')
+cpp = meson.get_compiler('cpp')
+
 null_dep = dependency('', required : false)
 
 system_has_kms_drm = ['openbsd', 'netbsd', 'freebsd', 'dragonfly', 'linux'].contains(host_machine.system())
@@ -50,16 +53,13 @@
 with_valgrind = get_option('valgrind')
 with_libunwind = get_option('libunwind')
 with_asm = get_option('asm')
+with_glx_read_only_text = get_option('glx-read-only-text')
 with_osmesa = get_option('osmesa')
-with_swr_arches = get_option('swr-arches').split(',')
-with_tools = get_option('tools').split(',')
+with_swr_arches = get_option('swr-arches')
+with_tools = get_option('tools')
 if with_tools.contains('all')
   with_tools = ['freedreno', 'glsl', 'intel', 'nir', 'nouveau', 'xvmc']
 endif
-if get_option('texture-float')
-  pre_args += '-DTEXTURE_FLOAT_ENABLED'
-  message('WARNING: Floating-point texture enabled. Please consult docs/patents.txt and your lawyer before building mesa.')
-endif
 
 dri_drivers_path = get_option('dri-drivers-path')
 if dri_drivers_path == ''
@@ -93,128 +93,102 @@
 
 system_has_kms_drm = ['openbsd', 'netbsd', 'freebsd', 'dragonfly', 'linux'].contains(host_machine.system())
 
-with_dri = false
-with_dri_i915 = false
-with_dri_i965 = false
-with_dri_r100 = false
-with_dri_r200 = false
-with_dri_nouveau = false
-with_dri_swrast = false
 _drivers = get_option('dri-drivers')
-if _drivers == 'auto'
+if _drivers.contains('auto')
   if system_has_kms_drm
     # TODO: PPC, Sparc
     if ['x86', 'x86_64'].contains(host_machine.cpu_family())
-      _drivers = 'i915,i965,r100,r200,nouveau'
+      _drivers = ['i915', 'i965', 'r100', 'r200', 'nouveau']
     elif ['arm', 'aarch64'].contains(host_machine.cpu_family())
-      _drivers = ''
+      _drivers = []
     else
       error('Unknown architecture. Please pass -Ddri-drivers to set driver options. Patches gladly accepted to fix this.')
     endif
   elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
     # only swrast would make sense here, but gallium swrast is a much better default
-    _drivers = ''
+    _drivers = []
   else
     error('Unknown OS. Please pass -Ddri-drivers to set driver options. Patches gladly accepted to fix this.')
   endif
 endif
-if _drivers != ''
-  _split = _drivers.split(',')
-  with_dri_i915 = _split.contains('i915')
-  with_dri_i965 = _split.contains('i965')
-  with_dri_r100 = _split.contains('r100')
-  with_dri_r200 = _split.contains('r200')
-  with_dri_nouveau = _split.contains('nouveau')
-  with_dri_swrast = _split.contains('swrast')
-  with_dri = true
-endif
 
-with_gallium = false
-with_gallium_pl111 = false
-with_gallium_radeonsi = false
-with_gallium_r300 = false
-with_gallium_r600 = false
-with_gallium_nouveau = false
-with_gallium_freedreno = false
-with_gallium_softpipe = false
-with_gallium_vc4 = false
-with_gallium_vc5 = false
-with_gallium_etnaviv = false
-with_gallium_imx = false
-with_gallium_tegra = false
-with_gallium_i915 = false
-with_gallium_svga = false
-with_gallium_virgl = false
-with_gallium_swr = false
+with_dri_i915 = _drivers.contains('i915')
+with_dri_i965 = _drivers.contains('i965')
+with_dri_r100 = _drivers.contains('r100')
+with_dri_r200 = _drivers.contains('r200')
+with_dri_nouveau = _drivers.contains('nouveau')
+with_dri_swrast = _drivers.contains('swrast')
+
+with_dri = _drivers.length() != 0 and _drivers != ['']
+
 _drivers = get_option('gallium-drivers')
-if _drivers == 'auto'
+if _drivers.contains('auto')
   if system_has_kms_drm
     # TODO: PPC, Sparc
     if ['x86', 'x86_64'].contains(host_machine.cpu_family())
-      _drivers = 'r300,r600,radeonsi,nouveau,virgl,svga,swrast'
+      _drivers = [
+        'r300', 'r600', 'radeonsi', 'nouveau', 'virgl', 'svga', 'swrast'
+      ]
     elif ['arm', 'aarch64'].contains(host_machine.cpu_family())
-      _drivers = 'pl111,vc4,vc5,freedreno,etnaviv,imx,nouveau,tegra,virgl,swrast'
+      _drivers = [
+        'pl111', 'v3d', 'vc4', 'freedreno', 'etnaviv', 'imx', 'nouveau',
+        'tegra', 'virgl', 'swrast',
+      ]
     else
       error('Unknown architecture. Please pass -Dgallium-drivers to set driver options. Patches gladly accepted to fix this.')
     endif
   elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
-    _drivers = 'swrast'
+    _drivers = ['swrast']
   else
     error('Unknown OS. Please pass -Dgallium-drivers to set driver options. Patches gladly accepted to fix this.')
   endif
 endif
-if _drivers != ''
-  _split = _drivers.split(',')
-  with_gallium_pl111 = _split.contains('pl111')
-  with_gallium_radeonsi = _split.contains('radeonsi')
-  with_gallium_r300 = _split.contains('r300')
-  with_gallium_r600 = _split.contains('r600')
-  with_gallium_nouveau = _split.contains('nouveau')
-  with_gallium_freedreno = _split.contains('freedreno')
-  with_gallium_softpipe = _split.contains('swrast')
-  with_gallium_vc4 = _split.contains('vc4')
-  with_gallium_vc5 = _split.contains('vc5')
-  with_gallium_etnaviv = _split.contains('etnaviv')
-  with_gallium_imx = _split.contains('imx')
-  with_gallium_tegra = _split.contains('tegra')
-  with_gallium_i915 = _split.contains('i915')
-  with_gallium_svga = _split.contains('svga')
-  with_gallium_virgl = _split.contains('virgl')
-  with_gallium_swr = _split.contains('swr')
-  with_gallium = true
-  if system_has_kms_drm
-    _glx = get_option('glx')
-    _egl = get_option('egl')
-    if _glx == 'dri' or _egl == 'true' or (_glx == 'disabled' and _egl != 'false')
-      with_dri = true
-    endif
+with_gallium_pl111 = _drivers.contains('pl111')
+with_gallium_radeonsi = _drivers.contains('radeonsi')
+with_gallium_r300 = _drivers.contains('r300')
+with_gallium_r600 = _drivers.contains('r600')
+with_gallium_nouveau = _drivers.contains('nouveau')
+with_gallium_freedreno = _drivers.contains('freedreno')
+with_gallium_softpipe = _drivers.contains('swrast')
+with_gallium_vc4 = _drivers.contains('vc4')
+with_gallium_v3d = _drivers.contains('v3d')
+with_gallium_etnaviv = _drivers.contains('etnaviv')
+with_gallium_imx = _drivers.contains('imx')
+with_gallium_tegra = _drivers.contains('tegra')
+with_gallium_i915 = _drivers.contains('i915')
+with_gallium_svga = _drivers.contains('svga')
+with_gallium_virgl = _drivers.contains('virgl')
+with_gallium_swr = _drivers.contains('swr')
+
+with_gallium = _drivers.length() != 0 and _drivers != ['']
+
+if with_gallium and system_has_kms_drm
+  _glx = get_option('glx')
+  _egl = get_option('egl')
+  if _glx == 'dri' or _egl == 'true' or (_glx == 'disabled' and _egl != 'false')
+    with_dri = true
   endif
 endif
 
-with_intel_vk = false
-with_amd_vk = false
-with_any_vk = false
 _vulkan_drivers = get_option('vulkan-drivers')
-if _vulkan_drivers == 'auto'
+if _vulkan_drivers.contains('auto')
   if system_has_kms_drm
     if host_machine.cpu_family().startswith('x86')
-      _vulkan_drivers = 'amd,intel'
+      _vulkan_drivers = ['amd', 'intel']
     else
       error('Unknown architecture. Please pass -Dvulkan-drivers to set driver options. Patches gladly accepted to fix this.')
     endif
   elif ['darwin', 'windows', 'cygwin', 'haiku'].contains(host_machine.system())
     # No vulkan driver supports windows or macOS currently
-    _vulkan_drivers = ''
+    _vulkan_drivers = []
   else
     error('Unknown OS. Please pass -Dvulkan-drivers to set driver options. Patches gladly accepted to fix this.')
   endif
 endif
-if _vulkan_drivers != ''
-  _split = _vulkan_drivers.split(',')
-  with_intel_vk = _split.contains('intel')
-  with_amd_vk = _split.contains('amd')
-  with_any_vk = with_amd_vk or with_intel_vk
-endif
+
+with_intel_vk = _vulkan_drivers.contains('intel')
+with_amd_vk = _vulkan_drivers.contains('amd')
+with_any_vk = _vulkan_drivers.length() != 0 and _vulkan_drivers != ['']
 
 if with_dri_swrast and (with_gallium_softpipe or with_gallium_swr)
   error('Only one swrast provider can be built')
@@ -247,33 +221,37 @@
   with_dri_platform = 'none'
 endif
 
-with_platform_android = false
-with_platform_wayland = false
-with_platform_x11 = false
-with_platform_drm = false
-with_platform_surfaceless = false
-egl_native_platform = ''
 _platforms = get_option('platforms')
-if _platforms == 'auto'
+if _platforms.contains('auto')
   if system_has_kms_drm
-    _platforms = 'x11,wayland,drm,surfaceless'
+    _platforms = ['x11', 'wayland', 'drm', 'surfaceless']
   elif ['darwin', 'windows', 'cygwin'].contains(host_machine.system())
-    _platforms = 'x11,surfaceless'
+    _platforms = ['x11', 'surfaceless']
   elif ['haiku'].contains(host_machine.system())
-    _platforms = 'haiku'
+    _platforms = ['haiku']
   else
     error('Unknown OS. Please pass -Dplatforms to set platforms. Patches gladly accepted to fix this.')
   endif
 endif
-if _platforms != ''
-  _split = _platforms.split(',')
-  with_platform_android = _split.contains('android')
-  with_platform_x11 = _split.contains('x11')
-  with_platform_wayland = _split.contains('wayland')
-  with_platform_drm = _split.contains('drm')
-  with_platform_haiku = _split.contains('haiku')
-  with_platform_surfaceless = _split.contains('surfaceless')
-  egl_native_platform = _split[0]
+
+with_platform_android = _platforms.contains('android')
+with_platform_x11 = _platforms.contains('x11')
+with_platform_wayland = _platforms.contains('wayland')
+with_platform_drm = _platforms.contains('drm')
+with_platform_haiku = _platforms.contains('haiku')
+with_platform_surfaceless = _platforms.contains('surfaceless')
+
+with_platforms = false
+if _platforms.length() != 0 and _platforms != ['']
+  with_platforms = true
+  egl_native_platform = _platforms[0]
+endif
+
+_xlib_lease = get_option('xlib-lease')
+if _xlib_lease == 'auto'
+  with_xlib_lease = with_platform_x11 and with_platform_drm
+else
+  with_xlib_lease = _xlib_lease == 'true'
 endif
 
 with_glx = get_option('glx')
@@ -285,7 +263,6 @@
   elif with_gallium
     # Even when building just gallium drivers the user probably wants dri
     with_glx = 'dri'
-    with_dri = true
   elif with_platform_x11 and with_any_opengl and not with_any_vk
     # The automatic behavior should not be to turn on xlib based glx when
     # building only vulkan drivers
@@ -294,6 +271,11 @@
     with_glx = 'disabled'
   endif
 endif
+if with_glx == 'dri'
+   if with_gallium
+      with_dri = true
+   endif
+endif
 
 if not (with_dri or with_gallium or with_glx == 'xlib' or with_glx == 'gallium-xlib')
   with_gles1 = false
@@ -315,16 +297,21 @@
 
 _egl = get_option('egl')
 if _egl == 'auto'
-  with_egl = with_dri and with_shared_glapi and egl_native_platform != ''
+  with_egl = (
+    not ['darwin', 'windows'].contains(host_machine.system()) and
+    with_dri and with_shared_glapi and with_platforms
+  )
 elif _egl == 'true'
   if not with_dri
     error('EGL requires dri')
   elif not with_shared_glapi
     error('EGL requires shared-glapi')
-  elif egl_native_platform == ''
+  elif not with_platforms
     error('No platforms specified, consider -Dplatforms=drm,x11 at least')
   elif not ['disabled', 'dri'].contains(with_glx)
     error('EGL requires dri, but a GLX is being built without dri')
+  elif ['darwin', 'windows'].contains(host_machine.system())
+    error('EGL is not available on Windows or MacOS')
   endif
   with_egl = true
 else
@@ -343,11 +330,7 @@
 pre_args += '-DGLX_USE_TLS'
 if with_glx != 'disabled'
   if not (with_platform_x11 and with_any_opengl)
-    if with_glx == 'auto'
-      with_glx = 'disabled'
-    else
-      error('Cannot build GLX support without X11 platform support and at least one OpenGL API')
-    endif
+    error('Cannot build GLX support without X11 platform support and at least one OpenGL API')
   elif with_glx == 'gallium-xlib' 
     if not with_gallium
       error('Gallium-xlib based GLX requires at least one gallium driver')
@@ -360,8 +343,12 @@
     if with_dri
       error('xlib conflicts with any dri driver')
     endif
-  elif with_glx == 'dri' and not with_dri
-    error('dri based GLX requires at least one DRI driver')
+  elif with_glx == 'dri'
+    if not with_dri
+      error('dri based GLX requires at least one DRI driver')
+    elif not with_shared_glapi
+      error('dri based GLX requires shared-glapi')
+    endif
   endif
 endif
 
@@ -578,8 +565,6 @@
   else
     _va = 'false'
   endif
-elif _va == 'auto'
-  _va = 'true'
 endif
 with_gallium_va = false
 dep_va = null_dep
@@ -633,13 +618,34 @@
   endif
 endif
 
+if get_option('power8') != 'false'
+  if host_machine.cpu_family() == 'ppc64le'
+    if cc.get_id() == 'gcc' and cc.version().version_compare('< 4.8')
+      error('Altivec is not supported with gcc version < 4.8.')
+    endif
+    if cc.compiles('''
+        #include <altivec.h>
+        int main() {
+          vector unsigned char r;
+          vector unsigned int v = vec_splat_u32 (1);
+          r = __builtin_vec_vgbbd ((vector unsigned char) v);
+          return 0;
+        }''',
+        args : '-mpower8-vector',
+        name : 'POWER8 intrinsics')
+      pre_args += ['-D_ARCH_PWR8', '-mpower8-vector']
+    elif get_option('power8') == 'true'
+      error('POWER8 intrinsic support required but not found.')
+    endif
+  endif
+endif
+
 _opencl = get_option('gallium-opencl')
 if _opencl != 'disabled'
   if not with_gallium
     error('OpenCL Clover implementation requires at least one gallium driver.')
   endif
 
-  # TODO: alitvec?
   dep_clc = dependency('libclc')
   with_gallium_opencl = true
   with_opencl_icd = _opencl == 'icd'
@@ -700,7 +706,6 @@
   error('Python (2.x) mako module required to build mesa.')
 endif
 
-cc = meson.get_compiler('c')
 if cc.get_id() == 'gcc' and cc.version().version_compare('< 4.4.6')
   error('When using GCC, version 4.4.6 or later is required.')
 endif
@@ -767,22 +772,25 @@
 
 # Check for generic C arguments
 c_args = []
-foreach a : ['-Wall', '-Werror=implicit-function-declaration',
+foreach a : ['-Werror=implicit-function-declaration',
              '-Werror=missing-prototypes', '-fno-math-errno',
              '-fno-trapping-math', '-Qunused-arguments']
   if cc.has_argument(a)
     c_args += a
   endif
 endforeach
+if cc.has_argument('-Wmissing-field-initializers')
+  c_args += '-Wno-missing-field-initializers'
+endif
+
 c_vis_args = []
 if cc.has_argument('-fvisibility=hidden')
   c_vis_args += '-fvisibility=hidden'
 endif
 
 # Check for generic C++ arguments
-cpp = meson.get_compiler('cpp')
 cpp_args = []
-foreach a : ['-Wall', '-fno-math-errno', '-fno-trapping-math',
+foreach a : ['-fno-math-errno', '-fno-trapping-math',
              '-Qunused-arguments']
   if cpp.has_argument(a)
     cpp_args += a
@@ -791,9 +799,12 @@
 
 # For some reason, the test for -Wno-foo always succeeds with gcc, even if the
 # option is not supported. Hence, check for -Wfoo instead.
-if cpp.has_argument('-Wnon-virtual-dtor')
-  cpp_args += '-Wno-non-virtual-dtor'
-endif
+
+foreach a : ['non-virtual-dtor', 'missing-field-initializers']
+  if cpp.has_argument('-W' + a)
+    cpp_args += '-Wno-' + a
+  endif
+endforeach
 
 no_override_init_args = []
 foreach a : ['override-init', 'initializer-overrides']
@@ -874,34 +885,49 @@
                    int main() {
                      return __sync_add_and_fetch(&v, (uint64_t)1);
                    }''',
+                dependencies : dep_atomic,
                 name : 'GCC 64bit atomics')
-  pre_args += '-DMISSING_64_BIT_ATOMICS'
+  pre_args += '-DMISSING_64BIT_ATOMICS'
 endif
 
-# TODO: endian
-# TODO: powr8
 # TODO: shared/static? Is this even worth doing?
 
-# Building x86 assembly code requires running x86 binaries. It is possible for
-# x86_64 OSes to run x86 binaries, so don't disable asm in those cases
-# TODO: it should be possible to use an exe_wrapper to run the binary during
-# the build. 
+# When cross compiling we generally need to turn off the use of assembly,
+# because mesa's assembly relies on building an executable for the host system,
+# and running it to get information about struct sizes. There is at least one
+# case of cross compiling where we can use asm, and that's x86_64 -> x86 when
+# host OS == build OS, since in that case the build machine can run the host's
+# binaries.
 if meson.is_cross_build() 
-  if not (build_machine.cpu_family().startswith('x86') and host_machine.cpu_family() == 'x86'
-          and build_machine.system() == host_machine.system())
-    message('Cross compiling to x86 from non-x86, disabling asm')
+  if build_machine.system() != host_machine.system()
+    # TODO: It may be possible to do this with an exe_wrapper (like wine).
+    message('Cross compiling from one OS to another, disabling assembly.')
+    with_asm = false
+  elif not (build_machine.cpu_family().startswith('x86') and host_machine.cpu_family() == 'x86')
+    # FIXME: Gentoo always sets -m32 for x86_64 -> x86 builds, resulting in an
+    # x86 -> x86 cross compile. We use startswith rather than == to handle this
+    # case.
+    # TODO: There may be other cases where the 64 bit version of the
+    # architecture can run 32 bit binaries (aarch64 and armv7 for example)
+    message('''
+      Cross compiling to different architectures, and the host cannot run
+      the build machine's binaries. Disabling assembly.
+    ''')
     with_asm = false
   endif
 endif
 
 with_asm_arch = ''
 if with_asm
-  # TODO: SPARC and PPC
   if host_machine.cpu_family() == 'x86'
     if system_has_kms_drm
       with_asm_arch = 'x86'
       pre_args += ['-DUSE_X86_ASM', '-DUSE_MMX_ASM', '-DUSE_3DNOW_ASM',
                    '-DUSE_SSE_ASM']
+
+      if with_glx_read_only_text
+         pre_args += ['-DGLX_X86_READONLY_TEXT']
+      endif
     endif
   elif host_machine.cpu_family() == 'x86_64'
     if system_has_kms_drm
@@ -918,6 +944,16 @@
       with_asm_arch = 'aarch64'
       pre_args += ['-DUSE_AARCH64_ASM']
     endif
+  elif host_machine.cpu_family() == 'sparc64'
+    if system_has_kms_drm
+      with_asm_arch = 'sparc'
+      pre_args += ['-DUSE_SPARC_ASM']
+    endif
+  elif host_machine.cpu_family() == 'ppc64le'
+    if system_has_kms_drm
+      with_asm_arch = 'ppc64le'
+      pre_args += ['-DUSE_PPC64LE_ASM']
+    endif
   endif
 endif
 
@@ -1024,14 +1060,6 @@
 if dep_thread.found() and host_machine.system() != 'windows'
   pre_args += '-DHAVE_PTHREAD'
 endif
-if with_amd_vk or with_gallium_radeonsi or with_gallium_r600 or with_gallium_opencl
-  dep_elf = dependency('libelf', required : false)
-  if not dep_elf.found()
-    dep_elf = cc.find_library('elf')
-  endif
-else
-  dep_elf = null_dep
-endif
 dep_expat = dependency('expat')
 # this only exists on linux so either this is linux and it will be found, or
 # its not linux and and wont
@@ -1052,7 +1080,7 @@
 _drm_radeon_ver = '2.4.71'
 _drm_nouveau_ver = '2.4.66'
 _drm_etnaviv_ver = '2.4.89'
-_drm_freedreno_ver = '2.4.91'
+_drm_freedreno_ver = '2.4.92'
 _drm_intel_ver = '2.4.75'
 _drm_ver = '2.4.75'
 
@@ -1066,6 +1094,12 @@
   ['freedreno', with_gallium_freedreno],
 ]
 
+# VC4 only needs core libdrm support of this version, not a libdrm_vc4
+# library.
+if with_gallium_vc4
+  _drm_ver = '2.4.89'
+endif
+
 # Loop over the enables versions and get the highest libdrm requirement for all
 # active drivers.
 _drm_blame = ''
@@ -1103,6 +1137,7 @@
 endif
 
 llvm_modules = ['bitwriter', 'engine', 'mcdisassembler', 'mcjit']
+llvm_optional_modules = []
 if with_amd_vk or with_gallium_radeonsi or with_gallium_r600
   llvm_modules += ['amdgpu', 'bitreader', 'ipo']
   if with_gallium_r600
@@ -1114,11 +1149,11 @@
     'all-targets', 'linker', 'coverage', 'instrumentation', 'ipo', 'irreader',
     'lto', 'option', 'objcarcopts', 'profiledata',
   ]
-  # TODO: optional modules
+  llvm_optional_modules += ['coroutines', 'opencl']
 endif
 
 if with_amd_vk or with_gallium_radeonsi or with_gallium_swr
-  _llvm_version = '>= 4.0.0'
+  _llvm_version = '>= 5.0.0'
 elif with_gallium_opencl or with_gallium_r600
   _llvm_version = '>= 3.9.0'
 else
@@ -1128,12 +1163,20 @@
 _llvm = get_option('llvm')
 if _llvm == 'auto'
   dep_llvm = dependency(
-    'llvm', version : _llvm_version, modules : llvm_modules,
+    'llvm',
+    version : _llvm_version,
+    modules : llvm_modules,
+    optional_modules : llvm_optional_modules,
     required : with_amd_vk or with_gallium_radeonsi or with_gallium_swr or with_gallium_opencl,
   )
   with_llvm = dep_llvm.found()
 elif _llvm == 'true'
-  dep_llvm = dependency('llvm', version : _llvm_version, modules : llvm_modules)
+  dep_llvm = dependency(
+    'llvm',
+    version : _llvm_version,
+    modules : llvm_modules,
+    optional_modules : llvm_optional_modules,
+  )
   with_llvm = true
 else
   dep_llvm = null_dep
@@ -1162,10 +1205,27 @@
     '-DHAVE_LLVM=0x0@0@0@1@'.format(_llvm_version[0], _llvm_version[1]),
     '-DMESA_LLVM_VERSION_PATCH=@0@'.format(_llvm_patch),
   ]
+
+  # LLVM can be built without rtti, turning off rtti changes the ABI of C++
+  # programs, so we need to build all C++ code in mesa without rtti as well to
+  # ensure that linking works.
+  if dep_llvm.get_configtool_variable('has-rtti') == 'NO'
+    cpp_args += '-fno-rtti'
+  endif
 elif with_amd_vk or with_gallium_radeonsi or with_gallium_swr
   error('The following drivers require LLVM: Radv, RadeonSI, SWR. One of these is enabled, but LLVM is disabled.')
 endif
 
+if (with_amd_vk or with_gallium_radeonsi or with_gallium_opencl or
+    (with_gallium_r600 and with_llvm))
+  dep_elf = dependency('libelf', required : false)
+  if not dep_elf.found()
+    dep_elf = cc.find_library('elf')
+  endif
+else
+  dep_elf = null_dep
+endif
+
 dep_glvnd = null_dep
 if with_glvnd
   dep_glvnd = dependency('libglvnd', version : '>= 0.2.0')
@@ -1192,8 +1252,6 @@
   pre_args += '-DMESA_SELINUX'
 endif
 
-# TODO: llvm-prefix and llvm-shared-libs
-
 if with_libunwind != 'false'
   dep_unwind = dependency('libunwind', required : with_libunwind == 'true')
   if dep_unwind.found()
@@ -1203,8 +1261,6 @@
   dep_unwind = null_dep
 endif
 
-# TODO: gallium-hud
-
 if with_osmesa != 'none'
   if with_osmesa == 'classic' and not with_dri_swrast
     error('OSMesa classic requires dri (classic) swrast.')
@@ -1232,17 +1288,16 @@
   dep_wl_protocols = dependency('wayland-protocols', version : '>= 1.8')
   dep_wayland_client = dependency('wayland-client', version : '>=1.11')
   dep_wayland_server = dependency('wayland-server', version : '>=1.11')
+  if with_egl
+    dep_wayland_egl = dependency('wayland-egl-backend', version : '>= 3')
+    dep_wayland_egl_headers = declare_dependency(
+      compile_args : run_command(prog_pkgconfig, ['wayland-egl-backend', '--cflags']).stdout().split())
+  endif
   wayland_dmabuf_xml = join_paths(
     dep_wl_protocols.get_pkgconfig_variable('pkgdatadir'), 'unstable',
     'linux-dmabuf', 'linux-dmabuf-unstable-v1.xml'
   )
   pre_args += ['-DHAVE_WAYLAND_PLATFORM', '-DWL_HIDE_DEPRECATED']
-else
-  prog_wl_scanner = []
-  dep_wl_protocols = null_dep
-  dep_wayland_client = null_dep
-  dep_wayland_server = null_dep
-  wayland_dmabuf_xml = ''
 endif
 
 dep_x11 = null_dep
@@ -1262,6 +1317,8 @@
 dep_xcb_sync = null_dep
 dep_xcb_xfixes = null_dep
 dep_xshmfence = null_dep
+dep_xcb_xrandr = null_dep
+dep_xlib_xrandr = null_dep
 if with_platform_x11
   if with_glx == 'xlib' or with_glx == 'gallium-xlib'
     dep_x11 = dependency('x11')
@@ -1273,7 +1330,7 @@
     dep_xdamage = dependency('xdamage', version : '>= 1.1')
     dep_xfixes = dependency('xfixes')
     dep_xcb_glx = dependency('xcb-glx', version : '>= 1.8.1')
-    dep_xxf86vm = dependency('xxf86vm', required : false)
+    dep_xxf86vm = dependency('xxf86vm')
   endif
   if (with_any_vk or with_glx == 'dri' or
        (with_gallium_vdpau or with_gallium_xvmc or with_gallium_va or
@@ -1308,6 +1365,10 @@
       with_gallium_omx != 'disabled'))
     dep_xcb_xfixes = dependency('xcb-xfixes')
   endif
+  if with_xlib_lease
+    dep_xcb_xrandr = dependency('xcb-randr', version : '>= 1.12')
+    dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
+  endif
 endif
 
 if get_option('gallium-extra-hud')
@@ -1324,18 +1385,6 @@
   dep_lmsensors = null_dep
 endif
 
-# TODO: various libdirs
-
-# TODO: gallium driver dirs
-
-# FIXME: this is a workaround for #2326
-prog_touch = find_program('touch')
-dummy_cpp = custom_target(
-  'dummy_cpp',
-  output : 'dummy.cpp',
-  command : [prog_touch, '@OUTPUT@'],
-)
-
 foreach a : pre_args
   add_project_arguments(a, language : ['c', 'cpp'])
 endforeach
@@ -1359,13 +1408,11 @@
   if with_dri_platform == 'drm'
     gl_priv_reqs += 'xcb-dri2 >= 1.8'
   endif
+  gl_priv_reqs += 'xxf86vm'
 endif
 if dep_libdrm.found()
   gl_priv_reqs += 'libdrm >= 2.4.75'
 endif
-if dep_xxf86vm.found()
-  gl_priv_reqs += 'xxf86vm'
-endif
 
 gl_priv_libs = []
 if dep_thread.found()
diff --git a/meson_options.txt b/meson_options.txt
index 496fe38..5bb560b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -1,4 +1,4 @@
-# Copyright © 2017 Intel Corporation
+# Copyright © 2017-2018 Intel Corporation
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -20,8 +20,11 @@
 
 option(
   'platforms',
-  type : 'string',
-  value : 'auto',
+  type : 'array',
+  value : ['auto'],
+  choices : [
+    '', 'auto', 'x11', 'wayland', 'drm', 'surfaceless', 'haiku', 'android',
+  ],
   description : 'comma separated list of window systems to support. If this is set to auto all platforms applicable to the OS will be enabled.'
 )
 option(
@@ -33,9 +36,10 @@
 )
 option(
   'dri-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of dri drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : ['', 'auto', 'i915', 'i965', 'r100', 'r200', 'nouveau', 'swrast'],
+  description : 'List of dri drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
   'dri-drivers-path',
@@ -51,9 +55,14 @@
 )
 option(
   'gallium-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of gallium drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : [
+    '', 'auto', 'pl111', 'radeonsi', 'r300', 'r600', 'nouveau', 'freedreno',
+    'swrast', 'v3d', 'vc4', 'etnaviv', 'imx', 'tegra', 'i915', 'svga', 'virgl',
+    'swr',
+  ],
+  description : 'List of gallium drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
   'gallium-extra-hud',
@@ -141,9 +150,10 @@
 )
 option(
   'vulkan-drivers',
-  type : 'string',
-  value : 'auto',
-  description : 'comma separated list of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
+  type : 'array',
+  value : ['auto'],
+  choices : ['', 'auto', 'amd', 'intel'],
+  description : 'List of vulkan drivers to build. If this is set to auto all drivers applicable to the target OS/architecture will be built'
 )
 option(
   'shader-cache',
@@ -215,6 +225,12 @@
   description : 'Build assembly code if possible'
 )
 option(
+   'glx-read-only-text',
+   type : 'boolean',
+   value : false,
+   description : 'Disable writable .text section on x86 (decreases performance)'
+)
+option(
   'llvm',
   type : 'combo',
   value : 'auto',
@@ -249,12 +265,6 @@
   description : 'Build unit tests. Currently this will build *all* unit tests, which may build more than expected.'
 )
 option(
-  'texture-float',
-  type : 'boolean',
-  value : false,
-  description : 'Enable floating point textures and renderbuffers. This option may be patent encumbered, please read docs/patents.txt and consult with your lawyer before turning this on.'
-)
-option(
   'selinux',
   type : 'boolean',
   value : false,
@@ -276,13 +286,29 @@
 )
 option(
   'swr-arches',
-  type : 'string',
-  value : 'avx,avx2',
-  description : 'Comma delemited swr architectures. choices : avx,avx2,knl,skx'
+  type : 'array',
+  value : ['avx', 'avx2'],
+  choices : ['avx', 'avx2', 'knl', 'skx'],
+  description : 'Architectures to build SWR support for.',
 )
 option(
   'tools',
-  type : 'string',
-  value : '',
-  description : 'Comma delimited list of tools to build. choices : freedreno,glsl,intel,nir,nouveau,xvmc or all'
+  type : 'array',
+  value : [],
+  choices : ['freedreno', 'glsl', 'intel', 'nir', 'nouveau', 'xvmc', 'all'],
+  description : 'List of tools to build.',
+)
+option(
+  'power8',
+  type : 'combo',
+  value : 'auto',
+  choices : ['auto', 'true', 'false'],
+  description : 'Enable power8 optimizations.',
+)
+option(
+  'xlib-lease',
+  type : 'combo',
+  value : 'auto',
+  choices : ['auto', 'true', 'false'],
+  description : 'Enable VK_EXT_acquire_xlib_display.'
 )
diff --git a/scons/gallium.py b/scons/gallium.py
index 6cb20ef..012794c 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -282,7 +282,7 @@
         if env['build'] == 'profile':
             env['debug'] = False
             env['profile'] = True
-        if env['build'] in ('release', 'opt'):
+        if env['build'] == 'release':
             env['debug'] = False
             env['profile'] = False
 
@@ -328,8 +328,6 @@
         cppdefines += ['NDEBUG']
     if env['build'] == 'profile':
         cppdefines += ['PROFILE']
-    if env['build'] in ('opt', 'profile'):
-        cppdefines += ['VMX86_STATS']
     if env['platform'] in ('posix', 'linux', 'freebsd', 'darwin'):
         cppdefines += [
             '_POSIX_SOURCE',
@@ -392,10 +390,6 @@
         cppdefines += ['PIPE_SUBSYSTEM_WINDOWS_USER']
     if env['embedded']:
         cppdefines += ['PIPE_SUBSYSTEM_EMBEDDED']
-    if env['texture_float']:
-        print('warning: Floating-point textures enabled.')
-        print('warning: Please consult docs/patents.txt with your lawyer before building Mesa.')
-        cppdefines += ['TEXTURE_FLOAT_ENABLED']
     env.Append(CPPDEFINES = cppdefines)
 
     # C compiler options
@@ -484,7 +478,7 @@
             ccflags += [
                 '/O2', # optimize for speed
             ]
-        if env['build'] in ('release', 'opt'):
+        if env['build'] == 'release':
             if not env['clang']:
                 ccflags += [
                     '/GL', # enable whole program optimization
@@ -595,7 +589,7 @@
             shlinkflags += ['-Wl,--enable-stdcall-fixup']
             #shlinkflags += ['-Wl,--kill-at']
     if msvc:
-        if env['build'] in ('release', 'opt') and not env['clang']:
+        if env['build'] == 'release' and not env['clang']:
             # enable Link-time Code Generation
             linkflags += ['/LTCG']
             env.Append(ARFLAGS = ['/LTCG'])
diff --git a/scons/llvm.py b/scons/llvm.py
index 79118be..a34edfb 100644
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -123,6 +123,10 @@
                 'LLVMDemangle', 'LLVMGlobalISel', 'LLVMDebugInfoMSF',
                 'LLVMBinaryFormat',
             ])
+            if env['platform'] == 'windows' and env['crosscompile']:
+                # LLVM 5.0 requires MinGW w/ pthreads due to use of std::thread and friends.
+                assert env['gcc']
+                env['CXX'] = env['CXX'] + '-posix'
         elif llvm_version >= distutils.version.LooseVersion('4.0'):
             env.Prepend(LIBS = [
                 'LLVMX86Disassembler', 'LLVMX86AsmParser',
@@ -211,8 +215,11 @@
             'imagehlp',
             'psapi',
             'shell32',
-            'advapi32'
+            'advapi32',
+            'ole32',
+            'uuid',
         ])
+
         if env['msvc']:
             # Some of the LLVM C headers use the inline keyword without
             # defining it.
diff --git a/src/Makefile.am b/src/Makefile.am
index fd5ae44..9bb3bce 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -95,11 +95,6 @@
 SUBDIRS += gbm
 endif
 
-## Optionally required by EGL
-if HAVE_PLATFORM_WAYLAND
-SUBDIRS += egl/wayland/wayland-egl
-endif
-
 if HAVE_EGL
 SUBDIRS += egl
 endif
diff --git a/src/amd/Android.mk b/src/amd/Android.mk
index 07af052..e40e7da 100644
--- a/src/amd/Android.mk
+++ b/src/amd/Android.mk
@@ -27,3 +27,6 @@
 
 include $(LOCAL_PATH)/Android.addrlib.mk
 include $(LOCAL_PATH)/Android.common.mk
+ifneq ($(filter radeonsi,$(BOARD_GPU_DRIVERS)),)
+include $(LOCAL_PATH)/vulkan/Android.mk
+endif
diff --git a/src/amd/addrlib/amdgpu_asic_addr.h b/src/amd/addrlib/amdgpu_asic_addr.h
index b4b8aec..e5838d4 100644
--- a/src/amd/addrlib/amdgpu_asic_addr.h
+++ b/src/amd/addrlib/amdgpu_asic_addr.h
@@ -87,6 +87,7 @@
 
 #define AMDGPU_VEGA10_RANGE     0x01, 0x14
 #define AMDGPU_VEGA12_RANGE     0x14, 0x28
+#define AMDGPU_VEGA20_RANGE     0x28, 0xFF
 
 #define AMDGPU_RAVEN_RANGE      0x01, 0x81
 
@@ -128,6 +129,7 @@
 #define ASICREV_IS_VEGA10_P(r)         ASICREV_IS(r, VEGA10)
 #define ASICREV_IS_VEGA12_P(r)         ASICREV_IS(r, VEGA12)
 #define ASICREV_IS_VEGA12_p(r)         ASICREV_IS(r, VEGA12)
+#define ASICREV_IS_VEGA20_P(r)         ASICREV_IS(r, VEGA20)
 
 #define ASICREV_IS_RAVEN(r)            ASICREV_IS(r, RAVEN)
 
diff --git a/src/amd/addrlib/gfx9/gfx9addrlib.cpp b/src/amd/addrlib/gfx9/gfx9addrlib.cpp
index b88d324..ef86c3b 100644
--- a/src/amd/addrlib/gfx9/gfx9addrlib.cpp
+++ b/src/amd/addrlib/gfx9/gfx9addrlib.cpp
@@ -1230,6 +1230,7 @@
         {
             ADDR_ASSERT(m_settings.isVega10 == FALSE);
             ADDR_ASSERT(m_settings.isRaven == FALSE);
+            ADDR_ASSERT(m_settings.isVega20 == FALSE);
 
             if (m_settings.isVega12)
             {
@@ -1273,7 +1274,7 @@
             m_settings.isArcticIsland = 1;
             m_settings.isVega10    = ASICREV_IS_VEGA10_P(uChipRevision);
             m_settings.isVega12    = ASICREV_IS_VEGA12_P(uChipRevision);
-
+            m_settings.isVega20    = ASICREV_IS_VEGA20_P(uChipRevision);
             m_settings.isDce12 = 1;
 
             if (m_settings.isVega10 == 0)
diff --git a/src/amd/addrlib/gfx9/gfx9addrlib.h b/src/amd/addrlib/gfx9/gfx9addrlib.h
index 7c61a40..cf56507 100644
--- a/src/amd/addrlib/gfx9/gfx9addrlib.h
+++ b/src/amd/addrlib/gfx9/gfx9addrlib.h
@@ -56,6 +56,7 @@
         UINT_32 isVega10            : 1;
         UINT_32 isRaven             : 1;
         UINT_32 isVega12            : 1;
+        UINT_32 isVega20            : 1;
 
         // Display engine IP version name
         UINT_32 isDce12             : 1;
diff --git a/src/amd/common/ac_binary.h b/src/amd/common/ac_binary.h
index 4bd86b9..735e393 100644
--- a/src/amd/common/ac_binary.h
+++ b/src/amd/common/ac_binary.h
@@ -27,6 +27,10 @@
 #include <stdint.h>
 #include <stdbool.h>
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct ac_shader_reloc {
 	char name[32];
 	uint64_t offset;
@@ -98,4 +102,8 @@
 				  bool supports_spill);
 void ac_shader_binary_clean(struct ac_shader_binary *b);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* AC_BINARY_H */
diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index 2bf5020..40441ec 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -96,6 +96,7 @@
 		       struct radeon_info *info,
 		       struct amdgpu_gpu_info *amdinfo)
 {
+	struct drm_amdgpu_info_device device_info = {};
 	struct amdgpu_buffer_size_alignments alignment_info = {};
 	struct drm_amdgpu_info_hw_ip dma = {}, compute = {}, uvd = {};
 	struct drm_amdgpu_info_hw_ip uvd_enc = {}, vce = {}, vcn_dec = {};
@@ -124,6 +125,13 @@
 		return false;
 	}
 
+	r = amdgpu_query_info(dev, AMDGPU_INFO_DEV_INFO, sizeof(device_info),
+			      &device_info);
+	if (r) {
+		fprintf(stderr, "amdgpu: amdgpu_query_info(dev_info) failed.\n");
+		return false;
+	}
+
 	r = amdgpu_query_buffer_size_alignment(dev, &alignment_info);
 	if (r) {
 		fprintf(stderr, "amdgpu: amdgpu_query_buffer_size_alignment failed.\n");
@@ -235,7 +243,7 @@
 	}
 
 	if (info->drm_minor >= 9) {
-		struct drm_amdgpu_memory_info meminfo;
+		struct drm_amdgpu_memory_info meminfo = {};
 
 		r = amdgpu_query_info(dev, AMDGPU_INFO_MEMORY, sizeof(meminfo), &meminfo);
 		if (r) {
@@ -324,6 +332,7 @@
 	info->gds_gfx_partition_size = gds.gds_gfx_partition_size;
 	/* convert the shader clock from KHz to MHz */
 	info->max_shader_clock = amdinfo->max_engine_clk / 1000;
+	info->num_tcc_blocks = device_info.num_tcc_blocks;
 	info->max_se = amdinfo->num_shader_engines;
 	info->max_sh_per_se = amdinfo->num_shader_arrays_per_engine;
 	info->has_hw_decode =
@@ -343,8 +352,34 @@
 	info->has_local_buffers = info->drm_minor >= 20 &&
 				  !info->has_dedicated_vram;
 	info->kernel_flushes_hdp_before_ib = true;
+	info->htile_cmask_support_1d_tiling = true;
+	info->si_TA_CS_BC_BASE_ADDR_allowed = true;
+	info->has_bo_metadata = true;
+	info->has_gpu_reset_status_query = true;
+	info->has_gpu_reset_counter_query = false;
+	info->has_eqaa_surface_allocator = true;
+	info->has_format_bc1_through_bc7 = true;
+	/* DRM 3.1.0 doesn't flush TC for VI correctly. */
+	info->kernel_flushes_tc_l2_after_ib = info->chip_class != VI ||
+					      info->drm_minor >= 2;
+	info->has_indirect_compute_dispatch = true;
+	/* SI doesn't support unaligned loads. */
+	info->has_unaligned_shader_loads = info->chip_class != SI;
+	/* Disable sparse mappings on SI due to VM faults in CP DMA. Enable them once
+	 * these faults are mitigated in software.
+	 * Disable sparse mappings on GFX9 due to hangs.
+	 */
+	info->has_sparse_vm_mappings =
+		info->chip_class >= CIK && info->chip_class <= VI &&
+		info->drm_minor >= 13;
+	info->has_2d_tiling = true;
+	info->has_read_registers_query = true;
 
 	info->num_render_backends = amdinfo->rb_pipes;
+	/* The value returned by the kernel driver was wrong. */
+	if (info->family == CHIP_KAVERI)
+		info->num_render_backends = 2;
+
 	info->clock_crystal_freq = amdinfo->gpu_counter_freq;
 	if (!info->clock_crystal_freq) {
 		fprintf(stderr, "amdgpu: clock crystal frequency is 0, timestamps will be wrong\n");
@@ -477,7 +512,7 @@
 	printf("    vce_fw_version = %u\n", info->vce_fw_version);
 	printf("    vce_harvest_config = %i\n", info->vce_harvest_config);
 
-	printf("Kernel info:\n");
+	printf("Kernel & winsys capabilities:\n");
 	printf("    drm = %i.%i.%i\n", info->drm_major,
 	       info->drm_minor, info->drm_patchlevel);
 	printf("    has_userptr = %i\n", info->has_userptr);
@@ -487,10 +522,24 @@
 	printf("    has_ctx_priority = %u\n", info->has_ctx_priority);
 	printf("    has_local_buffers = %u\n", info->has_local_buffers);
 	printf("    kernel_flushes_hdp_before_ib = %u\n", info->kernel_flushes_hdp_before_ib);
+	printf("    htile_cmask_support_1d_tiling = %u\n", info->htile_cmask_support_1d_tiling);
+	printf("    si_TA_CS_BC_BASE_ADDR_allowed = %u\n", info->si_TA_CS_BC_BASE_ADDR_allowed);
+	printf("    has_bo_metadata = %u\n", info->has_bo_metadata);
+	printf("    has_gpu_reset_status_query = %u\n", info->has_gpu_reset_status_query);
+	printf("    has_gpu_reset_counter_query = %u\n", info->has_gpu_reset_counter_query);
+	printf("    has_eqaa_surface_allocator = %u\n", info->has_eqaa_surface_allocator);
+	printf("    has_format_bc1_through_bc7 = %u\n", info->has_format_bc1_through_bc7);
+	printf("    kernel_flushes_tc_l2_after_ib = %u\n", info->kernel_flushes_tc_l2_after_ib);
+	printf("    has_indirect_compute_dispatch = %u\n", info->has_indirect_compute_dispatch);
+	printf("    has_unaligned_shader_loads = %u\n", info->has_unaligned_shader_loads);
+	printf("    has_sparse_vm_mappings = %u\n", info->has_sparse_vm_mappings);
+	printf("    has_2d_tiling = %u\n", info->has_2d_tiling);
+	printf("    has_read_registers_query = %u\n", info->has_read_registers_query);
 
 	printf("Shader core info:\n");
 	printf("    max_shader_clock = %i\n", info->max_shader_clock);
 	printf("    num_good_compute_units = %i\n", info->num_good_compute_units);
+	printf("    num_tcc_blocks = %i\n", info->num_tcc_blocks);
 	printf("    max_se = %i\n", info->max_se);
 	printf("    max_sh_per_se = %i\n", info->max_sh_per_se);
 
@@ -550,3 +599,235 @@
 		       G_0098F8_NUM_LOWER_PIPES(info->gb_addr_config));
 	}
 }
+/* Map a chip to its GS table depth: -1 for GFX9 and newer, otherwise 16 or 32 depending on the family. */
+int
+ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family)
+{
+	if (chip_class >= GFX9)
+		return -1;
+
+	switch (family) {
+	case CHIP_OLAND:
+	case CHIP_HAINAN:
+	case CHIP_KAVERI:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+	case CHIP_ICELAND:
+	case CHIP_CARRIZO:
+	case CHIP_STONEY:
+		return 16;
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+	case CHIP_VERDE:
+	case CHIP_BONAIRE:
+	case CHIP_HAWAII:
+	case CHIP_TONGA:
+	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+		return 32;
+	default:
+		unreachable("Unknown GPU"); /* pre-GFX9 family that appears in neither list above */
+	}
+}
+/* Pick raster_config / raster_config_1 for info->family and store them through the two output pointers. */
+void
+ac_get_raster_config(struct radeon_info *info,
+		     uint32_t *raster_config_p,
+		     uint32_t *raster_config_1_p)
+{
+	unsigned raster_config, raster_config_1;
+	/* Per-family defaults; the two quirks after the switch can still override them. */
+	switch (info->family) {
+	/* 1 SE / 1 RB */
+	case CHIP_HAINAN:
+	case CHIP_KABINI:
+	case CHIP_MULLINS:
+	case CHIP_STONEY:
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 4 RBs */
+	case CHIP_VERDE:
+		raster_config = 0x0000124a;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 2 RBs (Oland is special) */
+	case CHIP_OLAND:
+		raster_config = 0x00000082;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 1 SE / 2 RBs */
+	case CHIP_KAVERI:
+	case CHIP_ICELAND:
+	case CHIP_CARRIZO:
+		raster_config = 0x00000002;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 2 SEs / 4 RBs */
+	case CHIP_BONAIRE:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 2 SEs / 8 RBs */
+	case CHIP_TAHITI:
+	case CHIP_PITCAIRN:
+		raster_config = 0x2a00126a;
+		raster_config_1 = 0x00000000;
+		break;
+	/* 4 SEs / 8 RBs */
+	case CHIP_TONGA:
+	case CHIP_POLARIS10:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	/* 4 SEs / 16 RBs */
+	case CHIP_HAWAII:
+	case CHIP_FIJI:
+	case CHIP_VEGAM:
+		raster_config = 0x3a00161a;
+		raster_config_1 = 0x0000002e;
+		break;
+	default:
+		fprintf(stderr,
+			"ac: Unknown GPU, using 0 for raster_config\n");
+		raster_config = 0x00000000;
+		raster_config_1 = 0x00000000;
+		break;
+	}
+
+	/* drm/radeon on Kaveri is buggy, so disable 1 RB to work around it.
+	 * This decreases performance by up to 50% when the RB is the bottleneck.
+	 */
+	if (info->family == CHIP_KAVERI && info->drm_major == 2)
+		raster_config = 0x00000000;
+
+	/* Fiji: Old kernels have incorrect tiling config. This decreases
+	 * RB performance by 25%. (it disables 1 RB in the second packer)
+	 */
+	if (info->family == CHIP_FIJI &&
+	    info->cik_macrotile_mode_array[0] == 0x000000e8) {
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+	}
+
+	*raster_config_p = raster_config;
+	*raster_config_1_p = raster_config_1;
+}
+/* Patch raster configs for RBs missing from info->enabled_rb_mask: may update *cik_raster_config_1_p, writes raster_config_se[se] per SE. */
+void
+ac_get_harvested_configs(struct radeon_info *info,
+			 unsigned raster_config,
+			 unsigned *cik_raster_config_1_p,
+			 unsigned *raster_config_se)
+{
+	unsigned sh_per_se = MAX2(info->max_sh_per_se, 1);
+	unsigned num_se = MAX2(info->max_se, 1);
+	unsigned rb_mask = info->enabled_rb_mask;
+	unsigned num_rb = MIN2(info->num_render_backends, 16);
+	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
+	unsigned rb_per_se = num_rb / num_se;
+	unsigned se_mask[4];
+	unsigned se;
+	/* NOTE(review): se_mask[1..3] shift the previous, already-masked entry rather than a fresh bit range — confirm this is intended. */
+	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
+	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
+	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
+	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
+
+	assert(num_se == 1 || num_se == 2 || num_se == 4);
+	assert(sh_per_se == 1 || sh_per_se == 2);
+	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
+
+	/* CIK+ with more than 2 SEs: when a whole SE pair has an empty mask, repoint SE_PAIR_MAP in raster_config_1. */
+	if (info->chip_class >= CIK) {
+		unsigned raster_config_1 = *cik_raster_config_1_p;
+		if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
+				     (!se_mask[2] && !se_mask[3]))) {
+			raster_config_1 &= C_028354_SE_PAIR_MAP;
+
+			if (!se_mask[0] && !se_mask[1]) {
+				raster_config_1 |=
+					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
+			} else {
+				raster_config_1 |=
+					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
+			}
+			*cik_raster_config_1_p = raster_config_1;
+		}
+	}
+	/* One raster_config per SE, patching SE_MAP, PKR_MAP and RB_MAP_PKR0/1 wherever one side of a pair has no RBs. */
+	for (se = 0; se < num_se; se++) {
+		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
+		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
+		int idx = (se / 2) * 2; /* index of the first SE in this SE's pair */
+
+		raster_config_se[se] = raster_config;
+		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
+			raster_config_se[se] &= C_028350_SE_MAP;
+
+			if (!se_mask[idx]) {
+				raster_config_se[se] |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
+			} else {
+				raster_config_se[se] |=
+					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
+			}
+		}
+
+		pkr0_mask &= rb_mask;
+		pkr1_mask &= rb_mask;
+		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
+			raster_config_se[se] &= C_028350_PKR_MAP;
+
+			if (!pkr0_mask) {
+				raster_config_se[se] |=
+					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
+			} else {
+				raster_config_se[se] |=
+					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
+			}
+		}
+
+		if (rb_per_se >= 2) {
+			unsigned rb0_mask = 1 << (se * rb_per_se);
+			unsigned rb1_mask = rb0_mask << 1;
+
+			rb0_mask &= rb_mask;
+			rb1_mask &= rb_mask;
+			if (!rb0_mask || !rb1_mask) {
+				raster_config_se[se] &= C_028350_RB_MAP_PKR0;
+
+				if (!rb0_mask) {
+					raster_config_se[se] |=
+						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
+				} else {
+					raster_config_se[se] |=
+						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
+				}
+			}
+
+			if (rb_per_se > 2) {
+				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
+				rb1_mask = rb0_mask << 1;
+				rb0_mask &= rb_mask;
+				rb1_mask &= rb_mask;
+				if (!rb0_mask || !rb1_mask) {
+					raster_config_se[se] &= C_028350_RB_MAP_PKR1;
+
+					if (!rb0_mask) {
+						raster_config_se[se] |=
+							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
+					} else {
+						raster_config_se[se] |=
+							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
+					}
+				}
+			}
+		}
+	}
+}
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h
index 9857cd0..f6e09d2 100644
--- a/src/amd/common/ac_gpu_info.h
+++ b/src/amd/common/ac_gpu_info.h
@@ -86,7 +86,7 @@
 	uint32_t                    vce_fw_version;
 	uint32_t                    vce_harvest_config;
 
-	/* Kernel info. */
+	/* Kernel & winsys capabilities. */
 	uint32_t                    drm_major; /* version */
 	uint32_t                    drm_minor;
 	uint32_t                    drm_patchlevel;
@@ -97,11 +97,25 @@
 	bool                        has_ctx_priority;
 	bool                        has_local_buffers;
 	bool                        kernel_flushes_hdp_before_ib;
+	bool                        htile_cmask_support_1d_tiling;
+	bool                        si_TA_CS_BC_BASE_ADDR_allowed;
+	bool                        has_bo_metadata;
+	bool                        has_gpu_reset_status_query;
+	bool                        has_gpu_reset_counter_query;
+	bool                        has_eqaa_surface_allocator;
+	bool                        has_format_bc1_through_bc7;
+	bool                        kernel_flushes_tc_l2_after_ib;
+	bool                        has_indirect_compute_dispatch;
+	bool                        has_unaligned_shader_loads;
+	bool                        has_sparse_vm_mappings;
+	bool                        has_2d_tiling;
+	bool                        has_read_registers_query;
 
 	/* Shader cores. */
 	uint32_t                    r600_max_quad_pipes; /* wave size / 16 */
 	uint32_t                    max_shader_clock;
 	uint32_t                    num_good_compute_units;
+	uint32_t                    num_tcc_blocks;
 	uint32_t                    max_se; /* shader engines */
 	uint32_t                    max_sh_per_se; /* shader arrays per shader engine */
 
@@ -131,6 +145,29 @@
 
 void ac_compute_device_uuid(struct radeon_info *info, char *uuid, size_t size);
 void ac_print_gpu_info(struct radeon_info *info);
+int ac_get_gs_table_depth(enum chip_class chip_class, enum radeon_family family);
+void ac_get_raster_config(struct radeon_info *info,
+			  uint32_t *raster_config_p,
+			  uint32_t *raster_config_1_p);
+void ac_get_harvested_configs(struct radeon_info *info,
+			      unsigned raster_config,
+			      unsigned *cik_raster_config_1_p,
+			      unsigned *raster_config_se);
+
+static inline unsigned ac_get_max_simd_waves(enum radeon_family family)
+{
+	/* Returns 8 for the four families listed below and 10 for every other family. */
+	switch (family) {
+	/* These always have 8 waves: */
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
+	case CHIP_POLARIS12:
+	case CHIP_VEGAM:
+		return 8;
+	default:
+		return 10;
+	}
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 8c2bc47..c85d281 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -57,15 +57,15 @@
  * The caller is responsible for initializing ctx::module and ctx::builder.
  */
 void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
 		     enum chip_class chip_class, enum radeon_family family)
 {
 	LLVMValueRef args[1];
 
+	ctx->context = LLVMContextCreate();
+
 	ctx->chip_class = chip_class;
 	ctx->family = family;
-
-	ctx->context = context;
 	ctx->module = NULL;
 	ctx->builder = NULL;
 
@@ -175,6 +175,8 @@
 	switch (kind) {
 	case LLVMIntegerTypeKind:
 		return LLVMGetIntTypeWidth(type) / 8;
+	case LLVMHalfTypeKind:
+		return 2;
 	case LLVMFloatTypeKind:
 		return 4;
 	case LLVMDoubleTypeKind:
@@ -320,6 +322,9 @@
 	case LLVMIntegerTypeKind:
 		snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
 		break;
+	case LLVMHalfTypeKind:
+		snprintf(buf, bufsize, "f16");
+		break;
 	case LLVMFloatTypeKind:
 		snprintf(buf, bufsize, "f32");
 		break;
@@ -510,6 +515,43 @@
 	return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
 }
 
+/* Expand a scalar or vector to <dst_channels x type>: take at most src_channels
+ * components from the input and fill the remaining channels with undef.
+ * NOTE(review): nothing clamps src_channels to dst_channels (the size of chan[]) — confirm callers keep src <= dst. */
+LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx,
+			     LLVMValueRef value,
+			     unsigned src_channels,
+			     unsigned dst_channels)
+{
+	LLVMTypeRef elemtype;
+	LLVMValueRef chan[dst_channels];
+
+	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+		unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+		/* Input already has exactly the requested width: hand it back unchanged. */
+		if (src_channels == dst_channels && vec_size == dst_channels)
+			return value;
+
+		src_channels = MIN2(src_channels, vec_size);
+
+		for (unsigned i = 0; i < src_channels; i++)
+			chan[i] = ac_llvm_extract_elem(ctx, value, i);
+
+		elemtype = LLVMGetElementType(LLVMTypeOf(value));
+	} else {
+		if (src_channels) {
+			assert(src_channels == 1);
+			chan[0] = value;
+		}
+		elemtype = LLVMTypeOf(value);
+	}
+	/* Pad channels [src_channels, dst_channels) with undef of the element type. */
+	for (unsigned i = src_channels; i < dst_channels; i++)
+		chan[i] = LLVMGetUndef(elemtype);
+
+	return ac_build_gather_values(ctx, chan, dst_channels);
+}
+
 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
  * with undef. Extract at most num_channels components from the input.
  */
@@ -517,32 +559,7 @@
 				     LLVMValueRef value,
 				     unsigned num_channels)
 {
-	LLVMTypeRef elemtype;
-	LLVMValueRef chan[4];
-
-	if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
-		unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
-		num_channels = MIN2(num_channels, vec_size);
-
-		if (num_channels >= 4)
-			return value;
-
-		for (unsigned i = 0; i < num_channels; i++)
-			chan[i] = ac_llvm_extract_elem(ctx, value, i);
-
-		elemtype = LLVMGetElementType(LLVMTypeOf(value));
-	} else {
-		if (num_channels) {
-			assert(num_channels == 1);
-			chan[0] = value;
-		}
-		elemtype = LLVMTypeOf(value);
-	}
-
-	while (num_channels < 4)
-		chan[num_channels++] = LLVMGetUndef(elemtype);
-
-	return ac_build_gather_values(ctx, chan, 4);
+	return ac_build_expand(ctx, value, num_channels, 4);
 }
 
 LLVMValueRef
@@ -550,7 +567,15 @@
 	      LLVMValueRef num,
 	      LLVMValueRef den)
 {
-	LLVMValueRef ret = LLVMBuildFDiv(ctx->builder, num, den, "");
+	/* If we do (num / den), LLVM >= 7.0 does:
+	 *    return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
+	 *
+	 * If we do (num * (1 / den)), LLVM does:
+	 *    return num * v_rcp_f32(den);
+	 */
+	LLVMValueRef one = LLVMTypeOf(num) == ctx->f64 ? ctx->f64_1 : ctx->f32_1;
+	LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, one, den, "");
+	LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
 
 	/* Use v_rcp_f32 instead of precise division. */
 	if (!LLVMIsConstant(ret))
@@ -888,37 +913,36 @@
 			    bool writeonly_memory,
 			    bool swizzle_enable_hint)
 {
+	/* Split 3 channel stores, because LLVM doesn't support 3-channel
+	 * intrinsics. */
+	if (num_channels == 3) {
+		LLVMValueRef v[3], v01;
+
+		for (int i = 0; i < 3; i++) {
+			v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
+					LLVMConstInt(ctx->i32, i, 0), "");
+		}
+		v01 = ac_build_gather_values(ctx, v, 2);
+
+		ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
+					    soffset, inst_offset, glc, slc,
+					    writeonly_memory, swizzle_enable_hint);
+		ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
+					    soffset, inst_offset + 8,
+					    glc, slc,
+					    writeonly_memory, swizzle_enable_hint);
+		return;
+	}
+
 	/* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
 	 * (voffset is swizzled, but soffset isn't swizzled).
 	 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
 	 */
 	if (!swizzle_enable_hint) {
-		/* Split 3 channel stores, becase LLVM doesn't support 3-channel
-		 * intrinsics. */
-		if (num_channels == 3) {
-			LLVMValueRef v[3], v01;
-
-			for (int i = 0; i < 3; i++) {
-				v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
-						LLVMConstInt(ctx->i32, i, 0), "");
-			}
-			v01 = ac_build_gather_values(ctx, v, 2);
-
-			ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
-						    soffset, inst_offset, glc, slc,
-						    writeonly_memory, swizzle_enable_hint);
-			ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
-						    soffset, inst_offset + 8,
-						    glc, slc,
-						    writeonly_memory, swizzle_enable_hint);
-			return;
-		}
-
-		unsigned func = CLAMP(num_channels, 1, 3) - 1;
-		static const char *types[] = {"f32", "v2f32", "v4f32"};
-		char name[256];
 		LLVMValueRef offset = soffset;
 
+		static const char *types[] = {"f32", "v2f32", "v4f32"};
+
 		if (inst_offset)
 			offset = LLVMBuildAdd(ctx->builder, offset,
 					      LLVMConstInt(ctx->i32, inst_offset, 0), "");
@@ -934,53 +958,46 @@
 			LLVMConstInt(ctx->i1, slc, 0),
 		};
 
+		char name[256];
 		snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
-			 types[func]);
+			 types[CLAMP(num_channels, 1, 3) - 1]);
 
 		ac_build_intrinsic(ctx, name, ctx->voidt,
 				   args, ARRAY_SIZE(args),
 				   writeonly_memory ?
-					   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
-					   AC_FUNC_ATTR_WRITEONLY);
+				   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+				   AC_FUNC_ATTR_WRITEONLY);
 		return;
 	}
 
-	static unsigned dfmt[] = {
+	static const unsigned dfmt[] = {
 		V_008F0C_BUF_DATA_FORMAT_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32_32,
 		V_008F0C_BUF_DATA_FORMAT_32_32_32_32
 	};
-	assert(num_channels >= 1 && num_channels <= 4);
-
+	static const char *types[] = {"i32", "v2i32", "v4i32"};
 	LLVMValueRef args[] = {
-		rsrc,
 		vdata,
-		LLVMConstInt(ctx->i32, num_channels, 0),
-		voffset ? voffset : LLVMGetUndef(ctx->i32),
+		LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
+		LLVMConstInt(ctx->i32, 0, 0),
+		voffset ? voffset : LLVMConstInt(ctx->i32, 0, 0),
 		soffset,
 		LLVMConstInt(ctx->i32, inst_offset, 0),
 		LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
 		LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
-		LLVMConstInt(ctx->i32, voffset != NULL, 0),
-		LLVMConstInt(ctx->i32, 0, 0), /* idxen */
-		LLVMConstInt(ctx->i32, glc, 0),
-		LLVMConstInt(ctx->i32, slc, 0),
-		LLVMConstInt(ctx->i32, 0, 0), /* tfe*/
+		LLVMConstInt(ctx->i1, glc, 0),
+		LLVMConstInt(ctx->i1, slc, 0),
 	};
-
-	/* The instruction offset field has 12 bits */
-	assert(voffset || inst_offset < (1 << 12));
-
-	/* The intrinsic is overloaded, we need to add a type suffix for overloading to work. */
-	unsigned func = CLAMP(num_channels, 1, 3) - 1;
-	const char *types[] = {"i32", "v2i32", "v4i32"};
 	char name[256];
-	snprintf(name, sizeof(name), "llvm.SI.tbuffer.store.%s", types[func]);
+	snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
+		 types[CLAMP(num_channels, 1, 3) - 1]);
 
 	ac_build_intrinsic(ctx, name, ctx->voidt,
 			   args, ARRAY_SIZE(args),
-			   AC_FUNC_ATTR_LEGACY);
+			   writeonly_memory ?
+				   AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
+				   AC_FUNC_ATTR_WRITEONLY);
 }
 
 static LLVMValueRef
@@ -1106,6 +1123,31 @@
 	                                   can_speculate, true);
 }
 
+LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+			    LLVMValueRef rsrc,
+			    LLVMValueRef vindex,
+			    LLVMValueRef voffset,
+				LLVMValueRef soffset,
+				LLVMValueRef immoffset)
+{
+	const char *name = "llvm.amdgcn.tbuffer.load.i32"; /* 16-bit load: issued as an i32 tbuffer load, truncated to i16 below */
+	LLVMTypeRef type = ctx->i32;
+	LLVMValueRef params[] = {
+				rsrc,
+				vindex,
+				voffset,
+				soffset,
+				immoffset,
+				LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false), /* data format */
+				LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false), /* number format */
+				ctx->i1false, /* presumably glc — TODO confirm against the intrinsic signature */
+				ctx->i1false, /* presumably slc — TODO confirm against the intrinsic signature */
+	};
+	LLVMValueRef res = ac_build_intrinsic(ctx, name, type, params, 9, 0); /* 9 == number of entries in params[] */
+	return LLVMBuildTrunc(ctx->builder, res, ctx->i16, ""); /* keep only the low 16 bits of the i32 result */
+}
+
 /**
  * Set range metadata on an instruction.  This can only be used on load and
  * call instructions.  If you know an instruction can only produce the values
@@ -1178,7 +1220,21 @@
 	LLVMValueRef tl, trbl, args[2];
 	LLVMValueRef result;
 
-	if (ctx->chip_class >= VI) {
+	if (HAVE_LLVM >= 0x0700) {
+		unsigned tl_lanes[4], trbl_lanes[4];
+
+		for (unsigned i = 0; i < 4; ++i) {
+			tl_lanes[i] = i & mask;
+			trbl_lanes[i] = (i & mask) + idx;
+		}
+
+		tl = ac_build_quad_swizzle(ctx, val,
+		                           tl_lanes[0], tl_lanes[1],
+		                           tl_lanes[2], tl_lanes[3]);
+		trbl = ac_build_quad_swizzle(ctx, val,
+		                             trbl_lanes[0], trbl_lanes[1],
+		                             trbl_lanes[2], trbl_lanes[3]);
+	} else if (ctx->chip_class >= VI) {
 		LLVMValueRef thread_id, tl_tid, trbl_tid;
 		thread_id = ac_get_thread_id(ctx);
 
@@ -1248,6 +1304,13 @@
 	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
 	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
 	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");
+
+	if (HAVE_LLVM >= 0x0700) {
+		result = ac_build_intrinsic(ctx,
+			"llvm.amdgcn.wqm.f32", ctx->f32,
+			&result, 1, 0);
+	}
+
 	return result;
 }
 
@@ -1367,66 +1430,41 @@
 
 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
 {
-	if (HAVE_LLVM >= 0x0500) {
-		return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
-				     ctx->f32_1);
-	}
-
-	LLVMValueRef args[3] = {
-		value,
-		LLVMConstReal(ctx->f32, 0),
-		LLVMConstReal(ctx->f32, 1),
-	};
-
-	return ac_build_intrinsic(ctx, "llvm.AMDGPU.clamp.", ctx->f32, args, 3,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+	return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
+			     ctx->f32_1);
 }
 
 void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
 {
 	LLVMValueRef args[9];
 
-	if (HAVE_LLVM >= 0x0500) {
-		args[0] = LLVMConstInt(ctx->i32, a->target, 0);
-		args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
+	args[0] = LLVMConstInt(ctx->i32, a->target, 0);
+	args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
 
-		if (a->compr) {
-			LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
-			LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
+	if (a->compr) {
+		LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
+		LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);
 
-			args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
-						   v2i16, "");
-			args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
-						   v2i16, "");
-			args[4] = LLVMConstInt(ctx->i1, a->done, 0);
-			args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+		args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
+				v2i16, "");
+		args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
+				v2i16, "");
+		args[4] = LLVMConstInt(ctx->i1, a->done, 0);
+		args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
 
-			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
-					   ctx->voidt, args, 6, 0);
-		} else {
-			args[2] = a->out[0];
-			args[3] = a->out[1];
-			args[4] = a->out[2];
-			args[5] = a->out[3];
-			args[6] = LLVMConstInt(ctx->i1, a->done, 0);
-			args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
+		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
+				   ctx->voidt, args, 6, 0);
+	} else {
+		args[2] = a->out[0];
+		args[3] = a->out[1];
+		args[4] = a->out[2];
+		args[5] = a->out[3];
+		args[6] = LLVMConstInt(ctx->i1, a->done, 0);
+		args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);
 
-			ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
-					   ctx->voidt, args, 8, 0);
-		}
-		return;
+		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
+				   ctx->voidt, args, 8, 0);
 	}
-
-	args[0] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);
-	args[1] = LLVMConstInt(ctx->i32, a->valid_mask, 0);
-	args[2] = LLVMConstInt(ctx->i32, a->done, 0);
-	args[3] = LLVMConstInt(ctx->i32, a->target, 0);
-	args[4] = LLVMConstInt(ctx->i32, a->compr, 0);
-	memcpy(args + 5, a->out, sizeof(a->out[0]) * 4);
-
-	ac_build_intrinsic(ctx, "llvm.SI.export", ctx->voidt, args, 9,
-			   AC_FUNC_ATTR_LEGACY);
 }
 
 void ac_build_export_null(struct ac_llvm_context *ctx)
@@ -1485,8 +1523,26 @@
 	}
 }
 
-LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
-				   struct ac_image_args *a)
+static const char *get_atomic_name(enum ac_atomic_op op)
+{ /* Map an ac_atomic_op value to the name fragment used in image-atomic intrinsic names. */
+	switch (op) {
+	case ac_atomic_swap: return "swap";
+	case ac_atomic_add: return "add";
+	case ac_atomic_sub: return "sub";
+	case ac_atomic_smin: return "smin";
+	case ac_atomic_umin: return "umin";
+	case ac_atomic_smax: return "smax";
+	case ac_atomic_umax: return "umax";
+	case ac_atomic_and: return "and";
+	case ac_atomic_or: return "or";
+	case ac_atomic_xor: return "xor";
+	}
+	unreachable("bad atomic op"); /* only reached when op matches none of the cases above */
+}
+
+/* LLVM 6 and older */
+static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
+						struct ac_image_args *a)
 {
 	LLVMValueRef args[16];
 	LLVMTypeRef retty = ctx->v4f32;
@@ -1494,16 +1550,6 @@
 	const char *atomic_subop = "";
 	char intr_name[128], coords_type[64];
 
-	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
-	       !a->level_zero);
-	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
-		a->opcode != ac_image_store_mip) ||
-	       a->lod);
-	assert((a->bias ? 1 : 0) +
-	       (a->lod ? 1 : 0) +
-	       (a->level_zero ? 1 : 0) +
-	       (a->derivs[0] ? 1 : 0) <= 1);
-
 	bool sample = a->opcode == ac_image_sample ||
 		      a->opcode == ac_image_gather4 ||
 		      a->opcode == ac_image_get_lod;
@@ -1615,18 +1661,7 @@
 		if (a->opcode == ac_image_atomic_cmpswap) {
 			atomic_subop = "cmpswap";
 		} else {
-			switch (a->atomic) {
-			case ac_atomic_swap: atomic_subop = "swap"; break;
-			case ac_atomic_add: atomic_subop = "add"; break;
-			case ac_atomic_sub: atomic_subop = "sub"; break;
-			case ac_atomic_smin: atomic_subop = "smin"; break;
-			case ac_atomic_umin: atomic_subop = "umin"; break;
-			case ac_atomic_smax: atomic_subop = "smax"; break;
-			case ac_atomic_umax: atomic_subop = "umax"; break;
-			case ac_atomic_and: atomic_subop = "and"; break;
-			case ac_atomic_or: atomic_subop = "or"; break;
-			case ac_atomic_xor: atomic_subop = "xor"; break;
-			}
+			atomic_subop = get_atomic_name(a->atomic);
 		}
 		break;
 	case ac_image_get_lod:
@@ -1670,22 +1705,173 @@
 	return result;
 }
 
+LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
+				   struct ac_image_args *a)
+{ /* Build the intrinsic call for the image operation described by *a and return its result. */
+	const char *overload[3] = { "", "", "" }; /* type suffixes appended to the intrinsic name */
+	unsigned num_overloads = 0;
+	LLVMValueRef args[18];
+	unsigned num_args = 0;
+	enum ac_image_dim dim = a->dim; /* local copy; may be reduced below for get_lod */
+
+	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
+	       !a->level_zero);
+	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
+		a->opcode != ac_image_store_mip) ||
+	       a->lod);
+	assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+	       (!a->compare && !a->offset));
+	assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
+		a->opcode == ac_image_get_lod) ||
+	       !a->bias);
+	assert((a->bias ? 1 : 0) +
+	       (a->lod ? 1 : 0) +
+	       (a->level_zero ? 1 : 0) +
+	       (a->derivs[0] ? 1 : 0) <= 1);
+
+	if (HAVE_LLVM < 0x0700) /* older LLVM: delegate to the legacy builder */
+		return ac_build_image_opcode_llvm6(ctx, a);
+
+	if (a->opcode == ac_image_get_lod) { /* get_lod uses the non-array dim: 1darray -> 1d, 2darray/cube -> 2d */
+		switch (dim) {
+		case ac_image_1darray:
+			dim = ac_image_1d;
+			break;
+		case ac_image_2darray:
+		case ac_image_cube:
+			dim = ac_image_2d;
+			break;
+		default:
+			break;
+		}
+	}
+
+	bool sample = a->opcode == ac_image_sample ||
+		      a->opcode == ac_image_gather4 ||
+		      a->opcode == ac_image_get_lod;
+	bool atomic = a->opcode == ac_image_atomic ||
+		      a->opcode == ac_image_atomic_cmpswap;
+	LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; /* coords/lod are bitcast to f32 for sampling ops, i32 otherwise */
+
+	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) { /* data operand(s) come first */
+		args[num_args++] = a->data[0];
+		if (a->opcode == ac_image_atomic_cmpswap)
+			args[num_args++] = a->data[1];
+	}
+
+	if (!atomic) /* dmask is passed for every non-atomic opcode */
+		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
+
+	if (a->offset)
+		args[num_args++] = ac_to_integer(ctx, a->offset);
+	if (a->bias) {
+		args[num_args++] = ac_to_float(ctx, a->bias);
+		overload[num_overloads++] = ".f32";
+	}
+	if (a->compare)
+		args[num_args++] = ac_to_float(ctx, a->compare);
+	if (a->derivs[0]) {
+		unsigned count = ac_num_derivs(dim);
+		for (unsigned i = 0; i < count; ++i)
+			args[num_args++] = ac_to_float(ctx, a->derivs[i]);
+		overload[num_overloads++] = ".f32";
+	}
+	unsigned num_coords = /* get_resinfo passes no coordinates */
+		a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
+	for (unsigned i = 0; i < num_coords; ++i)
+		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
+	if (a->lod)
+		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
+	overload[num_overloads++] = sample ? ".f32" : ".i32"; /* overload suffix matching coord_type */
+
+	args[num_args++] = a->resource;
+	if (sample) { /* sampling ops additionally pass the sampler and the unorm flag */
+		args[num_args++] = a->sampler;
+		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
+	}
+
+	args[num_args++] = ctx->i32_0; /* texfailctrl */
+	args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);
+
+	const char *name; /* base opcode part of the intrinsic name */
+	const char *atomic_subop = ""; /* stays empty for non-atomic opcodes */
+	switch (a->opcode) {
+	case ac_image_sample: name = "sample"; break;
+	case ac_image_gather4: name = "gather4"; break;
+	case ac_image_load: name = "load"; break;
+	case ac_image_load_mip: name = "load.mip"; break;
+	case ac_image_store: name = "store"; break;
+	case ac_image_store_mip: name = "store.mip"; break;
+	case ac_image_atomic:
+		name = "atomic.";
+		atomic_subop = get_atomic_name(a->atomic);
+		break;
+	case ac_image_atomic_cmpswap:
+		name = "atomic.";
+		atomic_subop = "cmpswap";
+		break;
+	case ac_image_get_lod: name = "getlod"; break;
+	case ac_image_get_resinfo: name = "getresinfo"; break;
+	default: unreachable("invalid image opcode");
+	}
+
+	const char *dimname; /* dimension part of the intrinsic name */
+	switch (dim) {
+	case ac_image_1d: dimname = "1d"; break;
+	case ac_image_2d: dimname = "2d"; break;
+	case ac_image_3d: dimname = "3d"; break;
+	case ac_image_cube: dimname = "cube"; break;
+	case ac_image_1darray: dimname = "1darray"; break;
+	case ac_image_2darray: dimname = "2darray"; break;
+	case ac_image_2dmsaa: dimname = "2dmsaa"; break;
+	case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
+	default: unreachable("invalid dim");
+	}
+
+	bool lod_suffix = /* ".l" is only emitted for sample/gather4 with a lod operand */
+		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
+	char intr_name[96];
+	snprintf(intr_name, sizeof(intr_name),
+		 "llvm.amdgcn.image.%s%s" /* base name */
+		 "%s%s%s" /* sample/gather modifiers */
+		 ".%s.%s%s%s%s", /* dimension and type overloads */
+		 name, atomic_subop,
+		 a->compare ? ".c" : "",
+		 a->bias ? ".b" :
+		 lod_suffix ? ".l" :
+		 a->derivs[0] ? ".d" :
+		 a->level_zero ? ".lz" : "",
+		 a->offset ? ".o" : "",
+		 dimname,
+		 atomic ? "i32" : "v4f32",
+		 overload[0], overload[1], overload[2]);
+
+	LLVMTypeRef retty; /* i32 for atomics, void for stores, v4f32 otherwise */
+	if (atomic)
+		retty = ctx->i32;
+	else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+		retty = ctx->voidt;
+	else
+		retty = ctx->v4f32;
+
+	LLVMValueRef result =
+		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
+				   a->attributes);
+	if (!sample && retty == ctx->v4f32) { /* non-sampling v4f32 results are handed back as v4i32 */
+		result = LLVMBuildBitCast(ctx->builder, result,
+					  ctx->v4i32, "");
+	}
+	return result;
+}
+
 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
 				    LLVMValueRef args[2])
 {
-	if (HAVE_LLVM >= 0x0500) {
-		LLVMTypeRef v2f16 =
-			LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
-		LLVMValueRef res =
-			ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz",
-					   v2f16, args, 2,
-					   AC_FUNC_ATTR_READNONE);
-		return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
-	}
+	LLVMTypeRef v2f16 =
+		LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
 
-	return ac_build_intrinsic(ctx, "llvm.SI.packf16", ctx->i32, args, 2,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+	return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
+				  args, 2, AC_FUNC_ATTR_READNONE);
 }
 
 /* Upper 16 bits must be zero. */
@@ -1869,20 +2055,11 @@
 		width,
 	};
 
-	if (HAVE_LLVM >= 0x0500) {
-		return ac_build_intrinsic(ctx,
-					  is_signed ? "llvm.amdgcn.sbfe.i32" :
-						      "llvm.amdgcn.ubfe.i32",
-					  ctx->i32, args, 3,
-					  AC_FUNC_ATTR_READNONE);
-	}
-
 	return ac_build_intrinsic(ctx,
-				  is_signed ? "llvm.AMDGPU.bfe.i32" :
-					      "llvm.AMDGPU.bfe.u32",
+				  is_signed ? "llvm.amdgcn.sbfe.i32" :
+					      "llvm.amdgcn.ubfe.i32",
 				  ctx->i32, args, 3,
-				  AC_FUNC_ATTR_READNONE |
-				  AC_FUNC_ATTR_LEGACY);
+				  AC_FUNC_ATTR_READNONE);
 }
 
 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
@@ -1962,9 +2139,9 @@
 	return val;
 }
 
-#define AC_EXP_TARGET (HAVE_LLVM >= 0x0500 ? 0 : 3)
-#define AC_EXP_ENABLED_CHANNELS (HAVE_LLVM >= 0x0500 ? 1 : 0)
-#define AC_EXP_OUT0 (HAVE_LLVM >= 0x0500 ? 2 : 5)
+#define AC_EXP_TARGET		0
+#define AC_EXP_ENABLED_CHANNELS 1
+#define AC_EXP_OUT0		2
 
 enum ac_ir_type {
 	AC_IR_UNDEF,
@@ -2618,11 +2795,13 @@
 	final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
 				    LLVMConstInt(ac->i32, 4, 0), "");
 	final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
+	/* Mask the sample index by 0x7, because 0x8 means an unknown value
+	 * with EQAA, so those will map to 0. */
 	final_sample = LLVMBuildAnd(ac->builder, final_sample,
-				    LLVMConstInt(ac->i32, 0xF, 0), "");
+				    LLVMConstInt(ac->i32, 0x7, 0), "");
 
 	/* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
-	 * resource descriptor is 0 (invalid),
+	 * resource descriptor is 0 (invalid).
 	 */
 	LLVMValueRef tmp;
 	tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index 026955a..92d72ae 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -97,7 +97,7 @@
 };
 
 void
-ac_llvm_context_init(struct ac_llvm_context *ctx, LLVMContextRef context,
+ac_llvm_context_init(struct ac_llvm_context *ctx,
 		     enum chip_class chip_class, enum radeon_family family);
 
 void
@@ -161,6 +161,9 @@
 ac_build_gather_values(struct ac_llvm_context *ctx,
 		       LLVMValueRef *values,
 		       unsigned value_count);
+LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx,
+			     LLVMValueRef value,
+			     unsigned src_channels, unsigned dst_channels);
 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
 				     LLVMValueRef value,
 				     unsigned num_channels);
@@ -253,6 +256,14 @@
                                                   bool can_speculate);
 
 LLVMValueRef
+ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
+			    LLVMValueRef rsrc,
+			    LLVMValueRef vindex,
+			    LLVMValueRef voffset,
+				LLVMValueRef soffset,
+				LLVMValueRef immoffset);
+
+LLVMValueRef
 ac_get_thread_id(struct ac_llvm_context *ctx);
 
 #define AC_TID_MASK_TOP_LEFT 0xfffffffc
diff --git a/src/amd/common/ac_llvm_helper.cpp b/src/amd/common/ac_llvm_helper.cpp
index 1d2369b..a4b2fde 100644
--- a/src/amd/common/ac_llvm_helper.cpp
+++ b/src/amd/common/ac_llvm_helper.cpp
@@ -29,30 +29,24 @@
 #pragma push_macro("DEBUG")
 #undef DEBUG
 
+#include "ac_binary.h"
 #include "ac_llvm_util.h"
-#include <llvm-c/Core.h>
-#include <llvm/Target/TargetOptions.h>
-#include <llvm/ExecutionEngine/ExecutionEngine.h>
-#include <llvm/IR/Attributes.h>
-#include <llvm/IR/CallSite.h>
-#include <llvm/IR/IRBuilder.h>
 
-#if HAVE_LLVM < 0x0500
-namespace llvm {
-typedef AttributeSet AttributeList;
-}
+#include <llvm-c/Core.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/Analysis/TargetLibraryInfo.h>
+#include <llvm/Transforms/IPO.h>
+
+#include <llvm/IR/LegacyPassManager.h>
+#if HAVE_LLVM < 0x0700
+#include "llvm/Support/raw_ostream.h"
 #endif
 
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes)
 {
    llvm::Argument *A = llvm::unwrap<llvm::Argument>(val);
-#if HAVE_LLVM < 0x0500
-   llvm::AttrBuilder B;
-   B.addDereferenceableAttr(bytes);
-   A->addAttr(llvm::AttributeList::get(A->getContext(), A->getArgNo() + 1,  B));
-#else
    A->addAttr(llvm::Attribute::getWithDereferenceableBytes(A->getContext(), bytes));
-#endif
 }
 
 bool ac_is_sgpr_param(LLVMValueRef arg)
@@ -73,6 +67,16 @@
 	return LLVMGetValueKind(v) == LLVMFunctionValueKind;
 }
 
+LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx)
+{ /* Create a module named "mesa-shader" in ctx, configured from the target machine tm. */
+   llvm::TargetMachine *TM = reinterpret_cast<llvm::TargetMachine*>(tm);
+   LLVMModuleRef module = LLVMModuleCreateWithNameInContext("mesa-shader", ctx);
+
+   llvm::unwrap(module)->setTargetTriple(TM->getTargetTriple().getTriple()); /* copy the target triple from TM */
+   llvm::unwrap(module)->setDataLayout(TM->createDataLayout()); /* data layout is taken from TM as well */
+   return module;
+}
+
 LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
 				 enum ac_float_mode float_mode)
 {
@@ -99,3 +103,78 @@
 
 	return builder;
 }
+
+LLVMTargetLibraryInfoRef
+ac_create_target_library_info(const char *triple)
+{ /* Allocate a TargetLibraryInfoImpl for triple with new and return it as an opaque C handle. */
+	return reinterpret_cast<LLVMTargetLibraryInfoRef>(new llvm::TargetLibraryInfoImpl(llvm::Triple(triple)));
+}
+
+void
+ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info)
+{ /* Delete the handle, treating it as the TargetLibraryInfoImpl it is cast back to. */
+	delete reinterpret_cast<llvm::TargetLibraryInfoImpl *>(library_info);
+}
+
+/* The LLVM compiler is represented as a pass manager containing passes for
+ * optimizations, instruction selection, and code generation.
+ */
+struct ac_compiler_passes {
+	ac_compiler_passes(): ostream(code_string) {} /* ostream is constructed over code_string */
+
+	llvm::SmallString<0> code_string;  /* ELF shader binary */
+	llvm::raw_svector_ostream ostream; /* stream for appending data to the binary */
+	llvm::legacy::PassManager passmgr; /* list of passes */
+};
+
+struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm)
+{ /* Allocate an ac_compiler_passes and add tm's object-file emission passes to it; NULL on failure. */
+	struct ac_compiler_passes *p = new ac_compiler_passes();
+	if (!p) /* NOTE(review): plain new normally throws rather than returning NULL -- this check likely never fires */
+		return NULL;
+
+	llvm::TargetMachine *TM = reinterpret_cast<llvm::TargetMachine*>(tm);
+
+	if (TM->addPassesToEmitFile(p->passmgr, p->ostream, /* a true result is treated as failure below */
+#if HAVE_LLVM >= 0x0700
+				    nullptr, /* extra argument passed only when building against LLVM >= 7 */
+#endif
+				    llvm::TargetMachine::CGFT_ObjectFile)) {
+		fprintf(stderr, "amd: TargetMachine can't emit a file of this type!\n");
+		delete p; /* do not leak the partially set-up object */
+		return NULL;
+	}
+	return p;
+}
+
+void ac_destroy_llvm_passes(struct ac_compiler_passes *p)
+{ /* Release an object obtained from ac_create_llvm_passes(). */
+	delete p;
+}
+
+/* Run the passes in p on module and parse the emitted ELF into binary. This returns false on failure. */
+bool ac_compile_module_to_binary(struct ac_compiler_passes *p, LLVMModuleRef module,
+				 struct ac_shader_binary *binary)
+{
+	p->passmgr.run(*llvm::unwrap(module)); /* output is written through p->ostream */
+
+	llvm::StringRef data = p->ostream.str();
+	bool success = ac_elf_read(data.data(), data.size(), binary);
+	p->code_string = ""; /* release the ELF shader binary */
+
+	if (!success)
+		fprintf(stderr, "amd: cannot read an ELF shader binary\n");
+	return success;
+}
+
+void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr)
+{ /* C-callable wrapper: append LLVM's barrier no-op pass to passmgr. */
+	llvm::unwrap(passmgr)->add(llvm::createBarrierNoopPass());
+}
+
+void ac_enable_global_isel(LLVMTargetMachineRef tm)
+{ /* Turn on global isel for tm; the body is compiled out (no-op) when HAVE_LLVM < 0x0700. */
+#if HAVE_LLVM >= 0x0700
+  reinterpret_cast<llvm::TargetMachine*>(tm)->setGlobalISel(true);
+#endif
+}
diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c
index 5b52381..cd35251 100644
--- a/src/amd/common/ac_llvm_util.c
+++ b/src/amd/common/ac_llvm_util.c
@@ -28,7 +28,13 @@
 #include "util/bitscan.h"
 #include <llvm-c/Core.h>
 #include <llvm-c/Support.h>
+#include <llvm-c/Transforms/IPO.h>
+#include <llvm-c/Transforms/Scalar.h>
+#if HAVE_LLVM >= 0x0700
+#include <llvm-c/Transforms/Utils.h>
+#endif
 #include "c11/threads.h"
+#include "gallivm/lp_bld_misc.h"
 #include "util/u_math.h"
 
 #include <assert.h>
@@ -50,20 +56,27 @@
 	 * https://reviews.llvm.org/D26348
 	 *
 	 * "mesa" is the prefix for error messages.
+	 *
+	 * -global-isel-abort=2 is a no-op unless global isel has been enabled.
+	 * This option tells the backend to fall-back to SelectionDAG and print
+	 * a diagnostic message if global isel fails.
 	 */
-	const char *argv[2] = { "mesa", "-simplifycfg-sink-common=false" };
-	LLVMParseCommandLineOptions(2, argv, NULL);
+	const char *argv[3] = { "mesa", "-simplifycfg-sink-common=false", "-global-isel-abort=2" };
+	LLVMParseCommandLineOptions(3, argv, NULL);
 }
 
 static once_flag ac_init_llvm_target_once_flag = ONCE_FLAG_INIT;
 
-LLVMTargetRef ac_get_llvm_target(const char *triple)
+void ac_init_llvm_once(void)
+{ /* Run ac_init_llvm_target() through call_once, guarded by a file-scope once_flag. */
+	call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target);
+}
+
+static LLVMTargetRef ac_get_llvm_target(const char *triple)
 {
 	LLVMTargetRef target = NULL;
 	char *err_message = NULL;
 
-	call_once(&ac_init_llvm_target_once_flag, ac_init_llvm_target);
-
 	if (LLVMGetTargetFromTriple(triple, &target, &err_message)) {
 		fprintf(stderr, "Cannot find target for triple %s ", triple);
 		if (err_message) {
@@ -115,15 +128,22 @@
 	case CHIP_VEGAM:
 		return "polaris11";
 	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_RAVEN:
 		return "gfx900";
+	case CHIP_RAVEN:
+		return "gfx902";
+	case CHIP_VEGA12:
+		return HAVE_LLVM >= 0x0700 ? "gfx904" : "gfx902";
+	case CHIP_VEGA20:
+		return HAVE_LLVM >= 0x0700 ? "gfx906" : "gfx902";
 	default:
 		return "";
 	}
 }
 
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options)
+static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family,
+						     enum ac_target_machine_options tm_options,
+						     LLVMCodeGenOptLevel level,
+						     const char **out_triple)
 {
 	assert(family >= CHIP_TAHITI);
 	char features[256];
@@ -142,13 +162,50 @@
 	                             triple,
 	                             ac_get_llvm_processor_name(family),
 				     features,
-	                             LLVMCodeGenLevelDefault,
+	                             level,
 	                             LLVMRelocDefault,
 	                             LLVMCodeModelDefault);
 
+	if (out_triple)
+		*out_triple = triple;
+	if (tm_options & AC_TM_ENABLE_GLOBAL_ISEL)
+		ac_enable_global_isel(tm);
 	return tm;
 }
 
+static LLVMPassManagerRef ac_create_passmgr(LLVMTargetLibraryInfoRef target_library_info,
+					    bool check_ir)
+{ /* Create a pass manager and populate it with the fixed pass list below; NULL if creation fails. */
+	LLVMPassManagerRef passmgr = LLVMCreatePassManager();
+	if (!passmgr)
+		return NULL;
+
+	if (target_library_info) /* optional: callers may pass NULL */
+		LLVMAddTargetLibraryInfo(target_library_info,
+					 passmgr);
+
+	if (check_ir) /* the IR verifier is added only on request */
+		LLVMAddVerifierPass(passmgr);
+	LLVMAddAlwaysInlinerPass(passmgr);
+	/* Normally, the pass manager runs all passes on one function before
+	 * moving onto another. Adding a barrier no-op pass forces the pass
+	 * manager to run the inliner on all functions first, which makes sure
+	 * that the following passes are only run on the remaining non-inline
+	 * functions, so it removes useless work done on dead inline functions.
+	 */
+	ac_llvm_add_barrier_noop_pass(passmgr);
+	/* This pass should eliminate all the load and store instructions. */
+	LLVMAddPromoteMemoryToRegisterPass(passmgr);
+	LLVMAddScalarReplAggregatesPass(passmgr);
+	LLVMAddLICMPass(passmgr);
+	LLVMAddAggressiveDCEPass(passmgr);
+	LLVMAddCFGSimplificationPass(passmgr);
+	/* This is recommended by the instruction combining pass. */
+	LLVMAddEarlyCSEMemSSAPass(passmgr);
+	LLVMAddInstructionCombiningPass(passmgr);
+	return passmgr;
+}
+
 static const char *attr_to_str(enum ac_func_attr attr)
 {
    switch (attr) {
@@ -240,3 +297,60 @@
 
 	return private_mem_vgprs;
 }
+
+bool
+ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
+		      bool okay_to_leak_target_library_info,
+		      enum radeon_family family,
+		      enum ac_target_machine_options tm_options)
+{ /* Zero *compiler, then create its target machine(s), library info and pass manager; false on failure. */
+	const char *triple; /* filled in by the first ac_create_target_machine() call */
+	memset(compiler, 0, sizeof(*compiler)); /* lets the fail path dispose only what was created */
+
+	compiler->tm = ac_create_target_machine(family, tm_options,
+						LLVMCodeGenLevelDefault,
+						&triple);
+	if (!compiler->tm)
+		return false;
+
+	if (tm_options & AC_TM_CREATE_LOW_OPT) { /* optional second target machine at a lower codegen opt level */
+		compiler->low_opt_tm =
+			ac_create_target_machine(family, tm_options,
+						 LLVMCodeGenLevelLess, NULL);
+		if (!compiler->low_opt_tm)
+			goto fail;
+	}
+
+	if (okay_to_leak_target_library_info || (HAVE_LLVM >= 0x0700)) { /* skipped on LLVM < 7 unless the caller accepts a leak */
+		compiler->target_library_info =
+			ac_create_target_library_info(triple);
+		if (!compiler->target_library_info)
+			goto fail;
+	}
+
+	compiler->passmgr = ac_create_passmgr(compiler->target_library_info,
+					      tm_options & AC_TM_CHECK_IR);
+	if (!compiler->passmgr)
+		goto fail;
+
+	return true;
+fail: /* undo whatever was created so far */
+	ac_destroy_llvm_compiler(compiler);
+	return false;
+}
+
+void
+ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler)
+{ /* Dispose the members that are set; NOTE(review): compiler->passes and low_opt_passes are not released here. */
+	if (compiler->passmgr)
+		LLVMDisposePassManager(compiler->passmgr);
+#if HAVE_LLVM >= 0x0700
+	/* This crashes on LLVM 5.0 and 6.0 and Ubuntu 18.04, so leak it there. */
+	if (compiler->target_library_info)
+		ac_dispose_target_library_info(compiler->target_library_info);
+#endif
+	if (compiler->low_opt_tm)
+		LLVMDisposeTargetMachine(compiler->low_opt_tm);
+	if (compiler->tm)
+		LLVMDisposeTargetMachine(compiler->tm);
+}
diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h
index 9c6b89b..eaf5f21 100644
--- a/src/amd/common/ac_llvm_util.h
+++ b/src/amd/common/ac_llvm_util.h
@@ -35,6 +35,9 @@
 extern "C" {
 #endif
 
+struct ac_shader_binary;
+struct ac_compiler_passes;
+
 enum ac_func_attr {
 	AC_FUNC_ATTR_ALWAYSINLINE = (1 << 0),
 	AC_FUNC_ATTR_INREG        = (1 << 2),
@@ -59,6 +62,9 @@
 	AC_TM_FORCE_ENABLE_XNACK = (1 << 2),
 	AC_TM_FORCE_DISABLE_XNACK = (1 << 3),
 	AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4),
+	AC_TM_CHECK_IR = (1 << 5),
+	AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6),
+	AC_TM_CREATE_LOW_OPT = (1 << 7),
 };
 
 enum ac_float_mode {
@@ -67,10 +73,23 @@
 	AC_FLOAT_MODE_UNSAFE_FP_MATH,
 };
 
-const char *ac_get_llvm_processor_name(enum radeon_family family);
-LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options);
+/* Per-thread persistent LLVM objects. */
+struct ac_llvm_compiler {
+	LLVMTargetLibraryInfoRef	target_library_info;
+	LLVMPassManagerRef		passmgr;
 
-LLVMTargetRef ac_get_llvm_target(const char *triple);
+	/* Default compiler. */
+	LLVMTargetMachineRef		tm;
+	struct ac_compiler_passes	*passes;
+
+	/* Optional compiler for faster compilation with fewer optimizations.
+	 * LLVM modules can be created with "tm" too. There is no difference.
+	 */
+	LLVMTargetMachineRef		low_opt_tm; /* uses -O1 instead of -O2 */
+	struct ac_compiler_passes	*low_opt_passes;
+};
+
+const char *ac_get_llvm_processor_name(enum radeon_family family);
 void ac_add_attr_dereferenceable(LLVMValueRef val, uint64_t bytes);
 bool ac_is_sgpr_param(LLVMValueRef param);
 void ac_add_function_attr(LLVMContextRef ctx, LLVMValueRef function,
@@ -81,6 +100,7 @@
 
 LLVMValueRef ac_llvm_get_called_value(LLVMValueRef call);
 bool ac_llvm_is_function(LLVMValueRef v);
+LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx);
 
 LLVMBuilderRef ac_create_builder(LLVMContextRef ctx,
 				 enum ac_float_mode float_mode);
@@ -108,6 +128,24 @@
 unsigned
 ac_count_scratch_private_memory(LLVMValueRef function);
 
+LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple);
+void ac_dispose_target_library_info(LLVMTargetLibraryInfoRef library_info);
+void ac_init_llvm_once(void);
+
+
+bool ac_init_llvm_compiler(struct ac_llvm_compiler *compiler,
+			   bool okay_to_leak_target_library_info,
+			   enum radeon_family family,
+			   enum ac_target_machine_options tm_options);
+void ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler);
+
+struct ac_compiler_passes *ac_create_llvm_passes(LLVMTargetMachineRef tm);
+void ac_destroy_llvm_passes(struct ac_compiler_passes *p);
+bool ac_compile_module_to_binary(struct ac_compiler_passes *p, LLVMModuleRef module,
+				 struct ac_shader_binary *binary);
+void ac_llvm_add_barrier_noop_pass(LLVMPassManagerRef passmgr);
+void ac_enable_global_isel(LLVMTargetMachineRef tm);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index c75379d..2cb08be 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -27,6 +27,7 @@
 #include "ac_binary.h"
 #include "sid.h"
 #include "nir/nir.h"
+#include "nir/nir_deref.h"
 #include "util/bitscan.h"
 #include "util/u_math.h"
 #include "ac_shader_abi.h"
@@ -53,7 +54,7 @@
 };
 
 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
-				     const nir_deref_var *deref,
+				     nir_deref_instr *deref_instr,
 				     enum ac_descriptor_type desc_type,
 				     const nir_tex_instr *instr,
 				     bool image, bool write);
@@ -87,7 +88,6 @@
 		return is_array ? ac_image_1darray : ac_image_1d;
 	case GLSL_SAMPLER_DIM_2D:
 	case GLSL_SAMPLER_DIM_RECT:
-	case GLSL_SAMPLER_DIM_SUBPASS:
 	case GLSL_SAMPLER_DIM_EXTERNAL:
 		return is_array ? ac_image_2darray : ac_image_2d;
 	case GLSL_SAMPLER_DIM_3D:
@@ -95,8 +95,11 @@
 	case GLSL_SAMPLER_DIM_CUBE:
 		return ac_image_cube;
 	case GLSL_SAMPLER_DIM_MS:
-	case GLSL_SAMPLER_DIM_SUBPASS_MS:
 		return is_array ? ac_image_2darraymsaa : ac_image_2dmsaa;
+	case GLSL_SAMPLER_DIM_SUBPASS:
+		return ac_image_2darray;
+	case GLSL_SAMPLER_DIM_SUBPASS_MS:
+		return ac_image_2darraymsaa;
 	default:
 		unreachable("bad sampler dim");
 	}
@@ -415,10 +418,24 @@
 					  const LLVMValueRef srcs[3])
 {
 	LLVMValueRef result;
-	LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
 
-	result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
-	result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
+	if (HAVE_LLVM < 0x0700) {
+		LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
+		result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
+		result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
+	} else {
+		/* FIXME: LLVM 7 returns incorrect result when count is 0.
+		 * https://bugs.freedesktop.org/show_bug.cgi?id=107276
+		 */
+		LLVMValueRef zero = LLVMConstInt(ctx->i32, 0, false);
+		LLVMValueRef icond1 = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
+		LLVMValueRef icond2 = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], zero, "");
+
+		result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
+		result = LLVMBuildSelect(ctx->builder, icond1, srcs[0], result, "");
+		result = LLVMBuildSelect(ctx->builder, icond2, zero, result, "");
+	}
+
 	return result;
 }
 
@@ -461,7 +478,8 @@
 	comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
 	comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");
 
-	return ac_build_cvt_pkrtz_f16(ctx, comp);
+	return LLVMBuildBitCast(ctx->builder, ac_build_cvt_pkrtz_f16(ctx, comp),
+				ctx->i32, "");
 }
 
 static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx,
@@ -840,34 +858,47 @@
 			src[i] = ac_to_integer(&ctx->ac, src[i]);
 		result = ac_build_gather_values(&ctx->ac, src, num_components);
 		break;
+	case nir_op_f2i16:
 	case nir_op_f2i32:
 	case nir_op_f2i64:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
 		result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
 		break;
+	case nir_op_f2u16:
 	case nir_op_f2u32:
 	case nir_op_f2u64:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
 		result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, "");
 		break;
+	case nir_op_i2f16:
 	case nir_op_i2f32:
 	case nir_op_i2f64:
 		src[0] = ac_to_integer(&ctx->ac, src[0]);
 		result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
 		break;
+	case nir_op_u2f16:
 	case nir_op_u2f32:
 	case nir_op_u2f64:
 		src[0] = ac_to_integer(&ctx->ac, src[0]);
 		result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
 		break;
+	case nir_op_f2f16_rtz:
+		src[0] = ac_to_float(&ctx->ac, src[0]);
+		LLVMValueRef param[2] = { src[0], ctx->ac.f32_0 };
+		result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+		result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
+		break;
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16:
+	case nir_op_f2f32:
 	case nir_op_f2f64:
 		src[0] = ac_to_float(&ctx->ac, src[0]);
-		result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+		if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type))
+			result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
+		else
+			result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
 		break;
-	case nir_op_f2f32:
-		src[0] = ac_to_float(&ctx->ac, src[0]);
-		result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
-		break;
+	case nir_op_u2u16:
 	case nir_op_u2u32:
 	case nir_op_u2u64:
 		src[0] = ac_to_integer(&ctx->ac, src[0]);
@@ -876,6 +907,7 @@
 		else
 			result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
 		break;
+	case nir_op_i2i16:
 	case nir_op_i2i32:
 	case nir_op_i2i64:
 		src[0] = ac_to_integer(&ctx->ac, src[0]);
@@ -1081,6 +1113,10 @@
 
 	for (unsigned i = 0; i < instr->def.num_components; ++i) {
 		switch (instr->def.bit_size) {
+		case 16:
+			values[i] = LLVMConstInt(element_type,
+			                         instr->value.u16[i], false);
+			break;
 		case 32:
 			values[i] = LLVMConstInt(element_type,
 			                         instr->value.u32[i], false);
@@ -1131,10 +1167,12 @@
 }
 
 static LLVMValueRef lower_gather4_integer(struct ac_llvm_context *ctx,
+					  nir_variable *var,
 					  struct ac_image_args *args,
 					  const nir_tex_instr *instr)
 {
-	enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
+	const struct glsl_type *type = glsl_without_array(var->type);
+	enum glsl_base_type stype = glsl_get_sampler_result_type(type);
 	LLVMValueRef half_texel[2];
 	LLVMValueRef compare_cube_wa = NULL;
 	LLVMValueRef result;
@@ -1241,6 +1279,22 @@
 	return result;
 }
 
+static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr)
+{
+	/* Scan every source; the last texture-deref source encountered wins. */
+	nir_deref_instr *found = NULL;
+	unsigned src_idx = 0;
+
+	while (src_idx < instr->num_srcs) {
+		/* Only nir_tex_src_texture_deref sources matter here;
+		 * every other source type is ignored. */
+		if (instr->src[src_idx].src_type == nir_tex_src_texture_deref)
+			found = nir_src_as_deref(instr->src[src_idx].src);
+		src_idx++;
+	}
+	return found; /* NULL when the instruction has no such source */
+}
+
 static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
 					const nir_tex_instr *instr,
 					struct ac_image_args *args)
@@ -1301,9 +1355,12 @@
 	}
 
 	if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= VI) {
-		enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
+		nir_deref_instr *texture_deref_instr = get_tex_texture_deref(instr);
+		nir_variable *var = nir_deref_instr_get_variable(texture_deref_instr);
+		const struct glsl_type *type = glsl_without_array(var->type);
+		enum glsl_base_type stype = glsl_get_sampler_result_type(type);
 		if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
-			return lower_gather4_integer(&ctx->ac, args, instr);
+			return lower_gather4_integer(&ctx->ac, var, args, instr);
 		}
 	}
 
@@ -1340,6 +1397,26 @@
 			    get_src(ctx, instr->src[0]), "");
 
 	ptr = ac_build_gep0(&ctx->ac, ctx->abi->push_constants, addr);
+
+	if (instr->dest.ssa.bit_size == 16) {
+		unsigned load_dwords = instr->dest.ssa.num_components / 2 + 1; /* one dword of slack so the offset-by-one swizzle stays in range */
+		LLVMTypeRef vec_type = LLVMVectorType(LLVMInt16TypeInContext(ctx->ac.context), 2 * load_dwords);
+		ptr = ac_cast_ptr(&ctx->ac, ptr, vec_type);
+		LLVMValueRef res = LLVMBuildLoad(ctx->ac.builder, ptr, "");
+		/* cond is bit 1 of addr: when set, take the elements starting at index 1 instead of 0. */
+		LLVMValueRef cond = LLVMBuildLShr(ctx->ac.builder, addr, ctx->ac.i32_1, "");
+		cond = LLVMBuildTrunc(ctx->ac.builder, cond, LLVMInt1TypeInContext(ctx->ac.context), "");
+		LLVMValueRef mask[] = { LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
+					LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
+					LLVMConstInt(ctx->ac.i32, 4, false)};
+		LLVMValueRef swizzle_aligned = LLVMConstVector(&mask[0], instr->dest.ssa.num_components);
+		LLVMValueRef swizzle_unaligned = LLVMConstVector(&mask[1], instr->dest.ssa.num_components);
+		LLVMValueRef shuffle_aligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_aligned, "");
+		LLVMValueRef shuffle_unaligned = LLVMBuildShuffleVector(ctx->ac.builder, res, res, swizzle_unaligned, "");
+		res = LLVMBuildSelect(ctx->ac.builder, cond, shuffle_unaligned, shuffle_aligned, "");
+		return LLVMBuildBitCast(ctx->ac.builder, res, get_def_type(ctx, &instr->dest.ssa), "");
+	}
+
 	ptr = ac_cast_ptr(&ctx->ac, ptr, get_def_type(ctx, &instr->dest.ssa));
 
 	return LLVMBuildLoad(ctx->ac.builder, ptr, "");
@@ -1365,31 +1442,24 @@
 static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src,
                                          unsigned start, unsigned count)
 {
-	LLVMTypeRef type = LLVMTypeOf(src);
+	LLVMValueRef mask[] = {
+	LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
+	LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false) };
 
-	if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
+	unsigned src_elements = ac_get_llvm_num_components(src);
+
+	if (count == src_elements) {
 		assert(start == 0);
-		assert(count == 1);
 		return src;
+	} else if (count == 1) {
+		assert(start < src_elements);
+		return LLVMBuildExtractElement(ctx->builder, src, mask[start],  "");
+	} else {
+		assert(start + count <= src_elements);
+		assert(count <= 4);
+		LLVMValueRef swizzle = LLVMConstVector(&mask[start], count);
+		return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
 	}
-
-	unsigned src_elements = LLVMGetVectorSize(type);
-	assert(start < src_elements);
-	assert(start + count <= src_elements);
-
-	if (start == 0 && count == src_elements)
-		return src;
-
-	if (count == 1)
-		return LLVMBuildExtractElement(ctx->builder, src, LLVMConstInt(ctx->i32, start, false), "");
-
-	assert(count <= 8);
-	LLVMValueRef indices[8];
-	for (unsigned i = 0; i < count; ++i)
-		indices[i] = LLVMConstInt(ctx->i32, start + i, false);
-
-	LLVMValueRef swizzle = LLVMConstVector(indices, count);
-	return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
 }
 
 static void visit_store_ssbo(struct ac_nir_context *ctx,
@@ -1397,33 +1467,19 @@
 {
 	const char *store_name;
 	LLVMValueRef src_data = get_src(ctx, instr->src[0]);
-	LLVMTypeRef data_type = ctx->ac.f32;
-	int elem_size_mult = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 32;
-	int components_32bit = elem_size_mult * instr->num_components;
+	int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
 	unsigned writemask = nir_intrinsic_write_mask(instr);
-	LLVMValueRef base_data, base_offset;
-	LLVMValueRef params[6];
 
-	params[1] = ctx->abi->load_ssbo(ctx->abi,
+	LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
 				        get_src(ctx, instr->src[1]), true);
-	params[2] = ctx->ac.i32_0; /* vindex */
-	params[4] = ctx->ac.i1false;  /* glc */
-	params[5] = ctx->ac.i1false;  /* slc */
-
-	if (components_32bit > 1)
-		data_type = LLVMVectorType(ctx->ac.f32, components_32bit);
-
-	writemask = widen_mask(writemask, elem_size_mult);
-
-	base_data = ac_to_float(&ctx->ac, src_data);
+	LLVMValueRef base_data = ac_to_float(&ctx->ac, src_data);
 	base_data = ac_trim_vector(&ctx->ac, base_data, instr->num_components);
-	base_data = LLVMBuildBitCast(ctx->ac.builder, base_data,
-				     data_type, "");
-	base_offset = get_src(ctx, instr->src[2]);      /* voffset */
+	LLVMValueRef base_offset = get_src(ctx, instr->src[2]);
+
 	while (writemask) {
 		int start, count;
-		LLVMValueRef data;
-		LLVMValueRef offset;
+		LLVMValueRef data, offset;
+		LLVMTypeRef data_type;
 
 		u_bit_scan_consecutive_range(&writemask, &start, &count);
 
@@ -1433,31 +1489,76 @@
 			writemask |= 1 << (start + 2);
 			count = 2;
 		}
+		int num_bytes = count * elem_size_bytes; /* count in bytes */
 
-		if (count > 4) {
-			writemask |= ((1u << (count - 4)) - 1u) << (start + 4);
-			count = 4;
+		/* We can only store 4 dwords (16 bytes) per instruction;
+		 * exceeding that can only happen for 64-bit vectors. */
+		if (num_bytes > 16) {
+			writemask |= ((1u << (count - 2)) - 1u) << (start + 2);
+			count = 2;
+			num_bytes = 16;
 		}
 
-		if (count == 4) {
-			store_name = "llvm.amdgcn.buffer.store.v4f32";
-		} else if (count == 2) {
-			store_name = "llvm.amdgcn.buffer.store.v2f32";
-
-		} else {
-			assert(count == 1);
-			store_name = "llvm.amdgcn.buffer.store.f32";
+		/* check alignment of 16 Bit stores */
+		if (elem_size_bytes == 2 && num_bytes > 2 && (start % 2) == 1) {
+			writemask |= ((1u << (count - 1)) - 1u) << (start + 1);
+			count = 1;
+			num_bytes = 2;
 		}
 		data = extract_vector_range(&ctx->ac, base_data, start, count);
 
-		offset = base_offset;
-		if (start != 0) {
-			offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, start * 4, false), "");
+		if (start == 0) {
+			offset = base_offset;
+		} else {
+			offset = LLVMBuildAdd(ctx->ac.builder, base_offset,
+					      LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), "");
 		}
-		params[0] = data;
-		params[3] = offset;
-		ac_build_intrinsic(&ctx->ac, store_name,
-				   ctx->ac.voidt, params, 6, 0);
+		if (num_bytes == 2) {
+			store_name = "llvm.amdgcn.tbuffer.store.i32";
+			data_type = ctx->ac.i32;
+			/* The .i32 overload needs an i32 operand: view the 16-bit value as i16, then widen. */
+			data = LLVMBuildBitCast(ctx->ac.builder, data, LLVMInt16TypeInContext(ctx->ac.context), "");
+			data = LLVMBuildZExt(ctx->ac.builder, data, data_type, "");
+			LLVMValueRef tbuffer_params[] = {
+				data,
+				rsrc,
+				ctx->ac.i32_0, /* vindex */
+				offset,        /* voffset */
+				ctx->ac.i32_0, ctx->ac.i32_0,
+				LLVMConstInt(ctx->ac.i32, 2, false), // dfmt (= 16bit)
+				LLVMConstInt(ctx->ac.i32, 4, false), // nfmt (= uint)
+				ctx->ac.i1false, ctx->ac.i1false,
+			};
+			ac_build_intrinsic(&ctx->ac, store_name, ctx->ac.voidt, tbuffer_params, 10, 0);
+		} else {
+			switch (num_bytes) {
+			case 16: /* v4f32 */
+				store_name = "llvm.amdgcn.buffer.store.v4f32";
+				data_type = ctx->ac.v4f32;
+				break;
+			case 8: /* v2f32 */
+				store_name = "llvm.amdgcn.buffer.store.v2f32";
+				data_type = ctx->ac.v2f32;
+				break;
+			case 4: /* f32 */
+				store_name = "llvm.amdgcn.buffer.store.f32";
+				data_type = ctx->ac.f32;
+				break;
+			default:
+				unreachable("Malformed vector store.");
+			}
+			data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, "");
+			LLVMValueRef params[] = {
+				data,
+				rsrc,
+				ctx->ac.i32_0, /* vindex */
+				offset,
+				ctx->ac.i1false,  /* glc */
+				ctx->ac.i1false,  /* slc */
+			};
+			ac_build_intrinsic(&ctx->ac, store_name,
+					   ctx->ac.voidt, params, 6, 0);
+		}
 	}
 }
 
@@ -1521,58 +1622,78 @@
                                       const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef results[2];
-	int load_components;
+	int load_bytes;
+	int elem_size_bytes = instr->dest.ssa.bit_size / 8;
 	int num_components = instr->num_components;
-	if (instr->dest.ssa.bit_size == 64)
-		num_components *= 2;
+	int num_bytes = num_components * elem_size_bytes;
 
-	for (int i = 0; i < num_components; i += load_components) {
-		load_components = MIN2(num_components - i, 4);
+	for (int i = 0; i < num_bytes; i += load_bytes) {
+		load_bytes = MIN2(num_bytes - i, 16);
 		const char *load_name;
-		LLVMTypeRef data_type = ctx->ac.f32;
-		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * 4, false);
-		offset = LLVMBuildAdd(ctx->ac.builder, get_src(ctx, instr->src[1]), offset, "");
+		LLVMTypeRef data_type;
+		LLVMValueRef offset = get_src(ctx, instr->src[1]);
+		LLVMValueRef immoffset = LLVMConstInt(ctx->ac.i32, i, false);
+		LLVMValueRef rsrc = ctx->abi->load_ssbo(ctx->abi,
+							get_src(ctx, instr->src[0]), false);
+		LLVMValueRef vindex = ctx->ac.i32_0;
 
-		if (load_components == 3)
-			data_type = LLVMVectorType(ctx->ac.f32, 4);
-		else if (load_components > 1)
-			data_type = LLVMVectorType(ctx->ac.f32, load_components);
-
-		if (load_components >= 3)
-			load_name = "llvm.amdgcn.buffer.load.v4f32";
-		else if (load_components == 2)
-			load_name = "llvm.amdgcn.buffer.load.v2f32";
-		else if (load_components == 1)
-			load_name = "llvm.amdgcn.buffer.load.f32";
-		else
-			unreachable("unhandled number of components");
-
-		LLVMValueRef params[] = {
-			ctx->abi->load_ssbo(ctx->abi,
-					    get_src(ctx, instr->src[0]),
-					    false),
-			ctx->ac.i32_0,
-			offset,
-			ctx->ac.i1false,
-			ctx->ac.i1false,
-		};
-
-		results[i > 0 ? 1 : 0] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0);
+		int idx = i ? 1 : 0;
+		if (load_bytes == 2) {
+			results[idx] = ac_build_tbuffer_load_short(&ctx->ac,
+								   rsrc,
+								   vindex,
+								   offset,
+								   ctx->ac.i32_0,
+								   immoffset);
+		} else {
+			switch (load_bytes) {
+			case 16:
+			case 12:
+				load_name = "llvm.amdgcn.buffer.load.v4f32";
+				data_type = ctx->ac.v4f32;
+				break;
+			case 8:
+			case 6:
+				load_name = "llvm.amdgcn.buffer.load.v2f32";
+				data_type = ctx->ac.v2f32;
+				break;
+			case 4:
+				load_name = "llvm.amdgcn.buffer.load.f32";
+				data_type = ctx->ac.f32;
+				break;
+			default:
+				unreachable("Malformed load buffer.");
+			}
+			LLVMValueRef params[] = {
+				rsrc,
+				vindex,
+				LLVMBuildAdd(ctx->ac.builder, offset, immoffset, ""),
+				ctx->ac.i1false,
+				ctx->ac.i1false,
+			};
+			results[idx] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0);
+			unsigned num_elems = ac_get_type_size(data_type) / elem_size_bytes;
+			LLVMTypeRef resTy = LLVMVectorType(LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size), num_elems);
+			results[idx] = LLVMBuildBitCast(ctx->ac.builder, results[idx], resTy, "");
+		}
 	}
 
 	assume(results[0]);
 	LLVMValueRef ret = results[0];
-	if (num_components > 4 || num_components == 3) {
+	if (num_bytes > 16 || num_components == 3) {
 		LLVMValueRef masks[] = {
 		        LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
 		        LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
-			LLVMConstInt(ctx->ac.i32, 4, false), LLVMConstInt(ctx->ac.i32, 5, false),
-		        LLVMConstInt(ctx->ac.i32, 6, false), LLVMConstInt(ctx->ac.i32, 7, false)
 		};
 
+		if (num_bytes > 16 && num_components == 3) {
+			/* we end up with a v2i64 and i64 but shuffle fails on that */
+			results[1] = ac_build_expand(&ctx->ac, results[1], 1, 2);
+		}
+
 		LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
 		ret = LLVMBuildShuffleVector(ctx->ac.builder, results[0],
-					     results[num_components > 4 ? 1 : 0], swizzle, "");
+					     results[num_bytes > 16 ? 1 : 0], swizzle, "");
 	}
 
 	return LLVMBuildBitCast(ctx->ac.builder, ret,
@@ -1593,83 +1714,88 @@
 	if (instr->dest.ssa.bit_size == 64)
 		num_components *= 2;
 
-	ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset,
-				   NULL, 0, false, false, true, true);
-	ret = ac_trim_vector(&ctx->ac, ret, num_components);
+	if (instr->dest.ssa.bit_size == 16) {
+		LLVMValueRef results[num_components];
+		for (unsigned i = 0; i < num_components; ++i) {
+			results[i] = ac_build_tbuffer_load_short(&ctx->ac,
+								 rsrc,
+								 ctx->ac.i32_0,
+								 offset,
+								 ctx->ac.i32_0,
+								 LLVMConstInt(ctx->ac.i32, 2 * i, 0));
+		}
+		ret = ac_build_gather_values(&ctx->ac, results, num_components);
+	} else {
+		ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset,
+					   NULL, 0, false, false, true, true);
+
+		ret = ac_trim_vector(&ctx->ac, ret, num_components);
+	}
+
 	return LLVMBuildBitCast(ctx->ac.builder, ret,
 	                        get_def_type(ctx, &instr->dest.ssa), "");
 }
 
 static void
-get_deref_offset(struct ac_nir_context *ctx, nir_deref_var *deref,
-		 bool vs_in, unsigned *vertex_index_out,
-		 LLVMValueRef *vertex_index_ref,
-		 unsigned *const_out, LLVMValueRef *indir_out)
+get_deref_offset(struct ac_nir_context *ctx, nir_deref_instr *instr,
+                 bool vs_in, unsigned *vertex_index_out,
+                 LLVMValueRef *vertex_index_ref,
+                 unsigned *const_out, LLVMValueRef *indir_out)
 {
-	unsigned const_offset = 0;
-	nir_deref *tail = &deref->deref;
-	LLVMValueRef offset = NULL;
+	nir_variable *var = nir_deref_instr_get_variable(instr);
+	nir_deref_path path;
+	unsigned idx_lvl = 1;
+
+	nir_deref_path_init(&path, instr, NULL);
 
 	if (vertex_index_out != NULL || vertex_index_ref != NULL) {
-		tail = tail->child;
-		nir_deref_array *deref_array = nir_deref_as_array(tail);
-		if (vertex_index_out)
-			*vertex_index_out = deref_array->base_offset;
-
 		if (vertex_index_ref) {
-			LLVMValueRef vtx = LLVMConstInt(ctx->ac.i32, deref_array->base_offset, false);
-			if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-				vtx = LLVMBuildAdd(ctx->ac.builder, vtx, get_src(ctx, deref_array->indirect), "");
-			}
-			*vertex_index_ref = vtx;
+			*vertex_index_ref = get_src(ctx, path.path[idx_lvl]->arr.index);
+			if (vertex_index_out)
+				*vertex_index_out = 0;
+		} else {
+			nir_const_value *v = nir_src_as_const_value(path.path[idx_lvl]->arr.index);
+			assert(v);
+			*vertex_index_out = v->u32[0];
 		}
+		++idx_lvl;
 	}
 
-	if (deref->var->data.compact) {
-		assert(tail->child->deref_type == nir_deref_type_array);
-		assert(glsl_type_is_scalar(glsl_without_array(deref->var->type)));
-		nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-		/* We always lower indirect dereferences for "compact" array vars. */
-		assert(deref_array->deref_array_type == nir_deref_array_type_direct);
+	uint32_t const_offset = 0;
+	LLVMValueRef offset = NULL;
 
-		const_offset = deref_array->base_offset;
+	if (var->data.compact) {
+		assert(instr->deref_type == nir_deref_type_array);
+		nir_const_value *v = nir_src_as_const_value(instr->arr.index);
+		assert(v);
+		const_offset = v->u32[0];
 		goto out;
 	}
 
-	while (tail->child != NULL) {
-		const struct glsl_type *parent_type = tail->type;
-		tail = tail->child;
+	for (; path.path[idx_lvl]; ++idx_lvl) {
+		const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type;
+		if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) {
+			unsigned index = path.path[idx_lvl]->strct.index;
 
-		if (tail->deref_type == nir_deref_type_array) {
-			nir_deref_array *deref_array = nir_deref_as_array(tail);
-			LLVMValueRef index, stride, local_offset;
-			unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
-
-			const_offset += size * deref_array->base_offset;
-			if (deref_array->deref_array_type == nir_deref_array_type_direct)
-				continue;
-
-			assert(deref_array->deref_array_type == nir_deref_array_type_indirect);
-			index = get_src(ctx, deref_array->indirect);
-			stride = LLVMConstInt(ctx->ac.i32, size, 0);
-			local_offset = LLVMBuildMul(ctx->ac.builder, stride, index, "");
-
-			if (offset)
-				offset = LLVMBuildAdd(ctx->ac.builder, offset, local_offset, "");
-			else
-				offset = local_offset;
-		} else if (tail->deref_type == nir_deref_type_struct) {
-			nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
-
-			for (unsigned i = 0; i < deref_struct->index; i++) {
+			for (unsigned i = 0; i < index; i++) {
 				const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
 				const_offset += glsl_count_attribute_slots(ft, vs_in);
 			}
+		} else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) {
+			unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, vs_in);
+			LLVMValueRef array_off = LLVMBuildMul(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, size, 0),
+			                                      get_src(ctx, path.path[idx_lvl]->arr.index), "");
+			if (offset)
+				offset = LLVMBuildAdd(ctx->ac.builder, offset, array_off, "");
+			else
+				offset = array_off;
 		} else
-			unreachable("unsupported deref type");
-
+			unreachable("Unhandled deref type in get_deref_offset");
 	}
+
 out:
+	nir_deref_path_finish(&path);
+
 	if (const_offset && offset)
 		offset = LLVMBuildAdd(ctx->ac.builder, offset,
 				      LLVMConstInt(ctx->ac.i32, const_offset, 0),
@@ -1679,45 +1805,6 @@
 	*indir_out = offset;
 }
 
-static LLVMValueRef
-build_gep_for_deref(struct ac_nir_context *ctx,
-		    nir_deref_var *deref)
-{
-	struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, deref->var);
-	assert(entry->data);
-	LLVMValueRef val = entry->data;
-	nir_deref *tail = deref->deref.child;
-	while (tail != NULL) {
-		LLVMValueRef offset;
-		switch (tail->deref_type) {
-		case nir_deref_type_array: {
-			nir_deref_array *array = nir_deref_as_array(tail);
-			offset = LLVMConstInt(ctx->ac.i32, array->base_offset, 0);
-			if (array->deref_array_type ==
-			    nir_deref_array_type_indirect) {
-				offset = LLVMBuildAdd(ctx->ac.builder, offset,
-						      get_src(ctx,
-							      array->indirect),
-						      "");
-			}
-			break;
-		}
-		case nir_deref_type_struct: {
-			nir_deref_struct *deref_struct =
-				nir_deref_as_struct(tail);
-			offset = LLVMConstInt(ctx->ac.i32,
-					      deref_struct->index, 0);
-			break;
-		}
-		default:
-			unreachable("bad deref type");
-		}
-		val = ac_build_gep0(&ctx->ac, val, offset);
-		tail = tail->child;
-	}
-	return val;
-}
-
 static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx,
 				       nir_intrinsic_instr *instr,
 				       bool load_inputs)
@@ -1726,14 +1813,17 @@
 	LLVMValueRef vertex_index = NULL;
 	LLVMValueRef indir_index = NULL;
 	unsigned const_index = 0;
-	unsigned location = instr->variables[0]->var->data.location;
-	unsigned driver_location = instr->variables[0]->var->data.driver_location;
-	const bool is_patch =  instr->variables[0]->var->data.patch;
-	const bool is_compact = instr->variables[0]->var->data.compact;
 
-	get_deref_offset(ctx, instr->variables[0],
-			 false, NULL, is_patch ? NULL : &vertex_index,
-			 &const_index, &indir_index);
+	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+
+	unsigned location = var->data.location;
+	unsigned driver_location = var->data.driver_location;
+	const bool is_patch =  var->data.patch;
+	const bool is_compact = var->data.compact;
+
+	get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
+	                 false, NULL, is_patch ? NULL : &vertex_index,
+	                 &const_index, &indir_index);
 
 	LLVMTypeRef dest_type = get_def_type(ctx, &instr->dest.ssa);
 
@@ -1746,32 +1836,39 @@
 	result = ctx->abi->load_tess_varyings(ctx->abi, src_component_type,
 					      vertex_index, indir_index,
 					      const_index, location, driver_location,
-					      instr->variables[0]->var->data.location_frac,
+					      var->data.location_frac,
 					      instr->num_components,
 					      is_patch, is_compact, load_inputs);
+	if (instr->dest.ssa.bit_size == 16) {
+		result = ac_to_integer(&ctx->ac, result);
+		result = LLVMBuildTrunc(ctx->ac.builder, result, dest_type, "");
+	}
 	return LLVMBuildBitCast(ctx->ac.builder, result, dest_type, "");
 }
 
 static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 				   nir_intrinsic_instr *instr)
 {
+	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+
 	LLVMValueRef values[8];
-	int idx = instr->variables[0]->var->data.driver_location;
+	int idx = var->data.driver_location;
 	int ve = instr->dest.ssa.num_components;
-	unsigned comp = instr->variables[0]->var->data.location_frac;
+	unsigned comp = var->data.location_frac;
 	LLVMValueRef indir_index;
 	LLVMValueRef ret;
 	unsigned const_index;
-	unsigned stride = instr->variables[0]->var->data.compact ? 1 : 4;
+	unsigned stride = var->data.compact ? 1 : 4;
 	bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
-	             instr->variables[0]->var->data.mode == nir_var_shader_in;
-	get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
-				      &const_index, &indir_index);
+	             var->data.mode == nir_var_shader_in;
+
+	get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), vs_in, NULL, NULL,
+	                 &const_index, &indir_index);
 
 	if (instr->dest.ssa.bit_size == 64)
 		ve *= 2;
 
-	switch (instr->variables[0]->var->data.mode) {
+	switch (var->data.mode) {
 	case nir_var_shader_in:
 		if (ctx->stage == MESA_SHADER_TESS_CTRL ||
 		    ctx->stage == MESA_SHADER_TESS_EVAL) {
@@ -1782,20 +1879,19 @@
 			LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
 			LLVMValueRef indir_index;
 			unsigned const_index, vertex_index;
-			get_deref_offset(ctx, instr->variables[0],
-					 false, &vertex_index, NULL,
-					 &const_index, &indir_index);
+			get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
+			                 false, &vertex_index, NULL, &const_index, &indir_index);
 
-			return ctx->abi->load_inputs(ctx->abi, instr->variables[0]->var->data.location,
-						     instr->variables[0]->var->data.driver_location,
-						     instr->variables[0]->var->data.location_frac,
+			return ctx->abi->load_inputs(ctx->abi, var->data.location,
+						     var->data.driver_location,
+						     var->data.location_frac,
 						     instr->num_components, vertex_index, const_index, type);
 		}
 
 		for (unsigned chan = comp; chan < ve + comp; chan++) {
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
-						instr->variables[0]->var->type,
+						var->type,
 						ctx->stage == MESA_SHADER_VERTEX);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
@@ -1813,7 +1909,7 @@
 		for (unsigned chan = 0; chan < ve; chan++) {
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
-					instr->variables[0]->var->type, false);
+					var->type, false);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
 						&ctx->ac, ctx->locals + idx + chan, count,
@@ -1828,8 +1924,7 @@
 		}
 		break;
 	case nir_var_shared: {
-		LLVMValueRef address = build_gep_for_deref(ctx,
-							   instr->variables[0]);
+		LLVMValueRef address = get_src(ctx, instr->src[0]);
 		LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
 		return LLVMBuildBitCast(ctx->ac.builder, val,
 					get_def_type(ctx, &instr->dest.ssa),
@@ -1843,7 +1938,7 @@
 		for (unsigned chan = comp; chan < ve + comp; chan++) {
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
-						instr->variables[0]->var->type, false);
+						var->type, false);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
 						&ctx->ac, ctx->abi->outputs + idx + chan, count,
@@ -1870,15 +1965,18 @@
 visit_store_var(struct ac_nir_context *ctx,
 		nir_intrinsic_instr *instr)
 {
+        nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+
 	LLVMValueRef temp_ptr, value;
-	int idx = instr->variables[0]->var->data.driver_location;
-	unsigned comp = instr->variables[0]->var->data.location_frac;
-	LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
+	int idx = var->data.driver_location;
+	unsigned comp = var->data.location_frac;
+	LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[1]));
 	int writemask = instr->const_index[0];
 	LLVMValueRef indir_index;
 	unsigned const_index;
-	get_deref_offset(ctx, instr->variables[0], false,
-		         NULL, NULL, &const_index, &indir_index);
+
+	get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), false,
+	                 NULL, NULL, &const_index, &indir_index);
 
 	if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {
 
@@ -1891,20 +1989,20 @@
 
 	writemask = writemask << comp;
 
-	switch (instr->variables[0]->var->data.mode) {
+	switch (var->data.mode) {
 	case nir_var_shader_out:
 
 		if (ctx->stage == MESA_SHADER_TESS_CTRL) {
 			LLVMValueRef vertex_index = NULL;
 			LLVMValueRef indir_index = NULL;
 			unsigned const_index = 0;
-			const bool is_patch = instr->variables[0]->var->data.patch;
+			const bool is_patch = var->data.patch;
 
-			get_deref_offset(ctx, instr->variables[0],
-					 false, NULL, is_patch ? NULL : &vertex_index,
-					 &const_index, &indir_index);
+			get_deref_offset(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
+			                 false, NULL, is_patch ? NULL : &vertex_index,
+			                 &const_index, &indir_index);
 
-			ctx->abi->store_tcs_outputs(ctx->abi, instr->variables[0]->var,
+			ctx->abi->store_tcs_outputs(ctx->abi, var,
 						    vertex_index, indir_index,
 						    const_index, src, writemask);
 			return;
@@ -1917,11 +2015,11 @@
 
 			value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp);
 
-			if (instr->variables[0]->var->data.compact)
+			if (var->data.compact)
 				stride = 1;
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
-						instr->variables[0]->var->type, false);
+						var->type, false);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
 						&ctx->ac, ctx->abi->outputs + idx + chan, count,
@@ -1947,7 +2045,7 @@
 			value = ac_llvm_extract_elem(&ctx->ac, src, chan);
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
-					instr->variables[0]->var->type, false);
+					var->type, false);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
 					&ctx->ac, ctx->locals + idx + chan, count,
@@ -1966,13 +2064,9 @@
 		break;
 	case nir_var_shared: {
 		int writemask = instr->const_index[0];
-		LLVMValueRef address = build_gep_for_deref(ctx,
-							   instr->variables[0]);
-		LLVMValueRef val = get_src(ctx, instr->src[0]);
-		unsigned components =
-			glsl_get_vector_elements(
-			   nir_deref_tail(&instr->variables[0]->deref)->type);
-		if (writemask == (1 << components) - 1) {
+		LLVMValueRef address = get_src(ctx, instr->src[0]); /* src[0] supplies the pointer stored through */
+		LLVMValueRef val = get_src(ctx, instr->src[1]);
+		if (writemask == (1 << ac_get_llvm_num_components(val)) - 1) { /* every component is written: one whole-value store */
 			val = LLVMBuildBitCast(
 			   ctx->ac.builder, val,
 			   LLVMGetElementType(LLVMTypeOf(address)), "");
@@ -2098,30 +2192,32 @@
 	return sample_index;
 }
 
-static bool
-glsl_is_array_image(const struct glsl_type *type)
+static nir_variable *get_image_variable(const nir_intrinsic_instr *instr)
 {
-	const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
+	assert(instr->src[0].is_ssa);
+	return nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+}
 
-	if (glsl_sampler_type_is_array(type))
-		return true;
-
-	return dim == GLSL_SAMPLER_DIM_SUBPASS ||
-	       dim == GLSL_SAMPLER_DIM_SUBPASS_MS;
+static LLVMValueRef get_image_descriptor(struct ac_nir_context *ctx,
+                                         const nir_intrinsic_instr *instr,
+                                         enum ac_descriptor_type desc_type,
+                                         bool write)
+{
+	return get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), desc_type, NULL, true, write);
 }
 
 static void get_image_coords(struct ac_nir_context *ctx,
 			     const nir_intrinsic_instr *instr,
 			     struct ac_image_args *args)
 {
-	const struct glsl_type *type = glsl_without_array(instr->variables[0]->var->type);
+	const struct glsl_type *type = glsl_without_array(get_image_variable(instr)->type);
 
-	LLVMValueRef src0 = get_src(ctx, instr->src[0]);
+	LLVMValueRef src0 = get_src(ctx, instr->src[1]);
 	LLVMValueRef masks[] = {
 		LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
 		LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
 	};
-	LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[1]), 0);
+	LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
 
 	int count;
 	enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
@@ -2156,10 +2252,10 @@
 							       fmask_load_address[1],
 							       fmask_load_address[2],
 							       sample_index,
-							       get_sampler_desc(ctx, instr->variables[0], AC_DESC_FMASK, NULL, true, false));
+							       get_image_descriptor(ctx, instr, AC_DESC_FMASK, false));
 	}
 	if (count == 1 && !gfx9_1d) {
-		if (instr->src[0].ssa->num_components)
+		if (instr->src[1].ssa->num_components)
 			args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
 		else
 			args->coords[0] = src0;
@@ -2202,7 +2298,7 @@
 static LLVMValueRef get_image_buffer_descriptor(struct ac_nir_context *ctx,
                                                 const nir_intrinsic_instr *instr, bool write)
 {
-	LLVMValueRef rsrc = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, write);
+	LLVMValueRef rsrc = get_image_descriptor(ctx, instr, AC_DESC_BUFFER, write);
 	if (ctx->abi->gfx9_stride_size_workaround) {
 		LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 2, 0), "");
 		LLVMValueRef stride = LLVMBuildExtractElement(ctx->ac.builder, rsrc, LLVMConstInt(ctx->ac.i32, 1, 0), "");
@@ -2222,12 +2318,9 @@
 				     const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef res;
-	const nir_variable *var = instr->variables[0]->var;
+	const nir_variable *var = get_image_variable(instr);
 	const struct glsl_type *type = var->type;
 
-	if(instr->variables[0]->deref.child)
-		type = instr->variables[0]->deref.child->type;
-
 	type = glsl_without_array(type);
 
 	const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
@@ -2237,7 +2330,7 @@
 		LLVMValueRef rsrc, vindex;
 
 		rsrc = get_image_buffer_descriptor(ctx, instr, false);
-		vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
+		vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
 						 ctx->ac.i32_0, "");
 
 		/* TODO: set "glc" and "can_speculate" when OpenGL needs it. */
@@ -2252,10 +2345,9 @@
 		struct ac_image_args args = {};
 		args.opcode = ac_image_load;
 		get_image_coords(ctx, instr, &args);
-		args.resource = get_sampler_desc(ctx, instr->variables[0],
-						 AC_DESC_IMAGE, NULL, true, false);
+		args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));
 		args.dmask = 15;
 		args.attributes = AC_FUNC_ATTR_READONLY;
 		if (var->data.image._volatile || var->data.image.coherent)
@@ -2270,7 +2362,7 @@
 			      nir_intrinsic_instr *instr)
 {
 	LLVMValueRef params[8];
-	const nir_variable *var = instr->variables[0]->var;
+	const nir_variable *var = get_image_variable(instr);
 	const struct glsl_type *type = glsl_without_array(var->type);
 	const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
 	LLVMValueRef glc = ctx->ac.i1false;
@@ -2281,9 +2373,9 @@
 	if (dim == GLSL_SAMPLER_DIM_BUF) {
 		LLVMValueRef rsrc = get_image_buffer_descriptor(ctx, instr, true);
 
-		params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */
+		params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3])); /* data */
 		params[1] = rsrc;
-		params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
+		params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
 						    ctx->ac.i32_0, ""); /* vindex */
 		params[3] = ctx->ac.i32_0; /* voffset */
 		params[4] = glc;  /* glc */
@@ -2293,12 +2385,11 @@
 	} else {
 		struct ac_image_args args = {};
 		args.opcode = ac_image_store;
-		args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2]));
+		args.data[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[3]));
 		get_image_coords(ctx, instr, &args);
-		args.resource = get_sampler_desc(ctx, instr->variables[0],
-						 AC_DESC_IMAGE, NULL, true, false);
+		args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));
 		args.dmask = 15;
 		if (force_glc || var->data.image._volatile || var->data.image.coherent)
 			args.cache_policy |= ac_glc;
@@ -2313,9 +2404,9 @@
 {
 	LLVMValueRef params[7];
 	int param_count = 0;
-	const nir_variable *var = instr->variables[0]->var;
+	const nir_variable *var = get_image_variable(instr);
 
-	bool cmpswap = instr->intrinsic == nir_intrinsic_image_var_atomic_comp_swap;
+	bool cmpswap = instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap;
 	const char *atomic_name;
 	char intrinsic_name[41];
 	enum ac_atomic_op atomic_subop;
@@ -2325,35 +2416,35 @@
 	bool is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT;
 
 	switch (instr->intrinsic) {
-	case nir_intrinsic_image_var_atomic_add:
+	case nir_intrinsic_image_deref_atomic_add:
 		atomic_name = "add";
 		atomic_subop = ac_atomic_add;
 		break;
-	case nir_intrinsic_image_var_atomic_min:
+	case nir_intrinsic_image_deref_atomic_min:
 		atomic_name = is_unsigned ? "umin" : "smin";
 		atomic_subop = is_unsigned ? ac_atomic_umin : ac_atomic_smin;
 		break;
-	case nir_intrinsic_image_var_atomic_max:
+	case nir_intrinsic_image_deref_atomic_max:
 		atomic_name = is_unsigned ? "umax" : "smax";
 		atomic_subop = is_unsigned ? ac_atomic_umax : ac_atomic_smax;
 		break;
-	case nir_intrinsic_image_var_atomic_and:
+	case nir_intrinsic_image_deref_atomic_and:
 		atomic_name = "and";
 		atomic_subop = ac_atomic_and;
 		break;
-	case nir_intrinsic_image_var_atomic_or:
+	case nir_intrinsic_image_deref_atomic_or:
 		atomic_name = "or";
 		atomic_subop = ac_atomic_or;
 		break;
-	case nir_intrinsic_image_var_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_xor:
 		atomic_name = "xor";
 		atomic_subop = ac_atomic_xor;
 		break;
-	case nir_intrinsic_image_var_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_exchange:
 		atomic_name = "swap";
 		atomic_subop = ac_atomic_swap;
 		break;
-	case nir_intrinsic_image_var_atomic_comp_swap:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
 		atomic_name = "cmpswap";
 		atomic_subop = 0; /* not used */
 		break;
@@ -2362,12 +2453,12 @@
 	}
 
 	if (cmpswap)
-		params[param_count++] = get_src(ctx, instr->src[3]);
-	params[param_count++] = get_src(ctx, instr->src[2]);
+		params[param_count++] = get_src(ctx, instr->src[4]);
+	params[param_count++] = get_src(ctx, instr->src[3]);
 
 	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
 		params[param_count++] = get_image_buffer_descriptor(ctx, instr, true);
-		params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
+		params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]),
 								ctx->ac.i32_0, ""); /* vindex */
 		params[param_count++] = ctx->ac.i32_0; /* voffset */
 		params[param_count++] = ctx->ac.i1false;  /* slc */
@@ -2386,10 +2477,9 @@
 		if (cmpswap)
 			args.data[1] = params[1];
 		get_image_coords(ctx, instr, &args);
-		args.resource = get_sampler_desc(ctx, instr->variables[0],
-						 AC_DESC_IMAGE, NULL, true, false);
+		args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, true);
 		args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
-					    glsl_is_array_image(type));
+					    glsl_sampler_type_is_array(type));
 
 		return ac_build_image_opcode(&ctx->ac, &args);
 	}
@@ -2398,15 +2488,14 @@
 static LLVMValueRef visit_image_samples(struct ac_nir_context *ctx,
 					const nir_intrinsic_instr *instr)
 {
-	const nir_variable *var = instr->variables[0]->var;
+	const nir_variable *var = get_image_variable(instr);
 	const struct glsl_type *type = glsl_without_array(var->type);
 
 	struct ac_image_args args = { 0 };
 	args.dim = get_ac_sampler_dim(&ctx->ac, glsl_get_sampler_dim(type),
 				      glsl_sampler_type_is_array(type));
 	args.dmask = 0xf;
-	args.resource = get_sampler_desc(ctx, instr->variables[0],
-					 AC_DESC_IMAGE, NULL, true, false);
+	args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
 	args.opcode = ac_image_get_resinfo;
 	args.lod = ctx->ac.i32_0;
 	args.attributes = AC_FUNC_ATTR_READNONE;
@@ -2418,20 +2507,18 @@
 				     const nir_intrinsic_instr *instr)
 {
 	LLVMValueRef res;
-	const nir_variable *var = instr->variables[0]->var;
+	const nir_variable *var = get_image_variable(instr);
 	const struct glsl_type *type = glsl_without_array(var->type);
 
 	if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF)
-		return get_buffer_size(ctx,
-			get_sampler_desc(ctx, instr->variables[0],
-					 AC_DESC_BUFFER, NULL, true, false), true);
+		return get_buffer_size(ctx, get_image_descriptor(ctx, instr, AC_DESC_BUFFER, false), true);
 
 	struct ac_image_args args = { 0 };
 
 	args.dim = get_ac_image_dim(&ctx->ac, glsl_get_sampler_dim(type),
 				    glsl_sampler_type_is_array(type));
 	args.dmask = 0xf;
-	args.resource = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false);
+	args.resource = get_image_descriptor(ctx, instr, AC_DESC_IMAGE, false);
 	args.opcode = ac_image_get_resinfo;
 	args.lod = ctx->ac.i32_0;
 	args.attributes = AC_FUNC_ATTR_READNONE;
@@ -2627,8 +2714,8 @@
 	LLVMValueRef result;
 	LLVMValueRef src = get_src(ctx, instr->src[src_idx]);
 
-	if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap ||
-	    instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap) {
+	if (instr->intrinsic == nir_intrinsic_shared_atomic_comp_swap ||
+	    instr->intrinsic == nir_intrinsic_deref_atomic_comp_swap) {
 		LLVMValueRef src1 = get_src(ctx, instr->src[src_idx + 1]);
 		result = LLVMBuildAtomicCmpXchg(ctx->ac.builder,
 						ptr, src, src1,
@@ -2639,40 +2726,40 @@
 	} else {
 		LLVMAtomicRMWBinOp op;
 		switch (instr->intrinsic) {
-		case nir_intrinsic_var_atomic_add:
 		case nir_intrinsic_shared_atomic_add:
+		case nir_intrinsic_deref_atomic_add:
 			op = LLVMAtomicRMWBinOpAdd;
 			break;
-		case nir_intrinsic_var_atomic_umin:
 		case nir_intrinsic_shared_atomic_umin:
+		case nir_intrinsic_deref_atomic_umin:
 			op = LLVMAtomicRMWBinOpUMin;
 			break;
-		case nir_intrinsic_var_atomic_umax:
 		case nir_intrinsic_shared_atomic_umax:
+		case nir_intrinsic_deref_atomic_umax:
 			op = LLVMAtomicRMWBinOpUMax;
 			break;
-		case nir_intrinsic_var_atomic_imin:
 		case nir_intrinsic_shared_atomic_imin:
+		case nir_intrinsic_deref_atomic_imin:
 			op = LLVMAtomicRMWBinOpMin;
 			break;
-		case nir_intrinsic_var_atomic_imax:
 		case nir_intrinsic_shared_atomic_imax:
+		case nir_intrinsic_deref_atomic_imax:
 			op = LLVMAtomicRMWBinOpMax;
 			break;
-		case nir_intrinsic_var_atomic_and:
 		case nir_intrinsic_shared_atomic_and:
+		case nir_intrinsic_deref_atomic_and:
 			op = LLVMAtomicRMWBinOpAnd;
 			break;
-		case nir_intrinsic_var_atomic_or:
 		case nir_intrinsic_shared_atomic_or:
+		case nir_intrinsic_deref_atomic_or:
 			op = LLVMAtomicRMWBinOpOr;
 			break;
-		case nir_intrinsic_var_atomic_xor:
 		case nir_intrinsic_shared_atomic_xor:
+		case nir_intrinsic_deref_atomic_xor:
 			op = LLVMAtomicRMWBinOpXor;
 			break;
-		case nir_intrinsic_var_atomic_exchange:
 		case nir_intrinsic_shared_atomic_exchange:
+		case nir_intrinsic_deref_atomic_exchange:
 			op = LLVMAtomicRMWBinOpXchg;
 			break;
 		default:
@@ -2709,24 +2796,26 @@
 	LLVMValueRef src_c0 = NULL;
 	LLVMValueRef src_c1 = NULL;
 	LLVMValueRef src0 = NULL;
-	int input_index = instr->variables[0]->var->data.location - VARYING_SLOT_VAR0;
+
+	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+	int input_index = var->data.location - VARYING_SLOT_VAR0;
 	switch (instr->intrinsic) {
-	case nir_intrinsic_interp_var_at_centroid:
+	case nir_intrinsic_interp_deref_at_centroid:
 		location = INTERP_CENTROID;
 		break;
-	case nir_intrinsic_interp_var_at_sample:
-	case nir_intrinsic_interp_var_at_offset:
+	case nir_intrinsic_interp_deref_at_sample:
+	case nir_intrinsic_interp_deref_at_offset:
 		location = INTERP_CENTER;
-		src0 = get_src(ctx, instr->src[0]);
+		src0 = get_src(ctx, instr->src[1]);
 		break;
 	default:
 		break;
 	}
 
-	if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) {
+	if (instr->intrinsic == nir_intrinsic_interp_deref_at_offset) {
 		src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_0, ""));
 		src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_1, ""));
-	} else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) {
+	} else if (instr->intrinsic == nir_intrinsic_interp_deref_at_sample) {
 		LLVMValueRef sample_position;
 		LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f);
 
@@ -2738,7 +2827,7 @@
 		src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_position, ctx->ac.i32_1, "");
 		src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, "");
 	}
-	interp_param = ctx->abi->lookup_interp_param(ctx->abi, instr->variables[0]->var->data.interpolation, location);
+	interp_param = ctx->abi->lookup_interp_param(ctx->abi, var->data.interpolation, location);
 	attr_number = LLVMConstInt(ctx->ac.i32, input_index, false);
 
 	if (location == INTERP_CENTER) {
@@ -2802,7 +2891,7 @@
 		}
 	}
 	return ac_build_varying_gather_values(&ctx->ac, result, instr->num_components,
-					      instr->variables[0]->var->data.location_frac);
+					      var->data.location_frac);
 }
 
 static void visit_intrinsic(struct ac_nir_context *ctx,
@@ -2961,10 +3050,10 @@
 	case nir_intrinsic_get_buffer_size:
 		result = visit_get_buffer_size(ctx, instr);
 		break;
-	case nir_intrinsic_load_var:
+	case nir_intrinsic_load_deref:
 		result = visit_load_var(ctx, instr);
 		break;
-	case nir_intrinsic_store_var:
+	case nir_intrinsic_store_deref:
 		visit_store_var(ctx, instr);
 		break;
 	case nir_intrinsic_load_shared:
@@ -2973,26 +3062,26 @@
 	case nir_intrinsic_store_shared:
 		visit_store_shared(ctx, instr);
 		break;
-	case nir_intrinsic_image_var_samples:
+	case nir_intrinsic_image_deref_samples:
 		result = visit_image_samples(ctx, instr);
 		break;
-	case nir_intrinsic_image_var_load:
+	case nir_intrinsic_image_deref_load:
 		result = visit_image_load(ctx, instr);
 		break;
-	case nir_intrinsic_image_var_store:
+	case nir_intrinsic_image_deref_store:
 		visit_image_store(ctx, instr);
 		break;
-	case nir_intrinsic_image_var_atomic_add:
-	case nir_intrinsic_image_var_atomic_min:
-	case nir_intrinsic_image_var_atomic_max:
-	case nir_intrinsic_image_var_atomic_and:
-	case nir_intrinsic_image_var_atomic_or:
-	case nir_intrinsic_image_var_atomic_xor:
-	case nir_intrinsic_image_var_atomic_exchange:
-	case nir_intrinsic_image_var_atomic_comp_swap:
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
 		result = visit_image_atomic(ctx, instr);
 		break;
-	case nir_intrinsic_image_var_size:
+	case nir_intrinsic_image_deref_size:
 		result = visit_image_size(ctx, instr);
 		break;
 	case nir_intrinsic_shader_clock:
@@ -3027,23 +3116,23 @@
 		result = visit_var_atomic(ctx, instr, ptr, 1);
 		break;
 	}
-	case nir_intrinsic_var_atomic_add:
-	case nir_intrinsic_var_atomic_imin:
-	case nir_intrinsic_var_atomic_umin:
-	case nir_intrinsic_var_atomic_imax:
-	case nir_intrinsic_var_atomic_umax:
-	case nir_intrinsic_var_atomic_and:
-	case nir_intrinsic_var_atomic_or:
-	case nir_intrinsic_var_atomic_xor:
-	case nir_intrinsic_var_atomic_exchange:
-	case nir_intrinsic_var_atomic_comp_swap: {
-		LLVMValueRef ptr = build_gep_for_deref(ctx, instr->variables[0]);
-		result = visit_var_atomic(ctx, instr, ptr, 0);
+	case nir_intrinsic_deref_atomic_add:
+	case nir_intrinsic_deref_atomic_imin:
+	case nir_intrinsic_deref_atomic_umin:
+	case nir_intrinsic_deref_atomic_imax:
+	case nir_intrinsic_deref_atomic_umax:
+	case nir_intrinsic_deref_atomic_and:
+	case nir_intrinsic_deref_atomic_or:
+	case nir_intrinsic_deref_atomic_xor:
+	case nir_intrinsic_deref_atomic_exchange:
+	case nir_intrinsic_deref_atomic_comp_swap: {
+		LLVMValueRef ptr = get_src(ctx, instr->src[0]);
+		result = visit_var_atomic(ctx, instr, ptr, 1);
 		break;
 	}
-	case nir_intrinsic_interp_var_at_centroid:
-	case nir_intrinsic_interp_var_at_sample:
-	case nir_intrinsic_interp_var_at_offset:
+	case nir_intrinsic_interp_deref_at_centroid:
+	case nir_intrinsic_interp_deref_at_sample:
+	case nir_intrinsic_interp_deref_at_offset:
 		result = visit_interp(ctx, instr);
 		break;
 	case nir_intrinsic_emit_vertex:
@@ -3121,7 +3210,7 @@
 }
 
 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
-				     const nir_deref_var *deref,
+				     nir_deref_instr *deref_instr,
 				     enum ac_descriptor_type desc_type,
 				     const nir_tex_instr *tex_instr,
 				     bool image, bool write)
@@ -3132,45 +3221,36 @@
 	unsigned base_index;
 	bool bindless = false;
 
-	if (!deref) {
+	if (!deref_instr) {
 		assert(tex_instr && !image);
 		descriptor_set = 0;
 		base_index = tex_instr->sampler_index;
 	} else {
-		const nir_deref *tail = &deref->deref;
-		while (tail->child) {
-			const nir_deref_array *child = nir_deref_as_array(tail->child);
-			unsigned array_size = glsl_get_aoa_size(tail->child->type);
-
+		while(deref_instr->deref_type != nir_deref_type_var) {
+			unsigned array_size = glsl_get_aoa_size(deref_instr->type);
 			if (!array_size)
 				array_size = 1;
 
-			assert(child->deref_array_type != nir_deref_array_type_wildcard);
-
-			if (child->deref_array_type == nir_deref_array_type_indirect) {
-				LLVMValueRef indirect = get_src(ctx, child->indirect);
+			assert(deref_instr->deref_type == nir_deref_type_array);
+			nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
+			if (const_value) {
+				constant_index += array_size * const_value->u32[0];
+			} else {
+				LLVMValueRef indirect = get_src(ctx, deref_instr->arr.index);
 
 				indirect = LLVMBuildMul(ctx->ac.builder, indirect,
 					LLVMConstInt(ctx->ac.i32, array_size, false), "");
 
-				if (!index)
+                                if (!index)
 					index = indirect;
 				else
 					index = LLVMBuildAdd(ctx->ac.builder, index, indirect, "");
 			}
 
-			constant_index += child->base_offset * array_size;
-
-			tail = &child->deref;
+			deref_instr = nir_src_as_deref(deref_instr->parent);
 		}
-		descriptor_set = deref->var->data.descriptor_set;
-
-		if (deref->var->data.bindless) {
-			bindless = deref->var->data.bindless;
-			base_index = deref->var->data.driver_location;
-		} else {
-			base_index = deref->var->data.binding;
-		}
+		descriptor_set = deref_instr->var->data.descriptor_set;
+		base_index = deref_instr->var->data.binding;
 	}
 
 	return ctx->abi->load_sampler_desc(ctx->abi,
@@ -3214,21 +3294,37 @@
 			   LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
 			   LLVMValueRef *fmask_ptr)
 {
+	nir_deref_instr *texture_deref_instr = NULL;
+	nir_deref_instr *sampler_deref_instr = NULL;
+
+	for (unsigned i = 0; i < instr->num_srcs; i++) {
+		switch (instr->src[i].src_type) {
+		case nir_tex_src_texture_deref:
+			texture_deref_instr = nir_src_as_deref(instr->src[i].src);
+			break;
+		case nir_tex_src_sampler_deref:
+			sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!sampler_deref_instr)
+		sampler_deref_instr = texture_deref_instr;
+
 	if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF)
-		*res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_BUFFER, instr, false, false);
+		*res_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_BUFFER, instr, false, false);
 	else
-		*res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_IMAGE, instr, false, false);
+		*res_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_IMAGE, instr, false, false);
 	if (samp_ptr) {
-		if (instr->sampler)
-			*samp_ptr = get_sampler_desc(ctx, instr->sampler, AC_DESC_SAMPLER, instr, false, false);
-		else
-			*samp_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_SAMPLER, instr, false, false);
+		*samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, AC_DESC_SAMPLER, instr, false, false);
 		if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
 			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
 	}
-	if (fmask_ptr && !instr->sampler && (instr->op == nir_texop_txf_ms ||
-					     instr->op == nir_texop_samples_identical))
-		*fmask_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_FMASK, instr, false, false);
+	if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
+	                  instr->op == nir_texop_samples_identical))
+		*fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, AC_DESC_FMASK, instr, false, false);
 }
 
 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
@@ -3588,6 +3684,34 @@
 	}
 }
 
+static void visit_deref(struct ac_nir_context *ctx,
+                        nir_deref_instr *instr)
+{ /* Builds the LLVM value for a nir_var_shared deref and records it in ctx->ssa_defs. */
+	if (instr->mode != nir_var_shared)
+		return; /* derefs of every other mode record nothing here */
+
+	LLVMValueRef result = NULL;
+	switch(instr->deref_type) {
+	case nir_deref_type_var: { /* root of a chain: take the value stored for the variable */
+		struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, instr->var);
+		result = entry->data; /* NOTE(review): entry is not NULL-checked; presumably every shared var is in ctx->vars - confirm */
+		break;
+	}
+	case nir_deref_type_struct: /* index the parent's value by the constant member index */
+		result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
+		                       LLVMConstInt(ctx->ac.i32, instr->strct.index, 0));
+		break;
+	case nir_deref_type_array: /* index the parent's value by the array index source */
+		result = ac_build_gep0(&ctx->ac, get_src(ctx, instr->parent),
+		                       get_src(ctx, instr->arr.index));
+		break;
+	default: /* any other deref type is not handled for shared mode */
+		unreachable("Unhandled deref_instr deref type");
+	}
+
+	ctx->ssa_defs[instr->dest.ssa.index] = result; /* makes the result visible to later get_src() users of this deref */
+}
+
 static void visit_cf_list(struct ac_nir_context *ctx,
                           struct exec_list *list);
 
@@ -3618,6 +3742,9 @@
 		case nir_instr_type_jump:
 			visit_jump(&ctx->ac, nir_instr_as_jump(instr));
 			break;
+		case nir_instr_type_deref:
+			visit_deref(ctx, nir_instr_as_deref(instr));
+			break;
 		default:
 			fprintf(stderr, "Unknown NIR instr type: ");
 			nir_print_instr(instr, stderr);
@@ -3716,10 +3843,12 @@
 		}
 	}
 
+	bool is_16bit = glsl_type_is_16bit(variable->type);
+	LLVMTypeRef type = is_16bit ? ctx->f16 : ctx->f32;
 	for (unsigned i = 0; i < attrib_count; ++i) {
 		for (unsigned chan = 0; chan < 4; chan++) {
 			abi->outputs[ac_llvm_reg_index_soa(output_loc + i, chan)] =
-		                       ac_build_alloca_undef(ctx, ctx->f32, "");
+		                       ac_build_alloca_undef(ctx, type, "");
 		}
 	}
 }
diff --git a/src/amd/common/ac_surface.c b/src/amd/common/ac_surface.c
index 0ec3cbd..94723dc 100644
--- a/src/amd/common/ac_surface.c
+++ b/src/amd/common/ac_surface.c
@@ -143,6 +143,10 @@
 		*addrlib_family = FAMILY_AI;
 		*addrlib_revid = get_first(AMDGPU_VEGA12_RANGE);
 		break;
+	case CHIP_VEGA20:
+		*addrlib_family = FAMILY_AI;
+		*addrlib_revid = get_first(AMDGPU_VEGA20_RANGE);
+		break;
 	case CHIP_RAVEN:
 		*addrlib_family = FAMILY_RV;
 		*addrlib_revid = get_first(AMDGPU_RAVEN_RANGE);
@@ -227,8 +231,16 @@
 	return addrCreateOutput.hLib;
 }
 
-static int surf_config_sanity(const struct ac_surf_config *config)
+static int surf_config_sanity(const struct ac_surf_config *config,
+			      unsigned flags)
 {
+	/* FMASK is allocated together with the color surface and can't be
+	 * allocated separately.
+	 */
+	assert(!(flags & RADEON_SURF_FMASK));
+	if (flags & RADEON_SURF_FMASK)
+		return -EINVAL;
+
 	/* all dimension must be at least 1 ! */
 	if (!config->info.width || !config->info.height || !config->info.depth ||
 	    !config->info.array_size || !config->info.levels)
@@ -241,10 +253,27 @@
 	case 4:
 	case 8:
 		break;
+	case 16:
+		if (flags & RADEON_SURF_Z_OR_SBUFFER)
+			return -EINVAL;
+		break;
 	default:
 		return -EINVAL;
 	}
 
+	if (!(flags & RADEON_SURF_Z_OR_SBUFFER)) {
+		switch (config->info.storage_samples) {
+		case 0:
+		case 1:
+		case 2:
+		case 4:
+		case 8:
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+
 	if (config->is_3d && config->info.array_size > 1)
 		return -EINVAL;
 	if (config->is_cube && config->info.depth > 1)
@@ -276,10 +305,10 @@
 	 */
 	if (config->info.levels == 1 &&
 	    AddrSurfInfoIn->tileMode == ADDR_TM_LINEAR_ALIGNED &&
-	    AddrSurfInfoIn->bpp) {
+	    AddrSurfInfoIn->bpp &&
+	    util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp)) {
 		unsigned alignment = 256 / (AddrSurfInfoIn->bpp / 8);
 
-		assert(util_is_power_of_two_or_zero(AddrSurfInfoIn->bpp));
 		AddrSurfInfoIn->width = align(AddrSurfInfoIn->width, alignment);
 	}
 
@@ -343,6 +372,9 @@
 	/* The previous level's flag tells us if we can use DCC for this level. */
 	if (AddrSurfInfoIn->flags.dccCompatible &&
 	    (level == 0 || AddrDccOut->subLvlCompressible)) {
+		bool prev_level_clearable = level == 0 ||
+					    AddrDccOut->dccRamSizeAligned;
+
 		AddrDccIn->colorSurfSize = AddrSurfInfoOut->surfSize;
 		AddrDccIn->tileMode = AddrSurfInfoOut->tileMode;
 		AddrDccIn->tileInfo = *AddrSurfInfoOut->pTileInfo;
@@ -355,10 +387,26 @@
 
 		if (ret == ADDR_OK) {
 			surf_level->dcc_offset = surf->dcc_size;
-			surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
 			surf->num_dcc_levels = level + 1;
 			surf->dcc_size = surf_level->dcc_offset + AddrDccOut->dccRamSize;
 			surf->dcc_alignment = MAX2(surf->dcc_alignment, AddrDccOut->dccRamBaseAlign);
+
+			/* If the DCC size of a subresource (1 mip level or 1 slice)
+			 * is not aligned, the DCC memory layout is not contiguous for
+			 * that subresource, which means we can't use fast clear.
+			 *
+			 * We only do fast clears for whole mipmap levels. If we did
+			 * per-slice fast clears, the same restriction would apply.
+			 * (i.e. only compute the slice size and see if it's aligned)
+			 *
+			 * The last level can be non-contiguous and still be clearable
+			 * if it's interleaved with the next level that doesn't exist.
+			 */
+			if (AddrDccOut->dccRamSizeAligned ||
+			    (prev_level_clearable && level == config->info.levels - 1))
+				surf_level->dcc_fast_clear_size = AddrDccOut->dccFastClearSize;
+			else
+				surf_level->dcc_fast_clear_size = 0;
 		}
 	}
 
@@ -392,6 +440,7 @@
 }
 
 #define   G_009910_MICRO_TILE_MODE(x)          (((x) >> 0) & 0x03)
+#define     V_009910_ADDR_SURF_THICK_MICRO_TILING                   0x03
 #define   G_009910_MICRO_TILE_MODE_NEW(x)      (((x) >> 22) & 0x07)
 
 static void gfx6_set_micro_tile_mode(struct radeon_surf *surf,
@@ -426,7 +475,6 @@
 	unsigned bpe = surf->bpe;
 
 	if (surf->flags & RADEON_SURF_SCANOUT &&
-	    !(surf->flags & RADEON_SURF_FMASK) &&
 	    config->info.samples <= 1 &&
 	    surf->blk_w <= 2 && surf->blk_h == 1) {
 		/* subsampled */
@@ -503,6 +551,66 @@
 	return 0;
 }
 
+void ac_compute_cmask(const struct radeon_info *info,
+		      const struct ac_surf_config *config,
+		      struct radeon_surf *surf)
+{ /* Fills surf->cmask_size, surf->cmask_alignment and surf->u.legacy.cmask_slice_tile_max. */
+	unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
+	unsigned num_pipes = info->num_tile_pipes;
+	unsigned cl_width, cl_height; /* chosen per pipe count by the switch below */
+
+	if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
+		return; /* nothing is computed for depth/stencil surfaces */
+
+	assert(info->chip_class <= VI); /* this path is only expected for chip classes up to VI */
+
+	switch (num_pipes) {
+	case 2:
+		cl_width = 32;
+		cl_height = 16;
+		break;
+	case 4:
+		cl_width = 32;
+		cl_height = 32;
+		break;
+	case 8:
+		cl_width = 64;
+		cl_height = 32;
+		break;
+	case 16: /* Hawaii */
+		cl_width = 64;
+		cl_height = 64;
+		break;
+	default: /* unsupported pipe count: the surf CMASK fields are left untouched */
+		assert(0);
+		return;
+	}
+
+	unsigned base_align = num_pipes * pipe_interleave_bytes;
+
+	unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width*8); /* level-0 block width padded to 8 * cl_width */
+	unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height*8); /* level-0 block height padded to 8 * cl_height */
+	unsigned slice_elements = (width * height) / (8*8); /* one element per 8x8 area of the padded slice */
+
+	/* Each element of CMASK is a nibble, so two elements share one byte. */
+	unsigned slice_bytes = slice_elements / 2;
+
+	surf->u.legacy.cmask_slice_tile_max = (width * height) / (128*128); /* number of 128x128 units in the padded slice */
+	if (surf->u.legacy.cmask_slice_tile_max)
+		surf->u.legacy.cmask_slice_tile_max -= 1; /* stored as count minus one; zero stays zero */
+
+	unsigned num_layers; /* 3D: depth, cube: 6 faces, otherwise: array size */
+	if (config->is_3d)
+		num_layers = config->info.depth;
+	else if (config->is_cube)
+		num_layers = 6;
+	else
+		num_layers = config->info.array_size;
+
+	surf->cmask_alignment = MAX2(256, base_align); /* never less than 256 */
+	surf->cmask_size = align(slice_bytes, base_align) * num_layers; /* each slice is padded to base_align before multiplying by the layer count */
+}
+
 /**
  * Fill in the tiling information in \p surf based on the given surface config.
  *
@@ -537,9 +645,8 @@
 
 	compressed = surf->blk_w == 4 && surf->blk_h == 4;
 
-	/* MSAA and FMASK require 2D tiling. */
-	if (config->info.samples > 1 ||
-	    (surf->flags & RADEON_SURF_FMASK))
+	/* MSAA requires 2D tiling. */
+	if (config->info.samples > 1)
 		mode = RADEON_SURF_MODE_2D;
 
 	/* DB doesn't support linear layouts. */
@@ -582,13 +689,18 @@
 	}
 
 	AddrDccIn.numSamples = AddrSurfInfoIn.numSamples =
-		config->info.samples ? config->info.samples : 1;
+		MAX2(1, config->info.samples);
 	AddrSurfInfoIn.tileIndex = -1;
 
+	if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER)) {
+		AddrDccIn.numSamples = AddrSurfInfoIn.numFrags =
+			MAX2(1, config->info.storage_samples);
+	}
+
 	/* Set the micro tile type. */
 	if (surf->flags & RADEON_SURF_SCANOUT)
 		AddrSurfInfoIn.tileType = ADDR_DISPLAYABLE;
-	else if (surf->flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))
+	else if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
 		AddrSurfInfoIn.tileType = ADDR_DEPTH_SAMPLE_ORDER;
 	else
 		AddrSurfInfoIn.tileType = ADDR_NON_DISPLAYABLE;
@@ -596,7 +708,6 @@
 	AddrSurfInfoIn.flags.color = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER);
 	AddrSurfInfoIn.flags.depth = (surf->flags & RADEON_SURF_ZBUFFER) != 0;
 	AddrSurfInfoIn.flags.cube = config->is_cube;
-	AddrSurfInfoIn.flags.fmask = (surf->flags & RADEON_SURF_FMASK) != 0;
 	AddrSurfInfoIn.flags.display = get_display_flag(config, surf);
 	AddrSurfInfoIn.flags.pow2Pad = config->info.levels > 1;
 	AddrSurfInfoIn.flags.tcCompatible = (surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) != 0;
@@ -661,8 +772,6 @@
 	if (AddrSurfInfoIn.tileMode >= ADDR_TM_2D_TILED_THIN1 &&
 	    surf->u.legacy.bankw && surf->u.legacy.bankh &&
 	    surf->u.legacy.mtilea && surf->u.legacy.tile_split) {
-		assert(!(surf->flags & RADEON_SURF_FMASK));
-
 		/* If any of these parameters are incorrect, the calculation
 		 * will fail. */
 		AddrTileInfoIn.banks = surf->u.legacy.num_banks;
@@ -809,6 +918,67 @@
 		}
 	}
 
+	/* Compute FMASK. */
+	if (config->info.samples >= 2 && AddrSurfInfoIn.flags.color) {
+		ADDR_COMPUTE_FMASK_INFO_INPUT fin = {0};
+		ADDR_COMPUTE_FMASK_INFO_OUTPUT fout = {0};
+		ADDR_TILEINFO fmask_tile_info = {};
+
+		fin.size = sizeof(fin);
+		fout.size = sizeof(fout);
+
+		fin.tileMode = AddrSurfInfoOut.tileMode;
+		fin.pitch = AddrSurfInfoOut.pitch;
+		fin.height = config->info.height;
+		fin.numSlices = AddrSurfInfoIn.numSlices;
+		fin.numSamples = AddrSurfInfoIn.numSamples;
+		fin.numFrags = AddrSurfInfoIn.numFrags;
+		fin.tileIndex = -1;
+		fout.pTileInfo = &fmask_tile_info;
+
+		r = AddrComputeFmaskInfo(addrlib, &fin, &fout);
+		if (r)
+			return r;
+
+		surf->fmask_size = fout.fmaskBytes;
+		surf->fmask_alignment = fout.baseAlign;
+		surf->fmask_tile_swizzle = 0;
+
+		surf->u.legacy.fmask.slice_tile_max =
+			(fout.pitch * fout.height) / 64;
+		if (surf->u.legacy.fmask.slice_tile_max)
+		    surf->u.legacy.fmask.slice_tile_max -= 1;
+
+		surf->u.legacy.fmask.tiling_index = fout.tileIndex;
+		surf->u.legacy.fmask.bankh = fout.pTileInfo->bankHeight;
+		surf->u.legacy.fmask.pitch_in_pixels = fout.pitch;
+
+		/* Compute tile swizzle for FMASK. */
+		if (config->info.fmask_surf_index &&
+		    !(surf->flags & RADEON_SURF_SHAREABLE)) {
+			ADDR_COMPUTE_BASE_SWIZZLE_INPUT xin = {0};
+			ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT xout = {0};
+
+			xin.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_INPUT);
+			xout.size = sizeof(ADDR_COMPUTE_BASE_SWIZZLE_OUTPUT);
+
+			/* This counter starts from 1 instead of 0. */
+			xin.surfIndex = p_atomic_inc_return(config->info.fmask_surf_index);
+			xin.tileIndex = fout.tileIndex;
+			xin.macroModeIndex = fout.macroModeIndex;
+			xin.pTileInfo = fout.pTileInfo;
+			xin.tileMode = fin.tileMode;
+
+			int r = AddrComputeBaseSwizzle(addrlib, &xin, &xout);
+			if (r != ADDR_OK)
+				return r;
+
+			assert(xout.tileSwizzle <=
+			       u_bit_consecutive(0, sizeof(surf->tile_swizzle) * 8));
+			surf->fmask_tile_swizzle = xout.tileSwizzle;
+		}
+	}
+
 	/* Recalculate the whole DCC miptree size including disabled levels.
 	 * This is what addrlib does, but calling addrlib would be a lot more
 	 * complicated.
@@ -829,13 +999,34 @@
 	/* Make sure HTILE covers the whole miptree, because the shader reads
 	 * TC-compatible HTILE even for levels where it's disabled by DB.
 	 */
-	if (surf->htile_size && config->info.levels > 1)
-		surf->htile_size *= 2;
+	if (surf->htile_size && config->info.levels > 1 &&
+	    surf->flags & RADEON_SURF_TC_COMPATIBLE_HTILE) {
+		/* MSAA can't occur with levels > 1, so ignore the sample count. */
+		const unsigned total_pixels = surf->surf_size / surf->bpe;
+		const unsigned htile_block_size = 8 * 8;
+		const unsigned htile_element_size = 4;
+
+		surf->htile_size = (total_pixels / htile_block_size) *
+				   htile_element_size;
+		surf->htile_size = align(surf->htile_size, surf->htile_alignment);
+	}
 
 	surf->is_linear = surf->u.legacy.level[0].mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
 	surf->is_displayable = surf->is_linear ||
 			       surf->micro_tile_mode == RADEON_MICRO_MODE_DISPLAY ||
 			       surf->micro_tile_mode == RADEON_MICRO_MODE_ROTATED;
+
+	/* The rotated micro tile mode doesn't work if both CMASK and RB+ are
+	 * used at the same time. This case is not currently expected to occur
+	 * because we don't use rotated. Enforce this restriction on all chips
+	 * to facilitate testing.
+	 */
+	if (surf->micro_tile_mode == RADEON_MICRO_MODE_ROTATED) {
+		assert(!"rotate micro tile mode is unsupported");
+		return ADDR_ERROR;
+	}
+
+	ac_compute_cmask(info, config, surf);
 	return 0;
 }
 
@@ -1095,8 +1286,8 @@
 
 			surf->u.gfx9.fmask.swizzle_mode = fin.swizzleMode;
 			surf->u.gfx9.fmask.epitch = fout.pitch - 1;
-			surf->u.gfx9.fmask_size = fout.fmaskBytes;
-			surf->u.gfx9.fmask_alignment = fout.baseAlign;
+			surf->fmask_size = fout.fmaskBytes;
+			surf->fmask_alignment = fout.baseAlign;
 
 			/* Compute tile swizzle for the FMASK surface. */
 			if (config->info.fmask_surf_index &&
@@ -1122,8 +1313,8 @@
 					return ret;
 
 				assert(xout.pipeBankXor <=
-				       u_bit_consecutive(0, sizeof(surf->u.gfx9.fmask_tile_swizzle) * 8));
-				surf->u.gfx9.fmask_tile_swizzle = xout.pipeBankXor;
+				       u_bit_consecutive(0, sizeof(surf->fmask_tile_swizzle) * 8));
+				surf->fmask_tile_swizzle = xout.pipeBankXor;
 			}
 		}
 
@@ -1135,7 +1326,7 @@
 			cin.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_INPUT);
 			cout.size = sizeof(ADDR2_COMPUTE_CMASK_INFO_OUTPUT);
 
-			if (in->numSamples) {
+			if (in->numSamples > 1) {
 				/* FMASK is always aligned. */
 				cin.cMaskFlags.pipeAligned = 1;
 				cin.cMaskFlags.rbAligned = 1;
@@ -1160,8 +1351,8 @@
 
 			surf->u.gfx9.cmask.rb_aligned = cin.cMaskFlags.rbAligned;
 			surf->u.gfx9.cmask.pipe_aligned = cin.cMaskFlags.pipeAligned;
-			surf->u.gfx9.cmask_size = cout.cmaskBytes;
-			surf->u.gfx9.cmask_alignment = cout.baseAlign;
+			surf->cmask_size = cout.cmaskBytes;
+			surf->cmask_alignment = cout.baseAlign;
 		}
 	}
 
@@ -1178,8 +1369,6 @@
 	ADDR2_COMPUTE_SURFACE_INFO_INPUT AddrSurfInfoIn = {0};
 	int r;
 
-	assert(!(surf->flags & RADEON_SURF_FMASK));
-
 	AddrSurfInfoIn.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
 
 	compressed = surf->blk_w == 4 && surf->blk_h == 4;
@@ -1217,6 +1406,10 @@
 			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
 			AddrSurfInfoIn.format = ADDR_FMT_32_32;
 			break;
+		case 12:
+			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
+			AddrSurfInfoIn.format = ADDR_FMT_32_32_32;
+			break;
 		case 16:
 			assert(!(surf->flags & RADEON_SURF_Z_OR_SBUFFER));
 			AddrSurfInfoIn.format = ADDR_FMT_32_32_32_32;
@@ -1236,9 +1429,12 @@
 	AddrSurfInfoIn.flags.opt4space = 1;
 
 	AddrSurfInfoIn.numMipLevels = config->info.levels;
-	AddrSurfInfoIn.numSamples = config->info.samples ? config->info.samples : 1;
+	AddrSurfInfoIn.numSamples = MAX2(1, config->info.samples);
 	AddrSurfInfoIn.numFrags = AddrSurfInfoIn.numSamples;
 
+	if (!(surf->flags & RADEON_SURF_Z_OR_SBUFFER))
+		AddrSurfInfoIn.numFrags = MAX2(1, config->info.storage_samples);
+
 	/* GFX9 doesn't support 1D depth textures, so allocate all 1D textures
 	 * as 2D to avoid having shader variants for 1D vs 2D, so all shaders
 	 * must sample 1D textures as 2D. */
@@ -1291,13 +1487,13 @@
 
 	surf->num_dcc_levels = 0;
 	surf->surf_size = 0;
+	surf->fmask_size = 0;
 	surf->dcc_size = 0;
 	surf->htile_size = 0;
 	surf->htile_slice_size = 0;
 	surf->u.gfx9.surf_offset = 0;
 	surf->u.gfx9.stencil_offset = 0;
-	surf->u.gfx9.fmask_size = 0;
-	surf->u.gfx9.cmask_size = 0;
+	surf->cmask_size = 0;
 
 	/* Calculate texture layout information. */
 	r = gfx9_compute_miptree(addrlib, config, surf, compressed,
@@ -1371,8 +1567,13 @@
 		case ADDR_SW_4KB_R_X:
 		case ADDR_SW_64KB_R_X:
 		case ADDR_SW_VAR_R_X:
-			surf->micro_tile_mode = RADEON_MICRO_MODE_ROTATED;
-			break;
+			/* The rotated micro tile mode doesn't work if both CMASK and RB+ are
+			 * used at the same time. This case is not currently expected to occur
+			 * because we don't use rotated. Enforce this restriction on all chips
+			 * to facilitate testing.
+			 */
+			assert(!"rotate micro tile mode is unsupported");
+			return ADDR_ERROR;
 
 		/* Z = depth. */
 		case ADDR_SW_4KB_Z:
@@ -1391,7 +1592,7 @@
 
 	/* Temporary workaround to prevent VM faults and hangs. */
 	if (info->family == CHIP_VEGA12)
-		surf->u.gfx9.fmask_size *= 8;
+		surf->fmask_size *= 8;
 
 	return 0;
 }
@@ -1403,7 +1604,7 @@
 {
 	int r;
 
-	r = surf_config_sanity(config);
+	r = surf_config_sanity(config, surf->flags);
 	if (r)
 		return r;
 
diff --git a/src/amd/common/ac_surface.h b/src/amd/common/ac_surface.h
index 37df859..8ba964e 100644
--- a/src/amd/common/ac_surface.h
+++ b/src/amd/common/ac_surface.h
@@ -79,6 +79,13 @@
     enum radeon_surf_mode       mode:2;
 };
 
+struct legacy_surf_fmask { /* FMASK layout for the surf->u.legacy path, filled from AddrComputeFmaskInfo output */
+    unsigned slice_tile_max; /* (FMASK pitch * height) / 64, minus 1 when nonzero; max 4M */
+    uint8_t tiling_index;    /* tile index returned by addrlib for the FMASK; max 31 */
+    uint8_t bankh;           /* bank height from the FMASK tile info; max 8 */
+    uint16_t pitch_in_pixels; /* FMASK pitch as returned by addrlib */
+};
+
 struct legacy_surf_layout {
     unsigned                    bankw:4;  /* max 8 */
     unsigned                    bankh:4;  /* max 8 */
@@ -101,6 +108,8 @@
     struct legacy_surf_level    stencil_level[RADEON_SURF_MAX_LEVELS];
     uint8_t                     tiling_index[RADEON_SURF_MAX_LEVELS];
     uint8_t                     stencil_tiling_index[RADEON_SURF_MAX_LEVELS];
+    struct legacy_surf_fmask    fmask;
+    unsigned                    cmask_slice_tile_max;
 };
 
 /* Same as addrlib - AddrResourceType. */
@@ -142,13 +151,6 @@
     uint16_t                    dcc_pitch_max;  /* (mip chain pitch - 1) */
 
     uint64_t                    stencil_offset; /* separate stencil */
-    uint64_t                    fmask_size;
-    uint64_t                    cmask_size;
-
-    uint32_t                    fmask_alignment;
-    uint32_t                    cmask_alignment;
-
-    uint8_t                     fmask_tile_swizzle;
 };
 
 struct radeon_surf {
@@ -188,18 +190,24 @@
      * - depth/stencil if HTILE is not TC-compatible and if the gen is not GFX9
      */
     uint8_t                     tile_swizzle;
+    uint8_t                     fmask_tile_swizzle;
 
     uint64_t                    surf_size;
+    uint64_t                    fmask_size;
+    uint32_t                    surf_alignment;
+    uint32_t                    fmask_alignment;
+
     /* DCC and HTILE are very small. */
     uint32_t                    dcc_size;
-    uint32_t                    htile_size;
-
-    uint32_t                    htile_slice_size;
-
-    uint32_t                    surf_alignment;
     uint32_t                    dcc_alignment;
+
+    uint32_t                    htile_size;
+    uint32_t                    htile_slice_size;
     uint32_t                    htile_alignment;
 
+    uint32_t                    cmask_size;
+    uint32_t                    cmask_alignment;
+
     union {
         /* R600-VI return values.
          *
@@ -217,12 +225,13 @@
 	uint32_t width;
 	uint32_t height;
 	uint32_t depth;
-	uint8_t samples;
+	uint8_t samples; /* For Z/S: samples; For color: FMASK coverage samples */
+	uint8_t storage_samples; /* For color: allocated samples */
 	uint8_t levels;
 	uint8_t num_channels; /* heuristic for displayability */
 	uint16_t array_size;
 	uint32_t *surf_index; /* Set a monotonic counter for tile swizzling. */
-	uint32_t *fmask_surf_index; /* GFX9+ */
+	uint32_t *fmask_surf_index;
 };
 
 struct ac_surf_config {
@@ -240,6 +249,10 @@
 		       enum radeon_surf_mode mode,
 		       struct radeon_surf *surf);
 
+void ac_compute_cmask(const struct radeon_info *info,
+		      const struct ac_surf_config *config,
+		      struct radeon_surf *surf);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/amd/common/amd_family.h b/src/amd/common/amd_family.h
index 6410368..a282898 100644
--- a/src/amd/common/amd_family.h
+++ b/src/amd/common/amd_family.h
@@ -95,6 +95,7 @@
     CHIP_VEGAM,
     CHIP_VEGA10,
     CHIP_VEGA12,
+    CHIP_VEGA20,
     CHIP_RAVEN,
     CHIP_LAST,
 };
diff --git a/src/amd/common/gfx9d.h b/src/amd/common/gfx9d.h
index 8c61645..d18e665 100644
--- a/src/amd/common/gfx9d.h
+++ b/src/amd/common/gfx9d.h
@@ -1123,7 +1123,6 @@
 #define   S_030960_HW_USE_ONLY(x)                                     (((unsigned)(x) & 0x1) << 23)
 #define   G_030960_HW_USE_ONLY(x)                                     (((x) >> 23) & 0x1)
 #define   C_030960_HW_USE_ONLY                                        0xFF7FFFFF
-#define R_030964_VGT_OBJECT_ID                                          0x030964
 #define R_030968_VGT_INSTANCE_BASE_ID                                   0x030968
 #define R_030A00_PA_SU_LINE_STIPPLE_VALUE                               0x030A00
 #define   S_030A00_LINE_STIPPLE_VALUE(x)                              (((unsigned)(x) & 0xFFFFFF) << 0)
@@ -1195,19 +1194,6 @@
 #define   S_030E04_ADDRESS(x)                                         (((unsigned)(x) & 0xFF) << 0)
 #define   G_030E04_ADDRESS(x)                                         (((x) >> 0) & 0xFF)
 #define   C_030E04_ADDRESS                                            0xFFFFFF00
-#define R_030E08_TA_GRAD_ADJ_UCONFIG                                    0x030E08
-#define   S_030E08_GRAD_ADJ_0(x)                                      (((unsigned)(x) & 0xFF) << 0)
-#define   G_030E08_GRAD_ADJ_0(x)                                      (((x) >> 0) & 0xFF)
-#define   C_030E08_GRAD_ADJ_0                                         0xFFFFFF00
-#define   S_030E08_GRAD_ADJ_1(x)                                      (((unsigned)(x) & 0xFF) << 8)
-#define   G_030E08_GRAD_ADJ_1(x)                                      (((x) >> 8) & 0xFF)
-#define   C_030E08_GRAD_ADJ_1                                         0xFFFF00FF
-#define   S_030E08_GRAD_ADJ_2(x)                                      (((unsigned)(x) & 0xFF) << 16)
-#define   G_030E08_GRAD_ADJ_2(x)                                      (((x) >> 16) & 0xFF)
-#define   C_030E08_GRAD_ADJ_2                                         0xFF00FFFF
-#define   S_030E08_GRAD_ADJ_3(x)                                      (((unsigned)(x) & 0xFF) << 24)
-#define   G_030E08_GRAD_ADJ_3(x)                                      (((x) >> 24) & 0xFF)
-#define   C_030E08_GRAD_ADJ_3                                         0x00FFFFFF
 #define R_030F00_DB_OCCLUSION_COUNT0_LOW                                0x030F00
 #define R_008F00_SQ_BUF_RSRC_WORD0                                      0x008F00
 #define R_030F04_DB_OCCLUSION_COUNT0_HI                                 0x030F04
@@ -4084,10 +4070,6 @@
 #define   S_028060_DISALLOW_OVERFLOW(x)                               (((unsigned)(x) & 0x1) << 3)
 #define   G_028060_DISALLOW_OVERFLOW(x)                               (((x) >> 3) & 0x1)
 #define   C_028060_DISALLOW_OVERFLOW                                  0xFFFFFFF7
-#define R_028064_DB_RENDER_FILTER                                       0x028064
-#define   S_028064_PS_INVOKE_MASK(x)                                  (((unsigned)(x) & 0xFFFF) << 0)
-#define   G_028064_PS_INVOKE_MASK(x)                                  (((x) >> 0) & 0xFFFF)
-#define   C_028064_PS_INVOKE_MASK                                     0xFFFF0000
 #define R_028068_DB_Z_INFO2                                             0x028068
 #define   S_028068_EPITCH(x)                                          (((unsigned)(x) & 0xFFFF) << 0)
 #define   G_028068_EPITCH(x)                                          (((x) >> 0) & 0xFFFF)
@@ -4417,9 +4399,6 @@
 #define   S_02835C_NUM_RB_PER_SE(x)                                   (((unsigned)(x) & 0x03) << 5)
 #define   G_02835C_NUM_RB_PER_SE(x)                                   (((x) >> 5) & 0x03)
 #define   C_02835C_NUM_RB_PER_SE                                      0xFFFFFF9F
-#define   S_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING(x)              (((unsigned)(x) & 0x1) << 8)
-#define   G_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING(x)              (((x) >> 8) & 0x1)
-#define   C_02835C_DISABLE_SRBSL_DB_OPTIMIZED_PACKING                 0xFFFFFEFF
 #define R_028360_CP_PERFMON_CNTX_CNTL                                   0x028360
 #define   S_028360_PERFMON_ENABLE(x)                                  (((unsigned)(x) & 0x1) << 31)
 #define   G_028360_PERFMON_ENABLE(x)                                  (((x) >> 31) & 0x1)
@@ -4463,26 +4442,6 @@
 #define   S_0283A8_BOT_QTR(x)                                         (((unsigned)(x) & 0xFF) << 24)
 #define   G_0283A8_BOT_QTR(x)                                         (((x) >> 24) & 0xFF)
 #define   C_0283A8_BOT_QTR                                            0x00FFFFFF
-#define R_0283AC_PA_SC_FOV_WINDOW_LR                                    0x0283AC
-#define   S_0283AC_LEFT_EYE_FOV_LEFT(x)                               (((unsigned)(x) & 0xFF) << 0)
-#define   G_0283AC_LEFT_EYE_FOV_LEFT(x)                               (((x) >> 0) & 0xFF)
-#define   C_0283AC_LEFT_EYE_FOV_LEFT                                  0xFFFFFF00
-#define   S_0283AC_LEFT_EYE_FOV_RIGHT(x)                              (((unsigned)(x) & 0xFF) << 8)
-#define   G_0283AC_LEFT_EYE_FOV_RIGHT(x)                              (((x) >> 8) & 0xFF)
-#define   C_0283AC_LEFT_EYE_FOV_RIGHT                                 0xFFFF00FF
-#define   S_0283AC_RIGHT_EYE_FOV_LEFT(x)                              (((unsigned)(x) & 0xFF) << 16)
-#define   G_0283AC_RIGHT_EYE_FOV_LEFT(x)                              (((x) >> 16) & 0xFF)
-#define   C_0283AC_RIGHT_EYE_FOV_LEFT                                 0xFF00FFFF
-#define   S_0283AC_RIGHT_EYE_FOV_RIGHT(x)                             (((unsigned)(x) & 0xFF) << 24)
-#define   G_0283AC_RIGHT_EYE_FOV_RIGHT(x)                             (((x) >> 24) & 0xFF)
-#define   C_0283AC_RIGHT_EYE_FOV_RIGHT                                0x00FFFFFF
-#define R_0283B0_PA_SC_FOV_WINDOW_TB                                    0x0283B0
-#define   S_0283B0_FOV_TOP(x)                                         (((unsigned)(x) & 0xFF) << 0)
-#define   G_0283B0_FOV_TOP(x)                                         (((x) >> 0) & 0xFF)
-#define   C_0283B0_FOV_TOP                                            0xFFFFFF00
-#define   S_0283B0_FOV_BOT(x)                                         (((unsigned)(x) & 0xFF) << 8)
-#define   G_0283B0_FOV_BOT(x)                                         (((x) >> 8) & 0xFF)
-#define   C_0283B0_FOV_BOT                                            0xFFFF00FF
 #define R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX                           0x02840C
 #define R_028414_CB_BLEND_RED                                           0x028414
 #define R_028418_CB_BLEND_GREEN                                         0x028418
@@ -5772,9 +5731,6 @@
 #define   S_028830_RECTANGLE_FILTER_DISABLE(x)                        (((unsigned)(x) & 0x1) << 4)
 #define   G_028830_RECTANGLE_FILTER_DISABLE(x)                        (((x) >> 4) & 0x1)
 #define   C_028830_RECTANGLE_FILTER_DISABLE                           0xFFFFFFEF
-#define   S_028830_SRBSL_ENABLE(x)                                    (((unsigned)(x) & 0x1) << 5)
-#define   G_028830_SRBSL_ENABLE(x)                                    (((x) >> 5) & 0x1)
-#define   C_028830_SRBSL_ENABLE                                       0xFFFFFFDF
 #define R_028834_PA_CL_OBJPRIM_ID_CNTL                                  0x028834
 #define   S_028834_OBJ_ID_SEL(x)                                      (((unsigned)(x) & 0x1) << 0)
 #define   G_028834_OBJ_ID_SEL(x)                                      (((x) >> 0) & 0x1)
@@ -5808,6 +5764,7 @@
 #define   S_02883C_USE_PROVOKING_ZW(x)                                (((unsigned)(x) & 0x1) << 4)
 #define   G_02883C_USE_PROVOKING_ZW(x)                                (((x) >> 4) & 0x1)
 #define   C_02883C_USE_PROVOKING_ZW                                   0xFFFFFFEF
+#define R_028840_PA_STEREO_CNTL                                         0x028840
 #define R_028A00_PA_SU_POINT_SIZE                                       0x028A00
 #define   S_028A00_HEIGHT(x)                                          (((unsigned)(x) & 0xFFFF) << 0)
 #define   G_028A00_HEIGHT(x)                                          (((x) >> 0) & 0xFFFF)
@@ -6273,10 +6230,6 @@
 #define   S_028A98_OBJECT_ID_INST_EN(x)                               (((unsigned)(x) & 0x1) << 3)
 #define   G_028A98_OBJECT_ID_INST_EN(x)                               (((x) >> 3) & 0x1)
 #define   C_028A98_OBJECT_ID_INST_EN                                  0xFFFFFFF7
-#define R_028A9C_VGT_INDEX_PAYLOAD_CNTL                                 0x028A9C
-#define   S_028A9C_COMPOUND_INDEX_EN(x)                               (((unsigned)(x) & 0x1) << 0)
-#define   G_028A9C_COMPOUND_INDEX_EN(x)                               (((x) >> 0) & 0x1)
-#define   C_028A9C_COMPOUND_INDEX_EN                                  0xFFFFFFFE
 #define R_028AA0_VGT_INSTANCE_STEP_RATE_0                               0x028AA0
 #define R_028AA4_VGT_INSTANCE_STEP_RATE_1                               0x028AA4
 #define R_028AAC_VGT_ESGS_RING_ITEMSIZE                                 0x028AAC
diff --git a/src/amd/common/sid.h b/src/amd/common/sid.h
index 30cfeb9..303c036 100644
--- a/src/amd/common/sid.h
+++ b/src/amd/common/sid.h
@@ -239,7 +239,7 @@
 #define     S_411_ENGINE(x)		(((unsigned)(x) & 0x1) << 27)
 #define       V_411_ME			0
 #define       V_411_PFP			1
-#define     S_411_DSL_SEL(x)		(((unsigned)(x) & 0x3) << 20)
+#define     S_411_DST_SEL(x)		(((unsigned)(x) & 0x3) << 20)
 #define       V_411_DST_ADDR		0
 #define       V_411_GDS			1 /* program DAS to 1 as well */
 #define       V_411_NOWHERE		2 /* new for GFX9 */
@@ -294,7 +294,7 @@
 #define       V_500_GDS			1 /* program SAS to 1 as well */
 #define       V_500_DATA		2
 #define       V_500_SRC_ADDR_TC_L2	3 /* new for CIK */
-#define     S_500_DSL_SEL(x)		(((unsigned)(x) & 0x3) << 20)
+#define     S_500_DST_SEL(x)		(((unsigned)(x) & 0x3) << 20)
 #define       V_500_DST_ADDR		0
 #define       V_500_GDS			1 /* program DAS to 1 as well */
 #define       V_500_NOWHERE		2 /* new for GFX9 */
diff --git a/src/amd/common/sid_tables.py b/src/amd/common/sid_tables.py
index 4e53ace..7b5e626 100644
--- a/src/amd/common/sid_tables.py
+++ b/src/amd/common/sid_tables.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 CopyRight = '''
 /*
@@ -64,8 +65,8 @@
         """
         fragments = [
             '"%s\\0" /* %s */' % (
-                te[0].encode('string_escape'),
-                ', '.join(str(idx) for idx in te[2])
+                te[0].encode('unicode_escape').decode(),
+                ', '.join(str(idx) for idx in sorted(te[2]))
             )
             for te in self.table
         ]
@@ -333,10 +334,10 @@
     strings_offsets = IntTable("int")
     fields = FieldTable()
 
-    print '/* This file is autogenerated by sid_tables.py from sid.h. Do not edit directly. */'
-    print
-    print CopyRight.strip()
-    print '''
+    print('/* This file is autogenerated by sid_tables.py from sid.h. Do not edit directly. */')
+    print()
+    print(CopyRight.strip())
+    print('''
 #ifndef SID_TABLES_H
 #define SID_TABLES_H
 
@@ -358,17 +359,17 @@
         unsigned name_offset;
         unsigned op;
 };
-'''
+''')
 
-    print 'static const struct si_packet3 packet3_table[] = {'
+    print('static const struct si_packet3 packet3_table[] = {')
     for pkt in packets:
-        print '\t{%s, %s},' % (strings.add(pkt[5:]), pkt)
-    print '};'
-    print
+        print('\t{%s, %s},' % (strings.add(pkt[5:]), pkt))
+    print('};')
+    print()
 
     regs = {}
     for asic in asics:
-        print 'static const struct si_reg %s_reg_table[] = {' % (asic.name)
+        print('static const struct si_reg %s_reg_table[] = {' % (asic.name))
         for reg in asic.registers:
             # Only output a register that was changed or added relative to
             # the previous generation
@@ -377,27 +378,27 @@
                 continue
 
             if len(reg.fields):
-                print '\t{%s, %s, %s, %s},' % (strings.add(reg.name), reg.r_name,
-                    len(reg.fields), fields.add(reg.fields))
+                print('\t{%s, %s, %s, %s},' % (strings.add(reg.name), reg.r_name,
+                    len(reg.fields), fields.add(reg.fields)))
             else:
-                print '\t{%s, %s},' % (strings.add(reg.name), reg.r_name)
+                print('\t{%s, %s},' % (strings.add(reg.name), reg.r_name))
 
             regs[reg.r_name] = reg
-        print '};'
-        print
+        print('};')
+        print()
 
     fields.emit(sys.stdout, strings, strings_offsets)
 
-    print
+    print()
 
     strings.emit(sys.stdout, "sid_strings")
 
-    print
+    print()
 
     strings_offsets.emit(sys.stdout, "sid_strings_offsets")
 
-    print
-    print '#endif'
+    print()
+    print('#endif')
 
 
 def main():
diff --git a/src/amd/vulkan/.gitignore b/src/amd/vulkan/.gitignore
index 7c02e42..1aabfc0 100644
--- a/src/amd/vulkan/.gitignore
+++ b/src/amd/vulkan/.gitignore
@@ -2,6 +2,7 @@
 /radv_entrypoints.c
 /radv_entrypoints.h
 /radv_extensions.c
+/radv_extensions.h
 /radv_timestamp.h
 /dev_icd.json
 /vk_format_table.c
diff --git a/src/amd/vulkan/Android.mk b/src/amd/vulkan/Android.mk
new file mode 100644
index 0000000..51b0356
--- /dev/null
+++ b/src/amd/vulkan/Android.mk
@@ -0,0 +1,168 @@
+# Copyright © 2018 Advanced Micro Devices, Inc.
+# Copyright © 2018 Mauro Rossi issor.oruam@gmail.com
+
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get VULKAN_FILES and VULKAN_GENERATED_FILES
+include $(LOCAL_PATH)/Makefile.sources
+
+# The gallium includes are for the util/u_math.h include from main/macros.h
+
+RADV_COMMON_INCLUDES := \
+	$(MESA_TOP)/include \
+	$(MESA_TOP)/src/ \
+	$(MESA_TOP)/src/vulkan/wsi \
+	$(MESA_TOP)/src/vulkan/util \
+	$(MESA_TOP)/src/amd \
+	$(MESA_TOP)/src/amd/common \
+	$(MESA_TOP)/src/compiler \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/mesa \
+	$(MESA_TOP)/src/mesa/drivers/dri/common \
+	$(MESA_TOP)/src/gallium/auxiliary \
+	$(MESA_TOP)/src/gallium/include \
+	frameworks/native/vulkan/include
+
+RADV_SHARED_LIBRARIES := libdrm_amdgpu
+
+ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),)
+RADV_SHARED_LIBRARIES += libnativewindow
+endif
+
+#
+# libmesa_radv_common
+#
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := libmesa_radv_common
+LOCAL_MODULE_CLASS := STATIC_LIBRARIES
+
+intermediates := $(call local-generated-sources-dir)
+
+LOCAL_SRC_FILES := \
+	$(VULKAN_FILES)
+
+LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
+LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR
+
+$(call mesa-build-with-llvm)
+
+LOCAL_C_INCLUDES := \
+	$(RADV_COMMON_INCLUDES) \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_amd_common,,) \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_nir,,)/nir \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_radv_common,,) \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_vulkan_util,,)/util
+
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_vulkan_util
+
+LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.c
+LOCAL_GENERATED_SOURCES += $(intermediates)/radv_entrypoints.h
+LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.c
+LOCAL_GENERATED_SOURCES += $(intermediates)/radv_extensions.h
+LOCAL_GENERATED_SOURCES += $(intermediates)/vk_format_table.c
+
+RADV_ENTRYPOINTS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_entrypoints_gen.py
+RADV_EXTENSIONS_SCRIPT := $(MESA_TOP)/src/amd/vulkan/radv_extensions.py
+VK_FORMAT_TABLE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_table.py
+VK_FORMAT_PARSE_SCRIPT := $(MESA_TOP)/src/amd/vulkan/vk_format_parse.py
+
+vulkan_api_xml = $(MESA_TOP)/src/vulkan/registry/vk.xml
+vk_format_layout_csv = $(MESA_TOP)/src/amd/vulkan/vk_format_layout.csv
+
+$(intermediates)/radv_entrypoints.c: $(RADV_ENTRYPOINTS_SCRIPT) \
+					$(RADV_EXTENSIONS_SCRIPT) \
+					$(vulkan_api_xml)
+	@mkdir -p $(dir $@)
+	$(MESA_PYTHON2) $(RADV_ENTRYPOINTS_SCRIPT) \
+		--xml $(vulkan_api_xml) \
+		--outdir $(dir $@)
+
+$(intermediates)/radv_entrypoints.h: $(intermediates)/radv_entrypoints.c
+
+$(intermediates)/radv_extensions.c: $(RADV_EXTENSIONS_SCRIPT) $(vulkan_api_xml)
+	@mkdir -p $(dir $@)
+	$(MESA_PYTHON2) $(RADV_EXTENSIONS_SCRIPT) \
+		--xml $(vulkan_api_xml) \
+		--out-c $@ \
+		--out-h $(addsuffix .h,$(basename $@))
+
+$(intermediates)/radv_extensions.h: $(intermediates)/radv_extensions.c
+
+$(intermediates)/vk_format_table.c: $(VK_FORMAT_TABLE_SCRIPT) \
+					$(VK_FORMAT_PARSE_SCRIPT) \
+					$(vk_format_layout_csv)
+	@mkdir -p $(dir $@)
+	$(MESA_PYTHON2) $(VK_FORMAT_TABLE_SCRIPT) $(vk_format_layout_csv) > $@
+
+LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(MESA_TOP)/src/amd/vulkan \
+	$(intermediates)
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+#
+# vulkan.radv (Android module name of libvulkan_radeon)
+#
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := vulkan.radv
+LOCAL_MODULE_CLASS := SHARED_LIBRARIES
+LOCAL_PROPRIETARY_MODULE := true
+LOCAL_MODULE_RELATIVE_PATH := hw
+
+LOCAL_LDFLAGS += -Wl,--build-id=sha1
+
+LOCAL_SRC_FILES := \
+	$(VULKAN_ANDROID_FILES)
+
+LOCAL_CFLAGS += -DFORCE_BUILD_AMDGPU   # instructs LLVM to declare LLVMInitializeAMDGPU* functions
+LOCAL_CFLAGS += -DVK_USE_PLATFORM_ANDROID_KHR
+
+$(call mesa-build-with-llvm)
+
+LOCAL_C_INCLUDES := \
+	$(RADV_COMMON_INCLUDES) \
+	$(call generated-sources-dir-for,STATIC_LIBRARIES,libmesa_radv_common,,)
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(MESA_TOP)/src/amd/vulkan \
+	$(intermediates)
+
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_util \
+	libmesa_nir \
+	libmesa_glsl \
+	libmesa_compiler \
+	libmesa_amdgpu_addrlib \
+	libmesa_amd_common \
+	libmesa_radv_common
+
+LOCAL_SHARED_LIBRARIES += $(RADV_SHARED_LIBRARIES) libz libsync liblog
+
+include $(MESA_COMMON_MK)
+include $(BUILD_SHARED_LIBRARY)
diff --git a/src/amd/vulkan/Makefile.am b/src/amd/vulkan/Makefile.am
index 91b994d..e28f032 100644
--- a/src/amd/vulkan/Makefile.am
+++ b/src/amd/vulkan/Makefile.am
@@ -59,6 +59,10 @@
 	$(PTHREAD_CFLAGS) \
 	$(LLVM_CFLAGS)
 
+AM_CXXFLAGS = \
+	$(VISIBILITY_CXXFLAGS) \
+	$(LLVM_CXXFLAGS)
+
 VULKAN_SOURCES = \
 	$(VULKAN_GENERATED_FILES) \
 	$(VULKAN_FILES)
@@ -80,6 +84,22 @@
 	$(DLOPEN_LIBS) \
 	-lm
 
+if HAVE_PLATFORM_DRM
+AM_CPPFLAGS += \
+	-DVK_USE_PLATFORM_DISPLAY_KHR
+
+VULKAN_SOURCES += $(VULKAN_WSI_DISPLAY_FILES)
+endif
+
+if HAVE_XLIB_LEASE
+AM_CPPFLAGS += \
+	-DVK_USE_PLATFORM_XLIB_XRANDR_EXT \
+	$(XCB_RANDR_CFLAGS) \
+	$(XLIB_RANDR_CFLAGS)
+
+VULKAN_LIB_DEPS += $(XCB_RANDR_LIBS)
+endif
+
 if HAVE_PLATFORM_X11
 AM_CPPFLAGS += \
 	$(XCB_DRI3_CFLAGS) \
diff --git a/src/amd/vulkan/Makefile.sources b/src/amd/vulkan/Makefile.sources
index b1624e2..53a6383 100644
--- a/src/amd/vulkan/Makefile.sources
+++ b/src/amd/vulkan/Makefile.sources
@@ -54,6 +54,7 @@
 	radv_meta_resolve_cs.c \
 	radv_meta_resolve_fs.c \
 	radv_nir_to_llvm.c \
+	radv_llvm_helper.cpp \
 	radv_pass.c \
 	radv_pipeline.c \
 	radv_pipeline_cache.c \
@@ -62,6 +63,7 @@
 	radv_shader.c \
 	radv_shader_info.c \
 	radv_shader.h \
+	radv_shader_helper.h \
 	radv_query.c \
 	radv_util.c \
 	radv_util.h \
@@ -79,6 +81,9 @@
 VULKAN_WSI_X11_FILES := \
 	radv_wsi_x11.c
 
+VULKAN_WSI_DISPLAY_FILES := \
+	radv_wsi_display.c
+
 VULKAN_GENERATED_FILES := \
 	radv_entrypoints.c \
 	radv_entrypoints.h \
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
index b5a99fe..7998ba8 100644
--- a/src/amd/vulkan/meson.build
+++ b/src/amd/vulkan/meson.build
@@ -67,6 +67,7 @@
   'radv_descriptor_set.h',
   'radv_formats.c',
   'radv_image.c',
+  'radv_llvm_helper.cpp',
   'radv_meta.c',
   'radv_meta.h',
   'radv_meta_blit.c',
@@ -88,6 +89,7 @@
   'radv_radeon_winsys.h',
   'radv_shader.c',
   'radv_shader.h',
+  'radv_shader_helper.h',
   'radv_shader_info.c',
   'radv_query.c',
   'radv_util.c',
@@ -115,6 +117,16 @@
   libradv_files += files('radv_wsi_wayland.c')
 endif
 
+if with_platform_drm
+  radv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR'
+  libradv_files += files('radv_wsi_display.c')
+endif
+
+if with_xlib_lease
+  radv_deps += [dep_xcb_xrandr, dep_xlib_xrandr]
+  radv_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
+endif
+
 libvulkan_radeon = shared_library(
   'vulkan_radeon',
   [libradv_files, radv_entrypoints, radv_extensions_c, vk_format_table_c],
@@ -132,6 +144,7 @@
     idep_nir,
   ],
   c_args : [c_vis_args, no_override_init_args, radv_flags],
+  cpp_args : [cpp_vis_args, radv_flags],
   link_args : [ld_args_bsymbolic, ld_args_gc_sections],
   install : true,
 )
diff --git a/src/amd/vulkan/radv_android.c b/src/amd/vulkan/radv_android.c
index c06c83b..f5d7082 100644
--- a/src/amd/vulkan/radv_android.c
+++ b/src/amd/vulkan/radv_android.c
@@ -122,7 +122,7 @@
 		return result;
 
 	if (gralloc_info->handle->numFds != 1) {
-		return vk_errorf(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR,
+		return vk_errorf(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR,
 		                 "VkNativeBufferANDROID::handle::numFds is %d, "
 		                 "expected 1", gralloc_info->handle->numFds);
 	}
@@ -233,7 +233,7 @@
 	result = radv_GetPhysicalDeviceImageFormatProperties2(phys_dev_h,
 	                                                      &image_format_info, &image_format_props);
 	if (result != VK_SUCCESS) {
-		return vk_errorf(result,
+		return vk_errorf(device->instance, result,
 		                 "radv_GetPhysicalDeviceImageFormatProperties2 failed "
 		                 "inside %s", __func__);
 	}
@@ -252,7 +252,7 @@
 	 * gralloc swapchains.
 	 */
 	if (imageUsage != 0) {
-	return vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
+	return vk_errorf(device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
 	                "unsupported VkImageUsageFlags(0x%x) for gralloc "
 	                "swapchain", imageUsage);
 	}
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index 12041f0..dae6440 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -226,7 +226,7 @@
 	cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8,
 			       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (cmd_buffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	cmd_buffer->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	cmd_buffer->device = device;
@@ -238,7 +238,7 @@
 		cmd_buffer->queue_family_index = pool->queue_family_index;
 
 	} else {
-		/* Init the pool_link so we can safefly call list_del when we destroy
+		/* Init the pool_link so we can safely call list_del when we destroy
 		 * the command buffer
 		 */
 		list_inithead(&cmd_buffer->pool_link);
@@ -250,7 +250,7 @@
 	cmd_buffer->cs = device->ws->cs_create(device->ws, ring);
 	if (!cmd_buffer->cs) {
 		vk_free(&cmd_buffer->pool->alloc, cmd_buffer);
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	*pCommandBuffer = radv_cmd_buffer_to_handle(cmd_buffer);
@@ -305,13 +305,11 @@
 
 	if (cmd_buffer->upload.upload_bo)
 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-				   cmd_buffer->upload.upload_bo, 8);
+				   cmd_buffer->upload.upload_bo);
 	cmd_buffer->upload.offset = 0;
 
 	cmd_buffer->record_result = VK_SUCCESS;
 
-	cmd_buffer->ring_offsets_idx = -1;
-
 	for (unsigned i = 0; i < VK_PIPELINE_BIND_POINT_RANGE_SIZE; i++) {
 		cmd_buffer->descriptors[i].dirty = 0;
 		cmd_buffer->descriptors[i].valid = 0;
@@ -357,14 +355,15 @@
 				       new_size, 4096,
 				       RADEON_DOMAIN_GTT,
 				       RADEON_FLAG_CPU_ACCESS|
-				       RADEON_FLAG_NO_INTERPROCESS_SHARING);
+				       RADEON_FLAG_NO_INTERPROCESS_SHARING |
+				       RADEON_FLAG_32BIT);
 
 	if (!bo) {
 		cmd_buffer->record_result = VK_ERROR_OUT_OF_DEVICE_MEMORY;
 		return false;
 	}
 
-	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo, 8);
+	radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
 	if (cmd_buffer->upload.upload_bo) {
 		upload = malloc(sizeof(*upload));
 
@@ -430,9 +429,13 @@
 }
 
 static void
-radv_emit_write_data_packet(struct radeon_winsys_cs *cs, uint64_t va,
+radv_emit_write_data_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t va,
 			    unsigned count, const uint32_t *data)
 {
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+
+	radeon_check_space(cmd_buffer->device->ws, cs, 4 + count);
+
 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + count, 0));
 	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
 		    S_370_WR_CONFIRM(1) |
@@ -445,18 +448,19 @@
 void radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
 {
 	struct radv_device *device = cmd_buffer->device;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va;
 
 	va = radv_buffer_get_va(device->trace_bo);
 	if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY)
 		va += 4;
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 7);
-
 	++cmd_buffer->state.trace_id;
-	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
-	radv_emit_write_data_packet(cs, va, 1, &cmd_buffer->state.trace_id);
+	radv_emit_write_data_packet(cmd_buffer, va, 1,
+				    &cmd_buffer->state.trace_id);
+
+	radeon_check_space(cmd_buffer->device->ws, cs, 2);
+
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 	radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
 }
@@ -478,6 +482,8 @@
 			ptr = &cmd_buffer->gfx9_fence_idx;
 		}
 
+		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, 4);
+
 		/* Force wait for graphics or compute engines to be idle. */
 		si_cs_emit_cache_flush(cmd_buffer->cs,
 				       cmd_buffer->device->physical_device->rad_info.chip_class,
@@ -495,7 +501,6 @@
 		   struct radv_pipeline *pipeline, enum ring_type ring)
 {
 	struct radv_device *device = cmd_buffer->device;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint32_t data[2];
 	uint64_t va;
 
@@ -512,14 +517,10 @@
 		assert(!"invalid ring type");
 	}
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws,
-							   cmd_buffer->cs, 6);
-
 	data[0] = (uintptr_t)pipeline;
 	data[1] = (uintptr_t)pipeline >> 32;
 
-	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
-	radv_emit_write_data_packet(cs, va, 2, data);
+	radv_emit_write_data_packet(cmd_buffer, va, 2, data);
 }
 
 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
@@ -531,10 +532,8 @@
 		radv_get_descriptors_state(cmd_buffer, bind_point);
 
 	descriptors_state->sets[idx] = set;
-	if (set)
-		descriptors_state->valid |= (1u << idx);
-	else
-		descriptors_state->valid &= ~(1u << idx);
+
+	descriptors_state->valid |= (1u << idx); /* active descriptors */
 	descriptors_state->dirty |= (1u << idx);
 }
 
@@ -545,23 +544,18 @@
 	struct radv_descriptor_state *descriptors_state =
 		radv_get_descriptors_state(cmd_buffer, bind_point);
 	struct radv_device *device = cmd_buffer->device;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
 	uint32_t data[MAX_SETS * 2] = {};
 	uint64_t va;
 	unsigned i;
 	va = radv_buffer_get_va(device->trace_bo) + 24;
 
-	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(device->ws,
-							   cmd_buffer->cs, 4 + MAX_SETS * 2);
-
 	for_each_bit(i, descriptors_state->valid) {
 		struct radv_descriptor_set *set = descriptors_state->sets[i];
 		data[i * 2] = (uintptr_t)set;
 		data[i * 2 + 1] = (uintptr_t)set >> 32;
 	}
 
-	radv_cs_add_buffer(device->ws, cs, device->trace_bo, 8);
-	radv_emit_write_data_packet(cs, va, MAX_SETS * 2, data);
+	radv_emit_write_data_packet(cmd_buffer, va, MAX_SETS * 2, data);
 }
 
 struct radv_userdata_info *
@@ -583,11 +577,47 @@
 	uint32_t base_reg = pipeline->user_data_0[stage];
 	if (loc->sgpr_idx == -1)
 		return;
-	assert(loc->num_sgprs == 2);
+
+	assert(loc->num_sgprs == (HAVE_32BIT_POINTERS ? 1 : 2));
 	assert(!loc->indirect);
-	radeon_set_sh_reg_seq(cmd_buffer->cs, base_reg + loc->sgpr_idx * 4, 2);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
+
+	radv_emit_shader_pointer(cmd_buffer->device, cmd_buffer->cs,
+				 base_reg + loc->sgpr_idx * 4, va, false);
+}
+
+static void
+radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_pipeline *pipeline,
+			      struct radv_descriptor_state *descriptors_state,
+			      gl_shader_stage stage)
+{
+	struct radv_device *device = cmd_buffer->device;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+	uint32_t sh_base = pipeline->user_data_0[stage];
+	struct radv_userdata_locations *locs =
+		&pipeline->shaders[stage]->info.user_sgprs_locs;
+	unsigned mask = locs->descriptor_sets_enabled;
+
+	mask &= descriptors_state->dirty & descriptors_state->valid;
+
+	while (mask) {
+		int start, count;
+
+		u_bit_scan_consecutive_range(&mask, &start, &count);
+
+		struct radv_userdata_info *loc = &locs->descriptor_sets[start];
+		unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
+
+		radv_emit_shader_pointer_head(cs, sh_offset, count,
+					      HAVE_32BIT_POINTERS);
+		for (int i = 0; i < count; i++) {
+			struct radv_descriptor_set *set =
+				descriptors_state->sets[start + i];
+
+			radv_emit_shader_pointer_body(device, cs, set->va,
+						      HAVE_32BIT_POINTERS);
+		}
+	}
 }
 
 static void
@@ -845,12 +875,12 @@
 			continue;
 
 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-				   pipeline->shaders[i]->bo, 8);
+				   pipeline->shaders[i]->bo);
 	}
 
 	if (radv_pipeline_has_gs(pipeline))
 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-				   pipeline->gs_copy_shader->bo, 8);
+				   pipeline->gs_copy_shader->bo);
 
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_save_pipeline(cmd_buffer, pipeline, RING_GFX);
@@ -872,14 +902,6 @@
 {
 	uint32_t count = cmd_buffer->state.dynamic.scissor.count;
 
-	/* Vega10/Raven scissor bug workaround. This must be done before VPORT
-	 * scissor registers are changed. There is also a more efficient but
-	 * more involved alternative workaround.
-	 */
-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
-		si_emit_cache_flush(cmd_buffer);
-	}
 	si_write_scissors(cmd_buffer->cs, 0, count,
 			  cmd_buffer->state.dynamic.scissor.scissors,
 			  cmd_buffer->state.dynamic.viewport.viewports,
@@ -1152,17 +1174,63 @@
 			       ds->pa_su_poly_offset_db_fmt_cntl);
 }
 
-void
-radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			  struct radv_image *image,
-			  VkClearDepthStencilValue ds_clear_value,
-			  VkImageAspectFlags aspects)
+/**
+ * Update the fast clear depth/stencil values if the image is bound as a
+ * depth/stencil buffer.
+ */
+static void
+radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer,
+				struct radv_image *image,
+				VkClearDepthStencilValue ds_clear_value,
+				VkImageAspectFlags aspects)
 {
+	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+	struct radv_attachment_info *att;
+	uint32_t att_idx;
+
+	if (!framebuffer || !subpass)
+		return;
+
+	att_idx = subpass->depth_stencil_attachment.attachment;
+	if (att_idx == VK_ATTACHMENT_UNUSED)
+		return;
+
+	att = &framebuffer->attachments[att_idx];
+	if (att->attachment->image != image)
+		return;
+
+	radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
+	radeon_emit(cs, ds_clear_value.stencil);
+	radeon_emit(cs, fui(ds_clear_value.depth));
+
+	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
+	 * only needed when clearing Z to 0.0.
+	 */
+	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
+	    ds_clear_value.depth == 0.0) {
+		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
+
+		radv_update_zrange_precision(cmd_buffer, &att->ds, image,
+					     layout, false);
+	}
+}
+
+/**
+ * Set the clear depth/stencil values to the image's metadata.
+ */
+static void
+radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			   struct radv_image *image,
+			   VkClearDepthStencilValue ds_clear_value,
+			   VkImageAspectFlags aspects)
+{
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
-	va += image->offset + image->clear_value_offset;
 	unsigned reg_offset = 0, reg_count = 0;
 
-	assert(radv_image_has_htile(image));
+	va += image->offset + image->clear_value_offset;
 
 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
 		++reg_count;
@@ -1173,62 +1241,49 @@
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
 		++reg_count;
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
-	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
-				    S_370_WR_CONFIRM(1) |
-				    S_370_ENGINE_SEL(V_370_PFP));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + reg_count, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+			S_370_WR_CONFIRM(1) |
+			S_370_ENGINE_SEL(V_370_PFP));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
 	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
-		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil);
+		radeon_emit(cs, ds_clear_value.stencil);
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
-		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth));
-
-	radeon_set_context_reg_seq(cmd_buffer->cs, R_028028_DB_STENCIL_CLEAR + 4 * reg_offset, reg_count);
-	if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
-		radeon_emit(cmd_buffer->cs, ds_clear_value.stencil); /* R_028028_DB_STENCIL_CLEAR */
-	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
-		radeon_emit(cmd_buffer->cs, fui(ds_clear_value.depth)); /* R_02802C_DB_DEPTH_CLEAR */
-
-	/* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
-	 * only needed when clearing Z to 0.0.
-	 */
-	if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) &&
-	    ds_clear_value.depth == 0.0) {
-		struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
-		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
-
-		if (!framebuffer || !subpass)
-			return;
-
-		if (subpass->depth_stencil_attachment.attachment == VK_ATTACHMENT_UNUSED)
-			return;
-
-		int idx = subpass->depth_stencil_attachment.attachment;
-		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
-		struct radv_attachment_info *att = &framebuffer->attachments[idx];
-		struct radv_image *image = att->attachment->image;
-
-		/* Only needed if the image is currently bound as the depth
-		 * surface.
-		 */
-		if (att->attachment->image != image)
-			return;
-
-		radv_update_zrange_precision(cmd_buffer, &att->ds, image,
-					     layout, false);
-	}
+		radeon_emit(cs, fui(ds_clear_value.depth));
 }
 
-static void
-radv_load_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			   struct radv_image *image)
+/**
+ * Update the clear depth/stencil values for this image.
+ */
+void
+radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_image *image,
+			      VkClearDepthStencilValue ds_clear_value,
+			      VkImageAspectFlags aspects)
 {
+	assert(radv_image_has_htile(image));
+
+	radv_set_ds_clear_metadata(cmd_buffer, image, ds_clear_value, aspects);
+
+	radv_update_bound_fast_clear_ds(cmd_buffer, image, ds_clear_value,
+				        aspects);
+}
+
+/**
+ * Load the clear depth/stencil values from the image's metadata.
+ */
+static void
+radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			    struct radv_image *image)
+{
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	VkImageAspectFlags aspects = vk_format_aspects(image->vk_format);
 	uint64_t va = radv_buffer_get_va(image->bo);
-	va += image->offset + image->clear_value_offset;
 	unsigned reg_offset = 0, reg_count = 0;
 
+	va += image->offset + image->clear_value_offset;
+
 	if (!radv_image_has_htile(image))
 		return;
 
@@ -1241,21 +1296,21 @@
 	if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
 		++reg_count;
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, 0));
-	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
-				    (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, (R_028028_DB_STENCIL_CLEAR + 4 * reg_offset) >> 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
+	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+			COPY_DATA_DST_SEL(COPY_DATA_REG) |
+			(reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, (R_028028_DB_STENCIL_CLEAR + 4 * reg_offset) >> 2);
+	radeon_emit(cs, 0);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
+	radeon_emit(cs, 0);
 }
 
 /*
- *with DCC some colors don't require CMASK elimiation before being
+ * With DCC some colors don't require CMASK elimination before being
  * used as a texture. This sets a predicate value to determine if the
  * cmask eliminate is required.
  */
@@ -1280,55 +1335,108 @@
 	radeon_emit(cmd_buffer->cs, pred_val >> 32);
 }
 
-void
-radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			  struct radv_image *image,
-			  int idx,
-			  uint32_t color_values[2])
+/**
+ * Update the fast clear color values if the image is bound as a color buffer.
+ */
+static void
+radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer,
+				   struct radv_image *image,
+				   int cb_idx,
+				   uint32_t color_values[2])
 {
+	struct radv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
+	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
+	struct radv_attachment_info *att;
+	uint32_t att_idx;
+
+	if (!framebuffer || !subpass)
+		return;
+
+	att_idx = subpass->color_attachments[cb_idx].attachment;
+	if (att_idx == VK_ATTACHMENT_UNUSED)
+		return;
+
+	att = &framebuffer->attachments[att_idx];
+	if (att->attachment->image != image)
+		return;
+
+	radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
+	radeon_emit(cs, color_values[0]);
+	radeon_emit(cs, color_values[1]);
+}
+
+/**
+ * Set the clear color values to the image's metadata.
+ */
+static void
+radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			      struct radv_image *image,
+			      uint32_t color_values[2])
+{
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
+
 	va += image->offset + image->clear_value_offset;
 
 	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_WRITE_DATA, 4, 0));
-	radeon_emit(cmd_buffer->cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
-				    S_370_WR_CONFIRM(1) |
-				    S_370_ENGINE_SEL(V_370_PFP));
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, color_values[0]);
-	radeon_emit(cmd_buffer->cs, color_values[1]);
-
-	radeon_set_context_reg_seq(cmd_buffer->cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c, 2);
-	radeon_emit(cmd_buffer->cs, color_values[0]);
-	radeon_emit(cmd_buffer->cs, color_values[1]);
+	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 4, 0));
+	radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+			S_370_WR_CONFIRM(1) |
+			S_370_ENGINE_SEL(V_370_PFP));
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, color_values[0]);
+	radeon_emit(cs, color_values[1]);
 }
 
-static void
-radv_load_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			   struct radv_image *image,
-			   int idx)
+/**
+ * Update the clear color values for this image.
+ */
+void
+radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+				 struct radv_image *image,
+				 int cb_idx,
+				 uint32_t color_values[2])
 {
+	assert(radv_image_has_cmask(image) || radv_image_has_dcc(image));
+
+	radv_set_color_clear_metadata(cmd_buffer, image, color_values);
+
+	radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx,
+					   color_values);
+}
+
+/**
+ * Load the clear color values from the image's metadata.
+ */
+static void
+radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+			       struct radv_image *image,
+			       int cb_idx)
+{
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(image->bo);
+
 	va += image->offset + image->clear_value_offset;
 
 	if (!radv_image_has_cmask(image) && !radv_image_has_dcc(image))
 		return;
 
-	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + idx * 0x3c;
+	uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
-	radeon_emit(cmd_buffer->cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
-				    COPY_DATA_DST_SEL(COPY_DATA_REG) |
-				    COPY_DATA_COUNT_SEL);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-	radeon_emit(cmd_buffer->cs, reg >> 2);
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
+	radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) |
+			COPY_DATA_DST_SEL(COPY_DATA_REG) |
+			COPY_DATA_COUNT_SEL);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+	radeon_emit(cs, reg >> 2);
+	radeon_emit(cs, 0);
 
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
-	radeon_emit(cmd_buffer->cs, 0);
+	radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
+	radeon_emit(cs, 0);
 }
 
 static void
@@ -1354,12 +1462,12 @@
 		struct radv_image *image = att->attachment->image;
 		VkImageLayout layout = subpass->color_attachments[i].layout;
 
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo, 8);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
 
 		assert(att->attachment->aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT);
 		radv_emit_fb_color_state(cmd_buffer, i, att, image, layout);
 
-		radv_load_color_clear_regs(cmd_buffer, image, i);
+		radv_load_color_clear_metadata(cmd_buffer, image, i);
 	}
 
 	if(subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
@@ -1367,7 +1475,7 @@
 		VkImageLayout layout = subpass->depth_stencil_attachment.layout;
 		struct radv_attachment_info *att = &framebuffer->attachments[idx];
 		struct radv_image *image = att->attachment->image;
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo, 8);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, att->attachment->bo);
 		MAYBE_UNUSED uint32_t queue_mask = radv_image_queue_family_mask(image,
 										cmd_buffer->queue_family_index,
 										cmd_buffer->queue_family_index);
@@ -1381,7 +1489,7 @@
 			cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS;
 			cmd_buffer->state.offset_scale = att->ds.offset_scale;
 		}
-		radv_load_depth_clear_regs(cmd_buffer, image);
+		radv_load_ds_clear_metadata(cmd_buffer, image);
 	} else {
 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9)
 			radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
@@ -1406,7 +1514,7 @@
 static void
 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	struct radv_cmd_state *state = &cmd_buffer->state;
 
 	if (state->index_type != state->last_index_type) {
@@ -1433,6 +1541,7 @@
 
 void radv_set_db_count_control(struct radv_cmd_buffer *cmd_buffer)
 {
+	bool has_perfect_queries = cmd_buffer->state.perfect_occlusion_queries_enabled;
 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
 	uint32_t pa_sc_mode_cntl_1 =
 		pipeline ? pipeline->graphics.ms.pa_sc_mode_cntl_1 : 0;
@@ -1441,39 +1550,38 @@
 	if(!cmd_buffer->state.active_occlusion_queries) {
 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
-			    pipeline->graphics.disable_out_of_order_rast_for_occlusion) {
+			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
+			    has_perfect_queries) {
 				/* Re-enable out-of-order rasterization if the
 				 * bound pipeline supports it and if it's has
-				 * been disabled before starting occlusion
-				 * queries.
+				 * been disabled before starting any perfect
+				 * occlusion queries.
 				 */
 				radeon_set_context_reg(cmd_buffer->cs,
 						       R_028A4C_PA_SC_MODE_CNTL_1,
 						       pa_sc_mode_cntl_1);
 			}
-			db_count_control = 0;
-		} else {
-			db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
 		}
+		db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
 	} else {
 		const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 		uint32_t sample_rate = subpass ? util_logbase2(subpass->max_sample_count) : 0;
-		bool perfect = cmd_buffer->state.perfect_occlusion_queries_enabled;
 
 		if (cmd_buffer->device->physical_device->rad_info.chip_class >= CIK) {
 			db_count_control =
-				S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+				S_028004_PERFECT_ZPASS_COUNTS(has_perfect_queries) |
 				S_028004_SAMPLE_RATE(sample_rate) |
 				S_028004_ZPASS_ENABLE(1) |
 				S_028004_SLICE_EVEN_ENABLE(1) |
 				S_028004_SLICE_ODD_ENABLE(1);
 
 			if (G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(pa_sc_mode_cntl_1) &&
-			    pipeline->graphics.disable_out_of_order_rast_for_occlusion) {
+			    pipeline->graphics.disable_out_of_order_rast_for_occlusion &&
+			    has_perfect_queries) {
 				/* If the bound pipeline has enabled
 				 * out-of-order rasterization, we should
-				 * disable it before starting occlusion
-				 * queries.
+				 * disable it before starting any perfect
+				 * occlusion queries.
 				 */
 				pa_sc_mode_cntl_1 &= C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE;
 
@@ -1498,7 +1606,8 @@
 	if (states & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
 		radv_emit_viewport(cmd_buffer);
 
-	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT))
+	if (states & (RADV_CMD_DIRTY_DYNAMIC_SCISSOR | RADV_CMD_DIRTY_DYNAMIC_VIEWPORT) &&
+	    !cmd_buffer->device->physical_device->has_scissor_bug)
 		radv_emit_scissor(cmd_buffer);
 
 	if (states & RADV_CMD_DIRTY_DYNAMIC_LINE_WIDTH)
@@ -1525,48 +1634,6 @@
 }
 
 static void
-emit_stage_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
-				   struct radv_pipeline *pipeline,
-				   int idx,
-				   uint64_t va,
-				   gl_shader_stage stage)
-{
-	struct radv_userdata_info *desc_set_loc = &pipeline->shaders[stage]->info.user_sgprs_locs.descriptor_sets[idx];
-	uint32_t base_reg = pipeline->user_data_0[stage];
-
-	if (desc_set_loc->sgpr_idx == -1 || desc_set_loc->indirect)
-		return;
-
-	assert(!desc_set_loc->indirect);
-	assert(desc_set_loc->num_sgprs == 2);
-	radeon_set_sh_reg_seq(cmd_buffer->cs,
-			      base_reg + desc_set_loc->sgpr_idx * 4, 2);
-	radeon_emit(cmd_buffer->cs, va);
-	radeon_emit(cmd_buffer->cs, va >> 32);
-}
-
-static void
-radv_emit_descriptor_set_userdata(struct radv_cmd_buffer *cmd_buffer,
-				  VkShaderStageFlags stages,
-				  struct radv_descriptor_set *set,
-				  unsigned idx)
-{
-	if (cmd_buffer->state.pipeline) {
-		radv_foreach_stage(stage, stages) {
-			if (cmd_buffer->state.pipeline->shaders[stage])
-				emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.pipeline,
-								   idx, set->va,
-								   stage);
-		}
-	}
-
-	if (cmd_buffer->state.compute_pipeline && (stages & VK_SHADER_STAGE_COMPUTE_BIT))
-		emit_stage_descriptor_set_userdata(cmd_buffer, cmd_buffer->state.compute_pipeline,
-						   idx, set->va,
-						   MESA_SHADER_COMPUTE);
-}
-
-static void
 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer,
 			    VkPipelineBindPoint bind_point)
 {
@@ -1590,7 +1657,8 @@
 {
 	struct radv_descriptor_state *descriptors_state =
 		radv_get_descriptors_state(cmd_buffer, bind_point);
-	uint32_t size = MAX_SETS * 2 * 4;
+	uint8_t ptr_size = HAVE_32BIT_POINTERS ? 1 : 2;
+	uint32_t size = MAX_SETS * 4 * ptr_size;
 	uint32_t offset;
 	void *ptr;
 	
@@ -1599,13 +1667,14 @@
 		return;
 
 	for (unsigned i = 0; i < MAX_SETS; i++) {
-		uint32_t *uptr = ((uint32_t *)ptr) + i * 2;
+		uint32_t *uptr = ((uint32_t *)ptr) + i * ptr_size;
 		uint64_t set_va = 0;
 		struct radv_descriptor_set *set = descriptors_state->sets[i];
 		if (descriptors_state->valid & (1u << i))
 			set_va = set->va;
 		uptr[0] = set_va & 0xffffffff;
-		uptr[1] = set_va >> 32;
+		if (ptr_size == 2)
+			uptr[1] = set_va >> 32;
 	}
 
 	uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
@@ -1647,7 +1716,8 @@
 					 VK_PIPELINE_BIND_POINT_GRAPHICS;
 	struct radv_descriptor_state *descriptors_state =
 		radv_get_descriptors_state(cmd_buffer, bind_point);
-	unsigned i;
+	struct radv_cmd_state *state = &cmd_buffer->state;
+	bool flush_indirect_descriptors;
 
 	if (!descriptors_state->dirty)
 		return;
@@ -1655,29 +1725,45 @@
 	if (descriptors_state->push_dirty)
 		radv_flush_push_descriptors(cmd_buffer, bind_point);
 
-	if ((cmd_buffer->state.pipeline && cmd_buffer->state.pipeline->need_indirect_descriptor_sets) ||
-	    (cmd_buffer->state.compute_pipeline && cmd_buffer->state.compute_pipeline->need_indirect_descriptor_sets)) {
+	flush_indirect_descriptors =
+		(bind_point == VK_PIPELINE_BIND_POINT_GRAPHICS &&
+		 state->pipeline && state->pipeline->need_indirect_descriptor_sets) ||
+		(bind_point == VK_PIPELINE_BIND_POINT_COMPUTE &&
+		 state->compute_pipeline && state->compute_pipeline->need_indirect_descriptor_sets);
+
+	if (flush_indirect_descriptors)
 		radv_flush_indirect_descriptor_sets(cmd_buffer, bind_point);
-	}
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws,
 	                                                   cmd_buffer->cs,
 	                                                   MAX_SETS * MESA_SHADER_STAGES * 4);
 
-	for_each_bit(i, descriptors_state->dirty) {
-		struct radv_descriptor_set *set = descriptors_state->sets[i];
-		if (!(descriptors_state->valid & (1u << i)))
-			continue;
+	if (cmd_buffer->state.pipeline) {
+		radv_foreach_stage(stage, stages) {
+			if (!cmd_buffer->state.pipeline->shaders[stage])
+				continue;
 
-		radv_emit_descriptor_set_userdata(cmd_buffer, stages, set, i);
+			radv_emit_descriptor_pointers(cmd_buffer,
+						      cmd_buffer->state.pipeline,
+						      descriptors_state, stage);
+		}
 	}
+
+	if (cmd_buffer->state.compute_pipeline &&
+	    (stages & VK_SHADER_STAGE_COMPUTE_BIT)) {
+		radv_emit_descriptor_pointers(cmd_buffer,
+					      cmd_buffer->state.compute_pipeline,
+					      descriptors_state,
+					      MESA_SHADER_COMPUTE);
+	}
+
 	descriptors_state->dirty = 0;
 	descriptors_state->push_dirty = false;
 
+	assert(cmd_buffer->cs->cdw <= cdw_max);
+
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_save_descriptors(cmd_buffer, bind_point);
-
-	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
 
 static void
@@ -1804,7 +1890,7 @@
 {
 	struct radeon_info *info = &cmd_buffer->device->physical_device->rad_info;
 	struct radv_cmd_state *state = &cmd_buffer->state;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint32_t ia_multi_vgt_param;
 	int32_t primitive_reset_en;
 
@@ -1869,10 +1955,7 @@
 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
 	}
 
-	if (src_stage_mask & (VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
-			      VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
-			      VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT |
-			      VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
+	if (src_stage_mask & (VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
 			      VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
 			      VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
 			      VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
@@ -1883,36 +1966,54 @@
 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
 	} else if (src_stage_mask & (VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
 	                             VK_PIPELINE_STAGE_VERTEX_INPUT_BIT |
-	                             VK_PIPELINE_STAGE_VERTEX_SHADER_BIT)) {
+	                             VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+				     VK_PIPELINE_STAGE_TESSELLATION_CONTROL_SHADER_BIT |
+				     VK_PIPELINE_STAGE_TESSELLATION_EVALUATION_SHADER_BIT |
+				     VK_PIPELINE_STAGE_GEOMETRY_SHADER_BIT)) {
 		cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
 	}
 }
 
 static enum radv_cmd_flush_bits
 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer,
-				  VkAccessFlags src_flags)
+		      VkAccessFlags src_flags,
+		      struct radv_image *image)
 {
+	bool flush_CB_meta = true, flush_DB_meta = true;
 	enum radv_cmd_flush_bits flush_bits = 0;
 	uint32_t b;
+
+	if (image) {
+		if (!radv_image_has_CB_metadata(image))
+			flush_CB_meta = false;
+		if (!radv_image_has_htile(image))
+			flush_DB_meta = false;
+	}
+
 	for_each_bit(b, src_flags) {
 		switch ((VkAccessFlagBits)(1 << b)) {
 		case VK_ACCESS_SHADER_WRITE_BIT:
 			flush_bits |= RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2;
 			break;
 		case VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT:
-			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-			              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
+			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
+			if (flush_CB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
 			break;
 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT:
-			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
-			              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
+			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
+			if (flush_DB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 			break;
 		case VK_ACCESS_TRANSFER_WRITE_BIT:
 			flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-			              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
 			              RADV_CMD_FLAG_FLUSH_AND_INV_DB |
-			              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
 			              RADV_CMD_FLAG_INV_GLOBAL_L2;
+
+			if (flush_CB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
+			if (flush_DB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 			break;
 		default:
 			break;
@@ -1926,8 +2027,23 @@
                       VkAccessFlags dst_flags,
                       struct radv_image *image)
 {
+	bool flush_CB_meta = true, flush_DB_meta = true;
 	enum radv_cmd_flush_bits flush_bits = 0;
+	bool flush_CB = true, flush_DB = true;
 	uint32_t b;
+
+	if (image) {
+		if (!(image->usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
+			flush_CB = false;
+			flush_DB = false;
+		}
+
+		if (!radv_image_has_CB_metadata(image))
+			flush_CB_meta = false;
+		if (!radv_image_has_htile(image))
+			flush_DB_meta = false;
+	}
+
 	for_each_bit(b, dst_flags) {
 		switch ((VkAccessFlagBits)(1 << b)) {
 		case VK_ACCESS_INDIRECT_COMMAND_READ_BIT:
@@ -1944,16 +2060,16 @@
 			              RADV_CMD_FLAG_INV_GLOBAL_L2;
 			break;
 		case VK_ACCESS_COLOR_ATTACHMENT_READ_BIT:
-			/* TODO: change to image && when the image gets passed
-			 * through from the subpass. */
-			if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT))
-				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-				              RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
+			if (flush_CB)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
+			if (flush_CB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
 			break;
 		case VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT:
-			if (!image || (image->usage & VK_IMAGE_USAGE_STORAGE_BIT))
-				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
-				              RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
+			if (flush_DB)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
+			if (flush_DB_meta)
+				flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 			break;
 		default:
 			break;
@@ -1962,16 +2078,18 @@
 	return flush_bits;
 }
 
-static void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_subpass_barrier *barrier)
+void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
+			  const struct radv_subpass_barrier *barrier)
 {
-	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask);
+	cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_access_mask,
+							      NULL);
 	radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
 	cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_access_mask,
 	                                                      NULL);
 }
 
 static void radv_handle_subpass_image_transition(struct radv_cmd_buffer *cmd_buffer,
-						 VkAttachmentReference att)
+						 struct radv_subpass_attachment att)
 {
 	unsigned idx = att.attachment;
 	struct radv_image_view *view = cmd_buffer->state.framebuffer->attachments[idx].attachment;
@@ -2166,7 +2284,7 @@
 	struct radv_device *device = cmd_buffer->device;
 	if (device->gfx_init) {
 		uint64_t va = radv_buffer_get_va(device->gfx_init);
-		radv_cs_add_buffer(device->ws, cmd_buffer->cs, device->gfx_init, 8);
+		radv_cs_add_buffer(device->ws, cmd_buffer->cs, device->gfx_init);
 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
 		radeon_emit(cmd_buffer->cs, va);
 		radeon_emit(cmd_buffer->cs, va >> 32);
@@ -2197,6 +2315,7 @@
 	cmd_buffer->state.last_num_instances = -1;
 	cmd_buffer->state.last_vertex_offset = -1;
 	cmd_buffer->state.last_first_instance = -1;
+	cmd_buffer->state.predication_type = -1;
 	cmd_buffer->usage_flags = pBeginInfo->flags;
 
 	/* setup initial configuration into command buffer */
@@ -2230,8 +2349,14 @@
 		radv_cmd_buffer_set_subpass(cmd_buffer, subpass, false);
 	}
 
-	if (unlikely(cmd_buffer->device->trace_bo))
+	if (unlikely(cmd_buffer->device->trace_bo)) {
+		struct radv_device *device = cmd_buffer->device;
+
+		radv_cs_add_buffer(device->ws, cmd_buffer->cs,
+				   device->trace_bo);
+
 		radv_cmd_buffer_trace_emit(cmd_buffer);
+	}
 
 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_RECORDING;
 
@@ -2266,7 +2391,7 @@
 		vb[idx].offset = pOffsets[i];
 
 		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-				   vb[idx].buffer->bo, 8);
+				   vb[idx].buffer->bo);
 	}
 
 	if (!changed) {
@@ -2302,7 +2427,7 @@
 	int index_size_shift = cmd_buffer->state.index_type ? 2 : 1;
 	cmd_buffer->state.max_index_count = (index_buffer->size - offset) >> index_size_shift;
 	cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
-	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo, 8);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, index_buffer->bo);
 }
 
 
@@ -2314,19 +2439,18 @@
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
 
 	radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
-	if (!set)
-		return;
 
+	assert(set);
 	assert(!(set->layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
 
 	if (!cmd_buffer->device->use_global_bo_list) {
 		for (unsigned j = 0; j < set->layout->buffer_count; ++j)
 			if (set->descriptors[j])
-				radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j], 7);
+				radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
 	}
 
 	if(set->bo)
-		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo, 8);
+		radv_cs_add_buffer(ws, cmd_buffer->cs, set->bo);
 }
 
 void radv_CmdBindDescriptorSets(
@@ -2525,7 +2649,7 @@
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
 
 	if (!cmd_buffer->device->ws->cs_finalize(cmd_buffer->cs))
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(cmd_buffer->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 
 	cmd_buffer->status = RADV_CMD_BUFFER_STATUS_EXECUTABLE;
 
@@ -2550,7 +2674,7 @@
 	                               pipeline->max_waves * pipeline->scratch_bytes_per_wave);
 
 	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs,
-			   pipeline->shaders[MESA_SHADER_COMPUTE]->bo, 8);
+			   pipeline->shaders[MESA_SHADER_COMPUTE]->bo);
 
 	if (unlikely(cmd_buffer->device->trace_bo))
 		radv_save_pipeline(cmd_buffer, pipeline, RING_COMPUTE);
@@ -2610,15 +2734,6 @@
 
 		if (radv_pipeline_has_tess(pipeline))
 			cmd_buffer->tess_rings_needed = true;
-
-		if (radv_pipeline_has_gs(pipeline)) {
-			struct radv_userdata_info *loc = radv_lookup_user_sgpr(cmd_buffer->state.pipeline, MESA_SHADER_GEOMETRY,
-									     AC_UD_SCRATCH_RING_OFFSETS);
-			if (cmd_buffer->ring_offsets_idx == -1)
-				cmd_buffer->ring_offsets_idx = loc->sgpr_idx;
-			else if (loc->sgpr_idx != -1)
-				assert(loc->sgpr_idx == cmd_buffer->ring_offsets_idx);
-		}
 		break;
 	default:
 		assert(!"invalid bind point");
@@ -2639,18 +2754,6 @@
 	assert(firstViewport < MAX_VIEWPORTS);
 	assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
 
-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		/* Try to skip unnecessary PS partial flushes when the viewports
-		 * don't change.
-		 */
-		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
-				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
-		    !memcmp(state->dynamic.viewport.viewports + firstViewport,
-			    pViewports, viewportCount * sizeof(*pViewports))) {
-			return;
-		}
-	}
-
 	memcpy(state->dynamic.viewport.viewports + firstViewport, pViewports,
 	       viewportCount * sizeof(*pViewports));
 
@@ -2670,18 +2773,6 @@
 	assert(firstScissor < MAX_SCISSORS);
 	assert(total_count >= 1 && total_count <= MAX_SCISSORS);
 
-	if (cmd_buffer->device->physical_device->has_scissor_bug) {
-		/* Try to skip unnecessary PS partial flushes when the scissors
-		 * don't change.
-		 */
-		if (!(state->dirty & (RADV_CMD_DIRTY_DYNAMIC_VIEWPORT |
-				      RADV_CMD_DIRTY_DYNAMIC_SCISSOR)) &&
-		    !memcmp(state->dynamic.scissor.scissors + firstScissor,
-			    pScissors, scissorCount * sizeof(*pScissors))) {
-			return;
-		}
-	}
-
 	memcpy(state->dynamic.scissor.scissors + firstScissor, pScissors,
 	       scissorCount * sizeof(*pScissors));
 
@@ -2830,12 +2921,6 @@
 		if (secondary->sample_positions_needed)
 			primary->sample_positions_needed = true;
 
-		if (secondary->ring_offsets_idx != -1) {
-			if (primary->ring_offsets_idx == -1)
-				primary->ring_offsets_idx = secondary->ring_offsets_idx;
-			else
-				assert(secondary->ring_offsets_idx == primary->ring_offsets_idx);
-		}
 		primary->device->ws->cs_execute_secondary(primary->cs, secondary->cs);
 
 
@@ -2903,7 +2988,7 @@
 	pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pool == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	if (pAllocator)
 		pool->alloc = *pAllocator;
@@ -3006,6 +3091,15 @@
 	radv_cmd_buffer_clear_subpass(cmd_buffer);
 }
 
+void radv_CmdBeginRenderPass2KHR(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
+{
+	radv_CmdBeginRenderPass(commandBuffer, pRenderPassBeginInfo,
+				pSubpassBeginInfo->contents); /* thin wrapper: only ->contents is forwarded */
+}
+
 void radv_CmdNextSubpass(
     VkCommandBuffer                             commandBuffer,
     VkSubpassContents                           contents)
@@ -3021,6 +3115,14 @@
 	radv_cmd_buffer_clear_subpass(cmd_buffer);
 }
 
+void radv_CmdNextSubpass2KHR(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo) /* pSubpassEndInfo is not read here */
+{
+	radv_CmdNextSubpass(commandBuffer, pSubpassBeginInfo->contents); /* thin wrapper: only ->contents is forwarded */
+}
+
 static void radv_emit_view_index(struct radv_cmd_buffer *cmd_buffer, unsigned index)
 {
 	struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
@@ -3059,7 +3161,7 @@
                                  uint64_t index_va,
                                  uint32_t index_count)
 {
-	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, false));
+	radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
 	radeon_emit(cmd_buffer->cs, cmd_buffer->state.max_index_count);
 	radeon_emit(cmd_buffer->cs, index_va);
 	radeon_emit(cmd_buffer->cs, index_va >> 32);
@@ -3074,11 +3176,12 @@
                                   uint64_t count_va,
                                   uint32_t stride)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA
 	                              : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
 	bool draw_id_enable = radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.needs_draw_id;
 	uint32_t base_reg = cmd_buffer->state.pipeline->graphics.vtx_base_sgpr;
+	bool predicating = cmd_buffer->state.predicating;
 	assert(base_reg);
 
 	/* just reset draw state for vertex data */
@@ -3088,7 +3191,7 @@
 
 	if (draw_count == 1 && !count_va && !draw_id_enable) {
 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT :
-				     PKT3_DRAW_INDIRECT, 3, false));
+				     PKT3_DRAW_INDIRECT, 3, predicating));
 		radeon_emit(cs, 0);
 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
@@ -3096,7 +3199,7 @@
 	} else {
 		radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI :
 				     PKT3_DRAW_INDIRECT_MULTI,
-				     8, false));
+				     8, predicating));
 		radeon_emit(cs, 0);
 		radeon_emit(cs, (base_reg - SI_SH_REG_OFFSET) >> 2);
 		radeon_emit(cs, ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2);
@@ -3162,7 +3265,7 @@
 {
 	struct radv_cmd_state *state = &cmd_buffer->state;
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 
 	if (info->indirect) {
 		uint64_t va = radv_buffer_get_va(info->indirect->bo);
@@ -3170,7 +3273,7 @@
 
 		va += info->indirect->offset + info->indirect_offset;
 
-		radv_cs_add_buffer(ws, cs, info->indirect->bo, 8);
+		radv_cs_add_buffer(ws, cs, info->indirect->bo);
 
 		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
 		radeon_emit(cs, 1);
@@ -3182,7 +3285,7 @@
 			count_va += info->count_buffer->offset +
 				    info->count_buffer_offset;
 
-			radv_cs_add_buffer(ws, cs, info->count_buffer->bo, 8);
+			radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
 		}
 
 		if (!state->subpass->view_mask) {
@@ -3262,10 +3365,55 @@
 	}
 }
 
+/*
+ * Vega and Raven have a bug which triggers if there are multiple
+ * context-register contexts active at the same time with different scissor values.
+ *
+ * There are two possible workarounds:
+ * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
+ *    there is only ever one active set of scissor values at a time.
+ *
+ * 2) Whenever the hardware switches contexts, set the scissor registers
+ *    again even if that is a no-op. That way the new context gets the
+ *    correct scissor values.
+ *
+ * This implements option 2: radv_need_late_scissor_emission() needs to
+ * return true on affected HW if radv_emit_all_graphics_states() sets
+ * any context registers.
+ */
+static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
+                                            bool indexed_draw)
+{
+	struct radv_cmd_state *state = &cmd_buffer->state;
+
+	if (!cmd_buffer->device->physical_device->has_scissor_bug)
+		return false; /* device without the scissor bug: nothing to work around */
+
+	uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
+
+	/* Index & Vertex buffer don't change context regs, and pipeline is handled later. */
+	used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_PIPELINE);
+
+	/* Assume every remaining dirty state bit can imply a context roll. */
+	if (cmd_buffer->state.dirty & used_states)
+		return true;
+
+	if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
+		return true; /* bound pipeline differs from the last emitted one */
+
+	if (indexed_draw && state->pipeline->graphics.prim_restart_enable &&
+	    (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
+		return true; /* reset index for this index type differs from the last recorded one */
+
+	return false;
+}
+
 static void
 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
 			      const struct radv_draw_info *info)
 {
+	bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info->indexed);
+
 	if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
 	    cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
 		radv_emit_rbplus_state(cmd_buffer);
@@ -3295,6 +3443,9 @@
 	radv_emit_draw_registers(cmd_buffer, info->indexed,
 				 info->instance_count > 1, info->indirect,
 				 info->indirect ? 0 : info->count);
+
+	if (late_scissor_emission)
+		radv_emit_scissor(cmd_buffer);
 }
 
 static void
@@ -3305,7 +3456,6 @@
 		cmd_buffer->device->physical_device->rad_info.chip_class >= CIK;
 	bool pipeline_is_dirty =
 		(cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) &&
-		cmd_buffer->state.pipeline &&
 		cmd_buffer->state.pipeline != cmd_buffer->state.emitted_pipeline;
 
 	MAYBE_UNUSED unsigned cdw_max =
@@ -3502,6 +3652,55 @@
 	radv_draw(cmd_buffer, &info);
 }
 
+void radv_CmdDrawIndirectCountKHR(
+	VkCommandBuffer                             commandBuffer,
+	VkBuffer                                    _buffer,
+	VkDeviceSize                                offset,
+	VkBuffer                                    _countBuffer,
+	VkDeviceSize                                countBufferOffset,
+	uint32_t                                    maxDrawCount,
+	uint32_t                                    stride)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
+	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
+	struct radv_draw_info info = {}; /* zero-initialized, so info.indexed stays false */
+
+	info.count = maxDrawCount;
+	info.indirect = buffer; /* buffer holding the indirect draw parameters */
+	info.indirect_offset = offset;
+	info.count_buffer = count_buffer; /* buffer holding the draw count */
+	info.count_buffer_offset = countBufferOffset;
+	info.stride = stride;
+
+	radv_draw(cmd_buffer, &info); /* shared draw path does the actual emission */
+}
+
+void radv_CmdDrawIndexedIndirectCountKHR(
+	VkCommandBuffer                             commandBuffer,
+	VkBuffer                                    _buffer,
+	VkDeviceSize                                offset,
+	VkBuffer                                    _countBuffer,
+	VkDeviceSize                                countBufferOffset,
+	uint32_t                                    maxDrawCount,
+	uint32_t                                    stride)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, buffer, _buffer);
+	RADV_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
+	struct radv_draw_info info = {}; /* zero-initialized; fields not set below stay 0 */
+
+	info.indexed = true; /* the only difference from radv_CmdDrawIndirectCountKHR */
+	info.count = maxDrawCount;
+	info.indirect = buffer; /* buffer holding the indirect draw parameters */
+	info.indirect_offset = offset;
+	info.count_buffer = count_buffer; /* buffer holding the draw count */
+	info.count_buffer_offset = countBufferOffset;
+	info.stride = stride;
+
+	radv_draw(cmd_buffer, &info); /* shared draw path does the actual emission */
+}
+
 struct radv_dispatch_info {
 	/**
 	 * Determine the layout of the grid (in block units) to be used.
@@ -3533,7 +3732,8 @@
 	struct radv_shader_variant *compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
 	unsigned dispatch_initiator = cmd_buffer->device->dispatch_initiator;
 	struct radeon_winsys *ws = cmd_buffer->device->ws;
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	bool predicating = cmd_buffer->state.predicating;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	struct radv_userdata_info *loc;
 
 	loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_COMPUTE,
@@ -3546,7 +3746,7 @@
 
 		va += info->indirect->offset + info->indirect_offset;
 
-		radv_cs_add_buffer(ws, cs, info->indirect->bo, 8);
+		radv_cs_add_buffer(ws, cs, info->indirect->bo);
 
 		if (loc->sgpr_idx != -1) {
 			for (unsigned i = 0; i < 3; ++i) {
@@ -3562,7 +3762,7 @@
 		}
 
 		if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
-			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) |
+			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, predicating) |
 					PKT3_SHADER_TYPE_S(1));
 			radeon_emit(cs, va);
 			radeon_emit(cs, va >> 32);
@@ -3574,7 +3774,7 @@
 			radeon_emit(cs, va);
 			radeon_emit(cs, va >> 32);
 
-			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, 0) |
+			radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) |
 					PKT3_SHADER_TYPE_S(1));
 			radeon_emit(cs, 0);
 			radeon_emit(cs, dispatch_initiator);
@@ -3644,7 +3844,7 @@
 			dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
 		}
 
-		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) |
+		radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) |
 				PKT3_SHADER_TYPE_S(1));
 		radeon_emit(cs, blocks[0]);
 		radeon_emit(cs, blocks[1]);
@@ -3794,7 +3994,7 @@
 	for (unsigned i = 0; i < cmd_buffer->state.framebuffer->attachment_count; ++i) {
 		VkImageLayout layout = cmd_buffer->state.pass->attachments[i].final_layout;
 		radv_handle_subpass_image_transition(cmd_buffer,
-		                      (VkAttachmentReference){i, layout});
+		                      (struct radv_subpass_attachment){i, layout});
 	}
 
 	vk_free(&cmd_buffer->pool->alloc, cmd_buffer->state.attachments);
@@ -3805,6 +4005,13 @@
 	cmd_buffer->state.framebuffer = NULL;
 }
 
+void radv_CmdEndRenderPass2KHR(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo) /* pSubpassEndInfo is not read here */
+{
+	radv_CmdEndRenderPass(commandBuffer); /* thin wrapper around the non-2KHR entry point */
+}
+
 /*
  * For HTILE we have the following interesting clear words:
  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
@@ -3821,9 +4028,11 @@
 	assert(range->levelCount == 1 || range->levelCount == VK_REMAINING_ARRAY_LAYERS);
 	unsigned layer_count = radv_get_layerCount(image, range);
 	uint64_t size = image->surface.htile_slice_size * layer_count;
+	VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
 	uint64_t offset = image->offset + image->htile_offset +
 	                  image->surface.htile_slice_size * range->baseArrayLayer;
 	struct radv_cmd_state *state = &cmd_buffer->state;
+	VkClearDepthStencilValue value = {};
 
 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB |
 			     RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
@@ -3833,19 +4042,10 @@
 
 	state->flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
 
-	/* Initialize the depth clear registers and update the ZRANGE_PRECISION
-	 * value for the TC-compat bug (because ZRANGE_PRECISION is 1 by
-	 * default). This is only needed whean clearing Z to 0.0f.
-	 */
-	if (radv_image_is_tc_compat_htile(image) && clear_word == 0) {
-		VkImageAspectFlags aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
-		VkClearDepthStencilValue value = {};
+	if (vk_format_is_stencil(image->vk_format))
+		aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
 
-		if (vk_format_is_stencil(image->vk_format))
-			aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
-
-		radv_set_depth_clear_regs(cmd_buffer, image, value, aspects);
-	}
+	radv_set_ds_clear_metadata(cmd_buffer, image, value, aspects);
 }
 
 static void radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer,
@@ -3935,13 +4135,23 @@
 
 	if (radv_image_has_dcc(image)) {
 		uint32_t value = 0xffffffffu; /* Fully expanded mode. */
+		bool need_decompress_pass = false;
 
 		if (radv_layout_dcc_compressed(image, dst_layout,
 					       dst_queue_mask)) {
 			value = 0x20202020u;
+			need_decompress_pass = true;
 		}
 
 		radv_initialize_dcc(cmd_buffer, image, value);
+
+		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, image,
+						  need_decompress_pass);
+	}
+
+	if (radv_image_has_cmask(image) || radv_image_has_dcc(image)) {
+		uint32_t color_values[2] = {};
+		radv_set_color_clear_metadata(cmd_buffer, image, color_values);
 	}
 }
 
@@ -4027,42 +4237,62 @@
 	}
 }
 
-void radv_CmdPipelineBarrier(
-	VkCommandBuffer                             commandBuffer,
-	VkPipelineStageFlags                        srcStageMask,
-	VkPipelineStageFlags                        destStageMask,
-	VkBool32                                    byRegion,
-	uint32_t                                    memoryBarrierCount,
-	const VkMemoryBarrier*                      pMemoryBarriers,
-	uint32_t                                    bufferMemoryBarrierCount,
-	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
-	uint32_t                                    imageMemoryBarrierCount,
-	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
+struct radv_barrier_info { /* per-caller inputs to radv_barrier() */
+	uint32_t eventCount; /* number of entries in pEvents; radv_barrier() emits a wait for each */
+	const VkEvent *pEvents; /* events to wait on; NULL when eventCount is 0 */
+	VkPipelineStageFlags srcStageMask; /* handed to radv_stage_flush() by radv_barrier() */
+};
+
+static void
+radv_barrier(struct radv_cmd_buffer *cmd_buffer,
+	     uint32_t memoryBarrierCount,
+	     const VkMemoryBarrier *pMemoryBarriers,
+	     uint32_t bufferMemoryBarrierCount,
+	     const VkBufferMemoryBarrier *pBufferMemoryBarriers,
+	     uint32_t imageMemoryBarrierCount,
+	     const VkImageMemoryBarrier *pImageMemoryBarriers,
+	     const struct radv_barrier_info *info)
 {
-	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	enum radv_cmd_flush_bits src_flush_bits = 0;
 	enum radv_cmd_flush_bits dst_flush_bits = 0;
 
+	for (unsigned i = 0; i < info->eventCount; ++i) {
+		RADV_FROM_HANDLE(radv_event, event, info->pEvents[i]);
+		uint64_t va = radv_buffer_get_va(event->bo);
+
+		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
+
+		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
+
+		si_emit_wait_fence(cs, va, 1, 0xffffffff);
+		assert(cmd_buffer->cs->cdw <= cdw_max);
+	}
+
 	for (uint32_t i = 0; i < memoryBarrierCount; i++) {
-		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask);
+		src_flush_bits |= radv_src_access_flush(cmd_buffer, pMemoryBarriers[i].srcAccessMask,
+							NULL);
 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pMemoryBarriers[i].dstAccessMask,
 		                                        NULL);
 	}
 
 	for (uint32_t i = 0; i < bufferMemoryBarrierCount; i++) {
-		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask);
+		src_flush_bits |= radv_src_access_flush(cmd_buffer, pBufferMemoryBarriers[i].srcAccessMask,
+							NULL);
 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pBufferMemoryBarriers[i].dstAccessMask,
 		                                        NULL);
 	}
 
 	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
 		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
-		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask);
+
+		src_flush_bits |= radv_src_access_flush(cmd_buffer, pImageMemoryBarriers[i].srcAccessMask,
+							image);
 		dst_flush_bits |= radv_dst_access_flush(cmd_buffer, pImageMemoryBarriers[i].dstAccessMask,
 		                                        image);
 	}
 
-	radv_stage_flush(cmd_buffer, srcStageMask);
+	radv_stage_flush(cmd_buffer, info->srcStageMask);
 	cmd_buffer->state.flush_bits |= src_flush_bits;
 
 	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
@@ -4084,34 +4314,89 @@
 	cmd_buffer->state.flush_bits |= dst_flush_bits;
 }
 
+void radv_CmdPipelineBarrier(
+	VkCommandBuffer                             commandBuffer,
+	VkPipelineStageFlags                        srcStageMask,
+	VkPipelineStageFlags                        destStageMask, /* not read by this function */
+	VkBool32                                    byRegion, /* not read by this function */
+	uint32_t                                    memoryBarrierCount,
+	const VkMemoryBarrier*                      pMemoryBarriers,
+	uint32_t                                    bufferMemoryBarrierCount,
+	const VkBufferMemoryBarrier*                pBufferMemoryBarriers,
+	uint32_t                                    imageMemoryBarrierCount,
+	const VkImageMemoryBarrier*                 pImageMemoryBarriers)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	struct radv_barrier_info info;
+
+	info.eventCount = 0; /* a pipeline barrier waits on no events */
+	info.pEvents = NULL;
+	info.srcStageMask = srcStageMask;
+
+	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers, /* shared barrier implementation */
+		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
+		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
+}
+
 
 static void write_event(struct radv_cmd_buffer *cmd_buffer,
 			struct radv_event *event,
 			VkPipelineStageFlags stageMask,
 			unsigned value)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(event->bo);
 
-	radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo, 8);
+	si_emit_cache_flush(cmd_buffer);
+
+	radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo);
 
 	MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 18);
 
+	/* Flags that only require a top-of-pipe event. */
+	VkPipelineStageFlags top_of_pipe_flags =
+		VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
+
+	/* Flags that only require a post-index-fetch event. */
+	VkPipelineStageFlags post_index_fetch_flags =
+		top_of_pipe_flags |
+		VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
+		VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
+
 	/* Make sure CP DMA is idle because the driver might have performed a
 	 * DMA operation for copying or filling buffers/images.
 	 */
 	si_cp_dma_wait_for_idle(cmd_buffer);
 
-	/* TODO: this is overkill. Probably should figure something out from
-	 * the stage mask. */
+	/* TODO: Emit EOS events for syncing PS/CS stages. */
 
-	si_cs_emit_write_event_eop(cs,
-				   cmd_buffer->state.predicating,
-				   cmd_buffer->device->physical_device->rad_info.chip_class,
-				   radv_cmd_buffer_uses_mec(cmd_buffer),
-				   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-				   1, va, 2, value,
-				   cmd_buffer->gfx9_eop_bug_va);
+	if (!(stageMask & ~top_of_pipe_flags)) {
+		/* Just need to sync the PFP engine. */
+		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+				S_370_WR_CONFIRM(1) |
+				S_370_ENGINE_SEL(V_370_PFP));
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		radeon_emit(cs, value);
+	} else if (!(stageMask & ~post_index_fetch_flags)) {
+		/* Sync ME because PFP reads index and indirect buffers. */
+		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
+		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
+				S_370_WR_CONFIRM(1) |
+				S_370_ENGINE_SEL(V_370_ME));
+		radeon_emit(cs, va);
+		radeon_emit(cs, va >> 32);
+		radeon_emit(cs, value);
+	} else {
+		/* Otherwise, sync all prior GPU work using an EOP event. */
+		si_cs_emit_write_event_eop(cs,
+					   cmd_buffer->device->physical_device->rad_info.chip_class,
+					   radv_cmd_buffer_uses_mec(cmd_buffer),
+					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
+					   EOP_DATA_SEL_VALUE_32BIT, va, 2, value,
+					   cmd_buffer->gfx9_eop_bug_va);
+	}
 
 	assert(cmd_buffer->cs->cdw <= cdw_max);
 }
@@ -4149,38 +4434,15 @@
 			const VkImageMemoryBarrier* pImageMemoryBarriers)
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radv_barrier_info info;
 
-	for (unsigned i = 0; i < eventCount; ++i) {
-		RADV_FROM_HANDLE(radv_event, event, pEvents[i]);
-		uint64_t va = radv_buffer_get_va(event->bo);
+	info.eventCount = eventCount;
+	info.pEvents = pEvents;
+	info.srcStageMask = 0;
 
-		radv_cs_add_buffer(cmd_buffer->device->ws, cs, event->bo, 8);
-
-		MAYBE_UNUSED unsigned cdw_max = radeon_check_space(cmd_buffer->device->ws, cs, 7);
-
-		si_emit_wait_fence(cs, false, va, 1, 0xffffffff);
-		assert(cmd_buffer->cs->cdw <= cdw_max);
-	}
-
-
-	for (uint32_t i = 0; i < imageMemoryBarrierCount; i++) {
-		RADV_FROM_HANDLE(radv_image, image, pImageMemoryBarriers[i].image);
-
-		radv_handle_image_transition(cmd_buffer, image,
-					     pImageMemoryBarriers[i].oldLayout,
-					     pImageMemoryBarriers[i].newLayout,
-					     pImageMemoryBarriers[i].srcQueueFamilyIndex,
-					     pImageMemoryBarriers[i].dstQueueFamilyIndex,
-					     &pImageMemoryBarriers[i].subresourceRange,
-					     0);
-	}
-
-	/* TODO: figure out how to do memory barriers without waiting */
-	cmd_buffer->state.flush_bits |= RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER |
-					RADV_CMD_FLAG_INV_GLOBAL_L2 |
-					RADV_CMD_FLAG_INV_VMEM_L1 |
-					RADV_CMD_FLAG_INV_SMEM_L1;
+	radv_barrier(cmd_buffer, memoryBarrierCount, pMemoryBarriers,
+		     bufferMemoryBarrierCount, pBufferMemoryBarriers,
+		     imageMemoryBarrierCount, pImageMemoryBarriers, &info);
 }
 
 
@@ -4189,3 +4451,48 @@
 {
    /* No-op */
 }
+
+/* VK_EXT_conditional_rendering */
+void radv_CmdBeginConditionalRenderingEXT(
+	VkCommandBuffer                             commandBuffer,
+	const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+	RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
+	bool draw_visible = true; /* cleared below when the INVERTED flag is set */
+	uint64_t va;
+
+	va = radv_buffer_get_va(buffer->bo) + pConditionalRenderingBegin->offset; /* NOTE(review): buffer->offset is not added, unlike the indirect draw/dispatch paths - confirm */
+
+	/* By default, if the 32-bit value at offset in buffer memory is zero,
+	 * then the rendering commands are discarded, otherwise they are
+	 * executed as normal. If the inverted flag is set, all commands are
+	 * discarded if the value is non-zero.
+	 */
+	if (pConditionalRenderingBegin->flags &
+	    VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
+		draw_visible = false;
+	}
+
+	/* Enable predication for this command buffer. */
+	si_emit_set_predication_state(cmd_buffer, draw_visible, va);
+	cmd_buffer->state.predicating = true; /* read when building draw/dispatch PKT3 headers */
+
+	/* Store conditional rendering user info. */
+	cmd_buffer->state.predication_type = draw_visible; /* 0 or 1 while active; reset to -1 on End */
+	cmd_buffer->state.predication_va = va;
+}
+
+void radv_CmdEndConditionalRenderingEXT(
+	VkCommandBuffer                             commandBuffer)
+{
+	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+
+	/* Disable predication for this command buffer. */
+	si_emit_set_predication_state(cmd_buffer, false, 0);
+	cmd_buffer->state.predicating = false; /* later draw/dispatch packets are emitted unpredicated */
+
+	/* Reset conditional rendering user info. */
+	cmd_buffer->state.predication_type = -1; /* -1: no conditional rendering recorded */
+	cmd_buffer->state.predication_va = 0;
+}
diff --git a/src/amd/vulkan/radv_cs.h b/src/amd/vulkan/radv_cs.h
index 8405976..a5792fc 100644
--- a/src/amd/vulkan/radv_cs.h
+++ b/src/amd/vulkan/radv_cs.h
@@ -31,7 +31,7 @@
 #include "sid.h"
 
 static inline unsigned radeon_check_space(struct radeon_winsys *ws,
-                                      struct radeon_winsys_cs *cs,
+                                      struct radeon_cmdbuf *cs,
                                       unsigned needed)
 {
         if (cs->max_dw - cs->cdw < needed)
@@ -39,7 +39,7 @@
         return cs->cdw + needed;
 }
 
-static inline void radeon_set_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
         assert(reg < SI_CONTEXT_REG_OFFSET);
         assert(cs->cdw + 2 + num <= cs->max_dw);
@@ -48,13 +48,13 @@
         radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
         radeon_set_config_reg_seq(cs, reg, 1);
         radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
         assert(reg >= SI_CONTEXT_REG_OFFSET);
         assert(cs->cdw + 2 + num <= cs->max_dw);
@@ -63,14 +63,14 @@
         radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
         radeon_set_context_reg_seq(cs, reg, 1);
         radeon_emit(cs, value);
 }
 
 
-static inline void radeon_set_context_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
@@ -81,7 +81,7 @@
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
 	assert(cs->cdw + 2 + num <= cs->max_dw);
@@ -90,13 +90,13 @@
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_sh_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
 	assert(cs->cdw + 2 + num <= cs->max_dw);
@@ -105,13 +105,13 @@
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_uconfig_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
diff --git a/src/amd/vulkan/radv_debug.c b/src/amd/vulkan/radv_debug.c
index 368bc4b..08fc80c 100644
--- a/src/amd/vulkan/radv_debug.c
+++ b/src/amd/vulkan/radv_debug.c
@@ -80,7 +80,7 @@
 }
 
 static void
-radv_dump_trace(struct radv_device *device, struct radeon_winsys_cs *cs)
+radv_dump_trace(struct radv_device *device, struct radeon_cmdbuf *cs)
 {
 	const char *filename = getenv("RADV_TRACE_FILE");
 	FILE *f = fopen(filename, "w");
@@ -369,11 +369,9 @@
 }
 
 static void
-radv_dump_annotated_shader(struct radv_pipeline *pipeline,
-			   struct radv_shader_variant *shader,
-			   gl_shader_stage stage,
-			   struct ac_wave_info *waves, unsigned num_waves,
-			   FILE *f)
+radv_dump_annotated_shader(struct radv_shader_variant *shader,
+			   gl_shader_stage stage, struct ac_wave_info *waves,
+			   unsigned num_waves, FILE *f)
 {
 	uint64_t start_addr, end_addr;
 	unsigned i;
@@ -444,28 +442,22 @@
 
 static void
 radv_dump_annotated_shaders(struct radv_pipeline *pipeline,
-			    struct radv_shader_variant *compute_shader,
-			    FILE *f)
+			    VkShaderStageFlagBits active_stages, FILE *f)
 {
 	struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP];
 	unsigned num_waves = ac_get_wave_info(waves);
-	unsigned mask;
 
 	fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET
 		"\n\n", num_waves);
 
 	/* Dump annotated active graphics shaders. */
-	mask = pipeline->active_stages;
-	while (mask) {
-		int stage = u_bit_scan(&mask);
+	while (active_stages) {
+		int stage = u_bit_scan(&active_stages);
 
-		radv_dump_annotated_shader(pipeline, pipeline->shaders[stage],
+		radv_dump_annotated_shader(pipeline->shaders[stage],
 					   stage, waves, num_waves, f);
 	}
 
-	radv_dump_annotated_shader(pipeline, compute_shader,
-				   MESA_SHADER_COMPUTE, waves, num_waves, f);
-
 	/* Print waves executing shaders that are not currently bound. */
 	unsigned i;
 	bool found = false;
@@ -523,48 +515,51 @@
 
 static void
 radv_dump_shaders(struct radv_pipeline *pipeline,
-		  struct radv_shader_variant *compute_shader, FILE *f)
+		  VkShaderStageFlagBits active_stages, FILE *f)
 {
-	unsigned mask;
-
 	/* Dump active graphics shaders. */
-	mask = pipeline->active_stages;
-	while (mask) {
-		int stage = u_bit_scan(&mask);
+	while (active_stages) {
+		int stage = u_bit_scan(&active_stages);
 
 		radv_dump_shader(pipeline, pipeline->shaders[stage], stage, f);
 	}
+}
 
-	radv_dump_shader(pipeline, compute_shader, MESA_SHADER_COMPUTE, f);
+static void
+radv_dump_pipeline_state(struct radv_pipeline *pipeline,
+			 VkShaderStageFlagBits active_stages, FILE *f)
+{
+	radv_dump_shaders(pipeline, active_stages, f); /* one dump per stage bit set in active_stages */
+	radv_dump_annotated_shaders(pipeline, active_stages, f); /* same stages, with wave info */
+	radv_dump_descriptors(pipeline, f); /* descriptors are dumped last */
 }
 
 static void
 radv_dump_graphics_state(struct radv_pipeline *graphics_pipeline,
 			 struct radv_pipeline *compute_pipeline, FILE *f)
 {
-	struct radv_shader_variant *compute_shader =
-		compute_pipeline ? compute_pipeline->shaders[MESA_SHADER_COMPUTE] : NULL;
+	VkShaderStageFlagBits active_stages; /* stage mask passed to the dump helper */
 
-	if (!graphics_pipeline)
-		return;
+	if (graphics_pipeline) { /* either pipeline may be NULL; each one is dumped on its own */
+		active_stages = graphics_pipeline->active_stages;
+		radv_dump_pipeline_state(graphics_pipeline, active_stages, f);
+	}
 
-	radv_dump_shaders(graphics_pipeline, compute_shader, f);
-	radv_dump_annotated_shaders(graphics_pipeline, compute_shader, f);
-	radv_dump_descriptors(graphics_pipeline, f);
+	if (compute_pipeline) {
+		active_stages = VK_SHADER_STAGE_COMPUTE_BIT; /* only the compute stage is requested */
+		radv_dump_pipeline_state(compute_pipeline, active_stages, f);
+	}
 }
 
 static void
 radv_dump_compute_state(struct radv_pipeline *compute_pipeline, FILE *f)
 {
+	VkShaderStageFlagBits active_stages = VK_SHADER_STAGE_COMPUTE_BIT; /* only the compute stage is requested */
+
 	if (!compute_pipeline)
 		return;
 
-	radv_dump_shaders(compute_pipeline,
-			  compute_pipeline->shaders[MESA_SHADER_COMPUTE], f);
-	radv_dump_annotated_shaders(compute_pipeline,
-				    compute_pipeline->shaders[MESA_SHADER_COMPUTE],
-				    f);
-	radv_dump_descriptors(compute_pipeline, f);
+	radv_dump_pipeline_state(compute_pipeline, active_stages, f); /* shaders, annotated shaders, descriptors */
 }
 
 static struct radv_pipeline *
@@ -643,11 +638,9 @@
 		snprintf(kernel_version, sizeof(kernel_version),
 			 " / %s", uname_data.release);
 
-	if (HAVE_LLVM > 0) {
-		snprintf(llvm_string, sizeof(llvm_string),
-			 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-	}
+	snprintf(llvm_string, sizeof(llvm_string),
+		 ", LLVM %i.%i.%i", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
 
 	fprintf(f, "Device name: %s (%s DRM %i.%i.%i%s%s)\n\n",
 		chip_name, device->physical_device->name,
@@ -667,7 +660,7 @@
 }
 
 void
-radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_winsys_cs *cs)
+radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs)
 {
 	struct radv_pipeline *graphics_pipeline, *compute_pipeline;
 	struct radv_device *device = queue->device;
diff --git a/src/amd/vulkan/radv_debug.h b/src/amd/vulkan/radv_debug.h
index 03f218f..9fe4c3b 100644
--- a/src/amd/vulkan/radv_debug.h
+++ b/src/amd/vulkan/radv_debug.h
@@ -44,6 +44,12 @@
 	RADV_DEBUG_NO_SISCHED        = 0x4000,
 	RADV_DEBUG_PREOPTIR          = 0x8000,
 	RADV_DEBUG_NO_DYNAMIC_BOUNDS = 0x10000,
+	RADV_DEBUG_NO_OUT_OF_ORDER   = 0x20000,
+	RADV_DEBUG_INFO              = 0x40000,
+	RADV_DEBUG_ERRORS            = 0x80000,
+	RADV_DEBUG_STARTUP           = 0x100000,
+	RADV_DEBUG_CHECKIR           = 0x200000,
+	RADV_DEBUG_NOTHREADLLVM      = 0x400000,
 };
 
 enum {
@@ -59,7 +65,7 @@
 radv_init_trace(struct radv_device *device);
 
 void
-radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_winsys_cs *cs);
+radv_check_gpu_hangs(struct radv_queue *queue, struct radeon_cmdbuf *cs);
 
 void
 radv_print_spirv(uint32_t *data, uint32_t size, FILE *fp);
diff --git a/src/amd/vulkan/radv_descriptor_set.c b/src/amd/vulkan/radv_descriptor_set.c
index 4b08a1f..c4341f6 100644
--- a/src/amd/vulkan/radv_descriptor_set.c
+++ b/src/amd/vulkan/radv_descriptor_set.c
@@ -95,7 +95,7 @@
 	set_layout = vk_alloc2(&device->alloc, pAllocator, size, 8,
 				 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!set_layout)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	set_layout->flags = pCreateInfo->flags;
 
@@ -106,7 +106,7 @@
 	                                                                pCreateInfo->bindingCount);
 	if (!bindings) {
 		vk_free2(&device->alloc, pAllocator, set_layout);
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	set_layout->binding_count = max_binding + 1;
@@ -322,7 +322,7 @@
 
 /*
  * Pipeline layouts.  These have nothing to do with the pipeline.  They are
- * just muttiple descriptor set layouts pasted together
+ * just multiple descriptor set layouts pasted together.
  */
 
 VkResult radv_CreatePipelineLayout(
@@ -340,7 +340,7 @@
 	layout = vk_alloc2(&device->alloc, pAllocator, sizeof(*layout), 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (layout == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	layout->num_sets = pCreateInfo->setLayoutCount;
 
@@ -412,7 +412,7 @@
 
 	if (pool->host_memory_base) {
 		if (pool->host_memory_end - pool->host_memory_ptr < mem_size)
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 
 		set = (struct radv_descriptor_set*)pool->host_memory_ptr;
 		pool->host_memory_ptr += mem_size;
@@ -421,7 +421,7 @@
 		                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 		if (!set)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	memset(set, 0, mem_size);
@@ -437,7 +437,7 @@
 
 		if (!pool->host_memory_base && pool->entry_count == pool->max_entry_count) {
 			vk_free2(&device->alloc, NULL, set);
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 		}
 
 		/* try to allocate linearly first, so that we don't spend
@@ -466,7 +466,7 @@
 
 			if (pool->size - offset < layout_size) {
 				vk_free2(&device->alloc, NULL, set);
-				return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+				return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 			}
 			set->bo = pool->bo;
 			set->mapped_ptr = (uint32_t*)(pool->mapped_ptr + offset);
@@ -478,7 +478,7 @@
 			pool->entries[index].set = set;
 			pool->entry_count++;
 		} else
-			return vk_error(VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_POOL_MEMORY_KHR);
 	}
 
 	if (layout->has_immutable_samplers) {
@@ -580,7 +580,7 @@
 	pool = vk_alloc2(&device->alloc, pAllocator, size, 8,
 	                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!pool)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	memset(pool, 0, sizeof(*pool));
 
@@ -594,7 +594,8 @@
 		pool->bo = device->ws->buffer_create(device->ws, bo_size, 32,
 						     RADEON_DOMAIN_VRAM,
 						     RADEON_FLAG_NO_INTERPROCESS_SHARING |
-						     RADEON_FLAG_READ_ONLY);
+						     RADEON_FLAG_READ_ONLY |
+						     RADEON_FLAG_32BIT);
 		pool->mapped_ptr = (uint8_t*)device->ws->buffer_map(pool->bo);
 	}
 	pool->size = bo_size;
@@ -720,7 +721,7 @@
 	memcpy(dst, buffer_view->state, 4 * 4);
 
 	if (cmd_buffer)
-		radv_cs_add_buffer(device->ws, cmd_buffer->cs, buffer_view->bo, 7);
+		radv_cs_add_buffer(device->ws, cmd_buffer->cs, buffer_view->bo);
 	else
 		*buffer_list = buffer_view->bo;
 }
@@ -750,7 +751,7 @@
 		S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
 	if (cmd_buffer)
-		radv_cs_add_buffer(device->ws, cmd_buffer->cs, buffer->bo, 7);
+		radv_cs_add_buffer(device->ws, cmd_buffer->cs, buffer->bo);
 	else
 		*buffer_list = buffer->bo;
 }
@@ -794,7 +795,7 @@
 	memcpy(dst, descriptor, 16 * 4);
 
 	if (cmd_buffer)
-		radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->bo, 7);
+		radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->bo);
 	else
 		*buffer_list = iview->bo;
 }
@@ -995,7 +996,7 @@
 
 	templ = vk_alloc2(&device->alloc, pAllocator, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!templ)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	templ->entry_count = entry_count;
 	templ->bind_point = pCreateInfo->pipelineBindPoint;
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index c02d0b7..a72cf26 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -49,28 +49,6 @@
 #include "util/debug.h"
 #include "util/mesa-sha1.h"
 
-static bool
-radv_get_build_id(void *ptr, struct mesa_sha1 *ctx)
-{
-	uint32_t timestamp;
-
-#ifdef HAVE_DL_ITERATE_PHDR
-	const struct build_id_note *note = NULL;
-	if ((note = build_id_find_nhdr_for_addr(ptr))) {
-		_mesa_sha1_update(ctx, build_id_data(note), build_id_length(note));
-	} else
-#endif
-	if (disk_cache_get_function_timestamp(ptr, &timestamp)) {
-		if (!timestamp) {
-			fprintf(stderr, "radv: The provided filesystem timestamp for the cache is bogus!\n");
-		}
-
-		_mesa_sha1_update(ctx, &timestamp, sizeof(timestamp));
-	} else
-		return false;
-	return true;
-}
-
 static int
 radv_device_get_cache_uuid(enum radeon_family family, void *uuid)
 {
@@ -81,8 +59,8 @@
 	memset(uuid, 0, VK_UUID_SIZE);
 	_mesa_sha1_init(&ctx);
 
-	if (!radv_get_build_id(radv_device_get_cache_uuid, &ctx) ||
-	    !radv_get_build_id(LLVMInitializeAMDGPUTargetInfo, &ctx))
+	if (!disk_cache_get_function_identifier(radv_device_get_cache_uuid, &ctx) ||
+	    !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo, &ctx))
 		return -1;
 
 	_mesa_sha1_update(&ctx, &family, sizeof(family));
@@ -137,12 +115,9 @@
 	default: chip_string = "AMD RADV unknown"; break;
 	}
 
-	if (HAVE_LLVM > 0) {
-		snprintf(llvm_string, sizeof(llvm_string),
-			 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
-			 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
-	}
-
+	snprintf(llvm_string, sizeof(llvm_string),
+		 " (LLVM %i.%i.%i)", (HAVE_LLVM >> 8) & 0xff,
+		 HAVE_LLVM & 0xff, MESA_LLVM_VERSION_PATCH);
 	snprintf(name, name_len, "%s%s", chip_string, llvm_string);
 }
 
@@ -257,25 +232,43 @@
 	VkResult result;
 	drmVersionPtr version;
 	int fd;
+	int master_fd = -1;
 
 	fd = open(path, O_RDWR | O_CLOEXEC);
-	if (fd < 0)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+	if (fd < 0) {
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not open device '%s'", path);
+
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
+	}
 
 	version = drmGetVersion(fd);
 	if (!version) {
 		close(fd);
-		return vk_errorf(VK_ERROR_INCOMPATIBLE_DRIVER,
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Could not get the kernel driver version for device '%s'", path);
+
+		return vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
 				 "failed to get version %s: %m", path);
 	}
 
 	if (strcmp(version->name, "amdgpu")) {
 		drmFreeVersion(version);
+		if (master_fd != -1)
+			close(master_fd);
 		close(fd);
+
+		if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Device '%s' is not using the amdgpu kernel driver.", path);
+
 		return VK_ERROR_INCOMPATIBLE_DRIVER;
 	}
 	drmFreeVersion(version);
 
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+			radv_logi("Found compatible device '%s'.", path);
+
 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = instance;
 	assert(strlen(path) < ARRAY_SIZE(device->path));
@@ -284,10 +277,28 @@
 	device->ws = radv_amdgpu_winsys_create(fd, instance->debug_flags,
 					       instance->perftest_flags);
 	if (!device->ws) {
-		result = VK_ERROR_INCOMPATIBLE_DRIVER;
+		result = vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
 		goto fail;
 	}
 
+	if (instance->enabled_extensions.KHR_display) {
+		master_fd = open(drm_device->nodes[DRM_NODE_PRIMARY], O_RDWR | O_CLOEXEC);
+		if (master_fd >= 0) {
+			uint32_t accel_working = 0;
+			struct drm_amdgpu_info request = {
+				.return_pointer = (uintptr_t)&accel_working,
+				.return_size = sizeof(accel_working),
+				.query = AMDGPU_INFO_ACCEL_WORKING
+			};
+
+			if (drmCommandWrite(master_fd, DRM_AMDGPU_INFO, &request, sizeof (struct drm_amdgpu_info)) < 0 || !accel_working) {
+				close(master_fd);
+				master_fd = -1;
+			}
+		}
+	}
+
+	device->master_fd = master_fd;
 	device->local_fd = fd;
 	device->ws->query_info(device->ws, &device->rad_info);
 
@@ -297,7 +308,7 @@
 
 	if (radv_device_get_cache_uuid(device->rad_info.family, device->cache_uuid)) {
 		device->ws->destroy(device->ws);
-		result = vk_errorf(VK_ERROR_INITIALIZATION_FAILED,
+		result = vk_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
 				   "cannot generate UUID");
 		goto fail;
 	}
@@ -307,7 +318,7 @@
 		(device->instance->perftest_flags & RADV_PERFTEST_SISCHED ? 0x1 : 0) |
 		(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH ? 0x2 : 0);
 
-	/* The gpu id is already embeded in the uuid so we just pass "radv"
+	/* The gpu id is already embedded in the uuid so we just pass "radv"
 	 * when creating the cache.
 	 */
 	char buf[VK_UUID_SIZE * 2 + 1];
@@ -329,7 +340,7 @@
 		                         device->rad_info.family == CHIP_RAVEN;
 	}
 
-	/* The mere presense of CLEAR_STATE in the IB causes random GPU hangs
+	/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs
 	 * on SI.
 	 */
 	device->has_clear_state = device->rad_info.chip_class >= CIK;
@@ -344,10 +355,10 @@
 	device->has_out_of_order_rast = device->rad_info.chip_class >= VI &&
 					device->rad_info.max_se >= 2;
 	device->out_of_order_rast_allowed = device->has_out_of_order_rast &&
-					    (device->instance->perftest_flags & RADV_PERFTEST_OUT_OF_ORDER);
+					    !(device->instance->debug_flags & RADV_DEBUG_NO_OUT_OF_ORDER);
 
-	device->dcc_msaa_allowed = device->rad_info.chip_class == VI &&
-				   (device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
+	device->dcc_msaa_allowed =
+		(device->instance->perftest_flags & RADV_PERFTEST_DCC_MSAA);
 
 	radv_physical_device_init_mem_types(device);
 	radv_fill_device_extension_table(device, &device->supported_extensions);
@@ -355,13 +366,19 @@
 	result = radv_init_wsi(device);
 	if (result != VK_SUCCESS) {
 		device->ws->destroy(device->ws);
+		vk_error(instance, result);
 		goto fail;
 	}
 
+	if ((device->instance->debug_flags & RADV_DEBUG_INFO))
+		ac_print_gpu_info(&device->rad_info);
+
 	return VK_SUCCESS;
 
 fail:
 	close(fd);
+	if (master_fd != -1)
+		close(master_fd);
 	return result;
 }
 
@@ -372,6 +389,8 @@
 	device->ws->destroy(device->ws);
 	disk_cache_destroy(device->disk_cache);
 	close(device->local_fd);
+	if (device->master_fd != -1)
+		close(device->master_fd);
 }
 
 static void *
@@ -419,6 +438,12 @@
 	{"nosisched", RADV_DEBUG_NO_SISCHED},
 	{"preoptir", RADV_DEBUG_PREOPTIR},
 	{"nodynamicbounds", RADV_DEBUG_NO_DYNAMIC_BOUNDS},
+	{"nooutoforder", RADV_DEBUG_NO_OUT_OF_ORDER},
+	{"info", RADV_DEBUG_INFO},
+	{"errors", RADV_DEBUG_ERRORS},
+	{"startup", RADV_DEBUG_STARTUP},
+	{"checkir", RADV_DEBUG_CHECKIR},
+	{"nothreadllvm", RADV_DEBUG_NOTHREADLLVM},
 	{NULL, 0}
 };
 
@@ -434,7 +459,6 @@
 	{"sisched", RADV_PERFTEST_SISCHED},
 	{"localbos", RADV_PERFTEST_LOCAL_BOS},
 	{"binning", RADV_PERFTEST_BINNING},
-	{"outoforderrast", RADV_PERFTEST_OUT_OF_ORDER},
 	{"dccmsaa", RADV_PERFTEST_DCC_MSAA},
 	{NULL, 0}
 };
@@ -442,7 +466,7 @@
 const char *
 radv_get_perftest_option_name(int id)
 {
-	assert(id < ARRAY_SIZE(radv_debug_options) - 1);
+	assert(id < ARRAY_SIZE(radv_perftest_options) - 1);
 	return radv_perftest_options[id].string;
 }
 
@@ -457,10 +481,12 @@
 
 	if (!strcmp(name, "Talos - Linux - 32bit") ||
 	    !strcmp(name, "Talos - Linux - 64bit")) {
-		/* Force enable LLVM sisched for Talos because it looks safe
-		 * and it gives few more FPS.
-		 */
-		instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		if (!(instance->debug_flags & RADV_DEBUG_NO_SISCHED)) {
+			/* Force enable LLVM sisched for Talos because it looks
+			 * safe and it gives a few more FPS.
+			 */
+			instance->perftest_flags |= RADV_PERFTEST_SISCHED;
+		}
 	} else if (!strcmp(name, "DOOM_VFR")) {
 		/* Work around a Doom VFR game bug */
 		instance->debug_flags |= RADV_DEBUG_NO_DYNAMIC_BOUNDS;
@@ -492,13 +518,13 @@
 	    pCreateInfo->pApplicationInfo->apiVersion != 0) {
 		client_version = pCreateInfo->pApplicationInfo->apiVersion;
 	} else {
-		client_version = VK_MAKE_VERSION(1, 0, 0);
+		radv_EnumerateInstanceVersion(&client_version);
 	}
 
 	instance = vk_zalloc2(&default_alloc, pAllocator, sizeof(*instance), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
 	if (!instance)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(NULL, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	instance->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 
@@ -510,13 +536,23 @@
 	instance->apiVersion = client_version;
 	instance->physicalDeviceCount = -1;
 
+	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
+						   radv_debug_options);
+
+	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
+						   radv_perftest_options);
+
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Created an instance");
+
 	for (uint32_t i = 0; i < pCreateInfo->enabledExtensionCount; i++) {
 		const char *ext_name = pCreateInfo->ppEnabledExtensionNames[i];
 		int index = radv_get_instance_extension_index(ext_name);
 
 		if (index < 0 || !radv_supported_instance_extensions.extensions[index]) {
 			vk_free2(&default_alloc, pAllocator, instance);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(NULL, VK_ERROR_EXTENSION_NOT_PRESENT); /* instance was just freed */
 		}
 
 		instance->enabled_extensions.extensions[index] = true;
@@ -525,29 +561,15 @@
 	result = vk_debug_report_instance_init(&instance->debug_report_callbacks);
 	if (result != VK_SUCCESS) {
 		vk_free2(&default_alloc, pAllocator, instance);
-		return vk_error(result);
+		return vk_error(NULL, result); /* instance was just freed */
 	}
 
 	_mesa_locale_init();
 
 	VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
-	instance->debug_flags = parse_debug_string(getenv("RADV_DEBUG"),
-						   radv_debug_options);
-
-	instance->perftest_flags = parse_debug_string(getenv("RADV_PERFTEST"),
-						   radv_perftest_options);
-
 	radv_handle_per_app_options(instance, pCreateInfo->pApplicationInfo);
 
-	if (instance->debug_flags & RADV_DEBUG_NO_SISCHED) {
-		/* Disable sisched when the user requests it, this is mostly
-		 * useful when the driver force-enable sisched for the given
-		 * application.
-		 */
-		instance->perftest_flags &= ~RADV_PERFTEST_SISCHED;
-	}
-
 	*pInstance = radv_instance_to_handle(instance);
 
 	return VK_SUCCESS;
@@ -586,8 +608,12 @@
 	instance->physicalDeviceCount = 0;
 
 	max_devices = drmGetDevices2(0, devices, ARRAY_SIZE(devices));
+
+	if (instance->debug_flags & RADV_DEBUG_STARTUP)
+		radv_logi("Found %d drm nodes", max_devices);
+
 	if (max_devices < 1)
-		return vk_error(VK_ERROR_INCOMPATIBLE_DRIVER);
+		return vk_error(instance, VK_ERROR_INCOMPATIBLE_DRIVER);
 
 	for (unsigned i = 0; i < (unsigned)max_devices; i++) {
 		if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER &&
@@ -669,6 +695,7 @@
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceFeatures*                   pFeatures)
 {
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	memset(pFeatures, 0, sizeof(*pFeatures));
 
 	*pFeatures = (VkPhysicalDeviceFeatures) {
@@ -692,7 +719,8 @@
 		.alphaToOne                               = true,
 		.multiViewport                            = true,
 		.samplerAnisotropy                        = true,
-		.textureCompressionETC2                   = false,
+		.textureCompressionETC2                   = pdevice->rad_info.chip_class >= GFX9 ||
+		                                            pdevice->rad_info.family == CHIP_STONEY,
 		.textureCompressionASTC_LDR               = false,
 		.textureCompressionBC                     = true,
 		.occlusionQueryPrecise                    = true,
@@ -724,6 +752,7 @@
 	VkPhysicalDevice                            physicalDevice,
 	VkPhysicalDeviceFeatures2KHR               *pFeatures)
 {
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physicalDevice);
 	vk_foreach_struct(ext, pFeatures->pNext) {
 		switch (ext->sType) {
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VARIABLE_POINTER_FEATURES_KHR: {
@@ -754,10 +783,11 @@
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES: {
 			VkPhysicalDevice16BitStorageFeatures *features =
 			    (VkPhysicalDevice16BitStorageFeatures*)ext;
-			features->storageBuffer16BitAccess = false;
-			features->uniformAndStorageBuffer16BitAccess = false;
-			features->storagePushConstant16 = false;
-			features->storageInputOutput16 = false;
+			bool enabled = HAVE_LLVM >= 0x0700 && pdevice->rad_info.chip_class >= VI;
+			features->storageBuffer16BitAccess = enabled;
+			features->uniformAndStorageBuffer16BitAccess = enabled;
+			features->storagePushConstant16 = enabled;
+			features->storageInputOutput16 = enabled;
 			break;
 		}
 		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
@@ -791,6 +821,20 @@
 			features->runtimeDescriptorArray = true;
 			break;
 		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
+			VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
+				(VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
+			features->conditionalRendering = true;
+			features->inheritedConditionalRendering = false;
+			break;
+		}
+		case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
+			VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
+				(VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
+			features->vertexAttributeInstanceRateDivisor = VK_TRUE;
+			features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE;
+			break;
+		}
 		default:
 			break;
 		}
@@ -888,7 +932,7 @@
 		.maxViewports                             = MAX_VIEWPORTS,
 		.maxViewportDimensions                    = { (1 << 14), (1 << 14) },
 		.viewportBoundsRange                      = { INT16_MIN, INT16_MAX },
-		.viewportSubPixelBits                     = 13, /* We take a float? */
+		.viewportSubPixelBits                     = 8,
 		.minMemoryMapAlignment                    = 4096, /* A page */
 		.minTexelBufferOffsetAlignment            = 1,
 		.minUniformBufferOffsetAlignment          = 4,
@@ -1003,6 +1047,7 @@
 							VK_SUBGROUP_FEATURE_VOTE_BIT;
 			if (pdevice->rad_info.chip_class >= VI) {
 				properties->supportedOperations |=
+							VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
 							VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
 							VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
 			}
@@ -1284,7 +1329,7 @@
 
 	queue->hw_ctx = device->ws->ctx_create(device->ws, queue->priority);
 	if (!queue->hw_ctx)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	return VK_SUCCESS;
 }
@@ -1379,36 +1424,8 @@
 static void
 radv_device_init_gs_info(struct radv_device *device)
 {
-	switch (device->physical_device->rad_info.family) {
-	case CHIP_OLAND:
-	case CHIP_HAINAN:
-	case CHIP_KAVERI:
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_ICELAND:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-		device->gs_table_depth = 16;
-		return;
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-	case CHIP_VERDE:
-	case CHIP_BONAIRE:
-	case CHIP_HAWAII:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-	case CHIP_VEGA10:
-	case CHIP_VEGA12:
-	case CHIP_RAVEN:
-		device->gs_table_depth = 32;
-		return;
-	default:
-		unreachable("unknown GPU");
-	}
+	device->gs_table_depth = ac_get_gs_table_depth(device->physical_device->rad_info.chip_class,
+						       device->physical_device->rad_info.family);
 }
 
 static int radv_get_device_extension_index(const char *name)
@@ -1441,7 +1458,7 @@
 		unsigned num_features = sizeof(VkPhysicalDeviceFeatures) / sizeof(VkBool32);
 		for (uint32_t i = 0; i < num_features; i++) {
 			if (enabled_feature[i] && !supported_feature[i])
-				return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+				return vk_error(physical_device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
 		}
 	}
 
@@ -1449,7 +1466,7 @@
 			    sizeof(*device), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
 	if (!device)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(physical_device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	device->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
 	device->instance = physical_device->instance;
@@ -1466,7 +1483,7 @@
 		int index = radv_get_device_extension_index(ext_name);
 		if (index < 0 || !physical_device->supported_extensions.extensions[index]) {
 			vk_free(&device->alloc, device);
-			return vk_error(VK_ERROR_EXTENSION_NOT_PRESENT);
+			return vk_error(physical_device->instance, VK_ERROR_EXTENSION_NOT_PRESENT);
 		}
 
 		device->enabled_extensions.extensions[index] = true;
@@ -1514,22 +1531,22 @@
 	}
 
 	device->pbb_allowed = device->physical_device->rad_info.chip_class >= GFX9 &&
-	                      (device->instance->perftest_flags & RADV_PERFTEST_BINNING);
+			((device->instance->perftest_flags & RADV_PERFTEST_BINNING) ||
+			 device->physical_device->rad_info.family == CHIP_RAVEN);
 
 	/* Disabled and not implemented for now. */
-	device->dfsm_allowed = device->pbb_allowed && false;
+	device->dfsm_allowed = device->pbb_allowed &&
+	                       device->physical_device->rad_info.family == CHIP_RAVEN;
 
 #ifdef ANDROID
 	device->always_use_syncobj = device->physical_device->rad_info.has_syncobj_wait_for_submit;
 #endif
 
-	device->llvm_supports_spill = true;
-
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
 	 *
-	 * sctx->scratch_waves must be >= the maximum posible size of
+	 * sctx->scratch_waves must be >= the maximum possible size of
 	 * 1 threadgroup, so that the hw doesn't hang from being unable
 	 * to start any.
 	 *
@@ -1566,6 +1583,10 @@
 		if (!radv_init_trace(device))
 			goto fail;
 
+		fprintf(stderr, "*****************************************************************************\n");
+		fprintf(stderr, "* WARNING: RADV_TRACE_FILE is costly and should only be used for debugging! *\n");
+		fprintf(stderr, "*****************************************************************************\n");
+
 		fprintf(stderr, "Trace file will be dumped to %s\n", filename);
 		radv_dump_enabled_options(device, stderr);
 	}
@@ -1680,7 +1701,7 @@
 	}
 
 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }
 
 VkResult radv_EnumerateDeviceLayerProperties(
@@ -1694,7 +1715,7 @@
 	}
 
 	/* None supported at this time */
-	return vk_error(VK_ERROR_LAYER_NOT_PRESENT);
+	return vk_error(NULL, VK_ERROR_LAYER_NOT_PRESENT);
 }
 
 void radv_GetDeviceQueue2(
@@ -1951,6 +1972,126 @@
 	return hs_offchip_param;
 }
 
+/* Add whichever GS ring buffers exist to the command stream, then program
+ * the ESGS/GSVS ring sizes (each size is shifted right by 8 when emitted). */
+static void
+radv_emit_gs_ring_sizes(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			struct radeon_winsys_bo *esgs_ring_bo,
+			uint32_t esgs_ring_size,
+			struct radeon_winsys_bo *gsvs_ring_bo,
+			uint32_t gsvs_ring_size)
+{
+	if (esgs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo);
+	if (gsvs_ring_bo)
+		radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo);
+	if (!esgs_ring_bo && !gsvs_ring_bo)
+		return;
+
+	/* Pre-CIK parts take the config-register variant, CIK+ uses uconfig. */
+	if (queue->device->physical_device->rad_info.chip_class < CIK) {
+		radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
+	} else {
+		radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
+	}
+
+	radeon_emit(cs, esgs_ring_size >> 8);
+	radeon_emit(cs, gsvs_ring_size >> 8);
+}
+
+/* Program the tessellation-factor ring registers (ring size, memory base
+ * and VGT_HS_OFFCHIP_PARAM). Does nothing when tess_rings_bo is NULL. */
+static void
+radv_emit_tess_factor_ring(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			   unsigned hs_offchip_param, unsigned tf_ring_size,
+			   struct radeon_winsys_bo *tess_rings_bo)
+{
+	if (!tess_rings_bo)
+		return;
+
+	const uint64_t tf_va = radv_buffer_get_va(tess_rings_bo);
+	const bool cik_plus =
+		queue->device->physical_device->rad_info.chip_class >= CIK;
+	const bool gfx9_plus =
+		queue->device->physical_device->rad_info.chip_class >= GFX9;
+
+	radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo);
+
+	if (!cik_plus) {
+		radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE, S_008988_SIZE(tf_ring_size / 4));
+		radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE, tf_va >> 8);
+		radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
+		return;
+	}
+
+	radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE, S_030938_SIZE(tf_ring_size / 4));
+	radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE, tf_va >> 8);
+	if (gfx9_plus) {
+		/* GFX9+ additionally takes the address bits from bit 40 upward. */
+		radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
+				       S_030944_BASE_HI(tf_va >> 40));
+	}
+	radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
+}
+
+/* Emit the compute scratch BO as two dwords from COMPUTE_USER_DATA_0 on. */
+static void
+radv_emit_compute_scratch(struct radv_queue *queue, struct radeon_cmdbuf *cs,
+			  struct radeon_winsys_bo *compute_scratch_bo)
+{
+	if (compute_scratch_bo) {
+		const uint64_t va = radv_buffer_get_va(compute_scratch_bo);
+		/* Second dword: high address bits plus the swizzle-enable flag. */
+		const uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(va >> 32) |
+				       S_008F04_SWIZZLE_ENABLE(1);
+
+		radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo);
+
+		radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
+		radeon_emit(cs, va);
+		radeon_emit(cs, rsrc1);
+	}
+}
+
+/* Emit the descriptor BO's address to a fixed list of per-stage SPI shader
+ * registers selected by chip class. No-op when descriptor_bo is NULL. */
+static void
+radv_emit_global_shader_pointers(struct radv_queue *queue,
+				 struct radeon_cmdbuf *cs,
+				 struct radeon_winsys_bo *descriptor_bo)
+{
+	/* Registers written on GFX9 and newer. */
+	const uint32_t gfx9_regs[] = {
+		R_00B030_SPI_SHADER_USER_DATA_PS_0,
+		R_00B130_SPI_SHADER_USER_DATA_VS_0,
+		R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
+		R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS,
+	};
+	/* Registers written on everything older than GFX9. */
+	const uint32_t legacy_regs[] = {
+		R_00B030_SPI_SHADER_USER_DATA_PS_0,
+		R_00B130_SPI_SHADER_USER_DATA_VS_0,
+		R_00B230_SPI_SHADER_USER_DATA_GS_0,
+		R_00B330_SPI_SHADER_USER_DATA_ES_0,
+		R_00B430_SPI_SHADER_USER_DATA_HS_0,
+		R_00B530_SPI_SHADER_USER_DATA_LS_0,
+	};
+	const uint32_t *regs = legacy_regs;
+	unsigned num_regs = ARRAY_SIZE(legacy_regs);
+	if (!descriptor_bo)
+		return;
+	if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
+		regs = gfx9_regs;
+		num_regs = ARRAY_SIZE(gfx9_regs);
+	}
+
+	const uint64_t va = radv_buffer_get_va(descriptor_bo);
+	radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo);
+	for (unsigned i = 0; i < num_regs; ++i) {
+		radv_emit_shader_pointer(queue->device, cs, regs[i], va, true);
+	}
+}
+
 static VkResult
 radv_get_preamble_cs(struct radv_queue *queue,
                      uint32_t scratch_size,
@@ -1959,9 +2100,9 @@
 		     uint32_t gsvs_ring_size,
 		     bool needs_tess_rings,
 		     bool needs_sample_positions,
-		     struct radeon_winsys_cs **initial_full_flush_preamble_cs,
-                     struct radeon_winsys_cs **initial_preamble_cs,
-                     struct radeon_winsys_cs **continue_preamble_cs)
+		     struct radeon_cmdbuf **initial_full_flush_preamble_cs,
+                     struct radeon_cmdbuf **initial_preamble_cs,
+                     struct radeon_cmdbuf **continue_preamble_cs)
 {
 	struct radeon_winsys_bo *scratch_bo = NULL;
 	struct radeon_winsys_bo *descriptor_bo = NULL;
@@ -1969,7 +2110,7 @@
 	struct radeon_winsys_bo *esgs_ring_bo = NULL;
 	struct radeon_winsys_bo *gsvs_ring_bo = NULL;
 	struct radeon_winsys_bo *tess_rings_bo = NULL;
-	struct radeon_winsys_cs *dest_cs[3] = {0};
+	struct radeon_cmdbuf *dest_cs[3] = {0};
 	bool add_tess_rings = false, add_sample_positions = false;
 	unsigned tess_factor_ring_size = 0, tess_offchip_ring_size = 0;
 	unsigned max_offchip_buffers;
@@ -2094,7 +2235,7 @@
 		descriptor_bo = queue->descriptor_bo;
 
 	for(int i = 0; i < 3; ++i) {
-		struct radeon_winsys_cs *cs = NULL;
+		struct radeon_cmdbuf *cs = NULL;
 		cs = queue->device->ws->cs_create(queue->device->ws,
 						  queue->queue_family_index ? RING_COMPUTE : RING_GFX);
 		if (!cs)
@@ -2103,19 +2244,7 @@
 		dest_cs[i] = cs;
 
 		if (scratch_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, scratch_bo, 8);
-
-		if (esgs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, esgs_ring_bo, 8);
-
-		if (gsvs_ring_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, gsvs_ring_bo, 8);
-
-		if (tess_rings_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, tess_rings_bo, 8);
-
-		if (descriptor_bo)
-			radv_cs_add_buffer(queue->device->ws, cs, descriptor_bo, 8);
+			radv_cs_add_buffer(queue->device->ws, cs, scratch_bo);
 
 		if (descriptor_bo != queue->descriptor_bo) {
 			uint32_t *map = (uint32_t*)queue->device->ws->buffer_map(descriptor_bo);
@@ -2148,80 +2277,12 @@
 			radeon_emit(cs, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 		}
 
-		if (esgs_ring_bo || gsvs_ring_bo) {
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg_seq(cs, R_030900_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			} else {
-				radeon_set_config_reg_seq(cs, R_0088C8_VGT_ESGS_RING_SIZE, 2);
-				radeon_emit(cs, esgs_ring_size >> 8);
-				radeon_emit(cs, gsvs_ring_size >> 8);
-			}
-		}
-
-		if (tess_rings_bo) {
-			uint64_t tf_va = radv_buffer_get_va(tess_rings_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= CIK) {
-				radeon_set_uconfig_reg(cs, R_030938_VGT_TF_RING_SIZE,
-						       S_030938_SIZE(tess_factor_ring_size / 4));
-				radeon_set_uconfig_reg(cs, R_030940_VGT_TF_MEMORY_BASE,
-						       tf_va >> 8);
-				if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-					radeon_set_uconfig_reg(cs, R_030944_VGT_TF_MEMORY_BASE_HI,
-							       S_030944_BASE_HI(tf_va >> 40));
-				}
-				radeon_set_uconfig_reg(cs, R_03093C_VGT_HS_OFFCHIP_PARAM, hs_offchip_param);
-			} else {
-				radeon_set_config_reg(cs, R_008988_VGT_TF_RING_SIZE,
-						      S_008988_SIZE(tess_factor_ring_size / 4));
-				radeon_set_config_reg(cs, R_0089B8_VGT_TF_MEMORY_BASE,
-						      tf_va >> 8);
-				radeon_set_config_reg(cs, R_0089B0_VGT_HS_OFFCHIP_PARAM,
-						      hs_offchip_param);
-			}
-		}
-
-		if (descriptor_bo) {
-			uint64_t va = radv_buffer_get_va(descriptor_bo);
-			if (queue->device->physical_device->rad_info.chip_class >= GFX9) {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS,
-						R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			} else {
-				uint32_t regs[] = {R_00B030_SPI_SHADER_USER_DATA_PS_0,
-						R_00B130_SPI_SHADER_USER_DATA_VS_0,
-						R_00B230_SPI_SHADER_USER_DATA_GS_0,
-						R_00B330_SPI_SHADER_USER_DATA_ES_0,
-						R_00B430_SPI_SHADER_USER_DATA_HS_0,
-						R_00B530_SPI_SHADER_USER_DATA_LS_0};
-
-				for (int i = 0; i < ARRAY_SIZE(regs); ++i) {
-					radeon_set_sh_reg_seq(cs, regs[i], 2);
-					radeon_emit(cs, va);
-					radeon_emit(cs, va >> 32);
-				}
-			}
-		}
-
-		if (compute_scratch_bo) {
-			uint64_t scratch_va = radv_buffer_get_va(compute_scratch_bo);
-			uint32_t rsrc1 = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
-			                 S_008F04_SWIZZLE_ENABLE(1);
-
-			radv_cs_add_buffer(queue->device->ws, cs, compute_scratch_bo, 8);
-
-			radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, 2);
-			radeon_emit(cs, scratch_va);
-			radeon_emit(cs, rsrc1);
-		}
+		radv_emit_gs_ring_sizes(queue, cs, esgs_ring_bo, esgs_ring_size,
+					gsvs_ring_bo, gsvs_ring_size);
+		radv_emit_tess_factor_ring(queue, cs, hs_offchip_param,
+					   tess_factor_ring_size, tess_rings_bo);
+		radv_emit_global_shader_pointers(queue, cs, descriptor_bo);
+		radv_emit_compute_scratch(queue, cs, compute_scratch_bo);
 
 		if (i == 0) {
 			si_cs_emit_cache_flush(cs,
@@ -2233,7 +2294,8 @@
 			                       RADV_CMD_FLAG_INV_ICACHE |
 			                       RADV_CMD_FLAG_INV_SMEM_L1 |
 			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2, 0);
+			                       RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
 		} else if (i == 1) {
 			si_cs_emit_cache_flush(cs,
 			                       queue->device->physical_device->rad_info.chip_class,
@@ -2243,7 +2305,8 @@
 			                       RADV_CMD_FLAG_INV_ICACHE |
 			                       RADV_CMD_FLAG_INV_SMEM_L1 |
 			                       RADV_CMD_FLAG_INV_VMEM_L1 |
-			                       RADV_CMD_FLAG_INV_GLOBAL_L2, 0);
+			                       RADV_CMD_FLAG_INV_GLOBAL_L2 |
+					       RADV_CMD_FLAG_START_PIPELINE_STATS, 0);
 		}
 
 		if (!queue->device->ws->cs_finalize(cs))
@@ -2328,10 +2391,11 @@
 		queue->device->ws->buffer_destroy(gsvs_ring_bo);
 	if (tess_rings_bo && tess_rings_bo != queue->tess_rings_bo)
 		queue->device->ws->buffer_destroy(tess_rings_bo);
-	return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+	return vk_error(queue->device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 }
 
-static VkResult radv_alloc_sem_counts(struct radv_winsys_sem_counts *counts,
+static VkResult radv_alloc_sem_counts(struct radv_instance *instance,
+				      struct radv_winsys_sem_counts *counts,
 				      int num_sems,
 				      const VkSemaphore *sems,
 				      VkFence _fence,
@@ -2360,14 +2424,14 @@
 	if (counts->syncobj_count) {
 		counts->syncobj = (uint32_t *)malloc(sizeof(uint32_t) * counts->syncobj_count);
 		if (!counts->syncobj)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 	}
 
 	if (counts->sem_count) {
 		counts->sem = (struct radeon_winsys_sem **)malloc(sizeof(struct radeon_winsys_sem *) * counts->sem_count);
 		if (!counts->sem) {
 			free(counts->syncobj);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	}
 
@@ -2396,7 +2460,8 @@
 	return VK_SUCCESS;
 }
 
-void radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
+static void
+radv_free_sem_info(struct radv_winsys_sem_info *sem_info)
 {
 	free(sem_info->wait.syncobj);
 	free(sem_info->wait.sem);
@@ -2419,20 +2484,22 @@
 	}
 }
 
-VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
-			     int num_wait_sems,
-			     const VkSemaphore *wait_sems,
-			     int num_signal_sems,
-			     const VkSemaphore *signal_sems,
-			     VkFence fence)
+static VkResult
+radv_alloc_sem_info(struct radv_instance *instance,
+		    struct radv_winsys_sem_info *sem_info,
+		    int num_wait_sems,
+		    const VkSemaphore *wait_sems,
+		    int num_signal_sems,
+		    const VkSemaphore *signal_sems,
+		    VkFence fence)
 {
 	VkResult ret;
 	memset(sem_info, 0, sizeof(*sem_info));
 
-	ret = radv_alloc_sem_counts(&sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
+	ret = radv_alloc_sem_counts(instance, &sem_info->wait, num_wait_sems, wait_sems, VK_NULL_HANDLE, true);
 	if (ret)
 		return ret;
-	ret = radv_alloc_sem_counts(&sem_info->signal, num_signal_sems, signal_sems, fence, false);
+	ret = radv_alloc_sem_counts(instance, &sem_info->signal, num_signal_sems, signal_sems, fence, false);
 	if (ret)
 		radv_free_sem_info(sem_info);
 
@@ -2450,7 +2517,7 @@
 	VkResult result;
 	struct radv_winsys_sem_info sem_info;
 
-	result = radv_alloc_sem_info(&sem_info, 0, NULL, 0, NULL,
+	result = radv_alloc_sem_info(queue->device->instance, &sem_info, 0, NULL, 0, NULL,
 	                             radv_fence_to_handle(fence));
 	if (result != VK_SUCCESS)
 		return result;
@@ -2461,9 +2528,8 @@
 	                                   false, fence->fence);
 	radv_free_sem_info(&sem_info);
 
-	/* TODO: find a better error */
 	if (ret)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(queue->device->instance, VK_ERROR_DEVICE_LOST);
 
 	return VK_SUCCESS;
 }
@@ -2483,7 +2549,7 @@
 	uint32_t scratch_size = 0;
 	uint32_t compute_scratch_size = 0;
 	uint32_t esgs_ring_size = 0, gsvs_ring_size = 0;
-	struct radeon_winsys_cs *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
+	struct radeon_cmdbuf *initial_preamble_cs = NULL, *initial_flush_preamble_cs = NULL, *continue_preamble_cs = NULL;
 	VkResult result;
 	bool fence_emitted = false;
 	bool tess_rings_needed = false;
@@ -2514,13 +2580,14 @@
 		return result;
 
 	for (uint32_t i = 0; i < submitCount; i++) {
-		struct radeon_winsys_cs **cs_array;
+		struct radeon_cmdbuf **cs_array;
 		bool do_flush = !i || pSubmits[i].pWaitDstStageMask;
 		bool can_patch = true;
 		uint32_t advance;
 		struct radv_winsys_sem_info sem_info;
 
-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pSubmits[i].waitSemaphoreCount,
 					     pSubmits[i].pWaitSemaphores,
 					     pSubmits[i].signalSemaphoreCount,
@@ -2546,7 +2613,7 @@
 			continue;
 		}
 
-		cs_array = malloc(sizeof(struct radeon_winsys_cs *) *
+		cs_array = malloc(sizeof(struct radeon_cmdbuf *) *
 					        (pSubmits[i].commandBufferCount));
 
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) {
@@ -2562,7 +2629,9 @@
 		}
 
 		for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j += advance) {
-			struct radeon_winsys_cs *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			struct radeon_cmdbuf *initial_preamble = (do_flush && !j) ? initial_flush_preamble_cs : initial_preamble_cs;
+			const struct radv_winsys_bo_list *bo_list = NULL;
+
 			advance = MIN2(max_cs_submission,
 				       pSubmits[i].commandBufferCount - j);
 
@@ -2572,12 +2641,14 @@
 			sem_info.cs_emit_wait = j == 0;
 			sem_info.cs_emit_signal = j + advance == pSubmits[i].commandBufferCount;
 
-			if (unlikely(queue->device->use_global_bo_list))
+			if (unlikely(queue->device->use_global_bo_list)) {
 				pthread_mutex_lock(&queue->device->bo_list.mutex);
+				bo_list = &queue->device->bo_list.list;
+			}
 
 			ret = queue->device->ws->cs_submit(ctx, queue->queue_idx, cs_array + j,
 							advance, initial_preamble, continue_preamble_cs,
-							&sem_info, &queue->device->bo_list.list,
+							&sem_info, bo_list,
 							can_patch, base_fence);
 
 			if (unlikely(queue->device->use_global_bo_list))
@@ -2602,7 +2673,9 @@
 
 	if (fence) {
 		if (!fence_emitted) {
-			radv_signal_fence(queue, fence);
+			result = radv_signal_fence(queue, fence);
+			if (result != VK_SUCCESS)
+				return result;
 		}
 		fence->submitted = true;
 	}
@@ -2761,7 +2834,7 @@
 	mem = vk_alloc2(&device->alloc, pAllocator, sizeof(*mem), 8,
 			  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (mem == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	if (wsi_info && wsi_info->implicit_sync)
 		flags |= RADEON_FLAG_IMPLICIT_SYNC;
@@ -2899,7 +2972,7 @@
 		return VK_SUCCESS;
 	}
 
-	return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
+	return vk_error(device->instance, VK_ERROR_MEMORY_MAP_FAILED);
 }
 
 void radv_UnmapMemory(
@@ -3159,6 +3232,8 @@
 	RADV_FROM_HANDLE(radv_queue, queue, _queue);
 	struct radeon_winsys_fence *base_fence = fence ? fence->fence : NULL;
 	bool fence_emitted = false;
+	VkResult result;
+	int ret;
 
 	for (uint32_t i = 0; i < bindInfoCount; ++i) {
 		struct radv_winsys_sem_info sem_info;
@@ -3173,7 +3248,8 @@
 		}
 
 		VkResult result;
-		result = radv_alloc_sem_info(&sem_info,
+		result = radv_alloc_sem_info(queue->device->instance,
+					     &sem_info,
 					     pBindInfo[i].waitSemaphoreCount,
 					     pBindInfo[i].pWaitSemaphores,
 					     pBindInfo[i].signalSemaphoreCount,
@@ -3183,11 +3259,16 @@
 			return result;
 
 		if (pBindInfo[i].waitSemaphoreCount || pBindInfo[i].signalSemaphoreCount) {
-			queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
-			                             &queue->device->empty_cs[queue->queue_family_index],
-			                             1, NULL, NULL,
-						     &sem_info, NULL,
-			                             false, base_fence);
+			ret = queue->device->ws->cs_submit(queue->hw_ctx, queue->queue_idx,
+							  &queue->device->empty_cs[queue->queue_family_index],
+							  1, NULL, NULL,
+							  &sem_info, NULL,
+							  false, base_fence);
+			if (ret) {
+				radv_loge("failed to submit CS %d\n", i);
+				abort();
+			}
+
 			fence_emitted = true;
 			if (fence)
 				fence->submitted = true;
@@ -3199,7 +3280,9 @@
 
 	if (fence) {
 		if (!fence_emitted) {
-			radv_signal_fence(queue, fence);
+			result = radv_signal_fence(queue, fence);
+			if (result != VK_SUCCESS)
+				return result;
 		}
 		fence->submitted = true;
 	}
@@ -3224,8 +3307,9 @@
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!fence)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
+	fence->fence_wsi = NULL;
 	fence->submitted = false;
 	fence->signalled = !!(pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT);
 	fence->temp_syncobj = 0;
@@ -3233,7 +3317,7 @@
 		int ret = device->ws->create_syncobj(device->ws, &fence->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		if (pCreateInfo->flags & VK_FENCE_CREATE_SIGNALED_BIT) {
 			device->ws->signal_syncobj(device->ws, fence->syncobj);
@@ -3243,7 +3327,7 @@
 		fence->fence = device->ws->create_fence();
 		if (!fence->fence) {
 			vk_free2(&device->alloc, pAllocator, fence);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		fence->syncobj = 0;
 	}
@@ -3270,6 +3354,8 @@
 		device->ws->destroy_syncobj(device->ws, fence->syncobj);
 	if (fence->fence)
 		device->ws->destroy_fence(fence->fence);
+	if (fence->fence_wsi)
+		fence->fence_wsi->destroy(fence->fence_wsi);
 	vk_free2(&device->alloc, pAllocator, fence);
 }
 
@@ -3295,7 +3381,19 @@
 {
 	for (uint32_t i = 0; i < fenceCount; ++i) {
 		RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
-		if (fence->syncobj || fence->temp_syncobj || (!fence->signalled && !fence->submitted))
+		if (fence->fence == NULL || fence->syncobj ||
+		    fence->temp_syncobj ||
+		    (!fence->signalled && !fence->submitted))
+			return false;
+	}
+	return true;
+}
+
+static bool radv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
+{
+	for (uint32_t i = 0; i < fenceCount; ++i) {
+		RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
+		if (fence->syncobj == 0 && fence->temp_syncobj == 0)
 			return false;
 	}
 	return true;
@@ -3311,10 +3409,12 @@
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	timeout = radv_get_absolute_timeout(timeout);
 
-	if (device->always_use_syncobj) {
+	if (device->always_use_syncobj &&
+	    radv_all_fences_syncobj(fenceCount, pFences))
+	{
 		uint32_t *handles = malloc(sizeof(uint32_t) * fenceCount);
 		if (!handles)
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 		for (uint32_t i = 0; i < fenceCount; ++i) {
 			RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
@@ -3333,7 +3433,7 @@
 			uint32_t wait_count = 0;
 			struct radeon_winsys_fence **fences = malloc(sizeof(struct radeon_winsys_fence *) * fenceCount);
 			if (!fences)
-				return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+				return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 			for (uint32_t i = 0; i < fenceCount; ++i) {
 				RADV_FROM_HANDLE(radv_fence, fence, pFences[i]);
@@ -3381,21 +3481,34 @@
 		if (fence->signalled)
 			continue;
 
-		if (!fence->submitted) {
-			while(radv_get_current_time() <= timeout && !fence->submitted)
-				/* Do nothing */;
+		if (fence->fence) {
+			if (!fence->submitted) {
+				while(radv_get_current_time() <= timeout &&
+				      !fence->submitted)
+					/* Do nothing */;
 
-			if (!fence->submitted)
+				if (!fence->submitted)
+					return VK_TIMEOUT;
+
+				/* Recheck as it may have been set by
+				 * submitting operations. */
+
+				if (fence->signalled)
+					continue;
+			}
+
+			expired = device->ws->fence_wait(device->ws,
+							 fence->fence,
+							 true, timeout);
+			if (!expired)
 				return VK_TIMEOUT;
-
-			/* Recheck as it may have been set by submitting operations. */
-			if (fence->signalled)
-				continue;
 		}
 
-		expired = device->ws->fence_wait(device->ws, fence->fence, true, timeout);
-		if (!expired)
-			return VK_TIMEOUT;
+		if (fence->fence_wsi) {
+			VkResult result = fence->fence_wsi->wait(fence->fence_wsi, timeout);
+			if (result != VK_SUCCESS)
+				return result;
+		}
 
 		fence->signalled = true;
 	}
@@ -3447,9 +3560,19 @@
 		return VK_SUCCESS;
 	if (!fence->submitted)
 		return VK_NOT_READY;
-	if (!device->ws->fence_wait(device->ws, fence->fence, false, 0))
-		return VK_NOT_READY;
+	if (fence->fence) {
+		if (!device->ws->fence_wait(device->ws, fence->fence, false, 0))
+			return VK_NOT_READY;
+	}
+	if (fence->fence_wsi) {
+		VkResult result = fence->fence_wsi->wait(fence->fence_wsi, 0);
 
+		if (result != VK_SUCCESS) {
+			if (result == VK_TIMEOUT)
+				return VK_NOT_READY;
+			return result;
+		}
+	}
 	return VK_SUCCESS;
 }
 
@@ -3472,7 +3595,7 @@
 					       sizeof(*sem), 8,
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sem)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	sem->temp_syncobj = 0;
 	/* create a syncobject if we are going to export this semaphore */
@@ -3481,14 +3604,14 @@
 		int ret = device->ws->create_syncobj(device->ws, &sem->syncobj);
 		if (ret) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->sem = NULL;
 	} else {
 		sem->sem = device->ws->create_sem(device->ws);
 		if (!sem->sem) {
 			vk_free2(&device->alloc, pAllocator, sem);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 		sem->syncobj = 0;
 	}
@@ -3526,14 +3649,14 @@
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!event)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	event->bo = device->ws->buffer_create(device->ws, 8, 8,
 					      RADEON_DOMAIN_GTT,
 					      RADEON_FLAG_VA_UNCACHED | RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING);
 	if (!event->bo) {
 		vk_free2(&device->alloc, pAllocator, event);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 
 	event->map = (uint64_t*)device->ws->buffer_map(event->bo);
@@ -3602,7 +3725,7 @@
 	buffer = vk_alloc2(&device->alloc, pAllocator, sizeof(*buffer), 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (buffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	buffer->size = pCreateInfo->size;
 	buffer->usage = pCreateInfo->usage;
@@ -3619,7 +3742,7 @@
 		                                       4096, 0, RADEON_FLAG_VIRTUAL);
 		if (!buffer->bo) {
 			vk_free2(&device->alloc, pAllocator, buffer);
-			return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 		}
 	}
 
@@ -4107,7 +4230,7 @@
 	framebuffer = vk_alloc2(&device->alloc, pAllocator, size, 8,
 				  VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (framebuffer == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	framebuffer->attachment_count = pCreateInfo->attachmentCount;
 	framebuffer->width = pCreateInfo->width;
@@ -4325,7 +4448,7 @@
 	sampler = vk_alloc2(&device->alloc, pAllocator, sizeof(*sampler), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!sampler)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	radv_init_sampler(device, sampler, pCreateInfo);
 	*pSampler = radv_sampler_to_handle(sampler);
@@ -4407,7 +4530,7 @@
 
 	bool ret = radv_get_memory_fd(device, memory, pFD);
 	if (ret == false)
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	return VK_SUCCESS;
 }
 
@@ -4416,6 +4539,8 @@
 				       int fd,
 				       VkMemoryFdPropertiesKHR *pMemoryFdProperties)
 {
+   RADV_FROM_HANDLE(radv_device, device, _device);
+
    switch (handleType) {
    case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT:
       pMemoryFdProperties->memoryTypeBits = (1 << RADV_MEM_TYPE_COUNT) - 1;
@@ -4429,7 +4554,7 @@
        *
        * So opaque handle types fall into the default "unsupported" case.
        */
-      return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+      return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
    }
 }
 
@@ -4440,7 +4565,7 @@
 	uint32_t syncobj_handle = 0;
 	int ret = device->ws->import_syncobj(device->ws, fd, &syncobj_handle);
 	if (ret != 0)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 
 	if (*syncobj)
 		device->ws->destroy_syncobj(device->ws, *syncobj);
@@ -4461,7 +4586,7 @@
 	if (!syncobj_handle) {
 		int ret = device->ws->create_syncobj(device->ws, &syncobj_handle);
 		if (ret) {
-			return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+			return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 		}
 	}
 
@@ -4470,7 +4595,7 @@
 	} else {
 		int ret = device->ws->import_syncobj_from_sync_file(device->ws, syncobj_handle, fd);
 	if (ret != 0)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	}
 
 	*syncobj = syncobj_handle;
@@ -4537,7 +4662,7 @@
 	}
 
 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	return VK_SUCCESS;
 }
 
@@ -4626,7 +4751,7 @@
 	}
 
 	if (ret)
-		return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
+		return vk_error(device->instance, VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
 	return VK_SUCCESS;
 }
 
diff --git a/src/amd/vulkan/radv_entrypoints_gen.py b/src/amd/vulkan/radv_entrypoints_gen.py
index 802c739..377b544 100644
--- a/src/amd/vulkan/radv_entrypoints_gen.py
+++ b/src/amd/vulkan/radv_entrypoints_gen.py
@@ -116,7 +116,7 @@
    uint32_t num;
 };
 
-/* We use a big string constant to avoid lots of reloctions from the entry
+/* We use a big string constant to avoid lots of relocations from the entry
  * point table to lots of little strings. The entries in the entry point table
  * store the index into this big string.
  */
@@ -136,7 +136,7 @@
 /* Hash table stats:
  * size ${len(strmap.sorted_strings)} entries
  * collisions entries:
-% for i in xrange(10):
+% for i in range(10):
  *     ${i}${'+' if i == 9 else ' '}     ${strmap.collisions[i]}
 % endfor
  */
@@ -430,7 +430,7 @@
             e_clone.name = e.name
             entrypoints[e.name] = e_clone
 
-    return [e for e in entrypoints.itervalues() if e.enabled]
+    return [e for e in entrypoints.values() if e.enabled]
 
 
 def get_entrypoints_defines(doc):
@@ -446,7 +446,10 @@
 
     for extension in doc.findall('./extensions/extension[@platform]'):
         platform = extension.attrib['platform']
-        define = 'VK_USE_PLATFORM_' + platform.upper() + '_KHR'
+        ext = '_KHR'
+        if platform.upper() == 'XLIB_XRANDR':
+            ext = '_EXT'
+        define = 'VK_USE_PLATFORM_' + platform.upper() + ext
 
         for entrypoint in extension.findall('./require/command'):
             fullname = entrypoint.attrib['name']
diff --git a/src/amd/vulkan/radv_extensions.py b/src/amd/vulkan/radv_extensions.py
index dbe0ff4..028d10f 100644
--- a/src/amd/vulkan/radv_extensions.py
+++ b/src/amd/vulkan/radv_extensions.py
@@ -51,11 +51,14 @@
 # and dEQP-VK.api.info.device fail due to the duplicated strings.
 EXTENSIONS = [
     Extension('VK_ANDROID_native_buffer',                 5, 'ANDROID && device->rad_info.has_syncobj_wait_for_submit'),
+    Extension('VK_KHR_16bit_storage',                     1, 'HAVE_LLVM >= 0x0700'),
     Extension('VK_KHR_bind_memory2',                      1, True),
+    Extension('VK_KHR_create_renderpass2',                1, True),
     Extension('VK_KHR_dedicated_allocation',              1, True),
     Extension('VK_KHR_descriptor_update_template',        1, True),
     Extension('VK_KHR_device_group',                      1, True),
     Extension('VK_KHR_device_group_creation',             1, True),
+    Extension('VK_KHR_draw_indirect_count',               1, True),
     Extension('VK_KHR_external_fence',                    1, 'device->rad_info.has_syncobj_wait_for_submit'),
     Extension('VK_KHR_external_fence_capabilities',       1, True),
     Extension('VK_KHR_external_fence_fd',                 1, 'device->rad_info.has_syncobj_wait_for_submit'),
@@ -65,6 +68,7 @@
     Extension('VK_KHR_external_semaphore',                1, 'device->rad_info.has_syncobj'),
     Extension('VK_KHR_external_semaphore_capabilities',   1, True),
     Extension('VK_KHR_external_semaphore_fd',             1, 'device->rad_info.has_syncobj'),
+    Extension('VK_KHR_get_display_properties2',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_KHR_get_memory_requirements2',          1, True),
     Extension('VK_KHR_get_physical_device_properties2',   1, True),
     Extension('VK_KHR_get_surface_capabilities2',         1, 'RADV_HAS_SURFACE'),
@@ -85,6 +89,12 @@
     Extension('VK_KHR_xcb_surface',                       6, 'VK_USE_PLATFORM_XCB_KHR'),
     Extension('VK_KHR_xlib_surface',                      6, 'VK_USE_PLATFORM_XLIB_KHR'),
     Extension('VK_KHR_multiview',                         1, True),
+    Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
+    Extension('VK_EXT_conditional_rendering',             1, True),
+    Extension('VK_EXT_display_surface_counter',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_display_control',                   1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_EXT_debug_report',                      9, True),
     Extension('VK_EXT_depth_range_unrestricted',          1, True),
     Extension('VK_EXT_descriptor_indexing',               2, True),
@@ -94,7 +104,8 @@
     Extension('VK_EXT_global_priority',                   1, 'device->rad_info.has_ctx_priority'),
     Extension('VK_EXT_sampler_filter_minmax',             1, 'device->rad_info.chip_class >= CIK'),
     Extension('VK_EXT_shader_viewport_index_layer',       1, True),
-    Extension('VK_EXT_vertex_attribute_divisor',          1, True),
+    Extension('VK_EXT_shader_stencil_export',             1, True),
+    Extension('VK_EXT_vertex_attribute_divisor',          3, True),
     Extension('VK_AMD_draw_indirect_count',               1, True),
     Extension('VK_AMD_gcn_shader',                        1, True),
     Extension('VK_AMD_rasterization_order',               1, 'device->has_out_of_order_rast'),
@@ -195,9 +206,9 @@
    };
 };
 
-const VkExtensionProperties radv_instance_extensions[RADV_INSTANCE_EXTENSION_COUNT];
-const VkExtensionProperties radv_device_extensions[RADV_DEVICE_EXTENSION_COUNT];
-const struct radv_instance_extension_table radv_supported_instance_extensions;
+extern const VkExtensionProperties radv_instance_extensions[RADV_INSTANCE_EXTENSION_COUNT];
+extern const VkExtensionProperties radv_device_extensions[RADV_DEVICE_EXTENSION_COUNT];
+extern const struct radv_instance_extension_table radv_supported_instance_extensions;
 
 
 struct radv_physical_device;
@@ -213,12 +224,12 @@
 #include "vk_util.h"
 
 /* Convert the VK_USE_PLATFORM_* defines to booleans */
-%for platform in ['ANDROID', 'WAYLAND', 'XCB', 'XLIB']:
-#ifdef VK_USE_PLATFORM_${platform}_KHR
-#   undef VK_USE_PLATFORM_${platform}_KHR
-#   define VK_USE_PLATFORM_${platform}_KHR true
+%for platform in ['ANDROID_KHR', 'WAYLAND_KHR', 'XCB_KHR', 'XLIB_KHR', 'DISPLAY_KHR', 'XLIB_XRANDR_EXT']:
+#ifdef VK_USE_PLATFORM_${platform}
+#   undef VK_USE_PLATFORM_${platform}
+#   define VK_USE_PLATFORM_${platform} true
 #else
-#   define VK_USE_PLATFORM_${platform}_KHR false
+#   define VK_USE_PLATFORM_${platform} false
 #endif
 %endfor
 
@@ -232,7 +243,9 @@
 
 #define RADV_HAS_SURFACE (VK_USE_PLATFORM_WAYLAND_KHR || \\
                          VK_USE_PLATFORM_XCB_KHR || \\
-                         VK_USE_PLATFORM_XLIB_KHR)
+                         VK_USE_PLATFORM_XLIB_KHR || \\
+                         VK_USE_PLATFORM_DISPLAY_KHR)
+
 
 const VkExtensionProperties radv_instance_extensions[RADV_INSTANCE_EXTENSION_COUNT] = {
 %for ext in instance_extensions:
diff --git a/src/amd/vulkan/radv_formats.c b/src/amd/vulkan/radv_formats.c
index 77ef222..6253c27 100644
--- a/src/amd/vulkan/radv_formats.c
+++ b/src/amd/vulkan/radv_formats.c
@@ -224,6 +224,28 @@
 		}
 	}
 
+	if (desc->layout == VK_FORMAT_LAYOUT_ETC) {
+		switch (format) {
+		case VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK:
+		case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+			return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
+		case VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK:
+		case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+			return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
+		case VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK:
+		case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
+			return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
+		case VK_FORMAT_EAC_R11_UNORM_BLOCK:
+		case VK_FORMAT_EAC_R11_SNORM_BLOCK:
+			return V_008F14_IMG_DATA_FORMAT_ETC2_R;
+		case VK_FORMAT_EAC_R11G11_UNORM_BLOCK:
+		case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
+			return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
+		default:
+			break;
+		}
+	}
+
 	if (format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
 		return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
 	} else if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
@@ -321,10 +343,8 @@
 			return V_008F14_IMG_DATA_FORMAT_32;
 		case 2:
 			return V_008F14_IMG_DATA_FORMAT_32_32;
-#if 0 /* Not supported for render targets */
 		case 3:
 			return V_008F14_IMG_DATA_FORMAT_32_32_32;
-#endif
 		case 4:
 			return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
 		}
@@ -351,10 +371,15 @@
 				case VK_FORMAT_BC2_SRGB_BLOCK:
 				case VK_FORMAT_BC3_SRGB_BLOCK:
 				case VK_FORMAT_BC7_SRGB_BLOCK:
+				case VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK:
+				case VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK:
+				case VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK:
 					return V_008F14_IMG_NUM_FORMAT_SRGB;
 				case VK_FORMAT_BC4_SNORM_BLOCK:
 				case VK_FORMAT_BC5_SNORM_BLOCK:
 			        case VK_FORMAT_BC6H_SFLOAT_BLOCK:
+				case VK_FORMAT_EAC_R11_SNORM_BLOCK:
+				case VK_FORMAT_EAC_R11G11_SNORM_BLOCK:
 					return V_008F14_IMG_NUM_FORMAT_SNORM;
 				default:
 					return V_008F14_IMG_NUM_FORMAT_UNORM;
@@ -586,6 +611,16 @@
 		return;
 	}
 
+	if (desc->layout == VK_FORMAT_LAYOUT_ETC &&
+	    physical_device->rad_info.family != CHIP_VEGA10 &&
+	    physical_device->rad_info.family != CHIP_RAVEN &&
+	    physical_device->rad_info.family != CHIP_STONEY) {
+		out_properties->linearTilingFeatures = linear;
+		out_properties->optimalTilingFeatures = tiled;
+		out_properties->bufferFeatures = buffer;
+		return;
+	}
+
 	if (radv_is_storage_image_format_supported(physical_device, format)) {
 		tiled |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
 		linear |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT;
@@ -638,13 +673,17 @@
 				tiled |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
 			}
 		}
-		if (tiled && util_is_power_of_two_or_zero(vk_format_get_blocksize(format)) && !scaled) {
+		if (tiled && !scaled) {
 			tiled |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
 			         VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
 		}
+
+		/* Tiled formatting does not support NPOT pixel sizes */
+		if (!util_is_power_of_two_or_zero(vk_format_get_blocksize(format)))
+			tiled = 0;
 	}
 
-	if (linear && util_is_power_of_two_or_zero(vk_format_get_blocksize(format)) && !scaled) {
+	if (linear && !scaled) {
 		linear |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT_KHR |
 		          VK_FORMAT_FEATURE_TRANSFER_DST_BIT_KHR;
 	}
@@ -878,194 +917,87 @@
 				  uint32_t clear_vals[2],
 				  VkClearColorValue *value)
 {
-	uint8_t r = 0, g = 0, b = 0, a = 0;
 	const struct vk_format_description *desc = vk_format_description(format);
 
-	if (vk_format_get_component_bits(format, VK_FORMAT_COLORSPACE_RGB, 0) <= 8) {
-		if (desc->colorspace == VK_FORMAT_COLORSPACE_RGB) {
-			r = float_to_ubyte(value->float32[0]);
-			g = float_to_ubyte(value->float32[1]);
-			b = float_to_ubyte(value->float32[2]);
-			a = float_to_ubyte(value->float32[3]);
-		} else if (desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
-			r = util_format_linear_float_to_srgb_8unorm(value->float32[0]);
-			g = util_format_linear_float_to_srgb_8unorm(value->float32[1]);
-			b = util_format_linear_float_to_srgb_8unorm(value->float32[2]);
-			a = float_to_ubyte(value->float32[3]);
-		}
-	}
-	switch (format) {
-	case VK_FORMAT_R8_UNORM:
-	case VK_FORMAT_R8_SRGB:
-		clear_vals[0] = r;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_UNORM:
-	case VK_FORMAT_R8G8_SRGB:
-		clear_vals[0] = r | g << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_SRGB:
-	case VK_FORMAT_R8G8B8A8_UNORM:
-		clear_vals[0] = r | g << 8 | b << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_B8G8R8A8_SRGB:
-	case VK_FORMAT_B8G8R8A8_UNORM:
-		clear_vals[0] = b | g << 8 | r << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
-	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
-		clear_vals[0] = r | g << 8 | b << 16 | a << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[0] |= (value->int32[1] & 0xff) << 8;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_UINT:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->uint32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->uint32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R8G8B8A8_SINT:
-		clear_vals[0] = value->int32[0] & 0xff;
-		clear_vals[0] |= (value->int32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->int32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->int32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
-		clear_vals[0] = value->uint32[0] & 0xff;
-		clear_vals[0] |= (value->uint32[1] & 0xff) << 8;
-		clear_vals[0] |= (value->uint32[2] & 0xff) << 16;
-		clear_vals[0] |= (value->uint32[3] & 0xff) << 24;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[0] |= (value->uint32[1] & 0xffff) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_UINT:
-		clear_vals[0] = value->uint32[0] & 0xffff;
-		clear_vals[0] |= (value->uint32[1] & 0xffff) << 16;
-		clear_vals[1] = value->uint32[2] & 0xffff;
-		clear_vals[1] |= (value->uint32[3] & 0xffff) << 16;
-		break;
-	case VK_FORMAT_R32_UINT:
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R32G32_UINT:
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = value->uint32[1];
-		break;
-	case VK_FORMAT_R32_SINT:
-		clear_vals[0] = value->int32[0];
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_SFLOAT:
-		clear_vals[0] = util_float_to_half(value->float32[0]);
-		clear_vals[0] |= (uint32_t)util_float_to_half(value->float32[1]) << 16;
-		clear_vals[1] = util_float_to_half(value->float32[2]);
-		clear_vals[1] |= (uint32_t)util_float_to_half(value->float32[3]) << 16;
-		break;
-	case VK_FORMAT_R16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16;
-		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R16G16B16A16_UNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0xffff)) << 16;
-		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0xffff)) & 0xffff;
-		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0xffff)) << 16;
-		break;
-	case VK_FORMAT_R16G16B16A16_SNORM:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
-		clear_vals[0] |= ((uint16_t)util_iround(CLAMP(value->float32[1], -1.0f, 1.0f) * 0x7fff)) << 16;
-		clear_vals[1] = ((uint16_t)util_iround(CLAMP(value->float32[2], -1.0f, 1.0f) * 0x7fff)) & 0xffff;
-		clear_vals[1] |= ((uint16_t)util_iround(CLAMP(value->float32[3], -1.0f, 1.0f) * 0x7fff)) << 16;
-		break;
-	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
-		clear_vals[0] = ((uint16_t)util_iround(CLAMP(value->float32[0], 0.0f, 1.0f) * 0x3ff)) & 0x3ff;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[1], 0.0f, 1.0f) * 0x3ff)) & 0x3ff) << 10;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[2], 0.0f, 1.0f) * 0x3ff)) & 0x3ff) << 20;
-		clear_vals[0] |= (((uint16_t)util_iround(CLAMP(value->float32[3], 0.0f, 1.0f) * 0x3)) & 0x3) << 30;
-		clear_vals[1] = 0;
-		return true;
-	case VK_FORMAT_R32G32_SFLOAT:
-		clear_vals[0] = fui(value->float32[0]);
-		clear_vals[1] = fui(value->float32[1]);
-		break;
-	case VK_FORMAT_R32_SFLOAT:
-		clear_vals[1] = 0;
-		clear_vals[0] = fui(value->float32[0]);
-		break;
-	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
+	if (format == VK_FORMAT_B10G11R11_UFLOAT_PACK32) {
 		clear_vals[0] = float3_to_r11g11b10f(value->float32);
 		clear_vals[1] = 0;
-		break;
-	case VK_FORMAT_R32G32B32A32_SFLOAT:
-		if (value->float32[0] != value->float32[1] ||
-		    value->float32[0] != value->float32[2])
-			return false;
-		clear_vals[0] = fui(value->float32[0]);
-		clear_vals[1] = fui(value->float32[3]);
-		break;
-	case VK_FORMAT_R32G32B32A32_UINT:
-		if (value->uint32[0] != value->uint32[1] ||
-		    value->uint32[0] != value->uint32[2])
-			return false;
-		clear_vals[0] = value->uint32[0];
-		clear_vals[1] = value->uint32[3];
-		break;
-	case VK_FORMAT_R32G32B32A32_SINT:
-		if (value->int32[0] != value->int32[1] ||
-		    value->int32[0] != value->int32[2])
-			return false;
-		clear_vals[0] = value->int32[0];
-		clear_vals[1] = value->int32[3];
-		break;
-	default:
-		fprintf(stderr, "failed to fast clear %d\n", format);
+		return true;
+	}
+
+	if (desc->layout != VK_FORMAT_LAYOUT_PLAIN) {
+		fprintf(stderr, "failed to fast clear for non-plain format %d\n", format);
 		return false;
 	}
+
+	if (!util_is_power_of_two_or_zero(desc->block.bits)) {
+		fprintf(stderr, "failed to fast clear for NPOT format %d\n", format);
+		return false;
+	}
+
+	if (desc->block.bits > 64) {
+		/*
+		 * We have a 128-bit format; check whether the first 3 components are the same.
+		 * Every element has to be 32 bits since we don't support 64-bit formats,
+		 * and we can skip swizzling checks as alpha always comes last for these and
+		 * we do not care about the rest as they have to be the same.
+		 */
+		if (desc->channel[0].type == VK_FORMAT_TYPE_FLOAT) {
+			if (value->float32[0] != value->float32[1] ||
+			    value->float32[0] != value->float32[2])
+				return false;
+		} else {
+			if (value->uint32[0] != value->uint32[1] ||
+			    value->uint32[0] != value->uint32[2])
+				return false;
+		}
+		clear_vals[0] = value->uint32[0];
+		clear_vals[1] = value->uint32[3];
+		return true;
+	}
+	uint64_t clear_val = 0;
+
+	for (unsigned c = 0; c < 4; ++c) {
+		if (desc->swizzle[c] >= 4)
+			continue;
+
+		const struct vk_format_channel_description *channel = &desc->channel[desc->swizzle[c]];
+		assert(channel->size);
+
+		uint64_t v = 0;
+		if (channel->pure_integer) {
+			v = value->uint32[c]  & ((1ULL << channel->size) - 1);
+		} else if (channel->normalized) {
+			if (channel->type == VK_FORMAT_TYPE_UNSIGNED &&
+			    desc->swizzle[c] < 3 &&
+			    desc->colorspace == VK_FORMAT_COLORSPACE_SRGB) {
+				assert(channel->size == 8);
+
+				v = util_format_linear_float_to_srgb_8unorm(value->float32[c]);
+			} else if (channel->type == VK_FORMAT_TYPE_UNSIGNED) {
+				v = MAX2(MIN2(value->float32[c], 1.0f), 0.0f) * ((1ULL << channel->size) - 1);
+			} else  {
+				v = MAX2(MIN2(value->float32[c], 1.0f), -1.0f) * ((1ULL << (channel->size - 1)) - 1);
+			}
+		} else if (channel->type == VK_FORMAT_TYPE_FLOAT) {
+			if (channel->size == 32) {
+				memcpy(&v, &value->float32[c], 4);
+			} else if(channel->size == 16) {
+				v = util_float_to_half(value->float32[c]);
+			} else {
+				fprintf(stderr, "failed to fast clear for unhandled float size in format %d\n", format);
+				return false;
+			}
+		} else {
+			fprintf(stderr, "failed to fast clear for unhandled component type in format %d\n", format);
+			return false;
+		}
+		clear_val |= (v & ((1ULL << channel->size) - 1)) << channel->shift;
+	}
+
+	clear_vals[0] = clear_val;
+	clear_vals[1] = clear_val >> 32;
+
 	return true;
 }
 
@@ -1344,7 +1276,7 @@
 			 *    vkGetPhysicalDeviceImageFormatProperties2KHR returns
 			 *    VK_ERROR_FORMAT_NOT_SUPPORTED.
 			 */
-			result = vk_errorf(VK_ERROR_FORMAT_NOT_SUPPORTED,
+			result = vk_errorf(physical_device->instance, VK_ERROR_FORMAT_NOT_SUPPORTED,
 					   "unsupported VkExternalMemoryTypeFlagBitsKHR 0x%x",
 					   external_info->handleType);
 			goto fail;
diff --git a/src/amd/vulkan/radv_icd.py b/src/amd/vulkan/radv_icd.py
index 78ed379..cc86bbf 100644
--- a/src/amd/vulkan/radv_icd.py
+++ b/src/amd/vulkan/radv_icd.py
@@ -44,4 +44,4 @@
     }
 
     with open(args.out, 'w') as f:
-        json.dump(json_data, f, indent = 4, sort_keys=True)
+        json.dump(json_data, f, indent = 4, sort_keys=True, separators=(',', ': '))
diff --git a/src/amd/vulkan/radv_image.c b/src/amd/vulkan/radv_image.c
index 8e7666d..ce680ec 100644
--- a/src/amd/vulkan/radv_image.c
+++ b/src/amd/vulkan/radv_image.c
@@ -110,6 +110,8 @@
 {
 	bool dcc_compatible_formats;
 	bool blendable;
+	bool shareable = vk_find_struct_const(pCreateInfo->pNext,
+	                                      EXTERNAL_MEMORY_IMAGE_CREATE_INFO_KHR) != NULL;
 
 	/* DCC (Delta Color Compression) is only available for GFX8+. */
 	if (device->physical_device->rad_info.chip_class < VI)
@@ -118,6 +120,11 @@
 	if (device->instance->debug_flags & RADV_DEBUG_NO_DCC)
 		return false;
 
+	/* FIXME: DCC is broken for shareable images starting with GFX9 */
+	if (device->physical_device->rad_info.chip_class >= GFX9 &&
+	    shareable)
+		return false;
+
 	/* TODO: Enable DCC for storage images. */
 	if ((pCreateInfo->usage & VK_IMAGE_USAGE_STORAGE_BIT) ||
 	    (pCreateInfo->flags & VK_IMAGE_CREATE_EXTENDED_USAGE_BIT_KHR))
@@ -414,7 +421,7 @@
 		else
 			return V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
 	default:
-		unreachable("illegale image type");
+		unreachable("illegal image type");
 	}
 }
 
@@ -534,7 +541,7 @@
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
 		unsigned bc_swizzle = gfx9_border_color_swizzle(swizzle);
 
-		/* Depth is the the last accessible layer on Gfx9.
+		/* Depth is the last accessible layer on Gfx9.
 		 * The hw doesn't need to know the total number of layers.
 		 */
 		if (type == V_008F1C_SQ_RSRC_IMG_3D)
@@ -619,7 +626,7 @@
 			S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) |
 			S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
-			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, 1, 0, false, false));
+			S_008F1C_TYPE(radv_tex_dim(image->type, view_type, image->info.array_size, 0, false, false));
 		fmask_state[4] = 0;
 		fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
 		fmask_state[6] = 0;
@@ -667,7 +674,7 @@
 	si_make_texture_descriptor(device, image, false,
 				   (VkImageViewType)image->type, image->vk_format,
 				   &fixedmapping, 0, image->info.levels - 1, 0,
-				   image->info.array_size,
+				   image->info.array_size - 1,
 				   image->info.width, image->info.height,
 				   image->info.depth,
 				   desc, NULL);
@@ -726,56 +733,20 @@
 			  unsigned nr_samples,
 			  struct radv_fmask_info *out)
 {
-	/* FMASK is allocated like an ordinary texture. */
-	struct radeon_surf fmask = {};
-	struct ac_surf_info info = image->info;
-	memset(out, 0, sizeof(*out));
-
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
-		out->alignment = image->surface.u.gfx9.fmask_alignment;
-		out->size = image->surface.u.gfx9.fmask_size;
+		out->alignment = image->surface.fmask_alignment;
+		out->size = image->surface.fmask_size;
+		out->tile_swizzle = image->surface.fmask_tile_swizzle;
 		return;
 	}
 
-	fmask.blk_w = image->surface.blk_w;
-	fmask.blk_h = image->surface.blk_h;
-	info.samples = 1;
-	fmask.flags = image->surface.flags | RADEON_SURF_FMASK;
-
-	if (!image->shareable)
-		info.surf_index = &device->fmask_mrt_offset_counter;
-
-	/* Force 2D tiling if it wasn't set. This may occur when creating
-	 * FMASK for MSAA resolve on R6xx. On R6xx, the single-sample
-	 * destination buffer must have an FMASK too. */
-	fmask.flags = RADEON_SURF_CLR(fmask.flags, MODE);
-	fmask.flags |= RADEON_SURF_SET(RADEON_SURF_MODE_2D, MODE);
-
-	switch (nr_samples) {
-	case 2:
-	case 4:
-		fmask.bpe = 1;
-		break;
-	case 8:
-		fmask.bpe = 4;
-		break;
-	default:
-		return;
-	}
-
-	device->ws->surface_init(device->ws, &info, &fmask);
-	assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
-
-	out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
-	if (out->slice_tile_max)
-		out->slice_tile_max -= 1;
-
-	out->tile_mode_index = fmask.u.legacy.tiling_index[0];
-	out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
-	out->bank_height = fmask.u.legacy.bankh;
-	out->tile_swizzle = fmask.tile_swizzle;
-	out->alignment = MAX2(256, fmask.surf_alignment);
-	out->size = fmask.surf_size;
+	out->slice_tile_max = image->surface.u.legacy.fmask.slice_tile_max;
+	out->tile_mode_index = image->surface.u.legacy.fmask.tiling_index;
+	out->pitch_in_pixels = image->surface.u.legacy.fmask.pitch_in_pixels;
+	out->bank_height = image->surface.u.legacy.fmask.bankh;
+	out->tile_swizzle = image->surface.fmask_tile_swizzle;
+	out->alignment = image->surface.fmask_alignment;
+	out->size = image->surface.fmask_size;
 
 	assert(!out->tile_swizzle || !image->shareable);
 }
@@ -801,8 +772,8 @@
 	unsigned cl_width, cl_height;
 
 	if (device->physical_device->rad_info.chip_class >= GFX9) {
-		out->alignment = image->surface.u.gfx9.cmask_alignment;
-		out->size = image->surface.u.gfx9.cmask_size;
+		out->alignment = image->surface.cmask_alignment;
+		out->size = image->surface.cmask_size;
 		return;
 	}
 
@@ -959,13 +930,14 @@
 	image = vk_zalloc2(&device->alloc, alloc, sizeof(*image), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!image)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	image->type = pCreateInfo->imageType;
 	image->info.width = pCreateInfo->extent.width;
 	image->info.height = pCreateInfo->extent.height;
 	image->info.depth = pCreateInfo->extent.depth;
 	image->info.samples = pCreateInfo->samples;
+	image->info.storage_samples = pCreateInfo->samples;
 	image->info.array_size = pCreateInfo->arrayLayers;
 	image->info.levels = pCreateInfo->mipLevels;
 	image->info.num_channels = vk_format_get_nr_components(pCreateInfo->format);
@@ -1043,7 +1015,7 @@
 		                                      0, RADEON_FLAG_VIRTUAL);
 		if (!image->bo) {
 			vk_free2(&device->alloc, alloc, image);
-			return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 		}
 	}
 
@@ -1358,7 +1330,7 @@
 	view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (view == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	radv_image_view_init(view, device, pCreateInfo);
 
@@ -1406,7 +1378,7 @@
 	view = vk_alloc2(&device->alloc, pAllocator, sizeof(*view), 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (!view)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	radv_buffer_view_init(view, device, pCreateInfo);
 
diff --git a/src/amd/vulkan/radv_llvm_helper.cpp b/src/amd/vulkan/radv_llvm_helper.cpp
new file mode 100644
index 0000000..ed05e11
--- /dev/null
+++ b/src/amd/vulkan/radv_llvm_helper.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright © 2018 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "ac_llvm_util.h"
+#include "ac_llvm_build.h"
+#include "radv_shader_helper.h"
+
+#include <list>
+/* Per-thread LLVM compiler state for one (GPU family, target-machine options) pair. */
+class radv_llvm_per_thread_info {
+public:
+	/* passes starts as nullptr: init() can fail before creating them and the destructor still runs. */
+	radv_llvm_per_thread_info(enum radeon_family arg_family,
+				enum ac_target_machine_options arg_tm_options)
+		: family(arg_family), tm_options(arg_tm_options), passes(nullptr) {}
+
+	~radv_llvm_per_thread_info()
+	{
+		if (passes)
+			ac_destroy_llvm_passes(passes);
+		ac_destroy_llvm_compiler(&llvm_info);
+	}
+
+	/* Set up llvm_info and then its passes; returns false if either step fails. */
+	bool init(void)
+	{
+		if (!ac_init_llvm_compiler(&llvm_info,
+					  true,
+					  family,
+					  tm_options))
+			return false;
+
+		passes = ac_create_llvm_passes(llvm_info.tm);
+		return passes != nullptr;
+	}
+
+	/* Compile module into binary with the passes created by init(). */
+	bool compile_to_memory_buffer(LLVMModuleRef module,
+				      struct ac_shader_binary *binary)
+	{
+		return ac_compile_module_to_binary(passes, module, binary);
+	}
+
+	/* True when this entry was created for the same family and options. */
+	bool is_same(enum radeon_family arg_family,
+		     enum ac_target_machine_options arg_tm_options) {
+		return arg_family == family && arg_tm_options == tm_options;
+	}
+	struct ac_llvm_compiler llvm_info;
+private:
+	enum radeon_family family;
+	enum ac_target_machine_options tm_options;
+	struct ac_compiler_passes *passes;
+};
+
+/* We have to store a linked list per thread due to the possibility of multiple GPUs being required. */
+static thread_local std::list<radv_llvm_per_thread_info> radv_llvm_per_thread_list;
+
+/* Compile module to binary. Uses the cached passes of the per-thread entry whose
+ * target machine matches info->tm; otherwise builds temporary passes, returning
+ * false when they cannot be created. */
+bool radv_compile_to_binary(struct ac_llvm_compiler *info,
+			    LLVMModuleRef module,
+			    struct ac_shader_binary *binary)
+{
+	/* Prefer this thread's cached passes when info came from the per-thread list. */
+	for (auto &I : radv_llvm_per_thread_list) {
+		if (I.llvm_info.tm == info->tm)
+			return I.compile_to_memory_buffer(module, binary);
+	}
+
+	/* ac_create_llvm_passes() can fail and return NULL; do not compile with it. */
+	struct ac_compiler_passes *passes = ac_create_llvm_passes(info->tm);
+	if (!passes)
+		return false;
+
+	bool ret = ac_compile_module_to_binary(passes, module, binary);
+	ac_destroy_llvm_passes(passes);
+	return ret;
+}
+
+/* Fill *info with a compiler for (family, tm_options). thread_compiler selects
+ * the cached per-thread list over a direct ac_init_llvm_compiler() call. */
+bool radv_init_llvm_compiler(struct ac_llvm_compiler *info,
+			     bool okay_to_leak_target_library_info,
+			     bool thread_compiler,
+			     enum radeon_family family,
+			     enum ac_target_machine_options tm_options)
+{
+	if (!thread_compiler)
+		return ac_init_llvm_compiler(info,
+					     okay_to_leak_target_library_info,
+					     family,
+					     tm_options);
+
+	/* Hand out a copy of an already-initialized entry if one matches. */
+	for (auto &entry : radv_llvm_per_thread_list) {
+		if (!entry.is_same(family, tm_options))
+			continue;
+		*info = entry.llvm_info;
+		return true;
+	}
+
+	/* No match: append a new entry and drop it again if setup fails. */
+	radv_llvm_per_thread_list.emplace_back(family, tm_options);
+	radv_llvm_per_thread_info &fresh = radv_llvm_per_thread_list.back();
+	if (!fresh.init()) {
+		radv_llvm_per_thread_list.pop_back();
+		return false;
+	}
+
+	*info = fresh.llvm_info;
+	return true;
+}
+
+void radv_destroy_llvm_compiler(struct ac_llvm_compiler *info, bool thread_compiler)
+{
+	/* Nothing is destroyed here when thread_compiler is set. */
+	if (!thread_compiler)
+		ac_destroy_llvm_compiler(info);
+}
diff --git a/src/amd/vulkan/radv_meta.c b/src/amd/vulkan/radv_meta.c
index 56b72d1..a59f38f 100644
--- a/src/amd/vulkan/radv_meta.c
+++ b/src/amd/vulkan/radv_meta.c
@@ -80,10 +80,9 @@
 	}
 
 	if (state->flags & RADV_META_SAVE_DESCRIPTORS) {
-		if (descriptors_state->valid & (1 << 0))
-			state->old_descriptor_set0 = descriptors_state->sets[0];
-		else
-			state->old_descriptor_set0 = NULL;
+		state->old_descriptor_set0 = descriptors_state->sets[0];
+		if (!state->old_descriptor_set0)
+			state->flags &= ~RADV_META_SAVE_DESCRIPTORS;
 	}
 
 	if (state->flags & RADV_META_SAVE_CONSTANTS) {
@@ -515,18 +514,20 @@
 	nir_ssa_def *tmp;
 	nir_if *outer_if = NULL;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
+	nir_ssa_def *input_img_deref = &nir_build_deref_var(b, input_img)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
 	tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
 	tex->op = nir_texop_txf_ms;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(img_coord);
 	tex->src[1].src_type = nir_tex_src_ms_index;
 	tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(input_img_deref);
 	tex->dest_type = nir_type_float;
 	tex->is_array = false;
 	tex->coord_components = 2;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(b, &tex->instr);
@@ -534,16 +535,16 @@
 	tmp = &tex->dest.ssa;
 
 	if (!is_integer && samples > 1) {
-		nir_tex_instr *tex_all_same = nir_tex_instr_create(b->shader, 1);
+		nir_tex_instr *tex_all_same = nir_tex_instr_create(b->shader, 2);
 		tex_all_same->sampler_dim = GLSL_SAMPLER_DIM_MS;
 		tex_all_same->op = nir_texop_samples_identical;
 		tex_all_same->src[0].src_type = nir_tex_src_coord;
 		tex_all_same->src[0].src = nir_src_for_ssa(img_coord);
+		tex_all_same->src[1].src_type = nir_tex_src_texture_deref;
+		tex_all_same->src[1].src = nir_src_for_ssa(input_img_deref);
 		tex_all_same->dest_type = nir_type_float;
 		tex_all_same->is_array = false;
 		tex_all_same->coord_components = 2;
-		tex_all_same->texture = nir_deref_var_create(tex_all_same, input_img);
-		tex_all_same->sampler = NULL;
 
 		nir_ssa_dest_init(&tex_all_same->instr, &tex_all_same->dest, 1, 32, "tex");
 		nir_builder_instr_insert(b, &tex_all_same->instr);
@@ -555,18 +556,18 @@
 
 		b->cursor = nir_after_cf_list(&if_stmt->then_list);
 		for (int i = 1; i < samples; i++) {
-			nir_tex_instr *tex_add = nir_tex_instr_create(b->shader, 2);
+			nir_tex_instr *tex_add = nir_tex_instr_create(b->shader, 3);
 			tex_add->sampler_dim = GLSL_SAMPLER_DIM_MS;
 			tex_add->op = nir_texop_txf_ms;
 			tex_add->src[0].src_type = nir_tex_src_coord;
 			tex_add->src[0].src = nir_src_for_ssa(img_coord);
 			tex_add->src[1].src_type = nir_tex_src_ms_index;
 			tex_add->src[1].src = nir_src_for_ssa(nir_imm_int(b, i));
+			tex_add->src[2].src_type = nir_tex_src_texture_deref;
+			tex_add->src[2].src = nir_src_for_ssa(input_img_deref);
 			tex_add->dest_type = nir_type_float;
 			tex_add->is_array = false;
 			tex_add->coord_components = 2;
-			tex_add->texture = nir_deref_var_create(tex_add, input_img);
-			tex_add->sampler = NULL;
 
 			nir_ssa_dest_init(&tex_add->instr, &tex_add->dest, 4, 32, "tex");
 			nir_builder_instr_insert(b, &tex_add->instr);
diff --git a/src/amd/vulkan/radv_meta.h b/src/amd/vulkan/radv_meta.h
index 4a9abae..35067f6 100644
--- a/src/amd/vulkan/radv_meta.h
+++ b/src/amd/vulkan/radv_meta.h
@@ -199,10 +199,6 @@
 				 uint32_t region_count,
 				 const VkImageResolve *regions);
 
-void radv_blit_to_prime_linear(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       struct radv_image *linear_image);
-
 uint32_t radv_clear_cmask(struct radv_cmd_buffer *cmd_buffer,
 			  struct radv_image *image, uint32_t value);
 uint32_t radv_clear_dcc(struct radv_cmd_buffer *cmd_buffer,
diff --git a/src/amd/vulkan/radv_meta_blit.c b/src/amd/vulkan/radv_meta_blit.c
index 3ff4849..67c26aa 100644
--- a/src/amd/vulkan/radv_meta_blit.c
+++ b/src/amd/vulkan/radv_meta_blit.c
@@ -131,16 +131,20 @@
 	sampler->data.descriptor_set = 0;
 	sampler->data.binding = 0;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
+	nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = tex_dim;
 	tex->op = nir_texop_tex;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(tex_pos);
+	tex->src[1].src_type = nir_tex_src_texture_deref;
+	tex->src[1].src = nir_src_for_ssa(tex_deref);
+	tex->src[2].src_type = nir_tex_src_sampler_deref;
+	tex->src[2].src = nir_src_for_ssa(tex_deref);
 	tex->dest_type = nir_type_float; /* TODO */
 	tex->is_array = glsl_sampler_type_is_array(sampler_type);
 	tex->coord_components = tex_pos->num_components;
-	tex->texture = nir_deref_var_create(tex, sampler);
-	tex->sampler = nir_deref_var_create(tex, sampler);
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
@@ -185,16 +189,20 @@
 	sampler->data.descriptor_set = 0;
 	sampler->data.binding = 0;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
+	nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = tex_dim;
 	tex->op = nir_texop_tex;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(tex_pos);
+	tex->src[1].src_type = nir_tex_src_texture_deref;
+	tex->src[1].src = nir_src_for_ssa(tex_deref);
+	tex->src[2].src_type = nir_tex_src_sampler_deref;
+	tex->src[2].src = nir_src_for_ssa(tex_deref);
 	tex->dest_type = nir_type_float; /* TODO */
 	tex->is_array = glsl_sampler_type_is_array(sampler_type);
 	tex->coord_components = tex_pos->num_components;
-	tex->texture = nir_deref_var_create(tex, sampler);
-	tex->sampler = nir_deref_var_create(tex, sampler);
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
@@ -239,16 +247,20 @@
 	sampler->data.descriptor_set = 0;
 	sampler->data.binding = 0;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 1);
+	nir_ssa_def *tex_deref = &nir_build_deref_var(&b, sampler)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = tex_dim;
 	tex->op = nir_texop_tex;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(tex_pos);
+	tex->src[1].src_type = nir_tex_src_texture_deref;
+	tex->src[1].src = nir_src_for_ssa(tex_deref);
+	tex->src[2].src_type = nir_tex_src_sampler_deref;
+	tex->src[2].src = nir_src_for_ssa(tex_deref);
 	tex->dest_type = nir_type_float; /* TODO */
 	tex->is_array = glsl_sampler_type_is_array(sampler_type);
 	tex->coord_components = tex_pos->num_components;
-	tex->texture = nir_deref_var_create(tex, sampler);
-	tex->sampler = nir_deref_var_create(tex, sampler);
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
@@ -508,6 +520,7 @@
 	RADV_FROM_HANDLE(radv_image, src_image, srcImage);
 	RADV_FROM_HANDLE(radv_image, dest_image, destImage);
 	struct radv_meta_saved_state saved_state;
+	bool old_predicating;
 
 	/* From the Vulkan 1.0 spec:
 	 *
@@ -522,6 +535,12 @@
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
+	/* VK_EXT_conditional_rendering says that blit commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
+
 	for (unsigned r = 0; r < regionCount; r++) {
 		const VkImageSubresourceLayers *src_res = &pRegions[r].srcSubresource;
 		const VkImageSubresourceLayers *dst_res = &pRegions[r].dstSubresource;
@@ -636,6 +655,9 @@
 		}
 	}
 
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
+
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
diff --git a/src/amd/vulkan/radv_meta_blit2d.c b/src/amd/vulkan/radv_meta_blit2d.c
index 4a718c6..cac0a4d 100644
--- a/src/amd/vulkan/radv_meta_blit2d.c
+++ b/src/amd/vulkan/radv_meta_blit2d.c
@@ -485,22 +485,25 @@
 		nir_ssa_dest_init(&sample_idx->instr, &sample_idx->dest, 1, 32, "sample_idx");
 		nir_builder_instr_insert(b, &sample_idx->instr);
 	}
-	nir_tex_instr *tex = nir_tex_instr_create(b->shader, is_multisampled ? 3 : 2);
+
+	nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, is_multisampled ? 4 : 3);
 	tex->sampler_dim = dim;
 	tex->op = is_multisampled ? nir_texop_txf_ms : nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(is_3d ? tex_pos_3d : tex_pos);
 	tex->src[1].src_type = is_multisampled ? nir_tex_src_ms_index : nir_tex_src_lod;
 	tex->src[1].src = nir_src_for_ssa(is_multisampled ? &sample_idx->dest.ssa : nir_imm_int(b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(tex_deref);
 	if (is_multisampled) {
-		tex->src[2].src_type = nir_tex_src_lod;
-		tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0));
+		tex->src[3].src_type = nir_tex_src_lod;
+		tex->src[3].src = nir_src_for_ssa(nir_imm_int(b, 0));
 	}
 	tex->dest_type = nir_type_uint;
 	tex->is_array = false;
 	tex->coord_components = is_3d ? 3 : 2;
-	tex->texture = nir_deref_var_create(tex, sampler);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(b, &tex->instr);
@@ -534,16 +537,18 @@
 	pos_x = nir_iadd(b, pos_x, pos_y);
 	//pos_x = nir_iadd(b, pos_x, nir_imm_int(b, 100000));
 
-	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
+	nir_ssa_def *tex_deref = &nir_build_deref_var(b, sampler)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);
 	tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
 	tex->op = nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(pos_x);
+	tex->src[1].src_type = nir_tex_src_texture_deref;
+	tex->src[1].src = nir_src_for_ssa(tex_deref);
 	tex->dest_type = nir_type_uint;
 	tex->is_array = false;
 	tex->coord_components = 1;
-	tex->texture = nir_deref_var_create(tex, sampler);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(b, &tex->instr);
@@ -603,8 +608,7 @@
 	}
 
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
-	unsigned swiz[4] = { 0, 1 };
-	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);
+	nir_ssa_def *tex_pos = nir_channels(&b, pos_int, 0x3);
 
 	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0xf);
@@ -637,8 +641,7 @@
 	}
 
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
-	unsigned swiz[4] = { 0, 1 };
-	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);
+	nir_ssa_def *tex_pos = nir_channels(&b, pos_int, 0x3);
 
 	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0x1);
@@ -671,8 +674,7 @@
 	}
 
 	nir_ssa_def *pos_int = nir_f2i32(&b, nir_load_var(&b, tex_pos_in));
-	unsigned swiz[4] = { 0, 1 };
-	nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);
+	nir_ssa_def *tex_pos = nir_channels(&b, pos_int, 0x3);
 
 	nir_ssa_def *color = txf_func(&b, device, tex_pos, is_3d, is_multisampled);
 	nir_store_var(&b, color_out, color, 0x1);
diff --git a/src/amd/vulkan/radv_meta_buffer.c b/src/amd/vulkan/radv_meta_buffer.c
index 2e1ba2c..f1887e3 100644
--- a/src/amd/vulkan/radv_meta_buffer.c
+++ b/src/amd/vulkan/radv_meta_buffer.c
@@ -25,7 +25,7 @@
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16));
-	offset = nir_swizzle(&b, offset, (unsigned[]) {0, 0, 0, 0}, 1, false);
+	offset = nir_channel(&b, offset, 0);
 
 	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
 	                                                          nir_intrinsic_vulkan_resource_index);
@@ -77,7 +77,7 @@
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
 
 	nir_ssa_def *offset = nir_imul(&b, global_id, nir_imm_int(&b, 16));
-	offset = nir_swizzle(&b, offset, (unsigned[]) {0, 0, 0, 0}, 1, false);
+	offset = nir_channel(&b, offset, 0);
 
 	nir_intrinsic_instr *dst_buf = nir_intrinsic_instr_create(b.shader,
 	                                                          nir_intrinsic_vulkan_resource_index);
@@ -415,7 +415,7 @@
 	} else if (size) {
 		uint64_t va = radv_buffer_get_va(bo);
 		va += offset;
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, bo, 8);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, bo);
 		si_cp_dma_clear_buffer(cmd_buffer, va, size, value);
 	}
 
@@ -438,8 +438,8 @@
 		src_va += src_offset;
 		dst_va += dst_offset;
 
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, src_bo, 8);
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_bo, 8);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, src_bo);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_bo);
 
 		si_cp_dma_buffer_copy(cmd_buffer, src_va, dst_va, size);
 	}
@@ -472,6 +472,13 @@
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_buffer, src_buffer, srcBuffer);
 	RADV_FROM_HANDLE(radv_buffer, dest_buffer, destBuffer);
+	bool old_predicating;
+
+	/* VK_EXT_conditional_rendering says that copy commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
 
 	for (unsigned r = 0; r < regionCount; r++) {
 		uint64_t src_offset = src_buffer->offset + pRegions[r].srcOffset;
@@ -481,6 +488,9 @@
 		radv_copy_buffer(cmd_buffer, src_buffer->bo, dest_buffer->bo,
 				 src_offset, dest_offset, copy_size);
 	}
+
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
 }
 
 void radv_CmdUpdateBuffer(
@@ -506,7 +516,7 @@
 	if (dataSize < RADV_BUFFER_OPS_CS_THRESHOLD) {
 		si_emit_cache_flush(cmd_buffer);
 
-		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo, 8);
+		radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
 
 		radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, words + 4);
 
diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c
index 69e15d3..aa17c25 100644
--- a/src/amd/vulkan/radv_meta_bufimage.c
+++ b/src/amd/vulkan/radv_meta_bufimage.c
@@ -88,18 +88,20 @@
 	nir_builder_instr_insert(&b, &stride->instr);
 
 	nir_ssa_def *img_coord = nir_iadd(&b, global_id, &offset->dest.ssa);
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
+	nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa;
+
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = dim;
 	tex->op = nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(nir_channels(&b, img_coord, is_3d ? 0x7 : 0x3));
 	tex->src[1].src_type = nir_tex_src_lod;
 	tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(input_img_deref);
 	tex->dest_type = nir_type_float;
 	tex->is_array = false;
 	tex->coord_components = is_3d ? 3 : 2;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
@@ -113,11 +115,11 @@
 	nir_ssa_def *coord = nir_vec4(&b, tmp, tmp, tmp, tmp);
 
 	nir_ssa_def *outval = &tex->dest.ssa;
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(coord);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(outval);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(coord);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(outval);
 
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
@@ -320,29 +322,30 @@
 	nir_ssa_def *buf_coord = nir_vec4(&b, tmp, tmp, tmp, tmp);
 
 	nir_ssa_def *img_coord = nir_iadd(&b, global_id, &offset->dest.ssa);
+	nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = GLSL_SAMPLER_DIM_BUF;
 	tex->op = nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(nir_channels(&b, buf_coord, 1));
 	tex->src[1].src_type = nir_tex_src_lod;
 	tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(input_img_deref);
 	tex->dest_type = nir_type_float;
 	tex->is_array = false;
 	tex->coord_components = 1;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
 
 	nir_ssa_def *outval = &tex->dest.ssa;
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(img_coord);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(outval);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(img_coord);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(outval);
 
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
@@ -532,31 +535,32 @@
 	nir_builder_instr_insert(&b, &dst_offset->instr);
 
 	nir_ssa_def *src_coord = nir_iadd(&b, global_id, &src_offset->dest.ssa);
+	nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa;
 
 	nir_ssa_def *dst_coord = nir_iadd(&b, global_id, &dst_offset->dest.ssa);
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = dim;
 	tex->op = nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(nir_channels(&b, src_coord, is_3d ? 0x7 : 0x3));
 	tex->src[1].src_type = nir_tex_src_lod;
 	tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(input_img_deref);
 	tex->dest_type = nir_type_float;
 	tex->is_array = false;
 	tex->coord_components = is_3d ? 3 : 2;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
 
 	nir_ssa_def *outval = &tex->dest.ssa;
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(dst_coord);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(outval);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(dst_coord);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(outval);
 
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
@@ -748,11 +752,11 @@
 	comps[3] = nir_imm_int(&b, 0);
 	global_id = nir_vec(&b, comps, 4);
 
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(global_id);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(&clear_val->dest.ssa);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(global_id);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(&clear_val->dest.ssa);
 
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
diff --git a/src/amd/vulkan/radv_meta_clear.c b/src/amd/vulkan/radv_meta_clear.c
index 373072d..d7c9849 100644
--- a/src/amd/vulkan/radv_meta_clear.c
+++ b/src/amd/vulkan/radv_meta_clear.c
@@ -366,10 +366,10 @@
 
 	struct radv_subpass clear_subpass = {
 		.color_count = 1,
-		.color_attachments = (VkAttachmentReference[]) {
+		.color_attachments = (struct radv_subpass_attachment[]) {
 			subpass->color_attachments[clear_att->colorAttachment]
 		},
-		.depth_stencil_attachment = (VkAttachmentReference) { VK_ATTACHMENT_UNUSED, VK_IMAGE_LAYOUT_UNDEFINED }
+		.depth_stencil_attachment = (struct radv_subpass_attachment) { VK_ATTACHMENT_UNUSED, VK_IMAGE_LAYOUT_UNDEFINED }
 	};
 
 	radv_cmd_buffer_set_subpass(cmd_buffer, &clear_subpass, false);
@@ -645,7 +645,8 @@
 	if (depth_view_can_fast_clear(cmd_buffer, iview, aspects,
 	                              subpass->depth_stencil_attachment.layout,
 	                              clear_rect, clear_value))
-		radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
+		radv_update_ds_clear_metadata(cmd_buffer, iview->image,
+					      clear_value, aspects);
 
 	radv_CmdSetViewport(radv_cmd_buffer_to_handle(cmd_buffer), 0, 1, &(VkViewport) {
 			.x = clear_rect->rect.offset.x,
@@ -744,7 +745,7 @@
 				      iview->image->offset + iview->image->htile_offset,
 				      iview->image->surface.htile_size, clear_word);
 
-	radv_set_depth_clear_regs(cmd_buffer, iview->image, clear_value, aspects);
+	radv_update_ds_clear_metadata(cmd_buffer, iview->image, clear_value, aspects);
 	if (post_flush) {
 		*post_flush |= flush_bits;
 	} else {
@@ -993,7 +994,7 @@
 	const struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
 	const struct radv_image_view *iview = fb->attachments[pass_att].attachment;
 	VkClearColorValue clear_value = clear_att->clearValue.color;
-	uint32_t clear_color[2], flush_bits;
+	uint32_t clear_color[2], flush_bits = 0;
 	uint32_t cmask_clear_value;
 	bool ret;
 
@@ -1019,8 +1020,6 @@
 	if (iview->image->info.levels > 1)
 		goto fail;
 
-	if (iview->image->surface.is_linear)
-		goto fail;
 	if (!radv_image_extent_compare(iview->image, &iview->extent))
 		goto fail;
 
@@ -1090,7 +1089,7 @@
 		if (!can_avoid_fast_clear_elim)
 			need_decompress_pass = true;
 
-		flush_bits = radv_clear_dcc(cmd_buffer, iview->image, reset_value);
+		flush_bits |= radv_clear_dcc(cmd_buffer, iview->image, reset_value);
 
 		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, iview->image,
 						  need_decompress_pass);
@@ -1105,7 +1104,8 @@
 		cmd_buffer->state.flush_bits |= flush_bits;
 	}
 
-	radv_set_color_clear_regs(cmd_buffer, iview->image, subpass_att, clear_color);
+	radv_update_color_clear_metadata(cmd_buffer, iview->image, subpass_att,
+					 clear_color);
 
 	return true;
 fail:
diff --git a/src/amd/vulkan/radv_meta_copy.c b/src/amd/vulkan/radv_meta_copy.c
index 2df5f87..f4de552 100644
--- a/src/amd/vulkan/radv_meta_copy.c
+++ b/src/amd/vulkan/radv_meta_copy.c
@@ -72,6 +72,7 @@
 	case 2: return VK_FORMAT_R8G8_UINT;
 	case 4: return VK_FORMAT_R8G8B8A8_UINT;
 	case 8: return VK_FORMAT_R16G16B16A16_UINT;
+	case 12: return VK_FORMAT_R32G32B32_UINT;
 	case 16: return VK_FORMAT_R32G32B32A32_UINT;
 	default:
 		unreachable("Invalid format block size");
@@ -116,6 +117,7 @@
 {
 	bool cs = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
 	struct radv_meta_saved_state saved_state;
+	bool old_predicating;
 
 	/* The Vulkan 1.0 spec says "dstImage must have a sample count equal to
 	 * VK_SAMPLE_COUNT_1_BIT."
@@ -128,6 +130,12 @@
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
+	/* VK_EXT_conditional_rendering says that copy commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
+
 	for (unsigned r = 0; r < regionCount; r++) {
 
 		/**
@@ -207,6 +215,9 @@
 		}
 	}
 
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
+
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
@@ -235,12 +246,19 @@
                           const VkBufferImageCopy* pRegions)
 {
 	struct radv_meta_saved_state saved_state;
+	bool old_predicating;
 
 	radv_meta_save(&saved_state, cmd_buffer,
 		       RADV_META_SAVE_COMPUTE_PIPELINE |
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
+	/* VK_EXT_conditional_rendering says that copy commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
+
 	for (unsigned r = 0; r < regionCount; r++) {
 
 		/**
@@ -312,6 +330,9 @@
 		}
 	}
 
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
+
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
@@ -343,6 +364,7 @@
 {
 	bool cs = cmd_buffer->queue_family_index == RADV_QUEUE_COMPUTE;
 	struct radv_meta_saved_state saved_state;
+	bool old_predicating;
 
 	/* From the Vulkan 1.0 spec:
 	 *
@@ -357,6 +379,12 @@
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
+	/* VK_EXT_conditional_rendering says that copy commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
+
 	for (unsigned r = 0; r < regionCount; r++) {
 		assert(pRegions[r].srcSubresource.aspectMask ==
 		       pRegions[r].dstSubresource.aspectMask);
@@ -464,6 +492,9 @@
 		}
 	}
 
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
+
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
@@ -485,24 +516,3 @@
 			dest_image, destImageLayout,
 			regionCount, pRegions);
 }
-
-void radv_blit_to_prime_linear(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       struct radv_image *linear_image)
-{
-	struct VkImageCopy image_copy = { 0 };
-
-	image_copy.srcSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-	image_copy.srcSubresource.layerCount = 1;
-
-	image_copy.dstSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
-	image_copy.dstSubresource.layerCount = 1;
-
-	image_copy.extent.width = image->info.width;
-	image_copy.extent.height = image->info.height;
-	image_copy.extent.depth = 1;
-
-	meta_copy_image(cmd_buffer, image, VK_IMAGE_LAYOUT_GENERAL, linear_image,
-			VK_IMAGE_LAYOUT_GENERAL,
-			1, &image_copy);
-}
diff --git a/src/amd/vulkan/radv_meta_fast_clear.c b/src/amd/vulkan/radv_meta_fast_clear.c
index d5af7a1..74868d5 100644
--- a/src/amd/vulkan/radv_meta_fast_clear.c
+++ b/src/amd/vulkan/radv_meta_fast_clear.c
@@ -66,19 +66,20 @@
 						b.shader->info.cs.local_size[2], 0);
 
 	nir_ssa_def *global_id = nir_iadd(&b, nir_imul(&b, wg_id, block_size), invoc_id);
+	nir_ssa_def *input_img_deref = &nir_build_deref_var(&b, input_img)->dest.ssa;
 
-	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2);
+	nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3);
 	tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
 	tex->op = nir_texop_txf;
 	tex->src[0].src_type = nir_tex_src_coord;
 	tex->src[0].src = nir_src_for_ssa(nir_channels(&b, global_id, 3));
 	tex->src[1].src_type = nir_tex_src_lod;
 	tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+	tex->src[2].src_type = nir_tex_src_texture_deref;
+	tex->src[2].src = nir_src_for_ssa(input_img_deref);
 	tex->dest_type = nir_type_float;
 	tex->is_array = false;
 	tex->coord_components = 2;
-	tex->texture = nir_deref_var_create(tex, input_img);
-	tex->sampler = NULL;
 
 	nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex");
 	nir_builder_instr_insert(&b, &tex->instr);
@@ -90,11 +91,11 @@
 	nir_builder_instr_insert(&b, &bar->instr);
 
 	nir_ssa_def *outval = &tex->dest.ssa;
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(global_id);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(outval);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(global_id);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(outval);
 
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
@@ -570,7 +571,7 @@
 		va += image->dcc_pred_offset;
 	}
 
-	si_emit_set_predication_state(cmd_buffer, va);
+	si_emit_set_predication_state(cmd_buffer, true, va);
 }
 
 /**
@@ -585,6 +586,7 @@
 	VkDevice device_h = radv_device_to_handle(cmd_buffer->device);
 	VkCommandBuffer cmd_buffer_h = radv_cmd_buffer_to_handle(cmd_buffer);
 	uint32_t layer_count = radv_get_layerCount(image, subresourceRange);
+	bool old_predicating = false;
 	VkPipeline pipeline;
 
 	assert(cmd_buffer->queue_family_index == RADV_QUEUE_GENERAL);
@@ -602,6 +604,8 @@
 	}
 
 	if (!decompress_dcc && radv_image_has_dcc(image)) {
+		old_predicating = cmd_buffer->state.predicating;
+
 		radv_emit_set_predication_state_from_image(cmd_buffer, image, true);
 		cmd_buffer->state.predicating = true;
 	}
@@ -668,8 +672,21 @@
 
 	}
 	if (!decompress_dcc && radv_image_has_dcc(image)) {
-		cmd_buffer->state.predicating = false;
+		cmd_buffer->state.predicating = old_predicating;
+
 		radv_emit_set_predication_state_from_image(cmd_buffer, image, false);
+
+		/* Clear the image's fast-clear eliminate predicate because
+		 * FMASK and DCC also imply a fast-clear eliminate.
+		 */
+		radv_set_dcc_need_cmask_elim_pred(cmd_buffer, image, false);
+
+		if (cmd_buffer->state.predication_type != -1) {
+			/* Restore previous conditional rendering user state. */
+			si_emit_set_predication_state(cmd_buffer,
+						      cmd_buffer->state.predication_type,
+						      cmd_buffer->state.predication_va);
+		}
 	}
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
diff --git a/src/amd/vulkan/radv_meta_resolve.c b/src/amd/vulkan/radv_meta_resolve.c
index d4d3552..2c8ba53 100644
--- a/src/amd/vulkan/radv_meta_resolve.c
+++ b/src/amd/vulkan/radv_meta_resolve.c
@@ -358,7 +358,8 @@
 		*method = RESOLVE_COMPUTE;
 	else if (vk_format_is_int(src_image->vk_format))
 		*method = RESOLVE_COMPUTE;
-	else if (src_image->info.array_size > 1)
+	else if (src_image->info.array_size > 1 ||
+		 dest_image->info.array_size > 1)
 		*method = RESOLVE_COMPUTE;
 	
 	if (radv_layout_dcc_compressed(dest_image, dest_image_layout, queue_mask)) {
@@ -613,8 +614,8 @@
 		return;
 
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
-		VkAttachmentReference src_att = subpass->color_attachments[i];
-		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		struct radv_subpass_attachment src_att = subpass->color_attachments[i];
+		struct radv_subpass_attachment dest_att = subpass->resolve_attachments[i];
 
 		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
 		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
@@ -641,8 +642,8 @@
 		       RADV_META_SAVE_GRAPHICS_PIPELINE);
 
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
-		VkAttachmentReference src_att = subpass->color_attachments[i];
-		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		struct radv_subpass_attachment src_att = subpass->color_attachments[i];
+		struct radv_subpass_attachment dest_att = subpass->resolve_attachments[i];
 
 		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
 		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
@@ -657,7 +658,7 @@
 
 		struct radv_subpass resolve_subpass = {
 			.color_count = 2,
-			.color_attachments = (VkAttachmentReference[]) { src_att, dest_att },
+			.color_attachments = (struct radv_subpass_attachment[]) { src_att, dest_att },
 			.depth_stencil_attachment = { .attachment = VK_ATTACHMENT_UNUSED },
 		};
 
@@ -684,8 +685,8 @@
 	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
 
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
-		VkAttachmentReference src_att = subpass->color_attachments[i];
-		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		struct radv_subpass_attachment src_att = subpass->color_attachments[i];
+		struct radv_subpass_attachment dest_att = subpass->resolve_attachments[i];
 
 		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
 		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
diff --git a/src/amd/vulkan/radv_meta_resolve_cs.c b/src/amd/vulkan/radv_meta_resolve_cs.c
index 322e72e..2d79cb0 100644
--- a/src/amd/vulkan/radv_meta_resolve_cs.c
+++ b/src/amd/vulkan/radv_meta_resolve_cs.c
@@ -135,11 +135,11 @@
 		outval = radv_meta_build_resolve_srgb_conversion(&b, outval);
 
 	nir_ssa_def *coord = nir_iadd(&b, global_id, &dst_offset->dest.ssa);
-	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_var_store);
-	store->src[0] = nir_src_for_ssa(coord);
-	store->src[1] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
-	store->src[2] = nir_src_for_ssa(outval);
-	store->variables[0] = nir_deref_var_create(store, output_img);
+	nir_intrinsic_instr *store = nir_intrinsic_instr_create(b.shader, nir_intrinsic_image_deref_store);
+	store->src[0] = nir_src_for_ssa(&nir_build_deref_var(&b, output_img)->dest.ssa);
+	store->src[1] = nir_src_for_ssa(coord);
+	store->src[2] = nir_src_for_ssa(nir_ssa_undef(&b, 1, 32));
+	store->src[3] = nir_src_for_ssa(outval);
 	nir_builder_instr_insert(&b, &store->instr);
 	return b.shader;
 }
@@ -473,25 +473,15 @@
 	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 	struct radv_meta_saved_state saved_state;
-	/* FINISHME(perf): Skip clears for resolve attachments.
-	 *
-	 * From the Vulkan 1.0 spec:
-	 *
-	 *    If the first use of an attachment in a render pass is as a resolve
-	 *    attachment, then the loadOp is effectively ignored as the resolve is
-	 *    guaranteed to overwrite all pixels in the render area.
+	struct radv_subpass_barrier barrier;
+
+	/* Resolves happen before the end-of-subpass barriers get executed, so
+	 * we have to make the attachment shader-readable.
 	 */
-
-	if (!subpass->has_resolve)
-		return;
-
-	/* Resolves happen before the end-of-subpass barriers get executed,
-	 * so we have to make the attachment shader-readable */
-	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
-	                                RADV_CMD_FLAG_INV_GLOBAL_L2 |
-	                                RADV_CMD_FLAG_INV_VMEM_L1;
+	barrier.src_stage_mask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+	barrier.src_access_mask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+	barrier.dst_access_mask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
+	radv_subpass_barrier(cmd_buffer, &barrier);
 
 	radv_decompress_resolve_subpass_src(cmd_buffer);
 
@@ -501,8 +491,8 @@
 		       RADV_META_SAVE_DESCRIPTORS);
 
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
-		VkAttachmentReference src_att = subpass->color_attachments[i];
-		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		struct radv_subpass_attachment src_att = subpass->color_attachments[i];
+		struct radv_subpass_attachment dest_att = subpass->resolve_attachments[i];
 		struct radv_image_view *src_iview = cmd_buffer->state.framebuffer->attachments[src_att.attachment].attachment;
 		struct radv_image_view *dst_iview = cmd_buffer->state.framebuffer->attachments[dest_att.attachment].attachment;
 		if (dest_att.attachment == VK_ATTACHMENT_UNUSED)
diff --git a/src/amd/vulkan/radv_meta_resolve_fs.c b/src/amd/vulkan/radv_meta_resolve_fs.c
index ef8c1d8..76f00bf 100644
--- a/src/amd/vulkan/radv_meta_resolve_fs.c
+++ b/src/amd/vulkan/radv_meta_resolve_fs.c
@@ -580,39 +580,25 @@
 	struct radv_framebuffer *fb = cmd_buffer->state.framebuffer;
 	const struct radv_subpass *subpass = cmd_buffer->state.subpass;
 	struct radv_meta_saved_state saved_state;
+	struct radv_subpass_barrier barrier;
 
-	/* FINISHME(perf): Skip clears for resolve attachments.
-	 *
-	 * From the Vulkan 1.0 spec:
-	 *
-	 *    If the first use of an attachment in a render pass is as a resolve
-	 *    attachment, then the loadOp is effectively ignored as the resolve is
-	 *    guaranteed to overwrite all pixels in the render area.
-	 */
+	/* Resolves happen before the end-of-subpass barriers get executed,
+	 * so we have to make the attachment shader-readable */
+	barrier.src_stage_mask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+	barrier.src_access_mask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+	barrier.dst_access_mask = VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
+	radv_subpass_barrier(cmd_buffer, &barrier);
 
-	if (!subpass->has_resolve)
-		return;
+	radv_decompress_resolve_subpass_src(cmd_buffer);
 
 	radv_meta_save(&saved_state, cmd_buffer,
 		       RADV_META_SAVE_GRAPHICS_PIPELINE |
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
-	/* Resolves happen before the end-of-subpass barriers get executed,
-	 * so we have to make the attachment shader-readable */
-	cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_CB |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_DB |
-	                                RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
-	                                RADV_CMD_FLAG_INV_GLOBAL_L2 |
-	                                RADV_CMD_FLAG_INV_VMEM_L1;
-
-	radv_decompress_resolve_subpass_src(cmd_buffer);
-
 	for (uint32_t i = 0; i < subpass->color_count; ++i) {
-		VkAttachmentReference src_att = subpass->color_attachments[i];
-		VkAttachmentReference dest_att = subpass->resolve_attachments[i];
+		struct radv_subpass_attachment src_att = subpass->color_attachments[i];
+		struct radv_subpass_attachment dest_att = subpass->resolve_attachments[i];
 
 		if (src_att.attachment == VK_ATTACHMENT_UNUSED ||
 		    dest_att.attachment == VK_ATTACHMENT_UNUSED)
@@ -623,7 +609,7 @@
 
 		struct radv_subpass resolve_subpass = {
 			.color_count = 1,
-			.color_attachments = (VkAttachmentReference[]) { dest_att },
+			.color_attachments = (struct radv_subpass_attachment[]) { dest_att },
 			.depth_stencil_attachment = { .attachment = VK_ATTACHMENT_UNUSED },
 		};
 
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index ef14e7b..ac3d806 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -27,6 +27,7 @@
 
 #include "radv_private.h"
 #include "radv_shader.h"
+#include "radv_shader_helper.h"
 #include "nir/nir.h"
 
 #include <llvm-c/Core.h>
@@ -81,7 +82,6 @@
 	LLVMValueRef hs_ring_tess_offchip;
 	LLVMValueRef hs_ring_tess_factor;
 
-	LLVMValueRef sample_pos_offset;
 	LLVMValueRef persp_sample, persp_center, persp_centroid;
 	LLVMValueRef linear_sample, linear_center, linear_centroid;
 
@@ -124,6 +124,98 @@
 	return container_of(abi, ctx, abi);
 }
 
+struct ac_build_if_state
+{
+	struct radv_shader_context *ctx;
+	LLVMValueRef condition;
+	LLVMBasicBlockRef entry_block;
+	LLVMBasicBlockRef true_block;
+	LLVMBasicBlockRef false_block;
+	LLVMBasicBlockRef merge_block;
+};
+
+static LLVMBasicBlockRef
+ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
+{
+	LLVMBasicBlockRef current_block;
+	LLVMBasicBlockRef next_block;
+	LLVMBasicBlockRef new_block;
+
+	/* get current basic block */
+	current_block = LLVMGetInsertBlock(ctx->ac.builder);
+
+	/* check if there's another block after this one */
+	next_block = LLVMGetNextBasicBlock(current_block);
+	if (next_block) {
+		/* insert the new block before the next block */
+		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
+	}
+	else {
+		/* append new block after current block */
+		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
+		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
+	}
+	return new_block;
+}
+
+static void
+ac_nir_build_if(struct ac_build_if_state *ifthen,
+		struct radv_shader_context *ctx,
+		LLVMValueRef condition)
+{
+	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);
+
+	memset(ifthen, 0, sizeof *ifthen);
+	ifthen->ctx = ctx;
+	ifthen->condition = condition;
+	ifthen->entry_block = block;
+
+	/* create endif/merge basic block for the phi functions */
+	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
+
+	/* create/insert true_block before merge_block */
+	ifthen->true_block =
+		LLVMInsertBasicBlockInContext(ctx->context,
+					      ifthen->merge_block,
+					      "if-true-block");
+
+	/* subsequent code goes into the true block */
+	LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
+}
+
+/**
+ * End a conditional: branch the current block to the merge block, emit the conditional branch at the end of the entry block, then resume building in the merge block.
+ */
+static void
+ac_nir_build_endif(struct ac_build_if_state *ifthen)
+{
+	LLVMBuilderRef builder = ifthen->ctx->ac.builder;
+
+	/* Insert branch to the merge block from current block */
+	LLVMBuildBr(builder, ifthen->merge_block);
+
+	/*
+	 * Now patch in the various branch instructions.
+	 */
+
+	/* Insert the conditional branch instruction at the end of entry_block */
+	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
+	if (ifthen->false_block) {
+		/* we have an else clause */
+		LLVMBuildCondBr(builder, ifthen->condition,
+				ifthen->true_block, ifthen->false_block);
+	}
+	else {
+		/* no else clause */
+		LLVMBuildCondBr(builder, ifthen->condition,
+				ifthen->true_block, ifthen->merge_block);
+	}
+
+	/* Resume building code at end of the ifthen->merge_block */
+	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
+}
+
+
 static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx)
 {
 	switch (ctx->stage) {
@@ -330,7 +422,7 @@
 			    "");
 }
 
-#define MAX_ARGS 23
+#define MAX_ARGS 64
 struct arg_info {
 	LLVMTypeRef types[MAX_ARGS];
 	LLVMValueRef *assign[MAX_ARGS];
@@ -388,7 +480,7 @@
                      unsigned num_return_elems,
 		     struct arg_info *args,
 		     unsigned max_workgroup_size,
-		     bool unsafe_math)
+		     const struct radv_nir_compiler_options *options)
 {
 	LLVMTypeRef main_function_type, ret_type;
 	LLVMBasicBlockRef main_function_body;
@@ -419,12 +511,18 @@
 		}
 	}
 
+	if (options->address32_hi) {
+		ac_llvm_add_target_dep_function_attr(main_function,
+						     "amdgpu-32bit-address-high-bits",
+						     options->address32_hi);
+	}
+
 	if (max_workgroup_size) {
 		ac_llvm_add_target_dep_function_attr(main_function,
 						     "amdgpu-max-work-group-size",
 						     max_workgroup_size);
 	}
-	if (unsafe_math) {
+	if (options->unsafe_math) {
 		/* These were copied from some LLVM test. */
 		LLVMAddTargetDependentFunctionAttr(main_function,
 						   "less-precise-fpmad",
@@ -447,13 +545,12 @@
 
 
 static void
-set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs,
-	uint32_t indirect_offset)
+set_loc(struct radv_userdata_info *ud_info, uint8_t *sgpr_idx,
+	uint8_t num_sgprs, bool indirect)
 {
 	ud_info->sgpr_idx = *sgpr_idx;
 	ud_info->num_sgprs = num_sgprs;
-	ud_info->indirect = indirect_offset > 0;
-	ud_info->indirect_offset = indirect_offset;
+	ud_info->indirect = indirect;
 	*sgpr_idx += num_sgprs;
 }
 
@@ -465,23 +562,35 @@
 		&ctx->shader_info->user_sgprs_locs.shader_data[idx];
 	assert(ud_info);
 
-	set_loc(ud_info, sgpr_idx, num_sgprs, 0);
+	set_loc(ud_info, sgpr_idx, num_sgprs, false);
+}
+
+static void
+set_loc_shader_ptr(struct radv_shader_context *ctx, int idx, uint8_t *sgpr_idx)
+{
+	bool use_32bit_pointers = HAVE_32BIT_POINTERS &&
+				  idx != AC_UD_SCRATCH_RING_OFFSETS;
+
+	set_loc_shader(ctx, idx, sgpr_idx, use_32bit_pointers ? 1 : 2);
 }
 
 static void
 set_loc_desc(struct radv_shader_context *ctx, int idx,  uint8_t *sgpr_idx,
-	     uint32_t indirect_offset)
+	     bool indirect)
 {
-	struct radv_userdata_info *ud_info =
-		&ctx->shader_info->user_sgprs_locs.descriptor_sets[idx];
+	struct radv_userdata_locations *locs =
+		&ctx->shader_info->user_sgprs_locs;
+	struct radv_userdata_info *ud_info = &locs->descriptor_sets[idx];
 	assert(ud_info);
 
-	set_loc(ud_info, sgpr_idx, 2, indirect_offset);
+	set_loc(ud_info, sgpr_idx, HAVE_32BIT_POINTERS ? 1 : 2, indirect);
+
+	if (!indirect)
+		locs->descriptor_sets_enabled |= 1 << idx;
 }
 
 struct user_sgpr_info {
 	bool need_ring_offsets;
-	uint8_t sgpr_count;
 	bool indirect_all_descriptor_sets;
 };
 
@@ -514,7 +623,8 @@
 {
 	uint8_t count = 0;
 
-	count += ctx->shader_info->info.vs.has_vertex_buffers ? 2 : 0;
+	if (ctx->shader_info->info.vs.has_vertex_buffers)
+		count += HAVE_32BIT_POINTERS ? 1 : 2;
 	count += ctx->shader_info->info.vs.needs_draw_id ? 3 : 2;
 
 	return count;
@@ -527,6 +637,8 @@
 				bool needs_view_index,
 				struct user_sgpr_info *user_sgpr_info)
 {
+	uint8_t user_sgpr_count = 0;
+
 	memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));
 
 	/* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
@@ -543,25 +655,25 @@
 
 	/* 2 user sgprs will nearly always be allocated for scratch/rings */
 	if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
-		user_sgpr_info->sgpr_count += 2;
+		user_sgpr_count += 2;
 	}
 
 	switch (stage) {
 	case MESA_SHADER_COMPUTE:
 		if (ctx->shader_info->info.cs.uses_grid_size)
-			user_sgpr_info->sgpr_count += 3;
+			user_sgpr_count += 3;
 		break;
 	case MESA_SHADER_FRAGMENT:
-		user_sgpr_info->sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
+		user_sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
 		break;
 	case MESA_SHADER_VERTEX:
 		if (!ctx->is_gs_copy_shader)
-			user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+			user_sgpr_count += count_vs_user_sgprs(ctx);
 		break;
 	case MESA_SHADER_TESS_CTRL:
 		if (has_previous_stage) {
 			if (previous_stage == MESA_SHADER_VERTEX)
-				user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+				user_sgpr_count += count_vs_user_sgprs(ctx);
 		}
 		break;
 	case MESA_SHADER_TESS_EVAL:
@@ -569,7 +681,7 @@
 	case MESA_SHADER_GEOMETRY:
 		if (has_previous_stage) {
 			if (previous_stage == MESA_SHADER_VERTEX) {
-				user_sgpr_info->sgpr_count += count_vs_user_sgprs(ctx);
+				user_sgpr_count += count_vs_user_sgprs(ctx);
 			}
 		}
 		break;
@@ -578,19 +690,18 @@
 	}
 
 	if (needs_view_index)
-		user_sgpr_info->sgpr_count++;
+		user_sgpr_count++;
 
 	if (ctx->shader_info->info.loads_push_constants)
-		user_sgpr_info->sgpr_count += 2;
+		user_sgpr_count += HAVE_32BIT_POINTERS ? 1 : 2;
 
 	uint32_t available_sgprs = ctx->options->chip_class >= GFX9 && stage != MESA_SHADER_COMPUTE ? 32 : 16;
-	uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
+	uint32_t remaining_sgprs = available_sgprs - user_sgpr_count;
+	uint32_t num_desc_set =
+		util_bitcount(ctx->shader_info->info.desc_set_used_mask);
 
-	if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) {
-		user_sgpr_info->sgpr_count += 2;
+	if (remaining_sgprs / (HAVE_32BIT_POINTERS ? 1 : 2) < num_desc_set) {
 		user_sgpr_info->indirect_all_descriptor_sets = true;
-	} else {
-		user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
 	}
 }
 
@@ -603,7 +714,7 @@
 			   struct arg_info *args,
 			   LLVMValueRef *desc_sets)
 {
-	LLVMTypeRef type = ac_array_in_const_addr_space(ctx->ac.i8);
+	LLVMTypeRef type = ac_array_in_const32_addr_space(ctx->ac.i8);
 	unsigned num_sets = ctx->options->layout ?
 			    ctx->options->layout->num_sets : 0;
 	unsigned stage_mask = 1 << stage;
@@ -621,7 +732,7 @@
 			}
 		}
 	} else {
-		add_array_arg(args, ac_array_in_const_addr_space(type), desc_sets);
+		add_array_arg(args, ac_array_in_const32_addr_space(type), desc_sets);
 	}
 
 	if (ctx->shader_info->info.loads_push_constants) {
@@ -641,7 +752,8 @@
 	    (stage == MESA_SHADER_VERTEX ||
 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
-			add_arg(args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
+			add_arg(args, ARG_SGPR,
+				ac_array_in_const32_addr_space(ctx->ac.v4i32),
 				&ctx->vertex_buffers);
 		}
 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
@@ -694,18 +806,17 @@
 		for (unsigned i = 0; i < num_sets; ++i) {
 			if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) &&
 			    ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
-				set_loc_desc(ctx, i, user_sgpr_idx, 0);
+				set_loc_desc(ctx, i, user_sgpr_idx, false);
 			} else
 				ctx->descriptor_sets[i] = NULL;
 		}
 	} else {
-		set_loc_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
-			       user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
+			           user_sgpr_idx);
 
 		for (unsigned i = 0; i < num_sets; ++i) {
 			if ((ctx->shader_info->info.desc_set_used_mask & (1 << i)) &&
 			    ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
-				set_loc_desc(ctx, i, user_sgpr_idx, i * 8);
 				ctx->descriptor_sets[i] =
 					ac_build_load_to_sgpr(&ctx->ac,
 							      desc_sets,
@@ -718,7 +829,7 @@
 	}
 
 	if (ctx->shader_info->info.loads_push_constants) {
-		set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx);
 	}
 }
 
@@ -732,8 +843,8 @@
 	    (stage == MESA_SHADER_VERTEX ||
 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
-			set_loc_shader(ctx, AC_UD_VS_VERTEX_BUFFERS,
-				       user_sgpr_idx, 2);
+			set_loc_shader_ptr(ctx, AC_UD_VS_VERTEX_BUFFERS,
+					   user_sgpr_idx);
 		}
 
 		unsigned vs_num = 2;
@@ -759,7 +870,7 @@
 		calling_conv = RADEON_LLVM_AMDGPU_GS;
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		calling_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS : RADEON_LLVM_AMDGPU_VS;
+		calling_conv = RADEON_LLVM_AMDGPU_HS;
 		break;
 	case MESA_SHADER_FRAGMENT:
 		calling_conv = RADEON_LLVM_AMDGPU_PS;
@@ -986,10 +1097,6 @@
 					   previous_stage, &user_sgpr_info,
 					   &args, &desc_sets);
 
-		if (ctx->shader_info->info.ps.needs_sample_positions)
-			add_arg(&args, ARG_SGPR, ctx->ac.i32,
-				&ctx->sample_pos_offset);
-
 		add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask);
 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample);
 		add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center);
@@ -1014,8 +1121,7 @@
 
 	ctx->main_function = create_llvm_function(
 	    ctx->context, ctx->ac.module, ctx->ac.builder, NULL, 0, &args,
-	    ctx->max_workgroup_size,
-	    ctx->options->unsafe_math);
+	    ctx->max_workgroup_size, ctx->options);
 	set_llvm_calling_convention(ctx->main_function, stage);
 
 
@@ -1032,8 +1138,8 @@
 	user_sgpr_idx = 0;
 
 	if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
-		set_loc_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS,
-			       &user_sgpr_idx, 2);
+		set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS,
+				   &user_sgpr_idx);
 		if (ctx->options->supports_spill) {
 			ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
 							       LLVMPointerType(ctx->ac.i8, AC_CONST_ADDR_SPACE),
@@ -1086,10 +1192,6 @@
 			set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
 		break;
 	case MESA_SHADER_FRAGMENT:
-		if (ctx->shader_info->info.ps.needs_sample_positions) {
-			set_loc_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET,
-				       &user_sgpr_idx, 1);
-		}
 		break;
 	default:
 		unreachable("Shader stage not implemented");
@@ -1376,6 +1478,8 @@
 		if (!(writemask & (1 << chan)))
 			continue;
 		LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
+		value = ac_to_integer(&ctx->ac, value);
+		value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
 
 		if (store_lds || is_tess_factor) {
 			LLVMValueRef dw_addr_chan =
@@ -1472,10 +1576,13 @@
 							ctx->ac.i32_0,
 							vtx_offset, soffset,
 							0, 1, 0, true, false);
-
-			value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i],
-						    type, "");
 		}
+
+		if (ac_get_type_size(type) == 2) {
+			value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], ctx->ac.i32, "");
+			value[i] = LLVMBuildTrunc(ctx->ac.builder, value[i], ctx->ac.i16, "");
+		}
+		value[i] = LLVMBuildBitCast(ctx->ac.builder, value[i], type, "");
 	}
 	result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
 	result = ac_to_integer(&ctx->ac, result);
@@ -1519,6 +1626,30 @@
 	return NULL;
 }
 
+static uint32_t
+radv_get_sample_pos_offset(uint32_t num_samples)
+{
+	uint32_t sample_pos_offset = 0;
+
+	switch (num_samples) {
+	case 2:
+		sample_pos_offset = 1;
+		break;
+	case 4:
+		sample_pos_offset = 3;
+		break;
+	case 8:
+		sample_pos_offset = 7;
+		break;
+	case 16:
+		sample_pos_offset = 15;
+		break;
+	default:
+		break;
+	}
+	return sample_pos_offset;
+}
+
 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi,
 					 LLVMValueRef sample_id)
 {
@@ -1530,7 +1661,12 @@
 	ptr = LLVMBuildBitCast(ctx->ac.builder, ptr,
 			       ac_array_in_const_addr_space(ctx->ac.v2f32), "");
 
-	sample_id = LLVMBuildAdd(ctx->ac.builder, sample_id, ctx->sample_pos_offset, "");
+	uint32_t sample_pos_offset =
+		radv_get_sample_pos_offset(ctx->options->key.fs.num_samples);
+
+	sample_id =
+		LLVMBuildAdd(ctx->ac.builder, sample_id,
+			     LLVMConstInt(ctx->ac.i32, sample_pos_offset, false), "");
 	result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
 
 	return result;
@@ -1540,9 +1676,14 @@
 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
 {
 	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
-	uint8_t log2_ps_iter_samples = ctx->shader_info->info.ps.force_persample ?
-		ctx->options->key.fs.log2_num_samples :
-		ctx->options->key.fs.log2_ps_iter_samples;
+	uint8_t log2_ps_iter_samples;
+
+	if (ctx->shader_info->info.ps.force_persample) {
+		log2_ps_iter_samples =
+			util_logbase2(ctx->options->key.fs.num_samples);
+	} else {
+		log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
+	}
 
 	/* The bit pattern matches that used by fixed function fragment
 	 * processing. */
@@ -1592,6 +1733,8 @@
 	/* loop num outputs */
 	idx = 0;
 	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
+		unsigned output_usage_mask =
+			ctx->shader_info->info.gs.output_usage_mask[i];
 		LLVMValueRef *out_ptr = &addrs[i * 4];
 		int length = 4;
 		int slot = idx;
@@ -1605,15 +1748,21 @@
 			length = ctx->num_output_clips + ctx->num_output_culls;
 			if (length > 4)
 				slot_inc = 2;
+			output_usage_mask = (1 << length) - 1;
 		}
+
 		for (unsigned j = 0; j < length; j++) {
+			if (!(output_usage_mask & (1 << j)))
+				continue;
+
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder,
 							     out_ptr[j], "");
 			LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices, false);
 			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
 			voffset = LLVMBuildMul(ctx->ac.builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
 
-			out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
+			out_val = ac_to_integer(&ctx->ac, out_val);
+			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
 
 			ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring,
 						    out_val, 1,
@@ -1768,7 +1917,8 @@
 	index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
 
 	list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0));
-	list = LLVMBuildPointerCast(builder, list, ac_array_in_const_addr_space(type), "");
+	list = LLVMBuildPointerCast(builder, list,
+				    ac_array_in_const32_addr_space(type), "");
 
 	return ac_build_load_to_sgpr(&ctx->ac, list, index);
 }
@@ -1831,6 +1981,7 @@
 
 	variable->data.driver_location = variable->data.location * 4;
 
+	enum glsl_base_type type = glsl_get_base_type(variable->type);
 	for (unsigned i = 0; i < attrib_count; ++i) {
 		LLVMValueRef output[4];
 		unsigned attrib_index = variable->data.location + i - VERT_ATTRIB_GENERIC0;
@@ -1839,8 +1990,7 @@
 			uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[attrib_index];
 
 			if (divisor) {
-				buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.instance_id,
-				                            ctx->abi.start_instance, "");
+				buffer_index = ctx->abi.instance_id;
 
 				if (divisor != 1) {
 					buffer_index = LLVMBuildUDiv(ctx->ac.builder, buffer_index,
@@ -1857,6 +2007,8 @@
 			} else {
 				buffer_index = ctx->ac.i32_0;
 			}
+
+			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.start_instance, buffer_index, "");
 		} else
 			buffer_index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id,
 			                            ctx->abi.base_vertex, "");
@@ -1874,14 +2026,21 @@
 		for (unsigned chan = 0; chan < 4; chan++) {
 			LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
 			output[chan] = LLVMBuildExtractElement(ctx->ac.builder, input, llvm_chan, "");
+			if (type == GLSL_TYPE_FLOAT16) {
+				output[chan] = LLVMBuildBitCast(ctx->ac.builder, output[chan], ctx->ac.f32, "");
+				output[chan] = LLVMBuildFPTrunc(ctx->ac.builder, output[chan], ctx->ac.f16, "");
+			}
 		}
 
 		unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (attrib_index * 2)) & 3;
 		output[3] = adjust_vertex_fetch_alpha(ctx, alpha_adjust, output[3]);
 
 		for (unsigned chan = 0; chan < 4; chan++) {
-			ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] =
-				ac_to_integer(&ctx->ac, output[chan]);
+			output[chan] = ac_to_integer(&ctx->ac, output[chan]);
+			if (type == GLSL_TYPE_UINT16 || type == GLSL_TYPE_INT16)
+				output[chan] = LLVMBuildTrunc(ctx->ac.builder, output[chan], ctx->ac.i16, "");
+
+			ctx->inputs[ac_llvm_reg_index_soa(variable->data.location + i, chan)] = output[chan];
 		}
 	}
 }
@@ -1895,7 +2054,7 @@
 	LLVMValueRef attr_number;
 	unsigned chan;
 	LLVMValueRef i, j;
-	bool interp = interp_param != NULL;
+	bool interp = !LLVMIsUndef(interp_param);
 
 	attr_number = LLVMConstInt(ctx->ac.i32, attr, false);
 
@@ -1933,6 +2092,8 @@
 							      llvm_chan,
 							      attr_number,
 							      prim_mask);
+			result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, "");
+			result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), "");
 		}
 	}
 }
@@ -1943,7 +2104,7 @@
 {
 	int idx = variable->data.location;
 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
-	LLVMValueRef interp;
+	LLVMValueRef interp = NULL;
 	uint64_t mask;
 
 	variable->data.driver_location = idx * 4;
@@ -1959,8 +2120,11 @@
 			interp_type = INTERP_CENTER;
 
 		interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type);
-	} else
-		interp = NULL;
+	}
+	bool is_16bit = glsl_type_is_16bit(variable->type);
+	LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
+	if (interp == NULL)
+		interp = LLVMGetUndef(type);
 
 	for (unsigned i = 0; i < attrib_count; ++i)
 		ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp;
@@ -2019,8 +2183,10 @@
 	unsigned index = 0;
 
 	if (ctx->shader_info->info.ps.uses_input_attachments ||
-	    ctx->shader_info->info.needs_multiview_view_index)
+	    ctx->shader_info->info.needs_multiview_view_index) {
 		ctx->input_mask |= 1ull << VARYING_SLOT_LAYER;
+		ctx->inputs[ac_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)] = LLVMGetUndef(ctx->ac.i32);
+	}
 
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
 		LLVMValueRef interp_param;
@@ -2035,7 +2201,7 @@
 			interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
 					inputs);
 
-			if (!interp_param)
+			if (LLVMIsUndef(interp_param))
 				ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
 			++index;
 		} else if (i == VARYING_SLOT_CLIP_DIST0) {
@@ -2134,6 +2300,10 @@
 	args->out[2] = LLVMGetUndef(ctx->ac.f32);
 	args->out[3] = LLVMGetUndef(ctx->ac.f32);
 
+	if (!values)
+		return;
+
+	bool is_16bit = ac_get_type_size(LLVMTypeOf(values[0])) == 2;
 	if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) {
 		unsigned index = target - V_008DFC_SQ_EXP_MRT;
 		unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
@@ -2171,6 +2341,12 @@
 		case V_028714_SPI_SHADER_FP16_ABGR:
 			args->enabled_channels = 0x5;
 			packf = ac_build_cvt_pkrtz_f16;
+			if (is_16bit) {
+				for (unsigned chan = 0; chan < 4; chan++)
+					values[chan] = LLVMBuildFPExt(ctx->ac.builder,
+								      values[chan],
+								      ctx->ac.f32, "");
+			}
 			break;
 
 		case V_028714_SPI_SHADER_UNORM16_ABGR:
@@ -2186,11 +2362,23 @@
 		case V_028714_SPI_SHADER_UINT16_ABGR:
 			args->enabled_channels = 0x5;
 			packi = ac_build_cvt_pk_u16;
+			if (is_16bit) {
+				for (unsigned chan = 0; chan < 4; chan++)
+					values[chan] = LLVMBuildZExt(ctx->ac.builder,
+								      values[chan],
+								      ctx->ac.i32, "");
+			}
 			break;
 
 		case V_028714_SPI_SHADER_SINT16_ABGR:
 			args->enabled_channels = 0x5;
 			packi = ac_build_cvt_pk_i16;
+			if (is_16bit) {
+				for (unsigned chan = 0; chan < 4; chan++)
+					values[chan] = LLVMBuildSExt(ctx->ac.builder,
+								      values[chan],
+								      ctx->ac.i32, "");
+			}
 			break;
 
 		default:
@@ -2233,7 +2421,13 @@
 		return;
 	}
 
-	memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+	if (is_16bit) {
+		for (unsigned chan = 0; chan < 4; chan++) {
+			values[chan] = LLVMBuildBitCast(ctx->ac.builder, values[chan], ctx->ac.i16, "");
+			args->out[chan] = LLVMBuildZExt(ctx->ac.builder, values[chan], ctx->ac.i32, "");
+		}
+	} else
+		memcpy(&args->out[0], values, sizeof(values[0]) * 4);
 
 	for (unsigned i = 0; i < 4; ++i) {
 		if (!(args->enabled_channels & (1 << i)))
@@ -2426,10 +2620,9 @@
 			output_usage_mask =
 				ctx->shader_info->info.tes.output_usage_mask[i];
 		} else {
-			/* Enable all channels for the GS copy shader because
-			 * we don't know the output usage mask currently.
-			 */
-			output_usage_mask = 0xf;
+			assert(ctx->is_gs_copy_shader);
+			output_usage_mask =
+				ctx->shader_info->info.gs.output_usage_mask[i];
 		}
 
 		radv_export_param(ctx, param_count, values, output_usage_mask);
@@ -2509,14 +2702,26 @@
 	for (unsigned i = 0; i < AC_LLVM_MAX_OUTPUTS; ++i) {
 		LLVMValueRef dw_addr = NULL;
 		LLVMValueRef *out_ptr = &ctx->abi.outputs[i * 4];
+		unsigned output_usage_mask;
 		int param_index;
 		int length = 4;
 
 		if (!(ctx->output_mask & (1ull << i)))
 			continue;
 
-		if (i == VARYING_SLOT_CLIP_DIST0)
+		if (ctx->stage == MESA_SHADER_VERTEX) {
+			output_usage_mask =
+				ctx->shader_info->info.vs.output_usage_mask[i];
+		} else {
+			assert(ctx->stage == MESA_SHADER_TESS_EVAL);
+			output_usage_mask =
+				ctx->shader_info->info.tes.output_usage_mask[i];
+		}
+
+		if (i == VARYING_SLOT_CLIP_DIST0) {
 			length = ctx->num_output_clips + ctx->num_output_culls;
+			output_usage_mask = (1 << length) - 1;
+		}
 
 		param_index = shader_io_get_unique_index(i);
 
@@ -2525,14 +2730,22 @@
 			                       LLVMConstInt(ctx->ac.i32, param_index * 4, false),
 			                       "");
 		}
+
 		for (j = 0; j < length; j++) {
+			if (!(output_usage_mask & (1 << j)))
+				continue;
+
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
-			out_val = LLVMBuildBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
+			out_val = ac_to_integer(&ctx->ac, out_val);
+			out_val = LLVMBuildZExtOrBitCast(ctx->ac.builder, out_val, ctx->ac.i32, "");
 
 			if (ctx->ac.chip_class  >= GFX9) {
-				ac_lds_store(&ctx->ac, dw_addr,
-					     LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""));
-				dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
+				LLVMValueRef dw_addr_offset =
+					LLVMBuildAdd(ctx->ac.builder, dw_addr,
+						     LLVMConstInt(ctx->ac.i32,
+								  j, false), "");
+
+				ac_lds_store(&ctx->ac, dw_addr_offset, out_val);
 			} else {
 				ac_build_buffer_store_dword(&ctx->ac,
 				                            ctx->esgs_ring,
@@ -2568,104 +2781,15 @@
 						    LLVMConstInt(ctx->ac.i32, param * 4, false),
 						    "");
 		for (unsigned j = 0; j < length; j++) {
-			ac_lds_store(&ctx->ac, dw_addr,
-				     LLVMBuildLoad(ctx->ac.builder, out_ptr[j], ""));
+			LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, out_ptr[j], "");
+			value = ac_to_integer(&ctx->ac, value);
+			value = LLVMBuildZExtOrBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
+			ac_lds_store(&ctx->ac, dw_addr, value);
 			dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr, ctx->ac.i32_1, "");
 		}
 	}
 }
 
-struct ac_build_if_state
-{
-	struct radv_shader_context *ctx;
-	LLVMValueRef condition;
-	LLVMBasicBlockRef entry_block;
-	LLVMBasicBlockRef true_block;
-	LLVMBasicBlockRef false_block;
-	LLVMBasicBlockRef merge_block;
-};
-
-static LLVMBasicBlockRef
-ac_build_insert_new_block(struct radv_shader_context *ctx, const char *name)
-{
-	LLVMBasicBlockRef current_block;
-	LLVMBasicBlockRef next_block;
-	LLVMBasicBlockRef new_block;
-
-	/* get current basic block */
-	current_block = LLVMGetInsertBlock(ctx->ac.builder);
-
-	/* chqeck if there's another block after this one */
-	next_block = LLVMGetNextBasicBlock(current_block);
-	if (next_block) {
-		/* insert the new block before the next block */
-		new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
-	}
-	else {
-		/* append new block after current block */
-		LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
-		new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
-	}
-	return new_block;
-}
-
-static void
-ac_nir_build_if(struct ac_build_if_state *ifthen,
-		struct radv_shader_context *ctx,
-		LLVMValueRef condition)
-{
-	LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->ac.builder);
-
-	memset(ifthen, 0, sizeof *ifthen);
-	ifthen->ctx = ctx;
-	ifthen->condition = condition;
-	ifthen->entry_block = block;
-
-	/* create endif/merge basic block for the phi functions */
-	ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
-
-	/* create/insert true_block before merge_block */
-	ifthen->true_block =
-		LLVMInsertBasicBlockInContext(ctx->context,
-					      ifthen->merge_block,
-					      "if-true-block");
-
-	/* successive code goes into the true block */
-	LLVMPositionBuilderAtEnd(ctx->ac.builder, ifthen->true_block);
-}
-
-/**
- * End a conditional.
- */
-static void
-ac_nir_build_endif(struct ac_build_if_state *ifthen)
-{
-	LLVMBuilderRef builder = ifthen->ctx->ac.builder;
-
-	/* Insert branch to the merge block from current block */
-	LLVMBuildBr(builder, ifthen->merge_block);
-
-	/*
-	 * Now patch in the various branch instructions.
-	 */
-
-	/* Insert the conditional branch instruction at the end of entry_block */
-	LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
-	if (ifthen->false_block) {
-		/* we have an else clause */
-		LLVMBuildCondBr(builder, ifthen->condition,
-				ifthen->true_block, ifthen->false_block);
-	}
-	else {
-		/* no else clause */
-		LLVMBuildCondBr(builder, ifthen->condition,
-				ifthen->true_block, ifthen->merge_block);
-	}
-
-	/* Resume building code at end of the ifthen->merge_block */
-	LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
-}
-
 static void
 write_tess_factors(struct radv_shader_context *ctx)
 {
@@ -2720,7 +2844,7 @@
 		outer[i] = LLVMGetUndef(ctx->ac.i32);
 	}
 
-	// LINES reverseal
+	// LINES reversal
 	if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
 		outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
 		lds_outer = LLVMBuildAdd(ctx->ac.builder, lds_outer,
@@ -2951,30 +3075,12 @@
 	}
 }
 
-static void ac_llvm_finalize_module(struct radv_shader_context *ctx)
+static void ac_llvm_finalize_module(struct radv_shader_context *ctx,
+				    LLVMPassManagerRef passmgr,
+				    const struct radv_nir_compiler_options *options)
 {
-	LLVMPassManagerRef passmgr;
-	/* Create the pass manager */
-	passmgr = LLVMCreateFunctionPassManagerForModule(
-							ctx->ac.module);
-
-	/* This pass should eliminate all the load and store instructions */
-	LLVMAddPromoteMemoryToRegisterPass(passmgr);
-
-	/* Add some optimization passes */
-	LLVMAddScalarReplAggregatesPass(passmgr);
-	LLVMAddLICMPass(passmgr);
-	LLVMAddAggressiveDCEPass(passmgr);
-	LLVMAddCFGSimplificationPass(passmgr);
-	LLVMAddInstructionCombiningPass(passmgr);
-
-	/* Run the pass */
-	LLVMInitializeFunctionPassManager(passmgr);
-	LLVMRunFunctionPassManager(passmgr, ctx->main_function);
-	LLVMFinalizeFunctionPassManager(passmgr);
-
+	LLVMRunPassManager(passmgr, ctx->ac.module);
 	LLVMDisposeBuilder(ctx->ac.builder);
-	LLVMDisposePassManager(passmgr);
 
 	ac_llvm_context_dispose(&ctx->ac);
 }
@@ -3015,9 +3121,16 @@
 static void
 ac_setup_rings(struct radv_shader_context *ctx)
 {
-	if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
-	    (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->options->key.tes.as_es)) {
-		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_VS, false));
+	if (ctx->options->chip_class <= VI &&
+	    (ctx->stage == MESA_SHADER_GEOMETRY ||
+	     ctx->options->key.vs.as_es || ctx->options->key.tes.as_es)) {
+		unsigned ring = ctx->stage == MESA_SHADER_GEOMETRY ? RING_ESGS_GS
+								   : RING_ESGS_VS;
+		LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, ring, false);
+
+		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac,
+						       ctx->ring_offsets,
+						       offset);
 	}
 
 	if (ctx->is_gs_copy_shader) {
@@ -3028,7 +3141,6 @@
 		uint32_t num_entries = 64;
 		LLVMValueRef gsvs_ring_stride = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size, false);
 		LLVMValueRef gsvs_ring_desc = LLVMConstInt(ctx->ac.i32, ctx->max_gsvs_emit_size << 16, false);
-		ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_GS, false));
 		ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
 
 		ctx->gsvs_ring = LLVMBuildBitCast(ctx->ac.builder, ctx->gsvs_ring, ctx->ac.v4i32, "");
@@ -3098,7 +3210,7 @@
 
 
 static
-LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
+LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm,
                                        struct nir_shader *const *shaders,
                                        int shader_count,
                                        struct radv_shader_variant_info *shader_info,
@@ -3108,18 +3220,10 @@
 	unsigned i;
 	ctx.options = options;
 	ctx.shader_info = shader_info;
-	ctx.context = LLVMContextCreate();
 
-	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
-			     options->family);
-	ctx.ac.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
-	LLVMSetTarget(ctx.ac.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
-
-	LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
-	char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
-	LLVMSetDataLayout(ctx.ac.module, data_layout_str);
-	LLVMDisposeTargetData(data_layout);
-	LLVMDisposeMessage(data_layout_str);
+	ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
+	ctx.context = ctx.ac.context;
+	ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
 
 	enum ac_float_mode float_mode =
 		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
@@ -3274,7 +3378,7 @@
 	if (options->dump_preoptir)
 		ac_dump_module(ctx.ac.module);
 
-	ac_llvm_finalize_module(&ctx);
+	ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);
 
 	if (shader_count == 1)
 		ac_nir_eliminate_const_vs_outputs(&ctx);
@@ -3304,15 +3408,10 @@
 
 static unsigned ac_llvm_compile(LLVMModuleRef M,
                                 struct ac_shader_binary *binary,
-                                LLVMTargetMachineRef tm)
+                                struct ac_llvm_compiler *ac_llvm)
 {
 	unsigned retval = 0;
-	char *err;
 	LLVMContextRef llvm_ctx;
-	LLVMMemoryBufferRef out_buffer;
-	unsigned buffer_size;
-	const char *buffer_data;
-	LLVMBool mem_err;
 
 	/* Setup Diagnostic Handler*/
 	llvm_ctx = LLVMGetModuleContext(M);
@@ -3321,31 +3420,12 @@
 	                                &retval);
 
 	/* Compile IR*/
-	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile,
-	                                              &err, &out_buffer);
-
-	/* Process Errors/Warnings */
-	if (mem_err) {
-		fprintf(stderr, "%s: %s", __FUNCTION__, err);
-		free(err);
+	if (!radv_compile_to_binary(ac_llvm, M, binary))
 		retval = 1;
-		goto out;
-	}
-
-	/* Extract Shader Code*/
-	buffer_size = LLVMGetBufferSize(out_buffer);
-	buffer_data = LLVMGetBufferStart(out_buffer);
-
-	ac_elf_read(buffer_data, buffer_size, binary);
-
-	/* Clean up */
-	LLVMDisposeMemoryBuffer(out_buffer);
-
-out:
 	return retval;
 }
 
-static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
+static void ac_compile_llvm_module(struct ac_llvm_compiler *ac_llvm,
 				   LLVMModuleRef llvm_module,
 				   struct ac_shader_binary *binary,
 				   struct ac_shader_config *config,
@@ -3364,7 +3444,7 @@
 		LLVMDisposeMessage(llvm_ir);
 	}
 
-	int v = ac_llvm_compile(llvm_module, binary, tm);
+	int v = ac_llvm_compile(llvm_module, binary, ac_llvm);
 	if (v) {
 		fprintf(stderr, "compile failed\n");
 	}
@@ -3474,7 +3554,7 @@
 }
 
 void
-radv_compile_nir_shader(LLVMTargetMachineRef tm,
+radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
 			struct ac_shader_binary *binary,
 			struct ac_shader_config *config,
 			struct radv_shader_variant_info *shader_info,
@@ -3485,10 +3565,10 @@
 
 	LLVMModuleRef llvm_module;
 
-	llvm_module = ac_translate_nir_to_llvm(tm, nir, nir_count, shader_info,
+	llvm_module = ac_translate_nir_to_llvm(ac_llvm, nir, nir_count, shader_info,
 	                                       options);
 
-	ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info,
+	ac_compile_llvm_module(ac_llvm, llvm_module, binary, config, shader_info,
 			       nir[0]->info.stage, options);
 
 	for (int i = 0; i < nir_count; ++i)
@@ -3537,6 +3617,12 @@
 						     vtx_offset, soffset,
 						     0, 1, 1, true, false);
 
+			LLVMTypeRef type = LLVMGetAllocatedType(ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
+			if (ac_get_type_size(type) == 2) {
+				value = LLVMBuildBitCast(ctx->ac.builder, value, ctx->ac.i32, "");
+				value = LLVMBuildTrunc(ctx->ac.builder, value, ctx->ac.i16, "");
+			}
+
 			LLVMBuildStore(ctx->ac.builder,
 				       ac_to_float(&ctx->ac, value), ctx->abi.outputs[ac_llvm_reg_index_soa(i, j)]);
 		}
@@ -3546,7 +3632,7 @@
 }
 
 void
-radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
+radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
 			    struct nir_shader *geom_shader,
 			    struct ac_shader_binary *binary,
 			    struct ac_shader_config *config,
@@ -3554,16 +3640,14 @@
 			    const struct radv_nir_compiler_options *options)
 {
 	struct radv_shader_context ctx = {0};
-	ctx.context = LLVMContextCreate();
 	ctx.options = options;
 	ctx.shader_info = shader_info;
 
-	ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
-			     options->family);
-	ctx.ac.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
+	ac_llvm_context_init(&ctx.ac, options->chip_class, options->family);
+	ctx.context = ctx.ac.context;
+	ctx.ac.module = ac_create_module(ac_llvm->tm, ctx.context);
 
 	ctx.is_gs_copy_shader = true;
-	LLVMSetTarget(ctx.ac.module, "amdgcn--");
 
 	enum ac_float_mode float_mode =
 		options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
@@ -3572,6 +3656,8 @@
 	ctx.ac.builder = ac_create_builder(ctx.context, float_mode);
 	ctx.stage = MESA_SHADER_VERTEX;
 
+	radv_nir_shader_info_pass(geom_shader, options, &shader_info->info);
+
 	create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);
 
 	ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
@@ -3590,8 +3676,8 @@
 
 	LLVMBuildRetVoid(ctx.ac.builder);
 
-	ac_llvm_finalize_module(&ctx);
+	ac_llvm_finalize_module(&ctx, ac_llvm->passmgr, options);
 
-	ac_compile_llvm_module(tm, ctx.ac.module, binary, config, shader_info,
+	ac_compile_llvm_module(ac_llvm, ctx.ac.module, binary, config, shader_info,
 			       MESA_SHADER_VERTEX, options);
 }
diff --git a/src/amd/vulkan/radv_pass.c b/src/amd/vulkan/radv_pass.c
index a7d54d7..9cd1b31 100644
--- a/src/amd/vulkan/radv_pass.c
+++ b/src/amd/vulkan/radv_pass.c
@@ -50,7 +50,7 @@
 	pass = vk_alloc2(&device->alloc, pAllocator, size, 8,
 			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pass == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	memset(pass, 0, size);
 	pass->attachment_count = pCreateInfo->attachmentCount;
@@ -80,7 +80,7 @@
 		// att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp;
 	}
 	uint32_t subpass_attachment_count = 0;
-	VkAttachmentReference *p;
+	struct radv_subpass_attachment *p;
 	for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
 		const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
 
@@ -94,11 +94,11 @@
 	if (subpass_attachment_count) {
 		pass->subpass_attachments =
 			vk_alloc2(&device->alloc, pAllocator,
-				    subpass_attachment_count * sizeof(VkAttachmentReference), 8,
+				    subpass_attachment_count * sizeof(struct radv_subpass_attachment), 8,
 				    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 		if (pass->subpass_attachments == NULL) {
 			vk_free2(&device->alloc, pAllocator, pass);
-			return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 		}
 	} else
 		pass->subpass_attachments = NULL;
@@ -119,8 +119,10 @@
 			p += desc->inputAttachmentCount;
 
 			for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
-				subpass->input_attachments[j]
-					= desc->pInputAttachments[j];
+				subpass->input_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pInputAttachments[j].attachment,
+					.layout = desc->pInputAttachments[j].layout,
+				};
 				if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED)
 					pass->attachments[desc->pInputAttachments[j].attachment].view_mask |= subpass->view_mask;
 			}
@@ -131,8 +133,10 @@
 			p += desc->colorAttachmentCount;
 
 			for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
-				subpass->color_attachments[j]
-					= desc->pColorAttachments[j];
+				subpass->color_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pColorAttachments[j].attachment,
+					.layout = desc->pColorAttachments[j].layout,
+				};
 				if (desc->pColorAttachments[j].attachment != VK_ATTACHMENT_UNUSED) {
 					pass->attachments[desc->pColorAttachments[j].attachment].view_mask |= subpass->view_mask;
 					color_sample_count = pCreateInfo->pAttachments[desc->pColorAttachments[j].attachment].samples;
@@ -147,8 +151,10 @@
 
 			for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
 				uint32_t a = desc->pResolveAttachments[j].attachment;
-				subpass->resolve_attachments[j]
-					= desc->pResolveAttachments[j];
+				subpass->resolve_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pResolveAttachments[j].attachment,
+					.layout = desc->pResolveAttachments[j].layout,
+				};
 				if (a != VK_ATTACHMENT_UNUSED) {
 					subpass->has_resolve = true;
 					pass->attachments[desc->pResolveAttachments[j].attachment].view_mask |= subpass->view_mask;
@@ -157,8 +163,167 @@
 		}
 
 		if (desc->pDepthStencilAttachment) {
-			subpass->depth_stencil_attachment =
-				*desc->pDepthStencilAttachment;
+			subpass->depth_stencil_attachment = (struct radv_subpass_attachment) {
+				.attachment = desc->pDepthStencilAttachment->attachment,
+				.layout = desc->pDepthStencilAttachment->layout,
+			};
+			if (desc->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
+				pass->attachments[desc->pDepthStencilAttachment->attachment].view_mask |= subpass->view_mask;
+				depth_sample_count = pCreateInfo->pAttachments[desc->pDepthStencilAttachment->attachment].samples;
+			}
+		} else {
+			subpass->depth_stencil_attachment.attachment = VK_ATTACHMENT_UNUSED;
+		}
+
+		subpass->max_sample_count = MAX2(color_sample_count,
+						 depth_sample_count);
+	}
+
+	for (unsigned i = 0; i < pCreateInfo->dependencyCount; ++i) {
+		uint32_t dst = pCreateInfo->pDependencies[i].dstSubpass;
+		if (dst == VK_SUBPASS_EXTERNAL) {
+			pass->end_barrier.src_stage_mask = pCreateInfo->pDependencies[i].srcStageMask;
+			pass->end_barrier.src_access_mask = pCreateInfo->pDependencies[i].srcAccessMask;
+			pass->end_barrier.dst_access_mask = pCreateInfo->pDependencies[i].dstAccessMask;
+		} else {
+			pass->subpasses[dst].start_barrier.src_stage_mask = pCreateInfo->pDependencies[i].srcStageMask;
+			pass->subpasses[dst].start_barrier.src_access_mask = pCreateInfo->pDependencies[i].srcAccessMask;
+			pass->subpasses[dst].start_barrier.dst_access_mask = pCreateInfo->pDependencies[i].dstAccessMask;
+		}
+	}
+
+	*pRenderPass = radv_render_pass_to_handle(pass);
+
+	return VK_SUCCESS;
+}
+
+VkResult radv_CreateRenderPass2KHR(
+    VkDevice                                    _device,
+    const VkRenderPassCreateInfo2KHR*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkRenderPass*                               pRenderPass)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	struct radv_render_pass *pass;
+	size_t size;
+	size_t attachments_offset;
+
+	assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
+
+	size = sizeof(*pass);
+	size += pCreateInfo->subpassCount * sizeof(pass->subpasses[0]);
+	attachments_offset = size;
+	size += pCreateInfo->attachmentCount * sizeof(pass->attachments[0]);
+
+	pass = vk_alloc2(&device->alloc, pAllocator, size, 8,
+			   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+	if (pass == NULL)
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+
+	memset(pass, 0, size);
+	pass->attachment_count = pCreateInfo->attachmentCount;
+	pass->subpass_count = pCreateInfo->subpassCount;
+	pass->attachments = (void *) pass + attachments_offset;
+
+	for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+		struct radv_render_pass_attachment *att = &pass->attachments[i];
+
+		att->format = pCreateInfo->pAttachments[i].format;
+		att->samples = pCreateInfo->pAttachments[i].samples;
+		att->load_op = pCreateInfo->pAttachments[i].loadOp;
+		att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
+		att->initial_layout =  pCreateInfo->pAttachments[i].initialLayout;
+		att->final_layout =  pCreateInfo->pAttachments[i].finalLayout;
+		// att->store_op = pCreateInfo->pAttachments[i].storeOp;
+		// att->stencil_store_op = pCreateInfo->pAttachments[i].stencilStoreOp;
+	}
+	uint32_t subpass_attachment_count = 0;
+	struct radv_subpass_attachment *p;
+	for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+		const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
+
+		subpass_attachment_count +=
+			desc->inputAttachmentCount +
+			desc->colorAttachmentCount +
+			(desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
+			(desc->pDepthStencilAttachment != NULL);
+	}
+
+	if (subpass_attachment_count) {
+		pass->subpass_attachments =
+			vk_alloc2(&device->alloc, pAllocator,
+				    subpass_attachment_count * sizeof(struct radv_subpass_attachment), 8,
+				    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+		if (pass->subpass_attachments == NULL) {
+			vk_free2(&device->alloc, pAllocator, pass);
+			return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
+		}
+	} else
+		pass->subpass_attachments = NULL;
+
+	p = pass->subpass_attachments;
+	for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+		const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
+		uint32_t color_sample_count = 1, depth_sample_count = 1;
+		struct radv_subpass *subpass = &pass->subpasses[i];
+
+		subpass->input_count = desc->inputAttachmentCount;
+		subpass->color_count = desc->colorAttachmentCount;
+		subpass->view_mask = desc->viewMask;
+
+		if (desc->inputAttachmentCount > 0) {
+			subpass->input_attachments = p;
+			p += desc->inputAttachmentCount;
+
+			for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+				subpass->input_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pInputAttachments[j].attachment,
+					.layout = desc->pInputAttachments[j].layout,
+				};
+				if (desc->pInputAttachments[j].attachment != VK_ATTACHMENT_UNUSED)
+					pass->attachments[desc->pInputAttachments[j].attachment].view_mask |= subpass->view_mask;
+			}
+		}
+
+		if (desc->colorAttachmentCount > 0) {
+			subpass->color_attachments = p;
+			p += desc->colorAttachmentCount;
+
+			for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+				subpass->color_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pColorAttachments[j].attachment,
+					.layout = desc->pColorAttachments[j].layout,
+				};
+				if (desc->pColorAttachments[j].attachment != VK_ATTACHMENT_UNUSED) {
+					pass->attachments[desc->pColorAttachments[j].attachment].view_mask |= subpass->view_mask;
+					color_sample_count = pCreateInfo->pAttachments[desc->pColorAttachments[j].attachment].samples;
+				}
+			}
+		}
+
+		subpass->has_resolve = false;
+		if (desc->pResolveAttachments) {
+			subpass->resolve_attachments = p;
+			p += desc->colorAttachmentCount;
+
+			for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+				uint32_t a = desc->pResolveAttachments[j].attachment;
+				subpass->resolve_attachments[j] = (struct radv_subpass_attachment) {
+					.attachment = desc->pResolveAttachments[j].attachment,
+					.layout = desc->pResolveAttachments[j].layout,
+				};
+				if (a != VK_ATTACHMENT_UNUSED) {
+					subpass->has_resolve = true;
+					pass->attachments[desc->pResolveAttachments[j].attachment].view_mask |= subpass->view_mask;
+				}
+			}
+		}
+
+		if (desc->pDepthStencilAttachment) {
+			subpass->depth_stencil_attachment = (struct radv_subpass_attachment) {
+				.attachment = desc->pDepthStencilAttachment->attachment,
+				.layout = desc->pDepthStencilAttachment->layout,
+			};
 			if (desc->pDepthStencilAttachment->attachment != VK_ATTACHMENT_UNUSED) {
 				pass->attachments[desc->pDepthStencilAttachment->attachment].view_mask |= subpass->view_mask;
 				depth_sample_count = pCreateInfo->pAttachments[desc->pDepthStencilAttachment->attachment].samples;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 501bc6f..6a51efa 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -174,7 +174,7 @@
 	if (scratch_bytes_per_wave && max_waves < min_waves) {
 		/* Not really true at this moment, but will be true on first
 		 * execution. Avoid having hanging shaders. */
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 	pipeline->scratch_bytes_per_wave = scratch_bytes_per_wave;
 	pipeline->max_waves = max_waves;
@@ -1780,13 +1780,13 @@
 				ac_lower_indirect_derefs(ordered_shaders[i],
 				                         pipeline->device->physical_device->rad_info.chip_class);
 			}
-			radv_optimize_nir(ordered_shaders[i]);
+			radv_optimize_nir(ordered_shaders[i], false);
 
 			if (nir_lower_global_vars_to_local(ordered_shaders[i - 1])) {
 				ac_lower_indirect_derefs(ordered_shaders[i - 1],
 				                         pipeline->device->physical_device->rad_info.chip_class);
 			}
-			radv_optimize_nir(ordered_shaders[i - 1]);
+			radv_optimize_nir(ordered_shaders[i - 1], false);
 		}
 	}
 }
@@ -1806,6 +1806,9 @@
 	struct radv_pipeline_key key;
 	memset(&key, 0, sizeof(key));
 
+	if (pCreateInfo->flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)
+		key.optimisations_disabled = 1;
+
 	key.has_multiview_view_index = has_view_index;
 
 	uint32_t binding_input_rate = 0;
@@ -1865,8 +1868,7 @@
 	    pCreateInfo->pMultisampleState->rasterizationSamples > 1) {
 		uint32_t num_samples = pCreateInfo->pMultisampleState->rasterizationSamples;
 		uint32_t ps_iter_samples = radv_pipeline_get_ps_iter_samples(pCreateInfo->pMultisampleState);
-		key.multisample = true;
-		key.log2_num_samples = util_logbase2(num_samples);
+		key.num_samples = num_samples;
 		key.log2_ps_iter_samples = util_logbase2(ps_iter_samples);
 	}
 
@@ -1906,12 +1908,11 @@
 	for(int i = 0; i < MESA_SHADER_STAGES; ++i)
 		keys[i].has_multiview_view_index = key->has_multiview_view_index;
 
-	keys[MESA_SHADER_FRAGMENT].fs.multisample = key->multisample;
 	keys[MESA_SHADER_FRAGMENT].fs.col_format = key->col_format;
 	keys[MESA_SHADER_FRAGMENT].fs.is_int8 = key->is_int8;
 	keys[MESA_SHADER_FRAGMENT].fs.is_int10 = key->is_int10;
 	keys[MESA_SHADER_FRAGMENT].fs.log2_ps_iter_samples = key->log2_ps_iter_samples;
-	keys[MESA_SHADER_FRAGMENT].fs.log2_num_samples = key->log2_num_samples;
+	keys[MESA_SHADER_FRAGMENT].fs.num_samples = key->num_samples;
 }
 
 static void
@@ -1958,7 +1959,8 @@
                          struct radv_device *device,
                          struct radv_pipeline_cache *cache,
                          struct radv_pipeline_key key,
-                         const VkPipelineShaderStageCreateInfo **pStages)
+                         const VkPipelineShaderStageCreateInfo **pStages,
+                         const VkPipelineCreateFlags flags)
 {
 	struct radv_shader_module fs_m = {0};
 	struct radv_shader_module *modules[MESA_SHADER_STAGES] = { 0, };
@@ -2022,7 +2024,8 @@
 
 		nir[i] = radv_shader_compile_to_nir(device, modules[i],
 						    stage ? stage->pName : "main", i,
-						    stage ? stage->pSpecializationInfo : NULL);
+						    stage ? stage->pSpecializationInfo : NULL,
+						    flags);
 
 		/* We don't want to alter meta shaders IR directly so clone it
 		 * first.
@@ -2040,20 +2043,23 @@
 			if (i != last)
 				mask = mask | nir_var_shader_out;
 
-			nir_lower_io_to_scalar_early(nir[i], mask);
-			radv_optimize_nir(nir[i]);
+			if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT)) {
+				nir_lower_io_to_scalar_early(nir[i], mask);
+				radv_optimize_nir(nir[i], false);
+			}
 		}
 	}
 
 	if (nir[MESA_SHADER_TESS_CTRL]) {
-		nir_lower_tes_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out);
+		nir_lower_patch_vertices(nir[MESA_SHADER_TESS_EVAL], nir[MESA_SHADER_TESS_CTRL]->info.tess.tcs_vertices_out, NULL);
 		merge_tess_info(&nir[MESA_SHADER_TESS_EVAL]->info, &nir[MESA_SHADER_TESS_CTRL]->info);
 	}
 
-	radv_link_shaders(pipeline, nir);
+	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
+		radv_link_shaders(pipeline, nir);
 
 	for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
-		if (modules[i] && radv_can_dump_shader(device, modules[i]))
+		if (radv_can_dump_shader(device, modules[i], false))
 			nir_print_shader(nir[i], stderr);
 	}
 
@@ -2487,7 +2493,7 @@
 }
 
 static void
-radv_pipeline_generate_binning_state(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_binning_state(struct radeon_cmdbuf *cs,
 				     struct radv_pipeline *pipeline,
 				     const VkGraphicsPipelineCreateInfo *pCreateInfo)
 {
@@ -2508,6 +2514,7 @@
 	switch (pipeline->device->physical_device->rad_info.family) {
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
+	case CHIP_VEGA20:
 		context_states_per_bin = 1;
 		persistent_states_per_bin = 1;
 		fpovs_per_batch = 63;
@@ -2543,7 +2550,7 @@
 
 
 static void
-radv_pipeline_generate_depth_stencil_state(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *cs,
                                            struct radv_pipeline *pipeline,
                                            const VkGraphicsPipelineCreateInfo *pCreateInfo,
                                            const struct radv_graphics_pipeline_create_info *extra)
@@ -2625,7 +2632,7 @@
 }
 
 static void
-radv_pipeline_generate_blend_state(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_blend_state(struct radeon_cmdbuf *cs,
                                    struct radv_pipeline *pipeline,
                                    const struct radv_blend_state *blend)
 {
@@ -2652,7 +2659,7 @@
 
 
 static void
-radv_pipeline_generate_raster_state(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs,
                                     const VkGraphicsPipelineCreateInfo *pCreateInfo)
 {
 	const VkPipelineRasterizationStateCreateInfo *vkraster = pCreateInfo->pRasterizationState;
@@ -2693,7 +2700,7 @@
 
 
 static void
-radv_pipeline_generate_multisample_state(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *cs,
                                          struct radv_pipeline *pipeline)
 {
 	struct radv_multisample_state *ms = &pipeline->graphics.ms;
@@ -2704,39 +2711,10 @@
 
 	radeon_set_context_reg(cs, R_028804_DB_EQAA, ms->db_eqaa);
 	radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
-
-	if (pipeline->shaders[MESA_SHADER_FRAGMENT]->info.info.ps.needs_sample_positions) {
-		uint32_t offset;
-		struct radv_userdata_info *loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_FRAGMENT, AC_UD_PS_SAMPLE_POS_OFFSET);
-		uint32_t base_reg = pipeline->user_data_0[MESA_SHADER_FRAGMENT];
-		if (loc->sgpr_idx == -1)
-			return;
-		assert(loc->num_sgprs == 1);
-		assert(!loc->indirect);
-		switch (pipeline->graphics.ms.num_samples) {
-		default:
-			offset = 0;
-			break;
-		case 2:
-			offset = 1;
-			break;
-		case 4:
-			offset = 3;
-			break;
-		case 8:
-			offset = 7;
-			break;
-		case 16:
-			offset = 15;
-			break;
-		}
-
-		radeon_set_sh_reg(cs, base_reg + loc->sgpr_idx * 4, offset);
-	}
 }
 
 static void
-radv_pipeline_generate_vgt_gs_mode(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *cs,
                                    const struct radv_pipeline *pipeline)
 {
 	const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
@@ -2760,7 +2738,7 @@
 }
 
 static void
-radv_pipeline_generate_hw_vs(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
 			     struct radv_shader_variant *shader)
 {
@@ -2819,7 +2797,7 @@
 }
 
 static void
-radv_pipeline_generate_hw_es(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_hw_es(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
 			     struct radv_shader_variant *shader)
 {
@@ -2833,7 +2811,7 @@
 }
 
 static void
-radv_pipeline_generate_hw_ls(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_hw_ls(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
 			     struct radv_shader_variant *shader,
 			     const struct radv_tessellation_state *tess)
@@ -2856,7 +2834,7 @@
 }
 
 static void
-radv_pipeline_generate_hw_hs(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
 			     struct radv_pipeline *pipeline,
 			     struct radv_shader_variant *shader,
 			     const struct radv_tessellation_state *tess)
@@ -2882,7 +2860,7 @@
 }
 
 static void
-radv_pipeline_generate_vertex_shader(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *cs,
 				     struct radv_pipeline *pipeline,
 				     const struct radv_tessellation_state *tess)
 {
@@ -2902,7 +2880,7 @@
 }
 
 static void
-radv_pipeline_generate_tess_shaders(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *cs,
 				    struct radv_pipeline *pipeline,
 				    const struct radv_tessellation_state *tess)
 {
@@ -2935,7 +2913,7 @@
 }
 
 static void
-radv_pipeline_generate_geometry_shader(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs,
 				       struct radv_pipeline *pipeline,
 				       const struct radv_gs_state *gs_state)
 {
@@ -3015,7 +2993,7 @@
 }
 
 static void
-radv_pipeline_generate_ps_inputs(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *cs,
                                  struct radv_pipeline *pipeline)
 {
 	struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
@@ -3119,7 +3097,7 @@
 }
 
 static void
-radv_pipeline_generate_fragment_shader(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *cs,
 				       struct radv_pipeline *pipeline)
 {
 	struct radv_shader_variant *ps;
@@ -3162,7 +3140,7 @@
 }
 
 static void
-radv_pipeline_generate_vgt_vertex_reuse(struct radeon_winsys_cs *cs,
+radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *cs,
 					struct radv_pipeline *pipeline)
 {
 	if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10)
@@ -3300,8 +3278,9 @@
 		}
 	}
 	/* GS requirement. */
-	if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
-		ia_multi_vgt_param.partial_es_wave = true;
+	if (radv_pipeline_has_gs(pipeline) && device->physical_device->rad_info.chip_class <= VI)
+		if (SI_GS_PER_ES / ia_multi_vgt_param.primgroup_size >= pipeline->device->gs_table_depth - 3)
+			ia_multi_vgt_param.partial_es_wave = true;
 
 	ia_multi_vgt_param.wd_switch_on_eop = false;
 	if (device->physical_device->rad_info.chip_class >= CIK) {
@@ -3358,6 +3337,17 @@
 		}
 	}
 
+	/* Workaround for a VGT hang when strip primitive types are used with
+	 * primitive restart.
+	 */
+	if (pipeline->graphics.prim_restart_enable &&
+	    (prim == V_008958_DI_PT_LINESTRIP ||
+	     prim == V_008958_DI_PT_TRISTRIP ||
+	     prim == V_008958_DI_PT_LINESTRIP_ADJ ||
+	     prim == V_008958_DI_PT_TRISTRIP_ADJ)) {
+		ia_multi_vgt_param.partial_vs_wave = true;
+	}
+
 	ia_multi_vgt_param.base =
 		S_028AA8_PRIMGROUP_SIZE(ia_multi_vgt_param.primgroup_size - 1) |
 		/* The following field was moved to VGT_SHADER_STAGES_EN in GFX9. */
@@ -3442,7 +3432,7 @@
 
 	radv_create_shaders(pipeline, device, cache, 
 	                    radv_generate_graphics_pipeline_key(pipeline, pCreateInfo, &blend, has_view_index),
-	                    pStages);
+	                    pStages, pCreateInfo->flags);
 
 	pipeline->graphics.spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
 	radv_pipeline_init_multisample_state(pipeline, &blend, pCreateInfo);
@@ -3549,7 +3539,7 @@
 	pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pipeline == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	result = radv_pipeline_init(pipeline, device, cache,
 				    pCreateInfo, extra, pAllocator);
@@ -3668,14 +3658,14 @@
 	pipeline = vk_zalloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
 			      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (pipeline == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	pipeline->device = device;
 	pipeline->layout = radv_pipeline_layout_from_handle(pCreateInfo->layout);
 	assert(pipeline->layout);
 
 	pStages[MESA_SHADER_COMPUTE] = &pCreateInfo->stage;
-	radv_create_shaders(pipeline, device, cache, (struct radv_pipeline_key) {0}, pStages);
+	radv_create_shaders(pipeline, device, cache, (struct radv_pipeline_key) {0}, pStages, pCreateInfo->flags);
 
 	pipeline->user_data_0[MESA_SHADER_COMPUTE] = radv_pipeline_stage_to_user_data_0(pipeline, MESA_SHADER_COMPUTE, device->physical_device->rad_info.chip_class);
 	pipeline->need_indirect_descriptor_sets |= pipeline->shaders[MESA_SHADER_COMPUTE]->info.need_indirect_descriptor_sets;
diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c
index 8aa7bd6..7e2c305 100644
--- a/src/amd/vulkan/radv_pipeline_cache.c
+++ b/src/amd/vulkan/radv_pipeline_cache.c
@@ -206,7 +206,7 @@
 
 	table = malloc(byte_size);
 	if (table == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(cache->device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	cache->hash_table = table;
 	cache->table_size = table_size;
@@ -514,7 +514,7 @@
 			    sizeof(*cache), 8,
 			    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (cache == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	if (pAllocator)
 		cache->alloc = *pAllocator;
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 3f997d3..a187f76 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -57,8 +57,11 @@
 #include "ac_nir_to_llvm.h"
 #include "ac_gpu_info.h"
 #include "ac_surface.h"
+#include "ac_llvm_build.h"
+#include "ac_llvm_util.h"
 #include "radv_descriptor_set.h"
 #include "radv_extensions.h"
+#include "radv_cs.h"
 
 #include <llvm-c/TargetMachine.h>
 
@@ -77,6 +80,7 @@
 #include "radv_entrypoints.h"
 
 #include "wsi_common.h"
+#include "wsi_common_display.h"
 
 #define ATI_VENDOR_ID 0x1002
 
@@ -215,20 +219,19 @@
  * propagating errors. Might be useful to plug in a stack trace here.
  */
 
-VkResult __vk_errorf(VkResult error, const char *file, int line, const char *format, ...);
+struct radv_instance;
 
-#ifdef DEBUG
-#define vk_error(error) __vk_errorf(error, __FILE__, __LINE__, NULL);
-#define vk_errorf(error, format, ...) __vk_errorf(error, __FILE__, __LINE__, format, ## __VA_ARGS__);
-#else
-#define vk_error(error) error
-#define vk_errorf(error, format, ...) error
-#endif
+VkResult __vk_errorf(struct radv_instance *instance, VkResult error, const char *file, int line, const char *format, ...);
+/* No trailing ';' in the expansions: callers supply their own, so these also work inside expressions and if/else. */
+#define vk_error(instance, error) __vk_errorf(instance, error, __FILE__, __LINE__, NULL)
+#define vk_errorf(instance, error, format, ...) __vk_errorf(instance, error, __FILE__, __LINE__, format, ## __VA_ARGS__)
 
 void __radv_finishme(const char *file, int line, const char *format, ...)
 	radv_printflike(3, 4);
 void radv_loge(const char *format, ...) radv_printflike(1, 2);
 void radv_loge_v(const char *format, va_list va);
+void radv_logi(const char *format, ...) radv_printflike(1, 2);
+void radv_logi_v(const char *format, va_list va);
 
 /**
  * Print a FINISHME message, including its source location.
@@ -284,6 +287,7 @@
 	uint8_t                                     cache_uuid[VK_UUID_SIZE];
 
 	int local_fd;
+	int master_fd;
 	struct wsi_device                       wsi_device;
 
 	bool has_rbplus; /* if RB+ register exist */
@@ -358,9 +362,9 @@
 	uint32_t is_int8;
 	uint32_t is_int10;
 	uint8_t log2_ps_iter_samples;
-	uint8_t log2_num_samples;
-	uint32_t multisample : 1;
+	uint8_t num_samples;
 	uint32_t has_multiview_view_index : 1;
+	uint32_t optimisations_disabled : 1;
 };
 
 void
@@ -597,9 +601,9 @@
 	struct radeon_winsys_bo *esgs_ring_bo;
 	struct radeon_winsys_bo *gsvs_ring_bo;
 	struct radeon_winsys_bo *tess_rings_bo;
-	struct radeon_winsys_cs *initial_preamble_cs;
-	struct radeon_winsys_cs *initial_full_flush_preamble_cs;
-	struct radeon_winsys_cs *continue_preamble_cs;
+	struct radeon_cmdbuf *initial_preamble_cs;
+	struct radeon_cmdbuf *initial_full_flush_preamble_cs;
+	struct radeon_cmdbuf *continue_preamble_cs;
 };
 
 struct radv_bo_list {
@@ -620,10 +624,9 @@
 
 	struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
 	int queue_count[RADV_MAX_QUEUE_FAMILIES];
-	struct radeon_winsys_cs *empty_cs[RADV_MAX_QUEUE_FAMILIES];
+	struct radeon_cmdbuf *empty_cs[RADV_MAX_QUEUE_FAMILIES];
 
 	bool always_use_syncobj;
-	bool llvm_supports_spill;
 	bool has_distributed_tess;
 	bool pbb_allowed;
 	bool dfsm_allowed;
@@ -831,6 +834,9 @@
 	RADV_CMD_FLAG_PS_PARTIAL_FLUSH = 1 << 10,
 	RADV_CMD_FLAG_CS_PARTIAL_FLUSH = 1 << 11,
 	RADV_CMD_FLAG_VGT_FLUSH        = 1 << 12,
+	/* Pipeline query controls. */
+	RADV_CMD_FLAG_START_PIPELINE_STATS = 1 << 13,
+	RADV_CMD_FLAG_STOP_PIPELINE_STATS  = 1 << 14,
 
 	RADV_CMD_FLUSH_AND_INV_FRAMEBUFFER = (RADV_CMD_FLAG_FLUSH_AND_INV_CB |
 					      RADV_CMD_FLAG_FLUSH_AND_INV_CB_META |
@@ -965,6 +971,7 @@
 	enum radv_cmd_flush_bits                     flush_bits;
 	unsigned                                     active_occlusion_queries;
 	bool                                         perfect_occlusion_queries_enabled;
+	unsigned                                     active_pipeline_queries;
 	float					     offset_scale;
 	uint32_t                                      trace_id;
 	uint32_t                                      last_ia_multi_vgt_param;
@@ -975,6 +982,10 @@
 
 	/* Whether CP DMA is busy/idle. */
 	bool dma_is_busy;
+
+	/* Conditional rendering info. */
+	int predication_type; /* -1: disabled, 0: normal, 1: inverted */
+	uint64_t predication_va;
 };
 
 struct radv_cmd_pool {
@@ -1011,7 +1022,7 @@
 	VkCommandBufferUsageFlags                    usage_flags;
 	VkCommandBufferLevel                         level;
 	enum radv_cmd_buffer_status status;
-	struct radeon_winsys_cs *cs;
+	struct radeon_cmdbuf *cs;
 	struct radv_cmd_state state;
 	struct radv_vertex_binding                   vertex_bindings[MAX_VBS];
 	uint32_t queue_family_index;
@@ -1033,7 +1044,6 @@
 
 	VkResult record_result;
 
-	int ring_offsets_idx; /* just used for verification */
 	uint32_t gfx9_fence_offset;
 	struct radeon_winsys_bo *gfx9_fence_bo;
 	uint32_t gfx9_fence_idx;
@@ -1054,16 +1064,15 @@
 
 void cik_create_gfx_config(struct radv_device *device);
 
-void si_write_viewport(struct radeon_winsys_cs *cs, int first_vp,
+void si_write_viewport(struct radeon_cmdbuf *cs, int first_vp,
 		       int count, const VkViewport *viewports);
-void si_write_scissors(struct radeon_winsys_cs *cs, int first,
+void si_write_scissors(struct radeon_cmdbuf *cs, int first,
 		       int count, const VkRect2D *scissors,
 		       const VkViewport *viewports, bool can_use_guardband);
 uint32_t si_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer,
 				   bool instanced_draw, bool indirect_draw,
 				   uint32_t draw_vertex_count);
-void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
-				bool predicated,
+void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
 				enum chip_class chip_class,
 				bool is_mec,
 				unsigned event, unsigned event_flags,
@@ -1073,18 +1082,18 @@
 				uint32_t new_fence,
 				uint64_t gfx9_eop_bug_va);
 
-void si_emit_wait_fence(struct radeon_winsys_cs *cs,
-			bool predicated,
+void si_emit_wait_fence(struct radeon_cmdbuf *cs,
 			uint64_t va, uint32_t ref,
 			uint32_t mask);
-void si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
+void si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
 			    enum chip_class chip_class,
 			    uint32_t *fence_ptr, uint64_t va,
 			    bool is_mec,
 			    enum radv_cmd_flush_bits flush_bits,
 			    uint64_t gfx9_eop_bug_va);
 void si_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer);
-void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va);
+void si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
+				   bool inverted, uint64_t va);
 void si_cp_dma_buffer_copy(struct radv_cmd_buffer *cmd_buffer,
 			   uint64_t src_va, uint64_t dest_va,
 			   uint64_t size);
@@ -1114,17 +1123,20 @@
 void radv_cmd_buffer_resolve_subpass(struct radv_cmd_buffer *cmd_buffer);
 void radv_cmd_buffer_resolve_subpass_cs(struct radv_cmd_buffer *cmd_buffer);
 void radv_cmd_buffer_resolve_subpass_fs(struct radv_cmd_buffer *cmd_buffer);
-void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples);
+void radv_cayman_emit_msaa_sample_locs(struct radeon_cmdbuf *cs, int nr_samples);
 unsigned radv_cayman_get_maxdist(int log_samples);
 void radv_device_init_msaa(struct radv_device *device);
-void radv_set_depth_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       VkClearDepthStencilValue ds_clear_value,
-			       VkImageAspectFlags aspects);
-void radv_set_color_clear_regs(struct radv_cmd_buffer *cmd_buffer,
-			       struct radv_image *image,
-			       int idx,
-			       uint32_t color_values[2]);
+
+void radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+				   struct radv_image *image,
+				   VkClearDepthStencilValue ds_clear_value,
+				   VkImageAspectFlags aspects);
+
+void radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer,
+				      struct radv_image *image,
+				      int cb_idx,
+				      uint32_t color_values[2]);
+
 void radv_set_dcc_need_cmask_elim_pred(struct radv_cmd_buffer *cmd_buffer,
 				       struct radv_image *image,
 				       bool value);
@@ -1136,6 +1148,41 @@
 			struct radv_device_memory *memory,
 			int *pFD);
 
+static inline void
+radv_emit_shader_pointer_head(struct radeon_cmdbuf *cs, unsigned sh_offset,
+			      unsigned pointer_count, bool use_32bit_pointers)
+{
+	const unsigned dwords_per_pointer = use_32bit_pointers ? 1 : 2; /* payload dwords emitted per pointer */
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, pointer_count * dwords_per_pointer, 0)); /* SET_SH_REG packet header */
+	radeon_emit(cs, (sh_offset - SI_SH_REG_OFFSET) >> 2); /* offset relative to SI_SH_REG_OFFSET, divided by 4 */
+}
+
+/* Emit one pointer value: the low dword always, the high dword only when 32-bit pointers are off. */
+static inline void
+radv_emit_shader_pointer_body(struct radv_device *device, struct radeon_cmdbuf *cs,
+			      uint64_t va, bool use_32bit_pointers)
+{
+	radeon_emit(cs, va); /* radeon_emit takes a uint32_t, so this writes the low 32 bits */
+	if (!use_32bit_pointers) {
+		radeon_emit(cs, va >> 32);
+		return;
+	}
+	/* High half is not emitted here; check it is NULL or matches rad_info.address32_hi. */
+	assert(va == 0 ||
+	       (va >> 32) == device->physical_device->rad_info.address32_hi);
+}
+
+/* Emit a complete SET_SH_REG packet (header followed by value) for a single pointer at sh_offset. */
+static inline void
+radv_emit_shader_pointer(struct radv_device *device, struct radeon_cmdbuf *cs,
+			 uint32_t sh_offset, uint64_t va, bool global)
+{
+	/* Pointers flagged as global never use the 32-bit form. */
+	const bool narrow = HAVE_32BIT_POINTERS && !global;
+	radv_emit_shader_pointer_head(cs, sh_offset, 1, narrow);
+	radv_emit_shader_pointer_body(device, cs, va, narrow);
+}
+
 static inline struct radv_descriptor_state *
 radv_get_descriptors_state(struct radv_cmd_buffer *cmd_buffer,
 			   VkPipelineBindPoint bind_point)
@@ -1242,7 +1289,7 @@
 	struct radv_shader_variant *gs_copy_shader;
 	VkShaderStageFlags                           active_stages;
 
-	struct radeon_winsys_cs                      cs;
+	struct radeon_cmdbuf                      cs;
 
 	struct radv_vertex_elements_info             vertex_elements;
 
@@ -1452,6 +1499,17 @@
 }
 
 /**
+ * Return whether the image has CB metadata.
+ */
+static inline bool
+radv_image_has_CB_metadata(const struct radv_image *image)
+{
+	if (radv_image_has_cmask(image) || radv_image_has_fmask(image))
+		return true; /* CMASK or FMASK present */
+	return radv_image_has_dcc(image); /* otherwise it comes down to DCC */
+}
+
+/**
  * Return whether the image has HTILE metadata for depth surfaces.
  */
 static inline bool
@@ -1660,13 +1718,21 @@
 	VkAccessFlags        dst_access_mask;
 };
 
+void radv_subpass_barrier(struct radv_cmd_buffer *cmd_buffer,
+			  const struct radv_subpass_barrier *barrier);
+
+struct radv_subpass_attachment { /* takes the place of VkAttachmentReference in radv_subpass and radv_render_pass */
+	uint32_t         attachment; /* presumably the attachment index, as in VkAttachmentReference::attachment -- confirm */
+	VkImageLayout    layout; /* image layout paired with this attachment reference */
+};
+
 struct radv_subpass {
 	uint32_t                                     input_count;
 	uint32_t                                     color_count;
-	VkAttachmentReference *                      input_attachments;
-	VkAttachmentReference *                      color_attachments;
-	VkAttachmentReference *                      resolve_attachments;
-	VkAttachmentReference                        depth_stencil_attachment;
+	struct radv_subpass_attachment *             input_attachments;
+	struct radv_subpass_attachment *             color_attachments;
+	struct radv_subpass_attachment *             resolve_attachments;
+	struct radv_subpass_attachment               depth_stencil_attachment;
 
 	/** Subpass has at least one resolve attachment */
 	bool                                         has_resolve;
@@ -1690,7 +1756,7 @@
 struct radv_render_pass {
 	uint32_t                                     attachment_count;
 	uint32_t                                     subpass_count;
-	VkAttachmentReference *                      subpass_attachments;
+	struct radv_subpass_attachment *             subpass_attachments;
 	struct radv_render_pass_attachment *         attachments;
 	struct radv_subpass_barrier                  end_barrier;
 	struct radv_subpass                          subpasses[0];
@@ -1716,14 +1782,6 @@
 	uint32_t temp_syncobj;
 };
 
-VkResult radv_alloc_sem_info(struct radv_winsys_sem_info *sem_info,
-			     int num_wait_sems,
-			     const VkSemaphore *wait_sems,
-			     int num_signal_sems,
-			     const VkSemaphore *signal_sems,
-			     VkFence fence);
-void radv_free_sem_info(struct radv_winsys_sem_info *sem_info);
-
 void radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer,
 			     VkPipelineBindPoint bind_point,
 			     struct radv_descriptor_set *set,
@@ -1757,6 +1815,7 @@
 
 struct radv_fence {
 	struct radeon_winsys_fence *fence;
+	struct wsi_fence *fence_wsi;
 	bool submitted;
 	bool signalled;
 
@@ -1768,14 +1827,14 @@
 struct radv_shader_variant_info;
 struct radv_nir_compiler_options;
 
-void radv_compile_gs_copy_shader(LLVMTargetMachineRef tm,
+void radv_compile_gs_copy_shader(struct ac_llvm_compiler *ac_llvm,
 				 struct nir_shader *geom_shader,
 				 struct ac_shader_binary *binary,
 				 struct ac_shader_config *config,
 				 struct radv_shader_variant_info *shader_info,
 				 const struct radv_nir_compiler_options *option);
 
-void radv_compile_nir_shader(LLVMTargetMachineRef tm,
+void radv_compile_nir_shader(struct ac_llvm_compiler *ac_llvm,
 			     struct ac_shader_binary *binary,
 			     struct ac_shader_config *config,
 			     struct radv_shader_variant_info *shader_info,
diff --git a/src/amd/vulkan/radv_query.c b/src/amd/vulkan/radv_query.c
index dccdee3..427e677 100644
--- a/src/amd/vulkan/radv_query.c
+++ b/src/amd/vulkan/radv_query.c
@@ -188,10 +188,8 @@
 	load->num_components = 2;
 	nir_builder_instr_insert(&b, &load->instr);
 
-	const unsigned swizzle0[] = {0,0,0,0};
-	const unsigned swizzle1[] = {1,1,1,1};
-	nir_store_var(&b, start, nir_swizzle(&b, &load->dest.ssa, swizzle0, 1, false), 0x1);
-	nir_store_var(&b, end, nir_swizzle(&b, &load->dest.ssa, swizzle1, 1, false), 0x1);
+	nir_store_var(&b, start, nir_channel(&b, &load->dest.ssa, 0), 0x1);
+	nir_store_var(&b, end, nir_channel(&b, &load->dest.ssa, 1), 0x1);
 
 	nir_ssa_def *start_done = nir_ilt(&b, nir_load_var(&b, start), nir_imm_int64(&b, 0));
 	nir_ssa_def *end_done = nir_ilt(&b, nir_load_var(&b, end), nir_imm_int64(&b, 0));
@@ -650,12 +648,19 @@
 {
 	struct radv_device *device = cmd_buffer->device;
 	struct radv_meta_saved_state saved_state;
+	bool old_predicating;
 
 	radv_meta_save(&saved_state, cmd_buffer,
 		       RADV_META_SAVE_COMPUTE_PIPELINE |
 		       RADV_META_SAVE_CONSTANTS |
 		       RADV_META_SAVE_DESCRIPTORS);
 
+	/* VK_EXT_conditional_rendering says that copy commands should not be
+	 * affected by conditional rendering.
+	 */
+	old_predicating = cmd_buffer->state.predicating;
+	cmd_buffer->state.predicating = false;
+
 	struct radv_buffer dst_buffer = {
 		.bo = dst_bo,
 		.offset = dst_offset,
@@ -738,6 +743,9 @@
 	                                RADV_CMD_FLAG_INV_VMEM_L1 |
 	                                RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
 
+	/* Restore conditional rendering. */
+	cmd_buffer->state.predicating = old_predicating;
+
 	radv_meta_restore(&saved_state, cmd_buffer);
 }
 
@@ -753,7 +761,7 @@
 					       VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 
 	if (!pool)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 
 	switch(pCreateInfo->queryType) {
@@ -783,7 +791,7 @@
 
 	if (!pool->bo) {
 		vk_free2(&device->alloc, pAllocator, pool);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 
 	pool->ptr = device->ws->buffer_map(pool->bo);
@@ -791,7 +799,7 @@
 	if (!pool->ptr) {
 		device->ws->buffer_destroy(pool->bo);
 		vk_free2(&device->alloc, pAllocator, pool);
-		return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
 	}
 	memset(pool->ptr, 0, pool->size);
 
@@ -950,14 +958,14 @@
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	RADV_FROM_HANDLE(radv_buffer, dst_buffer, dstBuffer);
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	unsigned elem_size = (flags & VK_QUERY_RESULT_64_BIT) ? 8 : 4;
 	uint64_t va = radv_buffer_get_va(pool->bo);
 	uint64_t dest_va = radv_buffer_get_va(dst_buffer->bo);
 	dest_va += dst_buffer->offset + dstOffset;
 
-	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo, 8);
-	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo, 8);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, pool->bo);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cmd_buffer->cs, dst_buffer->bo);
 
 	switch (pool->type) {
 	case VK_QUERY_TYPE_OCCLUSION:
@@ -992,7 +1000,7 @@
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
 
 				/* This waits on the ME. All copies below are done on the ME */
-				si_emit_wait_fence(cs, false, avail_va, 1, 0xffffffff);
+				si_emit_wait_fence(cs, avail_va, 1, 0xffffffff);
 			}
 		}
 		radv_query_shader(cmd_buffer, cmd_buffer->device->meta_state.query.pipeline_statistics_query_pipeline,
@@ -1015,7 +1023,7 @@
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
 
 				/* This waits on the ME. All copies below are done on the ME */
-				si_emit_wait_fence(cs, false, avail_va, 1, 0xffffffff);
+				si_emit_wait_fence(cs, avail_va, 1, 0xffffffff);
 			}
 			if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
 				uint64_t avail_va = va + pool->availability_offset + 4 * query;
@@ -1082,7 +1090,7 @@
 			     VkQueryType query_type,
 			     VkQueryControlFlags flags)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	switch (query_type) {
 	case VK_QUERY_TYPE_OCCLUSION:
 		radeon_check_space(cmd_buffer->device->ws, cs, 7);
@@ -1118,6 +1126,12 @@
 	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
 		radeon_check_space(cmd_buffer->device->ws, cs, 4);
 
+		++cmd_buffer->state.active_pipeline_queries;
+		if (cmd_buffer->state.active_pipeline_queries == 1) {
+			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_STOP_PIPELINE_STATS;
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_START_PIPELINE_STATS;
+		}
+
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(V_028A90_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
@@ -1133,19 +1147,19 @@
 			   uint64_t va, uint64_t avail_va,
 			   VkQueryType query_type)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	switch (query_type) {
 	case VK_QUERY_TYPE_OCCLUSION:
 		radeon_check_space(cmd_buffer->device->ws, cs, 14);
 
 		cmd_buffer->state.active_occlusion_queries--;
 		if (cmd_buffer->state.active_occlusion_queries == 0) {
+			radv_set_db_count_control(cmd_buffer);
+
 			/* Reset the perfect occlusion queries hint now that no
 			 * queries are active.
 			 */
 			cmd_buffer->state.perfect_occlusion_queries_enabled = false;
-
-			radv_set_db_count_control(cmd_buffer);
 		}
 
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1157,6 +1171,11 @@
 	case VK_QUERY_TYPE_PIPELINE_STATISTICS:
 		radeon_check_space(cmd_buffer->device->ws, cs, 16);
 
+		cmd_buffer->state.active_pipeline_queries--;
+		if (cmd_buffer->state.active_pipeline_queries == 0) {
+			cmd_buffer->state.flush_bits &= ~RADV_CMD_FLAG_START_PIPELINE_STATS;
+			cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_STOP_PIPELINE_STATS;
+		}
 		va += pipelinestat_block_size;
 
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -1165,11 +1184,11 @@
 		radeon_emit(cs, va >> 32);
 
 		si_cs_emit_write_event_eop(cs,
-					   false,
 					   cmd_buffer->device->physical_device->rad_info.chip_class,
 					   radv_cmd_buffer_uses_mec(cmd_buffer),
 					   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-					   1, avail_va, 0, 1,
+					   EOP_DATA_SEL_VALUE_32BIT,
+					   avail_va, 0, 1,
 					   cmd_buffer->gfx9_eop_bug_va);
 		break;
 	default:
@@ -1185,10 +1204,10 @@
 {
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(pool->bo);
 
-	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo, 8);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
 
 	if (cmd_buffer->pending_reset_query) {
 		if (pool->size >= RADV_BUFFER_OPS_CS_THRESHOLD) {
@@ -1254,12 +1273,12 @@
 	RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 	RADV_FROM_HANDLE(radv_query_pool, pool, queryPool);
 	bool mec = radv_cmd_buffer_uses_mec(cmd_buffer);
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint64_t va = radv_buffer_get_va(pool->bo);
 	uint64_t avail_va = va + pool->availability_offset + 4 * query;
 	uint64_t query_va = va + pool->stride * query;
 
-	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo, 5);
+	radv_cs_add_buffer(cmd_buffer->device->ws, cs, pool->bo);
 
 	int num_queries = 1;
 	if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask)
@@ -1289,18 +1308,18 @@
 			break;
 		default:
 			si_cs_emit_write_event_eop(cs,
-						   false,
 						   cmd_buffer->device->physical_device->rad_info.chip_class,
 						   mec,
 						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-						   3, query_va, 0, 0,
+						   EOP_DATA_SEL_TIMESTAMP,
+						   query_va, 0, 0,
 						   cmd_buffer->gfx9_eop_bug_va);
 			si_cs_emit_write_event_eop(cs,
-						   false,
 						   cmd_buffer->device->physical_device->rad_info.chip_class,
 						   mec,
 						   V_028A90_BOTTOM_OF_PIPE_TS, 0,
-						   1, avail_va, 0, 1,
+						   EOP_DATA_SEL_VALUE_32BIT,
+						   avail_va, 0, 1,
 						   cmd_buffer->gfx9_eop_bug_va);
 			break;
 		}
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index 7f19934..5266617 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -57,6 +57,7 @@
 	RADEON_FLAG_IMPLICIT_SYNC = (1 << 5),
 	RADEON_FLAG_NO_INTERPROCESS_SHARING = (1 << 6),
 	RADEON_FLAG_READ_ONLY =     (1 << 7),
+	RADEON_FLAG_32BIT =         (1 << 8),
 };
 
 enum radeon_bo_usage { /* bitfield */
@@ -95,7 +96,7 @@
 	RADEON_CURRENT_MCLK,
 };
 
-struct radeon_winsys_cs {
+struct radeon_cmdbuf {
 	unsigned cdw;  /* Number of used dwords. */
 	unsigned max_dw; /* Maximum number of dwords. */
 	uint32_t *buf; /* The base pointer of the chunk. */
@@ -233,36 +234,35 @@
 	bool (*ctx_wait_idle)(struct radeon_winsys_ctx *ctx,
 	                      enum ring_type ring_type, int ring_index);
 
-	struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys *ws,
+	struct radeon_cmdbuf *(*cs_create)(struct radeon_winsys *ws,
 					      enum ring_type ring_type);
 
-	void (*cs_destroy)(struct radeon_winsys_cs *cs);
+	void (*cs_destroy)(struct radeon_cmdbuf *cs);
 
-	void (*cs_reset)(struct radeon_winsys_cs *cs);
+	void (*cs_reset)(struct radeon_cmdbuf *cs);
 
-	bool (*cs_finalize)(struct radeon_winsys_cs *cs);
+	bool (*cs_finalize)(struct radeon_cmdbuf *cs);
 
-	void (*cs_grow)(struct radeon_winsys_cs * cs, size_t min_size);
+	void (*cs_grow)(struct radeon_cmdbuf * cs, size_t min_size);
 
 	int (*cs_submit)(struct radeon_winsys_ctx *ctx,
 			 int queue_index,
-			 struct radeon_winsys_cs **cs_array,
+			 struct radeon_cmdbuf **cs_array,
 			 unsigned cs_count,
-			 struct radeon_winsys_cs *initial_preamble_cs,
-			 struct radeon_winsys_cs *continue_preamble_cs,
+			 struct radeon_cmdbuf *initial_preamble_cs,
+			 struct radeon_cmdbuf *continue_preamble_cs,
 			 struct radv_winsys_sem_info *sem_info,
 			 const struct radv_winsys_bo_list *bo_list, /* optional */
 			 bool can_patch,
 			 struct radeon_winsys_fence *fence);
 
-	void (*cs_add_buffer)(struct radeon_winsys_cs *cs,
-			      struct radeon_winsys_bo *bo,
-			      uint8_t priority);
+	void (*cs_add_buffer)(struct radeon_cmdbuf *cs,
+			      struct radeon_winsys_bo *bo);
 
-	void (*cs_execute_secondary)(struct radeon_winsys_cs *parent,
-				    struct radeon_winsys_cs *child);
+	void (*cs_execute_secondary)(struct radeon_cmdbuf *parent,
+				    struct radeon_cmdbuf *child);
 
-	void (*cs_dump)(struct radeon_winsys_cs *cs, FILE* file, const int *trace_ids, int trace_id_count);
+	void (*cs_dump)(struct radeon_cmdbuf *cs, FILE* file, const int *trace_ids, int trace_id_count);
 
 	int (*surface_init)(struct radeon_winsys *ws,
 			    const struct ac_surf_info *surf_info,
@@ -306,12 +306,12 @@
 
 };
 
-static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
+static inline void radeon_emit(struct radeon_cmdbuf *cs, uint32_t value)
 {
 	cs->buf[cs->cdw++] = value;
 }
 
-static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
+static inline void radeon_emit_array(struct radeon_cmdbuf *cs,
 				     const uint32_t *values, unsigned count)
 {
 	memcpy(cs->buf + cs->cdw, values, count * 4);
@@ -324,14 +324,13 @@
 }
 
 static inline void radv_cs_add_buffer(struct radeon_winsys *ws,
-				      struct radeon_winsys_cs *cs,
-				      struct radeon_winsys_bo *bo,
-				      uint8_t priority)
+				      struct radeon_cmdbuf *cs,
+				      struct radeon_winsys_bo *bo)
 {
 	if (bo->is_local)
 		return;
 
-	ws->cs_add_buffer(cs, bo, priority);
+	ws->cs_add_buffer(cs, bo);
 }
 
 #endif /* RADV_RADEON_WINSYS_H */
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index ac29335..4093d36 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -30,6 +30,7 @@
 #include "radv_debug.h"
 #include "radv_private.h"
 #include "radv_shader.h"
+#include "radv_shader_helper.h"
 #include "nir/nir.h"
 #include "nir/nir_builder.h"
 #include "spirv/nir_spirv.h"
@@ -90,7 +91,7 @@
 			     sizeof(*module) + pCreateInfo->codeSize, 8,
 			     VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
 	if (module == NULL)
-		return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+		return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
 
 	module->nir = NULL;
 	module->size = pCreateInfo->codeSize;
@@ -118,7 +119,7 @@
 }
 
 void
-radv_optimize_nir(struct nir_shader *shader)
+radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively)
 {
         bool progress;
 
@@ -126,7 +127,7 @@
                 progress = false;
 
                 NIR_PASS_V(shader, nir_lower_vars_to_ssa);
-		NIR_PASS_V(shader, nir_lower_64bit_pack);
+		NIR_PASS_V(shader, nir_lower_pack);
                 NIR_PASS_V(shader, nir_lower_alu_to_scalar);
                 NIR_PASS_V(shader, nir_lower_phis_to_scalar);
 
@@ -150,7 +151,7 @@
                 if (shader->options->max_unroll_iterations) {
                         NIR_PASS(progress, shader, nir_opt_loop_unroll, 0);
                 }
-        } while (progress);
+        } while (progress && !optimize_conservatively);
 
         NIR_PASS(progress, shader, nir_opt_shrink_load);
         NIR_PASS(progress, shader, nir_opt_move_load_ubo);
@@ -161,12 +162,9 @@
 			   struct radv_shader_module *module,
 			   const char *entrypoint_name,
 			   gl_shader_stage stage,
-			   const VkSpecializationInfo *spec_info)
+			   const VkSpecializationInfo *spec_info,
+			   const VkPipelineCreateFlags flags)
 {
-	if (strcmp(entrypoint_name, "main") != 0) {
-		radv_finishme("Multiple shaders per module not really supported");
-	}
-
 	nir_shader *nir;
 	nir_function *entry_point;
 	if (module->nir) {
@@ -225,6 +223,8 @@
 				.shader_viewport_index_layer = true,
 				.descriptor_array_dynamic_indexing = true,
 				.runtime_descriptor_array = true,
+				.stencil_export = true,
+				.storage_16bit = true,
 			},
 		};
 		entry_point = spirv_to_nir(spirv, module->size / 4,
@@ -244,6 +244,7 @@
 		NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local);
 		NIR_PASS_V(nir, nir_lower_returns);
 		NIR_PASS_V(nir, nir_inline_functions);
+		NIR_PASS_V(nir, nir_copy_prop);
 
 		/* Pick off the single entrypoint that we want */
 		foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
@@ -258,13 +259,20 @@
 		 */
 		NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_shader_out);
 
-		NIR_PASS_V(nir, nir_remove_dead_variables,
-		           nir_var_shader_in | nir_var_shader_out | nir_var_system_value);
-
 		/* Now that we've deleted all but the main function, we can go ahead and
 		 * lower the rest of the constant initializers.
 		 */
 		NIR_PASS_V(nir, nir_lower_constant_initializers, ~0);
+
+		/* Split member structs.  We do this before lower_io_to_temporaries so that
+		 * it doesn't lower system values to temporaries by accident.
+		 */
+		NIR_PASS_V(nir, nir_split_var_copies);
+		NIR_PASS_V(nir, nir_split_per_member_structs);
+
+		NIR_PASS_V(nir, nir_remove_dead_variables,
+		           nir_var_shader_in | nir_var_shader_out | nir_var_system_value);
+
 		NIR_PASS_V(nir, nir_lower_system_values);
 		NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
 	}
@@ -281,7 +289,20 @@
 	nir_lower_tex(nir, &tex_options);
 
 	nir_lower_vars_to_ssa(nir);
+
+	if (nir->info.stage == MESA_SHADER_VERTEX ||
+	    nir->info.stage == MESA_SHADER_GEOMETRY) {
+		NIR_PASS_V(nir, nir_lower_io_to_temporaries,
+			   nir_shader_get_entrypoint(nir), true, true);
+	} else if (nir->info.stage == MESA_SHADER_TESS_EVAL||
+		   nir->info.stage == MESA_SHADER_FRAGMENT) {
+		NIR_PASS_V(nir, nir_lower_io_to_temporaries,
+			   nir_shader_get_entrypoint(nir), true, false);
+	}
+
+	nir_split_var_copies(nir);
 	nir_lower_var_copies(nir);
+
 	nir_lower_global_vars_to_local(nir);
 	nir_remove_dead_variables(nir, nir_var_local);
 	nir_lower_subgroups(nir, &(struct nir_lower_subgroups_options) {
@@ -294,7 +315,8 @@
 			.lower_vote_eq_to_ballot = 1,
 		});
 
-	radv_optimize_nir(nir);
+	if (!(flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT))
+		radv_optimize_nir(nir, false);
 
 	/* Indirect lowering must be called after the radv_optimize_nir() loop
 	 * has been called at least once. Otherwise indirect lowering can
@@ -302,7 +324,7 @@
 	 * considered too large for unrolling.
 	 */
 	ac_lower_indirect_derefs(nir, device->physical_device->rad_info.chip_class);
-	radv_optimize_nir(nir);
+	radv_optimize_nir(nir, flags & VK_PIPELINE_CREATE_DISABLE_OPTIMIZATION_BIT);
 
 	return nir;
 }
@@ -340,8 +362,8 @@
 	slab->bo = device->ws->buffer_create(device->ws, slab->size, 256,
 	                                     RADEON_DOMAIN_VRAM,
 					     RADEON_FLAG_NO_INTERPROCESS_SHARING |
-					     device->physical_device->cpdma_prefetch_writes_memory ?
-					             0 : RADEON_FLAG_READ_ONLY);
+					     (device->physical_device->cpdma_prefetch_writes_memory ?
+					             0 : RADEON_FLAG_READ_ONLY));
 	slab->ptr = (char*)device->ws->buffer_map(slab->bo);
 	list_inithead(&slab->shaders);
 
@@ -365,6 +387,16 @@
 	mtx_destroy(&device->shader_slab_mutex);
 }
 
+/* For the UMR disassembler. */
+#define DEBUGGER_END_OF_CODE_MARKER    0xbf9f0000 /* invalid instruction */
+#define DEBUGGER_NUM_MARKERS           5
+
+static unsigned
+radv_get_shader_binary_size(struct ac_shader_binary *binary)
+{
+	return binary->code_size + DEBUGGER_NUM_MARKERS * 4; /* code bytes plus 4 bytes per marker (presumably one DEBUGGER_END_OF_CODE_MARKER dword each -- confirm at the copy site) */
+}
+
 static void
 radv_fill_shader_variant(struct radv_device *device,
 			 struct radv_shader_variant *variant,
@@ -372,17 +404,15 @@
 			 gl_shader_stage stage)
 {
 	bool scratch_enabled = variant->config.scratch_bytes_per_wave > 0;
+	struct radv_shader_info *info = &variant->info.info;
 	unsigned vgpr_comp_cnt = 0;
 
-	if (scratch_enabled && !device->llvm_supports_spill)
-		radv_finishme("shader scratch support only available with LLVM 4.0");
-
-	variant->code_size = binary->code_size;
+	variant->code_size = radv_get_shader_binary_size(binary);
 	variant->rsrc2 = S_00B12C_USER_SGPR(variant->info.num_user_sgprs) |
 			 S_00B12C_USER_SGPR_MSB(variant->info.num_user_sgprs >> 5) |
 			 S_00B12C_SCRATCH_EN(scratch_enabled);
 
-	variant->rsrc1 =  S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) |
+	variant->rsrc1 = S_00B848_VGPRS((variant->config.num_vgprs - 1) / 4) |
 		S_00B848_SGPRS((variant->config.num_sgprs - 1) / 8) |
 		S_00B848_DX10_CLAMP(1) |
 		S_00B848_FLOAT_MODE(variant->config.float_mode);
@@ -393,10 +423,11 @@
 		variant->rsrc2 |= S_00B12C_OC_LDS_EN(1);
 		break;
 	case MESA_SHADER_TESS_CTRL:
-		if (device->physical_device->rad_info.chip_class >= GFX9)
+		if (device->physical_device->rad_info.chip_class >= GFX9) {
 			vgpr_comp_cnt = variant->info.vs.vgpr_comp_cnt;
-		else
+		} else {
 			variant->rsrc2 |= S_00B12C_OC_LDS_EN(1);
+		}
 		break;
 	case MESA_SHADER_VERTEX:
 	case MESA_SHADER_GEOMETRY:
@@ -404,8 +435,7 @@
 		break;
 	case MESA_SHADER_FRAGMENT:
 		break;
-	case MESA_SHADER_COMPUTE: {
-		struct radv_shader_info *info = &variant->info.info;
+	case MESA_SHADER_COMPUTE:
 		variant->rsrc2 |=
 			S_00B84C_TGID_X_EN(info->cs.uses_block_id[0]) |
 			S_00B84C_TGID_Y_EN(info->cs.uses_block_id[1]) |
@@ -415,7 +445,6 @@
 			S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
 			S_00B84C_LDS_SIZE(variant->config.lds_size);
 		break;
-	}
 	default:
 		unreachable("unsupported shader type");
 		break;
@@ -423,7 +452,6 @@
 
 	if (device->physical_device->rad_info.chip_class >= GFX9 &&
 	    stage == MESA_SHADER_GEOMETRY) {
-		struct radv_shader_info *info = &variant->info.info;
 		unsigned es_type = variant->info.gs.es_type;
 		unsigned gs_vgpr_comp_cnt, es_vgpr_comp_cnt;
 
@@ -438,26 +466,34 @@
 		/* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and
 		 * VGPR[0:4] are always loaded.
 		 */
-		if (info->uses_invocation_id)
+		if (info->uses_invocation_id) {
 			gs_vgpr_comp_cnt = 3; /* VGPR3 contains InvocationID. */
-		else if (info->uses_prim_id)
+		} else if (info->uses_prim_id) {
 			gs_vgpr_comp_cnt = 2; /* VGPR2 contains PrimitiveID. */
-		else if (variant->info.gs.vertices_in >= 3)
+		} else if (variant->info.gs.vertices_in >= 3) {
 			gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */
-		else
+		} else {
 			gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */
+		}
 
 		variant->rsrc1 |= S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt);
 		variant->rsrc2 |= S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) |
 		                  S_00B22C_OC_LDS_EN(es_type == MESA_SHADER_TESS_EVAL);
 	} else if (device->physical_device->rad_info.chip_class >= GFX9 &&
-	    stage == MESA_SHADER_TESS_CTRL)
+		   stage == MESA_SHADER_TESS_CTRL) {
 		variant->rsrc1 |= S_00B428_LS_VGPR_COMP_CNT(vgpr_comp_cnt);
-	else
+	} else {
 		variant->rsrc1 |= S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt);
+	}
 
 	void *ptr = radv_alloc_shader_memory(device, variant);
 	memcpy(ptr, binary->code, binary->code_size);
+
+	/* Add end-of-code markers for the UMR disassembler. */
+       uint32_t *ptr32 = (uint32_t *)ptr + binary->code_size / 4;
+       for (unsigned i = 0; i < DEBUGGER_NUM_MARKERS; i++)
+		ptr32[i] = DEBUGGER_END_OF_CODE_MARKER;
+
 }
 
 static void radv_init_llvm_target()
@@ -488,52 +524,9 @@
 
 static once_flag radv_init_llvm_target_once_flag = ONCE_FLAG_INIT;
 
-static LLVMTargetRef radv_get_llvm_target(const char *triple)
+static void radv_init_llvm_once(void)
 {
-	LLVMTargetRef target = NULL;
-	char *err_message = NULL;
-
 	call_once(&radv_init_llvm_target_once_flag, radv_init_llvm_target);
-
-	if (LLVMGetTargetFromTriple(triple, &target, &err_message)) {
-		fprintf(stderr, "Cannot find target for triple %s ", triple);
-		if (err_message) {
-			fprintf(stderr, "%s\n", err_message);
-		}
-		LLVMDisposeMessage(err_message);
-		return NULL;
-	}
-	return target;
-}
-
-static LLVMTargetMachineRef radv_create_target_machine(enum radeon_family family,
-						       enum ac_target_machine_options tm_options,
-						       const char **out_triple)
-{
-	assert(family >= CHIP_TAHITI);
-	char features[256];
-	const char *triple = (tm_options & AC_TM_SUPPORTS_SPILL) ? "amdgcn-mesa-mesa3d" : "amdgcn--";
-	LLVMTargetRef target = radv_get_llvm_target(triple);
-
-	snprintf(features, sizeof(features),
-		 "+DumpCode,+vgpr-spilling,-fp32-denormals,+fp64-denormals%s%s%s%s",
-		 tm_options & AC_TM_SISCHED ? ",+si-scheduler" : "",
-		 tm_options & AC_TM_FORCE_ENABLE_XNACK ? ",+xnack" : "",
-		 tm_options & AC_TM_FORCE_DISABLE_XNACK ? ",-xnack" : "",
-		 tm_options & AC_TM_PROMOTE_ALLOCA_TO_SCRATCH ? ",-promote-alloca" : "");
-
-	LLVMTargetMachineRef tm = LLVMCreateTargetMachine(
-	                             target,
-	                             triple,
-	                             ac_get_llvm_processor_name(family),
-				     features,
-	                             LLVMCodeGenLevelDefault,
-	                             LLVMRelocDefault,
-	                             LLVMCodeModelDefault);
-
-	if (out_triple)
-		*out_triple = triple;
-	return tm;
 }
 
 static struct radv_shader_variant *
@@ -551,38 +544,46 @@
 	enum ac_target_machine_options tm_options = 0;
 	struct radv_shader_variant *variant;
 	struct ac_shader_binary binary;
-	LLVMTargetMachineRef tm;
-
+	struct ac_llvm_compiler ac_llvm;
+	bool thread_compiler;
 	variant = calloc(1, sizeof(struct radv_shader_variant));
 	if (!variant)
 		return NULL;
 
 	options->family = chip_family;
 	options->chip_class = device->physical_device->rad_info.chip_class;
-	options->dump_shader = radv_can_dump_shader(device, module);
+	options->dump_shader = radv_can_dump_shader(device, module, gs_copy_shader);
 	options->dump_preoptir = options->dump_shader &&
 				 device->instance->debug_flags & RADV_DEBUG_PREOPTIR;
 	options->record_llvm_ir = device->keep_shader_info;
+	options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
 	options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
+	options->address32_hi = device->physical_device->rad_info.address32_hi;
 
 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
 	if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED)
 		tm_options |= AC_TM_SISCHED;
-	tm = radv_create_target_machine(chip_family, tm_options, NULL);
+	if (options->check_ir)
+		tm_options |= AC_TM_CHECK_IR;
 
+	thread_compiler = !(device->instance->debug_flags & RADV_DEBUG_NOTHREADLLVM);
+	radv_init_llvm_once();
+	radv_init_llvm_compiler(&ac_llvm, false,
+				thread_compiler,
+				chip_family, tm_options);
 	if (gs_copy_shader) {
 		assert(shader_count == 1);
-		radv_compile_gs_copy_shader(tm, *shaders, &binary,
+		radv_compile_gs_copy_shader(&ac_llvm, *shaders, &binary,
 					    &variant->config, &variant->info,
 					    options);
 	} else {
-		radv_compile_nir_shader(tm, &binary, &variant->config,
+		radv_compile_nir_shader(&ac_llvm, &binary, &variant->config,
 					&variant->info, shaders, shader_count,
 					options);
 	}
 
-	LLVMDisposeTargetMachine(tm);
+	radv_destroy_llvm_compiler(&ac_llvm, thread_compiler);
 
 	radv_fill_shader_variant(device, variant, &binary, stage);
 
@@ -629,7 +630,7 @@
 		options.key = *key;
 
 	options.unsafe_math = !!(device->instance->debug_flags & RADV_DEBUG_UNSAFE_MATH);
-	options.supports_spill = device->llvm_supports_spill;
+	options.supports_spill = true;
 
 	return shader_variant_create(device, module, shaders, shader_count, shaders[shader_count - 1]->info.stage,
 				     &options, false, code_out, code_size_out);
@@ -693,17 +694,7 @@
 	unsigned max_simd_waves;
 	unsigned lds_per_wave = 0;
 
-	switch (device->physical_device->rad_info.family) {
-	/* These always have 8 waves: */
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-		max_simd_waves = 8;
-		break;
-	default:
-		max_simd_waves = 10;
-	}
+	max_simd_waves = ac_get_max_simd_waves(device->physical_device->rad_info.family);
 
 	conf = &variant->config;
 
@@ -788,7 +779,7 @@
 	/* Spec doesn't indicate what to do if the stage is invalid, so just
 	 * return no info for this. */
 	if (!variant)
-		return vk_error(VK_ERROR_FEATURE_NOT_PRESENT);
+		return vk_error(device->instance, VK_ERROR_FEATURE_NOT_PRESENT);
 
 	switch (infoType) {
 	case VK_SHADER_INFO_TYPE_STATISTICS_AMD:
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index ae76d0c..c490b69 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -95,10 +95,9 @@
 struct radv_fs_variant_key {
 	uint32_t col_format;
 	uint8_t log2_ps_iter_samples;
-	uint8_t log2_num_samples;
+	uint8_t num_samples;
 	uint32_t is_int8;
 	uint32_t is_int10;
-	uint32_t multisample : 1;
 };
 
 struct radv_shader_variant_key {
@@ -120,9 +119,11 @@
 	bool dump_shader;
 	bool dump_preoptir;
 	bool record_llvm_ir;
+	bool check_ir;
 	enum radeon_family family;
 	enum chip_class chip_class;
 	uint32_t tess_offchip_block_dw_size;
+	uint32_t address32_hi;
 };
 
 enum radv_ud_index {
@@ -134,7 +135,6 @@
 	AC_UD_VS_VERTEX_BUFFERS = AC_UD_SHADER_START,
 	AC_UD_VS_BASE_VERTEX_START_INSTANCE,
 	AC_UD_VS_MAX_UD,
-	AC_UD_PS_SAMPLE_POS_OFFSET = AC_UD_SHADER_START,
 	AC_UD_PS_MAX_UD,
 	AC_UD_CS_GRID_SIZE = AC_UD_SHADER_START,
 	AC_UD_CS_MAX_UD,
@@ -159,6 +159,9 @@
 	} vs;
 	struct {
 		uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
+	} gs;
+	struct {
+		uint8_t output_usage_mask[VARYING_SLOT_VAR31 + 1];
 	} tes;
 	struct {
 		bool force_persample;
@@ -189,12 +192,12 @@
 	int8_t sgpr_idx;
 	uint8_t num_sgprs;
 	bool indirect;
-	uint32_t indirect_offset;
 };
 
 struct radv_userdata_locations {
 	struct radv_userdata_info descriptor_sets[RADV_UD_MAX_SETS];
 	struct radv_userdata_info shader_data[AC_UD_MAX_UD];
+	uint32_t descriptor_sets_enabled;
 };
 
 struct radv_vs_output_info {
@@ -295,14 +298,15 @@
 };
 
 void
-radv_optimize_nir(struct nir_shader *shader);
+radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively);
 
 nir_shader *
 radv_shader_compile_to_nir(struct radv_device *device,
 			   struct radv_shader_module *module,
 			   const char *entrypoint_name,
 			   gl_shader_stage stage,
-			   const VkSpecializationInfo *spec_info);
+			   const VkSpecializationInfo *spec_info,
+			   const VkPipelineCreateFlags flags);
 
 void *
 radv_alloc_shader_memory(struct radv_device *device,
@@ -330,10 +334,6 @@
 radv_shader_variant_destroy(struct radv_device *device,
 			    struct radv_shader_variant *variant);
 
-bool
-radv_lower_indirect_derefs(struct nir_shader *nir,
-                           struct radv_physical_device *device);
-
 const char *
 radv_get_shader_name(struct radv_shader_variant *var, gl_shader_stage stage);
 
@@ -345,11 +345,14 @@
 
 static inline bool
 radv_can_dump_shader(struct radv_device *device,
-		     struct radv_shader_module *module)
+		     struct radv_shader_module *module,
+		     bool is_gs_copy_shader)
 {
+	if (!(device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS))
+		return false;
+
 	/* Only dump non-meta shaders, useful for debugging purposes. */
-	return device->instance->debug_flags & RADV_DEBUG_DUMP_SHADERS &&
-	       module && !module->nir;
+	return (module && !module->nir) || is_gs_copy_shader;
 }
 
 static inline bool
diff --git a/src/intel/tools/gen_disasm.h b/src/amd/vulkan/radv_shader_helper.h
similarity index 68%
copy from src/intel/tools/gen_disasm.h
copy to src/amd/vulkan/radv_shader_helper.h
index c8c18b2..3c81f5b 100644
--- a/src/intel/tools/gen_disasm.h
+++ b/src/amd/vulkan/radv_shader_helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Intel Corporation
+ * Copyright © 2018 Red Hat.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -20,26 +20,25 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */
-
-#ifndef GEN_DISASM_H
-#define GEN_DISASM_H
-
-#include "intel/dev/gen_device_info.h"
-
+#ifndef RADV_SHADER_HELPER_H
+#define RADV_SHADER_HELPER_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct gen_disasm;
+bool radv_init_llvm_compiler(struct ac_llvm_compiler *info,
+			     bool okay_to_leak_target_library_info,
+			     bool thread_compiler,
+			     enum radeon_family family,
+			     enum ac_target_machine_options tm_options);
+void radv_destroy_llvm_compiler(struct ac_llvm_compiler *info,
+				bool thread_compiler);
 
-struct gen_disasm *gen_disasm_create(const struct gen_device_info *devinfo);
-void gen_disasm_disassemble(struct gen_disasm *disasm,
-                            void *assembly, int start, FILE *out);
-
-void gen_disasm_destroy(struct gen_disasm *disasm);
+bool radv_compile_to_binary(struct ac_llvm_compiler *info,
+			    LLVMModuleRef module,
+			    struct ac_shader_binary *binary);
 
 #ifdef __cplusplus
 }
 #endif
-
-#endif /* GEN_DISASM_H */
+#endif
diff --git a/src/amd/vulkan/radv_shader_info.c b/src/amd/vulkan/radv_shader_info.c
index 489b9d8..a45c847 100644
--- a/src/amd/vulkan/radv_shader_info.c
+++ b/src/amd/vulkan/radv_shader_info.c
@@ -23,6 +23,7 @@
 #include "radv_private.h"
 #include "radv_shader.h"
 #include "nir/nir.h"
+#include "nir/nir_deref.h"
 
 static void mark_sampler_desc(const nir_variable *var,
 			      struct radv_shader_info *info)
@@ -47,44 +48,128 @@
 		info->tcs.outputs_written |= (mask << param);
 }
 
-static void get_deref_offset(nir_deref_var *deref, unsigned *const_out)
+static void
+get_deref_offset(nir_deref_instr *instr,
+                 unsigned *const_out)
 {
-	nir_deref *tail = &deref->deref;
-	unsigned const_offset = 0;
+        nir_variable *var = nir_deref_instr_get_variable(instr);
+        nir_deref_path path;
+        unsigned idx_lvl = 1;
 
-	if (deref->var->data.compact) {
-		assert(tail->child->deref_type == nir_deref_type_array);
-		assert(glsl_type_is_scalar(glsl_without_array(deref->var->type)));
-
-		nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-		/* We always lower indirect dereferences for "compact" array vars. */
-		assert(deref_array->deref_array_type == nir_deref_array_type_direct);
-
-		*const_out = deref_array->base_offset;
+	if (var->data.compact) {
+		assert(instr->deref_type == nir_deref_type_array);
+		nir_const_value *v = nir_src_as_const_value(instr->arr.index);
+		assert(v);
+		*const_out = v->u32[0];
 		return;
 	}
 
-	while (tail->child != NULL) {
-		const struct glsl_type *parent_type = tail->type;
-		tail = tail->child;
+	nir_deref_path_init(&path, instr, NULL);
 
-		if (tail->deref_type == nir_deref_type_array) {
-			nir_deref_array *deref_array = nir_deref_as_array(tail);
-			unsigned size = glsl_count_attribute_slots(tail->type, false);
+	uint32_t const_offset = 0;
 
-			const_offset += size * deref_array->base_offset;
-		} else if (tail->deref_type == nir_deref_type_struct) {
-			nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
+	for (; path.path[idx_lvl]; ++idx_lvl) {
+		const struct glsl_type *parent_type = path.path[idx_lvl - 1]->type;
+		if (path.path[idx_lvl]->deref_type == nir_deref_type_struct) {
+			unsigned index = path.path[idx_lvl]->strct.index;
 
-			for (unsigned i = 0; i < deref_struct->index; i++) {
+			for (unsigned i = 0; i < index; i++) {
 				const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
 				const_offset += glsl_count_attribute_slots(ft, false);
 			}
+		} else if(path.path[idx_lvl]->deref_type == nir_deref_type_array) {
+			unsigned size = glsl_count_attribute_slots(path.path[idx_lvl]->type, false);
+			nir_const_value *v = nir_src_as_const_value(path.path[idx_lvl]->arr.index);
+			if (v)
+				const_offset += v->u32[0] * size;
 		} else
-			unreachable("unsupported deref type");
+			unreachable("Uhandled deref type in get_deref_instr_offset");
 	}
 
 	*const_out = const_offset;
+
+	nir_deref_path_finish(&path);
+}
+
+static void
+gather_intrinsic_load_deref_info(const nir_shader *nir,
+			       const nir_intrinsic_instr *instr,
+			       struct radv_shader_info *info)
+{
+	switch (nir->info.stage) {
+	case MESA_SHADER_VERTEX: {
+		nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+
+		if (var->data.mode == nir_var_shader_in) {
+			unsigned idx = var->data.location;
+			uint8_t mask = nir_ssa_def_components_read(&instr->dest.ssa);
+
+			info->vs.input_usage_mask[idx] |=
+				mask << var->data.location_frac;
+		}
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+static void
+gather_intrinsic_store_deref_info(const nir_shader *nir,
+				const nir_intrinsic_instr *instr,
+				struct radv_shader_info *info)
+{
+	nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+
+	if (var->data.mode == nir_var_shader_out) {
+		unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
+		unsigned idx = var->data.location;
+		unsigned comp = var->data.location_frac;
+		unsigned const_offset = 0;
+
+		get_deref_offset(nir_instr_as_deref(instr->src[0].ssa->parent_instr), &const_offset);
+
+		switch (nir->info.stage) {
+		case MESA_SHADER_VERTEX:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->vs.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_GEOMETRY:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->gs.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_TESS_EVAL:
+			for (unsigned i = 0; i < attrib_count; i++) {
+				info->tes.output_usage_mask[idx + i + const_offset] |=
+					instr->const_index[0] << comp;
+			}
+			break;
+		case MESA_SHADER_TESS_CTRL: {
+			unsigned param = shader_io_get_unique_index(idx);
+			const struct glsl_type *type = var->type;
+
+			if (!var->data.patch)
+				type = glsl_get_array_element(var->type);
+
+			unsigned slots =
+				var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
+						  : glsl_count_attribute_slots(type, false);
+
+			if (idx == VARYING_SLOT_CLIP_DIST0)
+				slots = (nir->info.clip_distance_array_size +
+					 nir->info.cull_distance_array_size > 4) ? 2 : 1;
+
+			mark_tess_output(info, var->data.patch, param, slots);
+			break;
+		}
+		default:
+			break;
+		}
+	}
 }
 
 static void
@@ -92,7 +177,7 @@
 		      struct radv_shader_info *info)
 {
 	switch (instr->intrinsic) {
-	case nir_intrinsic_interp_var_at_sample:
+	case nir_intrinsic_interp_deref_at_sample:
 		info->ps.needs_sample_positions = true;
 		break;
 	case nir_intrinsic_load_draw_id:
@@ -145,20 +230,19 @@
 	case nir_intrinsic_vulkan_resource_index:
 		info->desc_set_used_mask |= (1 << nir_intrinsic_desc_set(instr));
 		break;
-	case nir_intrinsic_image_var_load:
-	case nir_intrinsic_image_var_store:
-	case nir_intrinsic_image_var_atomic_add:
-	case nir_intrinsic_image_var_atomic_min:
-	case nir_intrinsic_image_var_atomic_max:
-	case nir_intrinsic_image_var_atomic_and:
-	case nir_intrinsic_image_var_atomic_or:
-	case nir_intrinsic_image_var_atomic_xor:
-	case nir_intrinsic_image_var_atomic_exchange:
-	case nir_intrinsic_image_var_atomic_comp_swap:
-	case nir_intrinsic_image_var_size: {
-		const struct glsl_type *type = instr->variables[0]->var->type;
-		if(instr->variables[0]->deref.child)
-			type = instr->variables[0]->deref.child->type;
+	case nir_intrinsic_image_deref_load:
+	case nir_intrinsic_image_deref_store:
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
+	case nir_intrinsic_image_deref_size: {
+		nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
+		const struct glsl_type *type = glsl_without_array(var->type);
 
 		enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
 		if (dim == GLSL_SAMPLER_DIM_SUBPASS ||
@@ -166,17 +250,17 @@
 			info->ps.layer_input = true;
 			info->ps.uses_input_attachments = true;
 		}
-		mark_sampler_desc(instr->variables[0]->var, info);
+		mark_sampler_desc(var, info);
 
-		if (nir_intrinsic_image_var_store ||
-		    nir_intrinsic_image_var_atomic_add ||
-		    nir_intrinsic_image_var_atomic_min ||
-		    nir_intrinsic_image_var_atomic_max ||
-		    nir_intrinsic_image_var_atomic_and ||
-		    nir_intrinsic_image_var_atomic_or ||
-		    nir_intrinsic_image_var_atomic_xor ||
-		    nir_intrinsic_image_var_atomic_exchange ||
-		    nir_intrinsic_image_var_atomic_comp_swap) {
+		if (nir_intrinsic_image_deref_store ||
+		    nir_intrinsic_image_deref_atomic_add ||
+		    nir_intrinsic_image_deref_atomic_min ||
+		    nir_intrinsic_image_deref_atomic_max ||
+		    nir_intrinsic_image_deref_atomic_and ||
+		    nir_intrinsic_image_deref_atomic_or ||
+		    nir_intrinsic_image_deref_atomic_xor ||
+		    nir_intrinsic_image_deref_atomic_exchange ||
+		    nir_intrinsic_image_deref_atomic_comp_swap) {
 			if (nir->info.stage == MESA_SHADER_FRAGMENT)
 				info->ps.writes_memory = true;
 		}
@@ -196,56 +280,12 @@
 		if (nir->info.stage == MESA_SHADER_FRAGMENT)
 			info->ps.writes_memory = true;
 		break;
-	case nir_intrinsic_load_var:
-		if (nir->info.stage == MESA_SHADER_VERTEX) {
-			nir_deref_var *dvar = instr->variables[0];
-			nir_variable *var = dvar->var;
-
-			if (var->data.mode == nir_var_shader_in) {
-				unsigned idx = var->data.location;
-				uint8_t mask =
-					nir_ssa_def_components_read(&instr->dest.ssa) << var->data.location_frac;
-				info->vs.input_usage_mask[idx] |= mask;
-			}
-		}
+	case nir_intrinsic_load_deref:
+		gather_intrinsic_load_deref_info(nir, instr, info);
 		break;
-	case nir_intrinsic_store_var: {
-		nir_deref_var *dvar = instr->variables[0];
-		nir_variable *var = dvar->var;
-
-		if (var->data.mode == nir_var_shader_out) {
-			unsigned attrib_count = glsl_count_attribute_slots(var->type, false);
-			unsigned idx = var->data.location;
-			unsigned comp = var->data.location_frac;
-			unsigned const_offset = 0;
-
-			get_deref_offset(dvar, &const_offset);
-
-			if (nir->info.stage == MESA_SHADER_VERTEX) {
-				for (unsigned i = 0; i < attrib_count; i++) {
-					info->vs.output_usage_mask[idx + i + const_offset] |=
-						instr->const_index[0] << comp;
-				}
-			} else if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
-				for (unsigned i = 0; i < attrib_count; i++) {
-					info->tes.output_usage_mask[idx + i + const_offset] |=
-						instr->const_index[0] << comp;
-				}
-			} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-				unsigned param = shader_io_get_unique_index(idx);
-				const struct glsl_type *type = var->type;
-				if (!var->data.patch)
-					type = glsl_get_array_element(var->type);
-				unsigned slots =
-					var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
-					: glsl_count_attribute_slots(type, false);
-				if (idx == VARYING_SLOT_CLIP_DIST0)
-					slots = (nir->info.clip_distance_array_size + nir->info.cull_distance_array_size > 4) ? 2 : 1;
-				mark_tess_output(info, var->data.patch, param, slots);
-			}
-		}
+	case nir_intrinsic_store_deref:
+		gather_intrinsic_store_deref_info(nir, instr, info);
 		break;
-	}
 	default:
 		break;
 	}
@@ -255,10 +295,18 @@
 gather_tex_info(const nir_shader *nir, const nir_tex_instr *instr,
 		struct radv_shader_info *info)
 {
-	if (instr->sampler)
-		mark_sampler_desc(instr->sampler->var, info);
-	if (instr->texture)
-		mark_sampler_desc(instr->texture->var, info);
+	for (unsigned i = 0; i < instr->num_srcs; i++) {
+		switch (instr->src[i].src_type) {
+		case nir_tex_src_texture_deref:
+			mark_sampler_desc(nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src)), info);
+			break;
+		case nir_tex_src_sampler_deref:
+			mark_sampler_desc(nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src)), info);
+			break;
+		default:
+			break;
+		}
+	}
 }
 
 static void
@@ -395,7 +443,7 @@
 	struct nir_function *func =
 		(struct nir_function *)exec_list_get_head_const(&nir->functions);
 
-	if (options->layout->dynamic_offset_count)
+	if (options->layout && options->layout->dynamic_offset_count)
 		info->loads_push_constants = true;
 
 	nir_foreach_variable(variable, &nir->inputs)
diff --git a/src/amd/vulkan/radv_util.c b/src/amd/vulkan/radv_util.c
index b892eb7..72bedc6 100644
--- a/src/amd/vulkan/radv_util.c
+++ b/src/amd/vulkan/radv_util.c
@@ -29,6 +29,7 @@
 #include <assert.h>
 
 #include "radv_private.h"
+#include "radv_debug.h"
 #include "vk_enum_to_str.h"
 
 #include "util/u_math.h"
@@ -53,6 +54,26 @@
 	fprintf(stderr, "\n");
 }
 
+/** Log an informational message.  */
+void radv_printflike(1, 2)
+	radv_logi(const char *format, ...)
+{
+	va_list va;
+
+	va_start(va, format);
+	radv_logi_v(format, va);
+	va_end(va);
+}
+
+/** \see radv_logi() */
+void
+radv_logi_v(const char *format, va_list va)
+{
+	fprintf(stderr, "radv: info: ");
+	vfprintf(stderr, format, va);
+	fprintf(stderr, "\n");
+}
+
 void radv_printflike(3, 4)
 	__radv_finishme(const char *file, int line, const char *format, ...)
 {
@@ -67,13 +88,19 @@
 }
 
 VkResult
-__vk_errorf(VkResult error, const char *file, int line, const char *format, ...)
+__vk_errorf(struct radv_instance *instance, VkResult error, const char *file,
+	    int line, const char *format, ...)
 {
 	va_list ap;
 	char buffer[256];
 
 	const char *error_str = vk_Result_to_str(error);
 
+#ifndef DEBUG
+	if (instance && !(instance->debug_flags & RADV_DEBUG_ERRORS))
+		return error;
+#endif
+
 	if (format) {
 		va_start(ap, format);
 		vsnprintf(buffer, sizeof(buffer), format, ap);
diff --git a/src/amd/vulkan/radv_wsi.c b/src/amd/vulkan/radv_wsi.c
index 9276504..6479bea 100644
--- a/src/amd/vulkan/radv_wsi.c
+++ b/src/amd/vulkan/radv_wsi.c
@@ -41,7 +41,8 @@
 	return wsi_device_init(&physical_device->wsi_device,
 			       radv_physical_device_to_handle(physical_device),
 			       radv_wsi_proc_addr,
-			       &physical_device->instance->alloc);
+			       &physical_device->instance->alloc,
+			       physical_device->master_fd);
 }
 
 void
@@ -102,6 +103,18 @@
 						    pSurfaceCapabilities);
 }
 
+VkResult radv_GetPhysicalDeviceSurfaceCapabilities2EXT(
+ 	VkPhysicalDevice                            physicalDevice,
+	VkSurfaceKHR                                surface,
+	VkSurfaceCapabilities2EXT*                  pSurfaceCapabilities)
+{
+	RADV_FROM_HANDLE(radv_physical_device, device, physicalDevice);
+
+	return wsi_common_get_surface_capabilities2ext(&device->wsi_device,
+						       surface,
+						       pSurfaceCapabilities);
+}
+
 VkResult radv_GetPhysicalDeviceSurfaceFormatsKHR(
 	VkPhysicalDevice                            physicalDevice,
 	VkSurfaceKHR                                surface,
@@ -193,23 +206,38 @@
 }
 
 VkResult radv_AcquireNextImageKHR(
-	VkDevice                                     _device,
+	VkDevice                                     device,
 	VkSwapchainKHR                               swapchain,
 	uint64_t                                     timeout,
 	VkSemaphore                                  semaphore,
-	VkFence                                      _fence,
+	VkFence                                      fence,
+	uint32_t*                                    pImageIndex)
+{
+	VkAcquireNextImageInfoKHR acquire_info = {
+		.sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
+		.swapchain = swapchain,
+		.timeout = timeout,
+		.semaphore = semaphore,
+		.fence = fence,
+		.deviceMask = 0,
+	};
+
+	return radv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
+}
+
+VkResult radv_AcquireNextImage2KHR(
+	VkDevice                                     _device,
+	const VkAcquireNextImageInfoKHR*             pAcquireInfo,
 	uint32_t*                                    pImageIndex)
 {
 	RADV_FROM_HANDLE(radv_device, device, _device);
 	struct radv_physical_device *pdevice = device->physical_device;
-	RADV_FROM_HANDLE(radv_fence, fence, _fence);
+	RADV_FROM_HANDLE(radv_fence, fence, pAcquireInfo->fence);
 
-	VkResult result = wsi_common_acquire_next_image(&pdevice->wsi_device,
-							_device,
-							swapchain,
-							timeout,
-							semaphore,
-							pImageIndex);
+	VkResult result = wsi_common_acquire_next_image2(&pdevice->wsi_device,
+							 _device,
+                                                         pAcquireInfo,
+							 pImageIndex);
 
 	if (fence && (result == VK_SUCCESS || result == VK_SUBOPTIMAL_KHR)) {
 		fence->submitted = true;
diff --git a/src/amd/vulkan/radv_wsi_display.c b/src/amd/vulkan/radv_wsi_display.c
new file mode 100644
index 0000000..d8743a0
--- /dev/null
+++ b/src/amd/vulkan/radv_wsi_display.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright © 2017 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "radv_private.h"
+#include "radv_cs.h"
+#include "util/disk_cache.h"
+#include "util/strtod.h"
+#include "vk_util.h"
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+#include <amdgpu.h>
+#include <amdgpu_drm.h>
+#include "winsys/amdgpu/radv_amdgpu_winsys_public.h"
+#include "ac_llvm_util.h"
+#include "vk_format.h"
+#include "sid.h"
+#include "util/debug.h"
+#include "wsi_common_display.h"
+
+#define MM_PER_PIXEL     (1.0/96.0 * 25.4)
+
+VkResult
+radv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
+                                           uint32_t *property_count,
+                                           VkDisplayPropertiesKHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Pass-through to the common WSI display code; its result is returned unchanged. */
+	return wsi_display_get_physical_device_display_properties(
+		physical_device,
+		&pdevice->wsi_device,
+		property_count,
+		properties);
+}
+
+VkResult
+radv_GetPhysicalDeviceDisplayProperties2KHR(VkPhysicalDevice physical_device,
+                                            uint32_t *property_count,
+                                            VkDisplayProperties2KHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* "2" variant: same pass-through, but to the ..._properties2 WSI helper. */
+	return wsi_display_get_physical_device_display_properties2(
+		physical_device,
+		&pdevice->wsi_device,
+		property_count,
+		properties);
+}
+
+VkResult
+radv_GetPhysicalDeviceDisplayPlanePropertiesKHR(
+	VkPhysicalDevice physical_device,
+	uint32_t *property_count,
+	VkDisplayPlanePropertiesKHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Forward the handle, this device's wsi_device and both out-pointers to the WSI helper. */
+	return wsi_display_get_physical_device_display_plane_properties(
+		physical_device,
+		&pdevice->wsi_device,
+		property_count,
+		properties);
+}
+
+VkResult
+radv_GetPhysicalDeviceDisplayPlaneProperties2KHR(
+	VkPhysicalDevice physical_device,
+	uint32_t *property_count,
+	VkDisplayPlaneProperties2KHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* "2" variant: same pass-through, but to the ..._plane_properties2 WSI helper. */
+	return wsi_display_get_physical_device_display_plane_properties2(
+		physical_device,
+		&pdevice->wsi_device,
+		property_count,
+		properties);
+}
+
+VkResult
+radv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
+                                         uint32_t plane_index,
+                                         uint32_t *display_count,
+                                         VkDisplayKHR *displays)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* plane_index and both out-pointers go to the WSI helper untouched; its result is returned. */
+	return wsi_display_get_display_plane_supported_displays(
+		physical_device,
+		&pdevice->wsi_device,
+		plane_index,
+		display_count,
+		displays);
+}
+
+
+VkResult
+radv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
+                                 VkDisplayKHR display,
+                                 uint32_t *property_count,
+                                 VkDisplayModePropertiesKHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Pass-through to the common WSI display code; its result is returned unchanged. */
+	return wsi_display_get_display_mode_properties(physical_device,
+						       &pdevice->wsi_device,
+						       display,
+						       property_count,
+						       properties);
+}
+
+VkResult
+radv_GetDisplayModeProperties2KHR(VkPhysicalDevice physical_device,
+                                  VkDisplayKHR display,
+                                  uint32_t *property_count,
+                                  VkDisplayModeProperties2KHR *properties)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* "2" variant: same pass-through, but to the ..._mode_properties2 WSI helper. */
+	return wsi_display_get_display_mode_properties2(physical_device,
+						        &pdevice->wsi_device,
+						        display,
+						        property_count,
+						        properties);
+}
+
+VkResult
+radv_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
+                          VkDisplayKHR display,
+                          const VkDisplayModeCreateInfoKHR *create_info,
+                          const VkAllocationCallbacks *allocator,
+                          VkDisplayModeKHR *mode)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* The allocator is forwarded as given (possibly NULL); no fallback is chosen here. */
+	return wsi_display_create_display_mode(physical_device,
+					       &pdevice->wsi_device,
+					       display,
+					       create_info,
+					       allocator,
+					       mode);
+}
+
+VkResult
+radv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
+                                    VkDisplayModeKHR mode_khr,
+                                    uint32_t plane_index,
+                                    VkDisplayPlaneCapabilitiesKHR *capabilities)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Pass-through to the common WSI display code; its result is returned unchanged. */
+	return wsi_get_display_plane_capabilities(physical_device,
+						  &pdevice->wsi_device,
+						  mode_khr,
+						  plane_index,
+						  capabilities);
+}
+
+VkResult
+radv_GetDisplayPlaneCapabilities2KHR(VkPhysicalDevice physical_device,
+                                     const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo,
+                                     VkDisplayPlaneCapabilities2KHR *capabilities)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* "2" variant: pDisplayPlaneInfo is handed to the WSI helper as-is, not unpacked here. */
+	return wsi_get_display_plane_capabilities2(physical_device,
+						   &pdevice->wsi_device,
+						   pDisplayPlaneInfo,
+						   capabilities);
+}
+
+VkResult
+radv_CreateDisplayPlaneSurfaceKHR(
+	VkInstance _instance,
+	const VkDisplaySurfaceCreateInfoKHR *create_info,
+	const VkAllocationCallbacks *allocator,
+	VkSurfaceKHR *surface)
+{
+	RADV_FROM_HANDLE(radv_instance, instance, _instance);
+	const VkAllocationCallbacks *alloc;
+	/* Use the caller's allocator when one is given, otherwise the instance's own. */
+	if (allocator)
+		alloc = allocator;
+	else
+		alloc = &instance->alloc;
+	/* Unlike the other entry points in this file, a non-NULL allocator is always passed on. */
+	return wsi_create_display_surface(_instance, alloc,
+					  create_info, surface);
+}
+
+VkResult
+radv_ReleaseDisplayEXT(VkPhysicalDevice physical_device,
+		       VkDisplayKHR     display)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Pass-through to wsi_release_display(); its result is returned unchanged. */
+	return wsi_release_display(physical_device,
+				   &pdevice->wsi_device,
+				   display);
+}
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+VkResult
+radv_AcquireXlibDisplayEXT(VkPhysicalDevice     physical_device,
+			   Display              *dpy,
+			   VkDisplayKHR         display)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Only compiled with VK_USE_PLATFORM_XLIB_XRANDR_EXT; dpy and display are forwarded as-is. */
+	return wsi_acquire_xlib_display(physical_device,
+					&pdevice->wsi_device,
+					dpy,
+					display);
+}
+
+VkResult
+radv_GetRandROutputDisplayEXT(VkPhysicalDevice  physical_device,
+			      Display           *dpy,
+			      RROutput          output,
+			      VkDisplayKHR      *display)
+{
+	RADV_FROM_HANDLE(radv_physical_device, pdevice, physical_device);
+	/* Only compiled with VK_USE_PLATFORM_XLIB_XRANDR_EXT; *display is written by the WSI helper. */
+	return wsi_get_randr_output_display(physical_device,
+					    &pdevice->wsi_device,
+					    dpy,
+					    output,
+					    display);
+}
+#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
+
+/* VK_EXT_display_control */
+
+VkResult
+radv_DisplayPowerControlEXT(VkDevice                    _device,
+			    VkDisplayKHR                display,
+			    const VkDisplayPowerInfoEXT *display_power_info)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	/* Device-level entry point: the wsi_device is reached through device->physical_device. */
+	return wsi_display_power_control(_device,
+					 &device->physical_device->wsi_device,
+					 display,
+					 display_power_info);
+}
+
+/* Allocate the fence returned by the Register*EventEXT entry points: marked
+ * submitted and unsignalled, with no winsys fence or syncobj attached.
+ * fence_wsi is left for the caller to fill in.  Returns NULL on failure. */
+static struct radv_fence *
+radv_alloc_event_fence(struct radv_device *device,
+		       const VkAllocationCallbacks *allocator)
+{
+	struct radv_fence *fence =
+		vk_alloc2(&device->instance->alloc, allocator, sizeof (*fence),
+			  8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+	if (!fence)
+		return NULL;
+
+	fence->fence = NULL;
+	fence->submitted = true;
+	fence->signalled = false;
+	fence->syncobj = 0;
+	fence->temp_syncobj = 0;
+	return fence;
+}
+
+/* Register a device event; *_fence is written only when the WSI call succeeds. */
+VkResult
+radv_RegisterDeviceEventEXT(VkDevice                    _device,
+			    const VkDeviceEventInfoEXT  *device_event_info,
+			    const VkAllocationCallbacks *allocator,
+			    VkFence                     *_fence)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	struct radv_fence *fence = radv_alloc_event_fence(device, allocator);
+
+	if (!fence)
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+	VkResult ret = wsi_register_device_event(
+		_device, &device->physical_device->wsi_device,
+		device_event_info, allocator, &fence->fence_wsi);
+	/* On failure the fence was never handed to the caller, so free it here. */
+	if (ret == VK_SUCCESS)
+		*_fence = radv_fence_to_handle(fence);
+	else
+		vk_free2(&device->instance->alloc, allocator, fence);
+	return ret;
+}
+
+/* Register a display event; *_fence is written only when the WSI call succeeds. */
+VkResult
+radv_RegisterDisplayEventEXT(VkDevice                           _device,
+			     VkDisplayKHR                       display,
+			     const VkDisplayEventInfoEXT        *display_event_info,
+			     const VkAllocationCallbacks        *allocator,
+			     VkFence                            *_fence)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	struct radv_fence *fence = radv_alloc_event_fence(device, allocator);
+
+	if (!fence)
+		return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+	VkResult ret = wsi_register_display_event(
+		_device, &device->physical_device->wsi_device,
+		display, display_event_info, allocator, &fence->fence_wsi);
+	/* On failure the fence was never handed to the caller, so free it here. */
+	if (ret == VK_SUCCESS)
+		*_fence = radv_fence_to_handle(fence);
+	else
+		vk_free2(&device->instance->alloc, allocator, fence);
+	return ret;
+}
+
+VkResult
+radv_GetSwapchainCounterEXT(VkDevice                    _device,
+			    VkSwapchainKHR              swapchain,
+			    VkSurfaceCounterFlagBitsEXT flag_bits,
+			    uint64_t                    *value)
+{
+	RADV_FROM_HANDLE(radv_device, device, _device);
+	/* Device-level entry point: the wsi_device is reached through device->physical_device. */
+	return wsi_get_swapchain_counter(_device,
+					 &device->physical_device->wsi_device,
+					 swapchain,
+					 flag_bits,
+					 value);
+}
+
diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index 7cd863e..63e07e4 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -37,114 +37,20 @@
 
 static void
 si_write_harvested_raster_configs(struct radv_physical_device *physical_device,
-                                  struct radeon_winsys_cs *cs,
+                                  struct radeon_cmdbuf *cs,
 				  unsigned raster_config,
 				  unsigned raster_config_1)
 {
-	unsigned sh_per_se = MAX2(physical_device->rad_info.max_sh_per_se, 1);
 	unsigned num_se = MAX2(physical_device->rad_info.max_se, 1);
-	unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
-	unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16);
-	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-	unsigned rb_per_se = num_rb / num_se;
-	unsigned se_mask[4];
+	unsigned raster_config_se[4];
 	unsigned se;
 
-	se_mask[0] = ((1 << rb_per_se) - 1) & rb_mask;
-	se_mask[1] = (se_mask[0] << rb_per_se) & rb_mask;
-	se_mask[2] = (se_mask[1] << rb_per_se) & rb_mask;
-	se_mask[3] = (se_mask[2] << rb_per_se) & rb_mask;
-
-	assert(num_se == 1 || num_se == 2 || num_se == 4);
-	assert(sh_per_se == 1 || sh_per_se == 2);
-	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-	/* XXX: I can't figure out what the *_XSEL and *_YSEL
-	 * fields are for, so I'm leaving them as their default
-	 * values. */
-
-	if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-			     (!se_mask[2] && !se_mask[3]))) {
-		raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-		if (!se_mask[0] && !se_mask[1]) {
-			raster_config_1 |=
-				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-		} else {
-			raster_config_1 |=
-				S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-		}
-	}
+	ac_get_harvested_configs(&physical_device->rad_info,
+				 raster_config,
+				 &raster_config_1,
+				 raster_config_se);
 
 	for (se = 0; se < num_se; se++) {
-		unsigned raster_config_se = raster_config;
-		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-		int idx = (se / 2) * 2;
-
-		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-			raster_config_se &= C_028350_SE_MAP;
-
-			if (!se_mask[idx]) {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-			}
-		}
-
-		pkr0_mask &= rb_mask;
-		pkr1_mask &= rb_mask;
-		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-			raster_config_se &= C_028350_PKR_MAP;
-
-			if (!pkr0_mask) {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-			}
-		}
-
-		if (rb_per_se >= 2) {
-			unsigned rb0_mask = 1 << (se * rb_per_se);
-			unsigned rb1_mask = rb0_mask << 1;
-
-			rb0_mask &= rb_mask;
-			rb1_mask &= rb_mask;
-			if (!rb0_mask || !rb1_mask) {
-				raster_config_se &= C_028350_RB_MAP_PKR0;
-
-				if (!rb0_mask) {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-				} else {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-				}
-			}
-
-			if (rb_per_se > 2) {
-				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-				rb1_mask = rb0_mask << 1;
-				rb0_mask &= rb_mask;
-				rb1_mask &= rb_mask;
-				if (!rb0_mask || !rb1_mask) {
-					raster_config_se &= C_028350_RB_MAP_PKR1;
-
-					if (!rb0_mask) {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-					} else {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-					}
-				}
-			}
-		}
-
 		/* GRBM_GFX_INDEX has a different offset on SI and CI+ */
 		if (physical_device->rad_info.chip_class < CIK)
 			radeon_set_config_reg(cs, R_00802C_GRBM_GFX_INDEX,
@@ -155,9 +61,7 @@
 			radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
 					       S_030800_SE_INDEX(se) | S_030800_SH_BROADCAST_WRITES(1) |
 					       S_030800_INSTANCE_BROADCAST_WRITES(1));
-		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
-		if (physical_device->rad_info.chip_class >= CIK)
-			radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
+		radeon_set_context_reg(cs, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
 	}
 
 	/* GRBM_GFX_INDEX has a different offset on SI and CI+ */
@@ -170,11 +74,14 @@
 		radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX,
 				       S_030800_SE_BROADCAST_WRITES(1) | S_030800_SH_BROADCAST_WRITES(1) |
 				       S_030800_INSTANCE_BROADCAST_WRITES(1));
+
+	if (physical_device->rad_info.chip_class >= CIK)
+		radeon_set_context_reg(cs, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
 }
 
 static void
 si_emit_compute(struct radv_physical_device *physical_device,
-                struct radeon_winsys_cs *cs)
+                struct radeon_cmdbuf *cs)
 {
 	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
 	radeon_emit(cs, 0);
@@ -228,94 +135,15 @@
 
 static void
 si_set_raster_config(struct radv_physical_device *physical_device,
-		     struct radeon_winsys_cs *cs)
+		     struct radeon_cmdbuf *cs)
 {
 	unsigned num_rb = MIN2(physical_device->rad_info.num_render_backends, 16);
 	unsigned rb_mask = physical_device->rad_info.enabled_rb_mask;
 	unsigned raster_config, raster_config_1;
 
-	switch (physical_device->rad_info.family) {
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-		raster_config = 0x2a00126a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_VERDE:
-		raster_config = 0x0000124a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_OLAND:
-		raster_config = 0x00000082;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAINAN:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_BONAIRE:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAWAII:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_FIJI:
-		if (physical_device->rad_info.cik_macrotile_mode_array[0] == 0x000000e8) {
-			/* old kernels with old tiling config */
-			raster_config = 0x16000012;
-			raster_config_1 = 0x0000002a;
-		} else {
-			raster_config = 0x3a00161a;
-			raster_config_1 = 0x0000002e;
-		}
-		break;
-	case CHIP_POLARIS10:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_VEGAM:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_TONGA:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_ICELAND:
-		if (num_rb == 1)
-			raster_config = 0x00000000;
-		else
-			raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_CARRIZO:
-		raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KAVERI:
-		/* KV should be 0x00000002, but that causes problems with radeon */
-		raster_config = 0x00000000; /* 0x00000002 */
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_STONEY:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	default:
-		fprintf(stderr,
-			"radv: Unknown GPU, using 0 for raster_config\n");
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	}
+	ac_get_raster_config(&physical_device->rad_info,
+			     &raster_config,
+			     &raster_config_1);
 
 	/* Always use the default config when all backends are enabled
 	 * (or when we failed to determine the enabled backends).
@@ -335,7 +163,7 @@
 
 static void
 si_emit_config(struct radv_physical_device *physical_device,
-	       struct radeon_winsys_cs *cs)
+	       struct radeon_cmdbuf *cs)
 {
 	int i;
 
@@ -514,6 +342,7 @@
 		switch (physical_device->rad_info.family) {
 		case CHIP_VEGA10:
 		case CHIP_VEGA12:
+		case CHIP_VEGA20:
 			pc_lines = 4096;
 			break;
 		case CHIP_RAVEN:
@@ -571,7 +400,7 @@
 void
 cik_create_gfx_config(struct radv_device *device)
 {
-	struct radeon_winsys_cs *cs = device->ws->cs_create(device->ws, RING_GFX);
+	struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, RING_GFX);
 	if (!cs)
 		return;
 
@@ -628,7 +457,7 @@
 }
 
 void
-si_write_viewport(struct radeon_winsys_cs *cs, int first_vp,
+si_write_viewport(struct radeon_cmdbuf *cs, int first_vp,
                   int count, const VkViewport *viewports)
 {
 	int i;
@@ -687,7 +516,7 @@
 }
 
 void
-si_write_scissors(struct radeon_winsys_cs *cs, int first,
+si_write_scissors(struct radeon_cmdbuf *cs, int first,
                   int count, const VkRect2D *scissors,
                   const VkViewport *viewports, bool can_use_guardband)
 {
@@ -703,16 +532,16 @@
 		VkRect2D scissor = si_intersect_scissor(&scissors[i], &viewport_scissor);
 
 		get_viewport_xform(viewports + i, scale, translate);
-		scale[0] = abs(scale[0]);
-		scale[1] = abs(scale[1]);
+		scale[0] = fabsf(scale[0]);
+		scale[1] = fabsf(scale[1]);
 
 		if (scale[0] < 0.5)
 			scale[0] = 0.5;
 		if (scale[1] < 0.5)
 			scale[1] = 0.5;
 
-		guardband_x = MIN2(guardband_x, (max_range - abs(translate[0])) / scale[0]);
-		guardband_y = MIN2(guardband_y, (max_range - abs(translate[1])) / scale[1]);
+		guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]);
+		guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]);
 
 		radeon_emit(cs, S_028250_TL_X(scissor.offset.x) |
 			    S_028250_TL_Y(scissor.offset.y) |
@@ -844,8 +673,7 @@
 
 }
 
-void si_cs_emit_write_event_eop(struct radeon_winsys_cs *cs,
-				bool predicated,
+void si_cs_emit_write_event_eop(struct radeon_cmdbuf *cs,
 				enum chip_class chip_class,
 				bool is_mec,
 				unsigned event, unsigned event_flags,
@@ -859,22 +687,28 @@
 		EVENT_INDEX(5) |
 		event_flags;
 	unsigned is_gfx8_mec = is_mec && chip_class < GFX9;
+	unsigned sel = EOP_DATA_SEL(data_sel);
+
+	/* Wait for write confirmation before writing data, but don't send
+	 * an interrupt. */
+	if (data_sel != EOP_DATA_SEL_DISCARD)
+		sel |= EOP_INT_SEL(EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM);
 
 	if (chip_class >= GFX9 || is_gfx8_mec) {
 		/* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion
 		 * counters) must immediately precede every timestamp event to
 		 * prevent a GPU hang on GFX9.
 		 */
-		if (chip_class == GFX9) {
+		if (chip_class == GFX9 && !is_mec) {
 			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 			radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 			radeon_emit(cs, gfx9_eop_bug_va);
 			radeon_emit(cs, gfx9_eop_bug_va >> 32);
 		}
 
-		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, predicated));
+		radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, is_gfx8_mec ? 5 : 6, false));
 		radeon_emit(cs, op);
-		radeon_emit(cs, EOP_DATA_SEL(data_sel));
+		radeon_emit(cs, sel);
 		radeon_emit(cs, va);            /* address lo */
 		radeon_emit(cs, va >> 32);      /* address hi */
 		radeon_emit(cs, new_fence);     /* immediate data lo */
@@ -888,30 +722,29 @@
 			 * (and optional cache flushes executed) before the timestamp
 			 * is written.
 			 */
-			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+			radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
 			radeon_emit(cs, op);
 			radeon_emit(cs, va);
-			radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+			radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 			radeon_emit(cs, old_fence); /* immediate data */
 			radeon_emit(cs, 0); /* unused */
 		}
 
-		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, predicated));
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, false));
 		radeon_emit(cs, op);
 		radeon_emit(cs, va);
-		radeon_emit(cs, ((va >> 32) & 0xffff) | EOP_DATA_SEL(data_sel));
+		radeon_emit(cs, ((va >> 32) & 0xffff) | sel);
 		radeon_emit(cs, new_fence); /* immediate data */
 		radeon_emit(cs, 0); /* unused */
 	}
 }
 
 void
-si_emit_wait_fence(struct radeon_winsys_cs *cs,
-		   bool predicated,
+si_emit_wait_fence(struct radeon_cmdbuf *cs,
 		   uint64_t va, uint32_t ref,
 		   uint32_t mask)
 {
-	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, predicated));
+	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, false));
 	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
 	radeon_emit(cs, va);
 	radeon_emit(cs, va >> 32);
@@ -921,15 +754,14 @@
 }
 
 static void
-si_emit_acquire_mem(struct radeon_winsys_cs *cs,
+si_emit_acquire_mem(struct radeon_cmdbuf *cs,
                     bool is_mec,
-		    bool predicated,
 		    bool is_gfx9,
                     unsigned cp_coher_cntl)
 {
 	if (is_mec || is_gfx9) {
 		uint32_t hi_val = is_gfx9 ? 0xffffff : 0xff;
-		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, predicated) |
+		radeon_emit(cs, PKT3(PKT3_ACQUIRE_MEM, 5, false) |
 		                            PKT3_SHADER_TYPE_S(is_mec));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
@@ -939,7 +771,7 @@
 		radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
 	} else {
 		/* ACQUIRE_MEM is only required on a compute ring. */
-		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, predicated));
+		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, false));
 		radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
 		radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
 		radeon_emit(cs, 0);               /* CP_COHER_BASE */
@@ -948,7 +780,7 @@
 }
 
 void
-si_cs_emit_cache_flush(struct radeon_winsys_cs *cs,
+si_cs_emit_cache_flush(struct radeon_cmdbuf *cs,
                        enum chip_class chip_class,
 		       uint32_t *flush_cnt,
 		       uint64_t flush_va,
@@ -980,11 +812,12 @@
 			/* Necessary for DCC */
 			if (chip_class >= VI) {
 				si_cs_emit_write_event_eop(cs,
-							   false,
 							   chip_class,
 							   is_mec,
 							   V_028A90_FLUSH_AND_INV_CB_DATA_TS,
-							   0, 0, 0, 0, 0,
+							   0,
+							   EOP_DATA_SEL_DISCARD,
+							   0, 0, 0,
 							   gfx9_eop_bug_va);
 			}
 		}
@@ -1020,26 +853,9 @@
 	if (chip_class >= GFX9 && flush_cb_db) {
 		unsigned cb_db_event, tc_flags;
 
-#if 0
-		/* This breaks a bunch of:
-		   dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input*.
-		   use the big hammer always.
-		*/
 		/* Set the CB/DB flush event. */
-		switch (flush_cb_db) {
-		case RADV_CMD_FLAG_FLUSH_AND_INV_CB:
-			cb_db_event = V_028A90_FLUSH_AND_INV_CB_DATA_TS;
-			break;
-		case RADV_CMD_FLAG_FLUSH_AND_INV_DB:
-			cb_db_event = V_028A90_FLUSH_AND_INV_DB_DATA_TS;
-			break;
-		default:
-			/* both CB & DB */
-			cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-		}
-#else
 		cb_db_event = V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT;
-#endif
+
 		/* These are the only allowed combinations. If you need to
 		 * do multiple operations at once, do them separately.
 		 * All operations that invalidate L2 also seem to invalidate
@@ -1070,10 +886,11 @@
 		assert(flush_cnt);
 		uint32_t old_fence = (*flush_cnt)++;
 
-		si_cs_emit_write_event_eop(cs, false, chip_class, false, cb_db_event, tc_flags, 1,
+		si_cs_emit_write_event_eop(cs, chip_class, false, cb_db_event, tc_flags,
+					   EOP_DATA_SEL_VALUE_32BIT,
 					   flush_va, old_fence, *flush_cnt,
 					   gfx9_eop_bug_va);
-		si_emit_wait_fence(cs, false, flush_va, *flush_cnt, 0xffffffff);
+		si_emit_wait_fence(cs, flush_va, *flush_cnt, 0xffffffff);
 	}
 
 	/* VGT state sync */
@@ -1097,7 +914,7 @@
 
 	if ((flush_bits & RADV_CMD_FLAG_INV_GLOBAL_L2) ||
 	    (chip_class <= CIK && (flush_bits & RADV_CMD_FLAG_WRITEBACK_GLOBAL_L2))) {
-		si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9,
+		si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9,
 				    cp_coher_cntl |
 				    S_0085F0_TC_ACTION_ENA(1) |
 				    S_0085F0_TCL1_ACTION_ENA(1) |
@@ -1111,7 +928,7 @@
 			 *
 			 * WB doesn't work without NC.
 			 */
-			si_emit_acquire_mem(cs, is_mec, false,
+			si_emit_acquire_mem(cs, is_mec,
 					    chip_class >= GFX9,
 					    cp_coher_cntl |
 					    S_0301F0_TC_WB_ACTION_ENA(1) |
@@ -1120,7 +937,7 @@
 		}
 		if (flush_bits & RADV_CMD_FLAG_INV_VMEM_L1) {
 			si_emit_acquire_mem(cs, is_mec,
-					    false, chip_class >= GFX9,
+					    chip_class >= GFX9,
 					    cp_coher_cntl |
 					    S_0085F0_TCL1_ACTION_ENA(1));
 			cp_coher_cntl = 0;
@@ -1131,7 +948,17 @@
 	 * Therefore, it should be last. Done in PFP.
 	 */
 	if (cp_coher_cntl)
-		si_emit_acquire_mem(cs, is_mec, false, chip_class >= GFX9, cp_coher_cntl);
+		si_emit_acquire_mem(cs, is_mec, chip_class >= GFX9, cp_coher_cntl);
+
+	if (flush_bits & RADV_CMD_FLAG_START_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
+			        EVENT_INDEX(0));
+	} else if (flush_bits & RADV_CMD_FLAG_STOP_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
+			        EVENT_INDEX(0));
+	}
 }
 
 void
@@ -1146,7 +973,9 @@
 	                                          RADV_CMD_FLAG_FLUSH_AND_INV_DB_META |
 	                                          RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
 	                                          RADV_CMD_FLAG_VS_PARTIAL_FLUSH |
-	                                          RADV_CMD_FLAG_VGT_FLUSH);
+	                                          RADV_CMD_FLAG_VGT_FLUSH |
+						  RADV_CMD_FLAG_START_PIPELINE_STATS |
+						  RADV_CMD_FLAG_STOP_PIPELINE_STATS);
 
 	if (!cmd_buffer->state.flush_bits)
 		return;
@@ -1176,12 +1005,23 @@
 
 /* sets the CP predication state using a boolean stored at va */
 void
-si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
+si_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer,
+			      bool inverted, uint64_t va)
 {
 	uint32_t op = 0;
 
-	if (va)
-		op = PRED_OP(PREDICATION_OP_BOOL64) | PREDICATION_DRAW_VISIBLE;
+	if (va) {
+		op = PRED_OP(PREDICATION_OP_BOOL64);
+
+		/* By default, our internal rendering commands are discarded
+		 * only if the predicate is non-zero (i.e. DRAW_VISIBLE). But
+		 * VK_EXT_conditional_rendering also allows discarding commands
+		 * when the predicate is zero, which means we have to use a
+		 * different flag.
+		 */
+		op |= inverted ? PREDICATION_DRAW_VISIBLE :
+				 PREDICATION_DRAW_NOT_VISIBLE;
+	}
 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9) {
 		radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
 		radeon_emit(cmd_buffer->cs, op);
@@ -1227,7 +1067,7 @@
 			   uint64_t dst_va, uint64_t src_va,
 			   unsigned size, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = cmd_buffer->cs;
+	struct radeon_cmdbuf *cs = cmd_buffer->cs;
 	uint32_t header = 0, command = 0;
 
 	assert(size <= cp_dma_max_byte_count(cmd_buffer));
@@ -1255,9 +1095,9 @@
 	if (cmd_buffer->device->physical_device->rad_info.chip_class >= GFX9 &&
 	    !(flags & CP_DMA_CLEAR) &&
 	    src_va == dst_va)
-		header |= S_411_DSL_SEL(V_411_NOWHERE); /* prefetch only */
+		header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
 	else if (flags & CP_DMA_USE_L2)
-		header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);
+		header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
 
 	if (flags & CP_DMA_CLEAR)
 		header |= S_411_SRC_SEL(V_411_DATA);
@@ -1533,7 +1373,7 @@
 	return max_dist[log_samples];
 }
 
-void radv_cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples)
+void radv_cayman_emit_msaa_sample_locs(struct radeon_cmdbuf *cs, int nr_samples)
 {
 	switch (nr_samples) {
 	default:
diff --git a/src/amd/vulkan/vk_format_layout.csv b/src/amd/vulkan/vk_format_layout.csv
index ae9ceda..f9c2e6f 100644
--- a/src/amd/vulkan/vk_format_layout.csv
+++ b/src/amd/vulkan/vk_format_layout.csv
@@ -148,16 +148,16 @@
 VK_FORMAT_BC6H_SFLOAT_BLOCK          , bptc,  4, 4, x128,     ,     ,     , xyz1, rgb
 VK_FORMAT_BC7_UNORM_BLOCK            , bptc,  4, 4, x128,     ,     ,     , xyzw, rgb
 VK_FORMAT_BC7_SRGB_BLOCK             , bptc,  4, 4, x128,     ,     ,     , xyzw, srgb
-VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK,
-VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK,
-VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK,
-VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK,
-VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK,
-VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK,
-VK_FORMAT_EAC_R11_UNORM_BLOCK,
-VK_FORMAT_EAC_R11_SNORM_BLOCK,
-VK_FORMAT_EAC_R11G11_UNORM_BLOCK,
-VK_FORMAT_EAC_R11G11_SNORM_BLOCK,
+VK_FORMAT_ETC2_R8G8B8_UNORM_BLOCK    , etc,   4, 4, x64,      ,     ,     , xyz1, rgb
+VK_FORMAT_ETC2_R8G8B8_SRGB_BLOCK     , etc,   4, 4, x64,      ,     ,     , xyz1, srgb
+VK_FORMAT_ETC2_R8G8B8A1_UNORM_BLOCK  , etc,   4, 4, x64,      ,     ,     , xyzw, rgb
+VK_FORMAT_ETC2_R8G8B8A1_SRGB_BLOCK   , etc,   4, 4, x64,      ,     ,     , xyzw, srgb
+VK_FORMAT_ETC2_R8G8B8A8_UNORM_BLOCK  , etc,   4, 4, x128,     ,     ,     , xyzw, rgb
+VK_FORMAT_ETC2_R8G8B8A8_SRGB_BLOCK   , etc,   4, 4, x128,     ,     ,     , xyzw, srgb
+VK_FORMAT_EAC_R11_UNORM_BLOCK        , etc,   4, 4, x64,      ,     ,     , x001, rgb
+VK_FORMAT_EAC_R11_SNORM_BLOCK        , etc,   4, 4, x64,      ,     ,     , x001, rgb
+VK_FORMAT_EAC_R11G11_UNORM_BLOCK     , etc,   4, 4, x128,     ,     ,     , xy01, rgb
+VK_FORMAT_EAC_R11G11_SNORM_BLOCK     , etc,   4, 4, x128,     ,     ,     , xy01, rgb
 VK_FORMAT_ASTC_4x4_UNORM_BLOCK,
 VK_FORMAT_ASTC_4x4_SRGB_BLOCK,
 VK_FORMAT_ASTC_5x4_UNORM_BLOCK,
diff --git a/src/amd/vulkan/vk_format_table.py b/src/amd/vulkan/vk_format_table.py
index cd1af62..604aac8 100644
--- a/src/amd/vulkan/vk_format_table.py
+++ b/src/amd/vulkan/vk_format_table.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 CopyRight = '''
 /**************************************************************************
@@ -79,24 +80,24 @@
     if format.nr_channels() <= 1:
         func(format.le_channels, format.le_swizzles)
     else:
-        print '#ifdef PIPE_ARCH_BIG_ENDIAN'
+        print('#ifdef PIPE_ARCH_BIG_ENDIAN')
         func(format.be_channels, format.be_swizzles)
-        print '#else'
+        print('#else')
         func(format.le_channels, format.le_swizzles)
-        print '#endif'
+        print('#endif')
 
 def write_format_table(formats):
-    print '/* This file is autogenerated by vk_format_table.py from vk_format_layout.csv. Do not edit directly. */'
-    print
+    print('/* This file is autogenerated by vk_format_table.py from vk_format_layout.csv. Do not edit directly. */')
+    print()
     # This will print the copyright message on the top of this file
-    print CopyRight.strip()
-    print
-    print '#include "stdbool.h"'
-    print '#include "vk_format.h"'
-    print
+    print(CopyRight.strip())
+    print()
+    print('#include "stdbool.h"')
+    print('#include "vk_format.h"')
+    print()
     
     def do_channel_array(channels, swizzles):
-        print "   {"
+        print("   {")
         for i in range(4):
             channel = channels[i]
             if i < 3:
@@ -104,13 +105,13 @@
             else:
                 sep = ""
             if channel.size:
-                print "      {%s, %s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), bool_map(channel.scaled), channel.size, channel.shift, sep, "xyzw"[i], channel.name)
+                print("      {%s, %s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), bool_map(channel.scaled), channel.size, channel.shift, sep, "xyzw"[i], channel.name))
             else:
-                print "      {0, 0, 0, 0, 0}%s" % (sep,)
-        print "   },"
+                print("      {0, 0, 0, 0, 0}%s" % (sep,))
+        print("   },")
 
     def do_swizzle_array(channels, swizzles):
-        print "   {"
+        print("   {")
         for i in range(4):
             swizzle = swizzles[i]
             if i < 3:
@@ -121,43 +122,43 @@
                 comment = colorspace_channels_map[format.colorspace][i]
             except (KeyError, IndexError):
                 comment = 'ignored'
-            print "      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)
-        print "   },"
+            print("      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment))
+        print("   },")
 
     for format in formats:
-        print 'static const struct vk_format_description'
-        print 'vk_format_%s_description = {' % (format.short_name(),)
-        print "   %s," % (format.name,)
-        print "   \"%s\"," % (format.name,)
-        print "   \"%s\"," % (format.short_name(),)
-        print "   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())
-        print "   %s," % (layout_map(format.layout),)
-        print "   %u,\t/* nr_channels */" % (format.nr_channels(),)
-        print "   %s,\t/* is_array */" % (bool_map(format.is_array()),)
-        print "   %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),)
-        print "   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)
+        print('static const struct vk_format_description')
+        print('vk_format_%s_description = {' % (format.short_name(),))
+        print("   %s," % (format.name,))
+        print("   \"%s\"," % (format.name,))
+        print("   \"%s\"," % (format.short_name(),))
+        print("   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size()))
+        print("   %s," % (layout_map(format.layout),))
+        print("   %u,\t/* nr_channels */" % (format.nr_channels(),))
+        print("   %s,\t/* is_array */" % (bool_map(format.is_array()),))
+        print("   %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),))
+        print("   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),))
         print_channels(format, do_channel_array)
         print_channels(format, do_swizzle_array)
-        print "   %s," % (colorspace_map(format.colorspace),)
-        print "};"
-        print
+        print("   %s," % (colorspace_map(format.colorspace),))
+        print("};")
+        print()
         
-    print "const struct vk_format_description *"
-    print "vk_format_description(VkFormat format)"
-    print "{"
-    print "   if (format > VK_FORMAT_END_RANGE) {"
-    print "      return NULL;"
-    print "   }"
-    print
-    print "   switch (format) {"
+    print("const struct vk_format_description *")
+    print("vk_format_description(VkFormat format)")
+    print("{")
+    print("   if (format > VK_FORMAT_END_RANGE) {")
+    print("      return NULL;")
+    print("   }")
+    print()
+    print("   switch (format) {")
     for format in formats:
-        print "   case %s:" % format.name
-        print "      return &vk_format_%s_description;" % (format.short_name(),)
-    print "   default:"
-    print "      return NULL;"
-    print "   }"
-    print "}"
-    print
+        print("   case %s:" % format.name)
+        print("      return &vk_format_%s_description;" % (format.short_name(),))
+    print("   default:")
+    print("      return NULL;")
+    print("   }")
+    print("}")
+    print()
 
 
 def main():
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
index 8ed3e53..25764d9 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_bo.c
@@ -38,7 +38,6 @@
 
 #include "util/u_atomic.h"
 
-
 static void radv_amdgpu_winsys_bo_destroy(struct radeon_winsys_bo *_bo);
 
 static int
@@ -306,7 +305,9 @@
 	}
 
 	r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-				  size, alignment, 0, &va, &va_handle, 0);
+				  size, alignment, 0, &va, &va_handle,
+				  (flags & RADEON_FLAG_32BIT ? AMDGPU_VA_RANGE_32_BIT : 0) |
+				   AMDGPU_VA_RANGE_HIGH);
 	if (r)
 		goto error_va_alloc;
 
@@ -424,7 +425,8 @@
 		goto error;
 
 	if (amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-	                          size, 1 << 12, 0, &va, &va_handle, 0))
+	                          size, 1 << 12, 0, &va, &va_handle,
+				  AMDGPU_VA_RANGE_HIGH))
 		goto error_va_alloc;
 
 	if (amdgpu_bo_va_op(buf_handle, 0, size, va, 0, AMDGPU_VA_OP_MAP))
@@ -480,7 +482,8 @@
 		goto error_query;
 
 	r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
-				  result.alloc_size, 1 << 20, 0, &va, &va_handle, 0);
+				  result.alloc_size, 1 << 20, 0, &va, &va_handle,
+				  AMDGPU_VA_RANGE_HIGH);
 	if (r)
 		goto error_query;
 
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 3103a29..149c256 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -41,7 +41,7 @@
 };
 
 struct radv_amdgpu_cs {
-	struct radeon_winsys_cs base;
+	struct radeon_cmdbuf base;
 	struct radv_amdgpu_winsys *ws;
 
 	struct amdgpu_cs_ib_info    ib;
@@ -51,7 +51,6 @@
 	unsigned                    max_num_buffers;
 	unsigned                    num_buffers;
 	amdgpu_bo_handle            *handles;
-	uint8_t                     *priorities;
 
 	struct radeon_winsys_bo     **old_ib_buffers;
 	unsigned                    num_old_ib_buffers;
@@ -66,16 +65,15 @@
 	unsigned                    num_virtual_buffers;
 	unsigned                    max_num_virtual_buffers;
 	struct radeon_winsys_bo     **virtual_buffers;
-	uint8_t                     *virtual_buffer_priorities;
 	int                         *virtual_buffer_hash_table;
 
 	/* For chips that don't support chaining. */
-	struct radeon_winsys_cs     *old_cs_buffers;
+	struct radeon_cmdbuf     *old_cs_buffers;
 	unsigned                    num_old_cs_buffers;
 };
 
 static inline struct radv_amdgpu_cs *
-radv_amdgpu_cs(struct radeon_winsys_cs *base)
+radv_amdgpu_cs(struct radeon_cmdbuf *base)
 {
 	return (struct radv_amdgpu_cs*)base;
 }
@@ -193,7 +191,7 @@
 	return false;
 }
 
-static void radv_amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
+static void radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
 
@@ -206,17 +204,15 @@
 		cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
 
 	for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
-		struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
+		struct radeon_cmdbuf *rcs = &cs->old_cs_buffers[i];
 		free(rcs->buf);
 	}
 
 	free(cs->old_cs_buffers);
 	free(cs->old_ib_buffers);
 	free(cs->virtual_buffers);
-	free(cs->virtual_buffer_priorities);
 	free(cs->virtual_buffer_hash_table);
 	free(cs->handles);
-	free(cs->priorities);
 	free(cs);
 }
 
@@ -229,7 +225,7 @@
 	cs->hw_ip = ring_to_hw_ip(ring_type);
 }
 
-static struct radeon_winsys_cs *
+static struct radeon_cmdbuf *
 radv_amdgpu_cs_create(struct radeon_winsys *ws,
 		      enum ring_type ring_type)
 {
@@ -266,7 +262,7 @@
 		cs->ib_size_ptr = &cs->ib.size;
 		cs->ib.size = 0;
 
-		ws->cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+		ws->cs_add_buffer(&cs->base, cs->ib_buffer);
 	} else {
 		cs->base.buf = malloc(16384);
 		cs->base.max_dw = 4096;
@@ -279,7 +275,7 @@
 	return &cs->base;
 }
 
-static void radv_amdgpu_cs_grow(struct radeon_winsys_cs *_cs, size_t min_size)
+static void radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
 
@@ -355,7 +351,7 @@
 	ib_size = MIN2(ib_size, 0xfffff);
 
 	while (!cs->base.cdw || (cs->base.cdw & 7) != 4)
-		cs->base.buf[cs->base.cdw++] = 0xffff1000;
+		radeon_emit(&cs->base, 0xffff1000);
 
 	*cs->ib_size_ptr |= cs->base.cdw + 4;
 
@@ -387,13 +383,14 @@
 		cs->ib_buffer = cs->old_ib_buffers[--cs->num_old_ib_buffers];
 	}
 
-	cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+	cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
 
-	cs->base.buf[cs->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
-	cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
-	cs->base.buf[cs->base.cdw++] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
-	cs->ib_size_ptr = cs->base.buf + cs->base.cdw;
-	cs->base.buf[cs->base.cdw++] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
+	radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
+	radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va);
+	radeon_emit(&cs->base, radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32);
+	radeon_emit(&cs->base, S_3F2_CHAIN(1) | S_3F2_VALID(1));
+
+	cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
 
 	cs->base.buf = (uint32_t *)cs->ib_mapped;
 	cs->base.cdw = 0;
@@ -401,13 +398,13 @@
 
 }
 
-static bool radv_amdgpu_cs_finalize(struct radeon_winsys_cs *_cs)
+static bool radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
 
 	if (cs->ws->use_ib_bos) {
 		while (!cs->base.cdw || (cs->base.cdw & 7) != 0)
-			cs->base.buf[cs->base.cdw++] = 0xffff1000;
+			radeon_emit(&cs->base, 0xffff1000);
 
 		*cs->ib_size_ptr |= cs->base.cdw;
 
@@ -417,7 +414,7 @@
 	return !cs->failed;
 }
 
-static void radv_amdgpu_cs_reset(struct radeon_winsys_cs *_cs)
+static void radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
 	cs->base.cdw = 0;
@@ -438,7 +435,7 @@
 	cs->num_virtual_buffers = 0;
 
 	if (cs->ws->use_ib_bos) {
-		cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer, 8);
+		cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
 
 		for (unsigned i = 0; i < cs->num_old_ib_buffers; ++i)
 			cs->ws->base.buffer_destroy(cs->old_ib_buffers[i]);
@@ -449,7 +446,7 @@
 		cs->ib.size = 0;
 	} else {
 		for (unsigned i = 0; i < cs->num_old_cs_buffers; ++i) {
-			struct radeon_winsys_cs *rcs = &cs->old_cs_buffers[i];
+			struct radeon_cmdbuf *rcs = &cs->old_cs_buffers[i];
 			free(rcs->buf);
 		}
 
@@ -482,26 +479,21 @@
 }
 
 static void radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs,
-					       amdgpu_bo_handle bo,
-					       uint8_t priority)
+					       amdgpu_bo_handle bo)
 {
 	unsigned hash;
 	int index = radv_amdgpu_cs_find_buffer(cs, bo);
 
-	if (index != -1) {
-		cs->priorities[index] = MAX2(cs->priorities[index], priority);
+	if (index != -1)
 		return;
-	}
 
 	if (cs->num_buffers == cs->max_num_buffers) {
 		unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
 		cs->handles = realloc(cs->handles, new_count * sizeof(amdgpu_bo_handle));
-		cs->priorities = realloc(cs->priorities, new_count * sizeof(uint8_t));
 		cs->max_num_buffers = new_count;
 	}
 
 	cs->handles[cs->num_buffers] = bo;
-	cs->priorities[cs->num_buffers] = priority;
 
 	hash = ((uintptr_t)bo >> 6) & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
 	cs->buffer_hash_table[hash] = cs->num_buffers;
@@ -509,9 +501,8 @@
 	++cs->num_buffers;
 }
 
-static void radv_amdgpu_cs_add_virtual_buffer(struct radeon_winsys_cs *_cs,
-                                              struct radeon_winsys_bo *bo,
-                                              uint8_t priority)
+static void radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs,
+                                              struct radeon_winsys_bo *bo)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
 	unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
@@ -526,12 +517,10 @@
 	if (cs->virtual_buffer_hash_table[hash] >= 0) {
 		int idx = cs->virtual_buffer_hash_table[hash];
 		if (cs->virtual_buffers[idx] == bo) {
-			cs->virtual_buffer_priorities[idx] = MAX2(cs->virtual_buffer_priorities[idx], priority);
 			return;
 		}
 		for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
 			if (cs->virtual_buffers[i] == bo) {
-				cs->virtual_buffer_priorities[i] = MAX2(cs->virtual_buffer_priorities[i], priority);
 				cs->virtual_buffer_hash_table[hash] = i;
 				return;
 			}
@@ -541,59 +530,54 @@
 	if(cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
 		cs->max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
 		cs->virtual_buffers = realloc(cs->virtual_buffers, sizeof(struct radv_amdgpu_virtual_virtual_buffer*) * cs->max_num_virtual_buffers);
-		cs->virtual_buffer_priorities = realloc(cs->virtual_buffer_priorities, sizeof(uint8_t) * cs->max_num_virtual_buffers);
 	}
 
 	cs->virtual_buffers[cs->num_virtual_buffers] = bo;
-	cs->virtual_buffer_priorities[cs->num_virtual_buffers] = priority;
 
 	cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
 	++cs->num_virtual_buffers;
 
 }
 
-static void radv_amdgpu_cs_add_buffer(struct radeon_winsys_cs *_cs,
-				 struct radeon_winsys_bo *_bo,
-				 uint8_t priority)
+static void radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs,
+				      struct radeon_winsys_bo *_bo)
 {
 	struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
 	struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
 
 	if (bo->is_virtual)  {
-		radv_amdgpu_cs_add_virtual_buffer(_cs, _bo, priority);
+		radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
 		return;
 	}
 
 	if (bo->base.is_local)
 		return;
 
-	radv_amdgpu_cs_add_buffer_internal(cs, bo->bo, priority);
+	radv_amdgpu_cs_add_buffer_internal(cs, bo->bo);
 }
 
-static void radv_amdgpu_cs_execute_secondary(struct radeon_winsys_cs *_parent,
-					     struct radeon_winsys_cs *_child)
+static void radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent,
+					     struct radeon_cmdbuf *_child)
 {
 	struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
 	struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
 
 	for (unsigned i = 0; i < child->num_buffers; ++i) {
-		radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i],
-						   child->priorities[i]);
+		radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i]);
 	}
 
 	for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
-		radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i],
-		                          child->virtual_buffer_priorities[i]);
+		radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
 	}
 
 	if (parent->ws->use_ib_bos) {
 		if (parent->base.cdw + 4 > parent->base.max_dw)
 			radv_amdgpu_cs_grow(&parent->base, 4);
 
-		parent->base.buf[parent->base.cdw++] = PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0);
-		parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address;
-		parent->base.buf[parent->base.cdw++] = child->ib.ib_mc_address >> 32;
-		parent->base.buf[parent->base.cdw++] = child->ib.size;
+		radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER_CIK, 2, 0));
+		radeon_emit(&parent->base, child->ib.ib_mc_address);
+		radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
+		radeon_emit(&parent->base, child->ib.size);
 	} else {
 		if (parent->base.cdw + child->base.cdw > parent->base.max_dw)
 			radv_amdgpu_cs_grow(&parent->base, child->base.cdw);
@@ -604,11 +588,11 @@
 }
 
 static int radv_amdgpu_create_bo_list(struct radv_amdgpu_winsys *ws,
-				      struct radeon_winsys_cs **cs_array,
+				      struct radeon_cmdbuf **cs_array,
 				      unsigned count,
 				      struct radv_amdgpu_winsys_bo **extra_bo_array,
 				      unsigned num_extra_bo,
-				      struct radeon_winsys_cs *extra_cs,
+				      struct radeon_cmdbuf *extra_cs,
 				      const struct radv_winsys_bo_list *radv_bo_list,
 				      amdgpu_bo_list_handle *bo_list)
 {
@@ -645,7 +629,7 @@
 			return 0;
 		}
 		r = amdgpu_bo_list_create(ws->dev, cs->num_buffers, cs->handles,
-					  cs->priorities, bo_list);
+					  NULL, bo_list);
 	} else {
 		unsigned total_buffer_count = num_extra_bo;
 		unsigned unique_bo_count = num_extra_bo;
@@ -669,16 +653,13 @@
 			return 0;
 		}
 		amdgpu_bo_handle *handles = malloc(sizeof(amdgpu_bo_handle) * total_buffer_count);
-		uint8_t *priorities = malloc(sizeof(uint8_t) * total_buffer_count);
-		if (!handles || !priorities) {
+		if (!handles) {
 			free(handles);
-			free(priorities);
 			return -ENOMEM;
 		}
 
 		for (unsigned i = 0; i < num_extra_bo; i++) {
 			handles[i] = extra_bo_array[i]->bo;
-			priorities[i] = 8;
 		}
 
 		for (unsigned i = 0; i < count + !!extra_cs; ++i) {
@@ -694,7 +675,6 @@
 
 			if (unique_bo_count == 0 && !cs->num_virtual_buffers) {
 				memcpy(handles, cs->handles, cs->num_buffers * sizeof(amdgpu_bo_handle));
-				memcpy(priorities, cs->priorities, cs->num_buffers * sizeof(uint8_t));
 				unique_bo_count = cs->num_buffers;
 				continue;
 			}
@@ -704,14 +684,11 @@
 				for (unsigned k = 0; k < unique_bo_so_far; ++k) {
 					if (handles[k] == cs->handles[j]) {
 						found = true;
-						priorities[k] = MAX2(priorities[k],
-								     cs->priorities[j]);
 						break;
 					}
 				}
 				if (!found) {
 					handles[unique_bo_count] = cs->handles[j];
-					priorities[unique_bo_count] = cs->priorities[j];
 					++unique_bo_count;
 				}
 			}
@@ -723,14 +700,11 @@
 					for (unsigned m = 0; m < unique_bo_count; ++m) {
 						if (handles[m] == bo->bo) {
 							found = true;
-							priorities[m] = MAX2(priorities[m],
-									cs->virtual_buffer_priorities[j]);
 							break;
 						}
 					}
 					if (!found) {
 						handles[unique_bo_count] = bo->bo;
-						priorities[unique_bo_count] = cs->virtual_buffer_priorities[j];
 						++unique_bo_count;
 					}
 				}
@@ -739,20 +713,17 @@
 
 		if (radv_bo_list) {
 			unsigned unique_bo_so_far = unique_bo_count;
-			const unsigned default_bo_priority = 7;
 			for (unsigned i = 0; i < radv_bo_list->count; ++i) {
 				struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(radv_bo_list->bos[i]);
 				bool found = false;
 				for (unsigned j = 0; j < unique_bo_so_far; ++j) {
 					if (bo->bo == handles[j]) {
 						found = true;
-						priorities[j] = MAX2(priorities[j], default_bo_priority);
 						break;
 					}
 				}
 				if (!found) {
 					handles[unique_bo_count] = bo->bo;
-					priorities[unique_bo_count] = default_bo_priority;
 					++unique_bo_count;
 				}
 			}
@@ -760,13 +731,12 @@
 
 		if (unique_bo_count > 0) {
 			r = amdgpu_bo_list_create(ws->dev, unique_bo_count, handles,
-						  priorities, bo_list);
+						  NULL, bo_list);
 		} else {
 			*bo_list = 0;
 		}
 
 		free(handles);
-		free(priorities);
 	}
 
 	return r;
@@ -794,10 +764,10 @@
 						int queue_idx,
 						struct radv_winsys_sem_info *sem_info,
 						const struct radv_winsys_bo_list *radv_bo_list,
-						struct radeon_winsys_cs **cs_array,
+						struct radeon_cmdbuf **cs_array,
 						unsigned cs_count,
-						struct radeon_winsys_cs *initial_preamble_cs,
-						struct radeon_winsys_cs *continue_preamble_cs,
+						struct radeon_cmdbuf *initial_preamble_cs,
+						struct radeon_cmdbuf *continue_preamble_cs,
 						struct radeon_winsys_fence *_fence)
 {
 	int r;
@@ -876,10 +846,10 @@
 						 int queue_idx,
 						 struct radv_winsys_sem_info *sem_info,
 						 const struct radv_winsys_bo_list *radv_bo_list,
-						 struct radeon_winsys_cs **cs_array,
+						 struct radeon_cmdbuf **cs_array,
 						 unsigned cs_count,
-						 struct radeon_winsys_cs *initial_preamble_cs,
-						 struct radeon_winsys_cs *continue_preamble_cs,
+						 struct radeon_cmdbuf *initial_preamble_cs,
+						 struct radeon_cmdbuf *continue_preamble_cs,
 						 struct radeon_winsys_fence *_fence)
 {
 	int r;
@@ -893,7 +863,7 @@
 	for (unsigned i = 0; i < cs_count;) {
 		struct radv_amdgpu_cs *cs0 = radv_amdgpu_cs(cs_array[i]);
 		struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
-		struct radeon_winsys_cs *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
+		struct radeon_cmdbuf *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
 		unsigned cnt = MIN2(AMDGPU_CS_MAX_IBS_PER_SUBMIT - !!preamble_cs,
 		                    cs_count - i);
 
@@ -958,10 +928,10 @@
 					       int queue_idx,
 					       struct radv_winsys_sem_info *sem_info,
 					       const struct radv_winsys_bo_list *radv_bo_list,
-					       struct radeon_winsys_cs **cs_array,
+					       struct radeon_cmdbuf **cs_array,
 					       unsigned cs_count,
-					       struct radeon_winsys_cs *initial_preamble_cs,
-					       struct radeon_winsys_cs *continue_preamble_cs,
+					       struct radeon_cmdbuf *initial_preamble_cs,
+					       struct radeon_cmdbuf *continue_preamble_cs,
 					       struct radeon_winsys_fence *_fence)
 {
 	int r;
@@ -983,7 +953,7 @@
 		struct amdgpu_cs_ib_info ibs[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
 		unsigned number_of_ibs = 1;
 		struct radeon_winsys_bo *bos[AMDGPU_CS_MAX_IBS_PER_SUBMIT] = {0};
-		struct radeon_winsys_cs *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
+		struct radeon_cmdbuf *preamble_cs = i ? continue_preamble_cs : initial_preamble_cs;
 		struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
 		uint32_t *ptr;
 		unsigned cnt = 0;
@@ -996,7 +966,7 @@
 			 * IB per submit.
 			 */
 			unsigned new_cs_count = cs->num_old_cs_buffers + 1;
-			struct radeon_winsys_cs *new_cs_array[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
+			struct radeon_cmdbuf *new_cs_array[AMDGPU_CS_MAX_IBS_PER_SUBMIT];
 			unsigned idx = 0;
 
 			for (unsigned j = 0; j < cs->num_old_cs_buffers; j++)
@@ -1004,7 +974,7 @@
 			new_cs_array[idx++] = cs_array[i];
 
 			for (unsigned j = 0; j < new_cs_count; j++) {
-				struct radeon_winsys_cs *rcs = new_cs_array[j];
+				struct radeon_cmdbuf *rcs = new_cs_array[j];
 				bool needs_preamble = preamble_cs && j == 0;
 				unsigned size = 0;
 
@@ -1134,10 +1104,10 @@
 
 static int radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx,
 					int queue_idx,
-					struct radeon_winsys_cs **cs_array,
+					struct radeon_cmdbuf **cs_array,
 					unsigned cs_count,
-					struct radeon_winsys_cs *initial_preamble_cs,
-					struct radeon_winsys_cs *continue_preamble_cs,
+					struct radeon_cmdbuf *initial_preamble_cs,
+					struct radeon_cmdbuf *continue_preamble_cs,
 					struct radv_winsys_sem_info *sem_info,
 					const struct radv_winsys_bo_list *bo_list,
 					bool can_patch,
@@ -1196,7 +1166,7 @@
 	return ret;
 }
 
-static void radv_amdgpu_winsys_cs_dump(struct radeon_winsys_cs *_cs,
+static void radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs,
                                        FILE* file,
                                        const int *trace_ids, int trace_id_count)
 {
@@ -1406,12 +1376,9 @@
 	}
 
 	if (sem_info->wait.sem_count && sem_info->cs_emit_wait) {
-		sem_dependencies = malloc(sizeof(struct drm_amdgpu_cs_chunk_dep) * sem_info->wait.sem_count);
-		if (!sem_dependencies) {
-			r = -ENOMEM;
-			goto error_out;
-		}
+		sem_dependencies = alloca(sizeof(struct drm_amdgpu_cs_chunk_dep) * sem_info->wait.sem_count);
 		int sem_count = 0;
+
 		for (unsigned j = 0; j < sem_info->wait.sem_count; j++) {
 			sem = (struct amdgpu_cs_fence *)sem_info->wait.sem[j];
 			if (!sem->context)
@@ -1450,7 +1417,6 @@
 				 chunks,
 				 &request->seq_no);
 error_out:
-	free(sem_dependencies);
 	free(wait_syncobj);
 	free(signal_syncobj);
 	return r;
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
index 42e83f1..9706c04 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_winsys.c
@@ -45,13 +45,6 @@
 	if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
 		return false;
 
-	/* LLVM 5.0 is required for GFX9. */
-	if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
-		fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
-			HAVE_LLVM >> 8, HAVE_LLVM & 255);
-		return false;
-	}
-
 	ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
 	if (!ws->addrlib) {
 		fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
diff --git a/src/broadcom/Android.genxml.mk b/src/broadcom/Android.genxml.mk
index 1e3f9e5..91e0de0 100644
--- a/src/broadcom/Android.genxml.mk
+++ b/src/broadcom/Android.genxml.mk
@@ -42,27 +42,31 @@
 define pack-header-gen
 	@mkdir -p $(dir $@)
 	@echo "Gen Header: $(PRIVATE_MODULE) <= $(notdir $(@))"
-	$(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) > $@
+	$(hide) $(PRIVATE_SCRIPT) $(PRIVATE_SCRIPT_FLAGS) $(PRIVATE_XML) $(PRIVATE_VER) > $@
 endef
 
 $(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
 $(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v21.xml
+$(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: PRIVATE_VER := 21
 $(intermediates)/broadcom/cle/v3d_packet_v21_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v21.xml $(LOCAL_PATH)/cle/gen_pack_header.py
 	$(call pack-header-gen)
 
 $(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
 $(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: PRIVATE_VER := 33
 $(intermediates)/broadcom/cle/v3d_packet_v33_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
 	$(call pack-header-gen)
 
 $(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
-$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v41.xml
-$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v41.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: PRIVATE_VER := 41
+$(intermediates)/broadcom/cle/v3d_packet_v41_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
 	$(call pack-header-gen)
 
 $(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(LOCAL_PATH)/cle/gen_pack_header.py
-$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v42.xml
-$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v42.xml $(LOCAL_PATH)/cle/gen_pack_header.py
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_XML := $(LOCAL_PATH)/cle/v3d_packet_v33.xml
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: PRIVATE_VER := 42
+$(intermediates)/broadcom/cle/v3d_packet_v42_pack.h: $(LOCAL_PATH)/cle/v3d_packet_v33.xml $(LOCAL_PATH)/cle/gen_pack_header.py
 	$(call pack-header-gen)
 
 $(intermediates)/broadcom/cle/v3d_xml.h: $(addprefix $(MESA_TOP)/src/broadcom/,$(BROADCOM_GENXML_XML_FILES)) $(MESA_TOP)/src/intel/genxml/gen_zipped_file.py
diff --git a/src/broadcom/Makefile.am b/src/broadcom/Makefile.am
index 49267de..4faa772 100644
--- a/src/broadcom/Makefile.am
+++ b/src/broadcom/Makefile.am
@@ -60,6 +60,6 @@
 
 include Makefile.genxml.am
 include Makefile.cle.am
-include Makefile.vc5.am
+include Makefile.v3d.am
 
 CLEANFILES += $(BUILT_SOURCES)
diff --git a/src/broadcom/Makefile.genxml.am b/src/broadcom/Makefile.genxml.am
index 6cfabae..8828207 100644
--- a/src/broadcom/Makefile.genxml.am
+++ b/src/broadcom/Makefile.genxml.am
@@ -28,9 +28,21 @@
 
 $(BROADCOM_GENXML_GENERATED_FILES): cle/gen_pack_header.py
 
-.xml_pack.h:
+cle/v3d_packet_v21_pack.h: $(srcdir)/cle/v3d_packet_v21.xml $(srcdir)/cle/gen_pack_header.py
 	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< > $@ || ($(RM) $@; false)
+	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< 21 > $@ || ($(RM) $@; false)
+
+cle/v3d_packet_v33_pack.h: $(srcdir)/cle/v3d_packet_v33.xml $(srcdir)/cle/gen_pack_header.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< 33 > $@ || ($(RM) $@; false)
+
+cle/v3d_packet_v41_pack.h: $(srcdir)/cle/v3d_packet_v33.xml $(srcdir)/cle/gen_pack_header.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< 41 > $@ || ($(RM) $@; false)
+
+cle/v3d_packet_v42_pack.h: $(srcdir)/cle/v3d_packet_v33.xml $(srcdir)/cle/gen_pack_header.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/cle/gen_pack_header.py $< 42 > $@ || ($(RM) $@; false)
 
 GEN_ZIPPED = $(srcdir)/../intel/genxml/gen_zipped_file.py
 cle/v3d_xml.h: $(GEN_ZIPPED) $(BROADCOM_GENXML_XML_FILES)
diff --git a/src/broadcom/Makefile.sources b/src/broadcom/Makefile.sources
index dc225b9..5955acd 100644
--- a/src/broadcom/Makefile.sources
+++ b/src/broadcom/Makefile.sources
@@ -9,8 +9,6 @@
 BROADCOM_GENXML_XML_FILES = \
 	cle/v3d_packet_v21.xml \
 	cle/v3d_packet_v33.xml \
-	cle/v3d_packet_v41.xml \
-	cle/v3d_packet_v42.xml \
 	$()
 
 BROADCOM_FILES = \
@@ -30,6 +28,7 @@
 	compiler/vir_lower_uniforms.c \
 	compiler/vir_opt_copy_propagate.c \
 	compiler/vir_opt_dead_code.c \
+	compiler/vir_opt_small_immediates.c \
 	compiler/vir_register_allocate.c \
 	compiler/vir_to_qpu.c \
 	compiler/qpu_schedule.c \
diff --git a/src/broadcom/Makefile.vc5.am b/src/broadcom/Makefile.v3d.am
similarity index 87%
rename from src/broadcom/Makefile.vc5.am
rename to src/broadcom/Makefile.v3d.am
index c56cf89..97ef2d7 100644
--- a/src/broadcom/Makefile.vc5.am
+++ b/src/broadcom/Makefile.v3d.am
@@ -3,9 +3,9 @@
 noinst_LTLIBRARIES += libbroadcom_v41.la
 noinst_LTLIBRARIES += libbroadcom_v42.la
 
-if USE_VC5_SIMULATOR
-AM_CFLAGS += $(VC5_SIMULATOR_CFLAGS)
-libbroadcom_la_LDFLAGS = $(VC5_SIMULATOR_LIBS)
+if USE_V3D_SIMULATOR
+AM_CFLAGS += $(V3D_SIMULATOR_CFLAGS)
+libbroadcom_la_LDFLAGS = $(V3D_SIMULATOR_LIBS)
 endif
 
 libbroadcom_v33_la_SOURCES = $(BROADCOM_PER_VERSION_SOURCES)
diff --git a/src/broadcom/cle/gen_pack_header.py b/src/broadcom/cle/gen_pack_header.py
index e576276..f79ee42 100644
--- a/src/broadcom/cle/gen_pack_header.py
+++ b/src/broadcom/cle/gen_pack_header.py
@@ -43,7 +43,7 @@
 #ifndef %(guard)s
 #define %(guard)s
 
-#include "v3d_packet_helpers.h"
+#include "cle/v3d_packet_helpers.h"
 
 """
 
@@ -129,6 +129,12 @@
         else:
             self.default = None
 
+        if "minus_one" in attrs:
+            assert(attrs["minus_one"] == "true")
+            self.minus_one = True
+        else:
+            self.minus_one = False
+
         ufixed_match = Field.ufixed_pattern.match(self.type)
         if ufixed_match:
             self.type = 'ufixed'
@@ -146,6 +152,8 @@
             type = 'bool'
         elif self.type == 'float':
             type = 'float'
+        elif self.type == 'f187':
+            type = 'float'
         elif self.type == 'ufixed':
             type = 'float'
         elif self.type == 'sfixed':
@@ -186,6 +194,8 @@
         self.count = count
         self.size = 0
         self.fields = []
+        self.min_ver = 0
+        self.max_ver = 0
 
     def emit_template_struct(self, dim):
         if self.count == 0:
@@ -208,7 +218,7 @@
             first_byte = field.start // 8
             last_byte = field.end // 8
 
-            for b in xrange(first_byte, last_byte + 1):
+            for b in range(first_byte, last_byte + 1):
                 if not b in bytes:
                     bytes[b] = self.Byte()
 
@@ -228,6 +238,10 @@
         relocs_emitted = set()
         memcpy_fields = set()
 
+        for field in self.fields:
+            if field.minus_one:
+                print("   assert(values->%s >= 1);" % field.name)
+
         for index in range(self.length):
             # Handle MBZ bytes
             if not index in bytes:
@@ -252,7 +266,7 @@
             # uints/ints with no merged fields.
             if len(byte.fields) == 1:
                 field = byte.fields[0]
-                if field.type in ["float", "uint", "int"] and field.start % 8 == 0 and field.end - field.start == 31:
+                if field.type in ["float", "uint", "int"] and field.start % 8 == 0 and field.end - field.start == 31 and not field.minus_one:
                     if field in memcpy_fields:
                         continue
 
@@ -281,6 +295,10 @@
                 end -= field_byte_start
                 extra_shift = 0
 
+                value = "values->%s" % name
+                if field.minus_one:
+                    value = "%s - 1" % value
+
                 if field.type == "mbo":
                     s = "__gen_mbo(%d, %d)" % \
                         (start, end)
@@ -288,28 +306,31 @@
                     extra_shift = (31 - (end - start)) // 8 * 8
                     s = "__gen_address_offset(&values->%s)" % byte.address.name
                 elif field.type == "uint":
-                    s = "__gen_uint(values->%s, %d, %d)" % \
-                        (name, start, end)
+                    s = "__gen_uint(%s, %d, %d)" % \
+                        (value, start, end)
                 elif field.type in self.parser.enums:
-                    s = "__gen_uint(values->%s, %d, %d)" % \
-                        (name, start, end)
+                    s = "__gen_uint(%s, %d, %d)" % \
+                        (value, start, end)
                 elif field.type == "int":
-                    s = "__gen_sint(values->%s, %d, %d)" % \
-                        (name, start, end)
+                    s = "__gen_sint(%s, %d, %d)" % \
+                        (value, start, end)
                 elif field.type == "bool":
-                    s = "__gen_uint(values->%s, %d, %d)" % \
-                        (name, start, end)
+                    s = "__gen_uint(%s, %d, %d)" % \
+                        (value, start, end)
                 elif field.type == "float":
                     s = "#error %s float value mixed in with other fields" % name
+                elif field.type == "f187":
+                    s = "__gen_uint(fui(%s) >> 16, %d, %d)" % \
+                        (value, start, end)
                 elif field.type == "offset":
-                    s = "__gen_offset(values->%s, %d, %d)" % \
-                        (name, start, end)
+                    s = "__gen_offset(%s, %d, %d)" % \
+                        (value, start, end)
                 elif field.type == 'ufixed':
-                    s = "__gen_ufixed(values->%s, %d, %d, %d)" % \
-                        (name, start, end, field.fractional_size)
+                    s = "__gen_ufixed(%s, %d, %d, %d)" % \
+                        (value, start, end, field.fractional_size)
                 elif field.type == 'sfixed':
-                    s = "__gen_sfixed(values->%s, %d, %d, %d)" % \
-                        (name, start, end, field.fractional_size)
+                    s = "__gen_sfixed(%s, %d, %d, %d)" % \
+                        (value, start, end, field.fractional_size)
                 elif field.type in self.parser.structs:
                     s = "__gen_uint(v%d_%d, %d, %d)" % \
                         (index, field_index, start, end)
@@ -354,6 +375,8 @@
                     convert = "__gen_unpack_uint"
                 elif field.type == "float":
                     convert = "__gen_unpack_float"
+                elif field.type == "f187":
+                    convert = "__gen_unpack_f187"
                 elif field.type == "offset":
                     convert = "__gen_unpack_offset"
                 elif field.type == 'ufixed':
@@ -366,8 +389,11 @@
                     print("/* unhandled field %s, type %s */\n" % (field.name, field.type))
                     s = None
 
-                print("   values->%s = %s(%s);" % \
-                      (field.name, convert, ', '.join(args)))
+                plusone = ""
+                if field.minus_one:
+                    plusone = " + 1"
+                print("   values->%s = %s(%s)%s;" % \
+                      (field.name, convert, ', '.join(args), plusone))
 
 class Value(object):
     def __init__(self, attrs):
@@ -375,7 +401,7 @@
         self.value = int(attrs["value"])
 
 class Parser(object):
-    def __init__(self):
+    def __init__(self, ver):
         self.parser = xml.parsers.expat.ParserCreate()
         self.parser.StartElementHandler = self.start_element
         self.parser.EndElementHandler = self.end_element
@@ -386,6 +412,7 @@
         # Set of enum names we've seen.
         self.enums = set()
         self.registers = {}
+        self.ver = ver
 
     def gen_prefix(self, name):
         if name[0] == "_":
@@ -396,10 +423,27 @@
     def gen_guard(self):
         return self.gen_prefix("PACK_H")
 
+    def attrs_version_valid(self, attrs):
+        # Compare as integers: self.ver and the XML attributes are strings.
+        if "min_ver" in attrs and int(self.ver) < int(attrs["min_ver"]):
+            return False
+        if "max_ver" in attrs and int(self.ver) > int(attrs["max_ver"]):
+            return False
+
+        return True
+
+    def group_enabled(self):
+        # 0 means "no bound"; otherwise the bound is the XML attribute string.
+        if self.group.min_ver != 0 and int(self.ver) < int(self.group.min_ver):
+            return False
+        if self.group.max_ver != 0 and int(self.ver) > int(self.group.max_ver):
+            return False
+
+        return True
+
     def start_element(self, name, attrs):
         if name == "vcxml":
-            self.platform = "V3D {}".format(attrs["gen"])
-            self.ver = attrs["gen"].replace('.', '')
+            self.platform = "V3D {}.{}".format(self.ver[0], self.ver[1])
             print(pack_header % {'license': license, 'platform': self.platform, 'guard': self.gen_guard()})
         elif name in ("packet", "struct", "register"):
             default_field = None
@@ -432,6 +476,11 @@
                 field.values = []
                 self.group.fields.append(field)
 
+            if "min_ver" in attrs:
+                self.group.min_ver = attrs["min_ver"]
+            if "max_ver" in attrs:
+                self.group.max_ver = attrs["max_ver"]
+
         elif name == "field":
             self.group.fields.append(Field(self, attrs))
             self.values = []
@@ -439,12 +488,14 @@
             self.values = []
             self.enum = safe_name(attrs["name"])
             self.enums.add(attrs["name"])
+            self.enum_enabled = self.attrs_version_valid(attrs)
             if "prefix" in attrs:
                 self.prefix = attrs["prefix"]
             else:
                 self.prefix= None
         elif name == "value":
-            self.values.append(Value(attrs))
+            if self.attrs_version_valid(attrs):
+                self.values.append(Value(attrs))
 
     def end_element(self, name):
         if name  == "packet":
@@ -463,7 +514,8 @@
         elif name  == "field":
             self.group.fields[-1].values = self.values
         elif name  == "enum":
-            self.emit_enum()
+            if self.enum_enabled:
+                self.emit_enum()
             self.enum = None
         elif name == "vcxml":
             print('#endif /* %s */' % self.gen_guard())
@@ -508,6 +560,9 @@
         print('')
 
     def emit_packet(self):
+        if not self.group_enabled():
+            return
+
         name = self.packet
 
         assert(self.group.fields[0].name == "opcode")
@@ -522,6 +577,9 @@
         print('')
 
     def emit_register(self):
+        if not self.group_enabled():
+            return
+
         name = self.register
         if not self.reg_num == None:
             print('#define %-33s 0x%04x' %
@@ -532,6 +590,9 @@
         self.emit_unpack_function(self.register, self.group)
 
     def emit_struct(self):
+        if not self.group_enabled():
+            return
+
         name = self.struct
 
         self.emit_header(name)
@@ -562,5 +623,5 @@
 
 input_file = sys.argv[1]
 
-p = Parser()
+p = Parser(sys.argv[2])
 p.parse(input_file)
diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
index b1ff832..9838967 100644
--- a/src/broadcom/cle/meson.build
+++ b/src/broadcom/cle/meson.build
@@ -18,16 +18,31 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+# [version, cle XML version]
 v3d_versions = [
-  21,
-  33,
-  41,
-  42,
+  [21, 21],
+  [33, 33],
+  [41, 33],
+  [42, 33]
 ]
 
 v3d_xml_files = []
-foreach v: v3d_versions
-  v3d_xml_files += 'v3d_packet_v@0@.xml'.format(v)
+v3d_xml_pack = []
+foreach _v : v3d_versions
+  v = _v[0]
+  xmlver = _v[1]
+  f = 'v3d_packet_v@0@.xml'.format(xmlver)
+  _name = 'v3d_packet_v@0@_pack.h'.format(v)
+  if not v3d_xml_files.contains(f)
+    v3d_xml_files += f
+  endif
+  v3d_xml_pack += custom_target(
+    _name,
+    input : ['gen_pack_header.py', f],
+    output : _name,
+    command : [prog_python2, '@INPUT@', '@0@'.format(v)],
+    capture : true,
+  )
 endforeach
 
 v3d_xml_h = custom_target(
@@ -38,18 +53,6 @@
   capture : true,
 )
 
-v3d_xml_pack = []
-foreach f : v3d_xml_files
-  _name = '@0@_pack.h'.format(f.split('.')[0])
-  v3d_xml_pack += custom_target(
-    _name,
-    input : ['gen_pack_header.py', f],
-    output : _name,
-    command : [prog_python2, '@INPUT@'],
-    capture : true,
-  )
-endforeach
-
 libbroadcom_cle = static_library(
   ['broadcom_cle', v3d_xml_h],
   'v3d_decoder.c',
diff --git a/src/broadcom/cle/v3d_decoder.c b/src/broadcom/cle/v3d_decoder.c
index 9c457b7..373a1d9 100644
--- a/src/broadcom/cle/v3d_decoder.c
+++ b/src/broadcom/cle/v3d_decoder.c
@@ -37,6 +37,7 @@
 #include "v3d_decoder.h"
 #include "v3d_packet_helpers.h"
 #include "v3d_xml.h"
+#include "broadcom/clif/clif_private.h"
 
 struct v3d_spec {
         uint32_t ver;
@@ -58,6 +59,7 @@
 
 struct parser_context {
         XML_Parser parser;
+        const struct v3d_device_info *devinfo;
         int foo;
         struct location loc;
 
@@ -68,6 +70,9 @@
         struct v3d_value *values[256];
 
         struct v3d_spec *spec;
+
+        int parse_depth;
+        int parse_skip_depth;
 };
 
 const char *
@@ -311,6 +316,8 @@
                 return (struct v3d_type) { .kind = V3D_TYPE_BOOL };
         else if (strcmp(s, "float") == 0)
                 return (struct v3d_type) { .kind = V3D_TYPE_FLOAT };
+        else if (strcmp(s, "f187") == 0)
+                return (struct v3d_type) { .kind = V3D_TYPE_F187 };
         else if (strcmp(s, "address") == 0)
                 return (struct v3d_type) { .kind = V3D_TYPE_ADDRESS };
         else if (strcmp(s, "offset") == 0)
@@ -359,6 +366,9 @@
                 else if (strcmp(atts[i], "default") == 0) {
                         field->has_default = true;
                         field->default_value = strtoul(atts[i + 1], &p, 0);
+                } else if (strcmp(atts[i], "minus_one") == 0) {
+                        assert(strcmp(atts[i + 1], "true") == 0);
+                        field->minus_one = true;
                 }
         }
 
@@ -411,6 +421,25 @@
         return;
 }
 
+static bool /* a bound of 0 leaves that side of the range open */
+ver_in_range(int ver, int min_ver, int max_ver)
+{
+        bool too_old = min_ver != 0 && ver < min_ver;
+        return !too_old && !(max_ver != 0 && ver > max_ver);
+}
+
+static bool
+skip_if_ver_mismatch(struct parser_context *ctx, int min_ver, int max_ver)
+{
+        /* Only start a new skip when not already inside a skipped element. */
+        if (ctx->parse_skip_depth == 0 &&
+            !ver_in_range(ctx->devinfo->ver, min_ver, max_ver)) {
+                assert(ctx->parse_depth != 0);
+                ctx->parse_skip_depth = ctx->parse_depth;
+        }
+        return ctx->parse_skip_depth != 0;
+}
+
 static void
 start_element(void *data, const char *element_name, const char **atts)
 {
@@ -418,20 +447,35 @@
         int i;
         const char *name = NULL;
         const char *ver = NULL;
+        int min_ver = 0;
+        int max_ver = 0;
 
         ctx->loc.line_number = XML_GetCurrentLineNumber(ctx->parser);
 
         for (i = 0; atts[i]; i += 2) {
-                if (strcmp(atts[i], "name") == 0)
+                if (strcmp(atts[i], "shortname") == 0)
+                        name = atts[i + 1];
+                else if (strcmp(atts[i], "name") == 0 && !name)
                         name = atts[i + 1];
                 else if (strcmp(atts[i], "gen") == 0)
                         ver = atts[i + 1];
+                else if (strcmp(atts[i], "min_ver") == 0)
+                        min_ver = strtoul(atts[i + 1], NULL, 0);
+                else if (strcmp(atts[i], "max_ver") == 0)
+                        max_ver = strtoul(atts[i + 1], NULL, 0);
         }
 
+        if (skip_if_ver_mismatch(ctx, min_ver, max_ver))
+                goto skip;
+
         if (strcmp(element_name, "vcxml") == 0) {
                 if (ver == NULL)
                         fail(&ctx->loc, "no ver given");
 
+                /* Make sure that we picked an XML that matched our version.
+                 */
+                assert(ver_in_range(ctx->devinfo->ver, min_ver, max_ver));
+
                 int major, minor;
                 int n = sscanf(ver, "%d.%d", &major, &minor);
                 if (n == 0)
@@ -467,6 +511,15 @@
                 assert(ctx->nvalues < ARRAY_SIZE(ctx->values));
         }
 
+skip:
+        ctx->parse_depth++;
+}
+
+static int /* qsort() comparator: orders field pointers by ascending start */
+field_offset_compare(const void *a, const void *b)
+{
+        const struct v3d_field *lhs = *(const struct v3d_field *const *)a;
+        return lhs->start - (*(const struct v3d_field *const *)b)->start;
 }
 
 static void
@@ -475,6 +528,14 @@
         struct parser_context *ctx = data;
         struct v3d_spec *spec = ctx->spec;
 
+        ctx->parse_depth--;
+
+        if (ctx->parse_skip_depth) {
+                if (ctx->parse_skip_depth == ctx->parse_depth)
+                        ctx->parse_skip_depth = 0;
+                return;
+        }
+
         if (strcmp(name, "packet") == 0 ||
             strcmp(name, "struct") == 0 ||
             strcmp(name, "register") == 0) {
@@ -499,6 +560,13 @@
                 else if (strcmp(name, "register") == 0)
                         spec->registers[spec->nregisters++] = group;
 
+                /* Sort the fields in increasing offset order.  The XML might
+                 * be specified in any order, but we'll want to iterate from
+                 * the bottom.
+                 */
+                qsort(group->fields, group->nfields, sizeof(*group->fields),
+                      field_offset_compare);
+
                 assert(spec->ncommands < ARRAY_SIZE(spec->commands));
                 assert(spec->nstructs < ARRAY_SIZE(spec->structs));
                 assert(spec->nregisters < ARRAY_SIZE(spec->registers));
@@ -586,10 +654,14 @@
         uint32_t text_offset = 0, text_length = 0, total_length;
 
         for (int i = 0; i < ARRAY_SIZE(genxml_files_table); i++) {
-                if (genxml_files_table[i].gen_10 == devinfo->ver) {
+                if (i != 0) {
+                        assert(genxml_files_table[i - 1].gen_10 <
+                               genxml_files_table[i].gen_10);
+                }
+
+                if (genxml_files_table[i].gen_10 <= devinfo->ver) {
                         text_offset = genxml_files_table[i].offset;
                         text_length = genxml_files_table[i].length;
-                        break;
                 }
         }
 
@@ -600,6 +672,7 @@
 
         memset(&ctx, 0, sizeof ctx);
         ctx.parser = XML_ParserCreate(NULL);
+        ctx.devinfo = devinfo;
         XML_SetUserData(ctx.parser, &ctx);
         if (ctx.parser == NULL) {
                 fprintf(stderr, "failed to create parser\n");
@@ -687,14 +760,12 @@
 void
 v3d_field_iterator_init(struct v3d_field_iterator *iter,
                         struct v3d_group *group,
-                        const uint8_t *p,
-                        bool print_colors)
+                        const uint8_t *p)
 {
         memset(iter, 0, sizeof(*iter));
 
         iter->group = group;
         iter->p = p;
-        iter->print_colors = print_colors;
 }
 
 static const char *
@@ -763,7 +834,7 @@
 
         iter->field = iter->group->fields[iter->field_iter++];
         if (iter->field->name)
-                strncpy(iter->name, iter->field->name, sizeof(iter->name));
+                snprintf(iter->name, sizeof(iter->name), "%s", iter->field->name);
         else
                 memset(iter->name, 0, sizeof(iter->name));
         iter->offset = iter_group_offset_bits(iter, iter->group_iter) / 8 +
@@ -774,7 +845,7 @@
 }
 
 bool
-v3d_field_iterator_next(struct v3d_field_iterator *iter)
+v3d_field_iterator_next(struct clif_dump *clif, struct v3d_field_iterator *iter)
 {
         if (!iter_advance_field(iter))
                 return false;
@@ -786,33 +857,64 @@
         int s = group_member_offset + iter->field->start;
         int e = group_member_offset + iter->field->end;
 
+        assert(!iter->field->minus_one ||
+               iter->field->type.kind == V3D_TYPE_INT ||
+               iter->field->type.kind == V3D_TYPE_UINT);
+
         switch (iter->field->type.kind) {
         case V3D_TYPE_UNKNOWN:
         case V3D_TYPE_INT: {
                 uint32_t value = __gen_unpack_sint(iter->p, s, e);
+                if (iter->field->minus_one)
+                        value++;
                 snprintf(iter->value, sizeof(iter->value), "%d", value);
                 enum_name = v3d_get_enum_name(&iter->field->inline_enum, value);
                 break;
         }
         case V3D_TYPE_UINT: {
                 uint32_t value = __gen_unpack_uint(iter->p, s, e);
+                if (iter->field->minus_one) /* stored as N - 1, shown as N */
+                        value++;
+                if (strcmp(iter->field->name, "Vec size") == 0 && value == 0)
+                        value = 1 << (e - s + 1); /* e - s + 1 bits wide; 0 stands for 2^width */
                 snprintf(iter->value, sizeof(iter->value), "%u", value);
                 enum_name = v3d_get_enum_name(&iter->field->inline_enum, value);
                 break;
         }
-        case V3D_TYPE_BOOL: {
-                const char *true_string =
-                        iter->print_colors ? "\e[0;35mtrue\e[0m" : "true";
+        case V3D_TYPE_BOOL:
                 snprintf(iter->value, sizeof(iter->value), "%s",
                          __gen_unpack_uint(iter->p, s, e) ?
-                         true_string : "false");
+                         "1 /* true */" : "0 /* false */");
                 break;
-        }
         case V3D_TYPE_FLOAT:
                 snprintf(iter->value, sizeof(iter->value), "%f",
                          __gen_unpack_float(iter->p, s, e));
                 break;
-        case V3D_TYPE_ADDRESS:
+
+        case V3D_TYPE_F187:
+                snprintf(iter->value, sizeof(iter->value), "%f",
+                         __gen_unpack_f187(iter->p, s, e));
+                break;
+
+        case V3D_TYPE_ADDRESS: {
+                uint32_t addr =
+                        __gen_unpack_uint(iter->p, s, e) << (31 - (e - s));
+                struct clif_bo *bo = clif_lookup_bo(clif, addr);
+                if (bo) {
+                        snprintf(iter->value, sizeof(iter->value),
+                                 "[%s+0x%08x] /* 0x%08x */",
+                                 bo->name, addr - bo->offset, addr);
+                } else if (addr) {
+                        snprintf(iter->value, sizeof(iter->value),
+                                 "/* XXX: BO unknown */ 0x%08x", addr);
+                } else {
+                        snprintf(iter->value, sizeof(iter->value),
+                                 "[null]");
+                }
+
+                break;
+        }
+
         case V3D_TYPE_OFFSET:
                 snprintf(iter->value, sizeof(iter->value), "0x%08"PRIx64,
                          __gen_unpack_uint(iter->p, s, e) << (31 - (e - s)));
@@ -825,14 +927,24 @@
                                              iter->field->type.v3d_struct->name);
                 break;
         case V3D_TYPE_SFIXED:
-                snprintf(iter->value, sizeof(iter->value), "%f",
-                         __gen_unpack_sfixed(iter->p, s, e,
-                                             iter->field->type.f));
+                if (clif->pretty) {
+                        snprintf(iter->value, sizeof(iter->value), "%f",
+                                 __gen_unpack_sfixed(iter->p, s, e,
+                                                     iter->field->type.f));
+                } else {
+                        snprintf(iter->value, sizeof(iter->value), "%u",
+                                 (unsigned)__gen_unpack_uint(iter->p, s, e));
+                }
                 break;
         case V3D_TYPE_UFIXED:
-                snprintf(iter->value, sizeof(iter->value), "%f",
-                         __gen_unpack_ufixed(iter->p, s, e,
-                                             iter->field->type.f));
+                if (clif->pretty) {
+                        snprintf(iter->value, sizeof(iter->value), "%f",
+                                 __gen_unpack_ufixed(iter->p, s, e,
+                                                     iter->field->type.f));
+                } else {
+                        snprintf(iter->value, sizeof(iter->value), "%u",
+                                 (unsigned)__gen_unpack_uint(iter->p, s, e));
+                }
                 break;
         case V3D_TYPE_MBO:
                 break;
@@ -853,26 +965,40 @@
         if (enum_name) {
                 int length = strlen(iter->value);
                 snprintf(iter->value + length, sizeof(iter->value) - length,
-                         " (%s)", enum_name);
+                         " /* %s */", enum_name);
         }
 
         return true;
 }
 
 void
-v3d_print_group(FILE *outfile, struct v3d_group *group,
-                uint64_t offset, const uint8_t *p, bool color)
+v3d_print_group(struct clif_dump *clif, struct v3d_group *group,
+                uint64_t offset, const uint8_t *p)
 {
         struct v3d_field_iterator iter;
 
-        v3d_field_iterator_init(&iter, group, p, color);
-        while (v3d_field_iterator_next(&iter)) {
-                fprintf(outfile, "    %s: %s\n", iter.name, iter.value);
+        v3d_field_iterator_init(&iter, group, p);
+        while (v3d_field_iterator_next(clif, &iter)) {
+                /* Clif parsing uses the packet name, and expects no
+                 * sub-id.
+                 */
+                if (strcmp(iter.field->name, "sub-id") == 0 ||
+                    strcmp(iter.field->name, "unused") == 0 ||
+                    strcmp(iter.field->name, "Pad") == 0)
+                        continue;
+
+                if (clif->pretty) {
+                        fprintf(clif->out, "    %s: %s\n",
+                                iter.name, iter.value);
+                } else {
+                        fprintf(clif->out, "  /* %30s: */ %s\n",
+                                iter.name, iter.value);
+                }
                 if (iter.struct_desc) {
                         uint64_t struct_offset = offset + iter.offset;
-                        v3d_print_group(outfile, iter.struct_desc,
+                        v3d_print_group(clif, iter.struct_desc,
                                         struct_offset,
-                                        &p[iter.offset], color);
+                                        &p[iter.offset]);
                 }
         }
 }
diff --git a/src/broadcom/cle/v3d_decoder.h b/src/broadcom/cle/v3d_decoder.h
index 541d877..b5ead383 100644
--- a/src/broadcom/cle/v3d_decoder.h
+++ b/src/broadcom/cle/v3d_decoder.h
@@ -34,6 +34,7 @@
 struct v3d_spec;
 struct v3d_group;
 struct v3d_field;
+struct clif_dump;
 
 struct v3d_group *v3d_spec_find_struct(struct v3d_spec *spec, const char *name);
 struct v3d_spec *v3d_spec_load(const struct v3d_device_info *devinfo);
@@ -57,7 +58,6 @@
         int group_iter;
 
         struct v3d_field *field;
-        bool print_colors;
 };
 
 struct v3d_group {
@@ -99,6 +99,7 @@
                 V3D_TYPE_UINT,
                 V3D_TYPE_BOOL,
                 V3D_TYPE_FLOAT,
+                V3D_TYPE_F187,
                 V3D_TYPE_ADDRESS,
                 V3D_TYPE_OFFSET,
                 V3D_TYPE_STRUCT,
@@ -125,6 +126,7 @@
         char *name;
         int start, end;
         struct v3d_type type;
+        bool minus_one;
         bool has_default;
         uint32_t default_value;
 
@@ -133,14 +135,13 @@
 
 void v3d_field_iterator_init(struct v3d_field_iterator *iter,
                              struct v3d_group *group,
-                             const uint8_t *p,
-                             bool print_colors);
+                             const uint8_t *p);
 
-bool v3d_field_iterator_next(struct v3d_field_iterator *iter);
+bool v3d_field_iterator_next(struct clif_dump *clif,
+                             struct v3d_field_iterator *iter);
 
-void v3d_print_group(FILE *out,
+void v3d_print_group(struct clif_dump *clif,
                      struct v3d_group *group,
-                     uint64_t offset, const uint8_t *p,
-                     bool color);
+                     uint64_t offset, const uint8_t *p);
 
 #endif /* V3D_DECODER_H */
diff --git a/src/broadcom/cle/v3d_packet_helpers.h b/src/broadcom/cle/v3d_packet_helpers.h
index bc1bf3e..f340b79 100644
--- a/src/broadcom/cle/v3d_packet_helpers.h
+++ b/src/broadcom/cle/v3d_packet_helpers.h
@@ -26,12 +26,15 @@
 #include <stdbool.h>
 #include <assert.h>
 #include <math.h>
+#include <gallium/auxiliary/util/u_math.h>
 
 #ifdef HAVE_VALGRIND
 #include <valgrind.h>
 #include <memcheck.h>
 #define VG(x) x
+#ifndef NDEBUG
 #define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
+#endif
 #else
 #define VG(x)
 #endif
@@ -203,3 +206,11 @@
    return f->f;
 }
 
+static inline float /* decode a 16-bit "f187" field into a float */
+__gen_unpack_f187(const uint8_t *restrict cl, uint32_t start, uint32_t end)
+{
+   assert(end - start == 15); /* start/end are inclusive: exactly 16 bits */
+   uint32_t bits = __gen_unpack_uint(cl, start, end);
+   return uif(bits << 16); /* high half of the word; uif() presumably bit-casts - confirm */
+}
+
diff --git a/src/broadcom/cle/v3d_packet_v21.xml b/src/broadcom/cle/v3d_packet_v21.xml
index 9ca9833..df838a7 100644
--- a/src/broadcom/cle/v3d_packet_v21.xml
+++ b/src/broadcom/cle/v3d_packet_v21.xml
@@ -1,4 +1,4 @@
-<vcxml gen="2.1">
+<vcxml gen="2.1" min_ver="21" max_ver="21">
 
   <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
     <value name="NEVER" value="0"/>
diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
index aac9fbf..f471d54 100644
--- a/src/broadcom/cle/v3d_packet_v33.xml
+++ b/src/broadcom/cle/v3d_packet_v33.xml
@@ -1,4 +1,4 @@
-<vcxml gen="3.3">
+<vcxml gen="3.3" min_ver="33" max_ver="42">
 
   <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
     <value name="NEVER" value="0"/>
@@ -69,7 +69,7 @@
     <value name="TRIANGLE_FAN_TF" value="22"/>
   </enum>
 
-  <enum name="TMU Filter" prefix="V3D_TMU_FILTER">
+  <enum name="TMU Filter" prefix="V3D_TMU_FILTER" max_ver="33">
     <!-- Names are mip filter, min filter, mag filter -->
     <value name="MIN_LIN_MIP_NONE_MAG_LIN" value="0"/>
     <value name="MIN_LIN_MIP_NONE_MAG_NEAR" value="1"/>
@@ -92,6 +92,36 @@
     <value name="ANISOTROPIC_16_1" value="15"/>
   </enum>
 
+  <enum name="Border Color Mode" prefix="V3D_BORDER_COLOR" min_ver="41">
+    <value name="0000" value="0"/>
+    <value name="0001" value="1"/>
+    <value name="1111" value="2"/>
+    <value name="Follows" value="7"/>
+  </enum>
+
+  <enum name="Wrap Mode" prefix="V3D_WRAP_MODE" min_ver="41">
+      <value name="Wrap mode REPEAT" value="0"/>
+      <value name="Wrap mode CLAMP" value="1"/>
+      <value name="Wrap mode MIRROR" value="2"/>
+      <value name="Wrap mode BORDER" value="3"/>
+      <value name="Wrap mode MIRROR_ONCE" value="4"/>
+  </enum>
+
+  <enum name="TMU Op" prefix="V3D_TMU_OP" min_ver="41">
+    <value name="Write ADD, Read Prefetch" value="0"/>
+    <value name="Write SUB, Read Clear" value="1"/>
+    <value name="Write XCHG, Read Flush" value="2"/>
+    <value name="Write CMPXCHG, Read Flush" value="3"/>
+    <value name="Write UMIN, Full L1 Clear" value="4"/>
+    <value name="Write UMAX" value="5"/>
+    <value name="Write SMIN" value="6"/>
+    <value name="Write SMAX" value="7"/>
+    <value name="Write AND, Read INC" value="8"/>
+    <value name="Write OR, Read DEC" value="9"/>
+    <value name="Write XOR, Read NOT" value="10"/>
+    <value name="Regular" value="15"/>
+  </enum>
+
   <enum name="Varying Flags Action" prefix="V3D_VARYING_FLAGS_ACTION">
     <value name="unchanged" value="0"/>
     <value name="zeroed" value="1"/>
@@ -110,7 +140,6 @@
   <enum name="Decimate Mode" prefix="V3D_DECIMATE_MODE">
     <value name="sample 0" value="0"/>
     <value name="4x" value="1"/>
-    <value name="16x" value="2"/>
     <value name="all samples" value="3"/>
   </enum>
 
@@ -138,6 +167,13 @@
     <value name="depth_16" value="2"/>
   </enum>
 
+  <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
+    <value name="none" value="0"/> <!-- no clamping -->
+    <value name="norm" value="1"/> <!-- [0,1] for f16 -->
+    <value name="pos" value="2"/> <!-- [0, +inf) for f16 (upper bound presumed; confirm) -->
+    <value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
+  </enum>
+
   <enum name="Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT">
     <!--
 	Formats appear with their channels named from the low bits to
@@ -180,35 +216,48 @@
     <value name="rgba8ui"  value="34"/>
     <value name="rg8ui"    value="35"/>
     <value name="r8ui"     value="36"/>
-    <value name="srgbx8"   value="37"/>
-    <value name="rgbx8"    value="38"/>
+    <value name="srgbx8"   value="37" max_ver="33"/>
+    <value name="rgbx8"    value="38" max_ver="33"/>
+    <value name="bstc"     value="39" min_ver="41"/>
+    <value name="d32f"     value="40" min_ver="41"/>
+    <value name="d24"      value="41" min_ver="41"/>
+    <value name="d16"      value="42" min_ver="41"/>
+    <value name="d24s8"    value="43" min_ver="41"/>
+    <value name="s8"       value="44" min_ver="41"/>
   </enum>
 
-  <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS">
+  <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
     <value name="depth_component32f" value="0"/>
     <value name="depth_component24" value="1"/> <!-- depth low, pad high -->
     <value name="depth_component16" value="2"/>
     <value name="depth24_stencil8" value="3"/> <!-- stencil low, depth high -->
   </enum>
 
+  <enum name="Dither Mode" prefix="V3D_DITHER_MODE">
+    <value name="None" value="0"/>
+    <value name="RGB" value="1"/>
+    <value name="A" value="2"/>
+    <value name="RGBA" value="3"/>
+  </enum>
+
   <packet code="0" name="Halt"/>
   <packet code="1" name="NOP"/>
   <packet code="4" name="Flush"/>
   <packet code="5" name="Flush All State"/>
   <packet code="6" name="Start Tile Binning"/>
-  <packet code="7" name="Increment Semaphore"/>
-  <packet code="8" name="Wait on Semaphore"/>
-  <packet code="9" name="Wait for previous frame"/>
-  <packet code="10" name="Enable Z-only rendering" cl="R"/>
-  <packet code="11" name="Disable Z-only rendering" cl="R"/>
-  <packet code="12" name="End of Z-only rendering in frame"/>
-  <packet code="13" name="End of rendering"/>
+  <packet code="7" shortname="incr_semaphore" name="Increment Semaphore"/>
+  <packet code="8" shortname="wait_semaphore" name="Wait on Semaphore"/>
+  <packet code="9" shortname="wait_prev_frame" name="Wait for previous frame"/>
+  <packet code="10" shortname="enable_z_only" name="Enable Z-only rendering" cl="R"/>
+  <packet code="11" shortname="disable_z_only" name="Disable Z-only rendering" cl="R"/>
+  <packet code="12" shortname="end_z_only" name="End of Z-only rendering in frame"/>
+  <packet code="13" shortname="end_render" name="End of rendering"/>
 
-  <packet code="14" name="Wait for transform feedback" cl="B">
+  <packet code="14" shortname="wait_transform_feedback" name="Wait for transform feedback" cl="B">
     <field name="Block count" size="8" start="0" type="uint"/>
   </packet>
 
-  <packet code="15" name="Branch to auto-chained sub-list">
+  <packet code="15" shortname="branch_sub_autochain" name="Branch to auto-chained sub-list">
     <field name="address" size="32" start="0" type="address"/>
   </packet>
 
@@ -216,62 +265,69 @@
     <field name="address" size="32" start="0" type="address"/>
   </packet>
 
-  <packet code="17" name="Branch to Sub-list">
+  <packet code="17" shortname="branch_sub" name="Branch to Sub-list">
     <field name="address" size="32" start="0" type="address"/>
   </packet>
 
-  <packet code="18" name="Return from sub-list"/>
-  <packet code="19" name="Flush VCD cache"/>
+  <packet code="18" shortname="return" name="Return from sub-list"/>
+  <packet code="19" shortname="clear_vcd_cache" name="Flush VCD cache"/>
 
-  <packet code="20" name="Start Address of Generic Tile List">
+  <packet code="20" shortname="generic_tile_list" name="Start Address of Generic Tile List">
     <field name="start" size="32" start="0" type="address"/>
     <field name="end" size="32" start="32" type="address"/>
   </packet>
 
-  <packet code="21" name="Branch to Implicit Tile List">
+  <packet code="21" shortname="branch_implicit_tile" name="Branch to Implicit Tile List">
     <field name="tile list set number" size="8" start="0" type="uint"/>
   </packet>
 
-  <packet code="22" name="Branch to Explicit Supertile">
+  <packet code="22" shortname="branch_explicit_supertile" name="Branch to Explicit Supertile">
     <field name="Absolute address of explicit supertile render list" size="32" start="24" type="address"/>
     <field name="explicit supertile number" size="8" start="16" type="uint"/>
     <field name="row number" size="8" start="8" type="uint"/>
     <field name="column number" size="8" start="0" type="uint"/>
   </packet>
 
-  <packet code="23" name="Supertile Coordinates">
+  <packet code="23" shortname="supertile_coords" name="Supertile Coordinates">
     <field name="row number in supertiles" size="8" start="8" type="uint"/>
     <field name="column number in supertiles" size="8" start="0" type="uint"/>
   </packet>
 
-  <packet code="24" name="Store Multi-Sample Resolved Tile Color Buffer" cl="R"/>
+  <packet code="24" shortname="store_subsample" name="Store Multi-Sample Resolved Tile Color Buffer" cl="R" max_ver="33"/>
 
-  <packet code="25" name="Store Multi-Sample Resolved Tile Color Buffer (extended)" cl="R">
+  <packet code="25" shortname="store_subsample_ex" name="Store Multi-Sample Resolved Tile Color Buffer (extended)" cl="R" max_ver="33">
     <field name="Disable Color Buffer write" size="8" start="8" type="uint"/>
     <field name="Enable Z write" size="1" start="7" type="bool"/>
     <field name="Enable Stencil write" size="1" start="6" type="bool"/>
     <!-- bit 5 unused -->
-    <field name="Disable Colour buffer(s) clear on write" size="1" start="4" type="bool"/>
+    <field name="Disable Color buffer(s) clear on write" size="1" start="4" type="bool"/>
     <field name="Disable Stencil buffer clear on write" size="1" start="3" type="bool"/>
     <field name="Disable Z buffer clear on write" size="1" start="2" type="bool"/>
     <field name="Disable fast opportunistic write out in multisample mode" size="1" start="1" type="bool"/>
     <field name="Last Tile of Frame" size="1" start="0" type="bool"/>
   </packet>
 
-  <packet code="26" name="Reload Tile Colour Buffer" cl="R">
-    <field name="Disable Colour Buffer load" size="8" start="8" type="uint"/>
+  <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
+    <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
+    <field name="Clear all Render Targets" size="1" start="0" type="bool"/>
+  </packet>
+
+  <packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
+    <field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
     <field name="Enable Z load" size="1" start="7" type="bool"/>
     <field name="Enable Stencil load" size="1" start="6" type="bool"/>
   </packet>
 
-  <packet code="27" name="End of Tile Marker" cl="R"/>
+  <packet code="26" shortname="end_loads" name="End of Loads" cl="R" min_ver="41"/>
 
-  <packet code="29" name="Store Tile Buffer General" cl="R">
+  <packet code="27" shortname="end_tile" name="End of Tile Marker" cl="R"/>
+
+  <packet code="29" shortname="store_general" name="Store Tile Buffer General" cl="R" max_ver="33">
     <field name="Address" size="24" start="24" type="address"/>
     <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/>
     <field name="XOR UIF" size="1" start="10" type="bool"/>
     <field name="Last Tile of Frame" size="1" start="8" type="bool"/>
-    <field name="Disable Colour buffer(s) clear on write" size="1" start="7" type="bool"/>
+    <field name="Disable Color buffer(s) clear on write" size="1" start="7" type="bool"/>
     <field name="Disable Stencil buffer clear on write" size="1" start="6" type="bool"/>
     <field name="Disable Z buffer clear on write" size="1" start="5" type="bool"/>
     <field name="Raw Mode" size="1" start="4" type="bool"/>
@@ -287,7 +343,40 @@
     </field>
   </packet>
 
-  <packet code="30" name="Load Tile Buffer General" cl="R">
+  <packet code="29" shortname="store" name="Store Tile Buffer General" cl="R" min_ver="41">
+    <field name="Address" size="32" start="64" type="address"/>
+
+    <!-- used for y flip -->
+    <field name="Height" size="16" start="48" type="uint"/>
+
+    <!-- height in ub for UIF, byte stride for raster -->
+    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
+
+    <field name="R/B swap" size="1" start="20" type="bool"/>
+    <field name="Channel Reverse" size="1" start="19" type="bool"/>
+    <field name="Clear buffer being stored" size="1" start="18" type="bool"/>
+    <field name="Output Image Format" size="6" start="12" type="Output Image Format"/>
+
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>
+
+    <field name="Dither Mode" size="2" start="8" type="Dither Mode"/>
+
+    <field name="Flip Y" size="1" start="7" type="bool"/>
+
+    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
+    <field name="Buffer to Store" size="4" start="0" type="uint">
+      <value name="Render target 0" value="0"/>
+      <value name="Render target 1" value="1"/>
+      <value name="Render target 2" value="2"/>
+      <value name="Render target 3" value="3"/>
+      <value name="None" value="8"/>
+      <value name="Z" value="9"/>
+      <value name="Stencil" value="10"/>
+      <value name="Z+Stencil" value="11"/>
+    </field>
+  </packet>
+
+  <packet code="30" shortname="load_general" name="Load Tile Buffer General" cl="R" max_ver="33">
     <field name="Address" size="24" start="24" type="address"/>
     <field name="Padded height of output image in UIF blocks" size="13" start="11" type="uint"/>
     <field name="XOR UIF" size="1" start="10" type="bool"/>
@@ -304,7 +393,40 @@
     </field>
   </packet>
 
-  <packet code="32" name="Indexed Primitive List" cl="B">
+  <packet code="30" shortname="load" name="Load Tile Buffer General" cl="R" min_ver="41">
+    <field name="Address" size="32" start="64" type="address"/>
+
+    <!-- used for y flip -->
+    <field name="Height" size="16" start="48" type="uint"/>
+
+    <!-- height in ub for UIF, byte stride for raster -->
+    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
+
+    <field name="R/B swap" size="1" start="20" type="bool"/>
+    <field name="Channel Reverse" size="1" start="19" type="bool"/>
+
+    <field name="Input Image Format" size="6" start="12" type="Output Image Format"/>
+
+    <field name="Decimate mode" size="2" start="10" type="Decimate Mode"/>
+
+    <field name="Flip Y" size="1" start="7" type="bool"/>
+
+    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
+    <field name="Buffer to Load" size="4" start="0" type="uint">
+      <value name="Render target 0" value="0"/>
+      <value name="Render target 1" value="1"/>
+      <value name="Render target 2" value="2"/>
+      <value name="Render target 3" value="3"/>
+      <value name="None" value="8"/>
+      <value name="Z" value="9"/>
+      <value name="Stencil" value="10"/>
+      <value name="Z+Stencil" value="11"/>
+    </field>
+  </packet>
+
+  <packet code="31" shortname="tf_draw_flush_and_count" name="Transform Feedback Flush and Count"/>
+
+  <packet code="32" name="Indexed Prim List" cl="B" max_ver="33">
     <field name="Minimum index" size="32" start="104" type="uint"/>
     <field name="Enable Primitive Restarts" size="1" start="103" type="bool"/>
     <field name="Maximum index" size="31" start="72" type="uint"/>
@@ -320,7 +442,22 @@
     <field name="mode" size="5" start="0" type="Primitive"/>
   </packet>
 
-  <packet code="34" name="Indexed Instanced Primitive List" cl="B">
+  <packet code="32" name="Indexed Prim List" cl="B" min_ver="41">
+    <field name="Index Offset" size="32" start="40" type="uint"/>
+
+    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
+    <field name="Length" size="31" start="8" type="uint"/>
+
+    <field name="Index type" size="2" start="6" type="uint">
+      <value name="Index type 8-bit" value="0"/>
+      <value name="Index type 16-bit" value="1"/>
+      <value name="Index type 32-bit" value="2"/>
+    </field>
+
+    <field name="mode" size="6" start="0" type="Primitive"/>
+  </packet>
+
+  <packet code="34" name="Indexed Instanced Prim List" cl="B" max_ver="33">
     <field name="Enable Primitive Restarts" size="1" start="135" type="bool"/>
     <field name="Maximum index" size="31" start="104" type="uint"/>
     <field name="Address of Indices List" size="32" start="72" type="address"/>
@@ -336,14 +473,29 @@
     <field name="mode" size="5" start="0" type="Primitive"/>
   </packet>
 
-  <packet code="36" name="Vertex Array Primitives" cl="B">
+  <packet code="34" name="Indexed Instanced Prim List" cl="B" min_ver="41">
+    <field name="Index Offset" size="32" start="72" type="uint"/>
+    <field name="Number of Instances" size="32" start="40" type="uint"/>
+    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
+    <field name="Instance Length" size="31" start="8" type="uint"/>
+
+    <field name="Index type" size="2" start="6" type="uint">
+      <value name="Index type 8-bit" value="0"/>
+      <value name="Index type 16-bit" value="1"/>
+      <value name="Index type 32-bit" value="2"/>
+    </field>
+
+    <field name="mode" size="6" start="0" type="Primitive"/>
+  </packet>
+
+  <packet code="36" name="Vertex Array Prims" cl="B">
     <field name="Index of First Vertex" size="32" start="40" type="uint"/>
     <field name="Length" size="32" start="8" type="uint"/>
 
     <field name="mode" size="8" start="0" type="Primitive"/>
   </packet>
 
-  <packet code="38" name="Vertex Array Instanced Primitives" cl="B">
+  <packet code="38" name="Vertex Array Instanced Prims" cl="B">
     <field name="Index of First Vertex" size="32" start="72" type="uint"/>
     <field name="Number of Instances" size="32" start="40" type="uint"/>
     <field name="Instance Length" size="32" start="8" type="uint"/>
@@ -357,11 +509,13 @@
     <field name="Base Vertex" size="32" start="0" type="uint"/>
   </packet>
 
-  <packet code="56" name="Primitive List Format">
-    <field name="data type" size="1" start="6" type="uint">
-      <value name="List Indexed" value="0"/>
-      <value name="List 32-bit X/Y" value="1"/>
-    </field>
+  <packet code="44" name="Index Buffer Setup" cl="B" min_ver="41">
+    <field name="Address" size="32" start="0" type="address"/>
+    <field name="Size" size="32" start="32" type="uint"/>
+  </packet>
+
+  <packet code="56" name="Prim List Format">
+    <field name="tri strip or fan" size="1" start="7" type="bool"/>
     <field name="primitive type" size="6" start="0" type="uint">
       <value name="List Points" value="0"/>
       <value name="List Lines" value="1"/>
@@ -369,29 +523,57 @@
     </field>
   </packet>
 
-  <packet code="64" name="GL Shader State">
+  <packet code="64" shortname="gl_shader" name="GL Shader State">
     <field name="address" size="27" start="5" type="address"/>
     <field name="number of attribute arrays" size="5" start="0" type="uint"/>
   </packet>
 
-  <packet code="74" name="Transform Feedback Enable">
+  <packet code="71" name="VCM Cache Size" min_ver="41">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="73" name="VCM Cache Size" max_ver="33">
+    <field name="Number of 16-vertex batches for rendering" size="4" start="4" type="uint"/>
+    <field name="Number of 16-vertex batches for binning" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="73" name="Transform Feedback Buffer" min_ver="41">
+    <field name="Buffer Address" size="32" start="32" type="address"/>
+    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
+    <field name="Buffer Number" size="2" start="0" type="uint"/>
+  </packet>
+
+  <packet code="74" name="Transform Feedback Enable" max_ver="33">
     <field name="number of 32-bit Output Buffer Address following" size="3" start="8" type="uint"/>
     <field name="number of 16-bit Output Data Specs following" size="5" start="11" type="uint"/>
   </packet>
 
+  <packet code="74" name="Transform Feedback Specs" min_ver="41">
+    <field name="Enable" size="1" start="7" type="bool"/>
+    <field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/>
+  </packet>
+
   <packet code="75" name="Flush Transform Feedback Data"/>
 
-  <struct name="Transform Feedback Output Data Spec">
+  <struct name="Transform Feedback Output Data Spec" max_ver="33">
     <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
-    <field name="Number of consecutive Vertex Values to output as 32-bit values minus 1" size="4" start="8" type="uint"/>
+    <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/>
     <field name="Output Buffer to write to" size="2" start="12" type="uint"/>
   </struct>
 
+  <struct name="Transform Feedback Output Data Spec" min_ver="41">
+    <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
+    <field name="Number of consecutive Vertex Values to output as 32-bit values" size="4" start="8" type="uint" minus_one="true"/>
+    <field name="Output Buffer to write to" size="2" start="12" type="uint"/>
+    <field name="Stream number" size="2" start="14" type="uint"/>
+  </struct>
+
   <struct name="Transform Feedback Output Address">
     <field name="address" size="32" start="0" type="address"/>
   </struct>
 
-  <packet code="80" name="Stencil Config">
+  <packet code="80" name="Stencil Cfg">
     <field name="Stencil Write Mask" size="8" start="32" type="uint"/>
     <field name="Back Config" size="1" start="29" type="bool"/>
     <field name="Front Config" size="1" start="28" type="bool"/>
@@ -403,36 +585,59 @@
     <field name="Stencil Ref Value" size="8" start="0" type="uint"/>
   </packet>
 
-  <packet code="84" name="Blend Config">
-    <field name="VG Coverage Modes" size="2" start="28" type="uint"/>
-    <field name="Colour blend dst factor" size="4" start="20" type="Blend Factor"/>
-    <field name="Colour blend src factor" size="4" start="16" type="Blend Factor"/>
-    <field name="Colour blend mode" size="4" start="12" type="Blend Mode"/>
+  <packet code="83" name="Blend Enables" min_ver="41">
+    <field name="Mask" size="8" start="0" type="uint"/>
+  </packet>
+
+  <packet code="84" name="Blend Cfg" max_ver="33">
+    <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+    <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+    <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
     <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
     <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
     <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
   </packet>
 
-  <packet code="86" name="Blend Constant Colour">
+  <packet code="84" name="Blend Cfg" min_ver="41">
+    <field name="Render Target Mask" size="4" start="24" type="uint"/>
+    <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+    <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+    <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
+    <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
+    <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
+    <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+  </packet>
+
+  <packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
     <field name="Alpha (F16)" size="16" start="48" type="uint"/>
     <field name="Blue (F16)" size="16" start="32" type="uint"/>
     <field name="Green (F16)" size="16" start="16" type="uint"/>
     <field name="Red (F16)" size="16" start="0" type="uint"/>
   </packet>
 
-  <packet code="87" name="Colour Write Masks">
-    <field name="Reserved" size="16" start="16" type="uint"/>
-    <field name="Render Target 3 per colour component write masks" size="4" start="12" type="uint"/>
-    <field name="Render Target 2 per colour component write masks" size="4" start="8" type="uint"/>
-    <field name="Render Target 1 per colour component write masks" size="4" start="4" type="uint"/>
-    <field name="Render Target 0 per colour component write masks" size="4" start="0" type="uint"/>
+  <packet code="87" shortname="color_wmasks" name="Color Write Masks">
+    <field name="Mask" size="32" start="0" type="uint"/>
   </packet>
 
-  <packet code="92" name="Occlusion Query Counter">
+  <packet code="88" name="Zero All Centroid Flags" min_ver="41"/>
+
+  <packet code="89" name="Centroid Flags" min_ver="41">
+    <field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/>
+    <field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
+    <field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
+    <field name="Varying offset V0" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="91" name="Sample State" min_ver="41">
+    <field name="Coverage" size="16" start="16" type="f187"/>
+    <field name="Mask" size="4" start="0" type="uint"/>
+  </packet>
+
+  <packet code="92" shortname="occlusion_query_counter_enable" name="Occlusion Query Counter">
     <field name="address" size="32" start="0" type="address"/>
   </packet>
 
-  <packet code="96" name="Configuration Bits">
+  <packet code="96" name="Cfg Bits">
     <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
     <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
     <field name="Blend enable" size="1" start="19" type="bool"/>
@@ -442,8 +647,6 @@
     <field name="Z updates enable" size="1" start="15" type="bool"/>
     <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
     <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
-    <field name="Coverage Update Mode" size="2" start="9" type="uint"/>
-    <field name="Coverage Pipe Select" size="1" start="8" type="bool"/>
     <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
     <field name="Line Rasterization" size="2" start="4" type="uint"/>
     <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
@@ -452,15 +655,24 @@
     <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
   </packet>
 
-  <packet code="97" name="Zero All Flat Shade Flags"/>
+  <packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
 
-  <packet code="98" name="Flat Shade Flags">
+  <packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
     <field name="Flat Shade Flags for varyings V0*24" size="24" start="8" type="uint"/>
     <field name="Action for Flat Shade Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
     <field name="Action for Flat Shade Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
     <field name="Varying offset V0" size="4" start="0" type="uint"/>
   </packet>
 
+  <packet code="99" shortname="zero_all_noperspective_flags" name="Zero All Non-perspective Flags" min_ver="41"/>
+
+  <packet code="100" shortname="noperspective_flags" name="Non-perspective Flags" min_ver="41">
+    <field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/>
+    <field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
+    <field name="Action for Non-perspective Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
+    <field name="Varying offset V0" size="4" start="0" type="uint"/>
+  </packet>
+
   <packet code="104" name="Point size">
     <field name="Point Size" size="32" start="0" type="float"/>
   </packet>
@@ -469,40 +681,56 @@
     <field name="Line width" size="32" start="0" type="float"/>
   </packet>
 
-  <packet name="Depth Offset" code="106">
-    <!-- these fields are both float-1-8-7 encoded (top 16 bits of a float32) -->
-    <field name="Depth Offset Units" size="16" start="16" type="uint"/>
-    <field name="Depth Offset Factor" size="16" start="0" type="uint"/>
+  <packet name="Depth Offset" code="106" max_ver="33">
+    <field name="Depth Offset Units" size="16" start="16" type="f187"/>
+    <field name="Depth Offset Factor" size="16" start="0" type="f187"/>
   </packet>
 
-  <packet name="Clip Window" code="107">
+  <packet name="Depth Offset" code="106" min_ver="41">
+    <field name="Limit" size="32" start="32" type="float"/>
+    <field name="Depth Offset Units" size="16" start="16" type="f187"/>
+    <field name="Depth Offset Factor" size="16" start="0" type="f187"/>
+  </packet>
+
+  <packet shortname="clip" name="clip_window" code="107">
     <field name="Clip Window Height in pixels" size="16" start="48" type="uint"/>
     <field name="Clip Window Width in pixels" size="16" start="32" type="uint"/>
     <field name="Clip Window Bottom Pixel Coordinate" size="16" start="16" type="uint"/>
     <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/>
   </packet>
 
-  <packet name="Viewport Offset" code="108">
+  <packet name="Viewport Offset" code="108" max_ver="33">
     <field name="Viewport Centre Y-coordinate" size="32" start="32" type="s24.8"/>
     <field name="Viewport Centre X-coordinate" size="32" start="0" type="s24.8"/>
   </packet>
 
-  <packet name="Clipper Z min/max clipping planes" code="109">
+  <packet name="Viewport Offset" code="108" min_ver="41">
+    <field name="Coarse Y" size="10" start="54" type="uint"/>
+    <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/>
+    <field name="Coarse X" size="10" start="22" type="uint"/>
+    <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/>
+  </packet>
+
+  <packet shortname="clipz" name="Clipper Z min/max clipping planes" code="109">
     <field name="Maximum Zw" size="32" start="32" type="float"/>
     <field name="Minimum Zw" size="32" start="0" type="float"/>
   </packet>
 
-  <packet name="Clipper XY Scaling" code="110" cl="B">
+  <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
     <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
     <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
   </packet>
 
-  <packet name="Clipper Z Scale and Offset" code="111" cl="B">
+  <packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
     <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
     <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
   </packet>
 
-  <packet code="120" name="Tile Binning Mode Configuration (Part1)">
+  <packet name="Number of Layers" code="119" min_ver="41">
+    <field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
+  </packet>
+
+  <packet code="120" name="Tile Binning Mode Cfg (Part1)" max_ver="33">
     <field name="Double-buffer in non-ms mode" size="1" start="63" type="bool"/>
     <field name="Multisample Mode (4x)" size="1" start="62" type="bool"/>
 
@@ -528,14 +756,42 @@
     <field name="sub-id" size="1" start="0" type="uint" default="0"/>
   </packet>
 
-  <packet code="120" name="Tile Binning Mode Configuration (Part2)" cl="B">
+  <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
+
+    <field name="Height (in pixels)" size="12" start="48" type="uint" minus_one="true"/>
+    <field name="Width (in pixels)" size="12" start="32" type="uint" minus_one="true"/>
+
+    <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/>
+    <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/>
+
+    <field name="Maximum BPP of all render targets" size="2" start="12" type="uint">
+      <value name="Render target maximum 32bpp" value="0"/>
+      <value name="Render target maximum 64bpp" value="1"/>
+      <value name="Render target maximum 128bpp" value="2"/>
+    </field>
+
+    <field name="Number of Render Targets" size="4" start="8" type="uint" minus_one="true"/>
+
+    <field name="tile allocation block size" size="2" start="4" type="uint">
+      <value name="tile allocation block size 64b" value="0"/>
+      <value name="tile allocation block size 128b" value="1"/>
+      <value name="tile allocation block size 256b" value="2"/>
+    </field>
+    <field name="tile allocation initial block size" size="2" start="2" type="uint">
+      <value name="tile allocation initial block size 64b" value="0"/>
+      <value name="tile allocation initial block size 128b" value="1"/>
+      <value name="tile allocation initial block size 256b" value="2"/>
+    </field>
+  </packet>
+
+  <packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
     <field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
     <field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
 
     <field name="sub-id" size="1" start="0" type="uint" default="1"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Common Configuration)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" max_ver="33">
     <field name="Disable Render Target Stores" size="8" start="56" type="uint"/>
     <field name="Enable Z Store" size="1" start="55" type="bool"/>
     <field name="Enable Stencil Store" size="1" start="54" type="bool"/>
@@ -547,7 +803,6 @@
       <value name="Early-Z direction GT/GE" value="1"/>
     </field>
 
-    <field name="Select Coverage Mode" size="1" start="44" type="bool"/>
     <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
     <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
 
@@ -559,12 +814,37 @@
 
     <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
     <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
-    <field name="Number of Render Targets Minus 1" size="4" start="4" type="uint"/>
+    <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
 
     <field name="sub-id" size="4" start="0" type="uint" default="0"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Render Target config)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
+    <field name="Pad" size="12" start="52" type="uint"/>
+
+    <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+    <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
+
+    <field name="Early-Z disable" size="1" start="46" type="bool"/>
+
+    <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
+      <value name="Early-Z direction LT/LE" value="0"/>
+      <value name="Early-Z direction GT/GE" value="1"/>
+    </field>
+
+    <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+    <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+
+    <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
+
+    <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+    <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+    <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
+
+    <field name="sub-id" size="4" start="0" type="uint" default="0"/>
+  </packet>
+
+  <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
     <field name="Address" size="32" start="32" type="address"/>
 
     <field name="Pad" size="4" start="28" type="uint"/>
@@ -573,8 +853,7 @@
 
     <field name="Memory Format" size="3" start="24" type="Memory Format"/>
 
-    <field name="A dithered" size="1" start="23" type="bool"/>
-    <field name="BGR dithered" size="1" start="22" type="bool"/>
+    <field name="Dither Mode" size="2" start="22" type="Dither Mode"/>
 
     <field name="Output image format" size="6" start="16" type="Output Image Format"/>
 
@@ -586,7 +865,30 @@
     <field name="sub-id" size="4" start="0" type="uint" default="2"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Z/Stencil config)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
+    <!-- Each render target takes 8 bits from bit 4 up: Internal BPP (2), Internal Type (4), Clamp (2). -->
+    <field name="Pad" size="28" start="36" type="uint"/>
+
+    <field name="Render Target 3 Clamp" size="2" start="34" type="Render Target Clamp"/>
+    <field name="Render Target 3 Internal Type" size="4" start="30" type="Internal Type"/>
+    <field name="Render Target 3 Internal BPP" size="2" start="28" type="Internal BPP"/>
+
+    <field name="Render Target 2 Clamp" size="2" start="26" type="Render Target Clamp"/>
+    <field name="Render Target 2 Internal Type" size="4" start="22" type="Internal Type"/>
+    <field name="Render Target 2 Internal BPP" size="2" start="20" type="Internal BPP"/>
+
+    <field name="Render Target 1 Clamp" size="2" start="18" type="Render Target Clamp"/>
+    <field name="Render Target 1 Internal Type" size="4" start="14" type="Internal Type"/>
+    <field name="Render Target 1 Internal BPP" size="2" start="12" type="Internal BPP"/>
+
+    <field name="Render Target 0 Clamp" size="2" start="10" type="Render Target Clamp"/>
+    <field name="Render Target 0 Internal Type" size="4" start="6" type="Internal Type"/>
+    <field name="Render Target 0 Internal BPP" size="2" start="4" type="Internal BPP"/>
+
+    <field name="sub-id" size="4" start="0" type="uint" default="1"/>
+  </packet>
+
+  <packet code="121" name="Tile Rendering Mode Cfg (Z/Stencil)" cl="R" max_ver="33">
     <field name="Address" size="26" start="38" type="address"/>
 
     <field name="Padded height of output image in UIF blocks" size="13" start="25" type="uint"/>
@@ -605,16 +907,25 @@
     <field name="sub-id" size="4" start="0" type="uint" default="1"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Z Stencil Clear Values)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" max_ver="33">
     <field name="unused" size="16" start="48" type="uint"/>
 
     <field name="Z Clear Value" size="32" start="16" type="float"/>
 
-    <field name="Stencil/VG Mask Clear Value" size="8" start="8" type="uint"/>
+    <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
     <field name="sub-id" size="4" start="0" type="uint" default="3"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part1)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
+    <field name="unused" size="16" start="48" type="uint"/>
+    <!-- Same field layout as the max_ver 33 definition; only the sub-id default differs (2 here, 3 there). -->
+    <field name="Z Clear Value" size="32" start="16" type="float"/>
+
+    <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
+    <field name="sub-id" size="4" start="0" type="uint" default="2"/>
+  </packet>
+
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
     <!-- Express this as a 56-bit field? -->
     <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
     <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
@@ -623,7 +934,16 @@
     <field name="sub-id" size="4" start="0" type="uint" default="4"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part2)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
+    <!-- Express this as a 56-bit field? -->
+    <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
+    <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
+    <!-- The two clear-color fields are contiguous (bits 8-63). sub-id defaults to 3 here versus 4 in the max_ver 33 packet. -->
+    <field name="Render Target number" size="4" start="4" type="uint"/>
+    <field name="sub-id" size="4" start="0" type="uint" default="3"/>
+  </packet>
+
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
     <!-- Express this as a 56-bit field? -->
     <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
     <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
@@ -632,7 +952,16 @@
     <field name="sub-id" size="4" start="0" type="uint" default="5"/>
   </packet>
 
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part3)" cl="R">
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
+    <!-- Express this as a 56-bit field? -->
+    <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
+    <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
+    <!-- The two clear-color fields are contiguous (bits 8-63). sub-id defaults to 4 here versus 5 in the max_ver 33 packet. -->
+    <field name="Render Target number" size="4" start="4" type="uint"/>
+    <field name="sub-id" size="4" start="0" type="uint" default="4"/>
+  </packet>
+
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
     <field name="pad" size="11" start="53" type="uint"/>
     <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
     <!-- image height is for Y flipping -->
@@ -643,12 +972,24 @@
     <field name="sub-id" size="4" start="0" type="uint" default="6"/>
   </packet>
 
-  <packet code="124" name="Tile Coordinates">
+  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
+    <field name="pad" size="11" start="53" type="uint"/>
+    <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
+    <!-- image height is for Y flipping -->
+    <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/>
+    <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/>
+    <!-- sub-id defaults to 5 here versus 6 in the max_ver 33 packet of the same name. -->
+    <field name="Render Target number" size="4" start="4" type="uint"/>
+    <field name="sub-id" size="4" start="0" type="uint" default="5"/>
+  </packet>
+
+  <packet code="124" shortname="tile_coords" name="Tile Coordinates">
     <field name="tile row number" size="12" start="12" type="uint"/>
     <field name="tile column number" size="12" start="0" type="uint"/>
   </packet>
 
-  <packet code="122" name="Multicore Rendering Supertile Configuration" cl="R">
+  <packet code="122" name="Multicore Rendering Supertile Cfg" cl="R">
+    <field name="Number of Bin Tile Lists" size="3" start="61" type="uint" minus_one="true"/>
     <field name="Supertile Raster Order" size="1" start="60" type="bool"/>
     <field name="Multicore Enable" size="1" start="56" type="bool"/>
 
@@ -658,17 +999,17 @@
     <field name="Total Frame Height in Supertiles" size="8" start="24" type="uint"/>
     <field name="Total Frame Width in Supertiles" size="8" start="16" type="uint"/>
 
-    <field name="Supertile Height in Tiles minus 1" size="8" start="8" type="uint"/>
-    <field name="Supertile Width in Tiles minus 1" size="8" start="0" type="uint"/>
+    <field name="Supertile Height in Tiles" size="8" start="8" type="uint" minus_one="true"/>
+    <field name="Supertile Width in Tiles" size="8" start="0" type="uint" minus_one="true"/>
   </packet>
 
-  <packet code="123" name="Multicore Rendering Tile List Set Base" cl="R">
+  <packet code="123" shortname="multicore_rendering_tile_list_base" name="Multicore Rendering Tile List Set Base" cl="R">
     <field name="address" size="26" start="6" type="address"/>
     <field name="Tile List Set Number" size="4" start="0" type="uint"/>
   </packet>
 
   <!-- add fields -->
-  <packet code="125" name="Tile Coordinates Implicit"/>
+  <packet code="125" shortname="implicit_tile_coords" name="Tile Coordinates Implicit"/>
 
   <packet code="126" name="Tile List Initial Block Size">
     <field name="Use auto-chained tile lists" size="1" start="2" type="bool"/>
@@ -680,7 +1021,7 @@
     </field>
   </packet>
 
-  <struct name="GL Shader State Record">
+  <struct name="GL Shader State Record" max_ver="33">
     <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
     <field name="Enable clipping" size="1" start="1" type="bool"/>
     <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
@@ -702,19 +1043,102 @@
     <field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
     <field name="Fragment Shader 2-way threadable" size="1" start="96" type="bool"/>
     <field name="Fragment Shader 4-way threadable" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
+    <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
     <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
     <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
     <field name="Vertex Shader 2-way threadable" size="1" start="160" type="bool"/>
     <field name="Vertex Shader 4-way threadable" size="1" start="161" type="bool"/>
+    <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
     <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
     <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
     <field name="Coordinate Shader 2-way threadable" size="1" start="224" type="bool"/>
     <field name="Coordinate Shader 4-way threadable" size="1" start="225" type="bool"/>
+    <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
     <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
   </struct>
 
-  <struct name="GL Shader State Attribute Record">
+  <struct name="GL Shader State Record" min_ver="41">
+    <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+    <field name="Enable clipping" size="1" start="1" type="bool"/>
+
+    <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
+    <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
+    <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
+    <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
+    <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
+    <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
+
+    <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
+    <field name="Turn off early-z test" size="1" start="9" type="bool"/>
+    <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="10" type="bool"/>
+    <field name="Vertex shader has separate input and output VPM blocks" size="1" start="11" type="bool"/>
+    <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
+    <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
+    <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
+    <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
+    <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
+    <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
+    <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
+    <field name="No prim pack" size="1" start="19" type="bool"/>
+    <!-- NOTE(review): start values with a "b" suffix look like byte offsets (4b followed by start 36 fits bit 32); confirm against the header generator. -->
+    <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
+
+    <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
+    <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
+
+    <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
+    <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
+
+    <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
+    <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
+
+    <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
+    <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
+
+    <field name="Address of default attribute values" size="32" start="8b" type="address"/>
+    <!-- Each stage below declares a 29-bit code address preceded by three 1-bit flags (e.g. flags at 96-98, address from 99). -->
+    <field name="Fragment Shader Code Address" size="29" start="99" type="address"/>
+    <field name="Fragment Shader 4-way threadable" size="1" start="96" type="bool"/>
+    <field name="Fragment Shader start in final thread section" size="1" start="97" type="bool"/>
+    <field name="Fragment Shader Propagate NaNs" size="1" start="98" type="bool"/>
+    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
+
+    <field name="Vertex Shader Code Address" size="29" start="163" type="address"/>
+    <field name="Vertex Shader 4-way threadable" size="1" start="160" type="bool"/>
+    <field name="Vertex Shader start in final thread section" size="1" start="161" type="bool"/>
+    <field name="Vertex Shader Propagate NaNs" size="1" start="162" type="bool"/>
+    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
+
+    <field name="Coordinate Shader Code Address" size="29" start="227" type="address"/>
+    <field name="Coordinate Shader 4-way threadable" size="1" start="224" type="bool"/>
+    <field name="Coordinate Shader start in final thread section" size="1" start="225" type="bool"/>
+    <field name="Coordinate Shader Propagate NaNs" size="1" start="226" type="bool"/>
+    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
+  </struct>
+
+  <struct name="Geometry Shader State Record" min_ver="41"> <!-- four 32-bit addresses at 0b, 4b, 8b and 12b plus three flag bits -->
+    <field name="Geometry Bin Mode Shader Code Address" size="32" start="0b" type="address"/> <!-- NOTE(review): declared as a full 32 bits although the flags below use bits 0-2; GL Shader State Record declares 29 bits at an offset of 3 instead. Confirm which is intended. -->
+    <field name="4-way threadable" size="1" start="0" type="bool"/>
+    <field name="Start in final thread section" size="1" start="1" type="bool"/>
+    <field name="Propagate NaNs" size="1" start="2" type="bool"/>
+    <field name="Geometry Bin Mode Shader Uniforms Address" size="32" start="4b" type="address"/>
+    <field name="Geometry Render Mode Shader Code Address" size="32" start="8b" type="address"/>
+    <field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/>
+  </struct>
+
+  <struct name="Tessellation Shader State Record" min_ver="41"> <!-- eight 32-bit addresses at 0b through 28b; no flag bits are declared -->
+    <field name="Tessellation Bin Mode Control Shader Code Address" size="32" start="0b" type="address"/>
+    <field name="Tessellation Bin Mode Control Shader Uniforms Address" size="32" start="4b" type="address"/>
+    <field name="Tessellation Render Mode Control Shader Code Address" size="32" start="8b" type="address"/>
+    <field name="Tessellation Render Mode Control Shader Uniforms Address" size="32" start="12b" type="address"/>
+    <!-- Control-shader code/uniform pairs come first; the evaluation-shader pairs follow from 16b. -->
+    <field name="Tessellation Bin Mode Evaluation Shader Code Address" size="32" start="16b" type="address"/>
+    <field name="Tessellation Bin Mode Evaluation Shader Uniforms Address" size="32" start="20b" type="address"/>
+    <field name="Tessellation Render Mode Evaluation Shader Code Address" size="32" start="24b" type="address"/>
+    <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
+  </struct>
+
+  <struct name="GL Shader State Attribute Record" max_ver="33">
     <field name="Address" size="32" start="0" type="address"/>
 
     <field name="Vec size" size="2" start="32" type="uint"/>
@@ -738,6 +1162,31 @@
     <field name="Stride" size="32" start="8b" type="uint"/>
   </struct>
 
+  <struct name="GL Shader State Attribute Record" min_ver="41">
+    <field name="Address" size="32" start="0" type="address"/>
+    <!-- Type is a 3-bit field; the list below names values 1 through 7 and leaves 0 unnamed. -->
+    <field name="Vec size" size="2" start="32" type="uint"/>
+    <field name="Type" size="3" start="34" type="uint">
+      <value name="Attribute half-float" value="1"/>
+      <value name="Attribute float" value="2"/>
+      <value name="Attribute fixed" value="3"/>
+      <value name="Attribute byte" value="4"/>
+      <value name="Attribute short" value="5"/>
+      <value name="Attribute int" value="6"/>
+      <value name="Attribute int2_10_10_10" value="7"/>
+    </field>
+    <field name="Signed int type" size="1" start="37" type="bool"/>
+    <field name="Normalized int type" size="1" start="38" type="bool"/>
+    <field name="Read as int/uint" size="1" start="39" type="bool"/>
+
+    <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/>
+    <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/>
+    <!-- The record ends with a 32-bit Maximum Index at 12b, after Instance Divisor (6b) and Stride (8b). -->
+    <field name="Instance Divisor" size="16" start="6b" type="uint"/>
+    <field name="Stride" size="32" start="8b" type="uint"/>
+    <field name="Maximum Index" size="32" start="12b" type="uint"/>
+  </struct>
+
   <struct name="VPM generic block write setup">
     <field name="id" size="2" start="30" type="uint" default="0"/>
     <field name="id0" size="3" start="27" type="uint" default="0"/>
@@ -774,7 +1223,7 @@
     <field name="addr" size="13" start="0" type="uint"/>
   </struct>
 
-  <struct name="Texture Uniform Parameter 0 CFG_MODE=1">
+  <struct name="Texture Uniform Parameter 0 CFG_MODE=1" max_ver="33">
     <field name="Per-pixel mask enable" size="1" start="31" type="bool"/>
 
     <field name="Texel offset for r coordinate" size="4" start="27" type="int"/>
@@ -825,12 +1274,53 @@
     </field>
   </struct>
 
-  <struct name="Texture Uniform Parameter 1 CFG_MODE=1">
+  <struct name="Texture Uniform Parameter 1 CFG_MODE=1" max_ver="33">
     <field name="Texture state record base address" size="28" start="4" type="address"/>
     <field name="Return words of texture data" size="4" start="0" type="uint"/>
   </struct>
 
-  <struct name="Texture Shader State">
+  <struct name="TMU Config Parameter 0" min_ver="41"> <!-- both fields start at bit 0 of the same 32-bit word -->
+    <field name="Texture state address" size="32" start="0" type="address"/>
+    <field name="Return words of texture data" size="4" start="0" type="uint"/> <!-- NOTE(review): overlaps address bits 0-3; presumably the address is aligned so those bits are free. Confirm. -->
+  </struct>
+
+  <struct name="TMU Config Parameter 1" min_ver="41"> <!-- three flag bits share the 32-bit word declared for the address -->
+    <field name="Sampler state address" size="32" start="0" type="address"/> <!-- NOTE(review): bits 0-2 are also used by the flags below; presumably the address is aligned. Confirm. -->
+    <field name="Per-pixel mask enable" size="1" start="2" type="bool"/>
+    <field name="Unnormalized coordinates" size="1" start="1" type="bool"/>
+    <field name="Output Type 32-bit" size="1" start="0" type="bool"/>
+  </struct>
+
+  <struct name="TMU Config Parameter 2" min_ver="41" max_ver="41"> <!-- one 32-bit word; bits 0-23 carry the fields from Offset Format 8 up to Op -->
+    <field name="Pad" size="8" start="24" type="uint"/> <!-- bits 24-31, the only bits no other field uses -->
+    <field name="Op" size="4" start="20" type="TMU Op"/>
+    <field name="Offset R" size="4" start="16" type="int"/>
+    <field name="Offset T" size="4" start="12" type="int"/>
+    <field name="Offset S" size="4" start="8" type="int"/>
+    <field name="Gather Mode" size="1" start="7" type="bool"/>
+    <field name="Gather Component" size="2" start="5" type="uint"/>
+    <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+    <field name="Sample Number" size="2" start="2" type="uint"/>
+    <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+    <field name="Offset Format 8" size="1" start="0" type="bool"/>
+  </struct>
+
+  <struct name="TMU Config Parameter 2" min_ver="42"> <!-- same word as the 4.1 layout, with LOD Query taking the first bit above Op -->
+    <field name="Pad" size="7" start="25" type="uint"/> <!-- bits 25-31 -->
+    <field name="LOD Query" size="1" start="24" type="bool"/> <!-- bit 24; bits 8-23 belong to the offsets and Op -->
+    <field name="Op" size="4" start="20" type="TMU Op"/>
+    <field name="Offset R" size="4" start="16" type="int"/>
+    <field name="Offset T" size="4" start="12" type="int"/>
+    <field name="Offset S" size="4" start="8" type="int"/>
+    <field name="Gather Mode" size="1" start="7" type="bool"/>
+    <field name="Gather Component" size="2" start="5" type="uint"/>
+    <field name="Coefficient Mode" size="1" start="4" type="bool"/>
+    <field name="Sample Number" size="2" start="2" type="uint"/>
+    <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
+    <field name="Offset Format 8" size="1" start="0" type="bool"/>
+  </struct>
+
+  <struct name="Texture Shader State" max_ver="33">
     <field name="UIF XOR disable" size="1" start="255" type="bool"/>
     <field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
     <field name="Level 0 XOR enable" size="1" start="252" type="bool"/>
@@ -883,6 +1373,74 @@
     <field name="Filter" size="4" start="0" type="TMU Filter"/>
   </struct>
 
+  <struct name="Texture Shader State" min_ver="41">
+    <field name="Pad" size="56" start="136" type="uint"/> <!-- covers bits 136-191 -->
+    <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+    <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+    <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
+    <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
+
+    <field name="Base Level" size="4" start="124" type="uint"/>
+    <field name="Max Level" size="4" start="120" type="uint"/>
+
+    <field name="Swizzle A" size="3" start="117" type="uint">
+      <value name="Swizzle Zero" value="0"/>
+      <value name="Swizzle One" value="1"/>
+      <value name="Swizzle Red" value="2"/>
+      <value name="Swizzle Green" value="3"/>
+      <value name="Swizzle Blue" value="4"/>
+      <value name="Swizzle Alpha" value="5"/>
+    </field>
+    <!-- Swizzle B, G and R are plain 3-bit uints; only Swizzle A lists named values. -->
+    <field name="Swizzle B" size="3" start="114" type="uint"/>
+    <field name="Swizzle G" size="3" start="111" type="uint"/>
+    <field name="Swizzle R" size="3" start="108" type="uint"/>
+    <field name="Extended" size="1" start="107" type="bool"/>
+
+    <field name="Texture type" size="7" start="100" type="uint"/>
+    <field name="Image Depth" size="14" start="86" type="uint"/>
+    <field name="Image Height" size="14" start="72" type="uint"/>
+    <field name="Image Width" size="14" start="58" type="uint"/>
+
+    <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/>
+
+    <field name="Texture base pointer" size="32" start="0" type="address"/>
+    <!-- The six flags below use bits 0-5, inside the span declared for Texture base pointer. NOTE(review): presumably the pointer is aligned so these bits are free; confirm. -->
+    <field name="Reverse Standard Border Color" size="1" start="5" type="bool"/>
+    <field name="AHDR" size="1" start="4" type="bool"/>
+    <field name="sRGB" size="1" start="3" type="bool"/>
+    <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/>
+    <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
+    <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+  </struct>
+
+  <struct name="Sampler State" min_ver="41"> <!-- fields are listed from the highest bit (border colors, up to bit 191) down to bit 0 -->
+    <field name="Border color Alpha" size="32" start="160" type="uint"/>
+    <field name="Border color Blue" size="32" start="128" type="uint"/>
+    <field name="Border color Green" size="32" start="96" type="uint"/>
+    <field name="Border color Red" size="32" start="64" type="uint"/>
+    <!-- No field covers bit 63; Maximum Anisotropy below ends at bit 62. -->
+    <field name="Maximum Anisotropy" size="2" start="61" type="uint"/>
+    <field name="Border Color Mode" size="3" start="58" type="Border Color Mode"/>
+    <field name="Wrap I Border" size="1" start="57" type="bool"/>
+    <field name="Wrap R" size="3" start="54" type="Wrap Mode"/>
+    <field name="Wrap T" size="3" start="51" type="Wrap Mode"/>
+    <field name="Wrap S" size="3" start="48" type="Wrap Mode"/>
+    <!-- NOTE(review): s8.8 and u4.8 look like fixed-point type names; confirm how the header generator packs them. -->
+    <field name="Fixed Bias" size="16" start="32" type="s8.8"/>
+    <field name="Max Level-of-Detail" size="12" start="20" type="u4.8"/>
+    <field name="Min Level-of-Detail" size="12" start="8" type="u4.8"/>
+
+    <field name="sRGB Disable" size="1" start="7" type="bool"/>
+
+    <field name="Depth Compare Function" size="3" start="4" type="Compare Function"/>
+
+    <field name="Anisotropy Enable" size="1" start="3" type="bool"/>
+    <field name="Mip filter Nearest" size="1" start="2" type="bool"/>
+    <field name="Min filter Nearest" size="1" start="1" type="bool"/>
+    <field name="Mag filter Nearest" size="1" start="0" type="bool"/>
+  </struct>
+
   <enum name="Texture Data Formats">
     <!--
 	most formats here have R in the low bits, A in the high bits.
diff --git a/src/broadcom/cle/v3d_packet_v41.xml b/src/broadcom/cle/v3d_packet_v41.xml
deleted file mode 100644
index 32934d7..0000000
--- a/src/broadcom/cle/v3d_packet_v41.xml
+++ /dev/null
@@ -1,1050 +0,0 @@
-<vcxml gen="4.1">
-
-  <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
-    <value name="NEVER" value="0"/>
-    <value name="LESS" value="1"/>
-    <value name="EQUAL" value="2"/>
-    <value name="LEQUAL" value="3"/>
-    <value name="GREATER" value="4"/>
-    <value name="NOTEQUAL" value="5"/>
-    <value name="GEQUAL" value="6"/>
-    <value name="ALWAYS" value="7"/>
-  </enum>
-
-  <enum name="Blend Factor" prefix="V3D_BLEND_FACTOR">
-    <value name="ZERO" value="0"/>
-    <value name="ONE" value="1"/>
-    <value name="SRC_COLOR" value="2"/>
-    <value name="INV_SRC_COLOR" value="3"/>
-    <value name="DST_COLOR" value="4"/>
-    <value name="INV_DST_COLOR" value="5"/>
-    <value name="SRC_ALPHA" value="6"/>
-    <value name="INV_SRC_ALPHA" value="7"/>
-    <value name="DST_ALPHA" value="8"/>
-    <value name="INV_DST_ALPHA" value="9"/>
-    <value name="CONST_COLOR" value="10"/>
-    <value name="INV_CONST_COLOR" value="11"/>
-    <value name="CONST_ALPHA" value="12"/>
-    <value name="INV_CONST_ALPHA" value="13"/>
-    <value name="SRC_ALPHA_SATURATE" value="14"/>
-  </enum>
-
-  <enum name="Blend Mode" prefix="V3D_BLEND_MODE">
-    <value name="ADD" value="0"/>
-    <value name="SUB" value="1"/>
-    <value name="RSUB" value="2"/>
-    <value name="MIN" value="3"/>
-    <value name="MAX" value="4"/>
-    <value name="MUL" value="5"/>
-    <value name="SCREEN" value="6"/>
-    <value name="DARKEN" value="7"/>
-    <value name="LIGHTEN" value="8"/>
-  </enum>
-
-  <enum name="Stencil Op" prefix="V3D_STENCIL_OP">
-    <value name="ZERO" value="0"/>
-    <value name="KEEP" value="1"/>
-    <value name="REPLACE" value="2"/>
-    <value name="INCR" value="3"/>
-    <value name="DECR" value="4"/>
-    <value name="INVERT" value="5"/>
-    <value name="INCWRAP" value="6"/>
-    <value name="DECWRAP" value="7"/>
-  </enum>
-
-  <enum name="Primitive" prefix="V3D_PRIM">
-    <value name="POINTS" value="0"/>
-    <value name="LINES" value="1"/>
-    <value name="LINE_LOOP" value="2"/>
-    <value name="LINE_STRIP" value="3"/>
-    <value name="TRIANGLES" value="4"/>
-    <value name="TRIANGLE_STRIP" value="5"/>
-    <value name="TRIANGLE_FAN" value="6"/>
-    <value name="POINTS_TF" value="16"/>
-    <value name="LINES_TF" value="17"/>
-    <value name="LINE_LOOP_TF" value="18"/>
-    <value name="LINE_STRIP_TF" value="19"/>
-    <value name="TRIANGLES_TF" value="20"/>
-    <value name="TRIANGLE_STRIP_TF" value="21"/>
-    <value name="TRIANGLE_FAN_TF" value="22"/>
-  </enum>
-
-  <enum name="Border Colour Mode" prefix="V3D_BORDER_COLOUR">
-    <value name="0000" value="0"/>
-    <value name="0001" value="1"/>
-    <value name="1111" value="2"/>
-    <value name="Follows" value="7"/>
-  </enum>
-
-  <enum name="Wrap Mode" prefix="V3D_WRAP_MODE">
-      <value name="Wrap mode REPEAT" value="0"/>
-      <value name="Wrap mode CLAMP" value="1"/>
-      <value name="Wrap mode MIRROR" value="2"/>
-      <value name="Wrap mode BORDER" value="3"/>
-      <value name="Wrap mode MIRROR_ONCE" value="4"/>
-  </enum>
-
-  <enum name="TMU Op" prefix="V3D_TMU_OP">
-    <value name="Write ADD, Read Prefetch" value="0"/>
-    <value name="Write SUB, Read Clear" value="1"/>
-    <value name="Write XCHG, Read Flush" value="2"/>
-    <value name="Write CMPXCHG, Read Flush" value="3"/>
-    <value name="Write UMIN, Full L1 Clear" value="4"/>
-    <value name="Write UMAX" value="5"/>
-    <value name="Write SMIN" value="6"/>
-    <value name="Write SMAX" value="7"/>
-    <value name="Write AND, Read INC" value="8"/>
-    <value name="Write OR, Read DEC" value="9"/>
-    <value name="Write XOR, Read NOT" value="10"/>
-    <value name="Regular" value="15"/>
-  </enum>
-
-  <enum name="Varying Flags Action" prefix="V3D_VARYING_FLAGS_ACTION">
-    <value name="unchanged" value="0"/>
-    <value name="zeroed" value="1"/>
-    <value name="set" value="2"/>
-  </enum>
-
-  <enum name="Memory Format" prefix="V3D_MEMORY_FORMAT">
-    <value name="Raster" value="0"/>
-    <value name="Lineartile" value="1"/>
-    <value name="UB-linear (1 UIF block wide)" value="2"/>
-    <value name="UB-linear (2 UIF blocks wide)" value="3"/>
-    <value name="UIF (No XOR)" value="4"/>
-    <value name="UIF (XOR)" value="5"/>
-  </enum>
-
-  <enum name="Decimate Mode" prefix="V3D_DECIMATE_MODE">
-    <value name="sample 0" value="0"/>
-    <value name="4x" value="1"/>
-    <value name="16x" value="2"/>
-    <value name="all samples" value="3"/>
-  </enum>
-
-  <enum name="Internal Type" prefix="V3D_INTERNAL_TYPE">
-    <value name="8i" value="0"/>
-    <value name="8ui" value="1"/>
-    <value name="8" value="2"/>
-    <value name="16i" value="4"/>
-    <value name="16ui" value="5"/>
-    <value name="16f" value="6"/>
-    <value name="32i" value="8"/>
-    <value name="32ui" value="9"/>
-    <value name="32f" value="10"/>
-  </enum>
-
-  <enum name="Internal BPP" prefix="V3D_INTERNAL_BPP">
-    <value name="32" value="0"/>
-    <value name="64" value="1"/>
-    <value name="128" value="2"/>
-  </enum>
-
-  <enum name="Internal Depth Type" prefix="V3D_INTERNAL_TYPE">
-    <value name="depth_32f" value="0"/>
-    <value name="depth_24" value="1"/>
-    <value name="depth_16" value="2"/>
-  </enum>
-
-  <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP">
-    <value name="none" value="0"/> <!-- no clamping -->
-    <value name="norm" value="1"/> <!-- [0,1] for f16 -->
-    <value name="pos" value="2"/> <!-- [0, for f16 -->
-  </enum>
-
-  <enum name="Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT">
-    <!--
-	Formats appear with their channels named from the low bits to
-	the high bits.
-    -->
-    <value name="srgb8_alpha8" value="0"/>
-    <value name="srgb" value="1"/>
-    <value name="rgb10_a2ui" value="2"/>
-    <value name="rgb10_a2" value="3"/>
-    <value name="abgr1555" value="4"/>
-    <value name="alpha-masked abgr1555" value="5"/>
-    <value name="abgr4444" value="6"/>
-    <value name="bgr565" value="7"/>
-    <value name="r11f_g11f_b10f" value="8"/>
-    <value name="rgba32f"  value="9"/>
-    <value name="rg32f"    value="10"/>
-    <value name="r32f"     value="11"/>
-    <value name="rgba32i"  value="12"/>
-    <value name="rg32i"    value="13"/>
-    <value name="r32i"     value="14"/>
-    <value name="rgba32ui" value="15"/>
-    <value name="rg32ui"   value="16"/>
-    <value name="r32ui"    value="17"/>
-    <value name="rgba16f"  value="18"/>
-    <value name="rg16f"    value="19"/>
-    <value name="r16f"     value="20"/>
-    <value name="rgba16i"  value="21"/>
-    <value name="rg16i"    value="22"/>
-    <value name="r16i"     value="23"/>
-    <value name="rgba16ui" value="24"/>
-    <value name="rg16ui"   value="25"/>
-    <value name="r16ui"    value="26"/>
-    <value name="rgba8"    value="27"/>
-    <value name="rgb8"     value="28"/>
-    <value name="rg8"      value="29"/>
-    <value name="r8"       value="30"/>
-    <value name="rgba8i"   value="31"/>
-    <value name="rg8i"     value="32"/>
-    <value name="r8i"      value="33"/>
-    <value name="rgba8ui"  value="34"/>
-    <value name="rg8ui"    value="35"/>
-    <value name="r8ui"     value="36"/>
-    <!-- rgbx8/srgbx8 were removed -->
-    <value name="bstc"     value="39"/>
-    <value name="d32f"     value="40"/>
-    <value name="d24"      value="41"/>
-    <value name="d16"      value="42"/>
-    <value name="d24s8"    value="43"/>
-    <value name="s8"       value="44"/>
-  </enum>
-
-  <packet code="0" name="Halt"/>
-  <packet code="1" name="NOP"/>
-  <packet code="4" name="Flush"/>
-  <packet code="5" name="Flush All State"/>
-  <packet code="6" name="Start Tile Binning"/>
-  <packet code="7" name="Increment Semaphore"/>
-  <packet code="8" name="Wait on Semaphore"/>
-  <packet code="9" name="Wait for previous frame"/>
-  <packet code="10" name="Enable Z-only rendering" cl="R"/>
-  <packet code="11" name="Disable Z-only rendering" cl="R"/>
-  <packet code="12" name="End of Z-only rendering in frame"/>
-  <packet code="13" name="End of rendering"/>
-
-  <packet code="14" name="Wait for transform feedback" cl="B">
-    <field name="Block count" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="15" name="Branch to auto-chained sub-list">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="16" name="Branch">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="17" name="Branch to Sub-list">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="18" name="Return from sub-list"/>
-  <packet code="19" name="Flush VCD cache"/>
-
-  <packet code="20" name="Start Address of Generic Tile List">
-    <field name="start" size="32" start="0" type="address"/>
-    <field name="end" size="32" start="32" type="address"/>
-  </packet>
-
-  <packet code="21" name="Branch to Implicit Tile List">
-    <field name="tile list set number" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="22" name="Branch to Explicit Supertile">
-    <field name="Absolute address of explicit supertile render list" size="32" start="24" type="address"/>
-    <field name="explicit supertile number" size="8" start="16" type="uint"/>
-    <field name="row number" size="8" start="8" type="uint"/>
-    <field name="column number" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="23" name="Supertile Coordinates">
-    <field name="row number in supertiles" size="8" start="8" type="uint"/>
-    <field name="column number in supertiles" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="25" name="Clear Tile Buffers" cl="R">
-    <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
-    <field name="Clear all Render Targets" size="1" start="0" type="bool"/>
-  </packet>
-
-  <packet code="26" name="End of Loads" cl="R"/>
-  <packet code="27" name="End of Tile Marker" cl="R"/>
-
-  <packet code="29" name="Store Tile Buffer General" cl="R">
-    <field name="Address" size="32" start="64" type="address"/>
-
-    <!-- used for y flip -->
-    <field name="Height" size="16" start="48" type="uint"/>
-
-    <!-- height in ub for UIF, byte stride for raster -->
-    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
-
-    <field name="R/B swap" size="1" start="20" type="bool"/>
-    <field name="Channel Reverse" size="1" start="19" type="bool"/>
-    <field name="Clear buffer being stored" size="1" start="18" type="bool"/>
-    <field name="Output Image Format" size="6" start="12" type="Output Image Format"/>
-
-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
-
-    <field name="A dithered" size="1" start="9" type="bool"/>
-    <field name="BGR dithered" size="1" start="8" type="bool"/>
-
-    <field name="Flip Y" size="1" start="7" type="bool"/>
-
-    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
-    <field name="Buffer to Store" size="4" start="0" type="uint">
-      <value name="Render target 0" value="0"/>
-      <value name="Render target 1" value="1"/>
-      <value name="Render target 2" value="2"/>
-      <value name="Render target 3" value="3"/>
-      <value name="None" value="8"/>
-      <value name="Z" value="9"/>
-      <value name="Stencil" value="10"/>
-      <value name="Z+Stencil" value="11"/>
-    </field>
-  </packet>
-
-  <packet code="30" name="Load Tile Buffer General" cl="R">
-    <field name="Address" size="32" start="64" type="address"/>
-
-    <!-- used for y flip -->
-    <field name="Height" size="16" start="48" type="uint"/>
-
-    <!-- height in ub for UIF, byte stride for raster -->
-    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
-
-    <field name="R/B swap" size="1" start="20" type="bool"/>
-    <field name="Channel Reverse" size="1" start="19" type="bool"/>
-
-    <field name="Input Image Format" size="6" start="12" type="Output Image Format"/>
-
-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
-
-    <field name="Flip Y" size="1" start="7" type="bool"/>
-
-    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
-    <field name="Buffer to Load" size="4" start="0" type="uint">
-      <value name="Render target 0" value="0"/>
-      <value name="Render target 1" value="1"/>
-      <value name="Render target 2" value="2"/>
-      <value name="Render target 3" value="3"/>
-      <value name="None" value="8"/>
-      <value name="Z" value="9"/>
-      <value name="Stencil" value="10"/>
-      <value name="Z+Stencil" value="11"/>
-    </field>
-  </packet>
-
-  <packet code="32" name="Indexed Primitive List" cl="B">
-    <field name="Index Offset" size="32" start="40" type="uint"/>
-
-    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
-    <field name="Length" size="31" start="8" type="uint"/>
-
-    <field name="Index type" size="2" start="6" type="uint">
-      <value name="Index type 8-bit" value="0"/>
-      <value name="Index type 16-bit" value="1"/>
-      <value name="Index type 32-bit" value="2"/>
-    </field>
-
-    <field name="mode" size="6" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="34" name="Indexed Instanced Primitive List" cl="B">
-    <field name="Index Offset" size="32" start="72" type="uint"/>
-    <field name="Number of Instances" size="32" start="40" type="uint"/>
-    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
-    <field name="Instance Length" size="31" start="8" type="uint"/>
-
-    <field name="Index type" size="2" start="6" type="uint">
-      <value name="Index type 8-bit" value="0"/>
-      <value name="Index type 16-bit" value="1"/>
-      <value name="Index type 32-bit" value="2"/>
-    </field>
-
-    <field name="mode" size="6" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="36" name="Vertex Array Primitives" cl="B">
-    <field name="Index of First Vertex" size="32" start="40" type="uint"/>
-    <field name="Length" size="32" start="8" type="uint"/>
-
-    <field name="mode" size="8" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="38" name="Vertex Array Instanced Primitives" cl="B">
-    <field name="Index of First Vertex" size="32" start="72" type="uint"/>
-    <field name="Number of Instances" size="32" start="40" type="uint"/>
-    <field name="Instance Length" size="32" start="8" type="uint"/>
-
-    <field name="mode" size="8" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="43" name="Base Vertex Base Instance" cl="B">
-    <field name="Base Instance" size="32" start="32" type="uint"/>
-
-    <field name="Base Vertex" size="32" start="0" type="uint"/>
-  </packet>
-
-  <packet code="44" name="Index Buffer Setup" cl="B">
-    <field name="Address" size="32" start="0" type="address"/>
-    <field name="Size" size="32" start="32" type="uint"/>
-  </packet>
-
-  <packet code="56" name="Primitive List Format">
-    <field name="data type" size="1" start="6" type="uint">
-      <value name="List Indexed" value="0"/>
-      <value name="List 32-bit X/Y" value="1"/>
-    </field>
-    <field name="primitive type" size="6" start="0" type="uint">
-      <value name="List Points" value="0"/>
-      <value name="List Lines" value="1"/>
-      <value name="List Triangles" value="2"/>
-    </field>
-  </packet>
-
-  <packet code="64" name="GL Shader State">
-    <field name="address" size="27" start="5" type="address"/>
-    <field name="number of attribute arrays" size="5" start="0" type="uint"/>
-  </packet>
-
-  <packet code="73" name="Transform Feedback Buffer">
-    <field name="Buffer Address" size="32" start="32" type="address"/>
-    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
-    <field name="Buffer Number" size="2" start="0" type="uint"/>
-  </packet>
-
-  <packet code="74" name="Transform Feedback Specs">
-    <field name="Enable" size="1" start="7" type="bool"/>
-    <field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/>
-  </packet>
-
-  <packet code="75" name="Flush Transform Feedback Data"/>
-
-  <struct name="Transform Feedback Output Data Spec">
-    <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
-    <field name="Number of consecutive Vertex Values to output as 32-bit values minus 1" size="4" start="8" type="uint"/>
-    <field name="Output Buffer to write to" size="2" start="12" type="uint"/>
-  </struct>
-
-  <struct name="Transform Feedback Output Address">
-    <field name="address" size="32" start="0" type="address"/>
-  </struct>
-
-  <packet code="80" name="Stencil Config">
-    <field name="Stencil Write Mask" size="8" start="32" type="uint"/>
-    <field name="Back Config" size="1" start="29" type="bool"/>
-    <field name="Front Config" size="1" start="28" type="bool"/>
-    <field name="Stencil Pass Op" size="3" start="25" type="Stencil Op"/>
-    <field name="Depth Test Fail Op" size="3" start="22" type="Stencil Op"/>
-    <field name="Stencil Test Fail Op" size="3" start="19" type="Stencil Op"/>
-    <field name="Stencil Test Function" size="3" start="16" type="Compare Function"/>
-    <field name="Stencil Test Mask" size="8" start="8" type="uint"/>
-    <field name="Stencil Ref Value" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="83" name="Blend Enables">
-    <field name="Mask" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="84" name="Blend Config">
-    <field name="VG Coverage Modes" size="2" start="28" type="uint"/>
-    <field name="Render Target Mask" size="4" start="24" type="uint"/>
-    <field name="Colour blend dst factor" size="4" start="20" type="Blend Factor"/>
-    <field name="Colour blend src factor" size="4" start="16" type="Blend Factor"/>
-    <field name="Colour blend mode" size="4" start="12" type="Blend Mode"/>
-    <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
-    <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
-    <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
-  </packet>
-
-  <packet code="86" name="Blend Constant Colour">
-    <field name="Alpha (F16)" size="16" start="48" type="uint"/>
-    <field name="Blue (F16)" size="16" start="32" type="uint"/>
-    <field name="Green (F16)" size="16" start="16" type="uint"/>
-    <field name="Red (F16)" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet code="87" name="Colour Write Masks">
-    <field name="Reserved" size="16" start="16" type="uint"/>
-    <field name="Render Target 3 per colour component write masks" size="4" start="12" type="uint"/>
-    <field name="Render Target 2 per colour component write masks" size="4" start="8" type="uint"/>
-    <field name="Render Target 1 per colour component write masks" size="4" start="4" type="uint"/>
-    <field name="Render Target 0 per colour component write masks" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="88" name="Zero All Centroid Flags"/>
-
-  <packet code="89" name="Centroid Flags">
-    <field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="92" name="Occlusion Query Counter">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="96" name="Configuration Bits">
-    <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
-    <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
-    <field name="Blend enable" size="1" start="19" type="bool"/>
-    <field name="Stencil enable" size="1" start="18" type="bool"/>
-    <field name="Early Z updates enable" size="1" start="17" type="bool"/>
-    <field name="Early Z enable" size="1" start="16" type="bool"/>
-    <field name="Z updates enable" size="1" start="15" type="bool"/>
-    <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
-    <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
-    <field name="Coverage Update Mode" size="2" start="9" type="uint"/>
-    <field name="Coverage Pipe Select" size="1" start="8" type="bool"/>
-    <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
-    <field name="Line Rasterization" size="2" start="4" type="uint"/>
-    <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
-    <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
-    <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
-    <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
-  </packet>
-
-  <packet code="97" name="Zero All Flat Shade Flags"/>
-
-  <packet code="98" name="Flat Shade Flags">
-    <field name="Flat Shade Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Flat Shade Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Flat Shade Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="99" name="Zero All Non-perspective Flags"/>
-
-  <packet code="100" name="Non-perspective Flags">
-    <field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Non-perspectivey Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="104" name="Point size">
-    <field name="Point Size" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet code="105" name="Line width">
-    <field name="Line width" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Depth Offset" code="106">
-    <field name="Limit" size="32" start="32" type="float"/>
-    <!-- these fields are both float-1-8-7 encoded (top 16 bits of a float32) -->
-    <field name="Depth Offset Units" size="16" start="16" type="uint"/>
-    <field name="Depth Offset Factor" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet name="Clip Window" code="107">
-    <field name="Clip Window Height in pixels" size="16" start="48" type="uint"/>
-    <field name="Clip Window Width in pixels" size="16" start="32" type="uint"/>
-    <field name="Clip Window Bottom Pixel Coordinate" size="16" start="16" type="uint"/>
-    <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet name="Viewport Offset" code="108">
-    <field name="Coarse Y" size="10" start="54" type="uint"/>
-    <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/>
-    <field name="Coarse X" size="10" start="22" type="uint"/>
-    <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/>
-  </packet>
-
-  <packet name="Clipper Z min/max clipping planes" code="109">
-    <field name="Maximum Zw" size="32" start="32" type="float"/>
-    <field name="Minimum Zw" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Clipper XY Scaling" code="110" cl="B">
-    <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
-    <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Clipper Z Scale and Offset" code="111" cl="B">
-    <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
-    <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Number of Layers" code="119">
-    <field name="Number of Layers Minus 1" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="120" name="Tile Binning Mode Configuration (Part1)">
-
-    <field name="Height (in pixels minus 1)" size="12" start="48" type="uint"/>
-    <field name="Width (in pixels minus 1)" size="12" start="32" type="uint"/>
-
-    <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/>
-    <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/>
-
-    <field name="Maximum BPP of all render targets" size="2" start="12" type="uint">
-      <value name="Render target maximum 32bpp" value="0"/>
-      <value name="Render target maximum 64bpp" value="1"/>
-      <value name="Render target maximum 128bpp" value="2"/>
-    </field>
-
-    <field name="Number of Render Targets minus 1" size="4" start="8" type="uint"/>
-
-    <field name="tile allocation block size" size="2" start="4" type="uint">
-      <value name="tile allocation block size 64b" value="0"/>
-      <value name="tile allocation block size 128b" value="1"/>
-      <value name="tile allocation block size 256b" value="2"/>
-    </field>
-    <field name="tile allocation initial block size" size="2" start="2" type="uint">
-      <value name="tile allocation initial block size 64b" value="0"/>
-      <value name="tile allocation initial block size 128b" value="1"/>
-      <value name="tile allocation initial block size 256b" value="2"/>
-    </field>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Common Configuration)" cl="R">
-    <field name="Pad" size="12" start="52" type="uint"/>
-
-    <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
-    <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
-
-    <field name="Early-Z disable" size="1" start="46" type="bool"/>
-
-    <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
-      <value name="Early-Z direction LT/LE" value="0"/>
-      <value name="Early-Z direction GT/GE" value="1"/>
-    </field>
-
-    <field name="Select Coverage Mode" size="1" start="44" type="bool"/>
-    <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
-    <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
-
-    <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
-
-    <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
-    <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
-    <field name="Number of Render Targets Minus 1" size="4" start="4" type="uint"/>
-
-    <field name="sub-id" size="4" start="0" type="uint" default="0"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Render Target config)" cl="R">
-
-    <field name="Pad" size="28" start="34" type="uint"/>
-
-    <field name="Render Target 3 Clamp" size="2" start="32" type="Render Target Clamp"/>
-    <field name="Render Target 3 Internal Type" size="4" start="30" type="Internal Type"/>
-    <field name="Render Target 3 Internal BPP" size="2" start="28" type="Internal BPP"/>
-
-    <field name="Render Target 2 Clamp" size="2" start="26" type="Render Target Clamp"/>
-    <field name="Render Target 2 Internal Type" size="4" start="22" type="Internal Type"/>
-    <field name="Render Target 2 Internal BPP" size="2" start="20" type="Internal BPP"/>
-
-    <field name="Render Target 1 Clamp" size="2" start="18" type="Render Target Clamp"/>
-    <field name="Render Target 1 Internal Type" size="4" start="14" type="Internal Type"/>
-    <field name="Render Target 1 Internal BPP" size="2" start="12" type="Internal BPP"/>
-
-    <field name="Render Target 0 Clamp" size="2" start="10" type="Render Target Clamp"/>
-    <field name="Render Target 0 Internal Type" size="4" start="6" type="Internal Type"/>
-    <field name="Render Target 0 Internal BPP" size="2" start="4" type="Internal BPP"/>
-
-    <field name="sub-id" size="4" start="0" type="uint" default="1"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Z Stencil Clear Values)" cl="R">
-    <field name="unused" size="16" start="48" type="uint"/>
-
-    <field name="Z Clear Value" size="32" start="16" type="float"/>
-
-    <field name="Stencil/VG Mask Clear Value" size="8" start="8" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="2"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part1)" cl="R">
-    <!-- Express this as a 56-bit field? -->
-    <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
-    <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="3"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part2)" cl="R">
-    <!-- Express this as a 56-bit field? -->
-    <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
-    <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="4"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part3)" cl="R">
-    <field name="pad" size="11" start="53" type="uint"/>
-    <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
-    <!-- image height is for Y flipping -->
-    <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/>
-    <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="5"/>
-  </packet>
-
-  <packet code="124" name="Tile Coordinates">
-    <field name="tile row number" size="12" start="12" type="uint"/>
-    <field name="tile column number" size="12" start="0" type="uint"/>
-  </packet>
-
-  <packet code="122" name="Multicore Rendering Supertile Configuration" cl="R">
-    <field name="Supertile Raster Order" size="1" start="60" type="bool"/>
-    <field name="Multicore Enable" size="1" start="56" type="bool"/>
-
-    <field name="Total Frame Height in Tiles" size="12" start="44" type="uint"/>
-    <field name="Total Frame Width in Tiles" size="12" start="32" type="uint"/>
-
-    <field name="Total Frame Height in Supertiles" size="8" start="24" type="uint"/>
-    <field name="Total Frame Width in Supertiles" size="8" start="16" type="uint"/>
-
-    <field name="Supertile Height in Tiles minus 1" size="8" start="8" type="uint"/>
-    <field name="Supertile Width in Tiles minus 1" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="123" name="Multicore Rendering Tile List Set Base" cl="R">
-    <field name="address" size="26" start="6" type="address"/>
-    <field name="Tile List Set Number" size="4" start="0" type="uint"/>
-  </packet>
-
-  <!-- add fields -->
-  <packet code="125" name="Tile Coordinates Implicit"/>
-
-  <packet code="126" name="Tile List Initial Block Size">
-    <field name="Use auto-chained tile lists" size="1" start="2" type="bool"/>
-
-    <field name="Size of first block in chained tile lists" size="2" start="0" type="uint">
-      <value name="tile allocation block size 64b" value="0"/>
-      <value name="tile allocation block size 128b" value="1"/>
-      <value name="tile allocation block size 256b" value="2"/>
-    </field>
-  </packet>
-
-  <struct name="Geometry Shader State Record">
-    <field name="Geometry Bin Mode Shader Code Address" size="32" start="0b" type="address"/>
-    <field name="4-way threadable" size="1" start="0" type="bool"/>
-    <field name="Start in final thread section" size="1" start="1" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="2" type="bool"/>
-    <field name="Geometry Bin Mode Shader Uniforms Address" size="32" start="4b" type="address"/>
-    <field name="Geometry Render Mode Shader Code Address" size="32" start="8b" type="address"/>
-    <field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/>
-  </struct>
-
-  <struct name="Tessellation Shader State Record">
-    <field name="Tessellation Bin Mode Control Shader Code Address" size="32" start="0b" type="address"/>
-    <field name="Tessellation Bin Mode Control Shader Uniforms Address" size="32" start="4b" type="address"/>
-    <field name="Tessellation Render Mode Control Shader Code Address" size="32" start="8b" type="address"/>
-    <field name="Tessellation Render Mode Control Shader Uniforms Address" size="32" start="12b" type="address"/>
-
-    <field name="Tessellation Bin Mode Evaluation Shader Code Address" size="32" start="16b" type="address"/>
-    <field name="Tessellation Bin Mode Evaluation Shader Uniforms Address" size="32" start="20b" type="address"/>
-    <field name="Tessellation Render Mode Evaluation Shader Code Address" size="32" start="24b" type="address"/>
-    <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
-  </struct>
-
-  <struct name="GL Shader State Record">
-    <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
-    <field name="Enable clipping" size="1" start="1" type="bool"/>
-
-    <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
-    <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
-    <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
-    <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
-    <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
-    <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
-
-    <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
-    <field name="Turn off early-z test" size="1" start="9" type="bool"/>
-    <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="10" type="bool"/>
-    <field name="Vertex shader has separate input and output VPM blocks" size="1" start="11" type="bool"/>
-    <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
-    <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
-    <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
-    <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
-    <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
-    <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
-    <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
-
-    <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
-
-    <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
-    <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
-
-    <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
-    <field name="Min Coord Shader input segments required in play minus 1" size="4" start="44" type="uint"/>
-
-    <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
-    <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
-
-    <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
-    <field name="Min Vertex Shader input segments required in play minus 1" size="4" start="60" type="uint"/>
-
-    <field name="Address of default attribute values" size="32" start="8b" type="address"/>
-
-    <field name="Fragment Shader Code Address" size="32" start="12b" type="address"/>
-    <field name="Fragment Shader 4-way threadable" size="1" start="96" type="bool"/>
-    <field name="Fragment Shader start in final thread section" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
-    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
-
-    <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
-    <field name="Vertex Shader 4-way threadable" size="1" start="160" type="bool"/>
-    <field name="Vertex Shader start in final thread section" size="1" start="161" type="bool"/>
-    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
-
-    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
-    <field name="Coordinate Shader 4-way threadable" size="1" start="224" type="bool"/>
-    <field name="Coordinate Shader start in final thread section" size="1" start="225" type="bool"/>
-    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
-  </struct>
-
-  <struct name="GL Shader State Attribute Record">
-    <field name="Address" size="32" start="0" type="address"/>
-
-    <field name="Vec size" size="2" start="32" type="uint"/>
-    <field name="Type" size="3" start="34" type="uint">
-      <value name="Attribute half-float" value="1"/>
-      <value name="Attribute float" value="2"/>
-      <value name="Attribute fixed" value="3"/>
-      <value name="Attribute byte" value="4"/>
-      <value name="Attribute short" value="5"/>
-      <value name="Attribute int" value="6"/>
-      <value name="Attribute int2_10_10_10" value="7"/>
-    </field>
-    <field name="Signed int type" size="1" start="37" type="bool"/>
-    <field name="Normalized int type" size="1" start="38" type="bool"/>
-    <field name="Read as int/uint" size="1" start="39" type="bool"/>
-
-    <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/>
-    <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/>
-
-    <field name="Instance Divisor" size="16" start="6b" type="uint"/>
-    <field name="Stride" size="32" start="8b" type="uint"/>
-    <field name="Maximum Index" size="32" start="12b" type="uint"/>
-  </struct>
-
-  <struct name="VPM generic block write setup">
-    <field name="id" size="2" start="30" type="uint" default="0"/>
-    <field name="id0" size="3" start="27" type="uint" default="0"/>
-
-    <field name="horiz" size="1" start="24" type="bool"/>
-    <field name="laned" size="1" start="23" type="bool"/>
-    <field name="segs" size="1" start="22" type="bool"/>
-    <field name="stride" size="7" start="15" type="int"/>
-
-    <field name="size" size="2" start="13" type="uint">
-      <value name="VPM setup size 8-bit" value="0"/>
-      <value name="VPM setup size 16-bit" value="1"/>
-      <value name="VPM setup size 32-bit" value="2"/>
-    </field>
-
-    <field name="addr" size="13" start="0" type="uint"/>
-  </struct>
-
-  <struct name="VPM generic block read setup">
-    <field name="id" size="2" start="30" type="uint" default="1"/>
-
-    <field name="horiz" size="1" start="29" type="bool"/>
-    <field name="laned" size="1" start="28" type="bool"/>
-    <field name="segs" size="1" start="27" type="bool"/>
-    <field name="num" size="5" start="22" type="uint"/>
-    <field name="stride" size="7" start="15" type="int"/>
-
-    <field name="size" size="2" start="13" type="uint">
-      <value name="VPM setup size 8-bit" value="0"/>
-      <value name="VPM setup size 16-bit" value="1"/>
-      <value name="VPM setup size 32-bit" value="2"/>
-    </field>
-
-    <field name="addr" size="13" start="0" type="uint"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 0">
-    <field name="Texture state address" size="32" start="0" type="address"/>
-    <field name="Return words of texture data" size="4" start="0" type="uint"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 1">
-    <field name="Sampler state address" size="32" start="0" type="address"/>
-    <field name="Per-pixel mask enable" size="1" start="2" type="bool"/>
-    <field name="Unnormalized coordinates" size="1" start="1" type="bool"/>
-    <field name="Output Type 32-bit" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 2">
-    <field name="Pad" size="24" start="8" type="uint"/>
-    <field name="Op" size="4" start="20" type="TMU Op"/>
-    <field name="Offset R" size="4" start="16" type="int"/>
-    <field name="Offset T" size="4" start="12" type="int"/>
-    <field name="Offset S" size="4" start="8" type="int"/>
-    <field name="Gather Mode" size="1" start="7" type="bool"/>
-    <field name="Gather Component" size="2" start="5" type="uint"/>
-    <field name="Coefficient Mode" size="1" start="4" type="bool"/>
-    <field name="Sample Number" size="2" start="2" type="uint"/>
-    <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
-    <field name="Offset Format 8" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="Texture Shader State">
-    <field name="Pad" size="56" start="136" type="uint"/>
-    <field name="UIF XOR disable" size="1" start="135" type="bool"/>
-    <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
-    <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
-    <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
-
-    <field name="Base Level" size="4" start="124" type="uint"/>
-    <field name="Max Level" size="4" start="120" type="uint"/>
-
-    <field name="Swizzle A" size="3" start="117" type="uint">
-      <value name="Swizzle Zero" value="0"/>
-      <value name="Swizzle One" value="1"/>
-      <value name="Swizzle Red" value="2"/>
-      <value name="Swizzle Green" value="3"/>
-      <value name="Swizzle Blue" value="4"/>
-      <value name="Swizzle Alpha" value="5"/>
-    </field>
-
-    <field name="Swizzle B" size="3" start="114" type="uint"/>
-    <field name="Swizzle G" size="3" start="111" type="uint"/>
-    <field name="Swizzle R" size="3" start="108" type="uint"/>
-    <field name="Extended" size="1" start="107" type="bool"/>
-
-    <field name="Texture type" size="7" start="100" type="uint"/>
-    <field name="Image Depth" size="14" start="86" type="uint"/>
-    <field name="Image Height" size="14" start="72" type="uint"/>
-    <field name="Image Width" size="14" start="58" type="uint"/>
-
-    <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/>
-
-    <field name="Texture base pointer" size="32" start="0" type="address"/>
-
-    <field name="Reverse Standard Border Colour" size="1" start="5" type="bool"/>
-    <field name="AHDR" size="1" start="4" type="bool"/>
-    <field name="sRGB" size="1" start="3" type="bool"/>
-    <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/>
-    <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
-    <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="Sampler State">
-    <field name="Border colour Alpha" size="32" start="160" type="uint"/>
-    <field name="Border colour Blue" size="32" start="128" type="uint"/>
-    <field name="Border colour Green" size="32" start="96" type="uint"/>
-    <field name="Border colour Red" size="32" start="64" type="uint"/>
-
-    <field name="Maximum Anisotropy" size="2" start="61" type="uint"/>
-    <field name="Border Colour Mode" size="3" start="58" type="Border Colour Mode"/>
-    <field name="Wrap I Border" size="1" start="57" type="bool"/>
-    <field name="Wrap R" size="3" start="54" type="Wrap Mode"/>
-    <field name="Wrap T" size="3" start="51" type="Wrap Mode"/>
-    <field name="Wrap S" size="3" start="48" type="Wrap Mode"/>
-
-    <field name="Fixed Bias" size="16" start="32" type="s8.8"/>
-    <field name="Max Level-of-Detail" size="12" start="20" type="u4.8"/>
-    <field name="Min Level-of-Detail" size="12" start="8" type="u4.8"/>
-
-    <field name="sRGB Disable" size="1" start="7" type="bool"/>
-
-    <field name="Depth Compare Function" size="3" start="4" type="Compare Function"/>
-
-    <field name="Anisotropy Enable" size="1" start="3" type="bool"/>
-    <field name="Mip filter Nearest" size="1" start="2" type="bool"/>
-    <field name="Min filter Nearest" size="1" start="1" type="bool"/>
-    <field name="Mag filter Nearest" size="1" start="0" type="bool"/>
-  </struct>
-
-  <enum name="Texture Data Formats">
-    <!--
-	most formats here have R in the low bits, A in the high bits.
-	Exceptions noted.
-    -->
-    <value name="Texture Data Format R8" value="0"/>
-    <value name="Texture Data Format R8 SNORM" value="1"/>
-    <value name="Texture Data Format RG8" value="2"/>
-    <value name="Texture Data Format RG8 SNORM" value="3"/>
-    <value name="Texture Data Format RGBA8" value="4"/>
-    <value name="Texture Data Format RGBA8 SNORM" value="5"/>
-    <value name="Texture Data Format RGB565" value="6"/> <!-- B in low bits -->
-    <value name="Texture Data Format RGBA4" value="7"/> <!-- A low, R high -->
-    <value name="Texture Data Format RGB5_A1" value="8"/> <!-- A low, R high -->
-    <value name="Texture Data Format RGB10_A2" value="9"/> <!-- R low, A high -->
-    <value name="Texture Data Format R16" value="10"/>
-    <value name="Texture Data Format R16 SNORM" value="11"/>
-    <value name="Texture Data Format RG16" value="12"/>
-    <value name="Texture Data Format RG16 SNORM" value="13"/>
-    <value name="Texture Data Format RGBA16" value="14"/>
-    <value name="Texture Data Format RGBA16 SNORM" value="15"/>
-    <value name="Texture Data Format R16F" value="16"/>
-    <value name="Texture Data Format RG16F" value="17"/>
-    <value name="Texture Data Format RGBA16F" value="18"/>
-    <value name="Texture Data Format R11F_G11F_B10F" value="19"/>
-    <value name="Texture Data Format RGB9_E5" value="20"/>
-    <value name="Texture Data Format DEPTH COMP16" value="21"/>
-    <value name="Texture Data Format DEPTH COMP24" value="22"/>
-    <value name="Texture Data Format DEPTH COMP32F" value="23"/>
-    <value name="Texture Data Format DEPTH24_X8" value="24"/> <!-- X low, D high -->
-    <value name="Texture Data Format R4" value="25"/>
-    <value name="Texture Data Format R1" value="26"/>
-    <!-- generic unfiltered 8-bit sample -->
-    <value name="Texture Data Format S8" value="27"/>
-    <!-- generic unfiltered 16-bit sample -->
-    <value name="Texture Data Format S16" value="28"/>
-    <!-- generic unfiltered 32-bit sample -->
-    <value name="Texture Data Format R32F" value="29"/>
-    <!-- generic unfiltered 64-bit sample -->
-    <value name="Texture Data Format RG32F" value="30"/>
-    <!-- generic unfiltered 128-bit sample -->
-    <value name="Texture Data Format RGBA32F" value="31"/>
-
-    <value name="Texture Data Format RGB8_ETC2" value="32"/>
-    <value name="Texture Data Format RGB8_PUNCHTHROUGH_ALPHA1" value="33"/>
-
-    <value name="Texture Data Format R11_EAC" value="34"/>
-    <value name="Texture Data Format SIGNED_R11_EAC" value="35"/>
-    <value name="Texture Data Format RG11_EAC" value="36"/>
-    <value name="Texture Data Format SIGNED_RG11_EAC" value="37"/>
-
-    <value name="Texture Data Format RGBA8_ETC2_EAC" value="38"/>
-    <value name="Texture Data Format YCBCR_LUMA" value="39"/>
-    <value name="Texture Data Format YCBCR_420_CHROMA" value="40"/>
-
-    <value name="Texture Data Format BC1" value="48"/>
-    <value name="Texture Data Format BC2" value="49"/>
-    <value name="Texture Data Format BC3" value="50"/>
-
-    <value name="Texture Data Format ASTC_4x4" value="64"/>
-    <value name="Texture Data Format ASTC_5x4" value="65"/>
-    <value name="Texture Data Format ASTC_5x5" value="66"/>
-    <value name="Texture Data Format ASTC_6x5" value="67"/>
-    <value name="Texture Data Format ASTC_6x6" value="68"/>
-    <value name="Texture Data Format ASTC_8x5" value="69"/>
-    <value name="Texture Data Format ASTC_8x6" value="70"/>
-    <value name="Texture Data Format ASTC_8x8" value="71"/>
-    <value name="Texture Data Format ASTC_10x5" value="72"/>
-    <value name="Texture Data Format ASTC_10x6" value="73"/>
-    <value name="Texture Data Format ASTC_10x8" value="74"/>
-    <value name="Texture Data Format ASTC_10x10" value="75"/>
-    <value name="Texture Data Format ASTC_12x10" value="76"/>
-    <value name="Texture Data Format ASTC_12x12" value="77"/>
-
-    <value name="Texture Data Format R8I" value="96"/>
-    <value name="Texture Data Format R8UI" value="97"/>
-    <value name="Texture Data Format RG8I" value="98"/>
-    <value name="Texture Data Format RG8UI" value="99"/>
-    <value name="Texture Data Format RGBA8I" value="100"/>
-    <value name="Texture Data Format RGBA8UI" value="101"/>
-
-    <value name="Texture Data Format R16I" value="102"/>
-    <value name="Texture Data Format R16UI" value="103"/>
-    <value name="Texture Data Format RG16I" value="104"/>
-    <value name="Texture Data Format RG16UI" value="105"/>
-    <value name="Texture Data Format RGBA16I" value="106"/>
-    <value name="Texture Data Format RGBA16UI" value="107"/>
-
-    <value name="Texture Data Format R32I" value="108"/>
-    <value name="Texture Data Format R32UI" value="109"/>
-    <value name="Texture Data Format RG32I" value="110"/>
-    <value name="Texture Data Format RG32UI" value="111"/>
-    <value name="Texture Data Format RGBA32I" value="112"/>
-    <value name="Texture Data Format RGBA32UI" value="113"/>
-    <value name="Texture Data Format RGB10_A2UI" value="114"/>
-
-  </enum>
-</vcxml>
diff --git a/src/broadcom/cle/v3d_packet_v42.xml b/src/broadcom/cle/v3d_packet_v42.xml
deleted file mode 100644
index db128b5..0000000
--- a/src/broadcom/cle/v3d_packet_v42.xml
+++ /dev/null
@@ -1,1052 +0,0 @@
-<vcxml gen="4.2">
-
-  <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
-    <value name="NEVER" value="0"/>
-    <value name="LESS" value="1"/>
-    <value name="EQUAL" value="2"/>
-    <value name="LEQUAL" value="3"/>
-    <value name="GREATER" value="4"/>
-    <value name="NOTEQUAL" value="5"/>
-    <value name="GEQUAL" value="6"/>
-    <value name="ALWAYS" value="7"/>
-  </enum>
-
-  <enum name="Blend Factor" prefix="V3D_BLEND_FACTOR">
-    <value name="ZERO" value="0"/>
-    <value name="ONE" value="1"/>
-    <value name="SRC_COLOR" value="2"/>
-    <value name="INV_SRC_COLOR" value="3"/>
-    <value name="DST_COLOR" value="4"/>
-    <value name="INV_DST_COLOR" value="5"/>
-    <value name="SRC_ALPHA" value="6"/>
-    <value name="INV_SRC_ALPHA" value="7"/>
-    <value name="DST_ALPHA" value="8"/>
-    <value name="INV_DST_ALPHA" value="9"/>
-    <value name="CONST_COLOR" value="10"/>
-    <value name="INV_CONST_COLOR" value="11"/>
-    <value name="CONST_ALPHA" value="12"/>
-    <value name="INV_CONST_ALPHA" value="13"/>
-    <value name="SRC_ALPHA_SATURATE" value="14"/>
-  </enum>
-
-  <enum name="Blend Mode" prefix="V3D_BLEND_MODE">
-    <value name="ADD" value="0"/>
-    <value name="SUB" value="1"/>
-    <value name="RSUB" value="2"/>
-    <value name="MIN" value="3"/>
-    <value name="MAX" value="4"/>
-    <value name="MUL" value="5"/>
-    <value name="SCREEN" value="6"/>
-    <value name="DARKEN" value="7"/>
-    <value name="LIGHTEN" value="8"/>
-  </enum>
-
-  <enum name="Stencil Op" prefix="V3D_STENCIL_OP">
-    <value name="ZERO" value="0"/>
-    <value name="KEEP" value="1"/>
-    <value name="REPLACE" value="2"/>
-    <value name="INCR" value="3"/>
-    <value name="DECR" value="4"/>
-    <value name="INVERT" value="5"/>
-    <value name="INCWRAP" value="6"/>
-    <value name="DECWRAP" value="7"/>
-  </enum>
-
-  <enum name="Primitive" prefix="V3D_PRIM">
-    <value name="POINTS" value="0"/>
-    <value name="LINES" value="1"/>
-    <value name="LINE_LOOP" value="2"/>
-    <value name="LINE_STRIP" value="3"/>
-    <value name="TRIANGLES" value="4"/>
-    <value name="TRIANGLE_STRIP" value="5"/>
-    <value name="TRIANGLE_FAN" value="6"/>
-    <value name="POINTS_TF" value="16"/>
-    <value name="LINES_TF" value="17"/>
-    <value name="LINE_LOOP_TF" value="18"/>
-    <value name="LINE_STRIP_TF" value="19"/>
-    <value name="TRIANGLES_TF" value="20"/>
-    <value name="TRIANGLE_STRIP_TF" value="21"/>
-    <value name="TRIANGLE_FAN_TF" value="22"/>
-  </enum>
-
-  <enum name="Border Colour Mode" prefix="V3D_BORDER_COLOUR">
-    <value name="0000" value="0"/>
-    <value name="0001" value="1"/>
-    <value name="1111" value="2"/>
-    <value name="Follows" value="7"/>
-  </enum>
-
-  <enum name="Wrap Mode" prefix="V3D_WRAP_MODE">
-      <value name="Wrap mode REPEAT" value="0"/>
-      <value name="Wrap mode CLAMP" value="1"/>
-      <value name="Wrap mode MIRROR" value="2"/>
-      <value name="Wrap mode BORDER" value="3"/>
-      <value name="Wrap mode MIRROR_ONCE" value="4"/>
-  </enum>
-
-  <enum name="TMU Op" prefix="V3D_TMU_OP">
-    <value name="Write ADD, Read Prefetch" value="0"/>
-    <value name="Write SUB, Read Clear" value="1"/>
-    <value name="Write XCHG, Read Flush" value="2"/>
-    <value name="Write CMPXCHG, Read Flush" value="3"/>
-    <value name="Write UMIN, Full L1 Clear" value="4"/>
-    <value name="Write UMAX" value="5"/>
-    <value name="Write SMIN" value="6"/>
-    <value name="Write SMAX" value="7"/>
-    <value name="Write AND, Read INC" value="8"/>
-    <value name="Write OR, Read DEC" value="9"/>
-    <value name="Write XOR, Read NOT" value="10"/>
-    <value name="Regular" value="15"/>
-  </enum>
-
-  <enum name="Varying Flags Action" prefix="V3D_VARYING_FLAGS_ACTION">
-    <value name="unchanged" value="0"/>
-    <value name="zeroed" value="1"/>
-    <value name="set" value="2"/>
-  </enum>
-
-  <enum name="Memory Format" prefix="V3D_MEMORY_FORMAT">
-    <value name="Raster" value="0"/>
-    <value name="Lineartile" value="1"/>
-    <value name="UB-linear (1 UIF block wide)" value="2"/>
-    <value name="UB-linear (2 UIF blocks wide)" value="3"/>
-    <value name="UIF (No XOR)" value="4"/>
-    <value name="UIF (XOR)" value="5"/>
-  </enum>
-
-  <enum name="Decimate Mode" prefix="V3D_DECIMATE_MODE">
-    <value name="sample 0" value="0"/>
-    <value name="4x" value="1"/>
-    <value name="16x" value="2"/>
-    <value name="all samples" value="3"/>
-  </enum>
-
-  <enum name="Internal Type" prefix="V3D_INTERNAL_TYPE">
-    <value name="8i" value="0"/>
-    <value name="8ui" value="1"/>
-    <value name="8" value="2"/>
-    <value name="16i" value="4"/>
-    <value name="16ui" value="5"/>
-    <value name="16f" value="6"/>
-    <value name="32i" value="8"/>
-    <value name="32ui" value="9"/>
-    <value name="32f" value="10"/>
-  </enum>
-
-  <enum name="Internal BPP" prefix="V3D_INTERNAL_BPP">
-    <value name="32" value="0"/>
-    <value name="64" value="1"/>
-    <value name="128" value="2"/>
-  </enum>
-
-  <enum name="Internal Depth Type" prefix="V3D_INTERNAL_TYPE">
-    <value name="depth_32f" value="0"/>
-    <value name="depth_24" value="1"/>
-    <value name="depth_16" value="2"/>
-  </enum>
-
-  <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP">
-    <value name="none" value="0"/> <!-- no clamping -->
-    <value name="norm" value="1"/> <!-- [0,1] for f16 -->
-    <value name="pos" value="2"/> <!-- [0, for f16 -->
-    <value name="int" value="3"/> <!-- clamp to integer RT's range -->
-  </enum>
-
-  <enum name="Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT">
-    <!--
-	Formats appear with their channels named from the low bits to
-	the high bits.
-    -->
-    <value name="srgb8_alpha8" value="0"/>
-    <value name="srgb" value="1"/>
-    <value name="rgb10_a2ui" value="2"/>
-    <value name="rgb10_a2" value="3"/>
-    <value name="abgr1555" value="4"/>
-    <value name="alpha-masked abgr1555" value="5"/>
-    <value name="abgr4444" value="6"/>
-    <value name="bgr565" value="7"/>
-    <value name="r11f_g11f_b10f" value="8"/>
-    <value name="rgba32f"  value="9"/>
-    <value name="rg32f"    value="10"/>
-    <value name="r32f"     value="11"/>
-    <value name="rgba32i"  value="12"/>
-    <value name="rg32i"    value="13"/>
-    <value name="r32i"     value="14"/>
-    <value name="rgba32ui" value="15"/>
-    <value name="rg32ui"   value="16"/>
-    <value name="r32ui"    value="17"/>
-    <value name="rgba16f"  value="18"/>
-    <value name="rg16f"    value="19"/>
-    <value name="r16f"     value="20"/>
-    <value name="rgba16i"  value="21"/>
-    <value name="rg16i"    value="22"/>
-    <value name="r16i"     value="23"/>
-    <value name="rgba16ui" value="24"/>
-    <value name="rg16ui"   value="25"/>
-    <value name="r16ui"    value="26"/>
-    <value name="rgba8"    value="27"/>
-    <value name="rgb8"     value="28"/>
-    <value name="rg8"      value="29"/>
-    <value name="r8"       value="30"/>
-    <value name="rgba8i"   value="31"/>
-    <value name="rg8i"     value="32"/>
-    <value name="r8i"      value="33"/>
-    <value name="rgba8ui"  value="34"/>
-    <value name="rg8ui"    value="35"/>
-    <value name="r8ui"     value="36"/>
-    <!-- rgbx8/srgbx8 were removed -->
-    <value name="bstc"     value="39"/>
-    <value name="d32f"     value="40"/>
-    <value name="d24"      value="41"/>
-    <value name="d16"      value="42"/>
-    <value name="d24s8"    value="43"/>
-    <value name="s8"       value="44"/>
-  </enum>
-
-  <packet code="0" name="Halt"/>
-  <packet code="1" name="NOP"/>
-  <packet code="4" name="Flush"/>
-  <packet code="5" name="Flush All State"/>
-  <packet code="6" name="Start Tile Binning"/>
-  <packet code="7" name="Increment Semaphore"/>
-  <packet code="8" name="Wait on Semaphore"/>
-  <packet code="9" name="Wait for previous frame"/>
-  <packet code="10" name="Enable Z-only rendering" cl="R"/>
-  <packet code="11" name="Disable Z-only rendering" cl="R"/>
-  <packet code="12" name="End of Z-only rendering in frame"/>
-  <packet code="13" name="End of rendering"/>
-
-  <packet code="14" name="Wait for transform feedback" cl="B">
-    <field name="Block count" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="15" name="Branch to auto-chained sub-list">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="16" name="Branch">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="17" name="Branch to Sub-list">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="18" name="Return from sub-list"/>
-  <packet code="19" name="Flush VCD cache"/>
-
-  <packet code="20" name="Start Address of Generic Tile List">
-    <field name="start" size="32" start="0" type="address"/>
-    <field name="end" size="32" start="32" type="address"/>
-  </packet>
-
-  <packet code="21" name="Branch to Implicit Tile List">
-    <field name="tile list set number" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="22" name="Branch to Explicit Supertile">
-    <field name="Absolute address of explicit supertile render list" size="32" start="24" type="address"/>
-    <field name="explicit supertile number" size="8" start="16" type="uint"/>
-    <field name="row number" size="8" start="8" type="uint"/>
-    <field name="column number" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="23" name="Supertile Coordinates">
-    <field name="row number in supertiles" size="8" start="8" type="uint"/>
-    <field name="column number in supertiles" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="25" name="Clear Tile Buffers" cl="R">
-    <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
-    <field name="Clear all Render Targets" size="1" start="0" type="bool"/>
-  </packet>
-
-  <packet code="26" name="End of Loads" cl="R"/>
-  <packet code="27" name="End of Tile Marker" cl="R"/>
-
-  <packet code="29" name="Store Tile Buffer General" cl="R">
-    <field name="Address" size="32" start="64" type="address"/>
-
-    <!-- used for y flip -->
-    <field name="Height" size="16" start="48" type="uint"/>
-
-    <!-- height in ub for UIF, byte stride for raster -->
-    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
-
-    <field name="R/B swap" size="1" start="20" type="bool"/>
-    <field name="Channel Reverse" size="1" start="19" type="bool"/>
-    <field name="Clear buffer being stored" size="1" start="18" type="bool"/>
-    <field name="Output Image Format" size="6" start="12" type="Output Image Format"/>
-
-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
-
-    <field name="A dithered" size="1" start="9" type="bool"/>
-    <field name="BGR dithered" size="1" start="8" type="bool"/>
-
-    <field name="Flip Y" size="1" start="7" type="bool"/>
-
-    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
-    <field name="Buffer to Store" size="4" start="0" type="uint">
-      <value name="Render target 0" value="0"/>
-      <value name="Render target 1" value="1"/>
-      <value name="Render target 2" value="2"/>
-      <value name="Render target 3" value="3"/>
-      <value name="None" value="8"/>
-      <value name="Z" value="9"/>
-      <value name="Stencil" value="10"/>
-      <value name="Z+Stencil" value="11"/>
-    </field>
-  </packet>
-
-  <packet code="30" name="Load Tile Buffer General" cl="R">
-    <field name="Address" size="32" start="64" type="address"/>
-
-    <!-- used for y flip -->
-    <field name="Height" size="16" start="48" type="uint"/>
-
-    <!-- height in ub for UIF, byte stride for raster -->
-    <field name="Height in UB or Stride" size="20" start="28" type="uint"/>
-
-    <field name="R/B swap" size="1" start="20" type="bool"/>
-    <field name="Channel Reverse" size="1" start="19" type="bool"/>
-
-    <field name="Input Image Format" size="6" start="12" type="Output Image Format"/>
-
-    <field name="Decimate" size="2" start="10" type="Decimate Mode"/>
-
-    <field name="Flip Y" size="1" start="7" type="bool"/>
-
-    <field name="Memory Format" size="3" start="4" type="Memory Format"/>
-    <field name="Buffer to Load" size="4" start="0" type="uint">
-      <value name="Render target 0" value="0"/>
-      <value name="Render target 1" value="1"/>
-      <value name="Render target 2" value="2"/>
-      <value name="Render target 3" value="3"/>
-      <value name="None" value="8"/>
-      <value name="Z" value="9"/>
-      <value name="Stencil" value="10"/>
-      <value name="Z+Stencil" value="11"/>
-    </field>
-  </packet>
-
-  <packet code="32" name="Indexed Primitive List" cl="B">
-    <field name="Index Offset" size="32" start="40" type="uint"/>
-
-    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
-    <field name="Length" size="31" start="8" type="uint"/>
-
-    <field name="Index type" size="2" start="6" type="uint">
-      <value name="Index type 8-bit" value="0"/>
-      <value name="Index type 16-bit" value="1"/>
-      <value name="Index type 32-bit" value="2"/>
-    </field>
-
-    <field name="mode" size="6" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="34" name="Indexed Instanced Primitive List" cl="B">
-    <field name="Index Offset" size="32" start="72" type="uint"/>
-    <field name="Number of Instances" size="32" start="40" type="uint"/>
-    <field name="Enable Primitive Restarts" size="1" start="39" type="bool"/>
-    <field name="Instance Length" size="31" start="8" type="uint"/>
-
-    <field name="Index type" size="2" start="6" type="uint">
-      <value name="Index type 8-bit" value="0"/>
-      <value name="Index type 16-bit" value="1"/>
-      <value name="Index type 32-bit" value="2"/>
-    </field>
-
-    <field name="mode" size="6" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="36" name="Vertex Array Primitives" cl="B">
-    <field name="Index of First Vertex" size="32" start="40" type="uint"/>
-    <field name="Length" size="32" start="8" type="uint"/>
-
-    <field name="mode" size="8" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="38" name="Vertex Array Instanced Primitives" cl="B">
-    <field name="Index of First Vertex" size="32" start="72" type="uint"/>
-    <field name="Number of Instances" size="32" start="40" type="uint"/>
-    <field name="Instance Length" size="32" start="8" type="uint"/>
-
-    <field name="mode" size="8" start="0" type="Primitive"/>
-  </packet>
-
-  <packet code="43" name="Base Vertex Base Instance" cl="B">
-    <field name="Base Instance" size="32" start="32" type="uint"/>
-
-    <field name="Base Vertex" size="32" start="0" type="uint"/>
-  </packet>
-
-  <packet code="44" name="Index Buffer Setup" cl="B">
-    <field name="Address" size="32" start="0" type="address"/>
-    <field name="Size" size="32" start="32" type="uint"/>
-  </packet>
-
-  <packet code="56" name="Primitive List Format">
-    <field name="data type" size="1" start="6" type="uint">
-      <value name="List Indexed" value="0"/>
-      <value name="List 32-bit X/Y" value="1"/>
-    </field>
-    <field name="primitive type" size="6" start="0" type="uint">
-      <value name="List Points" value="0"/>
-      <value name="List Lines" value="1"/>
-      <value name="List Triangles" value="2"/>
-    </field>
-  </packet>
-
-  <packet code="64" name="GL Shader State">
-    <field name="address" size="27" start="5" type="address"/>
-    <field name="number of attribute arrays" size="5" start="0" type="uint"/>
-  </packet>
-
-  <packet code="73" name="Transform Feedback Buffer">
-    <field name="Buffer Address" size="32" start="32" type="address"/>
-    <field name="Buffer Size in 32-bit words" size="30" start="2" type="uint"/>
-    <field name="Buffer Number" size="2" start="0" type="uint"/>
-  </packet>
-
-  <packet code="74" name="Transform Feedback Specs">
-    <field name="Enable" size="1" start="7" type="bool"/>
-    <field name="Number of 16-bit Output Data Specs following" size="5" start="0" type="uint"/>
-  </packet>
-
-  <packet code="75" name="Flush Transform Feedback Data"/>
-
-  <struct name="Transform Feedback Output Data Spec">
-    <field name="First Shaded Vertex Value to output" size="8" start="0" type="uint"/>
-    <field name="Number of consecutive Vertex Values to output as 32-bit values minus 1" size="4" start="8" type="uint"/>
-    <field name="Output Buffer to write to" size="2" start="12" type="uint"/>
-  </struct>
-
-  <struct name="Transform Feedback Output Address">
-    <field name="address" size="32" start="0" type="address"/>
-  </struct>
-
-  <packet code="80" name="Stencil Config">
-    <field name="Stencil Write Mask" size="8" start="32" type="uint"/>
-    <field name="Back Config" size="1" start="29" type="bool"/>
-    <field name="Front Config" size="1" start="28" type="bool"/>
-    <field name="Stencil Pass Op" size="3" start="25" type="Stencil Op"/>
-    <field name="Depth Test Fail Op" size="3" start="22" type="Stencil Op"/>
-    <field name="Stencil Test Fail Op" size="3" start="19" type="Stencil Op"/>
-    <field name="Stencil Test Function" size="3" start="16" type="Compare Function"/>
-    <field name="Stencil Test Mask" size="8" start="8" type="uint"/>
-    <field name="Stencil Ref Value" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="83" name="Blend Enables">
-    <field name="Mask" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="84" name="Blend Config">
-    <field name="VG Coverage Modes" size="2" start="28" type="uint"/>
-    <field name="Render Target Mask" size="4" start="24" type="uint"/>
-    <field name="Colour blend dst factor" size="4" start="20" type="Blend Factor"/>
-    <field name="Colour blend src factor" size="4" start="16" type="Blend Factor"/>
-    <field name="Colour blend mode" size="4" start="12" type="Blend Mode"/>
-    <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
-    <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
-    <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
-  </packet>
-
-  <packet code="86" name="Blend Constant Colour">
-    <field name="Alpha (F16)" size="16" start="48" type="uint"/>
-    <field name="Blue (F16)" size="16" start="32" type="uint"/>
-    <field name="Green (F16)" size="16" start="16" type="uint"/>
-    <field name="Red (F16)" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet code="87" name="Colour Write Masks">
-    <field name="Reserved" size="16" start="16" type="uint"/>
-    <field name="Render Target 3 per colour component write masks" size="4" start="12" type="uint"/>
-    <field name="Render Target 2 per colour component write masks" size="4" start="8" type="uint"/>
-    <field name="Render Target 1 per colour component write masks" size="4" start="4" type="uint"/>
-    <field name="Render Target 0 per colour component write masks" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="88" name="Zero All Centroid Flags"/>
-
-  <packet code="89" name="Centroid Flags">
-    <field name="Centroid Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Centroid Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Centroid Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="92" name="Occlusion Query Counter">
-    <field name="address" size="32" start="0" type="address"/>
-  </packet>
-
-  <packet code="96" name="Configuration Bits">
-    <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
-    <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
-    <field name="Blend enable" size="1" start="19" type="bool"/>
-    <field name="Stencil enable" size="1" start="18" type="bool"/>
-    <field name="Early Z updates enable" size="1" start="17" type="bool"/>
-    <field name="Early Z enable" size="1" start="16" type="bool"/>
-    <field name="Z updates enable" size="1" start="15" type="bool"/>
-    <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
-    <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
-    <field name="Coverage Update Mode" size="2" start="9" type="uint"/>
-    <field name="Coverage Pipe Select" size="1" start="8" type="bool"/>
-    <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
-    <field name="Line Rasterization" size="2" start="4" type="uint"/>
-    <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
-    <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
-    <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
-    <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
-  </packet>
-
-  <packet code="97" name="Zero All Flat Shade Flags"/>
-
-  <packet code="98" name="Flat Shade Flags">
-    <field name="Flat Shade Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Flat Shade Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Flat Shade Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="99" name="Zero All Non-perspective Flags"/>
-
-  <packet code="100" name="Non-perspective Flags">
-    <field name="Non-perspective Flags for varyings V0*24" size="24" start="8" type="uint"/>
-    <field name="Action for Non-perspective Flags of higher numbered varyings" size="2" start="6" type="Varying Flags Action"/>
-    <field name="Action for Non-perspectivey Flags of lower numbered varyings" size="2" start="4" type="Varying Flags Action"/>
-    <field name="Varying offset V0" size="4" start="0" type="uint"/>
-  </packet>
-
-  <packet code="104" name="Point size">
-    <field name="Point Size" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet code="105" name="Line width">
-    <field name="Line width" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Depth Offset" code="106">
-    <field name="Limit" size="32" start="32" type="float"/>
-    <!-- these fields are both float-1-8-7 encoded (top 16 bits of a float32) -->
-    <field name="Depth Offset Units" size="16" start="16" type="uint"/>
-    <field name="Depth Offset Factor" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet name="Clip Window" code="107">
-    <field name="Clip Window Height in pixels" size="16" start="48" type="uint"/>
-    <field name="Clip Window Width in pixels" size="16" start="32" type="uint"/>
-    <field name="Clip Window Bottom Pixel Coordinate" size="16" start="16" type="uint"/>
-    <field name="Clip Window Left Pixel Coordinate" size="16" start="0" type="uint"/>
-  </packet>
-
-  <packet name="Viewport Offset" code="108">
-    <field name="Coarse Y" size="10" start="54" type="uint"/>
-    <field name="Viewport Centre Y-coordinate" size="22" start="32" type="s14.8"/>
-    <field name="Coarse X" size="10" start="22" type="uint"/>
-    <field name="Viewport Centre X-coordinate" size="22" start="0" type="s14.8"/>
-  </packet>
-
-  <packet name="Clipper Z min/max clipping planes" code="109">
-    <field name="Maximum Zw" size="32" start="32" type="float"/>
-    <field name="Minimum Zw" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Clipper XY Scaling" code="110" cl="B">
-    <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
-    <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Clipper Z Scale and Offset" code="111" cl="B">
-    <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
-    <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
-  </packet>
-
-  <packet name="Number of Layers" code="119">
-    <field name="Number of Layers Minus 1" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="120" name="Tile Binning Mode Configuration (Part1)">
-
-    <field name="Height (in pixels minus 1)" size="12" start="48" type="uint"/>
-    <field name="Width (in pixels minus 1)" size="12" start="32" type="uint"/>
-
-    <field name="Double-buffer in non-ms mode" size="1" start="15" type="bool"/>
-    <field name="Multisample Mode (4x)" size="1" start="14" type="bool"/>
-
-    <field name="Maximum BPP of all render targets" size="2" start="12" type="uint">
-      <value name="Render target maximum 32bpp" value="0"/>
-      <value name="Render target maximum 64bpp" value="1"/>
-      <value name="Render target maximum 128bpp" value="2"/>
-    </field>
-
-    <field name="Number of Render Targets minus 1" size="4" start="8" type="uint"/>
-
-    <field name="tile allocation block size" size="2" start="4" type="uint">
-      <value name="tile allocation block size 64b" value="0"/>
-      <value name="tile allocation block size 128b" value="1"/>
-      <value name="tile allocation block size 256b" value="2"/>
-    </field>
-    <field name="tile allocation initial block size" size="2" start="2" type="uint">
-      <value name="tile allocation initial block size 64b" value="0"/>
-      <value name="tile allocation initial block size 128b" value="1"/>
-      <value name="tile allocation initial block size 256b" value="2"/>
-    </field>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Common Configuration)" cl="R">
-    <field name="Pad" size="12" start="52" type="uint"/>
-
-    <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
-    <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
-
-    <field name="Early-Z disable" size="1" start="46" type="bool"/>
-
-    <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
-      <value name="Early-Z direction LT/LE" value="0"/>
-      <value name="Early-Z direction GT/GE" value="1"/>
-    </field>
-
-    <field name="Select Coverage Mode" size="1" start="44" type="bool"/>
-    <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
-    <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
-
-    <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
-
-    <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
-    <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
-    <field name="Number of Render Targets Minus 1" size="4" start="4" type="uint"/>
-
-    <field name="sub-id" size="4" start="0" type="uint" default="0"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Render Target config)" cl="R">
-
-    <field name="Pad" size="28" start="34" type="uint"/>
-
-    <field name="Render Target 3 Clamp" size="2" start="32" type="Render Target Clamp"/>
-    <field name="Render Target 3 Internal Type" size="4" start="30" type="Internal Type"/>
-    <field name="Render Target 3 Internal BPP" size="2" start="28" type="Internal BPP"/>
-
-    <field name="Render Target 2 Clamp" size="2" start="26" type="Render Target Clamp"/>
-    <field name="Render Target 2 Internal Type" size="4" start="22" type="Internal Type"/>
-    <field name="Render Target 2 Internal BPP" size="2" start="20" type="Internal BPP"/>
-
-    <field name="Render Target 1 Clamp" size="2" start="18" type="Render Target Clamp"/>
-    <field name="Render Target 1 Internal Type" size="4" start="14" type="Internal Type"/>
-    <field name="Render Target 1 Internal BPP" size="2" start="12" type="Internal BPP"/>
-
-    <field name="Render Target 0 Clamp" size="2" start="10" type="Render Target Clamp"/>
-    <field name="Render Target 0 Internal Type" size="4" start="6" type="Internal Type"/>
-    <field name="Render Target 0 Internal BPP" size="2" start="4" type="Internal BPP"/>
-
-    <field name="sub-id" size="4" start="0" type="uint" default="1"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Z Stencil Clear Values)" cl="R">
-    <field name="unused" size="16" start="48" type="uint"/>
-
-    <field name="Z Clear Value" size="32" start="16" type="float"/>
-
-    <field name="Stencil/VG Mask Clear Value" size="8" start="8" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="2"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part1)" cl="R">
-    <!-- Express this as a 56-bit field? -->
-    <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
-    <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="3"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part2)" cl="R">
-    <!-- Express this as a 56-bit field? -->
-    <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
-    <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="4"/>
-  </packet>
-
-  <packet code="121" name="Tile Rendering Mode Configuration (Clear Colors Part3)" cl="R">
-    <field name="pad" size="11" start="53" type="uint"/>
-    <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
-    <!-- image height is for Y flipping -->
-    <field name="Raster Row Stride or Image Height in Pixels" size="16" start="24" type="uint"/>
-    <field name="Clear Color high 16 bits" size="16" start="8" type="uint"/>
-
-    <field name="Render Target number" size="4" start="4" type="uint"/>
-    <field name="sub-id" size="4" start="0" type="uint" default="5"/>
-  </packet>
-
-  <packet code="124" name="Tile Coordinates">
-    <field name="tile row number" size="12" start="12" type="uint"/>
-    <field name="tile column number" size="12" start="0" type="uint"/>
-  </packet>
-
-  <packet code="122" name="Multicore Rendering Supertile Configuration" cl="R">
-    <field name="Supertile Raster Order" size="1" start="60" type="bool"/>
-    <field name="Multicore Enable" size="1" start="56" type="bool"/>
-
-    <field name="Total Frame Height in Tiles" size="12" start="44" type="uint"/>
-    <field name="Total Frame Width in Tiles" size="12" start="32" type="uint"/>
-
-    <field name="Total Frame Height in Supertiles" size="8" start="24" type="uint"/>
-    <field name="Total Frame Width in Supertiles" size="8" start="16" type="uint"/>
-
-    <field name="Supertile Height in Tiles minus 1" size="8" start="8" type="uint"/>
-    <field name="Supertile Width in Tiles minus 1" size="8" start="0" type="uint"/>
-  </packet>
-
-  <packet code="123" name="Multicore Rendering Tile List Set Base" cl="R">
-    <field name="address" size="26" start="6" type="address"/>
-    <field name="Tile List Set Number" size="4" start="0" type="uint"/>
-  </packet>
-
-  <!-- add fields -->
-  <packet code="125" name="Tile Coordinates Implicit"/>
-
-  <packet code="126" name="Tile List Initial Block Size">
-    <field name="Use auto-chained tile lists" size="1" start="2" type="bool"/>
-
-    <field name="Size of first block in chained tile lists" size="2" start="0" type="uint">
-      <value name="tile allocation block size 64b" value="0"/>
-      <value name="tile allocation block size 128b" value="1"/>
-      <value name="tile allocation block size 256b" value="2"/>
-    </field>
-  </packet>
-
-  <struct name="Geometry Shader State Record">
-    <field name="Geometry Bin Mode Shader Code Address" size="32" start="0b" type="address"/>
-    <field name="4-way threadable" size="1" start="0" type="bool"/>
-    <field name="Start in final thread section" size="1" start="1" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="2" type="bool"/>
-    <field name="Geometry Bin Mode Shader Uniforms Address" size="32" start="4b" type="address"/>
-    <field name="Geometry Render Mode Shader Code Address" size="32" start="8b" type="address"/>
-    <field name="Geometry Render Mode Shader Uniforms Address" size="32" start="12b" type="address"/>
-  </struct>
-
-  <struct name="Tessellation Shader State Record">
-    <field name="Tessellation Bin Mode Control Shader Code Address" size="32" start="0b" type="address"/>
-    <field name="Tessellation Bin Mode Control Shader Uniforms Address" size="32" start="4b" type="address"/>
-    <field name="Tessellation Render Mode Control Shader Code Address" size="32" start="8b" type="address"/>
-    <field name="Tessellation Render Mode Control Shader Uniforms Address" size="32" start="12b" type="address"/>
-
-    <field name="Tessellation Bin Mode Evaluation Shader Code Address" size="32" start="16b" type="address"/>
-    <field name="Tessellation Bin Mode Evaluation Shader Uniforms Address" size="32" start="20b" type="address"/>
-    <field name="Tessellation Render Mode Evaluation Shader Code Address" size="32" start="24b" type="address"/>
-    <field name="Tessellation Render Mode Evaluation Shader Uniforms Address" size="32" start="28b" type="address"/>
-  </struct>
-
-  <struct name="GL Shader State Record">
-    <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
-    <field name="Enable clipping" size="1" start="1" type="bool"/>
-
-    <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
-    <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
-    <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
-    <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
-    <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
-    <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
-
-    <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
-    <field name="Turn off early-z test" size="1" start="9" type="bool"/>
-    <field name="Coordinate shader has separate input and output VPM blocks" size="1" start="10" type="bool"/>
-    <field name="Vertex shader has separate input and output VPM blocks" size="1" start="11" type="bool"/>
-    <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
-    <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
-    <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
-    <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
-    <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
-    <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
-    <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
-
-    <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
-
-    <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
-    <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
-
-    <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
-    <field name="Min Coord Shader input segments required in play minus 1" size="4" start="44" type="uint"/>
-
-    <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
-    <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
-
-    <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
-    <field name="Min Vertex Shader input segments required in play minus 1" size="4" start="60" type="uint"/>
-
-    <field name="Address of default attribute values" size="32" start="8b" type="address"/>
-
-    <field name="Fragment Shader Code Address" size="32" start="12b" type="address"/>
-    <field name="Fragment Shader 4-way threadable" size="1" start="96" type="bool"/>
-    <field name="Fragment Shader start in final thread section" size="1" start="97" type="bool"/>
-    <field name="Propagate NaNs" size="1" start="98" type="bool"/>
-    <field name="Fragment Shader Uniforms Address" size="32" start="16b" type="address"/>
-
-    <field name="Vertex Shader Code Address" size="32" start="20b" type="address"/>
-    <field name="Vertex Shader 4-way threadable" size="1" start="160" type="bool"/>
-    <field name="Vertex Shader start in final thread section" size="1" start="161" type="bool"/>
-    <field name="Vertex Shader Uniforms Address" size="32" start="24b" type="address"/>
-
-    <field name="Coordinate Shader Code Address" size="32" start="28b" type="address"/>
-    <field name="Coordinate Shader 4-way threadable" size="1" start="224" type="bool"/>
-    <field name="Coordinate Shader start in final thread section" size="1" start="225" type="bool"/>
-    <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
-  </struct>
-
-  <struct name="GL Shader State Attribute Record">
-    <field name="Address" size="32" start="0" type="address"/>
-
-    <field name="Vec size" size="2" start="32" type="uint"/>
-    <field name="Type" size="3" start="34" type="uint">
-      <value name="Attribute half-float" value="1"/>
-      <value name="Attribute float" value="2"/>
-      <value name="Attribute fixed" value="3"/>
-      <value name="Attribute byte" value="4"/>
-      <value name="Attribute short" value="5"/>
-      <value name="Attribute int" value="6"/>
-      <value name="Attribute int2_10_10_10" value="7"/>
-    </field>
-    <field name="Signed int type" size="1" start="37" type="bool"/>
-    <field name="Normalized int type" size="1" start="38" type="bool"/>
-    <field name="Read as int/uint" size="1" start="39" type="bool"/>
-
-    <field name="Number of values read by Coordinate shader" size="4" start="40" type="uint"/>
-    <field name="Number of values read by Vertex shader" size="4" start="44" type="uint"/>
-
-    <field name="Instance Divisor" size="16" start="6b" type="uint"/>
-    <field name="Stride" size="32" start="8b" type="uint"/>
-    <field name="Maximum Index" size="32" start="12b" type="uint"/>
-  </struct>
-
-  <struct name="VPM generic block write setup">
-    <field name="id" size="2" start="30" type="uint" default="0"/>
-    <field name="id0" size="3" start="27" type="uint" default="0"/>
-
-    <field name="horiz" size="1" start="24" type="bool"/>
-    <field name="laned" size="1" start="23" type="bool"/>
-    <field name="segs" size="1" start="22" type="bool"/>
-    <field name="stride" size="7" start="15" type="int"/>
-
-    <field name="size" size="2" start="13" type="uint">
-      <value name="VPM setup size 8-bit" value="0"/>
-      <value name="VPM setup size 16-bit" value="1"/>
-      <value name="VPM setup size 32-bit" value="2"/>
-    </field>
-
-    <field name="addr" size="13" start="0" type="uint"/>
-  </struct>
-
-  <struct name="VPM generic block read setup">
-    <field name="id" size="2" start="30" type="uint" default="1"/>
-
-    <field name="horiz" size="1" start="29" type="bool"/>
-    <field name="laned" size="1" start="28" type="bool"/>
-    <field name="segs" size="1" start="27" type="bool"/>
-    <field name="num" size="5" start="22" type="uint"/>
-    <field name="stride" size="7" start="15" type="int"/>
-
-    <field name="size" size="2" start="13" type="uint">
-      <value name="VPM setup size 8-bit" value="0"/>
-      <value name="VPM setup size 16-bit" value="1"/>
-      <value name="VPM setup size 32-bit" value="2"/>
-    </field>
-
-    <field name="addr" size="13" start="0" type="uint"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 0">
-    <field name="Texture state address" size="32" start="0" type="address"/>
-    <field name="Return words of texture data" size="4" start="0" type="uint"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 1">
-    <field name="Sampler state address" size="32" start="0" type="address"/>
-    <field name="Per-pixel mask enable" size="1" start="2" type="bool"/>
-    <field name="Unnormalized coordinates" size="1" start="1" type="bool"/>
-    <field name="Output Type 32-bit" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="TMU Config Parameter 2">
-    <field name="Pad" size="23" start="9" type="uint"/>
-    <field name="LOD Query" size="1" start="8" type="bool"/>
-    <field name="Op" size="4" start="20" type="TMU Op"/>
-    <field name="Offset R" size="4" start="16" type="int"/>
-    <field name="Offset T" size="4" start="12" type="int"/>
-    <field name="Offset S" size="4" start="8" type="int"/>
-    <field name="Gather Mode" size="1" start="7" type="bool"/>
-    <field name="Gather Component" size="2" start="5" type="uint"/>
-    <field name="Coefficient Mode" size="1" start="4" type="bool"/>
-    <field name="Sample Number" size="2" start="2" type="uint"/>
-    <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
-    <field name="Offset Format 8" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="Texture Shader State">
-    <field name="Pad" size="56" start="136" type="uint"/>
-    <field name="UIF XOR disable" size="1" start="135" type="bool"/>
-    <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
-    <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
-    <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
-
-    <field name="Base Level" size="4" start="124" type="uint"/>
-    <field name="Max Level" size="4" start="120" type="uint"/>
-
-    <field name="Swizzle A" size="3" start="117" type="uint">
-      <value name="Swizzle Zero" value="0"/>
-      <value name="Swizzle One" value="1"/>
-      <value name="Swizzle Red" value="2"/>
-      <value name="Swizzle Green" value="3"/>
-      <value name="Swizzle Blue" value="4"/>
-      <value name="Swizzle Alpha" value="5"/>
-    </field>
-
-    <field name="Swizzle B" size="3" start="114" type="uint"/>
-    <field name="Swizzle G" size="3" start="111" type="uint"/>
-    <field name="Swizzle R" size="3" start="108" type="uint"/>
-    <field name="Extended" size="1" start="107" type="bool"/>
-
-    <field name="Texture type" size="7" start="100" type="uint"/>
-    <field name="Image Depth" size="14" start="86" type="uint"/>
-    <field name="Image Height" size="14" start="72" type="uint"/>
-    <field name="Image Width" size="14" start="58" type="uint"/>
-
-    <field name="Array Stride (64-byte aligned)" size="26" start="32" type="uint"/>
-
-    <field name="Texture base pointer" size="32" start="0" type="address"/>
-
-    <field name="Reverse Standard Border Colour" size="1" start="5" type="bool"/>
-    <field name="AHDR" size="1" start="4" type="bool"/>
-    <field name="sRGB" size="1" start="3" type="bool"/>
-    <field name="Flip S and T on incoming request" size="1" start="2" type="bool"/>
-    <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
-    <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
-  </struct>
-
-  <struct name="Sampler State">
-    <field name="Border colour Alpha" size="32" start="160" type="uint"/>
-    <field name="Border colour Blue" size="32" start="128" type="uint"/>
-    <field name="Border colour Green" size="32" start="96" type="uint"/>
-    <field name="Border colour Red" size="32" start="64" type="uint"/>
-
-    <field name="Maximum Anisotropy" size="2" start="61" type="uint"/>
-    <field name="Border Colour Mode" size="3" start="58" type="Border Colour Mode"/>
-    <field name="Wrap I Border" size="1" start="57" type="bool"/>
-    <field name="Wrap R" size="3" start="54" type="Wrap Mode"/>
-    <field name="Wrap T" size="3" start="51" type="Wrap Mode"/>
-    <field name="Wrap S" size="3" start="48" type="Wrap Mode"/>
-
-    <field name="Fixed Bias" size="16" start="32" type="s8.8"/>
-    <field name="Max Level-of-Detail" size="12" start="20" type="u4.8"/>
-    <field name="Min Level-of-Detail" size="12" start="8" type="u4.8"/>
-
-    <field name="sRGB Disable" size="1" start="7" type="bool"/>
-
-    <field name="Depth Compare Function" size="3" start="4" type="Compare Function"/>
-
-    <field name="Anisotropy Enable" size="1" start="3" type="bool"/>
-    <field name="Mip filter Nearest" size="1" start="2" type="bool"/>
-    <field name="Min filter Nearest" size="1" start="1" type="bool"/>
-    <field name="Mag filter Nearest" size="1" start="0" type="bool"/>
-  </struct>
-
-  <enum name="Texture Data Formats">
-    <!--
-	most formats here have R in the low bits, A in the high bits.
-	Exceptions noted.
-    -->
-    <value name="Texture Data Format R8" value="0"/>
-    <value name="Texture Data Format R8 SNORM" value="1"/>
-    <value name="Texture Data Format RG8" value="2"/>
-    <value name="Texture Data Format RG8 SNORM" value="3"/>
-    <value name="Texture Data Format RGBA8" value="4"/>
-    <value name="Texture Data Format RGBA8 SNORM" value="5"/>
-    <value name="Texture Data Format RGB565" value="6"/> <!-- B in low bits -->
-    <value name="Texture Data Format RGBA4" value="7"/> <!-- A low, R high -->
-    <value name="Texture Data Format RGB5_A1" value="8"/> <!-- A low, R high -->
-    <value name="Texture Data Format RGB10_A2" value="9"/> <!-- R low, A high -->
-    <value name="Texture Data Format R16" value="10"/>
-    <value name="Texture Data Format R16 SNORM" value="11"/>
-    <value name="Texture Data Format RG16" value="12"/>
-    <value name="Texture Data Format RG16 SNORM" value="13"/>
-    <value name="Texture Data Format RGBA16" value="14"/>
-    <value name="Texture Data Format RGBA16 SNORM" value="15"/>
-    <value name="Texture Data Format R16F" value="16"/>
-    <value name="Texture Data Format RG16F" value="17"/>
-    <value name="Texture Data Format RGBA16F" value="18"/>
-    <value name="Texture Data Format R11F_G11F_B10F" value="19"/>
-    <value name="Texture Data Format RGB9_E5" value="20"/>
-    <value name="Texture Data Format DEPTH COMP16" value="21"/>
-    <value name="Texture Data Format DEPTH COMP24" value="22"/>
-    <value name="Texture Data Format DEPTH COMP32F" value="23"/>
-    <value name="Texture Data Format DEPTH24_X8" value="24"/> <!-- X low, D high -->
-    <value name="Texture Data Format R4" value="25"/>
-    <value name="Texture Data Format R1" value="26"/>
-    <!-- generic unfiltered 8-bit sample -->
-    <value name="Texture Data Format S8" value="27"/>
-    <!-- generic unfiltered 16-bit sample -->
-    <value name="Texture Data Format S16" value="28"/>
-    <!-- generic unfiltered 32-bit sample -->
-    <value name="Texture Data Format R32F" value="29"/>
-    <!-- generic unfiltered 64-bit sample -->
-    <value name="Texture Data Format RG32F" value="30"/>
-    <!-- generic unfiltered 128-bit sample -->
-    <value name="Texture Data Format RGBA32F" value="31"/>
-
-    <value name="Texture Data Format RGB8_ETC2" value="32"/>
-    <value name="Texture Data Format RGB8_PUNCHTHROUGH_ALPHA1" value="33"/>
-
-    <value name="Texture Data Format R11_EAC" value="34"/>
-    <value name="Texture Data Format SIGNED_R11_EAC" value="35"/>
-    <value name="Texture Data Format RG11_EAC" value="36"/>
-    <value name="Texture Data Format SIGNED_RG11_EAC" value="37"/>
-
-    <value name="Texture Data Format RGBA8_ETC2_EAC" value="38"/>
-    <value name="Texture Data Format YCBCR_LUMA" value="39"/>
-    <value name="Texture Data Format YCBCR_420_CHROMA" value="40"/>
-
-    <value name="Texture Data Format BC1" value="48"/>
-    <value name="Texture Data Format BC2" value="49"/>
-    <value name="Texture Data Format BC3" value="50"/>
-
-    <value name="Texture Data Format ASTC_4x4" value="64"/>
-    <value name="Texture Data Format ASTC_5x4" value="65"/>
-    <value name="Texture Data Format ASTC_5x5" value="66"/>
-    <value name="Texture Data Format ASTC_6x5" value="67"/>
-    <value name="Texture Data Format ASTC_6x6" value="68"/>
-    <value name="Texture Data Format ASTC_8x5" value="69"/>
-    <value name="Texture Data Format ASTC_8x6" value="70"/>
-    <value name="Texture Data Format ASTC_8x8" value="71"/>
-    <value name="Texture Data Format ASTC_10x5" value="72"/>
-    <value name="Texture Data Format ASTC_10x6" value="73"/>
-    <value name="Texture Data Format ASTC_10x8" value="74"/>
-    <value name="Texture Data Format ASTC_10x10" value="75"/>
-    <value name="Texture Data Format ASTC_12x10" value="76"/>
-    <value name="Texture Data Format ASTC_12x12" value="77"/>
-
-    <value name="Texture Data Format R8I" value="96"/>
-    <value name="Texture Data Format R8UI" value="97"/>
-    <value name="Texture Data Format RG8I" value="98"/>
-    <value name="Texture Data Format RG8UI" value="99"/>
-    <value name="Texture Data Format RGBA8I" value="100"/>
-    <value name="Texture Data Format RGBA8UI" value="101"/>
-
-    <value name="Texture Data Format R16I" value="102"/>
-    <value name="Texture Data Format R16UI" value="103"/>
-    <value name="Texture Data Format RG16I" value="104"/>
-    <value name="Texture Data Format RG16UI" value="105"/>
-    <value name="Texture Data Format RGBA16I" value="106"/>
-    <value name="Texture Data Format RGBA16UI" value="107"/>
-
-    <value name="Texture Data Format R32I" value="108"/>
-    <value name="Texture Data Format R32UI" value="109"/>
-    <value name="Texture Data Format RG32I" value="110"/>
-    <value name="Texture Data Format RG32UI" value="111"/>
-    <value name="Texture Data Format RGBA32I" value="112"/>
-    <value name="Texture Data Format RGBA32UI" value="113"/>
-    <value name="Texture Data Format RGB10_A2UI" value="114"/>
-
-  </enum>
-</vcxml>
diff --git a/src/broadcom/clif/clif_dump.c b/src/broadcom/clif/clif_dump.c
index 1f57a6d..2bc73e6 100644
--- a/src/broadcom/clif/clif_dump.c
+++ b/src/broadcom/clif/clif_dump.c
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "drm-uapi/v3d_drm.h"
 #include "clif_dump.h"
 #include "clif_private.h"
 #include "util/list.h"
@@ -51,17 +52,14 @@
 
 struct clif_dump *
 clif_dump_init(const struct v3d_device_info *devinfo,
-               FILE *out,
-               bool (*lookup_vaddr)(void *data, uint32_t addr, void **vaddr),
-               void *data)
+               FILE *out, bool pretty)
 {
         struct clif_dump *clif = rzalloc(NULL, struct clif_dump);
 
         clif->devinfo = devinfo;
-        clif->lookup_vaddr = lookup_vaddr;
         clif->out = out;
-        clif->data = data;
         clif->spec = v3d_spec_load(devinfo);
+        clif->pretty = pretty;
 
         list_inithead(&clif->worklist);
 
@@ -74,83 +72,289 @@
         ralloc_free(clif);
 }
 
+/* Returns the BO whose address range contains addr, or NULL if none does. */
+struct clif_bo *
+clif_lookup_bo(struct clif_dump *clif, uint32_t addr)
+{
+        for (int i = 0; i < clif->bo_count; i++) {
+                struct clif_bo *bo = &clif->bo[i];
+                /* offset + size would wrap for a BO ending at 4GB, so
+                 * compare the distance into the BO against its size. */
+                if (addr >= bo->offset && addr - bo->offset < bo->size)
+                        return bo;
+        }
+
+        return NULL;
+}
+
+static bool
+clif_lookup_vaddr(struct clif_dump *clif, uint32_t addr, void **vaddr)
+{
+        struct clif_bo *bo = clif_lookup_bo(clif, addr);
+        if (!bo)
+                return false;
+
+        *vaddr = bo->vaddr + addr - bo->offset;
+        return true;
+}
+
 #define out_uint(_clif, field) out(_clif, "    /* %s = */ %u\n",        \
                             #field,  values-> field);
 
 static bool
 clif_dump_packet(struct clif_dump *clif, uint32_t offset, const uint8_t *cl,
-                 uint32_t *size)
+                 uint32_t *size, bool reloc_mode)
 {
         if (clif->devinfo->ver >= 41)
-                return v3d41_clif_dump_packet(clif, offset, cl, size);
+                return v3d41_clif_dump_packet(clif, offset, cl, size, reloc_mode);
         else
-                return v3d33_clif_dump_packet(clif, offset, cl, size);
+                return v3d33_clif_dump_packet(clif, offset, cl, size, reloc_mode);
 }
 
-static void
-clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end)
+static uint32_t
+clif_dump_cl(struct clif_dump *clif, uint32_t start, uint32_t end,
+             bool reloc_mode)
 {
-        void *start_vaddr;
-        if (!clif->lookup_vaddr(clif->data, start, &start_vaddr)) {
+        struct clif_bo *bo = clif_lookup_bo(clif, start);
+        if (!bo) {
                 out(clif, "Failed to look up address 0x%08x\n",
                     start);
-                return;
+                return 0;
         }
 
+        void *start_vaddr = bo->vaddr + start - bo->offset;
+
         /* The end address is optional (for example, a BRANCH instruction
          * won't set an end), but is used for BCL/RCL termination.
          */
         void *end_vaddr = NULL;
-        if (end && !clif->lookup_vaddr(clif->data, end, &end_vaddr)) {
+        if (end && !clif_lookup_vaddr(clif, end, &end_vaddr)) {
                 out(clif, "Failed to look up address 0x%08x\n",
                     end);
-                return;
+                return 0;
         }
 
+        if (!reloc_mode)
+                out(clif, "@format ctrllist  /* [%s+0x%08x] */\n",
+                    bo->name, start - bo->offset);
+
         uint32_t size;
         uint8_t *cl = start_vaddr;
-        while (clif_dump_packet(clif, start, cl, &size)) {
+        while (clif_dump_packet(clif, start, cl, &size, reloc_mode)) {
                 cl += size;
                 start += size;
 
                 if (cl == end_vaddr)
                         break;
         }
+
+        return (void *)cl - bo->vaddr;
+}
+
+/* Prints the GL shader state record at vaddr and its num_attrs attribute
+ * records, returning the combined length of the structs that were printed.
+ */
+static uint32_t
+clif_dump_gl_shader_state_record(struct clif_dump *clif,
+                                 struct reloc_worklist_entry *reloc,
+                                 void *vaddr)
+{
+        struct v3d_group *state = v3d_spec_find_struct(clif->spec,
+                                                       "GL Shader State Record");
+        struct v3d_group *attr = v3d_spec_find_struct(clif->spec,
+                                                      "GL Shader State Attribute Record");
+        assert(state);
+        assert(attr);
+        uint32_t offset = 0;
+
+        out(clif, "@format shadrec_gl_main\n");
+        v3d_print_group(clif, state, 0, vaddr + offset);
+        offset += v3d_group_get_length(state);
+
+        for (int i = 0; i < reloc->shader_state.num_attrs; i++) {
+                out(clif, "@format shadrec_gl_attr /* %d */\n", i);
+                v3d_print_group(clif, attr, 0, vaddr + offset);
+                offset += v3d_group_get_length(attr);
+        }
+
+        return offset;
 }
 
 static void
 clif_process_worklist(struct clif_dump *clif)
 {
-        while (!list_empty(&clif->worklist)) {
-                struct reloc_worklist_entry *reloc =
-                        list_first_entry(&clif->worklist,
-                                         struct reloc_worklist_entry, link);
-                list_del(&reloc->link);
-
+        list_for_each_entry_safe(struct reloc_worklist_entry, reloc,
+                                 &clif->worklist, link) {
                 void *vaddr;
-                if (!clif->lookup_vaddr(clif->data, reloc->addr, &vaddr)) {
+                if (!clif_lookup_vaddr(clif, reloc->addr, &vaddr)) {
                         out(clif, "Failed to look up address 0x%08x\n",
                             reloc->addr);
                         continue;
                 }
 
                 switch (reloc->type) {
+                case reloc_cl:
+                        clif_dump_cl(clif, reloc->addr, reloc->cl.end, true);
+                        break;
+
                 case reloc_gl_shader_state:
-                        if (clif->devinfo->ver >= 41) {
-                                v3d41_clif_dump_gl_shader_state_record(clif,
-                                                                       reloc,
-                                                                       vaddr);
-                        } else {
-                                v3d33_clif_dump_gl_shader_state_record(clif,
-                                                                       reloc,
-                                                                       vaddr);
-                        }
                         break;
                 case reloc_generic_tile_list:
                         clif_dump_cl(clif, reloc->addr,
-                                     reloc->generic_tile_list.end);
+                                     reloc->generic_tile_list.end, true);
                         break;
                 }
+        }
+}
+
+static int /* qsort() order by addr; an unsigned difference would wrap as int */
+worklist_entry_compare(const void *a, const void *b)
+{
+        const struct reloc_worklist_entry *const *x = a, *const *y = b;
+        return ((*x)->addr > (*y)->addr) - ((*x)->addr < (*y)->addr);
+}
+
+/* Emits "@format blank" and returns true if bytes [start, end) of bo are 0. */
+static bool
+clif_dump_if_blank(struct clif_dump *clif, struct clif_bo *bo,
+                   uint32_t start, uint32_t end)
+{
+        for (uint32_t i = start; i < end; i++) {
+                if (((uint8_t *)bo->vaddr)[i] != 0)
+                        return false;
+        }
+        out(clif, "\n");
+        out(clif, "@format blank %u /* [%s+0x%08x..0x%08x] */\n", end - start,
+            bo->name, start, end - 1);
+        return true;
+}
+
+/* Dumps the binary data in the BO from start to end (relative to the start of
+ * the BO).
+ */
+static void
+clif_dump_binary(struct clif_dump *clif, struct clif_bo *bo,
+                 uint32_t start, uint32_t end)
+{
+        if (start == end)
+                return;
+
+        if (clif_dump_if_blank(clif, bo, start, end))
+                return;
+
+        out(clif, "@format binary /* [%s+0x%08x] */\n",
+            bo->name, start);
+
+        uint32_t offset = start;
+        int dumped_in_line = 0;
+        while (offset < end) {
+                if (clif_dump_if_blank(clif, bo, offset, end))
+                        return;
+
+                if (end - offset >= 4) {
+                        out(clif, "0x%08x ", *(uint32_t *)(bo->vaddr + offset));
+                        offset += 4;
+                } else {
+                        out(clif, "0x%02x ", *(uint8_t *)(bo->vaddr + offset));
+                        offset++;
+                }
+
+                if (++dumped_in_line == 8) {
+                        out(clif, "\n");
+                        dumped_in_line = 0;
+                }
+        }
+        if (dumped_in_line)
+                out(clif, "\n");
+}
+
+/* Walks the list of relocations, dumping each buffer's contents (using our
+ * codegenned dump routines for pretty printing, and most importantly proper
+ * address references so that the CLIF parser can relocate buffers).
+ */
+static void
+clif_dump_buffers(struct clif_dump *clif)
+{
+        int num_relocs = 0;
+        list_for_each_entry(struct reloc_worklist_entry, reloc,
+                            &clif->worklist, link) {
+                num_relocs++;
+        }
+        struct reloc_worklist_entry **relocs =
+                ralloc_array(clif, struct reloc_worklist_entry *, num_relocs);
+        int i = 0;
+        list_for_each_entry(struct reloc_worklist_entry, reloc,
+                            &clif->worklist, link) {
+                relocs[i++] = reloc;
+        }
+        qsort(relocs, num_relocs, sizeof(*relocs), worklist_entry_compare);
+
+        struct clif_bo *bo = NULL;
+        uint32_t offset = 0;
+
+        for (i = 0; i < num_relocs; i++) {
+                struct reloc_worklist_entry *reloc = relocs[i];
+                struct clif_bo *new_bo = clif_lookup_bo(clif, reloc->addr);
+
+                if (!new_bo) {
+                        out(clif, "Failed to look up address 0x%08x\n",
+                            reloc->addr);
+                        continue;
+                }
+
+                if (new_bo != bo) {
+                        if (bo) {
+                                /* Finish out the last of the last BO. */
+                                clif_dump_binary(clif, bo,
+                                                 offset,
+                                                 bo->size);
+                        }
+
+                        out(clif, "\n");
+                        out(clif, "@buffer %s\n", new_bo->name);
+                        bo = new_bo;
+                        offset = 0;
+                        bo->dumped = true;
+                }
+
+                int reloc_offset = reloc->addr - bo->offset;
+                if (offset != reloc_offset)
+                        clif_dump_binary(clif, bo, offset, reloc_offset);
+                offset = reloc_offset;
+
+                switch (reloc->type) {
+                case reloc_cl:
+                        offset = clif_dump_cl(clif, reloc->addr, reloc->cl.end,
+                                              false);
+                        out(clif, "\n");
+                        break;
+
+                case reloc_gl_shader_state:
+                        offset += clif_dump_gl_shader_state_record(clif,
+                                                                   reloc,
+                                                                   bo->vaddr +
+                                                                   offset);
+                        break;
+                case reloc_generic_tile_list:
+                        offset = clif_dump_cl(clif, reloc->addr,
+                                              reloc->generic_tile_list.end,
+                                              false);
+                        break;
+                }
+                out(clif, "\n");
+        }
+
+        if (bo) {
+                clif_dump_binary(clif, bo, offset, bo->size);
+        }
+
+        /* For any BOs that didn't have relocations, just dump them raw. */
+        for (int i = 0; i < clif->bo_count; i++) {
+                bo = &clif->bo[i];
+                if (bo->dumped)
+                        continue;
+                out(clif, "@buffer %s\n", bo->name);
+                clif_dump_binary(clif, bo, 0, bo->size);
                 out(clif, "\n");
         }
 }
@@ -158,8 +362,83 @@
 void
 clif_dump_add_cl(struct clif_dump *clif, uint32_t start, uint32_t end)
 {
-        clif_dump_cl(clif, start, end);
-        out(clif, "\n");
+        struct reloc_worklist_entry *entry =
+                clif_dump_add_address_to_worklist(clif, reloc_cl, start);
 
+        entry->cl.end = end;
+}
+
+static int clif_bo_offset_compare(const void *a, const void *b)
+{
+        const struct clif_bo *x = a, *y = b; /* qsort(): ascending offset */
+        return (x->offset > y->offset) - (x->offset < y->offset); /* no wrap */
+}
+
+void
+clif_dump(struct clif_dump *clif, const struct drm_v3d_submit_cl *submit)
+{
+        clif_dump_add_cl(clif, submit->bcl_start, submit->bcl_end);
+        clif_dump_add_cl(clif, submit->rcl_start, submit->rcl_end);
+
+        qsort(clif->bo, clif->bo_count, sizeof(clif->bo[0]),
+              clif_bo_offset_compare);
+
+        /* A buffer needs to be defined before we can emit a CLIF address
+         * referencing it, so emit them all now.
+         */
+        for (int i = 0; i < clif->bo_count; i++) {
+                out(clif, "@createbuf_aligned 4096 %s\n", clif->bo[i].name);
+        }
+
+        /* Walk the worklist figuring out the locations of structs based on
+         * the CL contents.
+         */
         clif_process_worklist(clif);
+
+        /* Dump the contents of the buffers using the relocations we found to
+         * pretty-print structures.
+         */
+        clif_dump_buffers(clif);
+
+        out(clif, "@add_bin 0\n  ");
+        out_address(clif, submit->bcl_start);
+        out(clif, "\n  ");
+        out_address(clif, submit->bcl_end);
+        out(clif, "\n  ");
+        out_address(clif, submit->qma);
+        out(clif, "\n  %d\n  ", submit->qms);
+        out_address(clif, submit->qts);
+        out(clif, "\n");
+        out(clif, "@wait_bin_all_cores\n");
+
+        out(clif, "@add_render 0\n  ");
+        out_address(clif, submit->rcl_start);
+        out(clif, "\n  ");
+        out_address(clif, submit->rcl_end);
+        out(clif, "\n  ");
+        out_address(clif, submit->qma);
+        out(clif, "\n");
+        out(clif, "@wait_render_all_cores\n");
+}
+
+void
+clif_dump_add_bo(struct clif_dump *clif, const char *name,
+                 uint32_t offset, uint32_t size, void *vaddr)
+{
+        if (clif->bo_count >= clif->bo_array_size) {
+                clif->bo_array_size = MAX2(4, clif->bo_array_size * 2);
+                clif->bo = reralloc(clif, clif->bo, struct clif_bo,
+                                    clif->bo_array_size);
+        }
+
+        /* CLIF relocs use the buffer name, so make sure they're unique. */
+        for (int i = 0; i < clif->bo_count; i++)
+                assert(strcmp(clif->bo[i].name, name) != 0);
+
+        clif->bo[clif->bo_count].name = ralloc_strdup(clif, name);
+        clif->bo[clif->bo_count].offset = offset;
+        clif->bo[clif->bo_count].size = size;
+        clif->bo[clif->bo_count].vaddr = vaddr;
+        clif->bo[clif->bo_count].dumped = false;
+        clif->bo_count++;
 }
diff --git a/src/broadcom/clif/clif_dump.h b/src/broadcom/clif/clif_dump.h
index d46cc84..8de3a2c 100644
--- a/src/broadcom/clif/clif_dump.h
+++ b/src/broadcom/clif/clif_dump.h
@@ -29,14 +29,15 @@
 
 struct v3d_device_info;
 struct clif_dump;
+struct drm_v3d_submit_cl;
 
 struct clif_dump *clif_dump_init(const struct v3d_device_info *devinfo,
-                                 FILE *output,
-                                 bool (*lookup_vaddr)(void *data, uint32_t addr,
-                                                      void **vaddr),
-                                 void *data);
+                                 FILE *output, bool pretty);
+void clif_dump(struct clif_dump *clif, const struct drm_v3d_submit_cl *submit);
 void clif_dump_destroy(struct clif_dump *clif);
 
+void clif_dump_add_bo(struct clif_dump *clif, const char *name,
+                      uint32_t offset, uint32_t size, void *vaddr);
 void clif_dump_add_cl(struct clif_dump *clif, uint32_t start, uint32_t end);
 
 #endif
diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
index 87de176..597d0b5 100644
--- a/src/broadcom/clif/clif_private.h
+++ b/src/broadcom/clif/clif_private.h
@@ -28,20 +28,36 @@
 #include <stdarg.h>
 #include "util/list.h"
 
+struct clif_bo {
+        const char *name;
+        uint32_t offset;
+        uint32_t size;
+        void *vaddr;
+        bool dumped;
+};
+
 struct clif_dump {
         const struct v3d_device_info *devinfo;
-        bool (*lookup_vaddr)(void *data, uint32_t addr, void **vaddr);
         FILE *out;
-        /* Opaque data from the caller that is passed to the callbacks. */
-        void *data;
 
         struct v3d_spec *spec;
 
         /* List of struct reloc_worklist_entry */
         struct list_head worklist;
+
+        struct clif_bo *bo;
+        int bo_count;
+        int bo_array_size;
+
+        /**
+         * Flag to switch from CLIF ABI to slightly more human-readable
+         * output.
+         */
+        bool pretty;
 };
 
 enum reloc_worklist_type {
+        reloc_cl,
         reloc_gl_shader_state,
         reloc_generic_tile_list,
 };
@@ -54,6 +70,9 @@
 
         union {
                 struct {
+                        uint32_t end;
+                } cl;
+                struct {
                         uint32_t num_attrs;
                 } shader_state;
                 struct {
@@ -62,26 +81,20 @@
         };
 };
 
+struct clif_bo *
+clif_lookup_bo(struct clif_dump *clif, uint32_t addr);
+
 struct reloc_worklist_entry *
 clif_dump_add_address_to_worklist(struct clif_dump *clif,
                                   enum reloc_worklist_type type,
                                   uint32_t addr);
 
 bool v3d33_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
-                            const uint8_t *cl, uint32_t *size);
-void v3d33_clif_dump_gl_shader_state_record(struct clif_dump *clif,
-                                            struct reloc_worklist_entry *reloc,
-                                            void *vaddr);
+                            const uint8_t *cl, uint32_t *size, bool reloc_mode);
 bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
-                            const uint8_t *cl, uint32_t *size);
-void v3d41_clif_dump_gl_shader_state_record(struct clif_dump *clif,
-                                            struct reloc_worklist_entry *reloc,
-                                            void *vaddr);
+                            const uint8_t *cl, uint32_t *size, bool reloc_mode);
 bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
-                            const uint8_t *cl, uint32_t *size);
-void v3d42_clif_dump_gl_shader_state_record(struct clif_dump *clif,
-                                            struct reloc_worklist_entry *reloc,
-                                            void *vaddr);
+                            const uint8_t *cl, uint32_t *size, bool reloc_mode);
 
 static inline void
 out(struct clif_dump *clif, const char *fmt, ...)
@@ -93,4 +106,18 @@
         va_end(args);
 }
 
+static inline void
+out_address(struct clif_dump *clif, uint32_t addr)
+{
+        struct clif_bo *bo = clif_lookup_bo(clif, addr);
+        if (bo) {
+                out(clif, "[%s+0x%08x] /* 0x%08x */",
+                    bo->name, addr - bo->offset, addr);
+        } else if (addr) {
+                out(clif, "/* XXX: BO unknown */ 0x%08x", addr);
+        } else {
+                out(clif, "[null]");
+        }
+}
+
 #endif /* CLIF_PRIVATE_H */
diff --git a/src/broadcom/clif/v3dx_dump.c b/src/broadcom/clif/v3dx_dump.c
index b1d1655..9cf59f8 100644
--- a/src/broadcom/clif/v3dx_dump.c
+++ b/src/broadcom/clif/v3dx_dump.c
@@ -21,7 +21,10 @@
  * IN THE SOFTWARE.
  */
 
+#include <ctype.h>
+#include <stdlib.h>
 #include <string.h>
+#include "util/macros.h"
 #include "broadcom/cle/v3d_decoder.h"
 #include "clif_dump.h"
 #include "clif_private.h"
@@ -34,9 +37,29 @@
 #include "broadcom/cle/v3dx_pack.h"
 #include "broadcom/common/v3d_macros.h"
 
+/* Returns a malloc()ed copy of xml_name that is upper-cased, with spaces
+ * replaced by '_' and parentheses dropped.  The caller must free() it.
+ */
+static char *
+clif_name(const char *xml_name)
+{
+        char *name = malloc(strlen(xml_name) + 1);
+        size_t j = 0;
+        for (const char *c = xml_name; *c != '\0'; c++) {
+                if (*c == ' ') {
+                        name[j++] = '_';
+                } else if (*c != '(' && *c != ')') {
+                        /* <ctype.h> takes unsigned char values, not char. */
+                        name[j++] = toupper((unsigned char)*c);
+                }
+        }
+        name[j] = '\0';
+        return name;
+}
+
 bool
 v3dX(clif_dump_packet)(struct clif_dump *clif, uint32_t offset,
-                       const uint8_t *cl, uint32_t *size)
+                       const uint8_t *cl, uint32_t *size, bool reloc_mode)
 {
         struct v3d_group *inst = v3d_spec_find_instruction(clif->spec, cl);
         if (!inst) {
@@ -46,21 +69,27 @@
 
         *size = v3d_group_get_length(inst);
 
-        out(clif, "%s\n", v3d_group_get_name(inst));
-        v3d_print_group(clif->out, inst, 0, cl, "");
+        if (!reloc_mode) {
+                char *name = clif_name(v3d_group_get_name(inst));
+                out(clif, "%s\n", name);
+                free(name);
+                v3d_print_group(clif, inst, 0, cl);
+        }
 
         switch (*cl) {
         case V3DX(GL_SHADER_STATE_opcode): {
                 struct V3DX(GL_SHADER_STATE) values;
                 V3DX(GL_SHADER_STATE_unpack)(cl, &values);
 
-                struct reloc_worklist_entry *reloc =
-                        clif_dump_add_address_to_worklist(clif,
-                                                          reloc_gl_shader_state,
-                                                          values.address);
-                if (reloc) {
-                        reloc->shader_state.num_attrs =
-                                values.number_of_attribute_arrays;
+                if (reloc_mode) {
+                        struct reloc_worklist_entry *reloc =
+                                clif_dump_add_address_to_worklist(clif,
+                                                                  reloc_gl_shader_state,
+                                                                  values.address);
+                        if (reloc) {
+                                reloc->shader_state.num_attrs =
+                                        values.number_of_attribute_arrays;
+                        }
                 }
                 return true;
         }
@@ -87,10 +116,13 @@
                 cl += *size;
 
                 for (int i = 0; i < values.number_of_16_bit_output_data_specs_following; i++) {
-                        v3d_print_group(clif->out, spec, 0, cl, "");
+                        if (!reloc_mode)
+                                v3d_print_group(clif, spec, 0, cl);
                         cl += v3d_group_get_length(spec);
                         *size += v3d_group_get_length(spec);
                 }
+                if (!reloc_mode)
+                        out(clif, "@format ctrllist\n");
                 break;
         }
 #else /* V3D_VERSION < 40 */
@@ -107,13 +139,15 @@
                 cl += *size;
 
                 for (int i = 0; i < values.number_of_16_bit_output_data_specs_following; i++) {
-                        v3d_print_group(clif->out, spec, 0, cl, "");
+                        if (!reloc_mode)
+                                v3d_print_group(clif, spec, 0, cl);
                         cl += v3d_group_get_length(spec);
                         *size += v3d_group_get_length(spec);
                 }
 
                 for (int i = 0; i < values.number_of_32_bit_output_buffer_address_following; i++) {
-                        v3d_print_group(clif->out, addr, 0, cl, "");
+                        if (!reloc_mode)
+                                v3d_print_group(clif, addr, 0, cl);
                         cl += v3d_group_get_length(addr);
                         *size += v3d_group_get_length(addr);
                 }
@@ -138,26 +172,3 @@
 
         return true;
 }
-
-void
-v3dX(clif_dump_gl_shader_state_record)(struct clif_dump *clif,
-                                       struct reloc_worklist_entry *reloc,
-                                       void *vaddr)
-{
-        struct v3d_group *state = v3d_spec_find_struct(clif->spec,
-                                                       "GL Shader State Record");
-        struct v3d_group *attr = v3d_spec_find_struct(clif->spec,
-                                                      "GL Shader State Attribute Record");
-        assert(state);
-        assert(attr);
-
-        out(clif, "GL Shader State Record at 0x%08x\n", reloc->addr);
-        v3d_print_group(clif->out, state, 0, vaddr, "");
-        vaddr += v3d_group_get_length(state);
-
-        for (int i = 0; i < reloc->shader_state.num_attrs; i++) {
-                out(clif, "  Attribute %d\n", i);
-                v3d_print_group(clif->out, attr, 0, vaddr, "");
-                vaddr += v3d_group_get_length(attr);
-        }
-}
diff --git a/src/broadcom/common/v3d_debug.c b/src/broadcom/common/v3d_debug.c
index 630bfe0..9740444 100644
--- a/src/broadcom/common/v3d_debug.c
+++ b/src/broadcom/common/v3d_debug.c
@@ -41,6 +41,7 @@
 
 static const struct debug_control debug_control[] = {
         { "cl",          V3D_DEBUG_CL},
+        { "clif",        V3D_DEBUG_CLIF},
         { "qpu",         V3D_DEBUG_QPU},
         { "vir",         V3D_DEBUG_VIR},
         { "nir",         V3D_DEBUG_NIR},
@@ -52,6 +53,7 @@
         { "fs",          V3D_DEBUG_FS},
         { "vs",          V3D_DEBUG_VS},
         { "cs",          V3D_DEBUG_CS},
+        { "always_flush", V3D_DEBUG_ALWAYS_FLUSH},
         { NULL,    0 }
 };
 
diff --git a/src/broadcom/common/v3d_debug.h b/src/broadcom/common/v3d_debug.h
index bdb9518..d9f5255 100644
--- a/src/broadcom/common/v3d_debug.h
+++ b/src/broadcom/common/v3d_debug.h
@@ -54,10 +54,15 @@
 #define V3D_DEBUG_PERF			(1 << 10)
 #define V3D_DEBUG_NORAST		(1 << 11)
 #define V3D_DEBUG_ALWAYS_FLUSH		(1 << 12)
+#define V3D_DEBUG_CLIF			(1 << 13)
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "BROADCOM-MESA"
+#if ANDROID_API_LEVEL >= 26
+#include <log/log.h>
+#else
 #include <cutils/log.h>
+#endif /* use log/log.h starting with Android 8 (API level 26) */
 #ifndef ALOGW
 #define ALOGW LOGW
 #endif
diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
index 5685c7a..b0a2a02 100644
--- a/src/broadcom/common/v3d_device_info.h
+++ b/src/broadcom/common/v3d_device_info.h
@@ -27,13 +27,14 @@
 #include <stdint.h>
 
 /**
- * Struct for tracking features of the V3D chip. This is where we'll store
- * boolean flags for features in a specific version, but for now it's just the
- * version
+ * Struct for tracking features of the V3D chip across driver and compiler.
  */
 struct v3d_device_info {
         /** Simple V3D version: major * 10 + minor */
         uint8_t ver;
+
+        /** Size of the VPM, in bytes. */
+        int vpm_size;
 };
 
 #endif
diff --git a/src/broadcom/compiler/meson.build b/src/broadcom/compiler/meson.build
index 0cdf8a5..86ef365 100644
--- a/src/broadcom/compiler/meson.build
+++ b/src/broadcom/compiler/meson.build
@@ -26,6 +26,7 @@
   'vir_lower_uniforms.c',
   'vir_opt_copy_propagate.c',
   'vir_opt_dead_code.c',
+  'vir_opt_small_immediates.c',
   'vir_register_allocate.c',
   'vir_to_qpu.c',
   'qpu_schedule.c',
@@ -46,3 +47,5 @@
   dependencies : [dep_libdrm, dep_valgrind, idep_nir_headers],
   build_by_default : false,
 )
+
+v3d_libs += libbroadcom_compiler
diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index d6c2d19..158c1c3 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -74,13 +74,6 @@
 }
 
 static struct qreg
-vir_SFU(struct v3d_compile *c, int waddr, struct qreg src)
-{
-        vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, waddr), src);
-        return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4));
-}
-
-static struct qreg
 indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr)
 {
         struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0);
@@ -244,14 +237,6 @@
         return r;
 };
 
-static inline struct qreg
-vir_SAT(struct v3d_compile *c, struct qreg val)
-{
-        return vir_FMAX(c,
-                        vir_FMIN(c, val, vir_uniform_f(c, 1.0)),
-                        vir_uniform_f(c, 0.0));
-}
-
 static struct qreg
 ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
 {
@@ -338,8 +323,7 @@
                 input = vir_FADD(c, input, vir_uniform_f(c, 0.5));
 
         struct qreg periods = vir_FROUND(c, input);
-        struct qreg sin_output = vir_SFU(c, V3D_QPU_WADDR_SIN,
-                                         vir_FSUB(c, input, periods));
+        struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
         return vir_XOR(c, sin_output, vir_SHL(c,
                                               vir_FTOIN(c, periods),
                                               vir_uniform_ui(c, -1)));
@@ -377,8 +361,7 @@
         c->inputs[attr * 4 + 0] = vir_FXCD(c);
         c->inputs[attr * 4 + 1] = vir_FYCD(c);
         c->inputs[attr * 4 + 2] = c->payload_z;
-        c->inputs[attr * 4 + 3] = vir_SFU(c, V3D_QPU_WADDR_RECIP,
-                                          c->payload_w);
+        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
 }
 
 static struct qreg
@@ -436,15 +419,14 @@
                 /* FALLTHROUGH */
         case INTERP_MODE_SMOOTH:
                 if (var->data.centroid) {
+                        BITSET_SET(c->centroid_flags, i);
                         return vir_FADD(c, vir_FMUL(c, vary,
                                                     c->payload_w_centroid), r5);
                 } else {
                         return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
                 }
         case INTERP_MODE_NOPERSPECTIVE:
-                /* C appears after the mov from the varying.
-                   XXX: improve ldvary setup.
-                */
+                BITSET_SET(c->noperspective_flags, i);
                 return vir_FADD(c, vir_MOV(c, vary), r5);
         case INTERP_MODE_FLAT:
                 BITSET_SET(c->flat_shade_flags, i);
@@ -754,6 +736,10 @@
                 result = vir_NOT(c, src[0]);
                 break;
 
+        case nir_op_ufind_msb:
+                result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
+                break;
+
         case nir_op_imul:
                 result = vir_UMUL(c, src[0], src[1]);
                 break;
@@ -787,16 +773,16 @@
                 break;
 
         case nir_op_frcp:
-                result = vir_SFU(c, V3D_QPU_WADDR_RECIP, src[0]);
+                result = vir_RECIP(c, src[0]);
                 break;
         case nir_op_frsq:
-                result = vir_SFU(c, V3D_QPU_WADDR_RSQRT, src[0]);
+                result = vir_RSQRT(c, src[0]);
                 break;
         case nir_op_fexp2:
-                result = vir_SFU(c, V3D_QPU_WADDR_EXP, src[0]);
+                result = vir_EXP(c, src[0]);
                 break;
         case nir_op_flog2:
-                result = vir_SFU(c, V3D_QPU_WADDR_LOG, src[0]);
+                result = vir_LOG(c, src[0]);
                 break;
 
         case nir_op_fceil:
@@ -852,6 +838,13 @@
                 result = vir_FDY(c, src[0]);
                 break;
 
+        case nir_op_uadd_carry:
+                vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC);
+                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
+                                            vir_uniform_ui(c, ~0),
+                                            vir_uniform_ui(c, 0)));
+                break;
+
         default:
                 fprintf(stderr, "unknown NIR ALU inst: ");
                 nir_print_instr(&instr->instr, stderr);
@@ -908,6 +901,16 @@
                         has_any_tlb_color_write = true;
         }
 
+        if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
+                struct nir_variable *var = c->output_color_var[0];
+                struct qreg *color = &c->outputs[var->data.driver_location * 4];
+
+                vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0),
+                                vir_AND(c,
+                                        vir_MSF(c),
+                                        vir_FTOC(c, color[3])));
+        }
+
         if (c->output_position_index != -1) {
                 struct qinst *inst = vir_MOV_dest(c,
                                                   vir_reg(QFILE_TLBU, 0),
@@ -918,7 +921,9 @@
                                        TLB_TYPE_DEPTH |
                                        TLB_DEPTH_TYPE_PER_PIXEL |
                                        0xffffff00);
-        } else if (c->s->info.fs.uses_discard || !has_any_tlb_color_write) {
+        } else if (c->s->info.fs.uses_discard ||
+                   c->fs_key->sample_alpha_to_coverage ||
+                   !has_any_tlb_color_write) {
                 /* Emit passthrough Z if it needed to be delayed until shader
                  * end due to potential discards.
                  *
@@ -957,6 +962,9 @@
                 conf |= TLB_SAMPLE_MODE_PER_PIXEL;
                 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;
 
+                if (c->fs_key->swap_color_rb & (1 << rt))
+                        num_components = MAX2(num_components, 3);
+
                 assert(num_components != 0);
                 switch (glsl_get_base_type(var->type)) {
                 case GLSL_TYPE_UINT:
@@ -985,7 +993,7 @@
                         struct qreg b = color[2];
                         struct qreg a = color[3];
 
-                        if (c->fs_key->f32_color_rb) {
+                        if (c->fs_key->f32_color_rb & (1 << rt)) {
                                 conf |= TLB_TYPE_F32_COLOR;
                                 conf |= ((num_components - 1) <<
                                          TLB_VEC_SIZE_MINUS_1_SHIFT);
@@ -1003,15 +1011,20 @@
                                 b = color[0];
                         }
 
+                        if (c->fs_key->sample_alpha_to_one)
+                                a = vir_uniform_f(c, 1.0);
+
                         if (c->fs_key->f32_color_rb & (1 << rt)) {
-                                inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]);
+                                inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r);
                                 inst->src[vir_get_implicit_uniform_src(inst)] =
                                         vir_uniform_ui(c, conf);
 
-                                for (int i = 1; i < num_components; i++) {
-                                        inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0),
-                                                            color[i]);
-                                }
+                                if (num_components >= 2)
+                                        vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g);
+                                if (num_components >= 3)
+                                        vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b);
+                                if (num_components >= 4)
+                                        vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a);
                         } else {
                                 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g);
                                 if (conf != ~0) {
@@ -1129,8 +1142,8 @@
         setup_default_position(c);
 
         uint32_t vpm_index = 0;
-        struct qreg rcp_w = vir_SFU(c, V3D_QPU_WADDR_RECIP,
-                                    c->outputs[c->output_position_index + 3]);
+        struct qreg rcp_w = vir_RECIP(c,
+                                      c->outputs[c->output_position_index + 3]);
 
         emit_vpm_write_setup(c);
 
@@ -1348,7 +1361,7 @@
                 assert(array_len == 1);
                 (void)array_len;
 
-                for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
+                for (int i = 0; i < 4; i++) {
                         add_output(c, loc + var->data.location_frac + i,
                                    var->data.location,
                                    var->data.location_frac + i);
@@ -1893,8 +1906,11 @@
         .lower_all_io_to_temps = true,
         .lower_extract_byte = true,
         .lower_extract_word = true,
-        .lower_bitfield_insert = true,
-        .lower_bitfield_extract = true,
+        .lower_bfm = true,
+        .lower_bitfield_insert_to_shifts = true,
+        .lower_bitfield_extract_to_shifts = true,
+        .lower_bitfield_reverse = true,
+        .lower_bit_count = true,
         .lower_pack_unorm_2x16 = true,
         .lower_pack_snorm_2x16 = true,
         .lower_pack_unorm_4x8 = true,
@@ -1902,12 +1918,16 @@
         .lower_unpack_unorm_4x8 = true,
         .lower_unpack_snorm_4x8 = true,
         .lower_fdiv = true,
+        .lower_find_lsb = true,
         .lower_ffma = true,
         .lower_flrp32 = true,
         .lower_fpow = true,
         .lower_fsat = true,
         .lower_fsqrt = true,
+        .lower_ifind_msb = true,
         .lower_ldexp = true,
+        .lower_mul_high = true,
+        .lower_wpos_pntc = true,
         .native_integers = true,
 };
 
@@ -1985,6 +2005,29 @@
                 c->last_thrsw->is_last_thrsw = true;
 }
 
+/* There's a flag in the shader for "center W is needed for reasons other than
+ * non-centroid varyings".  For fragment shaders, scan the optimized VIR and
+ * set c->uses_center_w at the first source that reads QFILE_REG index 0.  It
+ * should be harmless to set even if we only use center W for varyings.
+ */
+static void
+vir_check_payload_w(struct v3d_compile *c)
+{
+        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
+                return;
+
+        vir_for_each_inst_inorder(inst, c) {
+                for (int s = 0; s < vir_get_nsrc(inst); s++) {
+                        if (inst->src[s].file != QFILE_REG ||
+                            inst->src[s].index != 0) {
+                                continue;
+                        }
+                        c->uses_center_w = true;
+                        return;
+                }
+        }
+}
+
 void
 v3d_nir_to_vir(struct v3d_compile *c)
 {
@@ -2024,6 +2067,8 @@
         vir_optimize(c);
         vir_lower_uniforms(c);
 
+        vir_check_payload_w(c);
+
         /* XXX: vir_schedule_instructions(c); */
 
         if (V3D_DEBUG & (V3D_DEBUG_VIR |
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index b404390..4f3b621 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -343,7 +343,7 @@
                 add_read_dep(state, state->last_sf, n);
                 break;
 
-        case V3D_QPU_A_FLBPOP:
+        case V3D_QPU_A_FLPOP:
                 add_write_dep(state, &state->last_sf, n);
                 break;
 
@@ -402,7 +402,7 @@
                 add_write_dep(state, &state->last_tmu_config, n);
         }
 
-        if (inst->sig.ldtmu) {
+        if (v3d_qpu_waits_on_tmu(inst)) {
                 /* TMU loads are coming from a FIFO, so ordering is important.
                  */
                 add_write_dep(state, &state->last_tmu_write, n);
@@ -459,10 +459,10 @@
 
 struct choose_scoreboard {
         int tick;
-        int last_sfu_write_tick;
+        int last_magic_sfu_write_tick;
         int last_ldvary_tick;
         int last_uniforms_reset_tick;
-        uint32_t last_waddr_add, last_waddr_mul;
+        int last_thrsw_tick;
         bool tlb_locked;
 };
 
@@ -471,22 +471,8 @@
                    const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
 {
         switch (mux) {
-        case V3D_QPU_MUX_A:
-                if (scoreboard->last_waddr_add == inst->raddr_a ||
-                    scoreboard->last_waddr_mul == inst->raddr_a) {
-                        return true;
-                }
-                break;
-
-        case V3D_QPU_MUX_B:
-                if (scoreboard->last_waddr_add == inst->raddr_b ||
-                    scoreboard->last_waddr_mul == inst->raddr_b) {
-                        return true;
-                }
-                break;
-
         case V3D_QPU_MUX_R4:
-                if (scoreboard->tick - scoreboard->last_sfu_write_tick <= 2)
+                if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick <= 2)
                         return true;
                 break;
 
@@ -551,7 +537,7 @@
          * This would normally be prevented by dependency tracking, but might
          * occur if a dead SFU computation makes it to scheduling.
          */
-        if (scoreboard->tick - scoreboard->last_sfu_write_tick < 2 &&
+        if (scoreboard->tick - scoreboard->last_magic_sfu_write_tick < 2 &&
             v3d_qpu_writes_r4(devinfo, inst))
                 return true;
 
@@ -579,7 +565,7 @@
         next_score++;
 
         /* Schedule texture read results collection late to hide latency. */
-        if (inst->sig.ldtmu)
+        if (v3d_qpu_waits_on_tmu(inst))
                 return next_score;
         next_score++;
 
@@ -610,6 +596,8 @@
 {
         if (v3d_qpu_uses_vpm(inst))
                 return true;
+        if (v3d_qpu_uses_sfu(inst))
+                return true;
 
         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
@@ -618,6 +606,9 @@
                         return true;
                 }
 
+                if (inst->alu.add.op == V3D_QPU_A_TMUWT)
+                        return true;
+
                 if (inst->alu.mul.op != V3D_QPU_M_NOP &&
                     inst->alu.mul.magic_write &&
                     qpu_magic_waddr_is_periph(inst->alu.mul.waddr)) {
@@ -683,7 +674,8 @@
 
         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_B)) {
                 if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_B) &&
-                    a->raddr_b != b->raddr_b) {
+                    (a->raddr_b != b->raddr_b ||
+                     a->sig.small_imm != b->sig.small_imm)) {
                         return false;
                 }
                 merge.raddr_b = b->raddr_b;
@@ -840,16 +832,13 @@
                                   enum v3d_qpu_waddr waddr)
 {
         if (v3d_qpu_magic_waddr_is_sfu(waddr))
-                scoreboard->last_sfu_write_tick = scoreboard->tick;
+                scoreboard->last_magic_sfu_write_tick = scoreboard->tick;
 }
 
 static void
 update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
                              const struct v3d_qpu_instr *inst)
 {
-        scoreboard->last_waddr_add = ~0;
-        scoreboard->last_waddr_mul = ~0;
-
         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH)
                 return;
 
@@ -859,8 +848,6 @@
                 if (inst->alu.add.magic_write) {
                         update_scoreboard_for_magic_waddr(scoreboard,
                                                           inst->alu.add.waddr);
-                } else {
-                        scoreboard->last_waddr_add = inst->alu.add.waddr;
                 }
         }
 
@@ -868,8 +855,6 @@
                 if (inst->alu.mul.magic_write) {
                         update_scoreboard_for_magic_waddr(scoreboard,
                                                           inst->alu.mul.waddr);
-                } else {
-                        scoreboard->last_waddr_mul = inst->alu.mul.waddr;
                 }
         }
 
@@ -929,7 +914,7 @@
          *
          * because we associate the first load_tmu0 with the *second* tmu0_s.
          */
-        if (v3d_qpu_magic_waddr_is_tmu(waddr) && after->sig.ldtmu)
+        if (v3d_qpu_magic_waddr_is_tmu(waddr) && v3d_qpu_waits_on_tmu(after))
                 return 100;
 
         /* Assume that anything depending on us is consuming the SFU result. */
@@ -1073,6 +1058,10 @@
                 return false;
 
         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                /* GFXH-1625: TMUWT not allowed in the final instruction. */
+                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
+                        return false;
+
                 /* No writing physical registers at the end. */
                 if (!inst->alu.add.magic_write ||
                     !inst->alu.mul.magic_write) {
@@ -1107,10 +1096,16 @@
 }
 
 static bool
-valid_thrsw_sequence(struct v3d_compile *c,
+valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard,
                      struct qinst *qinst, int instructions_in_sequence,
                      bool is_thrend)
 {
+        /* No emitting our thrsw while the previous thrsw hasn't happened yet. */
+        if (scoreboard->last_thrsw_tick + 3 >
+            scoreboard->tick - instructions_in_sequence) {
+                return false;
+        }
+
         for (int slot = 0; slot < instructions_in_sequence; slot++) {
                 /* No scheduling SFU when the result would land in the other
                  * thread.  The simulator complains for safety, though it
@@ -1171,7 +1166,8 @@
                 if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
                         break;
 
-                if (!valid_thrsw_sequence(c, prev_inst, slots_filled + 1,
+                if (!valid_thrsw_sequence(c, scoreboard,
+                                          prev_inst, slots_filled + 1,
                                           is_thrend)) {
                         break;
                 }
@@ -1185,7 +1181,9 @@
         if (merge_inst) {
                 merge_inst->qpu.sig.thrsw = true;
                 needs_free = true;
+                scoreboard->last_thrsw_tick = scoreboard->tick - slots_filled;
         } else {
+                scoreboard->last_thrsw_tick = scoreboard->tick;
                 insert_scheduled_instruction(c, block, scoreboard, inst);
                 time++;
                 slots_filled++;
@@ -1484,11 +1482,10 @@
 
         struct choose_scoreboard scoreboard;
         memset(&scoreboard, 0, sizeof(scoreboard));
-        scoreboard.last_waddr_add = ~0;
-        scoreboard.last_waddr_mul = ~0;
         scoreboard.last_ldvary_tick = -10;
-        scoreboard.last_sfu_write_tick = -10;
+        scoreboard.last_magic_sfu_write_tick = -10;
         scoreboard.last_uniforms_reset_tick = -10;
+        scoreboard.last_thrsw_tick = -10;
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
index 4ef587c..fb2ed12 100644
--- a/src/broadcom/compiler/qpu_validate.c
+++ b/src/broadcom/compiler/qpu_validate.c
@@ -41,7 +41,15 @@
         int last_sfu_write;
         int last_branch_ip;
         int last_thrsw_ip;
+
+        /* Set when we've found the last-THRSW signal, or if we were started
+         * in single-segment mode.
+         */
         bool last_thrsw_found;
+
+        /* Set when we've found the THRSW after the last THRSW */
+        bool thrend_found;
+
         int thrsw_count;
 };
 
@@ -116,6 +124,19 @@
                 fail_instr(state, "LDUNIF after a LDVARY");
         }
 
+        /* GFXH-1633 */
+        bool last_reads_ldunif = (state->last && (state->last->sig.ldunif ||
+                                                  state->last->sig.ldunifrf));
+        bool last_reads_ldunifa = (state->last && (state->last->sig.ldunifa ||
+                                                   state->last->sig.ldunifarf));
+        bool reads_ldunif = inst->sig.ldunif || inst->sig.ldunifrf;
+        bool reads_ldunifa = inst->sig.ldunifa || inst->sig.ldunifarf;
+        if ((last_reads_ldunif && reads_ldunifa) ||
+            (last_reads_ldunifa && reads_ldunif)) {
+                fail_instr(state,
+                           "LDUNIF and LDUNIFA can't be next to each other");
+        }
+
         int tmu_writes = 0;
         int sfu_writes = 0;
         int vpm_writes = 0;
@@ -204,6 +225,9 @@
                 if (in_branch_delay_slots(state))
                         fail_instr(state, "THRSW in a branch delay slot.");
 
+                if (state->last_thrsw_found)
+                        state->thrend_found = true;
+
                 if (state->last_thrsw_ip == state->ip - 1) {
                         /* If it's the second THRSW in a row, then it's just a
                          * last-thrsw signal.
@@ -221,6 +245,28 @@
                 }
         }
 
+        /* Distance past the THREND THRSW is ip - last_thrsw_ip (>= 0). */
+        if (state->thrend_found &&
+            state->ip - state->last_thrsw_ip <= 2 &&
+            inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                if ((inst->alu.add.op != V3D_QPU_A_NOP &&
+                     !inst->alu.add.magic_write)) {
+                        fail_instr(state, "RF write after THREND");
+                }
+                if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
+                     !inst->alu.mul.magic_write)) {
+                        fail_instr(state, "RF write after THREND");
+                }
+
+                if (v3d_qpu_sig_writes_address(devinfo, &inst->sig))
+                        fail_instr(state, "RF write after THREND");
+
+                /* GFXH-1625: No TMUWT in the last instruction */
+                if (state->ip - state->last_thrsw_ip == 2 &&
+                    inst->alu.add.op == V3D_QPU_A_TMUWT)
+                        fail_instr(state, "TMUWT in last instruction");
+        }
+
         if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                 if (in_branch_delay_slots(state))
                         fail_instr(state, "branch in a branch delay slot.");
@@ -262,6 +308,8 @@
                 .last_thrsw_ip = -10,
                 .last_branch_ip = -10,
                 .ip = 0,
+
+                .last_thrsw_found = !c->last_thrsw,
         };
 
         vir_for_each_block(block, c) {
@@ -273,8 +321,6 @@
                            "thread switch found without last-THRSW in program");
         }
 
-        if (state.thrsw_count == 0 ||
-            (state.last_thrsw_found && state.thrsw_count == 1)) {
+        if (!state.thrend_found)
                 fail_instr(&state, "No program-end THRSW found");
-        }
 }
diff --git a/src/broadcom/compiler/v3d40_tex.c b/src/broadcom/compiler/v3d40_tex.c
index 0b41f37..9f1fd9a 100644
--- a/src/broadcom/compiler/v3d40_tex.c
+++ b/src/broadcom/compiler/v3d40_tex.c
@@ -31,9 +31,12 @@
 #include "cle/v3d_packet_v41_pack.h"
 
 static void
-vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val)
+vir_TMU_WRITE(struct v3d_compile *c, enum v3d_qpu_waddr waddr, struct qreg val,
+              int *tmu_writes)
 {
         vir_MOV_dest(c, vir_reg(QFILE_MAGIC, waddr), val);
+
+        (*tmu_writes)++;
 }
 
 static void
@@ -49,6 +52,10 @@
 v3d40_vir_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
 {
         unsigned unit = instr->texture_index;
+        int tmu_writes = 0;
+        static const struct V3D41_TMU_CONFIG_PARAMETER_2 p2_unpacked_default = {
+                .op = V3D_TMU_OP_REGULAR,
+        };
 
         struct V3D41_TMU_CONFIG_PARAMETER_0 p0_unpacked = {
         };
@@ -82,29 +89,32 @@
                         if (non_array_components > 1) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUT,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          1));
+                                                          1), &tmu_writes);
                         }
                         if (non_array_components > 2) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUR,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          2));
+                                                          2), &tmu_writes);
                         }
 
                         if (instr->is_array) {
                                 vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUI,
                                               ntq_get_src(c, instr->src[i].src,
-                                                          instr->coord_components - 1));
+                                                          instr->coord_components - 1),
+                                              &tmu_writes);
                         }
                         break;
 
                 case nir_tex_src_bias:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                         break;
 
                 case nir_tex_src_lod:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUB,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
 
                         if (instr->op != nir_texop_txf &&
                             instr->op != nir_texop_tg4) {
@@ -114,7 +124,8 @@
 
                 case nir_tex_src_comparator:
                         vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUDREF,
-                                      ntq_get_src(c, instr->src[i].src, 0));
+                                      ntq_get_src(c, instr->src[i].src, 0),
+                                      &tmu_writes);
                         break;
 
                 case nir_tex_src_offset: {
@@ -145,6 +156,14 @@
                 (1 << MIN2(instr_return_channels,
                            c->key->tex[unit].return_channels)) - 1;
 
+        /* Word enables can't ask for more channels than the output type could
+         * provide (2 for f16, 4 for 32-bit).
+         */
+        assert(!p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 4));
+        assert(p1_unpacked.output_type_32_bit ||
+               p0_unpacked.return_words_of_texture_data < (1 << 2));
+
         uint32_t p0_packed;
         V3D41_TMU_CONFIG_PARAMETER_0_pack(NULL,
                                           (uint8_t *)&p0_packed,
@@ -169,19 +188,26 @@
 
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P0, p0_packed);
         vir_WRTMUC(c, QUNIFORM_TMU_CONFIG_P1, p1_packed);
-        vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
+        if (memcmp(&p2_unpacked, &p2_unpacked_default, sizeof(p2_unpacked)) != 0)
+                vir_WRTMUC(c, QUNIFORM_CONSTANT, p2_packed);
 
         if (instr->op == nir_texop_txf) {
                 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE);
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSF, s, &tmu_writes);
         } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUSCM, s, &tmu_writes);
         } else {
-                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s);
+                vir_TMU_WRITE(c, V3D_QPU_WADDR_TMUS, s, &tmu_writes);
         }
 
         vir_emit_thrsw(c);
 
+        /* The input FIFO has 16 slots across all threads, so make sure we
+         * don't overfill our allocation.
+         */
+        while (tmu_writes > 16 / c->threads)
+                c->threads /= 2;
+
         struct qreg return_values[4];
         for (int i = 0; i < 4; i++) {
                 /* Swizzling .zw of an RG texture should give undefined
diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
index 207e2973..070e6a3 100644
--- a/src/broadcom/compiler/v3d_compiler.h
+++ b/src/broadcom/compiler/v3d_compiler.h
@@ -115,6 +115,7 @@
  */
 struct qpu_reg {
         bool magic;
+        bool smimm;
         int index;
 };
 
@@ -244,8 +245,6 @@
 
         QUNIFORM_TEXTURE_BORDER_COLOR,
 
-        QUNIFORM_STENCIL,
-
         QUNIFORM_ALPHA_REF,
         QUNIFORM_SAMPLE_MASK,
 
@@ -302,18 +301,11 @@
                 uint8_t swizzle[4];
                 uint8_t return_size;
                 uint8_t return_channels;
-                union {
-                        struct {
-                                unsigned compare_mode:1;
-                                unsigned compare_func:3;
-                                bool clamp_s:1;
-                                bool clamp_t:1;
-                                bool clamp_r:1;
-                        };
-                        struct {
-                                uint16_t msaa_width, msaa_height;
-                        };
-                };
+                unsigned compare_mode:1;
+                unsigned compare_func:3;
+                bool clamp_s:1;
+                bool clamp_t:1;
+                bool clamp_r:1;
         } tex[V3D_MAX_TEXTURE_SAMPLERS];
         uint8_t ucp_enables;
 };
@@ -485,6 +477,12 @@
          */
         uint32_t flat_shade_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
 
+        uint32_t noperspective_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+        uint32_t centroid_flags[BITSET_WORDS(V3D_MAX_FS_INPUTS)];
+
+        bool uses_center_w;
+
         struct v3d_ubo_range *ubo_ranges;
         bool *ubo_range_used;
         uint32_t ubo_ranges_array_size;
@@ -650,6 +648,9 @@
 
         /* Total number of components written, for the shader state record. */
         uint32_t vpm_output_size;
+
+        /* Value to be programmed in VCM_CACHE_SIZE. */
+        uint8_t vcm_cache_size;
 };
 
 struct v3d_fs_prog_data {
@@ -664,8 +665,13 @@
          */
         uint32_t flat_shade_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
 
+        uint32_t noperspective_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+
+        uint32_t centroid_flags[((V3D_MAX_FS_INPUTS - 1) / 24) + 1];
+
         bool writes_z;
         bool discard;
+        bool uses_center_w;
 };
 
 /* Special nir_load_input intrinsic index for loading the current TLB
@@ -865,6 +871,33 @@
                                            a, b));                      \
 }
 
+#define VIR_SFU(name)                                                      \
+static inline struct qreg                                                \
+vir_##name(struct v3d_compile *c, struct qreg a)                         \
+{                                                                        \
+        /* ver < 41: write the magic SFU waddr, then read back R4. */    \
+        if (c->devinfo->ver < 41) {                                      \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                                \
+        return vir_emit_def(c, vir_add_inst(V3D_QPU_A_##name,            \
+                                            c->undef,                    \
+                                            a, c->undef));               \
+}                                                                        \
+static inline struct qinst *                                             \
+vir_##name##_dest(struct v3d_compile *c, struct qreg dest,               \
+                  struct qreg a)                                         \
+{                                                                        \
+        /* Same version split, with the result moved into dest. */       \
+        if (c->devinfo->ver < 41) {                                      \
+                vir_FMOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_##name), a); \
+                return vir_FMOV_dest(c, dest, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R4)); \
+        }                                                                \
+        return vir_emit_nondef(c, vir_add_inst(V3D_QPU_A_##name,         \
+                                               dest,                     \
+                                               a, c->undef));            \
+}
+
 #define VIR_A_ALU2(name) VIR_ALU2(name, vir_add_inst, V3D_QPU_A_##name)
 #define VIR_M_ALU2(name) VIR_ALU2(name, vir_mul_inst, V3D_QPU_M_##name)
 #define VIR_A_ALU1(name) VIR_ALU1(name, vir_add_inst, V3D_QPU_A_##name)
@@ -898,18 +931,19 @@
 VIR_A_ALU2(XOR)
 VIR_A_ALU2(VADD)
 VIR_A_ALU2(VSUB)
-VIR_A_ALU2(STVPMV)
+VIR_A_NODST_2(STVPMV)
 VIR_A_ALU1(NOT)
 VIR_A_ALU1(NEG)
 VIR_A_ALU1(FLAPUSH)
 VIR_A_ALU1(FLBPUSH)
-VIR_A_ALU1(FLBPOP)
+VIR_A_ALU1(FLPOP)
 VIR_A_ALU1(SETMSF)
 VIR_A_ALU1(SETREVF)
 VIR_A_ALU0(TIDX)
 VIR_A_ALU0(EIDX)
 VIR_A_ALU1(LDVPMV_IN)
 VIR_A_ALU1(LDVPMV_OUT)
+VIR_A_ALU0(TMUWT)
 
 VIR_A_ALU0(FXCD)
 VIR_A_ALU0(XCD)
@@ -946,6 +980,13 @@
 VIR_M_ALU1(MOV)
 VIR_M_ALU1(FMOV)
 
+VIR_SFU(RECIP)
+VIR_SFU(RSQRT)
+VIR_SFU(EXP)
+VIR_SFU(LOG)
+VIR_SFU(SIN)
+VIR_SFU(RSQRT2)
+
 static inline struct qinst *
 vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
              struct qreg dest, struct qreg src)
diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
index 93990ee..6b55b0e 100644
--- a/src/broadcom/compiler/vir.c
+++ b/src/broadcom/compiler/vir.c
@@ -74,6 +74,8 @@
 int
 vir_get_implicit_uniform_src(struct qinst *inst)
 {
+        if (!vir_has_implicit_uniform(inst))
+                return -1;
         return vir_get_nsrc(inst) - 1;
 }
 
@@ -96,6 +98,7 @@
                 case V3D_QPU_A_STVPMD:
                 case V3D_QPU_A_STVPMP:
                 case V3D_QPU_A_VPMWT:
+                case V3D_QPU_A_TMUWT:
                         return true;
                 default:
                         break;
@@ -192,6 +195,11 @@
         if (inst->dst.file == QFILE_MAGIC)
                 return v3d_qpu_magic_waddr_is_tmu(inst->dst.index);
 
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
+                return true;
+        }
+
         return false;
 }
 
@@ -444,6 +452,16 @@
 {
         assert(inst->dst.file == QFILE_NULL);
 
+        /* If we're emitting an instruction that's a def, it had better be
+         * writing a register.
+         */
+        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
+                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
+                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
+                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
+        }
+
         inst->dst = vir_get_temp(c);
 
         if (inst->dst.file == QFILE_TEMP)
@@ -572,7 +590,7 @@
 {
         struct nir_lower_tex_options tex_options = {
                 .lower_txd = true,
-                .lower_rect = false, /* XXX */
+                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                 .lower_txp = ~0,
                 /* Apply swizzles to all samplers. */
                 .swizzle_result = ~0,
@@ -738,10 +756,29 @@
         if (prog_data->uses_iid)
                 prog_data->vpm_input_size++;
 
-        /* Input/output segment size are in 8x32-bit multiples. */
+        /* Input/output segment size are in sectors (8 rows of 32 bits per
+         * channel).
+         */
         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
         prog_data->vpm_output_size = align(c->num_vpm_writes, 8) / 8;
 
+        /* Compute VCM cache size.  We set up our program to take up less than
+         * half of the VPM, so that any set of bin and render programs won't
+         * run out of space.  We need space for at least one input segment,
+         * and then allocate the rest to output segments (one for the current
+         * program, the rest to VCM).  The valid range of the VCM cache size
+         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
+         * batches.
+         */
+        assert(c->devinfo->vpm_size);
+        int sector_size = 16 * sizeof(uint32_t) * 8;
+        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
+        int half_vpm = vpm_size_in_sectors / 2;
+        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
+        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
+        assert(vpm_output_batches >= 2);
+        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
+
         return v3d_return_qpu_insts(c, final_assembly_size);
 }
 
@@ -758,6 +795,12 @@
         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                 if (BITSET_TEST(c->flat_shade_flags, i))
                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
+
+                if (BITSET_TEST(c->noperspective_flags, i))
+                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
+
+                if (BITSET_TEST(c->centroid_flags, i))
+                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
         }
 }
 
@@ -837,7 +880,9 @@
         v3d_set_fs_prog_data_inputs(c, prog_data);
         prog_data->writes_z = (c->s->info.outputs_written &
                                (1 << FRAG_RESULT_DEPTH));
-        prog_data->discard = c->s->info.fs.uses_discard;
+        prog_data->discard = (c->s->info.fs.uses_discard ||
+                              c->fs_key->sample_alpha_to_coverage);
+        prog_data->uses_center_w = c->uses_center_w;
 
         return v3d_return_qpu_insts(c, final_assembly_size);
 }
@@ -927,6 +972,17 @@
         return vir_reg(QFILE_UNIF, uniform);
 }
 
+static bool
+vir_can_set_flags(struct v3d_compile *c, struct qinst *inst)
+{
+        if (c->devinfo->ver >= 40 && (v3d_qpu_reads_vpm(&inst->qpu) ||
+                                      v3d_qpu_uses_sfu(&inst->qpu))) {
+                return false;
+        }
+
+        return true;
+}
+
 void
 vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf)
 {
@@ -946,7 +1002,8 @@
 
         if (src.file != QFILE_TEMP ||
             !c->defs[src.index] ||
-            last_inst != c->defs[src.index]) {
+            last_inst != c->defs[src.index] ||
+            !vir_can_set_flags(c, last_inst)) {
                 /* XXX: Make the MOV be the appropriate type */
                 last_inst = vir_MOV_dest(c, vir_reg(QFILE_NULL, 0), src);
         }
@@ -979,6 +1036,7 @@
 
                 OPTPASS(vir_opt_copy_propagate);
                 OPTPASS(vir_opt_dead_code);
+                OPTPASS(vir_opt_small_immediates);
 
                 if (!progress)
                         break;
diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
index 88b5dc9..c435783 100644
--- a/src/broadcom/compiler/vir_dump.c
+++ b/src/broadcom/compiler/vir_dump.c
@@ -25,7 +25,8 @@
 #include "v3d_compiler.h"
 
 static void
-vir_print_reg(struct v3d_compile *c, struct qreg reg)
+vir_print_reg(struct v3d_compile *c, const struct qinst *inst,
+              struct qreg reg)
 {
         static const char *files[] = {
                 [QFILE_TEMP] = "t",
@@ -58,12 +59,20 @@
                 fprintf(stderr, "%s", v3d_qpu_magic_waddr_name(reg.index));
                 break;
 
-        case QFILE_SMALL_IMM:
-                if ((int)reg.index >= -16 && (int)reg.index <= 15)
-                        fprintf(stderr, "%d", reg.index);
+        case QFILE_SMALL_IMM: {
+                uint32_t unpacked;
+                bool ok = v3d_qpu_small_imm_unpack(c->devinfo,
+                                                   inst->qpu.raddr_b,
+                                                   &unpacked);
+                assert(ok); (void) ok;
+
+                if ((int)inst->qpu.raddr_b >= -16 &&
+                    (int)inst->qpu.raddr_b <= 15)
+                        fprintf(stderr, "%d", unpacked);
                 else
-                        fprintf(stderr, "%f", uif(reg.index));
+                        fprintf(stderr, "%f", uif(unpacked));
                 break;
+        }
 
         case QFILE_VPM:
                 fprintf(stderr, "vpm%d.%d",
@@ -220,7 +229,7 @@
                 fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.auf));
                 fprintf(stderr, " ");
 
-                vir_print_reg(c, inst->dst);
+                vir_print_reg(c, inst, inst->dst);
                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
 
                 unpack[0] = instr->alu.add.a_unpack;
@@ -232,7 +241,7 @@
                 fprintf(stderr, "%s", v3d_qpu_uf_name(instr->flags.muf));
                 fprintf(stderr, " ");
 
-                vir_print_reg(c, inst->dst);
+                vir_print_reg(c, inst, inst->dst);
                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
 
                 unpack[0] = instr->alu.mul.a_unpack;
@@ -241,7 +250,7 @@
 
         for (int i = 0; i < sideband_nsrc; i++) {
                 fprintf(stderr, ", ");
-                vir_print_reg(c, inst->src[i]);
+                vir_print_reg(c, inst, inst->src[i]);
                 if (i < nsrc)
                         fprintf(stderr, "%s", v3d_qpu_unpack_name(unpack[i]));
         }
@@ -307,7 +316,7 @@
 
                 if (vir_has_implicit_uniform(inst)) {
                         fprintf(stderr, " ");
-                        vir_print_reg(c, inst->src[vir_get_implicit_uniform_src(inst)]);
+                        vir_print_reg(c, inst, inst->src[vir_get_implicit_uniform_src(inst)]);
                 }
 
                 break;
diff --git a/src/broadcom/compiler/vir_lower_uniforms.c b/src/broadcom/compiler/vir_lower_uniforms.c
index 7f3bb84..1e90040 100644
--- a/src/broadcom/compiler/vir_lower_uniforms.c
+++ b/src/broadcom/compiler/vir_lower_uniforms.c
@@ -76,9 +76,7 @@
 {
         if (inst->src[i].file != QFILE_UNIF)
                 return false;
-        if (vir_has_implicit_uniform(inst))
-                return i != vir_get_implicit_uniform_src(inst);
-        return true;
+        return i != vir_get_implicit_uniform_src(inst);
 }
 
 /* Returns the number of different uniform values referenced by the
@@ -152,7 +150,7 @@
                  * reference a temp instead.
                  */
                 vir_for_each_block(block, c) {
-                        struct qinst *mov = NULL;
+                        struct qreg temp = c->undef;
 
                         vir_for_each_inst(inst, block) {
                                 uint32_t nsrc = vir_get_nsrc(inst);
@@ -162,29 +160,27 @@
                                 if (count <= 1)
                                         continue;
 
-                                /* If the block doesn't have a load of the
-                                 * uniform yet, add it.  We could potentially
-                                 * do better and CSE MOVs from multiple blocks
-                                 * into dominating blocks, except that may
-                                 * cause troubles for register allocation.
-                                 */
-                                if (!mov) {
-                                        mov = vir_mul_inst(V3D_QPU_M_MOV,
-                                                           vir_get_temp(c),
-                                                           unif, c->undef);
-                                        list_add(&mov->link,
-                                                 &block->instructions);
-                                        c->defs[mov->dst.index] = mov;
-                                }
-
                                 bool removed = false;
                                 for (int i = 0; i < nsrc; i++) {
                                         if (is_lowerable_uniform(inst, i) &&
                                             inst->src[i].index == max_index) {
-                                                inst->src[i].file =
-                                                        mov->dst.file;
-                                                inst->src[i].index =
-                                                        mov->dst.index;
+                                                /* If the block doesn't have a
+                                                 * load of the uniform yet,
+                                                 * add it now.  We could
+                                                 * potentially do better and
+                                                 * CSE MOVs from multiple
+                                                 * blocks into dominating
+                                                 * blocks, except that may
+                                                 * cause troubles for register
+                                                 * allocation.
+                                                 */
+                                                if (temp.file == QFILE_NULL) {
+                                                        c->cursor =
+                                                                vir_before_inst(inst);
+                                                        temp = vir_MOV(c, unif);
+                                                }
+
+                                                inst->src[i] = temp;
                                                 remove_uniform(ht, unif);
                                                 removed = true;
                                         }
diff --git a/src/broadcom/compiler/vir_opt_dead_code.c b/src/broadcom/compiler/vir_opt_dead_code.c
index 7ce05fb..362fc9e 100644
--- a/src/broadcom/compiler/vir_opt_dead_code.c
+++ b/src/broadcom/compiler/vir_opt_dead_code.c
@@ -85,6 +85,16 @@
         return false;
 }
 
+static bool
+can_write_to_null(struct v3d_compile *c, struct qinst *inst)
+{
+        /* The SFU instructions must write to a physical register. */
+        if (c->devinfo->ver >= 41 && v3d_qpu_uses_sfu(&inst->qpu))
+                return false;
+
+        return true;
+}
+
 bool
 vir_opt_dead_code(struct v3d_compile *c)
 {
@@ -122,7 +132,8 @@
                                  * it's nicer to read the VIR code without
                                  * unused destination regs.
                                  */
-                                if (inst->dst.file == QFILE_TEMP) {
+                                if (inst->dst.file == QFILE_TEMP &&
+                                    can_write_to_null(c, inst)) {
                                         if (debug) {
                                                 fprintf(stderr,
                                                         "Removing dst from: ");
diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
new file mode 100644
index 0000000..5491f9c
--- /dev/null
+++ b/src/broadcom/compiler/vir_opt_small_immediates.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file vir_opt_small_immediates.c
+ *
+ * Turns references to small constant uniform values into small immediates
+ * fields.
+ */
+
+#include "v3d_compiler.h"
+
+static bool debug;
+
+bool
+vir_opt_small_immediates(struct v3d_compile *c)
+{
+        bool progress = false;
+
+        vir_for_each_inst_inorder(inst, c) {
+                if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
+                        continue;
+
+                /* The small immediate value sits in the raddr B field, so we
+                 * can't have 2 small immediates in one instruction (unless
+                 * they're the same value, but that should be optimized away
+                 * elsewhere).
+                 */
+                bool uses_small_imm = false;
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        if (inst->src[i].file == QFILE_SMALL_IMM)
+                                uses_small_imm = true;
+                }
+                if (uses_small_imm)
+                        continue;
+
+                for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                        struct qreg src = vir_follow_movs(c, inst->src[i]);
+
+                        if (src.file != QFILE_UNIF ||
+                            c->uniform_contents[src.index] !=
+                            QUNIFORM_CONSTANT) {
+                                continue;
+                        }
+
+                        if (vir_has_implicit_uniform(inst) &&
+                            i == vir_get_implicit_uniform_src(inst)) {
+                                /* No turning the implicit uniform read into
+                                 * an immediate.
+                                 */
+                                continue;
+                        }
+
+                        /* Check if the uniform is suitable as a small
+                         * immediate.
+                         */
+                        uint32_t imm = c->uniform_data[src.index];
+                        uint32_t packed;
+                        if (!v3d_qpu_small_imm_pack(c->devinfo, imm, &packed))
+                                continue;
+
+                        /* Check that we don't have any other signals already
+                         * that would be incompatible with small_imm.
+                         */
+                        struct v3d_qpu_sig new_sig = inst->qpu.sig;
+                        uint32_t sig_packed;
+                        new_sig.small_imm = true;
+                        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
+                                continue;
+
+                        if (debug) {
+                                fprintf(stderr, "opt_small_immediate() from: ");
+                                vir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        inst->qpu.sig.small_imm = true;
+                        inst->qpu.raddr_b = packed;
+
+                        inst->src[i].file = QFILE_SMALL_IMM;
+                        inst->src[i].index = imm;
+                        if (debug) {
+                                fprintf(stderr, "to: ");
+                                vir_dump_inst(c, inst);
+                                fprintf(stderr, "\n");
+                        }
+                        progress = true;
+                        break;
+                }
+        }
+
+        return progress;
+}
diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
index 4ec5f23..61d2735 100644
--- a/src/broadcom/compiler/vir_register_allocate.c
+++ b/src/broadcom/compiler/vir_register_allocate.c
@@ -94,6 +94,15 @@
                                 }
                         }
 
+                        /* Refuse to spill a ldvary's dst, because that means
+                         * that ldvary's r5 would end up being used across a
+                         * thrsw.
+                         */
+                        if (inst->qpu.sig.ldvary) {
+                                assert(inst->dst.file == QFILE_TEMP);
+                                BITSET_CLEAR(c->spillable, inst->dst.index);
+                        }
+
                         if (inst->is_last_thrsw)
                                 started_last_seg = true;
 
@@ -102,7 +111,7 @@
                                 started_last_seg = true;
 
                         /* Track when we're in between a TMU setup and the
-                         * final LDTMU from that TMU setup.  We can't
+                         * final LDTMU or TMUWT from that TMU setup.  We can't
                          * spill/fill any temps during that time, because that
                          * involves inserting a new TMU setup/LDTMU sequence.
                          */
@@ -110,6 +119,10 @@
                             is_last_ldtmu(inst, block))
                                 in_tmu_operation = false;
 
+                        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
+                            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT)
+                                in_tmu_operation = false;
+
                         if (v3d_qpu_writes_tmu(&inst->qpu))
                                 in_tmu_operation = true;
                 }
@@ -206,6 +219,7 @@
                                      inst->dst);
                         v3d_emit_spill_tmua(c, spill_offset);
                         vir_emit_thrsw(c);
+                        vir_TMUWT(c);
                         c->spills++;
                 }
 
@@ -238,6 +252,43 @@
                 BITSET_CLEAR(c->spillable, i);
 }
 
+struct v3d_ra_select_callback_data {
+        uint32_t next_acc;
+        uint32_t next_phys;
+};
+
+static unsigned int
+v3d_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+        struct v3d_ra_select_callback_data *v3d_ra = data;
+
+        /* Choose an accumulator if possible (I think it's lower power than
+         * phys regs), but round-robin through them to give post-RA
+         * instruction selection more options.
+         */
+        for (int i = 0; i < ACC_COUNT; i++) {
+                int acc_off = (v3d_ra->next_acc + i) % ACC_COUNT;
+                int acc = ACC_INDEX + acc_off;
+
+                if (BITSET_TEST(regs, acc)) {
+                        v3d_ra->next_acc = acc_off + 1;
+                        return acc;
+                }
+        }
+
+        for (int i = 0; i < PHYS_COUNT; i++) {
+                int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                int phys = PHYS_INDEX + phys_off;
+
+                if (BITSET_TEST(regs, phys)) {
+                        v3d_ra->next_phys = phys_off + 1;
+                        return phys;
+                }
+        }
+
+        unreachable("RA must pass us at least one possible reg.");
+}
+
 bool
 vir_init_reg_sets(struct v3d_compiler *compiler)
 {
@@ -309,6 +360,13 @@
         struct qpu_reg *temp_registers = calloc(c->num_temps,
                                                 sizeof(*temp_registers));
         int acc_nodes[ACC_COUNT];
+        struct v3d_ra_select_callback_data callback_data = {
+                .next_acc = 0,
+                /* Start at RF3, to try to keep the TLB writes from using
+                 * RF0-2.
+                 */
+                .next_phys = 3,
+        };
 
         *spilled = false;
 
@@ -328,6 +386,7 @@
         struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
                                                          c->num_temps +
                                                          ARRAY_SIZE(acc_nodes));
+        ra_set_select_reg_callback(g, v3d_ra_select_callback, &callback_data);
 
         /* Make some fixed nodes for the accumulators, which we will need to
          * interfere with when ops have implied r3/r4 writes or for the thread
@@ -400,6 +459,19 @@
                                 class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
                                 break;
 
+                        case V3D_QPU_A_RECIP:
+                        case V3D_QPU_A_RSQRT:
+                        case V3D_QPU_A_EXP:
+                        case V3D_QPU_A_LOG:
+                        case V3D_QPU_A_SIN:
+                        case V3D_QPU_A_RSQRT2:
+                                /* The SFU instructions write directly to the
+                                 * phys regfile.
+                                 */
+                                assert(inst->dst.file == QFILE_TEMP);
+                                class_bits[inst->dst.index] &= CLASS_BIT_PHYS;
+                                break;
+
                         default:
                                 break;
                         }
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 83b1936..b5a7b84 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -109,6 +109,12 @@
 static void
 set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
 {
+        if (src.smimm) {
+                assert(instr->sig.small_imm);
+                *mux = V3D_QPU_MUX_B;
+                return;
+        }
+
         if (src.magic) {
                 assert(src.index >= V3D_QPU_WADDR_R0 &&
                        src.index <= V3D_QPU_WADDR_R5);
@@ -244,15 +250,7 @@
                                 src[i] = qpu_acc(5);
                                 break;
                         case QFILE_SMALL_IMM:
-                                abort(); /* XXX */
-#if 0
-                                src[i].mux = QPU_MUX_SMALL_IMM;
-                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
-                                /* This should only have returned a valid
-                                 * small immediate field, not ~0 for failure.
-                                 */
-                                assert(src[i].addr <= 47);
-#endif
+                                src[i].smimm = true;
                                 break;
 
                         case QFILE_VPM:
@@ -405,7 +403,10 @@
                         c->qpu_inst_count);
         }
 
-        if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+        /* The QPU cycle estimates are pretty broken (see waddr_latency()), so
+         * don't report them for now.
+         */
+        if (false) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
                         vir_get_stage_name(c),
                         c->program_id, c->variant_id,
diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
index 6c8ea61..d3ea362 100644
--- a/src/broadcom/meson.build
+++ b/src/broadcom/meson.build
@@ -22,36 +22,37 @@
 
 subdir('cle')
 
-vc5_versions = ['33', '41', '42']
+v3d_versions = ['33', '41', '42']
+v3d_libs = []
 
-if with_gallium_vc5
+if with_gallium_v3d
   subdir('compiler')
   subdir('qpu')
+endif
 
-  per_version_libs = []
-  foreach ver : vc5_versions
-    per_version_libs += static_library(
-      'libbroadcom-v' + ver,
-      [
-        files('clif/v3dx_dump.c'),
-        v3d_xml_pack
-      ],
-      include_directories : [inc_common, inc_broadcom, inc_src],
-      c_args : [c_vis_args, no_override_init_args, '-DV3D_VERSION=' + ver],
-      dependencies: [dep_valgrind, dep_thread],
-    )
-  endforeach
-
-  libbroadcom_vc5 = static_library(
-    'libbroadcom_vc5',
+per_version_libs = []
+foreach ver : v3d_versions
+  per_version_libs += static_library(
+    'libbroadcom-v' + ver,
     [
-      files('common/v3d_debug.c', 'clif/clif_dump.c'),
-      v3d_xml_pack,
+      files('clif/v3dx_dump.c'),
+      v3d_xml_pack
     ],
     include_directories : [inc_common, inc_broadcom, inc_src],
-    c_args : [c_vis_args, no_override_init_args],
-    link_whole : [libbroadcom_compiler, libbroadcom_qpu] + per_version_libs,
-    build_by_default : false,
+    c_args : [c_vis_args, no_override_init_args, '-DV3D_VERSION=' + ver],
     dependencies: [dep_valgrind, dep_thread],
   )
-endif
+endforeach
+
+libbroadcom_v3d = static_library(
+  'libbroadcom_v3d',
+  [
+    files('common/v3d_debug.c', 'clif/clif_dump.c'),
+    v3d_xml_pack,
+  ],
+  include_directories : [inc_common, inc_broadcom, inc_src],
+  c_args : [c_vis_args, no_override_init_args],
+  link_whole : v3d_libs + per_version_libs,
+  build_by_default : false,
+  dependencies: [dep_valgrind, dep_thread],
+)
diff --git a/src/broadcom/qpu/meson.build b/src/broadcom/qpu/meson.build
index 5521a80..8a40016 100644
--- a/src/broadcom/qpu/meson.build
+++ b/src/broadcom/qpu/meson.build
@@ -33,6 +33,8 @@
   build_by_default : false,
 )
 
+v3d_libs += libbroadcom_qpu
+
 test(
   'qpu_disasm',
   executable(
diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
index 213a082..0846cc8 100644
--- a/src/broadcom/qpu/qpu_instr.c
+++ b/src/broadcom/qpu/qpu_instr.c
@@ -106,7 +106,8 @@
                 [V3D_QPU_A_NEG] = "neg",
                 [V3D_QPU_A_FLAPUSH] = "flapush",
                 [V3D_QPU_A_FLBPUSH] = "flbpush",
-                [V3D_QPU_A_FLBPOP] = "flbpop",
+                [V3D_QPU_A_FLPOP] = "flpop",
+                [V3D_QPU_A_RECIP] = "recip",
                 [V3D_QPU_A_SETMSF] = "setmsf",
                 [V3D_QPU_A_SETREVF] = "setrevf",
                 [V3D_QPU_A_NOP] = "nop",
@@ -135,6 +136,11 @@
                 [V3D_QPU_A_LDVPMD_IN] = "ldvpmd_in",
                 [V3D_QPU_A_LDVPMD_OUT] = "ldvpmd_out",
                 [V3D_QPU_A_LDVPMP] = "ldvpmp",
+                [V3D_QPU_A_RSQRT] = "rsqrt",
+                [V3D_QPU_A_EXP] = "exp",
+                [V3D_QPU_A_LOG] = "log",
+                [V3D_QPU_A_SIN] = "sin",
+                [V3D_QPU_A_RSQRT2] = "rsqrt2",
                 [V3D_QPU_A_LDVPMG_IN] = "ldvpmg_in",
                 [V3D_QPU_A_LDVPMG_OUT] = "ldvpmg_out",
                 [V3D_QPU_A_FCMP] = "fcmp",
@@ -368,7 +374,8 @@
         [V3D_QPU_A_NEG] = D | A,
         [V3D_QPU_A_FLAPUSH] = D | A,
         [V3D_QPU_A_FLBPUSH] = D | A,
-        [V3D_QPU_A_FLBPOP] = D | A,
+        [V3D_QPU_A_FLPOP] = D | A,
+        [V3D_QPU_A_RECIP] = D | A,
         [V3D_QPU_A_SETMSF] = D | A,
         [V3D_QPU_A_SETREVF] = D | A,
         [V3D_QPU_A_NOP] = 0,
@@ -401,6 +408,11 @@
         [V3D_QPU_A_LDVPMD_IN] = D | A,
         [V3D_QPU_A_LDVPMD_OUT] = D | A,
         [V3D_QPU_A_LDVPMP] = D | A,
+        [V3D_QPU_A_RSQRT] = D | A,
+        [V3D_QPU_A_EXP] = D | A,
+        [V3D_QPU_A_LOG] = D | A,
+        [V3D_QPU_A_SIN] = D | A,
+        [V3D_QPU_A_RSQRT2] = D | A,
         [V3D_QPU_A_LDVPMG_IN] = D | A | B,
         [V3D_QPU_A_LDVPMG_OUT] = D | A | B,
 
@@ -514,6 +526,14 @@
 }
 
 bool
+v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst)
+{
+        return (inst->sig.ldtmu ||
+                (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
+                 inst->alu.add.op == V3D_QPU_A_TMUWT));
+}
+
+bool
 v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr)
 {
         return (waddr == V3D_QPU_WADDR_TLB ||
@@ -591,6 +611,36 @@
 }
 
 bool
+v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
+{
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                switch (inst->alu.add.op) {
+                case V3D_QPU_A_RECIP:
+                case V3D_QPU_A_RSQRT:
+                case V3D_QPU_A_EXP:
+                case V3D_QPU_A_LOG:
+                case V3D_QPU_A_SIN:
+                case V3D_QPU_A_RSQRT2:
+                        return true;
+                default:
+                        break;
+                }
+
+                if (inst->alu.add.magic_write &&
+                    v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr)) {
+                        return true;
+                }
+
+                if (inst->alu.mul.magic_write &&
+                    v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr)) {
+                        return true;
+                }
+        }
+
+        return false;
+}
+
+bool
 v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst)
 {
         return (inst->type == V3D_QPU_INSTR_TYPE_ALU &&
diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
index e5e9a9a..c2b4ebd 100644
--- a/src/broadcom/qpu/qpu_instr.h
+++ b/src/broadcom/qpu/qpu_instr.h
@@ -165,7 +165,8 @@
         V3D_QPU_A_NEG,
         V3D_QPU_A_FLAPUSH,
         V3D_QPU_A_FLBPUSH,
-        V3D_QPU_A_FLBPOP,
+        V3D_QPU_A_FLPOP,
+        V3D_QPU_A_RECIP,
         V3D_QPU_A_SETMSF,
         V3D_QPU_A_SETREVF,
         V3D_QPU_A_NOP,
@@ -194,6 +195,11 @@
         V3D_QPU_A_LDVPMD_IN,
         V3D_QPU_A_LDVPMD_OUT,
         V3D_QPU_A_LDVPMP,
+        V3D_QPU_A_RSQRT,
+        V3D_QPU_A_EXP,
+        V3D_QPU_A_LOG,
+        V3D_QPU_A_SIN,
+        V3D_QPU_A_RSQRT2,
         V3D_QPU_A_LDVPMG_IN,
         V3D_QPU_A_LDVPMG_OUT,
         V3D_QPU_A_FCMP,
@@ -438,6 +444,7 @@
 bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
 bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+bool v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
@@ -445,6 +452,7 @@
                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
 bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux);
 bool v3d_qpu_uses_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
 bool v3d_qpu_reads_vpm(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
index 4615fd3..70f31d7 100644
--- a/src/broadcom/qpu/qpu_pack.c
+++ b/src/broadcom/qpu/qpu_pack.c
@@ -492,7 +492,8 @@
         { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
         { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
         { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
-        { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLBPOP },
+        { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
+        { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
         { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
         { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
         { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
@@ -522,6 +523,11 @@
         { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
         { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
         { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+        { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
+        { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
+        { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
+        { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
+        { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
         { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
 
         /* FIXME: MORE COMPLICATED */
diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
index 814e032..2e8d980 100644
--- a/src/broadcom/qpu/tests/qpu_disasm.c
+++ b/src/broadcom/qpu/tests/qpu_disasm.c
@@ -40,6 +40,7 @@
         { 33, 0x3c002380b6edb000ull, "or  rf0, r3, r3      ; mov  vpm, r3" },
         { 33, 0x57403006bbb80000ull, "nop                  ; fmul  r0, rf0, r5 ; ldvpm; ldunif" },
         { 33, 0x9c094adef634b000ull, "ffloor.ifb  rf30.l, r3; fmul.pushz  rf43.l, r5, r1.h" },
+        { 33, 0xb0044c56ba326840ull, "flpop  rf22, rf33    ; fmul.pushz  rf49.l, r4.h, r1.abs" },
 
         /* vfmul input packing */
         { 33, 0x101e8b6e8aad4000ull, "fmax.nornn  rf46, r4.l, r2.l; vfmul.ifnb  rf45, r3, r5" },
@@ -83,6 +84,14 @@
         { 41, 0x3de02040f8ff7201ull, "stvpmv  1, rf8       ; mov  r1, 1" },
         { 41, 0xd8000e50bb2d3000ull, "sampid  rf16         ; fmul  rf57.h, r3, r1.l" },
 
+        /* v4.1 SFU instructions. */
+        { 41, 0xe98d60c1ba2aef80ull, "recip  rf1, rf62     ; fmul  r3.h, r2.l, r1.l; ldunifrf.rf53" },
+        { 41, 0x7d87c2debc51c000ull, "rsqrt  rf30, r4      ; fmul  rf11, r4.h, r2.h; ldunifrf.rf31" },
+        { 41, 0xb182475abc2bb000ull, "rsqrt2  rf26, r3     ; fmul  rf29.l, r2.h, r1.abs; ldunifrf.rf9" },
+        { 41, 0x79880808bc0b6900ull, "sin  rf8, rf36       ; fmul  rf32, r2.h, r0.l; ldunifrf.rf32" },
+        { 41, 0x04092094bc5a28c0ull, "exp.ifb  rf20, r2    ; add  r2, rf35, r2" },
+        { 41, 0xe00648bfbc32a000ull, "log  rf63, r2        ; fmul.andnn  rf34.h, r4.l, r1.abs" },
+
         /* v4.2 changes */
         { 42, 0x3c203192bb814000ull, "barrierid  syncb     ; nop               ; thrsw" },
 };
@@ -112,9 +121,10 @@
         for (int i = 0; i < ARRAY_SIZE(tests); i++) {
                 devinfo.ver = tests[i].ver;
 
-                printf("Testing v%d.%d 0x%016llx... ",
+                printf("Testing v%d.%d 0x%016llx (\"%s\")... ",
                        devinfo.ver / 10, devinfo.ver % 10,
-                       (long long)tests[i].inst);
+                       (long long)tests[i].inst,
+                        tests[i].expected);
 
                 const char *disasm_output = v3d_qpu_disasm(&devinfo,
                                                            tests[i].inst);
diff --git a/src/compiler/Makefile.glsl.am b/src/compiler/Makefile.glsl.am
index ad19b14..02a7f43 100644
--- a/src/compiler/Makefile.glsl.am
+++ b/src/compiler/Makefile.glsl.am
@@ -31,7 +31,6 @@
 	SConscript.glsl
 
 TESTS += glsl/glcpp/tests/glcpp-test.sh			\
-	glsl/glcpp/tests/glcpp-test-cr-lf.sh		\
 	glsl/tests/blob-test				\
 	glsl/tests/cache-test				\
 	glsl/tests/general-ir-test			\
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index aca9dab..27a54e0 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -25,6 +25,16 @@
 	glsl/builtin_types.cpp \
 	glsl/builtin_variables.cpp \
 	glsl/generate_ir.cpp \
+	glsl/gl_nir_lower_atomics.c \
+	glsl/gl_nir_lower_samplers.c \
+	glsl/gl_nir_lower_samplers_as_deref.c \
+	glsl/gl_nir_link_atomics.c \
+	glsl/gl_nir_link_uniform_initializers.c \
+	glsl/gl_nir_link_uniforms.c \
+	glsl/gl_nir_link_xfb.c \
+	glsl/gl_nir_linker.c \
+	glsl/gl_nir_linker.h \
+	glsl/gl_nir.h \
 	glsl/glsl_parser_extras.cpp \
 	glsl/glsl_parser_extras.h \
 	glsl/glsl_symbol_table.cpp \
@@ -67,6 +77,8 @@
 	glsl/ir_visitor.h \
 	glsl/linker.cpp \
 	glsl/linker.h \
+	glsl/linker_util.h \
+	glsl/linker_util.cpp \
 	glsl/link_atomics.cpp \
 	glsl/link_functions.cpp \
 	glsl/link_interface_blocks.cpp \
@@ -118,7 +130,6 @@
 	glsl/opt_constant_folding.cpp \
 	glsl/opt_constant_propagation.cpp \
 	glsl/opt_constant_variable.cpp \
-	glsl/opt_copy_propagation.cpp \
 	glsl/opt_copy_propagation_elements.cpp \
 	glsl/opt_dead_builtin_variables.cpp \
 	glsl/opt_dead_builtin_varyings.cpp \
@@ -192,12 +203,17 @@
 	nir/nir.c \
 	nir/nir.h \
 	nir/nir_builder.h \
+	nir/nir_builtin_builder.c \
+	nir/nir_builtin_builder.h \
 	nir/nir_clone.c \
 	nir/nir_constant_expressions.h \
 	nir/nir_control_flow.c \
 	nir/nir_control_flow.h \
 	nir/nir_control_flow_private.h \
+	nir/nir_deref.c \
+	nir/nir_deref.h \
 	nir/nir_dominance.c \
+	nir/nir_format_convert.h \
 	nir/nir_from_ssa.c \
 	nir/nir_gather_info.c \
 	nir/nir_gs_count_vertices.c \
@@ -208,12 +224,12 @@
 	nir/nir_liveness.c \
 	nir/nir_loop_analyze.c \
 	nir/nir_loop_analyze.h \
-	nir/nir_lower_64bit_packing.c \
 	nir/nir_lower_alpha_test.c \
+	nir/nir_lower_alu.c \
 	nir/nir_lower_alu_to_scalar.c \
-	nir/nir_lower_atomics.c \
 	nir/nir_lower_atomics_to_ssbo.c \
 	nir/nir_lower_bitmap.c \
+	nir/nir_lower_bit_size.c \
 	nir/nir_lower_clamp_color_outputs.c \
 	nir/nir_lower_clip.c \
 	nir/nir_lower_clip_cull_distance_arrays.c \
@@ -231,14 +247,12 @@
 	nir/nir_lower_io_arrays_to_elements.c \
 	nir/nir_lower_io_to_temporaries.c \
 	nir/nir_lower_io_to_scalar.c \
-	nir/nir_lower_io_types.c \
+	nir/nir_lower_packing.c \
 	nir/nir_lower_passthrough_edgeflags.c \
 	nir/nir_lower_patch_vertices.c \
 	nir/nir_lower_phis_to_scalar.c \
 	nir/nir_lower_regs_to_ssa.c \
 	nir/nir_lower_returns.c \
-	nir/nir_lower_samplers.c \
-	nir/nir_lower_samplers_as_deref.c \
 	nir/nir_lower_subgroups.c \
 	nir/nir_lower_system_values.c \
 	nir/nir_lower_tex.c \
@@ -250,6 +264,7 @@
 	nir/nir_lower_wpos_center.c \
 	nir/nir_lower_wpos_ytransform.c \
 	nir/nir_metadata.c \
+	nir/nir_move_load_const.c \
 	nir/nir_move_vec_src_uses_to_dest.c \
 	nir/nir_normalize_cubemap_coords.c \
 	nir/nir_opt_conditional_discard.c \
@@ -264,6 +279,7 @@
 	nir/nir_opt_if.c \
 	nir/nir_opt_intrinsics.c \
 	nir/nir_opt_loop_unroll.c \
+	nir/nir_opt_large_constants.c \
 	nir/nir_opt_move_comparisons.c \
 	nir/nir_opt_move_load_ubo.c \
 	nir/nir_opt_peephole_select.c \
@@ -282,6 +298,7 @@
 	nir/nir_search_helpers.h \
 	nir/nir_serialize.c \
 	nir/nir_serialize.h \
+	nir/nir_split_per_member_structs.c \
 	nir/nir_split_var_copies.c \
 	nir/nir_sweep.c \
 	nir/nir_to_lcssa.c \
diff --git a/src/compiler/blob.c b/src/compiler/blob.c
index 5e8671b..c89092e 100644
--- a/src/compiler/blob.c
+++ b/src/compiler/blob.c
@@ -291,6 +291,13 @@
    memcpy(dest, bytes, size);
 }
 
+void
+blob_skip_bytes(struct blob_reader *blob, size_t size)
+{
+   if (ensure_can_read (blob, size)) /* otherwise position is unchanged */
+      blob->current += size;
+}
+
 /* These next three read functions have identical form. If we add any beyond
  * these first three we should probably switch to generating these with a
  * preprocessor macro.
diff --git a/src/compiler/blob.h b/src/compiler/blob.h
index 2b975d4..b56fa4b 100644
--- a/src/compiler/blob.h
+++ b/src/compiler/blob.h
@@ -295,6 +295,12 @@
 blob_copy_bytes(struct blob_reader *blob, void *dest, size_t size);
 
 /**
+ * Skip \size bytes within the blob.
+ */
+void
+blob_skip_bytes(struct blob_reader *blob, size_t size);
+
+/**
  * Read a uint32_t from the current location, (and update the current location
  * to just past this uint32_t).
  *
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 9b88ff5..4d5e045 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -626,6 +626,16 @@
           * Flag set if GL_ARB_post_depth_coverage layout qualifier is used.
           */
          unsigned post_depth_coverage:1;
+
+         /**
+          * Flags for the layout qualifiers added by ARB_fragment_shader_interlock
+          */
+
+         unsigned pixel_interlock_ordered:1;
+         unsigned pixel_interlock_unordered:1;
+         unsigned sample_interlock_ordered:1;
+         unsigned sample_interlock_unordered:1;
+
          /**
           * Flag set if GL_INTEL_conservartive_rasterization layout qualifier
           * is used.
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 22d58e4..1fa3f75 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -348,6 +348,49 @@
    return true;
 }
 
+struct copy_index_deref_data {
+   void *mem_ctx;                   /* context used to allocate new IR */
+   exec_list *before_instructions;  /* receives temp decls and copies */
+};
+
+static void
+copy_index_derefs_to_temps(ir_instruction *ir, void *data)
+{ /* visit_tree() callback; data points at a copy_index_deref_data. */
+   struct copy_index_deref_data *d = (struct copy_index_deref_data *)data;
+
+   if (ir->ir_type == ir_type_dereference_array) {
+      ir_dereference_array *a = (ir_dereference_array *) ir;
+      ir = a->array->as_dereference(); /* NOTE(review): not read again below */
+
+      ir_rvalue *idx = a->array_index;
+      if (idx->as_dereference_variable()) { /* only plain variable indices */
+         ir_variable *var = idx->variable_referenced();
+
+         /* If the index is read only it cannot change so there is no need
+          * to copy it.
+          */
+         if (var->data.read_only || var->data.memory_read_only)
+            return;
+
+         ir_variable *tmp = new(d->mem_ctx) ir_variable(idx->type, "idx_tmp",
+                                                        ir_var_temporary);
+         d->before_instructions->push_tail(tmp); /* declare the temporary */
+
+         ir_dereference_variable *const deref_tmp_1 =
+            new(d->mem_ctx) ir_dereference_variable(tmp);
+         ir_assignment *const assignment =
+            new(d->mem_ctx) ir_assignment(deref_tmp_1,
+                                          idx->clone(d->mem_ctx, NULL));
+         d->before_instructions->push_tail(assignment); /* idx_tmp = index */
+
+         /* Replace the array index with a dereference of the new temporary */
+         ir_dereference_variable *const deref_tmp_2 =
+            new(d->mem_ctx) ir_dereference_variable(tmp);
+         a->array_index = deref_tmp_2;
+      }
+   }
+}
+
 static void
 fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type,
               exec_list *before_instructions, exec_list *after_instructions,
@@ -362,6 +405,17 @@
        && (expr == NULL || expr->operation != ir_binop_vector_extract))
       return;
 
+   /* An array index could also be an out variable, so we need to make a copy
+    * of each such index before the function is called.
+    */
+   if (!actual->as_dereference_variable()) {
+      struct copy_index_deref_data data;
+      data.mem_ctx = mem_ctx;
+      data.before_instructions = before_instructions;
+
+      visit_tree(actual, copy_index_derefs_to_temps, &data);
+   }
+
    /* To convert an out parameter, we need to create a temporary variable to
     * hold the value before conversion, and then perform the conversion after
     * the function call returns.
@@ -529,7 +583,8 @@
     * If the function call is a constant expression, don't generate any
     * instructions; just generate an ir_constant.
     */
-   if (state->is_version(120, 100)) {
+   if (state->is_version(120, 100) ||
+       state->ctx->Const.AllowGLSLBuiltinConstantExpression) {
       ir_constant *value = sig->constant_expression_value(ctx,
                                                           actual_parameters,
                                                           NULL);
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 88c70c4..d3f7a0f 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1397,8 +1397,7 @@
 
    switch (this->oper) {
    case ast_aggregate:
-      assert(!"ast_aggregate: Should never get here.");
-      break;
+      unreachable("ast_aggregate: Should never get here.");
 
    case ast_assign: {
       this->subexpressions[0]->set_is_lhs(true);
@@ -1684,6 +1683,12 @@
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
       orig_type = op[0]->type;
+
+      /* Break out if operand types were not parsed successfully. */
+      if ((op[0]->type == glsl_type::error_type ||
+           op[1]->type == glsl_type::error_type))
+         break;
+
       type = arithmetic_result_type(op[0], op[1],
                                     (this->oper == ast_mul_assign),
                                     state, & loc);
@@ -1851,9 +1856,11 @@
        *   expressions; such use results in a compile-time error."
        */
       if (type->contains_opaque()) {
-         _mesa_glsl_error(&loc, state, "opaque variables cannot be operands "
-                          "of the ?: operator");
-         error_emitted = true;
+         if (!(state->has_bindless() && (type->is_image() || type->is_sampler()))) {
+            _mesa_glsl_error(&loc, state, "variables of type %s cannot be "
+                             "operands of the ?: operator", type->name);
+            error_emitted = true;
+         }
       }
 
       ir_constant *cond_val = op[0]->constant_expression_value(ctx);
@@ -1979,15 +1986,13 @@
    }
 
    case ast_unsized_array_dim:
-      assert(!"ast_unsized_array_dim: Should never get here.");
-      break;
+      unreachable("ast_unsized_array_dim: Should never get here.");
 
    case ast_function_call:
       /* Should *NEVER* get here.  ast_function_call should always be handled
        * by ast_function_expression::hir.
        */
-      assert(0);
-      break;
+      unreachable("ast_function_call: handled elsewhere ");
 
    case ast_identifier: {
       /* ast_identifier can appear several places in a full abstract syntax
@@ -3905,6 +3910,16 @@
 
    if (state->has_bindless())
       apply_bindless_qualifier_to_variable(qual, var, state, loc);
+
+   if (qual->flags.q.pixel_interlock_ordered ||
+       qual->flags.q.pixel_interlock_unordered ||
+       qual->flags.q.sample_interlock_ordered ||
+       qual->flags.q.sample_interlock_unordered) {
+      _mesa_glsl_error(loc, state, "interlock layout qualifiers: "
+                       "pixel_interlock_ordered, pixel_interlock_unordered, "
+                       "sample_interlock_ordered and sample_interlock_unordered, "
+                       "only valid in fragment shader input layout declaration.");
+   }
 }
 
 static void
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 14ea936..c2b6e6b 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -637,6 +637,10 @@
       valid_in_mask.flags.q.early_fragment_tests = 1;
       valid_in_mask.flags.q.inner_coverage = 1;
       valid_in_mask.flags.q.post_depth_coverage = 1;
+      valid_in_mask.flags.q.pixel_interlock_ordered = 1;
+      valid_in_mask.flags.q.pixel_interlock_unordered = 1;
+      valid_in_mask.flags.q.sample_interlock_ordered = 1;
+      valid_in_mask.flags.q.sample_interlock_unordered = 1;
       break;
    case MESA_SHADER_COMPUTE:
       valid_in_mask.flags.q.local_size = 7;
@@ -708,6 +712,35 @@
       r = false;
    }
 
+   if (state->in_qualifier->flags.q.pixel_interlock_ordered) {
+      state->fs_pixel_interlock_ordered = true;
+      state->in_qualifier->flags.q.pixel_interlock_ordered = false;
+   }
+
+   if (state->in_qualifier->flags.q.pixel_interlock_unordered) {
+      state->fs_pixel_interlock_unordered = true;
+      state->in_qualifier->flags.q.pixel_interlock_unordered = false;
+   }
+
+   if (state->in_qualifier->flags.q.sample_interlock_ordered) {
+      state->fs_sample_interlock_ordered = true;
+      state->in_qualifier->flags.q.sample_interlock_ordered = false;
+   }
+
+   if (state->in_qualifier->flags.q.sample_interlock_unordered) {
+      state->fs_sample_interlock_unordered = true;
+      state->in_qualifier->flags.q.sample_interlock_unordered = false;
+   }
+
+   if (state->fs_pixel_interlock_ordered +
+       state->fs_pixel_interlock_unordered +
+       state->fs_sample_interlock_ordered +
+       state->fs_sample_interlock_unordered > 1) {
+      _mesa_glsl_error(loc, state,
+                       "only one interlock mode can be used at any time.");
+      r = false;
+   }
+
    /* We allow the creation of multiple cs_input_layout nodes. Coherence among
     * all existing nodes is checked later, when the AST node is transformed
     * into HIR.
@@ -776,7 +809,7 @@
                     "%s '%s':"
                     "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s"
                     "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s"
-                    "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+                    "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
                     message, name,
                     bad.flags.q.invariant ? " invariant" : "",
                     bad.flags.q.precise ? " precise" : "",
@@ -840,6 +873,10 @@
                     bad.flags.q.bound_sampler ? " bound_sampler" : "",
                     bad.flags.q.bound_image ? " bound_image" : "",
                     bad.flags.q.post_depth_coverage ? " post_depth_coverage" : "",
+                    bad.flags.q.pixel_interlock_ordered ? " pixel_interlock_ordered" : "",
+                    bad.flags.q.pixel_interlock_unordered ? " pixel_interlock_unordered": "",
+                    bad.flags.q.sample_interlock_ordered ? " sample_interlock_ordered": "",
+                    bad.flags.q.sample_interlock_unordered ? " sample_interlock_unordered": "",
                     bad.flags.q.non_coherent ? " noncoherent" : "");
    return false;
 }
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index e1ee994..7119903 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -446,7 +446,8 @@
 {
    return state->stage == MESA_SHADER_FRAGMENT &&
           (state->is_version(110, 300) ||
-           state->OES_standard_derivatives_enable);
+           state->OES_standard_derivatives_enable ||
+           state->ctx->Const.AllowGLSLRelaxedES);
 }
 
 static bool
@@ -513,6 +514,12 @@
 }
 
 static bool
+supports_arb_fragment_shader_interlock(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_fragment_shader_interlock_enable;
+}
+
+static bool
 shader_clock(const _mesa_glsl_parse_state *state)
 {
    return state->ARB_shader_clock_enable;
@@ -982,6 +989,14 @@
    ir_function_signature *_read_invocation_intrinsic(const glsl_type *type);
    ir_function_signature *_read_invocation(const glsl_type *type);
 
+
+   ir_function_signature *_invocation_interlock_intrinsic(
+      builtin_available_predicate avail,
+      enum ir_intrinsic_id id);
+   ir_function_signature *_invocation_interlock(
+      const char *intrinsic_name,
+      builtin_available_predicate avail);
+
    ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail,
                                                   const glsl_type *type);
    ir_function_signature *_shader_clock(builtin_available_predicate avail,
@@ -1219,6 +1234,16 @@
                                           ir_intrinsic_memory_barrier_shared),
                 NULL);
 
+   add_function("__intrinsic_begin_invocation_interlock",
+                _invocation_interlock_intrinsic(
+                   supports_arb_fragment_shader_interlock,
+                   ir_intrinsic_begin_invocation_interlock), NULL);
+
+   add_function("__intrinsic_end_invocation_interlock",
+                _invocation_interlock_intrinsic(
+                   supports_arb_fragment_shader_interlock,
+                   ir_intrinsic_end_invocation_interlock), NULL);
+
    add_function("__intrinsic_shader_clock",
                 _shader_clock_intrinsic(shader_clock,
                                         glsl_type::uvec2_type),
@@ -3294,6 +3319,18 @@
                               glsl_type::uint64_t_type),
                 NULL);
 
+   add_function("beginInvocationInterlockARB",
+                _invocation_interlock(
+                   "__intrinsic_begin_invocation_interlock",
+                   supports_arb_fragment_shader_interlock),
+                NULL);
+
+   add_function("endInvocationInterlockARB",
+                _invocation_interlock(
+                   "__intrinsic_end_invocation_interlock",
+                   supports_arb_fragment_shader_interlock),
+                NULL);
+
    add_function("anyInvocationARB",
                 _vote("__intrinsic_vote_any", vote),
                 NULL);
@@ -6228,6 +6265,24 @@
 }
 
 ir_function_signature *
+builtin_builder::_invocation_interlock_intrinsic(builtin_available_predicate avail,
+                                                 enum ir_intrinsic_id id)
+{
+   MAKE_INTRINSIC(glsl_type::void_type, id, avail, 0);
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_invocation_interlock(const char *intrinsic_name,
+                                       builtin_available_predicate avail)
+{
+   MAKE_SIG(glsl_type::void_type, avail, 0);
+   body.emit(call(shader->symbols->get_function(intrinsic_name),
+                  NULL, sig->parameters));
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_shader_clock_intrinsic(builtin_available_predicate avail,
                                          const glsl_type *type)
 {
diff --git a/src/intel/tools/gen_disasm.h b/src/compiler/glsl/gl_nir.h
similarity index 60%
copy from src/intel/tools/gen_disasm.h
copy to src/compiler/glsl/gl_nir.h
index c8c18b2..59d5f65 100644
--- a/src/intel/tools/gen_disasm.h
+++ b/src/compiler/glsl/gl_nir.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Intel Corporation
+ * Copyright © 2018 Timothy Arceri
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -16,30 +16,32 @@
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
  */
 
-#ifndef GEN_DISASM_H
-#define GEN_DISASM_H
-
-#include "intel/dev/gen_device_info.h"
+#ifndef GL_NIR_H
+#define GL_NIR_H
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct gen_disasm;
+struct nir_shader;
+struct gl_shader_program;
 
-struct gen_disasm *gen_disasm_create(const struct gen_device_info *devinfo);
-void gen_disasm_disassemble(struct gen_disasm *disasm,
-                            void *assembly, int start, FILE *out);
+bool gl_nir_lower_atomics(nir_shader *shader,
+                          const struct gl_shader_program *shader_program,
+                          bool use_binding_as_idx);
 
-void gen_disasm_destroy(struct gen_disasm *disasm);
+bool gl_nir_lower_samplers(nir_shader *shader,
+                           const struct gl_shader_program *shader_program);
+bool gl_nir_lower_samplers_as_deref(nir_shader *shader,
+                                    const struct gl_shader_program *shader_program);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* GEN_DISASM_H */
+#endif /* GL_NIR_H */
diff --git a/src/compiler/glsl/gl_nir_link_atomics.c b/src/compiler/glsl/gl_nir_link_atomics.c
new file mode 100644
index 0000000..887ac1b
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_link_atomics.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "linker_util.h"
+#include "gl_nir_linker.h"
+#include "compiler/glsl/ir_uniform.h" /* for gl_uniform_storage */
+#include "main/context.h"
+
+/* This file does the common linking for GLSL atomic counter uniforms using
+ * NIR, instead of IR as its counterpart glsl/link_uniforms.cpp does.
+ *
+ * Also note that this is tailored for ARB_gl_spirv needs and particularities
+ */
+
+struct active_atomic_counter_uniform {
+   unsigned loc;      /* index into prog->data->UniformStorage */
+   nir_variable *var; /* NIR uniform variable this entry was recorded for */
+};
+
+struct active_atomic_buffer {
+   struct active_atomic_counter_uniform *uniforms; /* grown on demand */
+   unsigned num_uniforms;        /* entries of uniforms[] in use */
+   unsigned uniform_buffer_size; /* allocated capacity of uniforms[] */
+   unsigned stage_counter_references[MESA_SHADER_STAGES]; /* per stage */
+   unsigned size; /* max (offset + atomic size) seen; 0 = binding unused */
+};
+
+static void
+add_atomic_counter(const void *ctx,
+                   struct active_atomic_buffer *buffer,
+                   unsigned uniform_loc,
+                   nir_variable *var)
+{ /* Append one (uniform_loc, var) record to buffer->uniforms. */
+   if (buffer->num_uniforms >= buffer->uniform_buffer_size) { /* full */
+      if (buffer->uniform_buffer_size == 0)
+         buffer->uniform_buffer_size = 1;
+      else
+         buffer->uniform_buffer_size *= 2; /* capacity goes 1, 2, 4, ... */
+      buffer->uniforms = reralloc(ctx, /* NOTE(review): result unchecked */
+                                  buffer->uniforms,
+                                  struct active_atomic_counter_uniform,
+                                  buffer->uniform_buffer_size);
+   }
+
+   struct active_atomic_counter_uniform *uniform =
+      buffer->uniforms + buffer->num_uniforms; /* first free slot */
+   uniform->loc = uniform_loc;
+   uniform->var = var;
+   buffer->num_uniforms++;
+}
+
+static void
+process_atomic_variable(const struct glsl_type *t,
+                        struct gl_shader_program *prog,
+                        unsigned *uniform_loc,
+                        nir_variable *var,
+                        struct active_atomic_buffer *buffers,
+                        unsigned *num_buffers,
+                        int *offset,
+                        unsigned shader_stage)
+{ /* Record var's counters in the buffer for its binding; advance *offset. */
+   /* FIXME: Arrays of arrays get counted separately. For example:
+    * x1[3][3][2] = 9 uniforms, 18 atomic counters
+    * x2[3][2]    = 3 uniforms, 6 atomic counters
+    * x3[2]       = 1 uniform, 2 atomic counters
+    *
+    * However this code marks all the counters as active even when they
+    * might not be used.
+    */
+   if (glsl_type_is_array(t) &&
+       glsl_type_is_array(glsl_get_array_element(t))) {
+      for (unsigned i = 0; i < glsl_get_length(t); i++) { /* outer dim */
+         process_atomic_variable(glsl_get_array_element(t),
+                                 prog,
+                                 uniform_loc,
+                                 var,
+                                 buffers, num_buffers,
+                                 offset,
+                                 shader_stage);
+      }
+   } else {
+      struct active_atomic_buffer *buf = buffers + var->data.binding;
+      struct gl_uniform_storage *const storage =
+         &prog->data->UniformStorage[*uniform_loc];
+
+      /* First use of this binding (size still 0): count one more buffer.
+       * NOTE(review): binding is not range-checked here; confirm callers do.
+       */
+      if (buf->size == 0)
+         (*num_buffers)++;
+
+      add_atomic_counter(buffers, /* ctx */
+                         buf,
+                         *uniform_loc,
+                         var);
+
+      /* When checking for atomic counters we should count every member in
+       * an array as an atomic counter reference.
+       */
+      if (glsl_type_is_array(t))
+         buf->stage_counter_references[shader_stage] += glsl_get_length(t);
+      else
+         buf->stage_counter_references[shader_stage]++;
+      buf->size = MAX2(buf->size, *offset + glsl_atomic_size(t));
+
+      storage->offset = *offset;
+      *offset += glsl_atomic_size(t); /* next uniform starts after this one */
+
+      (*uniform_loc)++; /* one UniformStorage slot per innermost array */
+   }
+}
+
+static struct active_atomic_buffer *
+find_active_atomic_counters(struct gl_context *ctx,
+                            struct gl_shader_program *prog,
+                            unsigned *num_buffers)
+{ /* Scan the NIR uniforms of every linked stage for atomic counters. */
+   struct active_atomic_buffer *buffers =
+      rzalloc_array(NULL, /* ctx */
+                    struct active_atomic_buffer,
+                    ctx->Const.MaxAtomicBufferBindings); /* one per binding */
+   *num_buffers = 0;
+
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
+      struct gl_linked_shader *sh = prog->_LinkedShaders[i];
+      if (sh == NULL) /* stage not present in this program */
+         continue;
+
+      nir_shader *nir = sh->Program->nir;
+
+      nir_foreach_variable(var, &nir->uniforms) {
+         if (!glsl_contains_atomic(var->type))
+            continue;
+
+         int offset = var->data.offset;
+         unsigned uniform_loc = var->data.location;
+
+         process_atomic_variable(var->type,
+                                 prog,
+                                 &uniform_loc,
+                                 var,
+                                 buffers,
+                                 num_buffers,
+                                 &offset,
+                                 i);
+      }
+   }
+
+   return buffers; /* caller releases it with ralloc_free() */
+}
+
+void
+gl_nir_link_assign_atomic_counter_resources(struct gl_context *ctx,
+                                            struct gl_shader_program *prog)
+{ /* Fill prog->data->AtomicBuffers and each stage's AtomicBuffers list. */
+   unsigned num_buffers;
+   unsigned num_atomic_buffers[MESA_SHADER_STAGES] = {0};
+   struct active_atomic_buffer *abs =
+      find_active_atomic_counters(ctx, prog, &num_buffers);
+
+   prog->data->AtomicBuffers =
+      rzalloc_array(prog->data, struct gl_active_atomic_buffer, num_buffers);
+   prog->data->NumAtomicBuffers = num_buffers;
+
+   unsigned buffer_idx = 0; /* next free slot in prog->data->AtomicBuffers */
+   for (unsigned binding = 0;
+        binding < ctx->Const.MaxAtomicBufferBindings;
+        binding++) {
+
+      /* If the binding was not used (its recorded size is still 0), skip.
+       */
+      if (abs[binding].size == 0)
+         continue;
+
+      struct active_atomic_buffer *ab = abs + binding;
+      struct gl_active_atomic_buffer *mab =
+         prog->data->AtomicBuffers + buffer_idx;
+
+      /* Assign buffer-specific fields. */
+      mab->Binding = binding;
+      mab->MinimumSize = ab->size;
+      mab->Uniforms = rzalloc_array(prog->data->AtomicBuffers, GLuint,
+                                    ab->num_uniforms);
+      mab->NumUniforms = ab->num_uniforms;
+
+      /* Assign counter-specific fields. */
+      for (unsigned j = 0; j < ab->num_uniforms; j++) {
+         nir_variable *var = ab->uniforms[j].var;
+         struct gl_uniform_storage *storage =
+            &prog->data->UniformStorage[ab->uniforms[j].loc];
+
+         mab->Uniforms[j] = ab->uniforms[j].loc;
+
+         storage->atomic_buffer_index = buffer_idx;
+         storage->offset = var->data.offset; /* NOTE(review): see below */
+         if (glsl_type_is_array(var->type)) {
+            const struct glsl_type *without_array =
+               glsl_without_array(var->type);
+            storage->array_stride = glsl_atomic_size(without_array);
+         } else {
+            storage->array_stride = 0;
+         }
+         if (!glsl_type_is_matrix(var->type))
+            storage->matrix_stride = 0;
+      }
+
+      /* Assign stage-specific fields. */
+      for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
+         if (ab->stage_counter_references[stage]) {
+            mab->StageReferences[stage] = GL_TRUE;
+            num_atomic_buffers[stage]++;
+         } else {
+            mab->StageReferences[stage] = GL_FALSE;
+         }
+      }
+
+      buffer_idx++;
+   }
+
+   /* Store a list of pointers to atomic buffers per stage and store the index
+    * to the intra-stage buffer list in uniform storage.
+    */
+   for (unsigned stage = 0; stage < MESA_SHADER_STAGES; ++stage) {
+      if (prog->_LinkedShaders[stage] == NULL ||
+          num_atomic_buffers[stage] <= 0) /* unsigned, so this is == 0 */
+         continue;
+
+      struct gl_program *gl_prog = prog->_LinkedShaders[stage]->Program;
+      gl_prog->info.num_abos = num_atomic_buffers[stage];
+      gl_prog->sh.AtomicBuffers =
+         rzalloc_array(gl_prog,
+                       struct gl_active_atomic_buffer *,
+                       num_atomic_buffers[stage]);
+
+      gl_prog->nir->info.num_abos = num_atomic_buffers[stage];
+
+      unsigned intra_stage_idx = 0; /* index within this stage's list */
+      for (unsigned i = 0; i < num_buffers; i++) {
+         struct gl_active_atomic_buffer *atomic_buffer =
+            &prog->data->AtomicBuffers[i];
+         if (!atomic_buffer->StageReferences[stage])
+            continue;
+
+         gl_prog->sh.AtomicBuffers[intra_stage_idx] = atomic_buffer;
+
+         for (unsigned u = 0; u < atomic_buffer->NumUniforms; u++) {
+            GLuint uniform_loc = atomic_buffer->Uniforms[u];
+            struct gl_opaque_uniform_index *opaque =
+               prog->data->UniformStorage[uniform_loc].opaque + stage;
+            opaque->index = intra_stage_idx;
+            opaque->active = true;
+         }
+
+         intra_stage_idx++;
+      }
+   }
+
+   /* NOTE(review): storage->offset above replaces the value written by
+    * process_atomic_variable(); confirm that is intended for nested arrays. */
+   assert(buffer_idx == num_buffers);
+
+   ralloc_free(abs); /* per-buffer uniforms[] used abs as their ralloc ctx */
+}
diff --git a/src/compiler/glsl/gl_nir_link_uniform_initializers.c b/src/compiler/glsl/gl_nir_link_uniform_initializers.c
new file mode 100644
index 0000000..8eefa71
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_link_uniform_initializers.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "gl_nir_linker.h"
+#include "compiler/glsl/ir_uniform.h" /* for gl_uniform_storage */
+#include "main/context.h"
+#include "main/mtypes.h"
+
+struct set_opaque_binding_closure { /* walk state for set_opaque_binding() */
+   struct gl_shader_program *shader_prog; /* source of _LinkedShaders[] */
+   struct gl_program *prog;   /* its sh.data holds the UniformStorage array */
+   const nir_variable *var;   /* variable whose binding is being applied */
+   int binding;               /* next binding value; advanced per element */
+   int location;              /* next UniformStorage index; advanced per leaf */
+};
+/* Apply an explicit binding to every opaque uniform reachable from `type`. */
+static void
+set_opaque_binding(struct set_opaque_binding_closure *data,
+                   const struct glsl_type *type)
+{
+   if (glsl_type_is_array(type) &&
+       glsl_type_is_array(glsl_get_array_element(type))) {
+      const struct glsl_type *element_type = glsl_get_array_element(type);
+
+      for (unsigned int i = 0; i < glsl_get_length(type); i++)
+         set_opaque_binding(data, element_type);
+
+      return;
+   }
+   /* Ignore locations that fall outside the uniform storage array. */
+   if (data->location < 0 ||
+       data->location >= data->prog->sh.data->NumUniformStorage)
+      return;
+
+   struct gl_uniform_storage *storage =
+      data->prog->sh.data->UniformStorage + data->location++;
+
+   const unsigned elements = MAX2(storage->array_elements, 1);
+   /* Each element takes the next consecutive binding value. */
+   for (unsigned int i = 0; i < elements; i++)
+      storage->storage[i].i = data->binding++;
+   /* Mirror the bindings into each stage where the uniform is active. */
+   for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
+      struct gl_linked_shader *shader = data->shader_prog->_LinkedShaders[sh];
+
+      if (!shader)
+         continue;
+      if (!storage->opaque[sh].active)
+         continue;
+      /* Classify by storage->type: `type` may still be a one-level array. */
+      if (glsl_type_is_sampler(storage->type)) {
+         for (unsigned i = 0; i < elements; i++) {
+            const unsigned index = storage->opaque[sh].index + i;
+
+            if (storage->is_bindless) {
+               if (index >= shader->Program->sh.NumBindlessSamplers)
+                  break;
+               shader->Program->sh.BindlessSamplers[index].unit =
+                  storage->storage[i].i;
+               shader->Program->sh.BindlessSamplers[index].bound = true;
+               shader->Program->sh.HasBoundBindlessSampler = true;
+            } else {
+               if (index >= ARRAY_SIZE(shader->Program->SamplerUnits))
+                  break;
+               shader->Program->SamplerUnits[index] =
+                  storage->storage[i].i;
+            }
+         }
+      } else if (glsl_type_is_image(storage->type)) {
+         for (unsigned i = 0; i < elements; i++) {
+            const unsigned index = storage->opaque[sh].index + i;
+
+            if (storage->is_bindless) {
+               if (index >= shader->Program->sh.NumBindlessImages)
+                  break;
+               shader->Program->sh.BindlessImages[index].unit =
+                  storage->storage[i].i;
+               shader->Program->sh.BindlessImages[index].bound = true;
+               shader->Program->sh.HasBoundBindlessImage = true;
+            } else {
+               if (index >= ARRAY_SIZE(shader->Program->sh.ImageUnits))
+                  break;
+               shader->Program->sh.ImageUnits[index] =
+                  storage->storage[i].i;
+            }
+         }
+      }
+   }
+}
+/* Flatten a scalar/vector/matrix nir_constant into uniform storage slots. */
+static void
+copy_constant_to_storage(union gl_constant_value *storage,
+                         const nir_constant *val,
+                         const struct glsl_type *type,
+                         unsigned int boolean_true)
+{
+   const unsigned num_cols = glsl_get_matrix_columns(type);
+   const unsigned num_rows = glsl_get_vector_elements(type);
+   const enum glsl_base_type kind = glsl_get_base_type(type);
+
+   /* `slot` counts components in column-major order across the whole value. */
+   for (unsigned int col = 0, slot = 0; col < num_cols; col++) {
+      for (unsigned int r = 0; r < num_rows; r++, slot++) {
+         switch (kind) {
+         case GLSL_TYPE_UINT:
+            storage[slot].u = val->values[col].u32[r];
+            break;
+         case GLSL_TYPE_INT:
+         case GLSL_TYPE_SAMPLER:
+            storage[slot].i = val->values[col].i32[r];
+            break;
+         case GLSL_TYPE_FLOAT:
+            storage[slot].f = val->values[col].f32[r];
+            break;
+         case GLSL_TYPE_DOUBLE:
+         case GLSL_TYPE_UINT64:
+         case GLSL_TYPE_INT64:
+            /* Written at index slot * 2; callers budget two entries each.
+             * XXX need to check on big-endian */
+            memcpy(&storage[slot * 2].u,
+                   &val->values[col].f64[r],
+                   sizeof(double));
+            break;
+         case GLSL_TYPE_BOOL:
+            storage[slot].b = val->values[col].u32[r] ? boolean_true : 0;
+            break;
+         case GLSL_TYPE_ARRAY:
+         case GLSL_TYPE_STRUCT:
+         case GLSL_TYPE_IMAGE:
+         case GLSL_TYPE_ATOMIC_UINT:
+         case GLSL_TYPE_INTERFACE:
+         case GLSL_TYPE_VOID:
+         case GLSL_TYPE_SUBROUTINE:
+         case GLSL_TYPE_FUNCTION:
+         case GLSL_TYPE_ERROR:
+         case GLSL_TYPE_UINT16:
+         case GLSL_TYPE_INT16:
+         case GLSL_TYPE_UINT8:
+         case GLSL_TYPE_INT8:
+         case GLSL_TYPE_FLOAT16:
+            /* Every remaining base type is expected to have been rejected
+             * by the caller before reaching this point.
+             */
+            assert(!"Should not get here.");
+            break;
+         }
+      }
+   }
+}
+
+struct set_uniform_initializer_closure { /* state for set_uniform_initializer */
+   struct gl_shader_program *shader_prog; /* source of _LinkedShaders[] */
+   struct gl_program *prog;   /* its sh.data holds the UniformStorage array */
+   const nir_variable *var;   /* variable owning the constant initializer */
+   int location;              /* next UniformStorage index; advanced per leaf */
+   unsigned int boolean_true; /* value stored for a true bool component */
+};
+/* Copy a constant initializer into uniform storage, recursing by type. */
+static void
+set_uniform_initializer(struct set_uniform_initializer_closure *data,
+                        const struct glsl_type *type,
+                        const nir_constant *val)
+{
+   const struct glsl_type *t_without_array = glsl_without_array(type);
+   /* Structs: every field is initialised from the matching element. */
+   if (glsl_type_is_struct(type)) {
+      for (unsigned int i = 0; i < glsl_get_length(type); i++) {
+         const struct glsl_type *field_type = glsl_get_struct_field(type, i);
+         set_uniform_initializer(data, field_type, val->elements[i]);
+      }
+      return;
+   }
+   /* Arrays of structs and arrays of arrays: recurse once per element. */
+   if (glsl_type_is_struct(t_without_array) ||
+       (glsl_type_is_array(type) &&
+        glsl_type_is_array(glsl_get_array_element(type)))) {
+      const struct glsl_type *element_type = glsl_get_array_element(type);
+
+      for (unsigned int i = 0; i < glsl_get_length(type); i++)
+         set_uniform_initializer(data, element_type, val->elements[i]);
+
+      return;
+   }
+   /* Ignore locations that fall outside the uniform storage array. */
+   if (data->location < 0 ||
+       data->location >= data->prog->sh.data->NumUniformStorage)
+      return;
+
+   struct gl_uniform_storage *storage =
+      data->prog->sh.data->UniformStorage + data->location++;
+
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *element_type = glsl_get_array_element(type);
+      const enum glsl_base_type base_type = glsl_get_base_type(element_type);
+      const unsigned int elements = glsl_get_components(element_type);
+      unsigned int idx = 0;
+      unsigned dmul = glsl_base_type_is_64bit(base_type) ? 2 : 1;
+
+      assert(glsl_get_length(type) >= storage->array_elements);
+      for (unsigned int i = 0; i < storage->array_elements; i++) {
+         copy_constant_to_storage(&storage->storage[idx],
+                                  val->elements[i],
+                                  element_type,
+                                  data->boolean_true);
+         /* 64-bit components advance the write index by two entries. */
+         idx += elements * dmul;
+      }
+   } else {
+      copy_constant_to_storage(storage->storage,
+                               val,
+                               type,
+                               data->boolean_true);
+      /* A sampler's initializer also becomes its unit in each active stage. */
+      if (glsl_type_is_sampler(storage->type)) {
+         for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
+            struct gl_linked_shader *shader =
+               data->shader_prog->_LinkedShaders[sh];
+
+            if (shader && storage->opaque[sh].active) {
+               unsigned index = storage->opaque[sh].index;
+
+               shader->Program->SamplerUnits[index] = storage->storage[0].i;
+            }
+         }
+      }
+   }
+}
+/* Apply constant initializers and explicit bindings for all linked stages. */
+void
+gl_nir_set_uniform_initializers(struct gl_context *ctx,
+                                struct gl_shader_program *prog)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_linked_shader *sh = prog->_LinkedShaders[i];
+      if (!sh)
+         continue;
+
+      nir_shader *nir = sh->Program->nir;
+      assert(nir);
+      /* A variable with an initializer never reaches the binding branch. */
+      nir_foreach_variable(var, &nir->uniforms) {
+         if (var->constant_initializer) {
+            struct set_uniform_initializer_closure data = {
+               .shader_prog = prog,
+               .prog = sh->Program,
+               .var = var,
+               .location = var->data.location,
+               .boolean_true = ctx->Const.UniformBooleanTrue
+            };
+            set_uniform_initializer(&data,
+                                    var->type,
+                                    var->constant_initializer);
+         } else if (var->data.explicit_binding) {
+            const struct glsl_type *without_array =
+               glsl_without_array(var->type);
+            /* Only samplers and images have their binding applied here. */
+            if (glsl_type_is_sampler(without_array) ||
+                glsl_type_is_image(without_array)) {
+               struct set_opaque_binding_closure data = {
+                  .shader_prog = prog,
+                  .prog = sh->Program,
+                  .var = var,
+                  .binding = var->data.binding,
+                  .location = var->data.location
+               };
+               set_opaque_binding(&data, var->type);
+            }
+         }
+      }
+   }
+}
diff --git a/src/compiler/glsl/gl_nir_link_uniforms.c b/src/compiler/glsl/gl_nir_link_uniforms.c
new file mode 100644
index 0000000..f729fa9
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_link_uniforms.c
@@ -0,0 +1,522 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "gl_nir_linker.h"
+#include "compiler/glsl/ir_uniform.h" /* for gl_uniform_storage */
+#include "linker_util.h"
+#include "main/context.h"
+#include "main/mtypes.h"
+
+/* This file does the common linking of GLSL uniforms using NIR instead of
+ * IR, as the counterpart of glsl/link_uniforms.cpp.
+ *
+ * Also note that this is tailored for ARB_gl_spirv needs and particularities
+ * (like the need to work/link without names available, explicit locations
+ * being mandatory for normal uniforms, and so on).
+ */
+
+#define UNMAPPED_UNIFORM_LOC ~0u /* remap_location when no location is set */
+/* Build the uniform remap table and hand out backing storage slots. */
+static void
+nir_setup_uniform_remap_tables(struct gl_context *ctx,
+                               struct gl_shader_program *prog)
+{
+   prog->UniformRemapTable = rzalloc_array(prog,
+                                           struct gl_uniform_storage *,
+                                           prog->NumUniformRemapTable);
+   union gl_constant_value *data =
+      rzalloc_array(prog->data,
+                    union gl_constant_value, prog->data->NumUniformDataSlots);
+   if (!prog->UniformRemapTable || !data) {
+      linker_error(prog, "Out of memory during linking.\n");
+      return;
+   }
+   prog->data->UniformDataSlots = data;
+
+   unsigned data_pos = 0;
+
+   /* Reserve all the explicit locations of the active uniforms. */
+   for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
+      struct gl_uniform_storage *uniform = &prog->data->UniformStorage[i];
+
+      if (prog->data->UniformStorage[i].remap_location == UNMAPPED_UNIFORM_LOC)
+         continue;
+
+      /* How many new entries for this uniform? */
+      const unsigned entries = MAX2(1, uniform->array_elements);
+      unsigned num_slots = glsl_get_component_slots(uniform->type);
+
+      uniform->storage = &data[data_pos];
+
+      /* Set remap table entries point to correct gl_uniform_storage. */
+      for (unsigned j = 0; j < entries; j++) {
+         unsigned element_loc = uniform->remap_location + j;
+         prog->UniformRemapTable[element_loc] = uniform;
+
+         data_pos += num_slots;
+      }
+   }
+
+   /* Reserve locations for rest of the uniforms. */
+   link_util_update_empty_uniform_locations(prog);
+
+   for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
+      struct gl_uniform_storage *uniform = &prog->data->UniformStorage[i];
+
+      if (uniform->is_shader_storage)
+         continue;
+
+      /* Built-in uniforms should not get any location. */
+      if (uniform->builtin)
+         continue;
+
+      /* Explicit ones have been set already. */
+      if (uniform->remap_location != UNMAPPED_UNIFORM_LOC)
+         continue;
+
+      /* How many entries for this uniform? */
+      const unsigned entries = MAX2(1, uniform->array_elements);
+
+      unsigned location =
+         link_util_find_empty_block(prog, &prog->data->UniformStorage[i]);
+
+      if (location == -1) {
+         location = prog->NumUniformRemapTable;
+         /* Grow the table; only publish the new pointer once it is valid. */
+         struct gl_uniform_storage **table =
+            reralloc(prog, prog->UniformRemapTable, struct gl_uniform_storage *,
+                     prog->NumUniformRemapTable + entries);
+         if (!table) {
+            linker_error(prog, "Out of memory during linking.\n");
+            return;
+         }
+         prog->UniformRemapTable = table;
+         prog->NumUniformRemapTable += entries;
+      }
+
+      /* set the base location in remap table for the uniform */
+      uniform->remap_location = location;
+      uniform->storage = &data[data_pos];
+
+      unsigned num_slots = glsl_get_component_slots(uniform->type);
+      /* Set remap table entries point to correct gl_uniform_storage. */
+      for (unsigned j = 0; j < entries; j++) {
+         unsigned element_loc = uniform->remap_location + j;
+         prog->UniformRemapTable[element_loc] = uniform;
+
+         data_pos += num_slots;
+      }
+   }
+}
+/* Look up an already-created storage entry by its explicit location. */
+static struct gl_uniform_storage *
+find_previous_uniform_storage(struct gl_shader_program *prog,
+                              int location)
+{
+   /* Uniforms without an explicit location (ie: atomic counters) start out
+    * with location -1 and can never match an earlier entry, so bail out.
+    */
+   if (location == -1)
+      return NULL;
+
+   for (unsigned idx = 0, n = prog->data->NumUniformStorage; idx < n; idx++) {
+      struct gl_uniform_storage *entry = &prog->data->UniformStorage[idx];
+      if (entry->remap_location == location)
+         return entry;
+   }
+   return NULL;
+}
+
+/* Used to build a tree representing the glsl_type so that we can have a place
+ * to store the next index for opaque types. Array types are expanded so that
+ * they have a single child which is used for all elements of the array.
+ * Struct types have a child for each member. The tree is walked while
+ * processing a uniform so that we can recognise when an opaque type is
+ * encountered a second time in order to reuse the same range of indices that
+ * was reserved the first time. That way the sampler indices can be arranged
+ * so that members of an array are placed sequentially even if the array is an
+ * array of structs containing other opaque members.
+ */
+struct type_tree_entry {
+   /* For opaque types, this will be the next index to use. If we haven’t
+    * encountered this member yet, it will be UINT_MAX.
+    */
+   unsigned next_index;
+   unsigned array_size;                  /* array length; 1 for non-arrays */
+   struct type_tree_entry *parent;       /* enclosing node; NULL at the root */
+   struct type_tree_entry *next_sibling; /* next field of the same struct */
+   struct type_tree_entry *children;     /* first child, or NULL for a leaf */
+};
+
+struct nir_link_uniforms_state {
+   /* per-whole program */
+   unsigned num_hidden_uniforms;   /* copied to NumHiddenUniforms */
+   unsigned num_values;            /* copied to NumUniformDataSlots */
+   unsigned max_uniform_location;  /* copied to NumUniformRemapTable */
+   unsigned next_sampler_index;    /* next unreserved sampler index */
+   unsigned next_image_index;      /* next unreserved image index */
+
+   /* per-shader stage */
+   unsigned num_shader_samplers;
+   unsigned num_shader_images;
+   unsigned num_shader_uniform_components;
+   unsigned shader_samplers_used;   /* bitmask of sampler indices in use */
+   unsigned shader_shadow_samplers; /* bitmask of shadow sampler indices */
+
+   nir_variable *current_var;       /* variable currently being linked */
+
+   struct type_tree_entry *current_type; /* tree node for the current type */
+};
+/* Mirror `type` as a tree: one child per array, one child per struct field. */
+static struct type_tree_entry *
+build_type_tree_for_type(const struct glsl_type *type)
+{
+   /* NOTE(review): the malloc result is used without a NULL check. */
+   struct type_tree_entry *node = malloc(sizeof *node);
+
+   /* Leaf defaults: no index reserved yet, no relatives, not an array. */
+   *node = (struct type_tree_entry) {
+      .next_index = UINT_MAX,
+      .array_size = 1,
+   };
+
+   if (glsl_type_is_array(type)) {
+      /* All elements of an array share a single child node. */
+      struct type_tree_entry *elem =
+         build_type_tree_for_type(glsl_get_array_element(type));
+
+      node->array_size = glsl_get_length(type);
+      elem->parent = node;
+      node->children = elem;
+   } else if (glsl_type_is_struct(type)) {
+      /* Append each field's subtree to the tail of the sibling list. */
+      struct type_tree_entry **tail = &node->children;
+
+      for (unsigned f = 0; f < glsl_get_length(type); f++) {
+         struct type_tree_entry *field_node =
+            build_type_tree_for_type(glsl_get_struct_field(type, f));
+
+         field_node->parent = node;
+         *tail = field_node;
+         tail = &field_node->next_sibling;
+      }
+   }
+
+   return node;
+}
+/* Release a tree built by build_type_tree_for_type(), children first. */
+static void
+free_type_tree(struct type_tree_entry *entry)
+{
+   struct type_tree_entry *child = entry->children;
+   while (child != NULL) {
+      /* Grab the sibling link before the child node is freed. */
+      struct type_tree_entry *following = child->next_sibling;
+      free_type_tree(child);
+      child = following;
+   }
+   free(entry);
+}
+/* Return the first opaque index for `uniform`, reserving a range if needed. */
+static unsigned
+get_next_index(struct nir_link_uniforms_state *state,
+               const struct gl_uniform_storage *uniform,
+               unsigned *next_index)
+{
+   struct type_tree_entry *node = state->current_type;
+
+   /* The first visit to this tree node reserves one index per element of
+    * every enclosing array (the node's own array_size included); later
+    * visits simply continue inside that reserved range.
+    */
+   if (node->next_index == UINT_MAX) {
+      unsigned reserved = 1;
+      const struct type_tree_entry *up = node;
+
+      while (up != NULL) {
+         reserved *= up->array_size;
+         up = up->parent;
+      }
+
+      node->next_index = *next_index;
+      *next_index += reserved;
+   }
+
+   /* Hand out the current position and step past this uniform's elements. */
+   const unsigned first = node->next_index;
+   const unsigned count = MAX2(1, uniform->array_elements);
+
+   node->next_index = first + count;
+   return first;
+}
+
+
+/**
+ * Creates the necessary UniformStorage entries for a uniform of \p type,
+ * recursing into aggregates. Returns the locations used, or -1 on failure.
+ */
+static int
+nir_link_uniform(struct gl_context *ctx,
+                 struct gl_shader_program *prog,
+                 struct gl_program *stage_program,
+                 gl_shader_stage stage,
+                 const struct glsl_type *type,
+                 int location,
+                 struct nir_link_uniforms_state *state)
+{
+   struct gl_uniform_storage *uniform = NULL;
+
+   /* gl_uniform_storage can cope with one level of array, so if the type is a
+    * composite type or an array where each element occupies more than one
+    * location then we need to recursively process it.
+    */
+   if (glsl_type_is_struct(type) ||
+       (glsl_type_is_array(type) &&
+        (glsl_type_is_array(glsl_get_array_element(type)) ||
+         glsl_type_is_struct(glsl_get_array_element(type))))) {
+      int location_count = 0;
+      struct type_tree_entry *old_type = state->current_type;
+
+      state->current_type = old_type->children;
+
+      for (unsigned i = 0; i < glsl_get_length(type); i++) {
+         const struct glsl_type *field_type;
+
+         if (glsl_type_is_struct(type))
+            field_type = glsl_get_struct_field(type, i);
+         else
+            field_type = glsl_get_array_element(type);
+
+         int entries = nir_link_uniform(ctx, prog, stage_program, stage,
+                                        field_type, location,
+                                        state);
+         if (entries == -1)
+            return -1;
+
+         if (location != -1)
+            location += entries;
+         location_count += entries;
+
+         if (glsl_type_is_struct(type))
+            state->current_type = state->current_type->next_sibling;
+      }
+
+      state->current_type = old_type;
+
+      return location_count;
+   } else {
+      /* Create a new uniform storage entry */
+      prog->data->UniformStorage =
+         reralloc(prog->data,
+                  prog->data->UniformStorage,
+                  struct gl_uniform_storage,
+                  prog->data->NumUniformStorage + 1);
+      if (!prog->data->UniformStorage) {
+         linker_error(prog, "Out of memory during linking.\n");
+         return -1;
+      }
+
+      uniform = &prog->data->UniformStorage[prog->data->NumUniformStorage];
+      prog->data->NumUniformStorage++;
+
+      /* Initialize its members */
+      memset(uniform, 0x00, sizeof(struct gl_uniform_storage));
+      /* ARB_gl_spirv: names are considered optional debug info, so the linker
+       * needs to work without them, and returning them is optional. For
+       * simplicity we ignore names.
+       */
+      uniform->name = NULL;
+      /* Arrays are stored as their element type plus an element count. */
+      const struct glsl_type *type_no_array = glsl_without_array(type);
+      if (glsl_type_is_array(type)) {
+         uniform->type = type_no_array;
+         uniform->array_elements = glsl_get_length(type);
+      } else {
+         uniform->type = type;
+         uniform->array_elements = 0;
+      }
+      uniform->active_shader_mask |= 1 << stage;
+
+      if (location >= 0) {
+         /* Uniform has an explicit location */
+         uniform->remap_location = location;
+      } else {
+         uniform->remap_location = UNMAPPED_UNIFORM_LOC;
+      }
+
+      /* @FIXME: the initialization of the following will be done as we
+       * implement support for their specific features, like SSBO, atomics,
+       * etc.
+       */
+      uniform->block_index = -1;
+      uniform->offset = -1;
+      uniform->matrix_stride = -1;
+      uniform->array_stride = -1;
+      uniform->row_major = false;
+      uniform->hidden = false;
+      uniform->builtin = false;
+      uniform->is_shader_storage = false;
+      uniform->atomic_buffer_index = -1;
+      uniform->top_level_array_size = 0;
+      uniform->top_level_array_stride = 0;
+      uniform->is_bindless = false;
+
+      /* The following are for features not supported by ARB_gl_spirv */
+      uniform->num_compatible_subroutines = 0;
+
+      unsigned entries = MAX2(1, uniform->array_elements);
+
+      if (glsl_type_is_sampler(type_no_array)) {
+         int sampler_index =
+            get_next_index(state, uniform, &state->next_sampler_index);
+
+         state->num_shader_samplers++;
+
+         uniform->opaque[stage].active = true;
+         uniform->opaque[stage].index = sampler_index;
+
+         const unsigned shadow = glsl_sampler_type_is_shadow(type_no_array);
+         /* Mark every index reserved so far from sampler_index onwards. */
+         for (unsigned i = sampler_index;
+              i < MIN2(state->next_sampler_index, MAX_SAMPLERS);
+              i++) {
+            stage_program->sh.SamplerTargets[i] =
+               glsl_get_sampler_target(type_no_array);
+            state->shader_samplers_used |= 1U << i;
+            state->shader_shadow_samplers |= shadow << i;
+         }
+      } else if (glsl_type_is_image(type_no_array)) {
+         /* @FIXME: image_index should match that of the same image
+          * uniform in other shaders. This means we need to match image
+          * uniforms by location (GLSL does it by variable name, but we
+          * want to avoid that).
+          */
+         int image_index = state->next_image_index;
+         state->next_image_index += entries;
+
+         state->num_shader_images++;
+
+         uniform->opaque[stage].active = true;
+         uniform->opaque[stage].index = image_index;
+
+         /* Set image access qualifiers */
+         const GLenum access =
+            (state->current_var->data.image.read_only ? GL_READ_ONLY :
+             state->current_var->data.image.write_only ? GL_WRITE_ONLY :
+             GL_READ_WRITE);
+         for (unsigned i = image_index;
+              i < MIN2(state->next_image_index, MAX_IMAGE_UNIFORMS);
+              i++) {
+            stage_program->sh.ImageAccess[i] = access;
+         }
+      }
+
+      unsigned values = glsl_get_component_slots(type);
+      state->num_shader_uniform_components += values;
+      state->num_values += values;
+      /* Only explicit locations count: the unmapped sentinel would wrap. */
+      if (uniform->remap_location != UNMAPPED_UNIFORM_LOC &&
+          state->max_uniform_location < uniform->remap_location + entries)
+         state->max_uniform_location = uniform->remap_location + entries;
+      return MAX2(uniform->array_elements, 1);
+   }
+}
+/* Build UniformStorage and the remap table from every stage's NIR uniforms. */
+bool
+gl_nir_link_uniforms(struct gl_context *ctx,
+                     struct gl_shader_program *prog)
+{
+   /* First free up any previous UniformStorage items */
+   ralloc_free(prog->data->UniformStorage);
+   prog->data->UniformStorage = NULL;
+   prog->data->NumUniformStorage = 0;
+
+   /* Iterate through all linked shaders */
+   struct nir_link_uniforms_state state = {0,};
+
+   for (unsigned shader_type = 0; shader_type < MESA_SHADER_STAGES; shader_type++) {
+      struct gl_linked_shader *sh = prog->_LinkedShaders[shader_type];
+      if (!sh)
+         continue;
+
+      nir_shader *nir = sh->Program->nir;
+      assert(nir);
+      /* The per-stage counters restart for every shader. */
+      state.num_shader_samplers = 0;
+      state.num_shader_images = 0;
+      state.num_shader_uniform_components = 0;
+      state.shader_samplers_used = 0;
+      state.shader_shadow_samplers = 0;
+
+      nir_foreach_variable(var, &nir->uniforms) {
+         struct gl_uniform_storage *uniform = NULL;
+
+         /* Check if the uniform has been processed already for
+          * other stage. If so, validate they are compatible and update
+          * the active stage mask.
+          */
+         uniform = find_previous_uniform_storage(prog, var->data.location);
+         if (uniform) {
+            uniform->active_shader_mask |= 1 << shader_type;
+            var->data.location = uniform - prog->data->UniformStorage;
+            /* NOTE(review): this path skips the per-stage counter updates. */
+            continue;
+         }
+
+         int location = var->data.location;
+         /* From now on the variable’s location will be its uniform index */
+         var->data.location = prog->data->NumUniformStorage;
+
+         state.current_var = var;
+
+         struct type_tree_entry *type_tree =
+            build_type_tree_for_type(var->type);
+         state.current_type = type_tree;
+
+         int res = nir_link_uniform(ctx, prog, sh->Program, shader_type, var->type,
+                                    location, &state);
+
+         free_type_tree(type_tree);
+
+         if (res == -1)
+            return false;
+      }
+      /* Publish this stage's totals on its linked shader and program. */
+      sh->Program->SamplersUsed = state.shader_samplers_used;
+      sh->shadow_samplers = state.shader_shadow_samplers;
+      sh->Program->info.num_textures = state.num_shader_samplers;
+      sh->Program->info.num_images = state.num_shader_images;
+      sh->num_uniform_components = state.num_shader_uniform_components;
+      sh->num_combined_uniform_components = sh->num_uniform_components;
+   }
+
+   prog->data->NumHiddenUniforms = state.num_hidden_uniforms;
+   prog->NumUniformRemapTable = state.max_uniform_location;
+   prog->data->NumUniformDataSlots = state.num_values;
+
+   nir_setup_uniform_remap_tables(ctx, prog);
+   gl_nir_set_uniform_initializers(ctx, prog);
+
+   return true;
+}
diff --git a/src/compiler/glsl/gl_nir_link_xfb.c b/src/compiler/glsl/gl_nir_link_xfb.c
new file mode 100644
index 0000000..bcef1e1
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_link_xfb.c
@@ -0,0 +1,316 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "gl_nir_linker.h"
+#include "ir_uniform.h" /* for gl_uniform_storage */
+#include "linker_util.h"
+#include "main/context.h"
+
+/*
+ * This file does the linking of GLSL transform feedback using NIR.
+ *
+ * Note: This linking pass is currently tailored for ARB_gl_spirv needs and
+ * particularities.
+ */
+
+struct active_xfb_buffer { /* bookkeeping for one transform feedback buffer */
+   GLuint stride;       /* xfb_stride taken from a stride-decorated output of this buffer */
+   GLuint num_varyings; /* number of captured outputs assigned to this buffer */
+};
+
+struct active_xfb_varyings { /* growable list of captured outputs plus per-buffer data */
+   unsigned num_varyings;           /* used entries in varyings[] */
+   unsigned num_outputs;            /* sum of get_num_outputs() over all added varyings */
+   unsigned buffer_size;            /* allocated capacity of varyings[], in elements */
+   struct nir_variable **varyings;  /* realloc()-grown array; released with free() by the user */
+   struct active_xfb_buffer buffers[MAX_FEEDBACK_BUFFERS]; /* indexed by xfb_buffer */
+};
+
+static unsigned
+get_num_outputs(nir_variable *var)
+{
+   const bool is_vertex_input = false; /* slots are counted for an output */
+   return glsl_count_attribute_slots(var->type, is_vertex_input);
+}
+
+static void
+add_xfb_varying(struct active_xfb_varyings *active_varyings,
+                nir_variable *var)
+{
+   /* Grow the backing array geometrically (1, 2, 4, ...) once it is full. */
+   if (active_varyings->num_varyings >= active_varyings->buffer_size) {
+      const unsigned grown = active_varyings->buffer_size == 0 ?
+         1 : active_varyings->buffer_size * 2;
+      active_varyings->buffer_size = grown;
+      /* NOTE(review): the realloc() result is not checked for NULL. */
+      active_varyings->varyings = realloc(active_varyings->varyings,
+                                          sizeof(nir_variable*) * grown);
+   }
+
+   /* Append the variable and account for the output slots it occupies. */
+   const unsigned slot = active_varyings->num_varyings++;
+   active_varyings->varyings[slot] = var;
+   active_varyings->num_outputs += get_num_outputs(var);
+}
+
+static int
+cmp_xfb_offset(const void *x_generic, const void *y_generic)
+{
+   const nir_variable *const *x = x_generic; /* qsort() passes pointers to elements */
+   const nir_variable *const *y = y_generic;
+   /* Order by xfb buffer, then offset; compare instead of subtracting so the sign never depends on the difference fitting in an int. */
+   if ((*x)->data.xfb_buffer != (*y)->data.xfb_buffer)
+      return (*x)->data.xfb_buffer < (*y)->data.xfb_buffer ? -1 : 1;
+   return ((*x)->data.offset > (*y)->data.offset) - ((*x)->data.offset < (*y)->data.offset);
+}
+
+static void
+get_active_xfb_varyings(struct gl_shader_program *prog,
+                        struct active_xfb_varyings *active_varyings)
+{
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
+      struct gl_linked_shader *sh = prog->_LinkedShaders[i];
+      if (sh == NULL)
+         continue;
+      /* Collect the outputs of this linked stage that carry xfb decorations. */
+      nir_shader *nir = sh->Program->nir;
+
+      nir_foreach_variable(var, &nir->outputs) {
+         if (var->data.explicit_xfb_buffer &&
+             var->data.explicit_xfb_stride) {
+            assert(var->data.xfb_buffer < MAX_FEEDBACK_BUFFERS);
+            active_varyings->buffers[var->data.xfb_buffer].stride =
+               var->data.xfb_stride;
+         }
+         /* Only outputs with both an explicit buffer and offset are recorded. */
+         if (!var->data.explicit_xfb_buffer ||
+             !var->data.explicit_offset)
+            continue;
+
+         active_varyings->buffers[var->data.xfb_buffer].num_varyings++;
+
+         add_xfb_varying(active_varyings, var);
+      }
+   }
+
+   /* The xfb_offset qualifier does not have to be used in increasing order,
+    * but some drivers expect the declarations in order, so sort them now.
+    * Skip qsort() when nothing was collected: the array is still NULL then.
+    */
+   if (active_varyings->num_varyings > 0)
+      qsort(active_varyings->varyings, active_varyings->num_varyings,
+            sizeof(*active_varyings->varyings),
+            cmp_xfb_offset);
+}
+
+static unsigned
+add_varying_outputs(nir_variable *var,
+                    const struct glsl_type *type,
+                    unsigned location_offset,
+                    unsigned dest_offset,
+                    struct gl_transform_feedback_output *output)
+{
+   unsigned num_outputs = 0;
+   /* Fills output[] for 'type' within var, recursing into aggregates; returns the number of entries written. */
+   if (glsl_type_is_array(type) || glsl_type_is_matrix(type)) {
+      unsigned length = glsl_get_length(type);
+      const struct glsl_type *child_type = glsl_get_array_element(type);
+      unsigned component_slots = glsl_get_component_slots(child_type);
+      /* Every element has the same type, so the destination advances by a fixed amount. */
+      for (unsigned i = 0; i < length; i++) {
+         unsigned child_outputs = add_varying_outputs(var,
+                                                      child_type,
+                                                      location_offset,
+                                                      dest_offset,
+                                                      output + num_outputs);
+         num_outputs += child_outputs;
+         location_offset += child_outputs;
+         dest_offset += component_slots;
+      }
+   } else if (glsl_type_is_struct(type)) {
+      unsigned length = glsl_get_length(type);
+      for (unsigned i = 0; i < length; i++) {
+         const struct glsl_type *child_type = glsl_get_struct_field(type, i);
+         unsigned child_outputs = add_varying_outputs(var,
+                                                      child_type,
+                                                      location_offset,
+                                                      dest_offset,
+                                                      output + num_outputs);
+         num_outputs += child_outputs;
+         location_offset += child_outputs;
+         dest_offset += glsl_get_component_slots(child_type); /* fields differ in size */
+      }
+   } else {
+      unsigned location = var->data.location + location_offset;
+      unsigned location_frac = var->data.location_frac;
+      unsigned num_components = glsl_get_component_slots(type);
+      /* One entry per location: at most 4 - location_frac components each. */
+      while (num_components > 0) {
+         unsigned output_size = MIN2(num_components, 4 - location_frac);
+
+         output->OutputRegister = location;
+         output->OutputBuffer = var->data.xfb_buffer;
+         output->NumComponents = output_size;
+         output->StreamId = var->data.stream;
+         output->DstOffset = var->data.offset / 4 + dest_offset; /* presumably bytes -> 4-byte units; TODO confirm */
+         output->ComponentOffset = location_frac;
+         /* Move on to the next location; only the first one starts at location_frac. */
+         dest_offset += output_size;
+         num_components -= output_size;
+         num_outputs++;
+         output++;
+         location++;
+         location_frac = 0;
+      }
+   }
+
+   return num_outputs;
+}
+
+void
+gl_nir_link_assign_xfb_resources(struct gl_context *ctx,
+                                 struct gl_shader_program *prog)
+{
+   /*
+    * From ARB_gl_spirv spec:
+    *
+    *    "- If the *Xfb* Execution Mode is set, any output variable that is at
+    *       least partially captured:
+    *       * must be decorated with an *XfbBuffer*, declaring the capturing buffer
+    *       * must have at least one captured output variable in the capturing
+    *         buffer decorated with an *XfbStride* (and all such *XfbStride* values
+    *         for the capturing buffer must be equal)
+    *     - If the *Xfb* Execution Mode is set, any captured output:
+    *       * must be a non-structure decorated with *Offset* or a member of a
+    *         structure whose type member is decorated with *Offset*"
+    *
+    * Note the "must be", meaning that explicit buffer, offset and stride are
+    * mandatory. So as this is intended to work with SPIR-V shaders we don't
+    * need to calculate the offset or the stride.
+    */
+   /* Fills prog->TransformFeedback and xfb_prog->sh.LinkedTransformFeedback. */
+   struct gl_program *xfb_prog = prog->last_vert_prog;
+
+   if (xfb_prog == NULL)
+      return;
+
+   /* free existing varyings, if any */
+   for (unsigned i = 0; i < prog->TransformFeedback.NumVarying; i++)
+      free(prog->TransformFeedback.VaryingNames[i]);
+   free(prog->TransformFeedback.VaryingNames);
+
+   struct active_xfb_varyings active_varyings = { 0 };
+
+   get_active_xfb_varyings(prog, &active_varyings);
+   /* Publish the per-buffer strides and the varying count on the program. */
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++)
+      prog->TransformFeedback.BufferStride[buf] = active_varyings.buffers[buf].stride;
+
+   prog->TransformFeedback.NumVarying = active_varyings.num_varyings;
+   prog->TransformFeedback.VaryingNames =
+      malloc(sizeof(GLchar *) * active_varyings.num_varyings);
+
+   struct gl_transform_feedback_info *linked_xfb =
+      rzalloc(xfb_prog, struct gl_transform_feedback_info);
+   xfb_prog->sh.LinkedTransformFeedback = linked_xfb;
+
+   linked_xfb->Outputs =
+      rzalloc_array(xfb_prog,
+                    struct gl_transform_feedback_output,
+                    active_varyings.num_outputs);
+   linked_xfb->NumOutputs = active_varyings.num_outputs;
+
+   linked_xfb->Varyings =
+      rzalloc_array(xfb_prog,
+                    struct gl_transform_feedback_varying_info,
+                    active_varyings.num_varyings);
+   linked_xfb->NumVarying = active_varyings.num_varyings;
+
+   struct gl_transform_feedback_output *output = linked_xfb->Outputs;
+   for (unsigned i = 0; i < active_varyings.num_varyings; i++) {
+      struct nir_variable *var = active_varyings.varyings[i];
+
+      /* From ARB_gl_spirv spec:
+       *
+       *    "19. How should the program interface query operations behave for
+       *         program objects created from SPIR-V shaders?
+       *
+       *     DISCUSSION: we previously said we didn't need reflection to work
+       *     for SPIR-V shaders (at least for the first version), however we
+       *     are left with specifying how it should "not work". The primary
+       *     issue is that SPIR-V binaries are not required to have names
+       *     associated with variables. They can be associated in debug
+       *     information, but there is no requirement for that to be present,
+       *     and it should not be relied upon."
+       *
+       *     Options:"
+       *
+       *     <skip>
+       *
+       *     "RESOLVED.  Pick (c), but also allow debug names to be returned
+       *      if an implementation wants to."
+       *
+       * So names are considered optional debug info, so the linker needs to
+       * work without them, and returning them is optional. For simplicity at
+       * this point we are ignoring names
+       */
+      prog->TransformFeedback.VaryingNames[i] = NULL;
+
+      unsigned varying_outputs = add_varying_outputs(var,
+                                                     var->type,
+                                                     0, /* location_offset */
+                                                     0, /* dest_offset */
+                                                     output);
+      assert(varying_outputs == get_num_outputs(var));
+      output = output + varying_outputs;
+
+      struct gl_transform_feedback_varying_info *varying =
+         linked_xfb->Varyings + i;
+
+      /* ARB_gl_spirv: no names, see above. Size is the array length, or 1 for a non-array. */
+      varying->Name = NULL;
+      varying->Type = glsl_get_gl_type(var->type);
+      varying->BufferIndex = var->data.xfb_buffer;
+      varying->Size = glsl_type_is_array(var->type) ? glsl_get_length(var->type) : 1;
+      varying->Offset = var->data.offset;
+   }
+
+   /* Make sure MaxTransformFeedbackBuffers is <= 32 so the bitmask for
+    * tracking the number of buffers doesn't overflow.
+    */
+   unsigned buffers = 0;
+   assert(ctx->Const.MaxTransformFeedbackBuffers <= sizeof(buffers) * 8);
+
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++) {
+      if (active_varyings.buffers[buf].stride > 0) {
+         linked_xfb->Buffers[buf].Stride = active_varyings.buffers[buf].stride / 4;
+         linked_xfb->Buffers[buf].NumVaryings = active_varyings.buffers[buf].num_varyings;
+         buffers |= 1u << buf; /* unsigned shift: bit 31 must stay well defined */
+      }
+   }
+
+   linked_xfb->ActiveBuffers = buffers;
+
+   free(active_varyings.varyings);
+}
diff --git a/src/compiler/glsl/gl_nir_linker.c b/src/compiler/glsl/gl_nir_linker.c
new file mode 100644
index 0000000..d09a2c0
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_linker.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "gl_nir_linker.h"
+#include "linker_util.h"
+#include "main/mtypes.h"
+#include "ir_uniform.h" /* for gl_uniform_storage */
+
+/* This file includes general link methods that use NIR, instead of IR as
+ * its counterpart glsl/linker.cpp does.
+ *
+ * Also note that this is tailored for ARB_gl_spirv needs and particularities.
+ */
+
+void
+nir_build_program_resource_list(struct gl_context *ctx,
+                                struct gl_shader_program *prog)
+{
+   /* Rebuild the resource list from scratch: drop any previous one first. */
+   if (prog->data->ProgramResourceList) {
+      ralloc_free(prog->data->ProgramResourceList);
+      prog->data->ProgramResourceList = NULL;
+      prog->data->NumProgramResourceList = 0;
+   }
+
+   struct set *resource_set = _mesa_set_create(NULL,
+                                               _mesa_hash_pointer,
+                                               _mesa_key_pointer_equal);
+
+   /* Add uniforms
+    *
+    * Here, it is expected that gl_nir_link_uniforms() has already been
+    * called, so that UniformStorage table is already available.
+    */
+   for (unsigned i = 0; i < prog->data->NumUniformStorage; i++) {
+      struct gl_uniform_storage *uniform = &prog->data->UniformStorage[i];
+
+      if (!link_util_add_program_resource(prog, resource_set, GL_UNIFORM, uniform,
+                                          uniform->active_shader_mask)) {
+         break; /* stop on failure, but still release resource_set below */
+      }
+   }
+
+   /* The set is only needed while building the list; destroy it on every path. */
+   _mesa_set_destroy(resource_set, NULL);
+}
diff --git a/src/compiler/glsl/gl_nir_linker.h b/src/compiler/glsl/gl_nir_linker.h
new file mode 100644
index 0000000..29ca27d
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_linker.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GL_NIR_LINKER_H
+#define GL_NIR_LINKER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gl_context;
+struct gl_shader_program;
+
+bool gl_nir_link_uniforms(struct gl_context *ctx,
+                          struct gl_shader_program *prog);
+
+void gl_nir_set_uniform_initializers(struct gl_context *ctx,
+                                     struct gl_shader_program *prog);
+
+void nir_build_program_resource_list(struct gl_context *ctx,
+                                     struct gl_shader_program *prog);
+
+void gl_nir_link_assign_atomic_counter_resources(struct gl_context *ctx,
+                                                 struct gl_shader_program *prog);
+
+void gl_nir_link_assign_xfb_resources(struct gl_context *ctx,
+                                      struct gl_shader_program *prog);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* GL_NIR_LINKER_H */
diff --git a/src/compiler/glsl/gl_nir_lower_atomics.c b/src/compiler/glsl/gl_nir_lower_atomics.c
new file mode 100644
index 0000000..36e273c
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_lower_atomics.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Connor Abbott (cwabbott0@gmail.com)
+ *
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "gl_nir.h"
+#include "ir_uniform.h"
+#include "main/config.h"
+#include "main/mtypes.h"
+#include <assert.h>
+
+/*
+ * replace atomic counter intrinsics that use a variable with intrinsics
+ * that directly store the buffer index and byte offset
+ */
+
+static bool
+lower_deref_instr(nir_builder *b, nir_intrinsic_instr *instr,
+                  const struct gl_shader_program *shader_program,
+                  nir_shader *shader, bool use_binding_as_idx)
+{
+   nir_intrinsic_op op; /* non-deref opcode that will replace instr->intrinsic */
+   switch (instr->intrinsic) {
+   case nir_intrinsic_atomic_counter_read_deref:
+      op = nir_intrinsic_atomic_counter_read;
+      break;
+
+   case nir_intrinsic_atomic_counter_inc_deref:
+      op = nir_intrinsic_atomic_counter_inc;
+      break;
+
+   case nir_intrinsic_atomic_counter_pre_dec_deref:
+      op = nir_intrinsic_atomic_counter_pre_dec;
+      break;
+
+   case nir_intrinsic_atomic_counter_post_dec_deref:
+      op = nir_intrinsic_atomic_counter_post_dec;
+      break;
+
+   case nir_intrinsic_atomic_counter_add_deref:
+      op = nir_intrinsic_atomic_counter_add;
+      break;
+
+   case nir_intrinsic_atomic_counter_min_deref:
+      op = nir_intrinsic_atomic_counter_min;
+      break;
+
+   case nir_intrinsic_atomic_counter_max_deref:
+      op = nir_intrinsic_atomic_counter_max;
+      break;
+
+   case nir_intrinsic_atomic_counter_and_deref:
+      op = nir_intrinsic_atomic_counter_and;
+      break;
+
+   case nir_intrinsic_atomic_counter_or_deref:
+      op = nir_intrinsic_atomic_counter_or;
+      break;
+
+   case nir_intrinsic_atomic_counter_xor_deref:
+      op = nir_intrinsic_atomic_counter_xor;
+      break;
+
+   case nir_intrinsic_atomic_counter_exchange_deref:
+      op = nir_intrinsic_atomic_counter_exchange;
+      break;
+
+   case nir_intrinsic_atomic_counter_comp_swap_deref:
+      op = nir_intrinsic_atomic_counter_comp_swap;
+      break;
+
+   default:
+      return false; /* not an atomic counter deref intrinsic: leave it untouched */
+   }
+   /* From here on, instr is one of the atomic counter deref intrinsics above. */
+   nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   if (var->data.mode != nir_var_uniform &&
+       var->data.mode != nir_var_shader_storage &&
+       var->data.mode != nir_var_shared)
+      return false; /* atomics passed as function arguments can't be lowered */
+   /* The buffer index is either the binding or the per-stage index stored in UniformStorage. */
+   const unsigned uniform_loc = var->data.location;
+   const unsigned idx = use_binding_as_idx ? var->data.binding :
+      shader_program->data->UniformStorage[uniform_loc].opaque[shader->info.stage].index;
+
+   b->cursor = nir_before_instr(&instr->instr);
+   /* Start from the variable's offset and add index * stride for each array level. */
+   nir_ssa_def *offset = nir_imm_int(b, var->data.offset);
+   for (nir_deref_instr *d = deref; d->deref_type != nir_deref_type_var;
+        d = nir_deref_instr_parent(d)) {
+      assert(d->deref_type == nir_deref_type_array);
+      assert(d->arr.index.is_ssa);
+      /* One counter per element, scaled by glsl_get_aoa_size() when the element is itself an array. */
+      unsigned array_stride = ATOMIC_COUNTER_SIZE;
+      if (glsl_type_is_array(d->type))
+         array_stride *= glsl_get_aoa_size(d->type);
+
+      offset = nir_iadd(b, offset, nir_imul(b, d->arr.index.ssa,
+                                            nir_imm_int(b, array_stride)));
+   }
+
+   /* Since the first source is a deref and the first source in the lowered
+    * instruction is the offset, we can just swap it out and change the
+    * opcode.
+    */
+   instr->intrinsic = op;
+   nir_instr_rewrite_src(&instr->instr, &instr->src[0],
+                         nir_src_for_ssa(offset));
+   nir_intrinsic_set_base(instr, idx);
+   /* The instruction no longer references the deref; remove it if nothing else does. */
+   nir_deref_instr_remove_if_unused(deref);
+
+   return true;
+}
+
+bool
+gl_nir_lower_atomics(nir_shader *shader,
+                     const struct gl_shader_program *shader_program,
+                     bool use_binding_as_idx)
+{
+   /* Returns true when at least one atomic counter intrinsic was lowered. */
+   bool any_lowered = false;
+   nir_foreach_function(func, shader) {
+      nir_function_impl *impl = func->impl;
+      if (impl == NULL)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, impl);
+      bool impl_lowered = false;
+      nir_foreach_block(blk, impl) {
+         nir_foreach_instr_safe(ins, blk) {
+            if (ins->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(ins);
+            if (lower_deref_instr(&b, intrin, shader_program, shader,
+                                  use_binding_as_idx))
+               impl_lowered = true;
+         }
+      }
+
+      if (!impl_lowered)
+         continue;
+
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+      any_lowered = true;
+   }
+
+   return any_lowered;
+}
diff --git a/src/compiler/glsl/gl_nir_lower_samplers.c b/src/compiler/glsl/gl_nir_lower_samplers.c
new file mode 100644
index 0000000..43fe318
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_lower_samplers.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "gl_nir.h"
+#include "ir_uniform.h"
+
+#include "main/compiler.h"
+#include "main/mtypes.h"
+
+/* Calculate the sampler index based on array indices and also
+ * calculate the base uniform location for struct members.
+ */
+static void
+calc_sampler_offsets(nir_builder *b, nir_ssa_def *ptr,
+                     const struct gl_shader_program *shader_program,
+                     unsigned *base_index, nir_ssa_def **index,
+                     unsigned *array_elements)
+{
+   *base_index = 0;      /* constant part of the sampler index */
+   *index = NULL;        /* dynamic part; stays NULL while every array index is constant */
+   *array_elements = 1;  /* product of the array lengths crossed so far */
+   unsigned location = 0;
+   /* Walk from the leaf deref up to the variable, so inner array levels are seen first. */
+   nir_deref_instr *deref = nir_instr_as_deref(ptr->parent_instr);
+   while (deref->deref_type != nir_deref_type_var) {
+      assert(deref->parent.is_ssa);
+      nir_deref_instr *parent =
+         nir_instr_as_deref(deref->parent.ssa->parent_instr);
+
+      switch (deref->deref_type) {
+      case nir_deref_type_struct:
+         location += glsl_get_record_location_offset(parent->type,
+                                                     deref->strct.index);
+         break;
+
+      case nir_deref_type_array: {
+         nir_const_value *const_deref_index =
+            nir_src_as_const_value(deref->arr.index);
+
+         if (const_deref_index && *index == NULL) {
+            /* We're still building a direct index */
+            *base_index += const_deref_index->u32[0] * *array_elements;
+         } else {
+            if (*index == NULL) {
+               /* We used to be direct but not anymore */
+               *index = nir_imm_int(b, *base_index);
+               *base_index = 0;
+            }
+
+            *index = nir_iadd(b, *index,
+                     nir_imul(b, nir_imm_int(b, *array_elements),
+                              nir_ssa_for_src(b, deref->arr.index, 1)));
+         }
+         /* Indices of outer levels are scaled by the lengths of the levels inside them. */
+         *array_elements *= glsl_get_length(parent->type);
+         break;
+      }
+
+      default:
+         unreachable("Invalid sampler deref type");
+      }
+
+      deref = parent;
+   }
+   /* Clamp a dynamic index to the last element of the flattened array. */
+   if (*index)
+      *index = nir_umin(b, *index, nir_imm_int(b, *array_elements - 1));
+
+   /* We hit the deref_var.  This is the end of the line */
+   assert(deref->deref_type == nir_deref_type_var);
+
+   location += deref->var->data.location;
+   /* location (variable plus struct member offsets) selects the UniformStorage entry. */
+   gl_shader_stage stage = b->shader->info.stage;
+   assert(location < shader_program->data->NumUniformStorage &&
+          shader_program->data->UniformStorage[location].opaque[stage].active);
+   /* Add the index recorded for this uniform in the current shader stage. */
+   *base_index +=
+      shader_program->data->UniformStorage[location].opaque[stage].index;
+}
+
+static bool
+lower_sampler(nir_builder *b, nir_tex_instr *instr,
+              const struct gl_shader_program *shader_program)
+{ /* Replaces the texture/sampler derefs of one tex instruction by indices; returns true if it was rewritten. */
+   int texture_idx =
+      nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
+   int sampler_idx =
+      nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
+   /* Nothing to lower when the instruction has no texture deref source. */
+   if (texture_idx < 0)
+      return false;
+   /* Both sources are expected to be present and to refer to the same deref. */
+   assert(texture_idx >= 0 && sampler_idx >= 0);
+   assert(instr->src[texture_idx].src.is_ssa);
+   assert(instr->src[sampler_idx].src.is_ssa);
+   assert(instr->src[texture_idx].src.ssa == instr->src[sampler_idx].src.ssa);
+
+   b->cursor = nir_before_instr(&instr->instr);
+
+   unsigned base_offset, array_elements;
+   nir_ssa_def *indirect;
+   calc_sampler_offsets(b, instr->src[texture_idx].src.ssa, shader_program,
+                        &base_offset, &indirect, &array_elements);
+   /* The constant part of the index is stored directly on the instruction. */
+   instr->texture_index = base_offset;
+   instr->sampler_index = base_offset;
+   if (indirect) {
+      nir_instr_rewrite_src(&instr->instr, &instr->src[texture_idx].src,
+                            nir_src_for_ssa(indirect));
+      instr->src[texture_idx].src_type = nir_tex_src_texture_offset;
+      nir_instr_rewrite_src(&instr->instr, &instr->src[sampler_idx].src,
+                            nir_src_for_ssa(indirect));
+      instr->src[sampler_idx].src_type = nir_tex_src_sampler_offset;
+
+      instr->texture_array_size = array_elements;
+   } else {
+      nir_tex_instr_remove_src(instr, texture_idx); /* fully constant index: the deref sources are dropped */
+      /* The sampler index may have changed */
+      sampler_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
+      nir_tex_instr_remove_src(instr, sampler_idx);
+   }
+
+   return true;
+}
+
+static bool
+lower_impl(nir_function_impl *impl,
+           const struct gl_shader_program *shader_program)
+{
+   /* Lower every texture instruction of one function implementation. */
+   bool lowered = false;
+   nir_builder builder;
+   nir_builder_init(&builder, impl);
+   nir_foreach_block(blk, impl) {
+      nir_foreach_instr(ins, blk) {
+         if (ins->type != nir_instr_type_tex)
+            continue;
+         if (lower_sampler(&builder, nir_instr_as_tex(ins), shader_program))
+            lowered = true;
+      }
+   }
+   return lowered;
+}
+
+bool
+gl_nir_lower_samplers(nir_shader *shader,
+                      const struct gl_shader_program *shader_program)
+{
+   /* Run the lowering on every function that has an implementation. */
+   bool any_lowered = false;
+   nir_foreach_function(func, shader) {
+      if (func->impl == NULL)
+         continue;
+      any_lowered = lower_impl(func->impl, shader_program) || any_lowered;
+   }
+   return any_lowered;
+}
diff --git a/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c
new file mode 100644
index 0000000..aae64c9
--- /dev/null
+++ b/src/compiler/glsl/gl_nir_lower_samplers_as_deref.c
@@ -0,0 +1,302 @@
+/*
+ * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
+ * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
+ * Copyright © 2014 Intel Corporation
+ * Copyright © 2017 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file
+ *
+ * Lower sampler and image references of (non-bindless) uniforms by removing
+ * struct dereferences, and synthesizing new uniform variables without structs
+ * if required.
+ *
+ * This will allow backends to have a simple, uniform treatment of bindless and
+ * non-bindless samplers and images.
+ *
+ * Example:
+ *
+ *   struct S {
+ *      sampler2D tex[2];
+ *      sampler2D other;
+ *   };
+ *   uniform S s[2];
+ *
+ *   tmp = texture(s[n].tex[m], coord);
+ *
+ * Becomes:
+ *
+ *   decl_var uniform INTERP_MODE_NONE sampler2D[2][2] lower@s.tex (...)
+ *
+ *   vec1 32 ssa_idx = $(2 * n + m)
+ *   vec4 32 ssa_out = tex ssa_coord (coord), lower@s.tex[n][m] (texture), lower@s.tex[n][m] (sampler)
+ *
+ * and lower@s.tex has var->data.binding set to the base index as defined by
+ * the opaque uniform mapping.
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_deref.h"
+#include "gl_nir.h"
+#include "ir_uniform.h"
+
+#include "main/compiler.h"
+#include "main/mtypes.h"
+
+struct lower_samplers_as_deref_state {
+   nir_shader *shader; /* shader being lowered; new variables are added here */
+   const struct gl_shader_program *shader_program; /* uniform storage lookup */
+   struct hash_table *remap_table; /* lowered name -> nir_variable */
+};
+
+/* Prepare for removing struct derefs.  This pre-pass generates the name
+ * of the lowered variable, and calculates the lowered type and location.
+ * After that, once the lowered var has been looked up (or created if
+ * needed), constructing the new chain of deref instructions is a simple
+ * loop that skips the struct derefs.
+ *
+ * name:     appended to as we descend down the chain of deref instrs
+ *           and remove struct derefs
+ * location: increased as we descend down and remove struct derefs
+ * type:     updated as we recurse back up the chain of deref instrs
+ *           with the resulting type after removing struct derefs
+ */
+static void
+remove_struct_derefs_prep(nir_deref_instr **p, char **name,
+                          unsigned *location, const struct glsl_type **type)
+{
+   nir_deref_instr *cur = p[0], *next = p[1];
+
+   if (!next) {
+      *type = cur->type;
+      return;
+   }
+
+   switch (next->deref_type) {
+   case nir_deref_type_array: {
+      unsigned length = glsl_get_length(cur->type);
+
+      remove_struct_derefs_prep(&p[1], name, location, type);
+
+      *type = glsl_get_array_instance(*type, length);
+      break;
+   }
+
+   case nir_deref_type_struct: {
+      *location += glsl_get_record_location_offset(cur->type, next->strct.index);
+      ralloc_asprintf_append(name, ".%s",
+                             glsl_get_struct_elem_name(cur->type, next->strct.index));
+
+      remove_struct_derefs_prep(&p[1], name, location, type);
+
+      /* Skip over the struct type: keep the type computed for the rest of
+       * the chain, which already has nested struct levels removed. */
+      break;
+   }
+
+   default:
+      unreachable("Invalid deref type");
+      break;
+   }
+}
+
+/* Returns the deref to use in place of `deref`: NULL when the variable is
+ * bindless or not a uniform, `deref` itself when no struct deref was
+ * found, otherwise a newly built chain on the lowered variable. */
+static nir_deref_instr *
+lower_deref(nir_builder *b, struct lower_samplers_as_deref_state *state,
+            nir_deref_instr *deref)
+{
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+   gl_shader_stage stage = state->shader->info.stage;
+
+   if (var->data.bindless || var->data.mode != nir_var_uniform)
+      return NULL;
+
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, state->remap_table);
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+
+   char *name = ralloc_asprintf(state->remap_table, "lower@%s", var->name);
+   unsigned location = var->data.location;
+   const struct glsl_type *type = NULL;
+   unsigned binding;
+
+   /* Two passes: the first generates the name of the lowered var (and
+    * detects whether there are any struct derefs at all); the second builds
+    * the deref instructions once the nir_variable is looked up or created.
+    */
+
+   remove_struct_derefs_prep(path.path, &name, &location, &type);
+
+   assert(location < state->shader_program->data->NumUniformStorage &&
+          state->shader_program->data->UniformStorage[location].opaque[stage].active);
+
+   binding = state->shader_program->data->UniformStorage[location].opaque[stage].index;
+
+   if (var->type == type) {
+      /* Fast path: We did not encounter any struct derefs. */
+      var->data.binding = binding;
+      return deref;
+   }
+
+   uint32_t hash = _mesa_key_hash_string(name);
+   struct hash_entry *h =
+      _mesa_hash_table_search_pre_hashed(state->remap_table, hash, name);
+
+   if (h) {
+      var = (nir_variable *)h->data;
+   } else {
+      var = nir_variable_create(state->shader, nir_var_uniform, type, name);
+      var->data.binding = binding;
+      _mesa_hash_table_insert_pre_hashed(state->remap_table, hash, name, var);
+   }
+
+   /* construct a new deref based on the lowered var, skipping the struct
+    * derefs from the original deref:
+    */
+   nir_deref_instr *new_deref = nir_build_deref_var(b, var);
+   for (nir_deref_instr **p = &path.path[1]; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_struct)
+         continue;
+
+      assert((*p)->deref_type == nir_deref_type_array);
+
+      new_deref = nir_build_deref_array(b, new_deref,
+                                        nir_ssa_for_src(b, (*p)->arr.index, 1));
+   }
+
+   return new_deref;
+}
+
+/* Rewrite the texture and sampler deref sources of instr to struct-free
+ * derefs; returns false if there is no texture deref or it is not lowered. */
+static bool
+lower_sampler(nir_tex_instr *instr, struct lower_samplers_as_deref_state *state,
+              nir_builder *b)
+{
+   const int texture_idx =
+      nir_tex_instr_src_index(instr, nir_tex_src_texture_deref);
+   const int sampler_idx =
+      nir_tex_instr_src_index(instr, nir_tex_src_sampler_deref);
+   if (texture_idx < 0)
+      return false;
+
+   assert(texture_idx >= 0 && sampler_idx >= 0);
+   assert(instr->src[texture_idx].src.is_ssa);
+   assert(instr->src[sampler_idx].src.is_ssa);
+   assert(instr->src[texture_idx].src.ssa == instr->src[sampler_idx].src.ssa);
+
+   nir_src *tex_src = &instr->src[texture_idx].src;
+   nir_src *smp_src = &instr->src[sampler_idx].src;
+   b->cursor = nir_before_instr(&instr->instr);
+
+   /* lower_deref() returns NULL for bindless; don't lower those: */
+   nir_deref_instr *repl = lower_deref(b, state, nir_src_as_deref(*tex_src));
+   if (repl == NULL)
+      return false;
+   nir_instr_rewrite_src(&instr->instr, tex_src,
+                         nir_src_for_ssa(&repl->dest.ssa));
+
+   repl = lower_deref(b, state, nir_src_as_deref(*smp_src));
+   nir_instr_rewrite_src(&instr->instr, smp_src,
+                         nir_src_for_ssa(&repl->dest.ssa));
+   return true;
+}
+
+/* Lower the image deref in src[0]; returns false if nothing was rewritten. */
+static bool
+lower_intrinsic(nir_intrinsic_instr *instr,
+                struct lower_samplers_as_deref_state *state,
+                nir_builder *b)
+{
+   if (instr->intrinsic == nir_intrinsic_image_deref_load ||
+       instr->intrinsic == nir_intrinsic_image_deref_store ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_add ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_min ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_max ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_and ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_or ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_xor ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_exchange ||
+       instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap ||
+       instr->intrinsic == nir_intrinsic_image_deref_size ||
+       instr->intrinsic == nir_intrinsic_image_deref_samples) {
+      b->cursor = nir_before_instr(&instr->instr);
+      nir_deref_instr *deref =
+         lower_deref(b, state, nir_src_as_deref(instr->src[0]));
+      /* don't lower bindless: */
+      if (!deref)
+         return false;
+      nir_instr_rewrite_src(&instr->instr, &instr->src[0],
+                            nir_src_for_ssa(&deref->dest.ssa));
+      return true;
+   }
+   return false;
+}
+
+/* Lower all tex and image intrinsic instructions of one function body. */
+static bool
+lower_impl(nir_function_impl *impl, struct lower_samplers_as_deref_state *state)
+{
+   bool changed = false;
+   nir_builder bld;
+   nir_builder_init(&bld, impl);
+
+   nir_foreach_block(blk, impl) {
+      nir_foreach_instr(ins, blk) {
+         if (ins->type == nir_instr_type_intrinsic)
+            changed |= lower_intrinsic(nir_instr_as_intrinsic(ins), state, &bld);
+         else if (ins->type == nir_instr_type_tex)
+            changed |= lower_sampler(nir_instr_as_tex(ins), state, &bld);
+      }
+   }
+   return changed;
+}
+
+/* Entry point: lower sampler/image derefs in all functions of shader, then
+ * remove dead derefs if anything changed.  Returns true on any change. */
+bool
+gl_nir_lower_samplers_as_deref(nir_shader *shader,
+                               const struct gl_shader_program *shader_program)
+{
+   struct lower_samplers_as_deref_state state = {
+      .shader = shader,
+      .shader_program = shader_program,
+      .remap_table = _mesa_hash_table_create(NULL, _mesa_key_hash_string,
+                                             _mesa_key_string_equal),
+   };
+   bool changed = false;
+
+   nir_foreach_function(func, shader) {
+      if (func->impl)
+         changed |= lower_impl(func->impl, &state);
+   }
+
+   /* keys are freed automatically by ralloc */
+   _mesa_hash_table_destroy(state.remap_table, NULL);
+   if (changed)
+      nir_remove_dead_derefs(shader);
+   return changed;
+}
diff --git a/src/compiler/glsl/glcpp/glcpp-lex.l b/src/compiler/glsl/glcpp/glcpp-lex.l
index 05447b3..f7003da 100644
--- a/src/compiler/glsl/glcpp/glcpp-lex.l
+++ b/src/compiler/glsl/glcpp/glcpp-lex.l
@@ -434,8 +434,10 @@
 
 	/* This will catch any non-directive garbage after a HASH */
 <HASH>{NONSPACE} {
-	BEGIN INITIAL;
-	RETURN_TOKEN (GARBAGE);
+	if (!parser->skipping) {
+		BEGIN INITIAL;
+		RETURN_TOKEN (GARBAGE);
+	}
 }
 
 	/* An identifier immediately followed by '(' */
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index ccb3aa1..1c095cb 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -462,13 +462,8 @@
 
 integer_constant:
 	INTEGER_STRING {
-		if (strlen ($1) >= 3 && strncmp ($1, "0x", 2) == 0) {
-			$$ = strtoll ($1 + 2, NULL, 16);
-		} else if ($1[0] == '0') {
-			$$ = strtoll ($1, NULL, 8);
-		} else {
-			$$ = strtoll ($1, NULL, 10);
-		}
+		/* let strtoll detect the base */
+		$$ = strtoll ($1, NULL, 0);
 	}
 |	INTEGER {
 		$$ = $1;
@@ -1082,6 +1077,20 @@
       if (node_a == NULL && node_b == NULL)
          break;
 
+      /* Ignore trailing whitespace.  At most one list is exhausted here, and
+       * only the other one is walked, so neither test dereferences NULL.
+       */
+      if (node_a == NULL) {
+         while (node_b && node_b->token->type == SPACE)
+            node_b = node_b->next;
+      } else if (node_b == NULL) {
+         while (node_a && node_a->token->type == SPACE)
+            node_a = node_a->next;
+      }
+
+      if (node_a == NULL && node_b == NULL)
+         break;
+
       if (node_a == NULL || node_b == NULL)
          return 0;
       /* Make sure whitespace appears in the same places in both.
@@ -2322,7 +2331,7 @@
 
 static void
 _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t version,
-                                         const char *es_identifier,
+                                         const char *identifier,
                                          bool explicitly_set)
 {
    if (parser->version_set)
@@ -2334,11 +2343,15 @@
    add_builtin_define (parser, "__VERSION__", version);
 
    parser->is_gles = (version == 100) ||
-                     (es_identifier && (strcmp(es_identifier, "es") == 0));
+                     (identifier && (strcmp(identifier, "es") == 0));
+   bool is_compat = version >= 150 && identifier &&
+                    strcmp(identifier, "compatibility") == 0;
 
    /* Add pre-defined macros. */
    if (parser->is_gles)
       add_builtin_define(parser, "GL_ES", 1);
+   else if (is_compat)
+      add_builtin_define(parser, "GL_compatibility_profile", 1);
    else if (version >= 150)
       add_builtin_define(parser, "GL_core_profile", 1);
 
@@ -2373,8 +2386,8 @@
    if (explicitly_set) {
       _mesa_string_buffer_printf(parser->output,
                                  "#version %" PRIiMAX "%s%s", version,
-                                 es_identifier ? " " : "",
-                                 es_identifier ? es_identifier : "");
+                                 identifier ? " " : "",
+                                 identifier ? identifier : "");
    }
 }
 
diff --git a/src/compiler/glsl/glcpp/meson.build b/src/compiler/glsl/glcpp/meson.build
index e6a3dc8..287da35 100644
--- a/src/compiler/glsl/glcpp/meson.build
+++ b/src/compiler/glsl/glcpp/meson.build
@@ -57,15 +57,16 @@
 
 if with_tests
   modes = ['unix', 'windows', 'oldmac', 'bizarro']
-  if dep_valgrind != [] and dep_valgrind.found()
+  if dep_valgrind.found()
     modes += ['valgrind']
   endif
 
   foreach m : modes
     test(
       'glcpp test (@0@)'.format(m),
-      find_program('tests/glcpp_test.py'),
+      prog_python2,
       args : [
+        join_paths(meson.current_source_dir(), 'tests/glcpp_test.py'),
         glcpp, join_paths(meson.current_source_dir(), 'tests'),
         '--@0@'.format(m),
       ],
diff --git a/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c b/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c
index ae7ea09..2b084e0 100644
--- a/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c
+++ b/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c
@@ -2,6 +2,7 @@
 #define TWO  ( 1+1 )
 #define FOUR (2 + 2)
 #define SIX  (3 + 3)
+#define EIGHT (8 + 8)
 
 /* Redefinitions with whitespace in same places, but different amounts, (so no
  * error). */
@@ -9,6 +10,9 @@
 #define FOUR    (2	+  2)
 #define SIX	(3/*comment is whitespace*/+   /* collapsed */ /* to */ /* one */ /* space */  3)
 
+/* Trailing whitespace (no error) */
+#define EIGHT (8 + 8)       
+
 /* Redefinitions with whitespace in different places. Each of these should
  * trigger an error. */
 #define TWO  (1 + 1)
diff --git a/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c.expected b/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c.expected
index 602bdef..766849e 100644
--- a/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c.expected
+++ b/src/compiler/glsl/glcpp/tests/122-redefine-whitespace.c.expected
@@ -1,14 +1,15 @@
-0:14(9): preprocessor error: Redefinition of macro TWO
+0:18(9): preprocessor error: Redefinition of macro TWO
 
-0:15(9): preprocessor error: Redefinition of macro FOUR
+0:19(9): preprocessor error: Redefinition of macro FOUR
 
-0:16(9): preprocessor error: Redefinition of macro SIX
+0:20(9): preprocessor error: Redefinition of macro SIX
 
  
 
 
 
 
+
  
 
 
@@ -18,5 +19,8 @@
  
 
 
+ 
+
+
 
 
diff --git a/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c b/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c
new file mode 100644
index 0000000..1be9b28
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c
@@ -0,0 +1,5 @@
+#if 0x1234abcd == 0X1234abcd
+success
+#else
+failure
+#endif
diff --git a/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c.expected b/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c.expected
new file mode 100644
index 0000000..4cf250f
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/149-hex-const-uppercase-prefix.c.expected
@@ -0,0 +1,5 @@
+
+success
+
+
+
diff --git a/src/compiler/glsl/glcpp/tests/glcpp-test-cr-lf.sh b/src/compiler/glsl/glcpp/tests/glcpp-test-cr-lf.sh
deleted file mode 100755
index c41ee9f..0000000
--- a/src/compiler/glsl/glcpp/tests/glcpp-test-cr-lf.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/sh
-
-$PYTHON2 $srcdir/glsl/glcpp/tests/glcpp_test.py $abs_builddir/glsl/glcpp/glcpp $srcdir/glsl/glcpp/tests --windows --oldmac --bizarro
diff --git a/src/compiler/glsl/glcpp/tests/glcpp-test.sh b/src/compiler/glsl/glcpp/tests/glcpp-test.sh
index 3925c4a..b8397ec 100755
--- a/src/compiler/glsl/glcpp/tests/glcpp-test.sh
+++ b/src/compiler/glsl/glcpp/tests/glcpp-test.sh
@@ -1,3 +1,16 @@
 #!/bin/sh
 
-$PYTHON2 $srcdir/glsl/glcpp/tests/glcpp_test.py $abs_builddir/glsl/glcpp/glcpp $srcdir/glsl/glcpp/tests --unix
+if [ -z "$srcdir" -o -z "$abs_builddir" ]; then
+    echo ""
+    echo "Warning: you're invoking the script manually and things may fail."
+    echo "Attempting to determine/set srcdir and abs_builddir variables."
+    echo ""
+
+    # Should point to `dirname Makefile.glsl.am`
+    srcdir=./../../../
+    cd `dirname "$0"`
+    # Should point to `dirname Makefile` equivalent to the above.
+    abs_builddir=`pwd`/../../../
+fi
+
+$PYTHON2 $srcdir/glsl/glcpp/tests/glcpp_test.py $abs_builddir/glsl/glcpp/glcpp $srcdir/glsl/glcpp/tests --unix --windows --oldmac --bizarro
diff --git a/src/compiler/glsl/glcpp/tests/glcpp_test.py b/src/compiler/glsl/glcpp/tests/glcpp_test.py
old mode 100755
new mode 100644
index 8ac5d7c..8c75521
--- a/src/compiler/glsl/glcpp/tests/glcpp_test.py
+++ b/src/compiler/glsl/glcpp/tests/glcpp_test.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python2
 # encoding=utf-8
 # Copyright © 2018 Intel Corporation
 
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index c00438d..87b64e0 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -600,7 +600,7 @@
 interface	KEYWORD(110, 100, 0, 0, INTERFACE);
 long		KEYWORD(110, 100, 0, 0, LONG_TOK);
 short		KEYWORD(110, 100, 0, 0, SHORT_TOK);
-double		TYPE_WITH_ALT(130, 300, 130, 300, yyextra->ARB_gpu_shader_fp64_enable, glsl_type::double_type);
+double		TYPE_WITH_ALT(130, 100, 130, 300, yyextra->ARB_gpu_shader_fp64_enable, glsl_type::double_type);
 half		KEYWORD(110, 100, 0, 0, HALF);
 fixed		KEYWORD(110, 100, 0, 0, FIXED_TOK);
 unsigned	KEYWORD(110, 100, 0, 0, UNSIGNED);
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index d44afb5..cb73769 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1340,18 +1340,18 @@
                { "r32i", GL_R32I, GLSL_TYPE_INT, 130, 310, false },
                { "r16i", GL_R16I, GLSL_TYPE_INT, 130, 0, true },
                { "r8i", GL_R8I, GLSL_TYPE_INT, 130, 0, true },
-               { "rgba16", GL_RGBA16, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "rgba16", GL_RGBA16, GLSL_TYPE_FLOAT, 130, 0, true },
                { "rgb10_a2", GL_RGB10_A2, GLSL_TYPE_FLOAT, 130, 0, true },
                { "rgba8", GL_RGBA8, GLSL_TYPE_FLOAT, 130, 310, false },
-               { "rg16", GL_RG16, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "rg16", GL_RG16, GLSL_TYPE_FLOAT, 130, 0, true },
                { "rg8", GL_RG8, GLSL_TYPE_FLOAT, 130, 0, true },
-               { "r16", GL_R16, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "r16", GL_R16, GLSL_TYPE_FLOAT, 130, 0, true },
                { "r8", GL_R8, GLSL_TYPE_FLOAT, 130, 0, true },
-               { "rgba16_snorm", GL_RGBA16_SNORM, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "rgba16_snorm", GL_RGBA16_SNORM, GLSL_TYPE_FLOAT, 130, 0, true },
                { "rgba8_snorm", GL_RGBA8_SNORM, GLSL_TYPE_FLOAT, 130, 310, false },
-               { "rg16_snorm", GL_RG16_SNORM, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "rg16_snorm", GL_RG16_SNORM, GLSL_TYPE_FLOAT, 130, 0, true },
                { "rg8_snorm", GL_RG8_SNORM, GLSL_TYPE_FLOAT, 130, 0, true },
-               { "r16_snorm", GL_R16_SNORM, GLSL_TYPE_FLOAT, 130, 0, false },
+               { "r16_snorm", GL_R16_SNORM, GLSL_TYPE_FLOAT, 130, 0, true },
                { "r8_snorm", GL_R8_SNORM, GLSL_TYPE_FLOAT, 130, 0, true }
             };
 
@@ -1432,6 +1432,36 @@
          }
       }
 
+      const bool pixel_interlock_ordered = match_layout_qualifier($1,
+         "pixel_interlock_ordered", state) == 0;
+      const bool pixel_interlock_unordered = match_layout_qualifier($1,
+         "pixel_interlock_unordered", state) == 0;
+      const bool sample_interlock_ordered = match_layout_qualifier($1,
+         "sample_interlock_ordered", state) == 0;
+      const bool sample_interlock_unordered = match_layout_qualifier($1,
+         "sample_interlock_unordered", state) == 0;
+
+      if (pixel_interlock_ordered + pixel_interlock_unordered +
+          sample_interlock_ordered + sample_interlock_unordered > 0 &&
+          state->stage != MESA_SHADER_FRAGMENT) {
+         _mesa_glsl_error(& @1, state, "interlock layout qualifiers: "
+                          "pixel_interlock_ordered, pixel_interlock_unordered, "
+                          "sample_interlock_ordered and sample_interlock_unordered, "
+                          "only valid in fragment shader input layout declaration.");
+      } else if (pixel_interlock_ordered + pixel_interlock_unordered +
+                 sample_interlock_ordered + sample_interlock_unordered > 0 &&
+                 !state->ARB_fragment_shader_interlock_enable) {
+         _mesa_glsl_error(& @1, state,
+                          "interlock layout qualifier present, but the "
+                          "GL_ARB_fragment_shader_interlock extension is not "
+                          "enabled.");
+      } else {
+         $$.flags.q.pixel_interlock_ordered = pixel_interlock_ordered;
+         $$.flags.q.pixel_interlock_unordered = pixel_interlock_unordered;
+         $$.flags.q.sample_interlock_ordered = sample_interlock_ordered;
+         $$.flags.q.sample_interlock_unordered = sample_interlock_unordered;
+      }
+
       /* Layout qualifiers for tessellation evaluation shaders. */
       if (!$$.flags.i) {
          static const struct {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 25003ee..6d92f24 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -299,6 +299,10 @@
    this->fs_early_fragment_tests = false;
    this->fs_inner_coverage = false;
    this->fs_post_depth_coverage = false;
+   this->fs_pixel_interlock_ordered = false;
+   this->fs_pixel_interlock_unordered = false;
+   this->fs_sample_interlock_ordered = false;
+   this->fs_sample_interlock_unordered = false;
    this->fs_blend_support = 0;
    memset(this->atomic_counter_offsets, 0,
           sizeof(this->atomic_counter_offsets));
@@ -630,6 +634,7 @@
    EXT(ARB_explicit_uniform_location),
    EXT(ARB_fragment_coord_conventions),
    EXT(ARB_fragment_layer_viewport),
+   EXT(ARB_fragment_shader_interlock),
    EXT(ARB_gpu_shader5),
    EXT(ARB_gpu_shader_fp64),
    EXT(ARB_gpu_shader_int64),
@@ -1721,6 +1726,10 @@
       assert(!state->fs_early_fragment_tests);
       assert(!state->fs_inner_coverage);
       assert(!state->fs_post_depth_coverage);
+      assert(!state->fs_pixel_interlock_ordered);
+      assert(!state->fs_pixel_interlock_unordered);
+      assert(!state->fs_sample_interlock_ordered);
+      assert(!state->fs_sample_interlock_unordered);
    }
 
    for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
@@ -1842,6 +1851,10 @@
       shader->EarlyFragmentTests = state->fs_early_fragment_tests;
       shader->InnerCoverage = state->fs_inner_coverage;
       shader->PostDepthCoverage = state->fs_post_depth_coverage;
+      shader->PixelInterlockOrdered = state->fs_pixel_interlock_ordered;
+      shader->PixelInterlockUnordered = state->fs_pixel_interlock_unordered;
+      shader->SampleInterlockOrdered = state->fs_sample_interlock_ordered;
+      shader->SampleInterlockUnordered = state->fs_sample_interlock_unordered;
       shader->BlendSupport = state->fs_blend_support;
       break;
 
@@ -2201,7 +2214,6 @@
    OPT(do_if_simplification, ir);
    OPT(opt_flatten_nested_if_blocks, ir);
    OPT(opt_conditional_discard, ir);
-   OPT(do_copy_propagation, ir);
    OPT(do_copy_propagation_elements, ir);
 
    if (options->OptimizeForAOS && !linked)
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 5b9b6cc..59a1734 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -639,6 +639,8 @@
    bool ARB_fragment_coord_conventions_warn;
    bool ARB_fragment_layer_viewport_enable;
    bool ARB_fragment_layer_viewport_warn;
+   bool ARB_fragment_shader_interlock_enable;
+   bool ARB_fragment_shader_interlock_warn;
    bool ARB_gpu_shader5_enable;
    bool ARB_gpu_shader5_warn;
    bool ARB_gpu_shader_fp64_enable;
@@ -833,6 +835,11 @@
 
    bool fs_post_depth_coverage;
 
+   bool fs_pixel_interlock_ordered;
+   bool fs_pixel_interlock_unordered;
+   bool fs_sample_interlock_ordered;
+   bool fs_sample_interlock_unordered;
+
    unsigned fs_blend_support;
 
    /**
diff --git a/src/compiler/glsl/glsl_to_nir.cpp b/src/compiler/glsl/glsl_to_nir.cpp
index 7e9108b..1e4d9f9 100644
--- a/src/compiler/glsl/glsl_to_nir.cpp
+++ b/src/compiler/glsl/glsl_to_nir.cpp
@@ -91,12 +91,10 @@
    nir_builder b;
    nir_ssa_def *result; /* result of the expression tree last visited */
 
-   nir_deref_var *evaluate_deref(nir_instr *mem_ctx, ir_instruction *ir);
+   nir_deref_instr *evaluate_deref(ir_instruction *ir);
 
-   /* the head of the dereference chain we're creating */
-   nir_deref_var *deref_head;
-   /* the tail of the dereference chain we're creating */
-   nir_deref *deref_tail;
+   /* most recent deref instruction created */
+   nir_deref_instr *deref;
 
    nir_variable *var; /* variable created by ir_variable visitor */
 
@@ -199,8 +197,6 @@
    this->result = NULL;
    this->impl = NULL;
    this->var = NULL;
-   this->deref_head = NULL;
-   this->deref_tail = NULL;
    memset(&this->b, 0, sizeof(this->b));
 }
 
@@ -210,12 +206,11 @@
    _mesa_hash_table_destroy(this->overload_table, NULL);
 }
 
-nir_deref_var *
-nir_visitor::evaluate_deref(nir_instr *mem_ctx, ir_instruction *ir)
+nir_deref_instr *
+nir_visitor::evaluate_deref(ir_instruction *ir)
 {
    ir->accept(this);
-   ralloc_steal(mem_ctx, this->deref_head);
-   return this->deref_head;
+   return this->deref;
 }
 
 static nir_constant *
@@ -435,6 +430,7 @@
    var->data.index = ir->data.index;
    var->data.descriptor_set = 0;
    var->data.binding = ir->data.binding;
+   var->data.explicit_binding = ir->data.explicit_binding;
    var->data.bindless = ir->data.bindless;
    var->data.offset = ir->data.offset;
    var->data.image.read_only = ir->data.memory_read_only;
@@ -444,6 +440,10 @@
    var->data.image.restrict_flag = ir->data.memory_restrict;
    var->data.image.format = ir->data.image_format;
    var->data.fb_fetch_output = ir->data.fb_fetch_output;
+   var->data.explicit_xfb_buffer = ir->data.explicit_xfb_buffer;
+   var->data.explicit_xfb_stride = ir->data.explicit_xfb_stride;
+   var->data.xfb_buffer = ir->data.xfb_buffer;
+   var->data.xfb_stride = ir->data.xfb_stride;
 
    var->num_state_slots = ir->get_num_state_slots();
    if (var->num_state_slots > 0) {
@@ -521,7 +521,6 @@
 
       assert(strcmp(func->name, "main") == 0);
       assert(ir->parameters.is_empty());
-      assert(func->return_type == glsl_type::void_type);
 
       this->is_global = false;
 
@@ -616,14 +615,7 @@
 void
 nir_visitor::visit(ir_return *ir)
 {
-   if (ir->value != NULL) {
-      nir_intrinsic_instr *copy =
-         nir_intrinsic_instr_create(this->shader, nir_intrinsic_copy_var);
-
-      copy->variables[0] = nir_deref_var_create(copy, this->impl->return_var);
-      copy->variables[1] = evaluate_deref(&copy->instr, ir->value);
-   }
-
+   assert(ir->value == NULL);
    nir_jump_instr *instr = nir_jump_instr_create(this->shader, nir_jump_return);
    nir_builder_instr_insert(&b, &instr->instr);
 }
@@ -636,76 +628,76 @@
 
       switch (ir->callee->intrinsic_id) {
       case ir_intrinsic_atomic_counter_read:
-         op = nir_intrinsic_atomic_counter_read_var;
+         op = nir_intrinsic_atomic_counter_read_deref;
          break;
       case ir_intrinsic_atomic_counter_increment:
-         op = nir_intrinsic_atomic_counter_inc_var;
+         op = nir_intrinsic_atomic_counter_inc_deref;
          break;
       case ir_intrinsic_atomic_counter_predecrement:
-         op = nir_intrinsic_atomic_counter_dec_var;
+         op = nir_intrinsic_atomic_counter_pre_dec_deref;
          break;
       case ir_intrinsic_atomic_counter_add:
-         op = nir_intrinsic_atomic_counter_add_var;
+         op = nir_intrinsic_atomic_counter_add_deref;
          break;
       case ir_intrinsic_atomic_counter_and:
-         op = nir_intrinsic_atomic_counter_and_var;
+         op = nir_intrinsic_atomic_counter_and_deref;
          break;
       case ir_intrinsic_atomic_counter_or:
-         op = nir_intrinsic_atomic_counter_or_var;
+         op = nir_intrinsic_atomic_counter_or_deref;
          break;
       case ir_intrinsic_atomic_counter_xor:
-         op = nir_intrinsic_atomic_counter_xor_var;
+         op = nir_intrinsic_atomic_counter_xor_deref;
          break;
       case ir_intrinsic_atomic_counter_min:
-         op = nir_intrinsic_atomic_counter_min_var;
+         op = nir_intrinsic_atomic_counter_min_deref;
          break;
       case ir_intrinsic_atomic_counter_max:
-         op = nir_intrinsic_atomic_counter_max_var;
+         op = nir_intrinsic_atomic_counter_max_deref;
          break;
       case ir_intrinsic_atomic_counter_exchange:
-         op = nir_intrinsic_atomic_counter_exchange_var;
+         op = nir_intrinsic_atomic_counter_exchange_deref;
          break;
       case ir_intrinsic_atomic_counter_comp_swap:
-         op = nir_intrinsic_atomic_counter_comp_swap_var;
+         op = nir_intrinsic_atomic_counter_comp_swap_deref;
          break;
       case ir_intrinsic_image_load:
-         op = nir_intrinsic_image_var_load;
+         op = nir_intrinsic_image_deref_load;
          break;
       case ir_intrinsic_image_store:
-         op = nir_intrinsic_image_var_store;
+         op = nir_intrinsic_image_deref_store;
          break;
       case ir_intrinsic_image_atomic_add:
-         op = nir_intrinsic_image_var_atomic_add;
+         op = nir_intrinsic_image_deref_atomic_add;
          break;
       case ir_intrinsic_image_atomic_min:
-         op = nir_intrinsic_image_var_atomic_min;
+         op = nir_intrinsic_image_deref_atomic_min;
          break;
       case ir_intrinsic_image_atomic_max:
-         op = nir_intrinsic_image_var_atomic_max;
+         op = nir_intrinsic_image_deref_atomic_max;
          break;
       case ir_intrinsic_image_atomic_and:
-         op = nir_intrinsic_image_var_atomic_and;
+         op = nir_intrinsic_image_deref_atomic_and;
          break;
       case ir_intrinsic_image_atomic_or:
-         op = nir_intrinsic_image_var_atomic_or;
+         op = nir_intrinsic_image_deref_atomic_or;
          break;
       case ir_intrinsic_image_atomic_xor:
-         op = nir_intrinsic_image_var_atomic_xor;
+         op = nir_intrinsic_image_deref_atomic_xor;
          break;
       case ir_intrinsic_image_atomic_exchange:
-         op = nir_intrinsic_image_var_atomic_exchange;
+         op = nir_intrinsic_image_deref_atomic_exchange;
          break;
       case ir_intrinsic_image_atomic_comp_swap:
-         op = nir_intrinsic_image_var_atomic_comp_swap;
+         op = nir_intrinsic_image_deref_atomic_comp_swap;
          break;
       case ir_intrinsic_memory_barrier:
          op = nir_intrinsic_memory_barrier;
          break;
       case ir_intrinsic_image_size:
-         op = nir_intrinsic_image_var_size;
+         op = nir_intrinsic_image_deref_size;
          break;
       case ir_intrinsic_image_samples:
-         op = nir_intrinsic_image_var_samples;
+         op = nir_intrinsic_image_deref_samples;
          break;
       case ir_intrinsic_ssbo_store:
          op = nir_intrinsic_store_ssbo;
@@ -752,6 +744,12 @@
       case ir_intrinsic_shader_clock:
          op = nir_intrinsic_shader_clock;
          break;
+      case ir_intrinsic_begin_invocation_interlock:
+         op = nir_intrinsic_begin_invocation_interlock;
+         break;
+      case ir_intrinsic_end_invocation_interlock:
+         op = nir_intrinsic_end_invocation_interlock;
+         break;
       case ir_intrinsic_group_memory_barrier:
          op = nir_intrinsic_group_memory_barrier;
          break;
@@ -835,22 +833,22 @@
       nir_dest *dest = &instr->dest;
 
       switch (op) {
-      case nir_intrinsic_atomic_counter_read_var:
-      case nir_intrinsic_atomic_counter_inc_var:
-      case nir_intrinsic_atomic_counter_dec_var:
-      case nir_intrinsic_atomic_counter_add_var:
-      case nir_intrinsic_atomic_counter_min_var:
-      case nir_intrinsic_atomic_counter_max_var:
-      case nir_intrinsic_atomic_counter_and_var:
-      case nir_intrinsic_atomic_counter_or_var:
-      case nir_intrinsic_atomic_counter_xor_var:
-      case nir_intrinsic_atomic_counter_exchange_var:
-      case nir_intrinsic_atomic_counter_comp_swap_var: {
+      case nir_intrinsic_atomic_counter_read_deref:
+      case nir_intrinsic_atomic_counter_inc_deref:
+      case nir_intrinsic_atomic_counter_pre_dec_deref:
+      case nir_intrinsic_atomic_counter_add_deref:
+      case nir_intrinsic_atomic_counter_min_deref:
+      case nir_intrinsic_atomic_counter_max_deref:
+      case nir_intrinsic_atomic_counter_and_deref:
+      case nir_intrinsic_atomic_counter_or_deref:
+      case nir_intrinsic_atomic_counter_xor_deref:
+      case nir_intrinsic_atomic_counter_exchange_deref:
+      case nir_intrinsic_atomic_counter_comp_swap_deref: {
          /* Set the counter variable dereference. */
          exec_node *param = ir->actual_parameters.get_head();
          ir_dereference *counter = (ir_dereference *)param;
 
-         instr->variables[0] = evaluate_deref(&instr->instr, counter);
+         instr->src[0] = nir_src_for_ssa(&evaluate_deref(counter)->dest.ssa);
          param = param->get_next();
 
          /* Set the intrinsic destination. */
@@ -860,13 +858,13 @@
 
          /* Set the intrinsic parameters. */
          if (!param->is_tail_sentinel()) {
-            instr->src[0] =
+            instr->src[1] =
                nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
 
          if (!param->is_tail_sentinel()) {
-            instr->src[1] =
+            instr->src[2] =
                nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
@@ -874,18 +872,18 @@
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
-      case nir_intrinsic_image_var_load:
-      case nir_intrinsic_image_var_store:
-      case nir_intrinsic_image_var_atomic_add:
-      case nir_intrinsic_image_var_atomic_min:
-      case nir_intrinsic_image_var_atomic_max:
-      case nir_intrinsic_image_var_atomic_and:
-      case nir_intrinsic_image_var_atomic_or:
-      case nir_intrinsic_image_var_atomic_xor:
-      case nir_intrinsic_image_var_atomic_exchange:
-      case nir_intrinsic_image_var_atomic_comp_swap:
-      case nir_intrinsic_image_var_samples:
-      case nir_intrinsic_image_var_size: {
+      case nir_intrinsic_image_deref_load:
+      case nir_intrinsic_image_deref_store:
+      case nir_intrinsic_image_deref_atomic_add:
+      case nir_intrinsic_image_deref_atomic_min:
+      case nir_intrinsic_image_deref_atomic_max:
+      case nir_intrinsic_image_deref_atomic_and:
+      case nir_intrinsic_image_deref_atomic_or:
+      case nir_intrinsic_image_deref_atomic_xor:
+      case nir_intrinsic_image_deref_atomic_exchange:
+      case nir_intrinsic_image_deref_atomic_comp_swap:
+      case nir_intrinsic_image_deref_samples:
+      case nir_intrinsic_image_deref_size: {
          nir_ssa_undef_instr *instr_undef =
             nir_ssa_undef_instr_create(shader, 1, 32);
          nir_builder_instr_insert(&b, &instr_undef->instr);
@@ -896,20 +894,20 @@
          const glsl_type *type =
             image->variable_referenced()->type->without_array();
 
-         instr->variables[0] = evaluate_deref(&instr->instr, image);
+         instr->src[0] = nir_src_for_ssa(&evaluate_deref(image)->dest.ssa);
          param = param->get_next();
 
          /* Set the intrinsic destination. */
          if (ir->return_deref) {
             unsigned num_components = ir->return_deref->type->vector_elements;
-            if (instr->intrinsic == nir_intrinsic_image_var_size)
+            if (instr->intrinsic == nir_intrinsic_image_deref_size)
                instr->num_components = num_components;
             nir_ssa_dest_init(&instr->instr, &instr->dest,
                               num_components, 32, NULL);
          }
 
-         if (op == nir_intrinsic_image_var_size ||
-             op == nir_intrinsic_image_var_samples) {
+         if (op == nir_intrinsic_image_deref_size ||
+             op == nir_intrinsic_image_deref_samples) {
             nir_builder_instr_insert(&b, &instr->instr);
             break;
          }
@@ -928,29 +926,29 @@
                srcs[i] = &instr_undef->def;
          }
 
-         instr->src[0] = nir_src_for_ssa(nir_vec(&b, srcs, 4));
+         instr->src[1] = nir_src_for_ssa(nir_vec(&b, srcs, 4));
          param = param->get_next();
 
          /* Set the sample argument, which is undefined for single-sample
           * images.
           */
          if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
-            instr->src[1] =
+            instr->src[2] =
                nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          } else {
-            instr->src[1] = nir_src_for_ssa(&instr_undef->def);
+            instr->src[2] = nir_src_for_ssa(&instr_undef->def);
          }
 
          /* Set the intrinsic parameters. */
          if (!param->is_tail_sentinel()) {
-            instr->src[2] =
+            instr->src[3] =
                nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
 
          if (!param->is_tail_sentinel()) {
-            instr->src[3] =
+            instr->src[4] =
                nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param));
             param = param->get_next();
          }
@@ -970,6 +968,12 @@
          instr->num_components = 2;
          nir_builder_instr_insert(&b, &instr->instr);
          break;
+      case nir_intrinsic_begin_invocation_interlock:
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
+      case nir_intrinsic_end_invocation_interlock:
+         nir_builder_instr_insert(&b, &instr->instr);
+         break;
       case nir_intrinsic_store_ssbo: {
          exec_node *param = ir->actual_parameters.get_head();
          ir_rvalue *block = ((ir_instruction *)param)->as_rvalue();
@@ -1216,38 +1220,13 @@
          unreachable("not reached");
       }
 
-      if (ir->return_deref) {
-         nir_intrinsic_instr *store_instr =
-            nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
-         store_instr->num_components = ir->return_deref->type->vector_elements;
-         nir_intrinsic_set_write_mask(store_instr,
-                                      (1 << store_instr->num_components) - 1);
-
-         store_instr->variables[0] =
-            evaluate_deref(&store_instr->instr, ir->return_deref);
-         store_instr->src[0] = nir_src_for_ssa(&dest->ssa);
-
-         nir_builder_instr_insert(&b, &store_instr->instr);
-      }
+      if (ir->return_deref)
+         nir_store_deref(&b, evaluate_deref(ir->return_deref), &dest->ssa, ~0);
 
       return;
    }
 
-   struct hash_entry *entry =
-      _mesa_hash_table_search(this->overload_table, ir->callee);
-   assert(entry);
-   nir_function *callee = (nir_function *) entry->data;
-
-   nir_call_instr *instr = nir_call_instr_create(this->shader, callee);
-
-   unsigned i = 0;
-   foreach_in_list(ir_dereference, param, &ir->actual_parameters) {
-      instr->params[i] = evaluate_deref(&instr->instr, param);
-      i++;
-   }
-
-   instr->return_deref = evaluate_deref(&instr->instr, ir->return_deref);
-   nir_builder_instr_insert(&b, &instr->instr);
+   unreachable("glsl_to_nir only handles function calls to intrinsics");
 }
 
 void
@@ -1260,19 +1239,12 @@
 
    if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) &&
        (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) {
-      /* We're doing a plain-as-can-be copy, so emit a copy_var */
-      nir_intrinsic_instr *copy =
-         nir_intrinsic_instr_create(this->shader, nir_intrinsic_copy_var);
-
-      copy->variables[0] = evaluate_deref(&copy->instr, ir->lhs);
-      copy->variables[1] = evaluate_deref(&copy->instr, ir->rhs);
-
       if (ir->condition) {
          nir_push_if(&b, evaluate_rvalue(ir->condition));
-         nir_builder_instr_insert(&b, &copy->instr);
+         nir_copy_deref(&b, evaluate_deref(ir->lhs), evaluate_deref(ir->rhs));
          nir_pop_if(&b, NULL);
       } else {
-         nir_builder_instr_insert(&b, &copy->instr);
+         nir_copy_deref(&b, evaluate_deref(ir->lhs), evaluate_deref(ir->rhs));
       }
       return;
    }
@@ -1280,7 +1252,7 @@
    assert(ir->rhs->type->is_scalar() || ir->rhs->type->is_vector());
 
    ir->lhs->accept(this);
-   nir_deref_var *lhs_deref = this->deref_head;
+   nir_deref_instr *lhs_deref = this->deref;
    nir_ssa_def *src = evaluate_rvalue(ir->rhs);
 
    if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) {
@@ -1297,19 +1269,12 @@
       src = nir_swizzle(&b, src, swiz, num_components, !supports_ints);
    }
 
-   nir_intrinsic_instr *store =
-      nir_intrinsic_instr_create(this->shader, nir_intrinsic_store_var);
-   store->num_components = ir->lhs->type->vector_elements;
-   nir_intrinsic_set_write_mask(store, ir->write_mask);
-   store->variables[0] = nir_deref_var_clone(lhs_deref, store);
-   store->src[0] = nir_src_for_ssa(src);
-
    if (ir->condition) {
       nir_push_if(&b, evaluate_rvalue(ir->condition));
-      nir_builder_instr_insert(&b, &store->instr);
+      nir_store_deref(&b, lhs_deref, src, ir->write_mask);
       nir_pop_if(&b, NULL);
    } else {
-      nir_builder_instr_insert(&b, &store->instr);
+      nir_store_deref(&b, lhs_deref, src, ir->write_mask);
    }
 }
 
@@ -1376,13 +1341,7 @@
        * must emit a variable load.
        */
 
-      nir_intrinsic_instr *load_instr =
-         nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_var);
-      load_instr->num_components = ir->type->vector_elements;
-      load_instr->variables[0] = this->deref_head;
-      ralloc_steal(load_instr, load_instr->variables[0]);
-      unsigned bit_size = glsl_get_bit_size(ir->type);
-      add_instr(&load_instr->instr, ir->type->vector_elements, bit_size);
+      this->result = nir_load_deref(&b, this->deref);
    }
 
    return this->result;
@@ -1445,16 +1404,16 @@
       deref->accept(this);
 
       nir_intrinsic_op op;
-      if (this->deref_head->var->data.mode == nir_var_shader_in) {
+      if (this->deref->mode == nir_var_shader_in) {
          switch (ir->operation) {
          case ir_unop_interpolate_at_centroid:
-            op = nir_intrinsic_interp_var_at_centroid;
+            op = nir_intrinsic_interp_deref_at_centroid;
             break;
          case ir_binop_interpolate_at_offset:
-            op = nir_intrinsic_interp_var_at_offset;
+            op = nir_intrinsic_interp_deref_at_offset;
             break;
          case ir_binop_interpolate_at_sample:
-            op = nir_intrinsic_interp_var_at_sample;
+            op = nir_intrinsic_interp_deref_at_sample;
             break;
          default:
             unreachable("Invalid interpolation intrinsic");
@@ -1466,18 +1425,17 @@
           * sense, we'll just turn it into a load which will probably
           * eventually end up as an SSA definition.
           */
-         assert(this->deref_head->var->data.mode == nir_var_global);
-         op = nir_intrinsic_load_var;
+         assert(this->deref->mode == nir_var_global);
+         op = nir_intrinsic_load_deref;
       }
 
       nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(shader, op);
       intrin->num_components = deref->type->vector_elements;
-      intrin->variables[0] = this->deref_head;
-      ralloc_steal(intrin, intrin->variables[0]);
+      intrin->src[0] = nir_src_for_ssa(&this->deref->dest.ssa);
 
-      if (intrin->intrinsic == nir_intrinsic_interp_var_at_offset ||
-          intrin->intrinsic == nir_intrinsic_interp_var_at_sample)
-         intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
+      if (intrin->intrinsic == nir_intrinsic_interp_deref_at_offset ||
+          intrin->intrinsic == nir_intrinsic_interp_deref_at_sample)
+         intrin->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
 
       unsigned bit_size =  glsl_get_bit_size(deref->type);
       add_instr(&intrin->instr, deref->type->vector_elements, bit_size);
@@ -2055,6 +2013,9 @@
    if (ir->offset != NULL)
       num_srcs++;
 
+   /* Add two: one for the texture deref and one for the sampler deref */
+   num_srcs += 2;
+
    nir_tex_instr *instr = nir_tex_instr_create(this->shader, num_srcs);
 
    instr->op = op;
@@ -2079,9 +2040,13 @@
       unreachable("not reached");
    }
 
-   instr->texture = evaluate_deref(&instr->instr, ir->sampler);
+   nir_deref_instr *sampler_deref = evaluate_deref(ir->sampler);
+   instr->src[0].src = nir_src_for_ssa(&sampler_deref->dest.ssa);
+   instr->src[0].src_type = nir_tex_src_texture_deref;
+   instr->src[1].src = nir_src_for_ssa(&sampler_deref->dest.ssa);
+   instr->src[1].src_type = nir_tex_src_sampler_deref;
 
-   unsigned src_number = 0;
+   unsigned src_number = 2;
 
    if (ir->coordinate != NULL) {
       instr->coord_components = ir->coordinate->type->vector_elements;
@@ -2180,8 +2145,7 @@
    var->data.read_only = true;
    var->constant_initializer = constant_copy(ir, var);
 
-   this->deref_head = nir_deref_var_create(this->shader, var);
-   this->deref_tail = &this->deref_head->deref;
+   this->deref = nir_build_deref_var(&b, var);
 }
 
 void
@@ -2192,9 +2156,7 @@
    assert(entry);
    nir_variable *var = (nir_variable *) entry->data;
 
-   nir_deref_var *deref = nir_deref_var_create(this->shader, var);
-   this->deref_head = deref;
-   this->deref_tail = &deref->deref;
+   this->deref = nir_build_deref_var(&b, var);
 }
 
 void
@@ -2205,33 +2167,17 @@
    int field_index = ir->field_idx;
    assert(field_index >= 0);
 
-   nir_deref_struct *deref = nir_deref_struct_create(this->deref_tail, field_index);
-   deref->deref.type = ir->type;
-   this->deref_tail->child = &deref->deref;
-   this->deref_tail = &deref->deref;
+   this->deref = nir_build_deref_struct(&b, this->deref, field_index);
 }
 
 void
 nir_visitor::visit(ir_dereference_array *ir)
 {
-   nir_deref_array *deref = nir_deref_array_create(this->shader);
-   deref->deref.type = ir->type;
-
-   ir_constant *const_index = ir->array_index->as_constant();
-   if (const_index != NULL) {
-      deref->deref_array_type = nir_deref_array_type_direct;
-      deref->base_offset = const_index->value.u[0];
-   } else {
-      deref->deref_array_type = nir_deref_array_type_indirect;
-      deref->indirect =
-         nir_src_for_ssa(evaluate_rvalue(ir->array_index));
-   }
+   nir_ssa_def *index = evaluate_rvalue(ir->array_index);
 
    ir->array->accept(this);
 
-   this->deref_tail->child = &deref->deref;
-   ralloc_steal(this->deref_tail, deref);
-   this->deref_tail = &deref->deref;
+   this->deref = nir_build_deref_array(&b, this->deref, index);
 }
 
 void
diff --git a/src/compiler/glsl/ir.cpp b/src/compiler/glsl/ir.cpp
index e3134ea..1d1a56a 100644
--- a/src/compiler/glsl/ir.cpp
+++ b/src/compiler/glsl/ir.cpp
@@ -820,6 +820,10 @@
 	    for (unsigned i = 0; i < type->components(); i++)
 	       this->value.b[i] = value->value.b[0];
 	    break;
+	 case GLSL_TYPE_SAMPLER:
+	 case GLSL_TYPE_IMAGE:
+	    this->value.u64[0] = value->value.u64[0];
+	    break;
 	 default:
 	    assert(!"Should not get here.");
 	    break;
@@ -939,6 +943,8 @@
    case GLSL_TYPE_FLOAT: return ((int)this->value.f[i]) != 0;
    case GLSL_TYPE_BOOL:  return this->value.b[i];
    case GLSL_TYPE_DOUBLE: return this->value.d[i] != 0.0;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return this->value.u64[i] != 0;
    case GLSL_TYPE_INT64:  return this->value.i64[i] != 0;
    default:              assert(!"Should not get here."); break;
@@ -959,6 +965,8 @@
    case GLSL_TYPE_FLOAT: return this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1.0f : 0.0f;
    case GLSL_TYPE_DOUBLE: return (float) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return (float) this->value.u64[i];
    case GLSL_TYPE_INT64:  return (float) this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -979,6 +987,8 @@
    case GLSL_TYPE_FLOAT: return (double) this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1.0 : 0.0;
    case GLSL_TYPE_DOUBLE: return this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return (double) this->value.u64[i];
    case GLSL_TYPE_INT64:  return (double) this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -999,6 +1009,8 @@
    case GLSL_TYPE_FLOAT: return (int) this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
    case GLSL_TYPE_DOUBLE: return (int) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return (int) this->value.u64[i];
    case GLSL_TYPE_INT64:  return (int) this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -1019,6 +1031,8 @@
    case GLSL_TYPE_FLOAT: return (unsigned) this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
    case GLSL_TYPE_DOUBLE: return (unsigned) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return (unsigned) this->value.u64[i];
    case GLSL_TYPE_INT64:  return (unsigned) this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -1039,6 +1053,8 @@
    case GLSL_TYPE_FLOAT: return (int64_t) this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
    case GLSL_TYPE_DOUBLE: return (int64_t) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return (int64_t) this->value.u64[i];
    case GLSL_TYPE_INT64:  return this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -1059,6 +1075,8 @@
    case GLSL_TYPE_FLOAT: return (uint64_t) this->value.f[i];
    case GLSL_TYPE_BOOL:  return this->value.b[i] ? 1 : 0;
    case GLSL_TYPE_DOUBLE: return (uint64_t) this->value.d[i];
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64: return this->value.u64[i];
    case GLSL_TYPE_INT64:  return (uint64_t) this->value.i64[i];
    default:              assert(!"Should not get here."); break;
@@ -1110,6 +1128,8 @@
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
    case GLSL_TYPE_UINT64:
    case GLSL_TYPE_INT64:
    case GLSL_TYPE_BOOL: {
@@ -1132,7 +1152,9 @@
 	 case GLSL_TYPE_DOUBLE:
 	    value.d[i+offset] = src->get_double_component(i);
 	    break;
-         case GLSL_TYPE_UINT64:
+	 case GLSL_TYPE_SAMPLER:
+	 case GLSL_TYPE_IMAGE:
+	 case GLSL_TYPE_UINT64:
 	    value.u64[i+offset] = src->get_uint64_component(i);
 	    break;
 	 case GLSL_TYPE_INT64:
@@ -1189,7 +1211,9 @@
 	 case GLSL_TYPE_DOUBLE:
 	    value.d[i+offset] = src->get_double_component(id++);
 	    break;
-         case GLSL_TYPE_UINT64:
+	 case GLSL_TYPE_SAMPLER:
+	 case GLSL_TYPE_IMAGE:
+	 case GLSL_TYPE_UINT64:
 	    value.u64[i+offset] = src->get_uint64_component(id++);
 	    break;
 	 case GLSL_TYPE_INT64:
@@ -1239,6 +1263,8 @@
 	 if (this->value.d[i] != c->value.d[i])
 	    return false;
 	 break;
+      case GLSL_TYPE_SAMPLER:
+      case GLSL_TYPE_IMAGE:
       case GLSL_TYPE_UINT64:
 	 if (this->value.u64[i] != c->value.u64[i])
 	    return false;
@@ -1288,6 +1314,8 @@
 	 if (this->value.d[c] != double(f))
 	    return false;
 	 break;
+      case GLSL_TYPE_SAMPLER:
+      case GLSL_TYPE_IMAGE:
       case GLSL_TYPE_UINT64:
 	 if (this->value.u64[c] != uint64_t(i))
 	    return false;
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index 5420fc3..d05d199 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -1120,6 +1120,8 @@
    ir_intrinsic_memory_barrier_buffer,
    ir_intrinsic_memory_barrier_image,
    ir_intrinsic_memory_barrier_shared,
+   ir_intrinsic_begin_invocation_interlock,
+   ir_intrinsic_end_invocation_interlock,
 
    ir_intrinsic_vote_all,
    ir_intrinsic_vote_any,
diff --git a/src/compiler/glsl/ir_constant_expression.cpp b/src/compiler/glsl/ir_constant_expression.cpp
index 4a0aff7..c9788c7 100644
--- a/src/compiler/glsl/ir_constant_expression.cpp
+++ b/src/compiler/glsl/ir_constant_expression.cpp
@@ -826,7 +826,7 @@
          const unsigned component = idx->value.u[0];
 
          return new(mem_ctx) ir_constant(array, component);
-      } else {
+      } else if (array->type->is_array()) {
          const unsigned index = idx->value.u[0];
          return array->get_array_element(index)->clone(mem_ctx, NULL);
       }
diff --git a/src/compiler/glsl/ir_expression_operation.py b/src/compiler/glsl/ir_expression_operation.py
index d854292..16b9869 100644
--- a/src/compiler/glsl/ir_expression_operation.py
+++ b/src/compiler/glsl/ir_expression_operation.py
@@ -62,7 +62,7 @@
    def __iter__(self):
       return self
 
-   def next(self):
+   def __next__(self):
       if self.i < len(self.source_types):
          i = self.i
          self.i += 1
@@ -76,6 +76,8 @@
       else:
          raise StopIteration()
 
+   next = __next__
+
 
 uint_type = type("unsigned", "u", "GLSL_TYPE_UINT")
 int_type = type("int", "i", "GLSL_TYPE_INT")
@@ -114,7 +116,7 @@
 constant_template_vector_scalar = mako.template.Template("""\
    case ${op.get_enum_name()}:
     % if "mixed" in op.flags:
-        % for i in xrange(op.num_operands):
+        % for i in range(op.num_operands):
       assert(op[${i}]->type->base_type == ${op.source_types[0].glsl_type} ||
             % for src_type in op.source_types[1:-1]:
              op[${i}]->type->base_type == ${src_type.glsl_type} ||
diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h
index b0e8460..ef68b93 100644
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -103,7 +103,6 @@
 bool do_constant_folding(exec_list *instructions);
 bool do_constant_variable(exec_list *instructions);
 bool do_constant_variable_unlinked(exec_list *instructions);
-bool do_copy_propagation(exec_list *instructions);
 bool do_copy_propagation_elements(exec_list *instructions);
 bool do_constant_propagation(exec_list *instructions);
 void do_dead_builtin_varyings(struct gl_context *ctx,
diff --git a/src/compiler/glsl/link_uniform_block_active_visitor.cpp b/src/compiler/glsl/link_uniform_block_active_visitor.cpp
index cd1baf7..3689818 100644
--- a/src/compiler/glsl/link_uniform_block_active_visitor.cpp
+++ b/src/compiler/glsl/link_uniform_block_active_visitor.cpp
@@ -23,6 +23,7 @@
 
 #include "link_uniform_block_active_visitor.h"
 #include "program.h"
+#include "linker_util.h"
 
 static link_uniform_block_active *
 process_block(void *mem_ctx, struct hash_table *ht, ir_variable *var)
diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp
index e9e29d1..0ab9687b 100644
--- a/src/compiler/glsl/link_uniform_blocks.cpp
+++ b/src/compiler/glsl/link_uniform_blocks.cpp
@@ -298,7 +298,7 @@
    if (b->is_shader_storage &&
        parcel->buffer_size > ctx->Const.MaxShaderStorageBlockSize) {
       linker_error(prog, "shader storage block `%s' has size %d, "
-                   "which is larger than than the maximum allowed (%d)",
+                   "which is larger than the maximum allowed (%d)",
                    b->type->name,
                    parcel->buffer_size,
                    ctx->Const.MaxShaderStorageBlockSize);
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 23ff7ec..8d3f95f 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -1153,38 +1153,6 @@
    uniform_size->map->put(hidden_uniform_start + hidden_id, name);
 }
 
-/**
- * Search through the list of empty blocks to find one that fits the current
- * uniform.
- */
-static int
-find_empty_block(struct gl_shader_program *prog,
-                 struct gl_uniform_storage *uniform)
-{
-   const unsigned entries = MAX2(1, uniform->array_elements);
-
-   foreach_list_typed(struct empty_uniform_block, block, link,
-                      &prog->EmptyUniformLocations) {
-      /* Found a block with enough slots to fit the uniform */
-      if (block->slots == entries) {
-         unsigned start = block->start;
-         exec_node_remove(&block->link);
-         ralloc_free(block);
-
-         return start;
-      /* Found a block with more slots than needed. It can still be used. */
-      } else if (block->slots > entries) {
-         unsigned start = block->start;
-         block->start += entries;
-         block->slots -= entries;
-
-         return start;
-      }
-   }
-
-   return -1;
-}
-
 static void
 link_setup_uniform_remap_tables(struct gl_context *ctx,
                                 struct gl_shader_program *prog)
@@ -1239,7 +1207,7 @@
       int chosen_location = -1;
 
       if (empty_locs)
-         chosen_location = find_empty_block(prog, &prog->data->UniformStorage[i]);
+         chosen_location = link_util_find_empty_block(prog, &prog->data->UniformStorage[i]);
 
       /* Add new entries to the total amount of entries. */
       total_entries += entries;
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 8f329d8..b7260ac 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -76,6 +76,7 @@
 #include "util/set.h"
 #include "string_to_uint_map.h"
 #include "linker.h"
+#include "linker_util.h"
 #include "link_varyings.h"
 #include "ir_optimization.h"
 #include "ir_rvalue_visitor.h"
@@ -895,7 +896,7 @@
  * Perform validation of global variables used across multiple shaders
  */
 static void
-cross_validate_globals(struct gl_shader_program *prog,
+cross_validate_globals(struct gl_context *ctx, struct gl_shader_program *prog,
                        struct exec_list *ir, glsl_symbol_table *variables,
                        bool uniforms_only)
 {
@@ -1116,7 +1117,8 @@
          /* Check the precision qualifier matches for uniform variables on
           * GLSL ES.
           */
-         if (prog->IsES && !var->get_interface_type() &&
+         if (!ctx->Const.AllowGLSLRelaxedES &&
+             prog->IsES && !var->get_interface_type() &&
              existing->data.precision != var->data.precision) {
             if ((existing->data.used && var->data.used) || prog->data->Version >= 300) {
                linker_error(prog, "declarations for %s `%s` have "
@@ -1169,15 +1171,16 @@
  * Perform validation of uniforms used across multiple shader stages
  */
 static void
-cross_validate_uniforms(struct gl_shader_program *prog)
+cross_validate_uniforms(struct gl_context *ctx,
+                        struct gl_shader_program *prog)
 {
    glsl_symbol_table variables;
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
          continue;
 
-      cross_validate_globals(prog, prog->_LinkedShaders[i]->ir, &variables,
-                             true);
+      cross_validate_globals(ctx, prog, prog->_LinkedShaders[i]->ir,
+                             &variables, true);
    }
 }
 
@@ -1979,6 +1982,14 @@
       linked_shader->Program->info.fs.inner_coverage |= shader->InnerCoverage;
       linked_shader->Program->info.fs.post_depth_coverage |=
          shader->PostDepthCoverage;
+      linked_shader->Program->info.fs.pixel_interlock_ordered |=
+         shader->PixelInterlockOrdered;
+      linked_shader->Program->info.fs.pixel_interlock_unordered |=
+         shader->PixelInterlockUnordered;
+      linked_shader->Program->info.fs.sample_interlock_ordered |=
+         shader->SampleInterlockOrdered;
+      linked_shader->Program->info.fs.sample_interlock_unordered |=
+         shader->SampleInterlockUnordered;
 
       linked_shader->Program->sh.fs.BlendSupport |= shader->BlendSupport;
    }
@@ -2176,6 +2187,41 @@
    }
 }
 
+/**
+ * Clone into the linked shader every output variable that is declared by a
+ * shader object without a main() and is not yet in the linked symbol table.
+ */
+static void
+link_output_variables(struct gl_linked_shader *linked_shader,
+                      struct gl_shader **shader_list,
+                      unsigned num_shaders)
+{
+   struct glsl_symbol_table *symbols = linked_shader->symbols;
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+
+      /* Skip shader objects that have a main function */
+      if (shader_list[i]->symbols->get_function("main"))
+         continue;
+
+      foreach_in_list(ir_instruction, ir, shader_list[i]->ir) {
+         if (ir->ir_type != ir_type_variable)
+            continue;
+
+         ir_variable *var = (ir_variable *) ir;
+
+         if (var->data.mode == ir_var_shader_out &&
+               !symbols->get_variable(var->name)) {
+            var = var->clone(linked_shader, NULL);
+            symbols->add_variable(var);
+            linked_shader->ir->push_head(var);
+         }
+      }
+   }
+
+   return;
+}
+
 
 /**
  * Combine a group of shaders for a single stage to generate a linked shader
@@ -2203,7 +2249,8 @@
    for (unsigned i = 0; i < num_shaders; i++) {
       if (shader_list[i] == NULL)
          continue;
-      cross_validate_globals(prog, shader_list[i]->ir, &variables, false);
+      cross_validate_globals(ctx, prog, shader_list[i]->ir, &variables,
+                             false);
    }
 
    if (!prog->data->LinkStatus)
@@ -2340,6 +2387,9 @@
       return NULL;
    }
 
+   if (linked->Stage != MESA_SHADER_FRAGMENT)
+      link_output_variables(linked, shader_list, num_shaders);
+
    /* Make a pass over all variable declarations to ensure that arrays with
     * unspecified sizes have a size specified.  The size is inferred from the
     * max_array_access field.
@@ -3517,23 +3567,7 @@
       }
    }
 
-   struct empty_uniform_block *current_block = NULL;
-
-   for (unsigned i = 0; i < prog->NumUniformRemapTable; i++) {
-      /* We found empty space in UniformRemapTable. */
-      if (prog->UniformRemapTable[i] == NULL) {
-         /* We've found the beginning of a new continous block of empty slots */
-         if (!current_block || current_block->start + current_block->slots != i) {
-            current_block = rzalloc(prog, struct empty_uniform_block);
-            current_block->start = i;
-            exec_list_push_tail(&prog->EmptyUniformLocations,
-                                &current_block->link);
-         }
-
-         /* The current block continues, so we simply increment its slots */
-         current_block->slots++;
-      }
-   }
+   link_util_update_empty_uniform_locations(prog);
 
    delete uniform_map;
    prog->NumExplicitUniformLocations = entries_total;
@@ -3614,42 +3648,6 @@
    return false;
 }
 
-static bool
-add_program_resource(struct gl_shader_program *prog,
-                     struct set *resource_set,
-                     GLenum type, const void *data, uint8_t stages)
-{
-   assert(data);
-
-   /* If resource already exists, do not add it again. */
-   if (_mesa_set_search(resource_set, data))
-      return true;
-
-   prog->data->ProgramResourceList =
-      reralloc(prog->data,
-               prog->data->ProgramResourceList,
-               gl_program_resource,
-               prog->data->NumProgramResourceList + 1);
-
-   if (!prog->data->ProgramResourceList) {
-      linker_error(prog, "Out of memory during linking.\n");
-      return false;
-   }
-
-   struct gl_program_resource *res =
-      &prog->data->ProgramResourceList[prog->data->NumProgramResourceList];
-
-   res->Type = type;
-   res->Data = data;
-   res->StageReferences = stages;
-
-   prog->data->NumProgramResourceList++;
-
-   _mesa_set_add(resource_set, data);
-
-   return true;
-}
-
 /* Function checks if a variable var is a packed varying and
  * if given name is part of packed varying's list.
  *
@@ -3943,8 +3941,8 @@
       if (!sha_v)
          return false;
 
-      return add_program_resource(shProg, resource_set,
-                                  programInterface, sha_v, stage_mask);
+      return link_util_add_program_resource(shProg, resource_set,
+                                            programInterface, sha_v, stage_mask);
    }
    }
 }
@@ -4373,9 +4371,9 @@
       /* Add transform feedback varyings. */
       if (linked_xfb->NumVarying > 0) {
          for (int i = 0; i < linked_xfb->NumVarying; i++) {
-            if (!add_program_resource(shProg, resource_set,
-                                      GL_TRANSFORM_FEEDBACK_VARYING,
-                                      &linked_xfb->Varyings[i], 0))
+            if (!link_util_add_program_resource(shProg, resource_set,
+                                                GL_TRANSFORM_FEEDBACK_VARYING,
+                                                &linked_xfb->Varyings[i], 0))
             return;
          }
       }
@@ -4384,9 +4382,9 @@
       for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
          if ((linked_xfb->ActiveBuffers >> i) & 1) {
             linked_xfb->Buffers[i].Binding = i;
-            if (!add_program_resource(shProg, resource_set,
-                                      GL_TRANSFORM_FEEDBACK_BUFFER,
-                                      &linked_xfb->Buffers[i], 0))
+            if (!link_util_add_program_resource(shProg, resource_set,
+                                                GL_TRANSFORM_FEEDBACK_BUFFER,
+                                                &linked_xfb->Buffers[i], 0))
             return;
          }
       }
@@ -4422,29 +4420,29 @@
                                          &shProg->data->UniformStorage[i]);
       }
 
-      if (!add_program_resource(shProg, resource_set, type,
-                                &shProg->data->UniformStorage[i], stageref))
+      if (!link_util_add_program_resource(shProg, resource_set, type,
+                                          &shProg->data->UniformStorage[i], stageref))
          return;
    }
 
    /* Add program uniform blocks. */
    for (unsigned i = 0; i < shProg->data->NumUniformBlocks; i++) {
-      if (!add_program_resource(shProg, resource_set, GL_UNIFORM_BLOCK,
-          &shProg->data->UniformBlocks[i], 0))
+      if (!link_util_add_program_resource(shProg, resource_set, GL_UNIFORM_BLOCK,
+                                          &shProg->data->UniformBlocks[i], 0))
          return;
    }
 
    /* Add program shader storage blocks. */
    for (unsigned i = 0; i < shProg->data->NumShaderStorageBlocks; i++) {
-      if (!add_program_resource(shProg, resource_set, GL_SHADER_STORAGE_BLOCK,
-          &shProg->data->ShaderStorageBlocks[i], 0))
+      if (!link_util_add_program_resource(shProg, resource_set, GL_SHADER_STORAGE_BLOCK,
+                                          &shProg->data->ShaderStorageBlocks[i], 0))
          return;
    }
 
    /* Add atomic counter buffers. */
    for (unsigned i = 0; i < shProg->data->NumAtomicBuffers; i++) {
-      if (!add_program_resource(shProg, resource_set, GL_ATOMIC_COUNTER_BUFFER,
-                                &shProg->data->AtomicBuffers[i], 0))
+      if (!link_util_add_program_resource(shProg, resource_set, GL_ATOMIC_COUNTER_BUFFER,
+                                          &shProg->data->AtomicBuffers[i], 0))
          return;
    }
 
@@ -4460,8 +4458,8 @@
 
          type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j);
          /* add shader subroutines */
-         if (!add_program_resource(shProg, resource_set,
-                                   type, &shProg->data->UniformStorage[i], 0))
+         if (!link_util_add_program_resource(shProg, resource_set,
+                                             type, &shProg->data->UniformStorage[i], 0))
             return;
       }
    }
@@ -4473,8 +4471,8 @@
 
       GLuint type = _mesa_shader_stage_to_subroutine((gl_shader_stage)i);
       for (unsigned j = 0; j < p->sh.NumSubroutineFunctions; j++) {
-         if (!add_program_resource(shProg, resource_set,
-                                   type, &p->sh.SubroutineFunctions[j], 0))
+         if (!link_util_add_program_resource(shProg, resource_set,
+                                             type, &p->sh.SubroutineFunctions[j], 0))
             return;
       }
    }
@@ -4800,7 +4798,8 @@
       min_version = MIN2(min_version, prog->Shaders[i]->Version);
       max_version = MAX2(max_version, prog->Shaders[i]->Version);
 
-      if (prog->Shaders[i]->IsES != prog->Shaders[0]->IsES) {
+      if (!ctx->Const.AllowGLSLRelaxedES &&
+          prog->Shaders[i]->IsES != prog->Shaders[0]->IsES) {
          linker_error(prog, "all shaders must use same shading "
                       "language version\n");
          goto done;
@@ -4818,7 +4817,8 @@
    /* In desktop GLSL, different shader versions may be linked together.  In
     * GLSL ES, all shader versions must be the same.
     */
-   if (prog->Shaders[0]->IsES && min_version != max_version) {
+   if (!ctx->Const.AllowGLSLRelaxedES && prog->Shaders[0]->IsES &&
+       min_version != max_version) {
       linker_error(prog, "all shaders must use same shading "
                    "language version\n");
       goto done;
@@ -4944,7 +4944,7 @@
     * performed, then locations are assigned for uniforms, attributes, and
     * varyings.
     */
-   cross_validate_uniforms(prog);
+   cross_validate_uniforms(ctx, prog);
    if (!prog->data->LinkStatus)
       goto done;
 
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index 454b65a..f6fb003 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -25,6 +25,8 @@
 #ifndef GLSL_LINKER_H
 #define GLSL_LINKER_H
 
+#include "linker_util.h"
+
 struct gl_shader_program;
 struct gl_shader;
 struct gl_linked_shader;
@@ -192,23 +194,4 @@
                   const glsl_struct_field *named_ifc_member);
 };
 
-void
-linker_error(gl_shader_program *prog, const char *fmt, ...);
-
-void
-linker_warning(gl_shader_program *prog, const char *fmt, ...);
-
-/**
- * Sometimes there are empty slots left over in UniformRemapTable after we
- * allocate slots to explicit locations. This struct represents a single
- * continouous block of empty slots in UniformRemapTable.
- */
-struct empty_uniform_block {
-   struct exec_node link;
-   /* The start location of the block */
-   unsigned start;
-   /* The number of slots in the block */
-   unsigned slots;
-};
-
 #endif /* GLSL_LINKER_H */
diff --git a/src/compiler/glsl/linker_util.cpp b/src/compiler/glsl/linker_util.cpp
new file mode 100644
index 0000000..d2724c2
--- /dev/null
+++ b/src/compiler/glsl/linker_util.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+#include "main/mtypes.h"
+#include "linker_util.h"
+#include "util/set.h"
+#include "ir_uniform.h" /* for gl_uniform_storage */
+
+/* Utility methods shared between the GLSL IR and the NIR */
+
+bool
+link_util_add_program_resource(struct gl_shader_program *prog,
+                               struct set *resource_set,
+                               GLenum type, const void *data, uint8_t stages)
+{
+   assert(data);
+   /* Append (type, data, stages) to prog->data->ProgramResourceList unless
+    * resource_set already holds data; such a duplicate still returns true. */
+   if (_mesa_set_search(resource_set, data))
+      return true;
+   /* Grow the list by one element, using prog->data as the ralloc context. */
+   prog->data->ProgramResourceList =
+      reralloc(prog->data,
+               prog->data->ProgramResourceList,
+               gl_program_resource,
+               prog->data->NumProgramResourceList + 1);
+   /* NOTE(review): on failure the old list pointer is already overwritten. */
+   if (!prog->data->ProgramResourceList) {
+      linker_error(prog, "Out of memory during linking.\n");
+      return false;
+   }
+   /* The new entry is the last element, at index NumProgramResourceList. */
+   struct gl_program_resource *res =
+      &prog->data->ProgramResourceList[prog->data->NumProgramResourceList];
+
+   res->Type = type;
+   res->Data = data;
+   res->StageReferences = stages;
+
+   prog->data->NumProgramResourceList++;
+   /* Remember the pointer so that later calls treat it as a duplicate. */
+   _mesa_set_add(resource_set, data);
+
+   return true;
+}
+
+/**
+ * First-fit search of prog->EmptyUniformLocations for MAX2(1, array_elements)
+ * free slots; returns the first claimed location, or -1 if no block fits.
+ */
+int
+link_util_find_empty_block(struct gl_shader_program *prog,
+                           struct gl_uniform_storage *uniform)
+{
+   const unsigned entries = MAX2(1, uniform->array_elements);
+   /* At least one slot is requested, even when array_elements is 0. */
+   foreach_list_typed(struct empty_uniform_block, block, link,
+                      &prog->EmptyUniformLocations) {
+      /* Exact fit: the block is used up, so unlink it and free it. */
+      if (block->slots == entries) {
+         unsigned start = block->start;
+         exec_node_remove(&block->link);
+         ralloc_free(block);
+
+         return start;
+      /* Larger block: take its leading slots and shrink it from the front. */
+      } else if (block->slots > entries) {
+         unsigned start = block->start;
+         block->start += entries;
+         block->slots -= entries;
+
+         return start;
+      }
+   }
+
+   return -1; /* no block has enough slots */
+}
+
+void
+link_util_update_empty_uniform_locations(struct gl_shader_program *prog)
+{
+   struct empty_uniform_block *current_block = NULL;
+   /* Record every run of NULL UniformRemapTable entries as an empty block. */
+   for (unsigned i = 0; i < prog->NumUniformRemapTable; i++) {
+      /* We found empty space in UniformRemapTable. */
+      if (prog->UniformRemapTable[i] == NULL) {
+         /* We've found the beginning of a new contiguous block of empty slots */
+         if (!current_block || current_block->start + current_block->slots != i) {
+            current_block = rzalloc(prog, struct empty_uniform_block);
+            current_block->start = i;
+            exec_list_push_tail(&prog->EmptyUniformLocations,
+                                &current_block->link);
+         }
+
+         /* Slot i belongs to the current block (slots starts at 0: rzalloc) */
+         current_block->slots++;
+      }
+   }
+}
diff --git a/src/compiler/glsl/linker_util.h b/src/compiler/glsl/linker_util.h
new file mode 100644
index 0000000..1c3674f
--- /dev/null
+++ b/src/compiler/glsl/linker_util.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef GLSL_LINKER_UTIL_H
+#define GLSL_LINKER_UTIL_H
+
+struct gl_shader_program;
+struct gl_uniform_storage;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Sometimes there are empty slots left over in UniformRemapTable after we
+ * allocate slots to explicit locations. This struct represents a single
+ * contiguous block of empty slots in UniformRemapTable.
+ */
+struct empty_uniform_block {
+   struct exec_node link; /* list node; see prog->EmptyUniformLocations */
+   /* The start location of the block (a UniformRemapTable index) */
+   unsigned start;
+   /* The number of consecutive empty slots in the block */
+   unsigned slots;
+};
+
+void
+linker_error(struct gl_shader_program *prog, const char *fmt, ...);
+
+void
+linker_warning(struct gl_shader_program *prog, const char *fmt, ...);
+
+bool
+link_util_add_program_resource(struct gl_shader_program *prog,
+                               struct set *resource_set,
+                               GLenum type, const void *data, uint8_t stages);
+
+int
+link_util_find_empty_block(struct gl_shader_program *prog,
+                           struct gl_uniform_storage *uniform);
+
+void
+link_util_update_empty_uniform_locations(struct gl_shader_program *prog);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GLSL_LINKER_UTIL_H */
diff --git a/src/compiler/glsl/list.h b/src/compiler/glsl/list.h
index 2bfa273..59ed766 100644
--- a/src/compiler/glsl/list.h
+++ b/src/compiler/glsl/list.h
@@ -672,7 +672,7 @@
         __node = __prev, __prev = (__type *)__prev->prev)
 
 #define foreach_in_list_use_after(__type, __inst, __list) \
-   __type *(__inst);                                      \
+   __type *__inst;                                        \
    for ((__inst) = (__type *)(__list)->head_sentinel.next; \
         !(__inst)->is_tail_sentinel();                    \
         (__inst) = (__type *)(__inst)->next)
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index 5c1eed7..5bd6c92 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -729,12 +729,17 @@
       unpacked_var->insert_before(packed_var);
       this->packed_varyings[slot] = packed_var;
    } else {
+      ir_variable *var = this->packed_varyings[slot];
+
+      /* The slot needs to be marked as always active if any variable that got
+       * packed there was.
+       */
+      var->data.always_active_io |= unpacked_var->data.always_active_io;
+
       /* For geometry shader inputs, only update the packed variable name the
        * first time we visit each component.
        */
       if (this->gs_input_vertices == 0 || vertex_index == 0) {
-         ir_variable *var = this->packed_varyings[slot];
-
          if (var->is_name_ralloced())
             ralloc_asprintf_append((char **) &var->name, ",%s", name);
          else
diff --git a/src/compiler/glsl/lower_vector_derefs.cpp b/src/compiler/glsl/lower_vector_derefs.cpp
index 7583d1f..6cd9a2d 100644
--- a/src/compiler/glsl/lower_vector_derefs.cpp
+++ b/src/compiler/glsl/lower_vector_derefs.cpp
@@ -59,8 +59,7 @@
    if (!deref->array->type->is_vector())
       return ir_rvalue_enter_visitor::visit_enter(ir);
 
-   ir_dereference *const new_lhs = (ir_dereference *) deref->array;
-   ir->set_lhs(new_lhs);
+   ir_rvalue *const new_lhs = deref->array;
 
    void *mem_ctx = ralloc_parent(ir);
    ir_constant *old_index_constant =
@@ -72,8 +71,16 @@
                                            ir->rhs,
                                            deref->array_index);
       ir->write_mask = (1 << new_lhs->type->vector_elements) - 1;
+      ir->set_lhs(new_lhs);
+   } else if (new_lhs->ir_type != ir_type_swizzle) {
+      ir->set_lhs(new_lhs);
+      ir->write_mask = 1 << old_index_constant->get_uint_component(0);
    } else {
-      ir->write_mask = 1 << old_index_constant->get_int_component(0);
+      /* If the "new" LHS is a swizzle, use the set_lhs helper to instead
+       * swizzle the RHS.
+       */
+      unsigned component[1] = { old_index_constant->get_uint_component(0) };
+      ir->set_lhs(new(mem_ctx) ir_swizzle(new_lhs, component, 1));
    }
 
    return ir_rvalue_enter_visitor::visit_enter(ir);
diff --git a/src/compiler/glsl/meson.build b/src/compiler/glsl/meson.build
index 26ab4f1..df1c086 100644
--- a/src/compiler/glsl/meson.build
+++ b/src/compiler/glsl/meson.build
@@ -66,6 +66,16 @@
   'builtin_types.cpp',
   'builtin_variables.cpp',
   'generate_ir.cpp',
+  'gl_nir_lower_atomics.c',
+  'gl_nir_lower_samplers.c',
+  'gl_nir_lower_samplers_as_deref.c',
+  'gl_nir_link_atomics.c',
+  'gl_nir_link_uniform_initializers.c',
+  'gl_nir_link_uniforms.c',
+  'gl_nir_link_xfb.c',
+  'gl_nir_linker.c',
+  'gl_nir_linker.h',
+  'gl_nir.h',
   'glsl_parser_extras.cpp',
   'glsl_parser_extras.h',
   'glsl_symbol_table.cpp',
@@ -108,6 +118,8 @@
   'ir_visitor.h',
   'linker.cpp',
   'linker.h',
+  'linker_util.h',
+  'linker_util.cpp',
   'link_atomics.cpp',
   'link_functions.cpp',
   'link_interface_blocks.cpp',
@@ -159,7 +171,6 @@
   'opt_constant_folding.cpp',
   'opt_constant_propagation.cpp',
   'opt_constant_variable.cpp',
-  'opt_copy_propagation.cpp',
   'opt_copy_propagation_elements.cpp',
   'opt_dead_builtin_variables.cpp',
   'opt_dead_builtin_varyings.cpp',
diff --git a/src/compiler/glsl/opt_constant_propagation.cpp b/src/compiler/glsl/opt_constant_propagation.cpp
index 05dc71e..a603c9f 100644
--- a/src/compiler/glsl/opt_constant_propagation.cpp
+++ b/src/compiler/glsl/opt_constant_propagation.cpp
@@ -77,23 +77,6 @@
 };
 
 
-class kill_entry : public exec_node
-{
-public:
-   /* override operator new from exec_node */
-   DECLARE_LINEAR_ZALLOC_CXX_OPERATORS(kill_entry)
-
-   kill_entry(ir_variable *var, unsigned write_mask)
-   {
-      assert(var);
-      this->var = var;
-      this->write_mask = write_mask;
-   }
-
-   ir_variable *var;
-   unsigned write_mask;
-};
-
 class ir_constant_propagation_visitor : public ir_rvalue_visitor {
 public:
    ir_constant_propagation_visitor()
@@ -122,15 +105,15 @@
    void constant_folding(ir_rvalue **rvalue);
    void constant_propagation(ir_rvalue **rvalue);
    void kill(ir_variable *ir, unsigned write_mask);
-   void handle_if_block(exec_list *instructions);
+   void handle_if_block(exec_list *instructions, hash_table *kills, bool *killed_all);
+   void handle_loop(class ir_loop *, bool keep_acp);
    void handle_rvalue(ir_rvalue **rvalue);
 
    /** List of acp_entry: The available constants to propagate */
    exec_list *acp;
 
    /**
-    * Hash table of kill_entry: The masks of variables whose values were
-    * killed in this block.
+    * Hash table of killed entries: maps variables to the mask of killed channels.
     */
    hash_table *kills;
 
@@ -356,15 +339,14 @@
 }
 
 void
-ir_constant_propagation_visitor::handle_if_block(exec_list *instructions)
+ir_constant_propagation_visitor::handle_if_block(exec_list *instructions, hash_table *kills, bool *killed_all)
 {
    exec_list *orig_acp = this->acp;
    hash_table *orig_kills = this->kills;
    bool orig_killed_all = this->killed_all;
 
    this->acp = new(mem_ctx) exec_list;
-   this->kills = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
-                                         _mesa_key_pointer_equal);
+   this->kills = kills;
    this->killed_all = false;
 
    /* Populate the initial acp with a constant of the original */
@@ -374,20 +356,10 @@
 
    visit_list_elements(this, instructions);
 
-   if (this->killed_all) {
-      orig_acp->make_empty();
-   }
-
-   hash_table *new_kills = this->kills;
+   *killed_all = this->killed_all;
    this->kills = orig_kills;
    this->acp = orig_acp;
-   this->killed_all = this->killed_all || orig_killed_all;
-
-   hash_entry *htk;
-   hash_table_foreach(new_kills, htk) {
-      kill_entry *k = (kill_entry *) htk->data;
-      kill(k->var, k->write_mask);
-   }
+   this->killed_all = orig_killed_all;
 }
 
 ir_visitor_status
@@ -396,29 +368,47 @@
    ir->condition->accept(this);
    handle_rvalue(&ir->condition);
 
-   handle_if_block(&ir->then_instructions);
-   handle_if_block(&ir->else_instructions);
+   hash_table *new_kills = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
+                                                   _mesa_key_pointer_equal);
+   bool then_killed_all = false;
+   bool else_killed_all = false;
+
+   handle_if_block(&ir->then_instructions, new_kills, &then_killed_all);
+   handle_if_block(&ir->else_instructions, new_kills, &else_killed_all);
+
+   if (then_killed_all || else_killed_all) {
+      acp->make_empty();
+      killed_all = true;
+   } else {
+      hash_entry *htk;
+      hash_table_foreach(new_kills, htk)
+         kill((ir_variable *) htk->key, (uintptr_t) htk->data);
+   }
+
+   _mesa_hash_table_destroy(new_kills, NULL);
 
    /* handle_if_block() already descended into the children. */
    return visit_continue_with_parent;
 }
 
-ir_visitor_status
-ir_constant_propagation_visitor::visit_enter(ir_loop *ir)
+void
+ir_constant_propagation_visitor::handle_loop(ir_loop *ir, bool keep_acp)
 {
    exec_list *orig_acp = this->acp;
    hash_table *orig_kills = this->kills;
    bool orig_killed_all = this->killed_all;
 
-   /* FINISHME: For now, the initial acp for loops is totally empty.
-    * We could go through once, then go through again with the acp
-    * cloned minus the killed entries after the first run through.
-    */
    this->acp = new(mem_ctx) exec_list;
    this->kills = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
                                          _mesa_key_pointer_equal);
    this->killed_all = false;
 
+   if (keep_acp) {
+      foreach_in_list(acp_entry, a, orig_acp) {
+         this->acp->push_tail(new(this->lin_ctx) acp_entry(a));
+      }
+   }
+
    visit_list_elements(this, &ir->body_instructions);
 
    if (this->killed_all) {
@@ -432,9 +422,22 @@
 
    hash_entry *htk;
    hash_table_foreach(new_kills, htk) {
-      kill_entry *k = (kill_entry *) htk->data;
-      kill(k->var, k->write_mask);
+      kill((ir_variable *) htk->key, (uintptr_t) htk->data);
    }
+}
+
+ir_visitor_status
+ir_constant_propagation_visitor::visit_enter(ir_loop *ir)
+{
+   /* Make a conservative first pass over the loop with an empty ACP set.
+    * This also removes any killed entries from the original ACP set.
+    */
+   handle_loop(ir, false);
+
+   /* Then, run it again with the real ACP set, minus any killed entries.
+    * This takes care of propagating values from before the loop into it.
+    */
+   handle_loop(ir, true);
 
    /* already descended into the children. */
    return visit_continue_with_parent;
@@ -463,13 +466,12 @@
     */
    hash_entry *kill_hash_entry = _mesa_hash_table_search(this->kills, var);
    if (kill_hash_entry) {
-      kill_entry *entry = (kill_entry *) kill_hash_entry->data;
-      entry->write_mask |= write_mask;
+      uintptr_t new_write_mask = ((uintptr_t) kill_hash_entry->data) | write_mask;
+      kill_hash_entry->data = (void *) new_write_mask;
       return;
    }
    /* Not already in the hash table.  Make new entry. */
-   _mesa_hash_table_insert(this->kills, var,
-                           new(this->lin_ctx) kill_entry(var, write_mask));
+   _mesa_hash_table_insert(this->kills, var, (void *) uintptr_t(write_mask));
 }
 
 /**
diff --git a/src/compiler/glsl/opt_copy_propagation_elements.cpp b/src/compiler/glsl/opt_copy_propagation_elements.cpp
index 8975e72..cd55c54 100644
--- a/src/compiler/glsl/opt_copy_propagation_elements.cpp
+++ b/src/compiler/glsl/opt_copy_propagation_elements.cpp
@@ -27,18 +27,9 @@
  * Replaces usage of recently-copied components of variables with the
  * previous copy of the variable.
  *
- * This pass can be compared with opt_copy_propagation, which operands
- * on arbitrary whole-variable copies.  However, in order to handle
- * the copy propagation of swizzled variables or writemasked writes,
- * we want to track things on a channel-wise basis.  I found that
- * trying to mix the swizzled/writemasked support here with the
- * whole-variable stuff in opt_copy_propagation.cpp just made a mess,
- * so this is separate despite the ACP handling being somewhat
- * similar.
- *
  * This should reduce the number of MOV instructions in the generated
- * programs unless copy propagation is also done on the LIR, and may
- * help anyway by triggering other optimizations that live in the HIR.
+ * programs and help trigger other optimizations that live at the GLSL IR
+ * level.
  */
 
 #include "ir.h"
@@ -47,48 +38,224 @@
 #include "ir_optimization.h"
 #include "compiler/glsl_types.h"
 #include "util/hash_table.h"
+#include "util/set.h"
 
 static bool debug = false;
 
 namespace {
 
-class acp_entry;
-
-/* Class that refers to acp_entry in another exec_list. Used
- * when making removals based on rhs.
- */
-class acp_ref : public exec_node
+class acp_entry
 {
 public:
-   acp_ref(acp_entry *e)
-   {
-      entry = e;
-   }
-   acp_entry *entry;
-};
-
-class acp_entry : public exec_node
-{
-public:
-   /* override operator new from exec_node */
    DECLARE_LINEAR_ZALLOC_CXX_OPERATORS(acp_entry)
 
-   acp_entry(ir_variable *lhs, ir_variable *rhs, int write_mask, int swizzle[4])
-      : rhs_node(this)
-   {
-      this->lhs = lhs;
-      this->rhs = rhs;
-      this->write_mask = write_mask;
-      memcpy(this->swizzle, swizzle, sizeof(this->swizzle));
-   }
+   /* If set, rhs_full indicates that this ACP entry represents a
+    * whole-variable copy.  The rhs_element[] array will still be filled,
+    * to allow the swizzling from its components in case the variable
+    * was a vector (and to simplify some of the erase() and write_vector()
+    * logic).
+    */
 
-   ir_variable *lhs;
-   ir_variable *rhs;
-   unsigned int write_mask;
-   int swizzle[4];
-   acp_ref rhs_node;
+   ir_variable *rhs_full;
+   ir_variable *rhs_element[4];
+   unsigned rhs_channel[4];
+
+   /* Set of variables that use the variable associated with this acp_entry as
+    * RHS.  This has the "reverse references" of rhs_full/rhs_element.  It is
+    * used to speed up invalidating those references when the acp_entry
+    * changes.
+    */
+   set *dsts;
 };
 
+class copy_propagation_state {
+public:
+   DECLARE_RZALLOC_CXX_OPERATORS(copy_propagation_state);
+   /** Make a root state: no fallback, allocated under mem_ctx. */
+   static
+   copy_propagation_state* create(void *mem_ctx)
+   {
+      return new (mem_ctx) copy_propagation_state(NULL);
+   }
+   /** Make a child state that falls back to this one on lookups. */
+   copy_propagation_state* clone()
+   {
+      return new (ralloc_parent(this)) copy_propagation_state(this);
+   }
+   /** Forget every copy, including those visible through fallback. */
+   void erase_all()
+   {
+      /* Individual elements were allocated from a linear allocator, so will
+       * be destroyed when the state is destroyed.
+       */
+      _mesa_hash_table_clear(acp, NULL);
+      fallback = NULL;
+   }
+   /** Kill copies into var's masked channels and any copy read from var. */
+   void erase(ir_variable *var, unsigned write_mask)
+   {
+      acp_entry *entry = pull_acp(var);
+      entry->rhs_full = NULL;
+
+      for (int i = 0; i < 4; i++) {
+         if (!entry->rhs_element[i])
+            continue;
+         if ((write_mask & (1 << i)) == 0)
+            continue;
+
+         ir_variable *to_remove = entry->rhs_element[i];
+         entry->rhs_element[i] = NULL;
+         remove_unused_var_from_dsts(entry, var, to_remove);
+      }
+
+      /* TODO: Check write mask, and possibly not clear everything. */
+
+      /* For any usage of our variable on the RHS, clear it out. */
+      struct set_entry *set_entry;
+      set_foreach(entry->dsts, set_entry) {
+         ir_variable *dst_var = (ir_variable *)set_entry->key;
+         acp_entry *dst_entry = pull_acp(dst_var);
+         for (int i = 0; i < 4; i++) {
+            if (dst_entry->rhs_element[i] == var)
+               dst_entry->rhs_element[i] = NULL;
+         }
+         if (dst_entry->rhs_full == var)
+            dst_entry->rhs_full = NULL;
+         _mesa_set_remove(entry->dsts, set_entry);
+      }
+   }
+   /** Look var up here, then along the fallback chain; NULL if unknown. */
+   acp_entry *read(ir_variable *var)
+   {
+      for (copy_propagation_state *s = this; s != NULL; s = s->fallback) {
+         hash_entry *ht_entry = _mesa_hash_table_search(s->acp, var);
+         if (ht_entry)
+            return (acp_entry *) ht_entry->data;
+      }
+      return NULL;
+   }
+   /** Record that each write_mask channel i of lhs copies rhs.swizzle[i]. */
+   void write_elements(ir_variable *lhs, ir_variable *rhs, unsigned write_mask, int swizzle[4])
+   {
+      acp_entry *lhs_entry = pull_acp(lhs);
+      lhs_entry->rhs_full = NULL;
+
+      for (int i = 0; i < 4; i++) {
+         if ((write_mask & (1 << i)) == 0)
+            continue;
+         ir_variable *to_remove = lhs_entry->rhs_element[i];
+         lhs_entry->rhs_element[i] = rhs;
+         lhs_entry->rhs_channel[i] = swizzle[i];
+
+         remove_unused_var_from_dsts(lhs_entry, lhs, to_remove);
+      }
+
+      acp_entry *rhs_entry = pull_acp(rhs);
+      _mesa_set_add(rhs_entry->dsts, lhs);
+   }
+   /** Record a whole-variable copy of rhs into lhs. */
+   void write_full(ir_variable *lhs, ir_variable *rhs)
+   {
+      acp_entry *lhs_entry = pull_acp(lhs);
+      if (lhs_entry->rhs_full == rhs)
+         return;
+
+      if (lhs_entry->rhs_full) {
+         remove_from_dsts(lhs_entry->rhs_full, lhs);
+      } else if (lhs->type->is_vector()) {
+         for (int i = 0; i < 4; i++) {
+            if (lhs_entry->rhs_element[i])
+               remove_from_dsts(lhs_entry->rhs_element[i], lhs);
+         }
+      }
+
+      lhs_entry->rhs_full = rhs;
+      acp_entry *rhs_entry = pull_acp(rhs);
+      _mesa_set_add(rhs_entry->dsts, lhs);
+
+      if (lhs->type->is_vector()) {
+         for (int i = 0; i < 4; i++) {
+            lhs_entry->rhs_element[i] = rhs;
+            lhs_entry->rhs_channel[i] = i;
+         }
+      }
+   }
+   /** Drop lhs from var's dsts once no channel of lhs copies from var. */
+   void remove_unused_var_from_dsts(acp_entry *lhs_entry, ir_variable *lhs, ir_variable *var)
+   {
+      if (!var)
+         return;
+
+      /* If lhs still uses var, don't remove anything. */
+      for (int j = 0; j < 4; j++) {
+         if (lhs_entry->rhs_element[j] == var)
+            return;
+      }
+
+      acp_entry *element = pull_acp(var);
+      assert(element);
+      _mesa_set_remove_key(element->dsts, lhs);
+   }
+
+private:
+   explicit copy_propagation_state(copy_propagation_state *fallback)
+   {
+      this->fallback = fallback;
+      /* Use 'this' as context for the table, no explicit destruction
+       * needed later.
+       */
+      acp = _mesa_hash_table_create(this, _mesa_hash_pointer,
+                                    _mesa_key_pointer_equal);
+      lin_ctx = linear_alloc_parent(this, 0);
+   }
+   /** Get var's entry in this state, creating it (seeded from fallback). */
+   acp_entry *pull_acp(ir_variable *var)
+   {
+      hash_entry *ht_entry = _mesa_hash_table_search(acp, var);
+      if (ht_entry)
+         return (acp_entry *) ht_entry->data;
+
+      /* If not found, create one and copy data from fallback if available. */
+      acp_entry *entry = new(lin_ctx) acp_entry();
+      _mesa_hash_table_insert(acp, var, entry);
+
+      bool found = false;
+      for (copy_propagation_state *s = fallback; s != NULL; s = s->fallback) {
+         hash_entry *fallback_ht_entry = _mesa_hash_table_search(s->acp, var);
+         if (fallback_ht_entry) {
+            acp_entry *fallback_entry = (acp_entry *) fallback_ht_entry->data;
+            *entry = *fallback_entry;
+            entry->dsts = _mesa_set_clone(fallback_entry->dsts, this);
+            found = true;
+            break;
+         }
+      }
+
+      if (!found) {
+         entry->dsts = _mesa_set_create(this, _mesa_hash_pointer,
+                                        _mesa_key_pointer_equal);
+      }
+
+      return entry;
+   }
+   /** Remove to_remove from the dsts set of var's entry. */
+   void
+   remove_from_dsts(ir_variable *var, ir_variable *to_remove)
+   {
+      acp_entry *entry = pull_acp(var);
+      assert(entry);
+      _mesa_set_remove_key(entry->dsts, to_remove);
+   }
+
+   /** Available Copy to Propagate table, from variable to the entry
+    *  containing the current sources that can be used. */
+   hash_table *acp;
+
+   /** When a state is cloned, entries are copied on demand from fallback. */
+   copy_propagation_state *fallback;
+
+   void *lin_ctx;
+};
 
 class kill_entry : public exec_node
 {
@@ -116,33 +283,14 @@
       this->lin_ctx = linear_alloc_parent(this->mem_ctx, 0);
       this->shader_mem_ctx = NULL;
       this->kills = new(mem_ctx) exec_list;
-
-      create_acp();
+      this->state = copy_propagation_state::create(mem_ctx);
    }
    ~ir_copy_propagation_elements_visitor()
    {
       ralloc_free(mem_ctx);
    }
 
-   void clone_acp(hash_table *lhs, hash_table *rhs)
-   {
-      lhs_ht = _mesa_hash_table_clone(lhs, mem_ctx);
-      rhs_ht = _mesa_hash_table_clone(rhs, mem_ctx);
-   }
-
-   void create_acp()
-   {
-      lhs_ht = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
-                                       _mesa_key_pointer_equal);
-      rhs_ht = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
-                                       _mesa_key_pointer_equal);
-   }
-
-   void destroy_acp()
-   {
-      _mesa_hash_table_destroy(lhs_ht, NULL);
-      _mesa_hash_table_destroy(rhs_ht, NULL);
-   }
+   virtual ir_visitor_status visit(ir_dereference_variable *);
 
    void handle_loop(ir_loop *, bool keep_acp);
    virtual ir_visitor_status visit_enter(class ir_loop *);
@@ -156,11 +304,9 @@
 
    void add_copy(ir_assignment *ir);
    void kill(kill_entry *k);
-   void handle_if_block(exec_list *instructions);
+   void handle_if_block(exec_list *instructions, exec_list *kills, bool *killed_all);
 
-   /** Hash of acp_entry: The available copies to propagate */
-   hash_table *lhs_ht;
-   hash_table *rhs_ht;
+   copy_propagation_state *state;
 
    /**
     * List of kill_entry: The variables whose values were killed in this
@@ -182,6 +328,21 @@
 } /* unnamed namespace */
 
 ir_visitor_status
+ir_copy_propagation_elements_visitor::visit(ir_dereference_variable *ir)
+{
+   /* Dereferences seen while in_assignee is set are left alone. */
+   if (!this->in_assignee) {
+      const acp_entry *copy = state->read(ir->var);
+      if (copy != NULL && copy->rhs_full != NULL) {
+         /* A whole-variable source is recorded: reference it instead. */
+         ir->var = (ir_variable *) copy->rhs_full;
+         progress = true;
+      }
+   }
+   return visit_continue;
+}
+
+ir_visitor_status
 ir_copy_propagation_elements_visitor::visit_enter(ir_function_signature *ir)
 {
    /* Treat entry into a function signature as a completely separate
@@ -191,26 +352,21 @@
    exec_list *orig_kills = this->kills;
    bool orig_killed_all = this->killed_all;
 
-   hash_table *orig_lhs_ht = lhs_ht;
-   hash_table *orig_rhs_ht = rhs_ht;
-
    this->kills = new(mem_ctx) exec_list;
    this->killed_all = false;
 
-   create_acp();
+   copy_propagation_state *orig_state = state;
+   this->state = copy_propagation_state::create(mem_ctx);
 
    visit_list_elements(this, &ir->body);
 
+   delete this->state;
+   this->state = orig_state;
+
    ralloc_free(this->kills);
-
-   destroy_acp();
-
    this->kills = orig_kills;
    this->killed_all = orig_killed_all;
 
-   lhs_ht = orig_lhs_ht;
-   rhs_ht = orig_rhs_ht;
-
    return visit_continue_with_parent;
 }
 
@@ -220,16 +376,14 @@
    ir_dereference_variable *lhs = ir->lhs->as_dereference_variable();
    ir_variable *var = ir->lhs->variable_referenced();
 
-   if (var->type->is_scalar() || var->type->is_vector()) {
-      kill_entry *k;
+   kill_entry *k;
 
-      if (lhs)
-	 k = new(this->lin_ctx) kill_entry(var, ir->write_mask);
-      else
-	 k = new(this->lin_ctx) kill_entry(var, ~0);
+   if (lhs && var->type->is_vector())
+      k = new(this->lin_ctx) kill_entry(var, ir->write_mask);
+   else
+      k = new(this->lin_ctx) kill_entry(var, ~0);
 
-      kill(k);
-   }
+   kill(k);
 
    add_copy(ir);
 
@@ -296,19 +450,18 @@
    /* Try to find ACP entries covering swizzle_chan[], hoping they're
     * the same source variable.
     */
-   hash_entry *ht_entry = _mesa_hash_table_search(lhs_ht, var);
-   if (ht_entry) {
-      exec_list *ht_list = (exec_list *) ht_entry->data;
-      foreach_in_list(acp_entry, entry, ht_list) {
-         for (int c = 0; c < chans; c++) {
-            if (entry->write_mask & (1 << swizzle_chan[c])) {
-               source[c] = entry->rhs;
-               source_chan[c] = entry->swizzle[swizzle_chan[c]];
 
-               if (source_chan[c] != swizzle_chan[c])
-                  noop_swizzle = false;
-            }
-         }
+   const acp_entry *entry = state->read(var);
+   if (entry) {
+      for (int c = 0; c < chans; c++) {
+         unsigned index = swizzle_chan[c];
+         ir_variable *src = entry->rhs_element[index];
+         if (!src)
+            continue;
+         source[c] = src;
+         source_chan[c] = entry->rhs_channel[index];
+         if (source_chan[c] != swizzle_chan[c])
+            noop_swizzle = false;
       }
    }
 
@@ -365,56 +518,50 @@
       }
    }
 
-   /* Since we're unlinked, we don't (necessarily) know the side effects of
-    * this call.  So kill all copies.
-    */
-   _mesa_hash_table_clear(lhs_ht, NULL);
-   _mesa_hash_table_clear(rhs_ht, NULL);
+   if (!ir->callee->is_intrinsic()) {
+      state->erase_all();
+      this->killed_all = true;
+   } else {
+      if (ir->return_deref) {
+         kill(new(this->lin_ctx) kill_entry(ir->return_deref->var, ~0));
+      }
 
-   this->killed_all = true;
+      foreach_two_lists(formal_node, &ir->callee->parameters,
+                        actual_node, &ir->actual_parameters) {
+         ir_variable *sig_param = (ir_variable *) formal_node;
+         if (sig_param->data.mode == ir_var_function_out ||
+             sig_param->data.mode == ir_var_function_inout) {
+            /* Out/inout actual: kill it.  'arg' avoids shadowing 'ir'. */
+            ir_rvalue *arg = (ir_rvalue *) actual_node;
+            kill(new(lin_ctx) kill_entry(arg->variable_referenced(), ~0));
+         }
+      }
+   }
 
    return visit_continue_with_parent;
 }
 
 void
-ir_copy_propagation_elements_visitor::handle_if_block(exec_list *instructions)
+ir_copy_propagation_elements_visitor::handle_if_block(exec_list *instructions, exec_list *kills, bool *killed_all)
 {
    exec_list *orig_kills = this->kills;
    bool orig_killed_all = this->killed_all;
 
-   hash_table *orig_lhs_ht = lhs_ht;
-   hash_table *orig_rhs_ht = rhs_ht;
-
-   this->kills = new(mem_ctx) exec_list;
+   this->kills = kills;
    this->killed_all = false;
 
    /* Populate the initial acp with a copy of the original */
-   clone_acp(orig_lhs_ht, orig_rhs_ht);
+   copy_propagation_state *orig_state = state;
+   this->state = orig_state->clone();
 
    visit_list_elements(this, instructions);
 
-   if (this->killed_all) {
-      _mesa_hash_table_clear(orig_lhs_ht, NULL);
-      _mesa_hash_table_clear(orig_rhs_ht, NULL);
-   }
+   delete this->state;
+   this->state = orig_state;
 
-   exec_list *new_kills = this->kills;
+   *killed_all = this->killed_all;
    this->kills = orig_kills;
-   this->killed_all = this->killed_all || orig_killed_all;
-
-   destroy_acp();
-
-   lhs_ht = orig_lhs_ht;
-   rhs_ht = orig_rhs_ht;
-
-   /* Move the new kills into the parent block's list, removing them
-    * from the parent's ACP list in the process.
-    */
-   foreach_in_list_safe(kill_entry, k, new_kills) {
-      kill(k);
-   }
-
-   ralloc_free(new_kills);
+   this->killed_all = orig_killed_all;
 }
 
 ir_visitor_status
@@ -422,8 +569,22 @@
 {
    ir->condition->accept(this);
 
-   handle_if_block(&ir->then_instructions);
-   handle_if_block(&ir->else_instructions);
+   exec_list *new_kills = new(mem_ctx) exec_list;
+   bool then_killed_all = false;
+   bool else_killed_all = false;
+
+   handle_if_block(&ir->then_instructions, new_kills, &then_killed_all);
+   handle_if_block(&ir->else_instructions, new_kills, &else_killed_all);
+
+   if (then_killed_all || else_killed_all) {
+      state->erase_all();
+      killed_all = true;
+   } else {
+      foreach_in_list_safe(kill_entry, k, new_kills)
+         kill(k);
+   }
+
+   ralloc_free(new_kills);
 
    /* handle_if_block() already descended into the children. */
    return visit_continue_with_parent;
@@ -435,39 +596,30 @@
    exec_list *orig_kills = this->kills;
    bool orig_killed_all = this->killed_all;
 
-   hash_table *orig_lhs_ht = lhs_ht;
-   hash_table *orig_rhs_ht = rhs_ht;
-
-   /* FINISHME: For now, the initial acp for loops is totally empty.
-    * We could go through once, then go through again with the acp
-    * cloned minus the killed entries after the first run through.
-    */
    this->kills = new(mem_ctx) exec_list;
    this->killed_all = false;
 
+   copy_propagation_state *orig_state = state;
+
    if (keep_acp) {
       /* Populate the initial acp with a copy of the original */
-      clone_acp(orig_lhs_ht, orig_rhs_ht);
+      this->state = orig_state->clone();
    } else {
-      create_acp();
+      this->state = copy_propagation_state::create(mem_ctx);
    }
 
    visit_list_elements(this, &ir->body_instructions);
 
-   if (this->killed_all) {
-      _mesa_hash_table_clear(orig_lhs_ht, NULL);
-      _mesa_hash_table_clear(orig_rhs_ht, NULL);
-   }
+   delete this->state;
+   this->state = orig_state;
+
+   if (this->killed_all)
+      this->state->erase_all();
 
    exec_list *new_kills = this->kills;
    this->kills = orig_kills;
    this->killed_all = this->killed_all || orig_killed_all;
 
-   destroy_acp();
-
-   lhs_ht = orig_lhs_ht;
-   rhs_ht = orig_rhs_ht;
-
    foreach_in_list_safe(kill_entry, k, new_kills) {
       kill(k);
    }
@@ -489,35 +641,7 @@
 void
 ir_copy_propagation_elements_visitor::kill(kill_entry *k)
 {
-   /* removal of lhs entries */
-   hash_entry *ht_entry = _mesa_hash_table_search(lhs_ht, k->var);
-   if (ht_entry) {
-      exec_list *lhs_list = (exec_list *) ht_entry->data;
-      foreach_in_list_safe(acp_entry, entry, lhs_list) {
-         entry->write_mask = entry->write_mask & ~k->write_mask;
-         if (entry->write_mask == 0) {
-            entry->remove();
-            continue;
-         }
-      }
-   }
-
-   /* removal of rhs entries */
-   ht_entry = _mesa_hash_table_search(rhs_ht, k->var);
-   if (ht_entry) {
-      exec_list *rhs_list = (exec_list *) ht_entry->data;
-      acp_ref *ref;
-
-      while ((ref = (acp_ref *) rhs_list->pop_head()) != NULL) {
-         acp_entry *entry = ref->entry;
-
-         /* If entry is still in a list (not already removed by lhs entry
-          * removal above), remove it.
-          */
-         if (entry->prev || entry->next)
-            entry->remove();
-      }
-   }
+   state->erase(k->var, k->write_mask);
 
    /* If we were on a list, remove ourselves before inserting */
    if (k->next)
@@ -533,13 +657,29 @@
 void
 ir_copy_propagation_elements_visitor::add_copy(ir_assignment *ir)
 {
-   acp_entry *entry;
-   int orig_swizzle[4] = {0, 1, 2, 3};
-   int swizzle[4];
-
    if (ir->condition)
       return;
 
+   {
+      ir_variable *lhs_var = ir->whole_variable_written();
+      ir_dereference_variable *rhs = ir->rhs->as_dereference_variable();
+
+      if (lhs_var != NULL && rhs && rhs->var != NULL && lhs_var != rhs->var) {
+         if (lhs_var->data.mode == ir_var_shader_storage ||
+             lhs_var->data.mode == ir_var_shader_shared ||
+             rhs->var->data.mode == ir_var_shader_storage ||
+             rhs->var->data.mode == ir_var_shader_shared ||
+             lhs_var->data.precise != rhs->var->data.precise) {
+            return;
+         }
+         state->write_full(lhs_var, rhs->var);
+         return;
+      }
+   }
+
+   int orig_swizzle[4] = {0, 1, 2, 3};
+   int swizzle[4];
+
    ir_dereference_variable *lhs = ir->lhs->as_dereference_variable();
    if (!lhs || !(lhs->type->is_scalar() || lhs->type->is_vector()))
       return;
@@ -594,30 +734,7 @@
    if (lhs->var->data.precise != rhs->var->data.precise)
       return;
 
-   entry = new(this->lin_ctx) acp_entry(lhs->var, rhs->var, write_mask,
-					swizzle);
-
-   /* lhs hash, hash of lhs -> acp_entry lists */
-   hash_entry *ht_entry = _mesa_hash_table_search(lhs_ht, lhs->var);
-   if (ht_entry) {
-      exec_list *lhs_list = (exec_list *) ht_entry->data;
-      lhs_list->push_tail(entry);
-   } else {
-      exec_list *lhs_list = new(mem_ctx) exec_list;
-      lhs_list->push_tail(entry);
-      _mesa_hash_table_insert(lhs_ht, lhs->var, lhs_list);
-   }
-
-   /* rhs hash, hash of rhs -> acp_entry pointers to lhs lists */
-   ht_entry = _mesa_hash_table_search(rhs_ht, rhs->var);
-   if (ht_entry) {
-      exec_list *rhs_list = (exec_list *) ht_entry->data;
-      rhs_list->push_tail(&entry->rhs_node);
-   } else {
-      exec_list *rhs_list = new(mem_ctx) exec_list;
-      rhs_list->push_tail(&entry->rhs_node);
-      _mesa_hash_table_insert(rhs_ht, rhs->var, rhs_list);
-   }
+   state->write_elements(lhs->var, rhs->var, write_mask, swizzle);
 }
 
 bool
diff --git a/src/compiler/glsl/opt_function_inlining.cpp b/src/compiler/glsl/opt_function_inlining.cpp
index 04690b6..52f57da 100644
--- a/src/compiler/glsl/opt_function_inlining.cpp
+++ b/src/compiler/glsl/opt_function_inlining.cpp
@@ -131,6 +131,18 @@
    return visit_stop;
 }
 
+static bool
+should_replace_variable(ir_variable *sig_param, ir_rvalue *param)
+{
+   /* True for opaque 'in' parameters whose argument is a dereference.
+    * Inlined references should then use the passed-in variable, which
+    * has the location information an opaque assignment would lack.
+    */
+   if (!sig_param->type->contains_opaque() || !param->is_dereference())
+      return false;
+   return sig_param->data.mode == ir_var_function_in;
+}
+
 void
 ir_call::generate_inline(ir_instruction *next_ir)
 {
@@ -155,12 +167,8 @@
       ir_rvalue *param = (ir_rvalue *) actual_node;
 
       /* Generate a new variable for the parameter. */
-      if (sig_param->type->contains_opaque()) {
-	 /* For opaque types, we want the inlined variable references
-	  * referencing the passed in variable, since that will have
-	  * the location information, which an assignment of an opaque
-	  * variable wouldn't.  Fix it up below.
-	  */
+      if (should_replace_variable(sig_param, param)) {
+         /* Actual replacement happens below */
 	 parameters[i] = NULL;
       } else {
 	 parameters[i] = sig_param->clone(ctx, ht);
@@ -242,10 +250,9 @@
       ir_rvalue *const param = (ir_rvalue *) actual_node;
       ir_variable *sig_param = (ir_variable *) formal_node;
 
-      if (sig_param->type->contains_opaque()) {
+      if (should_replace_variable(sig_param, param)) {
 	 ir_dereference *deref = param->as_dereference();
 
-	 assert(deref);
 	 do_variable_replacement(&new_instructions, sig_param, deref);
       }
    }
@@ -351,6 +358,9 @@
    virtual ir_visitor_status visit_leave(ir_dereference_array *);
    virtual ir_visitor_status visit_leave(ir_dereference_record *);
    virtual ir_visitor_status visit_leave(ir_texture *);
+   virtual ir_visitor_status visit_leave(ir_assignment *);
+   virtual ir_visitor_status visit_leave(ir_expression *);
+   virtual ir_visitor_status visit_leave(ir_return *);
 
    void replace_deref(ir_dereference **deref);
    void replace_rvalue(ir_rvalue **rvalue);
@@ -392,6 +402,32 @@
 }
 
 ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_assignment *ir)
+{
+   /* Offer both sides of the assignment to the replacement helpers. */
+   replace_deref(&ir->lhs);
+   replace_rvalue(&ir->rhs);
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_expression *ir)
+{
+   /* Offer each of the expression's operands for replacement. */
+   for (uint8_t op = 0; op < ir->num_operands; op++)
+      replace_rvalue(&ir->operands[op]);
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_variable_replacement_visitor::visit_leave(ir_return *ir)
+{
+   /* Offer the returned value for replacement. */
+   replace_rvalue(&ir->value);
+   return visit_continue;
+}
+
+ir_visitor_status
 ir_variable_replacement_visitor::visit_leave(ir_dereference_array *ir)
 {
    replace_rvalue(&ir->array);
diff --git a/src/compiler/glsl/program.h b/src/compiler/glsl/program.h
index 480379b..9df42dd 100644
--- a/src/compiler/glsl/program.h
+++ b/src/compiler/glsl/program.h
@@ -48,14 +48,6 @@
 build_program_resource_list(struct gl_context *ctx,
                             struct gl_shader_program *shProg);
 
-extern void
-linker_error(struct gl_shader_program *prog, const char *fmt, ...)
-   PRINTFLIKE(2, 3);
-
-extern void
-linker_warning(struct gl_shader_program *prog, const char *fmt, ...)
-   PRINTFLIKE(2, 3);
-
 extern long
 parse_program_resource_name(const GLchar *name,
                             const GLchar **out_base_name_end);
diff --git a/src/compiler/glsl/serialize.cpp b/src/compiler/glsl/serialize.cpp
index 889038f..efd436b 100644
--- a/src/compiler/glsl/serialize.cpp
+++ b/src/compiler/glsl/serialize.cpp
@@ -360,13 +360,20 @@
    if (xfb_stage == ~0u)
       return;
 
+   if (shProg->TransformFeedback.VaryingNames)  {
+      for (unsigned i = 0; i < shProg->TransformFeedback.NumVarying; ++i)
+         free(shProg->TransformFeedback.VaryingNames[i]);
+   }
+
    /* Data set by glTransformFeedbackVaryings. */
    shProg->TransformFeedback.BufferMode = blob_read_uint32(metadata);
    blob_copy_bytes(metadata, &shProg->TransformFeedback.BufferStride,
                    sizeof(shProg->TransformFeedback.BufferStride));
    shProg->TransformFeedback.NumVarying = blob_read_uint32(metadata);
+
    shProg->TransformFeedback.VaryingNames = (char **)
-      malloc(shProg->TransformFeedback.NumVarying * sizeof(GLchar *));
+      realloc(shProg->TransformFeedback.VaryingNames,
+             shProg->TransformFeedback.NumVarying * sizeof(GLchar *));
    /* Note, malloc used with VaryingNames. */
    for (unsigned i = 0; i < shProg->TransformFeedback.NumVarying; i++)
       shProg->TransformFeedback.VaryingNames[i] =
diff --git a/src/compiler/glsl/shader_cache.cpp b/src/compiler/glsl/shader_cache.cpp
index 042f3a6..31d0aa6 100644
--- a/src/compiler/glsl/shader_cache.cpp
+++ b/src/compiler/glsl/shader_cache.cpp
@@ -102,6 +102,14 @@
    struct blob metadata;
    blob_init(&metadata);
 
+   if (ctx->Driver.ShaderCacheSerializeDriverBlob) {
+      for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+         struct gl_linked_shader *sh = prog->_LinkedShaders[i];
+         if (sh)
+            ctx->Driver.ShaderCacheSerializeDriverBlob(ctx, sh->Program);
+      }
+   }
+
    serialize_glsl_program(&metadata, ctx, prog);
 
    struct cache_item_metadata cache_item_metadata;
diff --git a/src/compiler/glsl/test_optpass.cpp b/src/compiler/glsl/test_optpass.cpp
index 1fd9db1..735129d 100644
--- a/src/compiler/glsl/test_optpass.cpp
+++ b/src/compiler/glsl/test_optpass.cpp
@@ -73,8 +73,6 @@
       return do_constant_variable(ir);
    } else if (strcmp(optimization, "do_constant_variable_unlinked") == 0) {
       return do_constant_variable_unlinked(ir);
-   } else if (strcmp(optimization, "do_copy_propagation") == 0) {
-      return do_copy_propagation(ir);
    } else if (strcmp(optimization, "do_copy_propagation_elements") == 0) {
       return do_copy_propagation_elements(ir);
    } else if (strcmp(optimization, "do_constant_propagation") == 0) {
diff --git a/src/compiler/glsl/tests/meson.build b/src/compiler/glsl/tests/meson.build
index fc7b863..821760e 100644
--- a/src/compiler/glsl/tests/meson.build
+++ b/src/compiler/glsl/tests/meson.build
@@ -84,8 +84,10 @@
 )
 
 test(
-  'glsl compiler warnings', find_program('warnings_test.py'),
+  'glsl compiler warnings',
+  prog_python2,
   args : [
+    join_paths(meson.current_source_dir(), 'warnings_test.py'),
     '--glsl-compiler', glsl_compiler,
     '--test-directory', join_paths(
       meson.source_root(), 'src', 'compiler', 'glsl', 'tests', 'warnings'
@@ -94,6 +96,9 @@
 )
 test(
   'glsl optimization',
-  find_program('optimization_test.py'),
-  args : ['--test-runner', glsl_test],
+  prog_python2,
+  args : [
+    join_paths(meson.current_source_dir(), 'optimization_test.py'),
+    '--test-runner', glsl_test
+  ],
 )
diff --git a/src/compiler/glsl/tests/optimization_test.py b/src/compiler/glsl/tests/optimization_test.py
old mode 100755
new mode 100644
index 577d2df..f8518a1
--- a/src/compiler/glsl/tests/optimization_test.py
+++ b/src/compiler/glsl/tests/optimization_test.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python2
 # encoding=utf-8
 # Copyright © 2018 Intel Corporation
 
diff --git a/src/compiler/glsl/tests/uniform_initializer_utils.cpp b/src/compiler/glsl/tests/uniform_initializer_utils.cpp
index 0d7fa26..8c00c69 100644
--- a/src/compiler/glsl/tests/uniform_initializer_utils.cpp
+++ b/src/compiler/glsl/tests/uniform_initializer_utils.cpp
@@ -113,6 +113,8 @@
       case GLSL_TYPE_FLOAT16:
       case GLSL_TYPE_UINT16:
       case GLSL_TYPE_INT16:
+      case GLSL_TYPE_UINT8:
+      case GLSL_TYPE_INT8:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -156,6 +158,8 @@
       case GLSL_TYPE_FLOAT16:
       case GLSL_TYPE_UINT16:
       case GLSL_TYPE_INT16:
+      case GLSL_TYPE_UINT8:
+      case GLSL_TYPE_INT8:
 	 ASSERT_TRUE(false);
 	 break;
       }
@@ -287,6 +291,8 @@
          case GLSL_TYPE_FLOAT16:
          case GLSL_TYPE_UINT16:
          case GLSL_TYPE_INT16:
+         case GLSL_TYPE_UINT8:
+         case GLSL_TYPE_INT8:
 	    ASSERT_TRUE(false);
 	    break;
 	 }
diff --git a/src/compiler/glsl/tests/warnings_test.py b/src/compiler/glsl/tests/warnings_test.py
old mode 100755
new mode 100644
index 2e0f231..2c4fa5a
--- a/src/compiler/glsl/tests/warnings_test.py
+++ b/src/compiler/glsl/tests/warnings_test.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # encoding=utf-8
 # Copyright © 2017 Intel Corporation
 
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index efc6324..d32b580 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -87,6 +87,13 @@
    GLSL_TYPE_ERROR
 };
 
+static inline bool glsl_base_type_is_16bit(enum glsl_base_type type)
+{
+   return type == GLSL_TYPE_FLOAT16 ||
+          type == GLSL_TYPE_UINT16 ||
+          type == GLSL_TYPE_INT16;
+}
+
 static inline bool glsl_base_type_is_64bit(enum glsl_base_type type)
 {
    return type == GLSL_TYPE_DOUBLE ||
@@ -552,6 +559,14 @@
    }
 
    /**
+    * Query whether or not a type is 16-bit
+    */
+   bool is_16bit() const
+   {
+      return glsl_base_type_is_16bit(base_type);
+   }
+
+   /**
     * Query whether or not a type is a non-array boolean type
     */
    bool is_boolean() const
diff --git a/src/compiler/meson.build b/src/compiler/meson.build
index da2464d..60b6338 100644
--- a/src/compiler/meson.build
+++ b/src/compiler/meson.build
@@ -62,7 +62,7 @@
 # dependency with nir/meson.build.
 spirv2nir = executable(
   'spirv2nir',
-  [files('spirv/spirv2nir.c'), dummy_cpp],
+  files('spirv/spirv2nir.c'),
   dependencies : [dep_m, dep_thread, idep_nir],
   include_directories : [inc_common, inc_nir, include_directories('spirv')],
   link_with : libmesa_util,
diff --git a/src/compiler/nir/.gitignore b/src/compiler/nir/.gitignore
index 64828eb..8faf93f 100644
--- a/src/compiler/nir/.gitignore
+++ b/src/compiler/nir/.gitignore
@@ -3,3 +3,5 @@
 nir_opcodes.c
 nir_opcodes.h
 nir_constant_expressions.c
+nir_intrinsics.c
+nir_intrinsics.h
diff --git a/src/compiler/nir/BUILD.gn b/src/compiler/nir/BUILD.gn
index 7a81d75..6ec36c1 100644
--- a/src/compiler/nir/BUILD.gn
+++ b/src/compiler/nir/BUILD.gn
@@ -51,11 +51,13 @@
     "nir.c",
     "nir.h",
     "nir_builder.h",
+    "nir_builtin_builder.c",
     "nir_clone.c",
     "nir_constant_expressions.h",
     "nir_control_flow.c",
     "nir_control_flow.h",
     "nir_control_flow_private.h",
+    "nir_deref.c",
     "nir_dominance.c",
     "nir_from_ssa.c",
     "nir_gather_info.c",
@@ -66,9 +68,8 @@
     "nir_linking_helpers.c",
     "nir_liveness.c",
     "nir_loop_analyze.c",
-    "nir_lower_64bit_packing.c",
     "nir_lower_alu_to_scalar.c",
-    "nir_lower_atomics.c",
+    "nir_lower_bit_size.c",
     "nir_lower_clip.c",
     "nir_lower_clip_cull_distance_arrays.c",
     "nir_lower_constant_initializers.c",
@@ -79,14 +80,15 @@
     "nir_lower_indirect_derefs.c",
     "nir_lower_int64.c",
     "nir_lower_io.c",
+    "nir_lower_io_arrays_to_elements.c",
     "nir_lower_io_to_temporaries.c",
     "nir_lower_load_const_to_scalar.c",
     "nir_lower_locals_to_regs.c",
+    "nir_lower_packing.c",
     "nir_lower_patch_vertices.c",
     "nir_lower_phis_to_scalar.c",
     "nir_lower_regs_to_ssa.c",
     "nir_lower_returns.c",
-    "nir_lower_samplers.c",
     "nir_lower_subgroups.c",
     "nir_lower_system_values.c",
     "nir_lower_tex.c",
@@ -109,6 +111,7 @@
     "nir_opt_global_to_local.c",
     "nir_opt_if.c",
     "nir_opt_intrinsics.c",
+    "nir_opt_large_constants.c",
     "nir_opt_loop_unroll.c",
     "nir_opt_move_comparisons.c",
     "nir_opt_peephole_select.c",
@@ -123,6 +126,7 @@
     "nir_search.c",
     "nir_search.h",
     "nir_serialize.c",
+    "nir_split_per_member_structs.c",
     "nir_split_var_copies.c",
     "nir_sweep.c",
     "nir_to_lcssa.c",
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index 6b84deb..010da38 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-nir_depends = files('nir_opcodes.py')
+nir_depends = files('nir_opcodes.py', 'nir_intrinsics.py')
 
 nir_builder_opcodes_h = custom_target(
   'nir_builder_opcodes.h',
@@ -87,12 +87,17 @@
   'nir.c',
   'nir.h',
   'nir_builder.h',
+  'nir_builtin_builder.c',
+  'nir_builtin_builder.h',
   'nir_clone.c',
   'nir_constant_expressions.h',
   'nir_control_flow.c',
   'nir_control_flow.h',
   'nir_control_flow_private.h',
+  'nir_deref.c',
+  'nir_deref.h',
   'nir_dominance.c',
+  'nir_format_convert.h',
   'nir_from_ssa.c',
   'nir_gather_info.c',
   'nir_gs_count_vertices.c',
@@ -103,10 +108,9 @@
   'nir_liveness.c',
   'nir_loop_analyze.c',
   'nir_loop_analyze.h',
-  'nir_lower_64bit_packing.c',
+  'nir_lower_alu.c',
   'nir_lower_alu_to_scalar.c',
   'nir_lower_alpha_test.c',
-  'nir_lower_atomics.c',
   'nir_lower_atomics_to_ssbo.c',
   'nir_lower_bitmap.c',
   'nir_lower_clamp_color_outputs.c',
@@ -126,14 +130,12 @@
   'nir_lower_io_arrays_to_elements.c',
   'nir_lower_io_to_temporaries.c',
   'nir_lower_io_to_scalar.c',
-  'nir_lower_io_types.c',
+  'nir_lower_packing.c',
   'nir_lower_passthrough_edgeflags.c',
   'nir_lower_patch_vertices.c',
   'nir_lower_phis_to_scalar.c',
   'nir_lower_regs_to_ssa.c',
   'nir_lower_returns.c',
-  'nir_lower_samplers.c',
-  'nir_lower_samplers_as_deref.c',
   'nir_lower_subgroups.c',
   'nir_lower_system_values.c',
   'nir_lower_tex.c',
@@ -144,7 +146,9 @@
   'nir_lower_vec_to_movs.c',
   'nir_lower_wpos_center.c',
   'nir_lower_wpos_ytransform.c',
+  'nir_lower_bit_size.c',
   'nir_metadata.c',
+  'nir_move_load_const.c',
   'nir_move_vec_src_uses_to_dest.c',
   'nir_normalize_cubemap_coords.c',
   'nir_opt_conditional_discard.c',
@@ -158,6 +162,7 @@
   'nir_opt_global_to_local.c',
   'nir_opt_if.c',
   'nir_opt_intrinsics.c',
+  'nir_opt_large_constants.c',
   'nir_opt_loop_unroll.c',
   'nir_opt_move_comparisons.c',
   'nir_opt_move_load_ubo.c',
@@ -177,6 +182,7 @@
   'nir_search_helpers.h',
   'nir_serialize.c',
   'nir_serialize.h',
+  'nir_split_per_member_structs.c',
   'nir_split_var_copies.c',
   'nir_sweep.c',
   'nir_to_lcssa.c',
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index f1dead4..a849664 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -126,10 +126,6 @@
       assert(!"nir_shader_add_variable cannot be used for local variables");
       break;
 
-   case nir_var_param:
-      assert(!"nir_shader_add_variable cannot be used for function parameters");
-      break;
-
    case nir_var_global:
       exec_list_push_tail(&shader->globals, &var->node);
       break;
@@ -206,7 +202,6 @@
    func->shader = shader;
    func->num_params = 0;
    func->params = NULL;
-   func->return_type = glsl_void_type();
    func->impl = NULL;
 
    return func;
@@ -256,7 +251,7 @@
    nir_src_copy(&dest->src, &src->src, &instr->instr);
    dest->abs = src->abs;
    dest->negate = src->negate;
-   for (unsigned i = 0; i < 4; i++)
+   for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
       dest->swizzle[i] = src->swizzle[i];
 }
 
@@ -290,9 +285,6 @@
    exec_list_make_empty(&impl->body);
    exec_list_make_empty(&impl->registers);
    exec_list_make_empty(&impl->locals);
-   impl->num_params = 0;
-   impl->params = NULL;
-   impl->return_var = NULL;
    impl->reg_alloc = 0;
    impl->ssa_alloc = 0;
    impl->valid_metadata = nir_metadata_none;
@@ -321,26 +313,6 @@
    function->impl = impl;
    impl->function = function;
 
-   impl->num_params = function->num_params;
-   impl->params = ralloc_array(function->shader,
-                               nir_variable *, impl->num_params);
-
-   for (unsigned i = 0; i < impl->num_params; i++) {
-      impl->params[i] = rzalloc(function->shader, nir_variable);
-      impl->params[i]->type = function->params[i].type;
-      impl->params[i]->data.mode = nir_var_param;
-      impl->params[i]->data.location = i;
-   }
-
-   if (!glsl_type_is_void(function->return_type)) {
-      impl->return_var = rzalloc(function->shader, nir_variable);
-      impl->return_var->type = function->return_type;
-      impl->return_var->data.mode = nir_var_param;
-      impl->return_var->data.location = -1;
-   } else {
-      impl->return_var = NULL;
-   }
-
    return impl;
 }
 
@@ -449,10 +421,8 @@
 {
    src_init(&src->src);
    src->abs = src->negate = false;
-   src->swizzle[0] = 0;
-   src->swizzle[1] = 1;
-   src->swizzle[2] = 2;
-   src->swizzle[3] = 3;
+   for (int i = 0; i < NIR_MAX_VEC_COMPONENTS; ++i)
+      src->swizzle[i] = i;
 }
 
 nir_alu_instr *
@@ -473,6 +443,26 @@
    return instr;
 }
 
+nir_deref_instr *
+nir_deref_instr_create(nir_shader *shader, nir_deref_type deref_type)
+{
+   nir_deref_instr *deref = rzalloc_size(shader, sizeof(*deref));
+
+   instr_init(&deref->instr, nir_instr_type_deref);
+   deref->deref_type = deref_type;
+
+   /* Set up the parent source for everything except a variable deref,
+    * and the index source for an array deref.
+    */
+   if (deref_type != nir_deref_type_var)
+      src_init(&deref->parent);
+   if (deref_type == nir_deref_type_array)
+      src_init(&deref->arr.index);
+
+   dest_init(&deref->dest);
+   return deref;
+}
+
 nir_jump_instr *
 nir_jump_instr_create(nir_shader *shader, nir_jump_type type)
 {
@@ -518,13 +508,16 @@
 nir_call_instr *
 nir_call_instr_create(nir_shader *shader, nir_function *callee)
 {
-   nir_call_instr *instr = ralloc(shader, nir_call_instr);
-   instr_init(&instr->instr, nir_instr_type_call);
+   const unsigned num_params = callee->num_params;
+   nir_call_instr *instr =
+      rzalloc_size(shader, sizeof(*instr) +
+                   num_params * sizeof(instr->params[0]));
 
+   instr_init(&instr->instr, nir_instr_type_call);
    instr->callee = callee;
-   instr->num_params = callee->num_params;
-   instr->params = ralloc_array(instr, nir_deref_var *, instr->num_params);
-   instr->return_deref = NULL;
+   instr->num_params = num_params;
+   for (unsigned i = 0; i < num_params; i++)
+      src_init(&instr->params[i]);
 
    return instr;
 }
@@ -544,9 +537,7 @@
 
    instr->texture_index = 0;
    instr->texture_array_size = 0;
-   instr->texture = NULL;
    instr->sampler_index = 0;
-   instr->sampler = NULL;
 
    return instr;
 }
@@ -624,281 +615,6 @@
    return instr;
 }
 
-nir_deref_var *
-nir_deref_var_create(void *mem_ctx, nir_variable *var)
-{
-   nir_deref_var *deref = ralloc(mem_ctx, nir_deref_var);
-   deref->deref.deref_type = nir_deref_type_var;
-   deref->deref.child = NULL;
-   deref->deref.type = var->type;
-   deref->var = var;
-   return deref;
-}
-
-nir_deref_array *
-nir_deref_array_create(void *mem_ctx)
-{
-   nir_deref_array *deref = ralloc(mem_ctx, nir_deref_array);
-   deref->deref.deref_type = nir_deref_type_array;
-   deref->deref.child = NULL;
-   deref->deref_array_type = nir_deref_array_type_direct;
-   src_init(&deref->indirect);
-   deref->base_offset = 0;
-   return deref;
-}
-
-nir_deref_struct *
-nir_deref_struct_create(void *mem_ctx, unsigned field_index)
-{
-   nir_deref_struct *deref = ralloc(mem_ctx, nir_deref_struct);
-   deref->deref.deref_type = nir_deref_type_struct;
-   deref->deref.child = NULL;
-   deref->index = field_index;
-   return deref;
-}
-
-nir_deref_var *
-nir_deref_var_clone(const nir_deref_var *deref, void *mem_ctx)
-{
-   if (deref == NULL)
-      return NULL;
-
-   nir_deref_var *ret = nir_deref_var_create(mem_ctx, deref->var);
-   ret->deref.type = deref->deref.type;
-   if (deref->deref.child)
-      ret->deref.child = nir_deref_clone(deref->deref.child, ret);
-   return ret;
-}
-
-static nir_deref_array *
-deref_array_clone(const nir_deref_array *deref, void *mem_ctx)
-{
-   nir_deref_array *ret = nir_deref_array_create(mem_ctx);
-   ret->base_offset = deref->base_offset;
-   ret->deref_array_type = deref->deref_array_type;
-   if (deref->deref_array_type == nir_deref_array_type_indirect) {
-      nir_src_copy(&ret->indirect, &deref->indirect, mem_ctx);
-   }
-   ret->deref.type = deref->deref.type;
-   if (deref->deref.child)
-      ret->deref.child = nir_deref_clone(deref->deref.child, ret);
-   return ret;
-}
-
-static nir_deref_struct *
-deref_struct_clone(const nir_deref_struct *deref, void *mem_ctx)
-{
-   nir_deref_struct *ret = nir_deref_struct_create(mem_ctx, deref->index);
-   ret->deref.type = deref->deref.type;
-   if (deref->deref.child)
-      ret->deref.child = nir_deref_clone(deref->deref.child, ret);
-   return ret;
-}
-
-nir_deref *
-nir_deref_clone(const nir_deref *deref, void *mem_ctx)
-{
-   if (deref == NULL)
-      return NULL;
-
-   switch (deref->deref_type) {
-   case nir_deref_type_var:
-      return &nir_deref_var_clone(nir_deref_as_var(deref), mem_ctx)->deref;
-   case nir_deref_type_array:
-      return &deref_array_clone(nir_deref_as_array(deref), mem_ctx)->deref;
-   case nir_deref_type_struct:
-      return &deref_struct_clone(nir_deref_as_struct(deref), mem_ctx)->deref;
-   default:
-      unreachable("Invalid dereference type");
-   }
-
-   return NULL;
-}
-
-/* This is the second step in the recursion.  We've found the tail and made a
- * copy.  Now we need to iterate over all possible leaves and call the
- * callback on each one.
- */
-static bool
-deref_foreach_leaf_build_recur(nir_deref_var *deref, nir_deref *tail,
-                               nir_deref_foreach_leaf_cb cb, void *state)
-{
-   unsigned length;
-   union {
-      nir_deref_array arr;
-      nir_deref_struct str;
-   } tmp;
-
-   assert(tail->child == NULL);
-   switch (glsl_get_base_type(tail->type)) {
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_UINT16:
-   case GLSL_TYPE_UINT64:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_INT16:
-   case GLSL_TYPE_INT64:
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_FLOAT16:
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_BOOL:
-      if (glsl_type_is_vector_or_scalar(tail->type))
-         return cb(deref, state);
-      /* Fall Through */
-
-   case GLSL_TYPE_ARRAY:
-      tmp.arr.deref.deref_type = nir_deref_type_array;
-      tmp.arr.deref.type = glsl_get_array_element(tail->type);
-      tmp.arr.deref_array_type = nir_deref_array_type_direct;
-      tmp.arr.indirect = NIR_SRC_INIT;
-      tail->child = &tmp.arr.deref;
-
-      length = glsl_get_length(tail->type);
-      for (unsigned i = 0; i < length; i++) {
-         tmp.arr.deref.child = NULL;
-         tmp.arr.base_offset = i;
-         if (!deref_foreach_leaf_build_recur(deref, &tmp.arr.deref, cb, state))
-            return false;
-      }
-      return true;
-
-   case GLSL_TYPE_STRUCT:
-      tmp.str.deref.deref_type = nir_deref_type_struct;
-      tail->child = &tmp.str.deref;
-
-      length = glsl_get_length(tail->type);
-      for (unsigned i = 0; i < length; i++) {
-         tmp.arr.deref.child = NULL;
-         tmp.str.deref.type = glsl_get_struct_field(tail->type, i);
-         tmp.str.index = i;
-         if (!deref_foreach_leaf_build_recur(deref, &tmp.arr.deref, cb, state))
-            return false;
-      }
-      return true;
-
-   default:
-      unreachable("Invalid type for dereference");
-   }
-}
-
-/* This is the first step of the foreach_leaf recursion.  In this step we are
- * walking to the end of the deref chain and making a copy in the stack as we
- * go.  This is because we don't want to mutate the deref chain that was
- * passed in by the caller.  The downside is that this deref chain is on the
- * stack and , if the caller wants to do anything with it, they will have to
- * make their own copy because this one will go away.
- */
-static bool
-deref_foreach_leaf_copy_recur(nir_deref_var *deref, nir_deref *tail,
-                              nir_deref_foreach_leaf_cb cb, void *state)
-{
-   union {
-      nir_deref_array arr;
-      nir_deref_struct str;
-   } c;
-
-   if (tail->child) {
-      switch (tail->child->deref_type) {
-      case nir_deref_type_array:
-         c.arr = *nir_deref_as_array(tail->child);
-         tail->child = &c.arr.deref;
-         return deref_foreach_leaf_copy_recur(deref, &c.arr.deref, cb, state);
-
-      case nir_deref_type_struct:
-         c.str = *nir_deref_as_struct(tail->child);
-         tail->child = &c.str.deref;
-         return deref_foreach_leaf_copy_recur(deref, &c.str.deref, cb, state);
-
-      case nir_deref_type_var:
-      default:
-         unreachable("Invalid deref type for a child");
-      }
-   } else {
-      /* We've gotten to the end of the original deref.  Time to start
-       * building our own derefs.
-       */
-      return deref_foreach_leaf_build_recur(deref, tail, cb, state);
-   }
-}
-
-/**
- * This function iterates over all of the possible derefs that can be created
- * with the given deref as the head.  It then calls the provided callback with
- * a full deref for each one.
- *
- * The deref passed to the callback will be allocated on the stack.  You will
- * need to make a copy if you want it to hang around.
- */
-bool
-nir_deref_foreach_leaf(nir_deref_var *deref,
-                       nir_deref_foreach_leaf_cb cb, void *state)
-{
-   nir_deref_var copy = *deref;
-   return deref_foreach_leaf_copy_recur(&copy, &copy.deref, cb, state);
-}
-
-/* Returns a load_const instruction that represents the constant
- * initializer for the given deref chain.  The caller is responsible for
- * ensuring that there actually is a constant initializer.
- */
-nir_load_const_instr *
-nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
-{
-   nir_constant *constant = deref->var->constant_initializer;
-   assert(constant);
-
-   const nir_deref *tail = &deref->deref;
-   unsigned matrix_col = 0;
-   while (tail->child) {
-      switch (tail->child->deref_type) {
-      case nir_deref_type_array: {
-         nir_deref_array *arr = nir_deref_as_array(tail->child);
-         assert(arr->deref_array_type == nir_deref_array_type_direct);
-         if (glsl_type_is_matrix(tail->type)) {
-            assert(arr->deref.child == NULL);
-            matrix_col = arr->base_offset;
-         } else {
-            constant = constant->elements[arr->base_offset];
-         }
-         break;
-      }
-
-      case nir_deref_type_struct: {
-         constant = constant->elements[nir_deref_as_struct(tail->child)->index];
-         break;
-      }
-
-      default:
-         unreachable("Invalid deref child type");
-      }
-
-      tail = tail->child;
-   }
-
-   unsigned bit_size = glsl_get_bit_size(tail->type);
-   nir_load_const_instr *load =
-      nir_load_const_instr_create(shader, glsl_get_vector_elements(tail->type),
-                                  bit_size);
-
-   switch (glsl_get_base_type(tail->type)) {
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_FLOAT16:
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_INT16:
-   case GLSL_TYPE_UINT16:
-   case GLSL_TYPE_UINT64:
-   case GLSL_TYPE_INT64:
-   case GLSL_TYPE_BOOL:
-      load->value = constant->values[matrix_col];
-      break;
-   default:
-      unreachable("Invalid immediate type");
-   }
-
-   return load;
-}
-
 static nir_const_value
 const_value_float(double d, unsigned bit_size)
 {
@@ -1202,6 +918,12 @@
 }
 
 static bool
+visit_deref_dest(nir_deref_instr *instr, nir_foreach_dest_cb cb, void *state)
+{
+   return cb(&instr->dest, state);
+}
+
+static bool
 visit_intrinsic_dest(nir_intrinsic_instr *instr, nir_foreach_dest_cb cb,
                      void *state)
 {
@@ -1242,6 +964,8 @@
    switch (instr->type) {
    case nir_instr_type_alu:
       return visit_alu_dest(nir_instr_as_alu(instr), cb, state);
+   case nir_instr_type_deref:
+      return visit_deref_dest(nir_instr_as_deref(instr), cb, state);
    case nir_instr_type_intrinsic:
       return visit_intrinsic_dest(nir_instr_as_intrinsic(instr), cb, state);
    case nir_instr_type_tex:
@@ -1287,6 +1011,7 @@
 {
    switch (instr->type) {
    case nir_instr_type_alu:
+   case nir_instr_type_deref:
    case nir_instr_type_tex:
    case nir_instr_type_intrinsic:
    case nir_instr_type_phi:
@@ -1318,31 +1043,6 @@
 }
 
 static bool
-visit_deref_array_src(nir_deref_array *deref, nir_foreach_src_cb cb,
-                      void *state)
-{
-   if (deref->deref_array_type == nir_deref_array_type_indirect)
-      return visit_src(&deref->indirect, cb, state);
-   return true;
-}
-
-static bool
-visit_deref_src(nir_deref_var *deref, nir_foreach_src_cb cb, void *state)
-{
-   nir_deref *cur = &deref->deref;
-   while (cur != NULL) {
-      if (cur->deref_type == nir_deref_type_array) {
-         if (!visit_deref_array_src(nir_deref_as_array(cur), cb, state))
-            return false;
-      }
-
-      cur = cur->child;
-   }
-
-   return true;
-}
-
-static bool
 visit_alu_src(nir_alu_instr *instr, nir_foreach_src_cb cb, void *state)
 {
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
@@ -1353,20 +1053,16 @@
 }
 
 static bool
-visit_tex_src(nir_tex_instr *instr, nir_foreach_src_cb cb, void *state)
+visit_deref_instr_src(nir_deref_instr *instr,
+                      nir_foreach_src_cb cb, void *state)
 {
-   for (unsigned i = 0; i < instr->num_srcs; i++) {
-      if (!visit_src(&instr->src[i].src, cb, state))
+   if (instr->deref_type != nir_deref_type_var) {
+      if (!visit_src(&instr->parent, cb, state))
          return false;
    }
 
-   if (instr->texture != NULL) {
-      if (!visit_deref_src(instr->texture, cb, state))
-         return false;
-   }
-
-   if (instr->sampler != NULL) {
-      if (!visit_deref_src(instr->sampler, cb, state))
+   if (instr->deref_type == nir_deref_type_array) {
+      if (!visit_src(&instr->arr.index, cb, state))
          return false;
    }
 
@@ -1374,13 +1070,10 @@
 }
 
 static bool
-visit_call_src(nir_call_instr *instr, nir_foreach_src_cb cb, void *state)
+visit_tex_src(nir_tex_instr *instr, nir_foreach_src_cb cb, void *state)
 {
-   if (instr->return_deref && !visit_deref_src(instr->return_deref, cb, state))
-      return false;
-
-   for (unsigned i = 0; i < instr->num_params; i++) {
-      if (!visit_deref_src(instr->params[i], cb, state))
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      if (!visit_src(&instr->src[i].src, cb, state))
          return false;
    }
 
@@ -1397,10 +1090,14 @@
          return false;
    }
 
-   unsigned num_vars =
-      nir_intrinsic_infos[instr->intrinsic].num_variables;
-   for (unsigned i = 0; i < num_vars; i++) {
-      if (!visit_deref_src(instr->variables[i], cb, state))
+   return true;
+}
+
+static bool
+visit_call_src(nir_call_instr *instr, nir_foreach_src_cb cb, void *state)
+{
+   for (unsigned i = 0; i < instr->num_params; i++) {
+      if (!visit_src(&instr->params[i], cb, state))
          return false;
    }
 
@@ -1454,6 +1151,10 @@
       if (!visit_alu_src(nir_instr_as_alu(instr), cb, state))
          return false;
       break;
+   case nir_instr_type_deref:
+      if (!visit_deref_instr_src(nir_instr_as_deref(instr), cb, state))
+         return false;
+      break;
    case nir_instr_type_intrinsic:
       if (!visit_intrinsic_src(nir_instr_as_intrinsic(instr), cb, state))
          return false;
@@ -1628,19 +1329,6 @@
       src_add_all_uses(dest->reg.indirect, instr, NULL);
 }
 
-void
-nir_instr_rewrite_deref(nir_instr *instr, nir_deref_var **deref,
-                        nir_deref_var *new_deref)
-{
-   if (*deref)
-      visit_deref_src(*deref, remove_use_cb, NULL);
-
-   *deref = new_deref;
-
-   if (*deref)
-      visit_deref_src(*deref, add_use_cb, instr);
-}
-
 /* note: does *not* take ownership of 'name' */
 void
 nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
@@ -1736,10 +1424,10 @@
       nir_if_rewrite_condition(use_src->parent_if, new_src);
 }
 
-uint8_t
+nir_component_mask_t
 nir_ssa_def_components_read(const nir_ssa_def *def)
 {
-   uint8_t read_mask = 0;
+   nir_component_mask_t read_mask = 0;
    nir_foreach_use(use, def) {
       if (use->parent_instr->type == nir_instr_type_alu) {
          nir_alu_instr *alu = nir_instr_as_alu(use->parent_instr);
@@ -1747,7 +1435,7 @@
          int src_idx = alu_src - &alu->src[0];
          assert(src_idx >= 0 && src_idx < nir_op_infos[alu->op].num_inputs);
 
-         for (unsigned c = 0; c < 4; c++) {
+         for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; c++) {
             if (!nir_alu_instr_channel_used(alu, src_idx, c))
                continue;
 
@@ -2009,6 +1697,8 @@
       return nir_intrinsic_load_base_instance;
    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
       return nir_intrinsic_load_vertex_id_zero_base;
+   case SYSTEM_VALUE_IS_INDEXED_DRAW:
+      return nir_intrinsic_load_is_indexed_draw;
    case SYSTEM_VALUE_FIRST_VERTEX:
       return nir_intrinsic_load_first_vertex;
    case SYSTEM_VALUE_BASE_VERTEX:
@@ -2067,6 +1757,10 @@
       return nir_intrinsic_load_subgroup_id;
    case SYSTEM_VALUE_LOCAL_GROUP_SIZE:
       return nir_intrinsic_load_local_group_size;
+   case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
+      return nir_intrinsic_load_global_invocation_id;
+   case SYSTEM_VALUE_WORK_DIM:
+      return nir_intrinsic_load_work_dim;
    default:
       unreachable("system value does not directly correspond to intrinsic");
    }
@@ -2088,6 +1782,8 @@
       return SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
    case nir_intrinsic_load_first_vertex:
       return SYSTEM_VALUE_FIRST_VERTEX;
+   case nir_intrinsic_load_is_indexed_draw:
+      return SYSTEM_VALUE_IS_INDEXED_DRAW;
    case nir_intrinsic_load_base_vertex:
       return SYSTEM_VALUE_BASE_VERTEX;
    case nir_intrinsic_load_invocation_id:
@@ -2144,6 +1840,8 @@
       return SYSTEM_VALUE_SUBGROUP_ID;
    case nir_intrinsic_load_local_group_size:
       return SYSTEM_VALUE_LOCAL_GROUP_SIZE;
+   case nir_intrinsic_load_global_invocation_id:
+      return SYSTEM_VALUE_GLOBAL_INVOCATION_ID;
    default:
       unreachable("intrinsic doesn't produce a system value");
    }
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index f3326e6..b7c8754 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -55,11 +55,10 @@
 extern "C" {
 #endif
 
-struct gl_program;
-struct gl_shader_program;
-
 #define NIR_FALSE 0u
 #define NIR_TRUE (~0u)
+#define NIR_MAX_VEC_COMPONENTS 4
+typedef uint8_t nir_component_mask_t;
 
 /** Defines a cast function
  *
@@ -81,6 +80,7 @@
 struct nir_function;
 struct nir_shader;
 struct nir_instr;
+struct nir_builder;
 
 
 /**
@@ -101,7 +101,6 @@
    nir_var_uniform         = (1 << 4),
    nir_var_shader_storage  = (1 << 5),
    nir_var_system_value    = (1 << 6),
-   nir_var_param           = (1 << 7),
    nir_var_shared          = (1 << 8),
    nir_var_all             = ~0,
 } nir_variable_mode;
@@ -118,16 +117,16 @@
 } nir_rounding_mode;
 
 typedef union {
-   float f32[4];
-   double f64[4];
-   int8_t i8[4];
-   uint8_t u8[4];
-   int16_t i16[4];
-   uint16_t u16[4];
-   int32_t i32[4];
-   uint32_t u32[4];
-   int64_t i64[4];
-   uint64_t u64[4];
+   float f32[NIR_MAX_VEC_COMPONENTS];
+   double f64[NIR_MAX_VEC_COMPONENTS];
+   int8_t i8[NIR_MAX_VEC_COMPONENTS];
+   uint8_t u8[NIR_MAX_VEC_COMPONENTS];
+   int16_t i16[NIR_MAX_VEC_COMPONENTS];
+   uint16_t u16[NIR_MAX_VEC_COMPONENTS];
+   int32_t i32[NIR_MAX_VEC_COMPONENTS];
+   uint32_t u32[NIR_MAX_VEC_COMPONENTS];
+   int64_t i64[NIR_MAX_VEC_COMPONENTS];
+   uint64_t u64[NIR_MAX_VEC_COMPONENTS];
 } nir_const_value;
 
 typedef struct nir_constant {
@@ -138,7 +137,7 @@
     * by the type associated with the \c nir_variable.  Constants may be
     * scalars, vectors, or matrices.
     */
-   nir_const_value values[4];
+   nir_const_value values[NIR_MAX_VEC_COMPONENTS];
 
    /* we could get this from the var->type but makes clone *much* easier to
     * not have to care about the type.
@@ -256,6 +255,26 @@
       unsigned bindless:1;
 
       /**
+       * Was an explicit binding set in the shader?
+       */
+      unsigned explicit_binding:1;
+
+      /**
+       * Was a transform feedback buffer set in the shader?
+       */
+      unsigned explicit_xfb_buffer:1;
+
+      /**
+       * Was a transform feedback stride set in the shader?
+       */
+      unsigned explicit_xfb_stride:1;
+
+      /**
+       * Was an explicit offset set in the shader?
+       */
+      unsigned explicit_offset:1;
+
+      /**
        * \brief Layout qualifier for gl_FragDepth.
        *
        * This is not equal to \c ir_depth_layout_none if and only if this
@@ -316,11 +335,21 @@
       int binding;
 
       /**
-       * Location an atomic counter is stored at.
+       * Location an atomic counter or transform feedback is stored at.
        */
       unsigned offset;
 
       /**
+       * Transform feedback buffer.
+       */
+      unsigned xfb_buffer;
+
+      /**
+       * Transform feedback stride.
+       */
+      unsigned xfb_stride;
+
+      /**
        * ARB_shader_image_load_store qualifiers.
        */
       struct {
@@ -367,6 +396,17 @@
     * \sa ir_variable::location
     */
    const struct glsl_type *interface_type;
+
+   /**
+    * Description of per-member data for per-member struct variables
+    *
+    * This is used for variables which are actually an amalgamation of
+    * multiple entities such as a struct of built-in values or a struct of
+    * inputs each with their own layout specifier.  This is only allowed on
+    * variables with a struct or array of array of struct type.
+    */
+   unsigned num_members;
+   struct nir_variable_data *members;
 } nir_variable;
 
 #define nir_foreach_variable(var, var_list) \
@@ -378,7 +418,7 @@
 static inline bool
 nir_variable_is_global(const nir_variable *var)
 {
-   return var->data.mode != nir_var_local && var->data.mode != nir_var_param;
+   return var->data.mode != nir_var_local;
 }
 
 typedef struct nir_register {
@@ -427,6 +467,7 @@
 
 typedef enum {
    nir_instr_type_alu,
+   nir_instr_type_deref,
    nir_instr_type_call,
    nir_instr_type_tex,
    nir_instr_type_intrinsic,
@@ -493,6 +534,7 @@
    /** Index into the live_in and live_out bitfields */
    unsigned live_index;
 
+   /** Instruction which produces this SSA value. */
    nir_instr *parent_instr;
 
    /** set of nir_instrs where this register is used (read from) */
@@ -532,6 +574,7 @@
 
 typedef struct nir_src {
    union {
+      /** Instruction that consumes this value as a source. */
       nir_instr *parent_instr;
       struct nir_if *parent_if;
    };
@@ -681,7 +724,7 @@
     * a statement like "foo.xzw = bar.zyx" would have a writemask of 1101b and
     * a swizzle of {2, x, 1, 0} where x means "don't care."
     */
-   uint8_t swizzle[4];
+   uint8_t swizzle[NIR_MAX_VEC_COMPONENTS];
 } nir_alu_src;
 
 typedef struct {
@@ -696,7 +739,7 @@
 
    bool saturate;
 
-   unsigned write_mask : 4; /* ignored if dest.is_ssa is true */
+   unsigned write_mask : NIR_MAX_VEC_COMPONENTS; /* ignored if dest.is_ssa is true */
 } nir_alu_dest;
 
 typedef enum {
@@ -825,14 +868,14 @@
    /**
     * The number of components in each input
     */
-   unsigned input_sizes[4];
+   unsigned input_sizes[NIR_MAX_VEC_COMPONENTS];
 
    /**
     * The type of vector that each input takes. Note that negate and
     * absolute value are only allowed on inputs with int or float type and
     * behave differently on the two.
     */
-   nir_alu_type input_types[4];
+   nir_alu_type input_types[NIR_MAX_VEC_COMPONENTS];
 
    nir_op_algebraic_property algebraic_properties;
 } nir_op_info;
@@ -894,73 +937,94 @@
 typedef enum {
    nir_deref_type_var,
    nir_deref_type_array,
-   nir_deref_type_struct
+   nir_deref_type_array_wildcard,
+   nir_deref_type_struct,
+   nir_deref_type_cast,
 } nir_deref_type;
 
-typedef struct nir_deref {
-   nir_deref_type deref_type;
-   struct nir_deref *child;
-   const struct glsl_type *type;
-} nir_deref;
-
-typedef struct {
-   nir_deref deref;
-
-   nir_variable *var;
-} nir_deref_var;
-
-/* This enum describes how the array is referenced.  If the deref is
- * direct then the base_offset is used.  If the deref is indirect then
- * offset is given by base_offset + indirect.  If the deref is a wildcard
- * then the deref refers to all of the elements of the array at the same
- * time.  Wildcard dereferences are only ever allowed in copy_var
- * intrinsics and the source and destination derefs must have matching
- * wildcards.
- */
-typedef enum {
-   nir_deref_array_type_direct,
-   nir_deref_array_type_indirect,
-   nir_deref_array_type_wildcard,
-} nir_deref_array_type;
-
-typedef struct {
-   nir_deref deref;
-
-   nir_deref_array_type deref_array_type;
-   unsigned base_offset;
-   nir_src indirect;
-} nir_deref_array;
-
-typedef struct {
-   nir_deref deref;
-
-   unsigned index;
-} nir_deref_struct;
-
-NIR_DEFINE_CAST(nir_deref_as_var, nir_deref, nir_deref_var, deref,
-                deref_type, nir_deref_type_var)
-NIR_DEFINE_CAST(nir_deref_as_array, nir_deref, nir_deref_array, deref,
-                deref_type, nir_deref_type_array)
-NIR_DEFINE_CAST(nir_deref_as_struct, nir_deref, nir_deref_struct, deref,
-                deref_type, nir_deref_type_struct)
-
-/* Returns the last deref in the chain. */
-static inline nir_deref *
-nir_deref_tail(nir_deref *deref)
-{
-   while (deref->child)
-      deref = deref->child;
-   return deref;
-}
-
 typedef struct {
    nir_instr instr;
 
-   unsigned num_params;
-   nir_deref_var **params;
-   nir_deref_var *return_deref;
+   /** The type of this deref instruction */
+   nir_deref_type deref_type;
+
+   /** The mode of the underlying variable */
+   nir_variable_mode mode;
+
+   /** The dereferenced type of the resulting pointer value */
+   const struct glsl_type *type;
+
+   union {
+      /** Variable being dereferenced if deref_type is a deref_var */
+      nir_variable *var;
+
+      /** Parent deref if deref_type is not deref_var */
+      nir_src parent;
+   };
+
+   /** Additional deref parameters */
+   union {
+      struct {
+         nir_src index;
+      } arr;
+
+      struct {
+         unsigned index;
+      } strct;
+   };
+
+   /** Destination to store the resulting "pointer" */
+   nir_dest dest;
+} nir_deref_instr;
+
+NIR_DEFINE_CAST(nir_instr_as_deref, nir_instr, nir_deref_instr, instr,
+                type, nir_instr_type_deref)
+
+static inline nir_deref_instr *
+nir_src_as_deref(nir_src src)
+{
+   if (!src.is_ssa)
+      return NULL;
+
+   if (src.ssa->parent_instr->type != nir_instr_type_deref)
+      return NULL;
+
+   return nir_instr_as_deref(src.ssa->parent_instr);
+}
+
+static inline nir_deref_instr *
+nir_deref_instr_parent(const nir_deref_instr *instr)
+{
+   if (instr->deref_type == nir_deref_type_var)
+      return NULL;
+   else
+      return nir_src_as_deref(instr->parent);
+}
+
+static inline nir_variable *
+nir_deref_instr_get_variable(const nir_deref_instr *instr)
+{
+   while (instr->deref_type != nir_deref_type_var) {
+      if (instr->deref_type == nir_deref_type_cast)
+         return NULL;
+
+      instr = nir_deref_instr_parent(instr);
+   }
+
+   return instr->var;
+}
+
+bool nir_deref_instr_has_indirect(nir_deref_instr *instr);
+
+bool nir_deref_instr_remove_if_unused(nir_deref_instr *instr);
+
+typedef struct {
+   nir_instr instr;
 
    struct nir_function *callee;
+
+   unsigned num_params;
+   nir_src params[];
 } nir_call_instr;
 
 #include "nir_intrinsics.h"
@@ -1012,11 +1076,15 @@
 
    int const_index[NIR_INTRINSIC_MAX_CONST_INDEX];
 
-   nir_deref_var *variables[2];
-
    nir_src src[];
 } nir_intrinsic_instr;
 
+static inline nir_variable *
+nir_intrinsic_get_var(nir_intrinsic_instr *intrin, unsigned i)
+{
+   return nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[i]));
+}
+
 /**
  * \name NIR intrinsics semantic flags
  *
@@ -1104,11 +1172,16 @@
     */
    NIR_INTRINSIC_CLUSTER_SIZE = 11,
 
+   /**
+    * Parameter index for a load_param intrinsic
+    */
+   NIR_INTRINSIC_PARAM_IDX = 12,
+
    NIR_INTRINSIC_NUM_INDEX_FLAGS,
 
 } nir_intrinsic_index_flag;
 
-#define NIR_INTRINSIC_MAX_INPUTS 4
+#define NIR_INTRINSIC_MAX_INPUTS 5
 
 typedef struct {
    const char *name;
@@ -1131,9 +1204,6 @@
     */
    unsigned dest_components;
 
-   /** the number of inputs/outputs that are variables */
-   unsigned num_variables;
-
    /** the number of constant indices used by the intrinsic */
    unsigned num_indices;
 
@@ -1196,6 +1266,7 @@
 INTRINSIC_IDX_ACCESSORS(interp_mode, INTERP_MODE, unsigned)
 INTRINSIC_IDX_ACCESSORS(reduction_op, REDUCTION_OP, unsigned)
 INTRINSIC_IDX_ACCESSORS(cluster_size, CLUSTER_SIZE, unsigned)
+INTRINSIC_IDX_ACCESSORS(param_idx, PARAM_IDX, unsigned)
 
 /**
  * \group texture information
@@ -1215,6 +1286,8 @@
    nir_tex_src_ms_mcs, /* MSAA compression value */
    nir_tex_src_ddx,
    nir_tex_src_ddy,
+   nir_tex_src_texture_deref, /* < deref pointing to the texture */
+   nir_tex_src_sampler_deref, /* < deref pointing to the sampler */
    nir_tex_src_texture_offset, /* < dynamically uniform indirect offset */
    nir_tex_src_sampler_offset, /* < dynamically uniform indirect offset */
    nir_tex_src_plane,          /* < selects plane for planar textures */
@@ -1275,12 +1348,6 @@
    /** The size of the texture array or 0 if it's not an array */
    unsigned texture_array_size;
 
-   /** The texture deref
-    *
-    * If this is null, use texture_index instead.
-    */
-   nir_deref_var *texture;
-
    /** The sampler index
     *
     * The following operations do not require a sampler and, as such, this
@@ -1297,12 +1364,6 @@
     * then the sampler index is given by sampler_index + sampler_offset.
     */
    unsigned sampler_index;
-
-   /** The sampler deref
-    *
-    * If this is null, use sampler_index instead.
-    */
-   nir_deref_var *sampler;
 } nir_tex_instr;
 
 static inline unsigned
@@ -1377,6 +1438,30 @@
    }
 }
 
+static inline bool
+nir_alu_instr_is_comparison(const nir_alu_instr *instr)
+{
+   switch (instr->op) {
+   case nir_op_flt:
+   case nir_op_fge:
+   case nir_op_feq:
+   case nir_op_fne:
+   case nir_op_ilt:
+   case nir_op_ult:
+   case nir_op_ige:
+   case nir_op_uge:
+   case nir_op_ieq:
+   case nir_op_ine:
+   case nir_op_i2b:
+   case nir_op_f2b:
+   case nir_op_inot:
+   case nir_op_fnot:
+      return true;
+   default:
+      return false;
+   }
+}
+
 static inline nir_alu_type
 nir_tex_instr_src_type(const nir_tex_instr *instr, unsigned src)
 {
@@ -1725,13 +1810,6 @@
    /** list for all local variables in the function */
    struct exec_list locals;
 
-   /** array of variables used as parameters */
-   unsigned num_params;
-   nir_variable **params;
-
-   /** variable used to hold the result of the function */
-   nir_variable *return_var;
-
    /** list of local registers in the function */
    struct exec_list registers;
 
@@ -1842,15 +1920,9 @@
    return nir_cf_node_as_block(exec_node_data(nir_cf_node, tail, node));
 }
 
-typedef enum {
-   nir_parameter_in,
-   nir_parameter_out,
-   nir_parameter_inout,
-} nir_parameter_type;
-
 typedef struct {
-   nir_parameter_type param_type;
-   const struct glsl_type *type;
+   uint8_t num_components;
+   uint8_t bit_size;
 } nir_parameter;
 
 typedef struct nir_function {
@@ -1861,7 +1933,6 @@
 
    unsigned num_params;
    nir_parameter *params;
-   const struct glsl_type *return_type;
 
    /** The implementation of this function.
     *
@@ -1882,10 +1953,28 @@
    bool lower_fsqrt;
    bool lower_fmod32;
    bool lower_fmod64;
+   /** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */
    bool lower_bitfield_extract;
+   /** Lowers ibitfield_extract/ubitfield_extract to bfm, compares, shifts. */
+   bool lower_bitfield_extract_to_shifts;
+   /** Lowers bitfield_insert to bfi/bfm */
    bool lower_bitfield_insert;
+   /** Lowers bitfield_insert to bfm, compares, and shifts. */
+   bool lower_bitfield_insert_to_shifts;
+   /** Lowers bitfield_reverse to shifts. */
+   bool lower_bitfield_reverse;
+   /** Lowers bit_count to shifts. */
+   bool lower_bit_count;
+   /** Lowers bfm to shifts and subtracts. */
+   bool lower_bfm;
+   /** Lowers ifind_msb to compare and ufind_msb */
+   bool lower_ifind_msb;
+   /** Lowers find_lsb to ufind_msb and logic ops */
+   bool lower_find_lsb;
    bool lower_uadd_carry;
    bool lower_usub_borrow;
+   /** Lowers imul_high/umul_high to 16-bit multiplies and carry operations. */
+   bool lower_mul_high;
    /** lowers fneg and ineg to fsub and isub. */
    bool lower_negate;
    /** lowers fsub and isub to fadd+fneg and iadd+ineg. */
@@ -1897,6 +1986,9 @@
    /** enables rules to lower idiv by power-of-two: */
    bool lower_idiv;
 
+   /* lower b2f to iand */
+   bool lower_b2f;
+
    /* Does the native fdot instruction replicate its result for four
     * components?  If so, then opt_algebraic_late will turn all fdotN
     * instructions into fdot_replicatedN instructions.
@@ -1933,10 +2025,33 @@
    /* Indicates that the driver only has zero-based vertex id */
    bool vertex_id_zero_based;
 
+   /**
+    * If enabled, gl_BaseVertex will be lowered as:
+    * is_indexed_draw (~0/0) & firstvertex
+    */
+   bool lower_base_vertex;
+
+   /**
+    * If enabled, gl_HelperInvocation will be lowered as:
+    *
+    *   !((1 << sample_id) & sample_mask_in)
+    *
+    * This depends on hardware implementation details which may not hold
+    * for all hardware; in particular, that the FS is only executed for
+    * covered samples or for helper invocations.  So, do not blindly
+    * enable this option.
+    *
+    * Note: See also issue #22 in ARB_shader_image_load_store
+    */
+   bool lower_helper_invocation;
+
    bool lower_cs_local_index_from_id;
 
    bool lower_device_index_to_zero;
 
+   /* Set if nir_lower_wpos_ytransform() should also invert gl_PointCoord. */
+   bool lower_wpos_pntc;
+
    /**
     * Should nir_lower_io() create load_interpolated_input intrinsics?
     *
@@ -1996,6 +2111,14 @@
     * access plus one
     */
    unsigned num_inputs, num_uniforms, num_outputs, num_shared;
+
+   /** Constant data associated with this shader.
+    *
+    * Constant data is loaded through load_constant intrinsics.  See also
+    * nir_opt_large_constants.
+    */
+   void *constant_data;
+   unsigned constant_data_size;
 } nir_shader;
 
 static inline nir_function_impl *
@@ -2004,7 +2127,6 @@
    assert(exec_list_length(&shader->functions) == 1);
    struct exec_node *func_node = exec_list_get_head(&shader->functions);
    nir_function *func = exec_node_data(nir_function, func_node, node);
-   assert(func->return_type == glsl_void_type());
    assert(func->num_params == 0);
    assert(func->impl);
    return func->impl;
@@ -2066,6 +2188,9 @@
 /** creates an instruction with default swizzle/writemask/etc. with NULL registers */
 nir_alu_instr *nir_alu_instr_create(nir_shader *shader, nir_op op);
 
+nir_deref_instr *nir_deref_instr_create(nir_shader *shader,
+                                        nir_deref_type deref_type);
+
 nir_jump_instr *nir_jump_instr_create(nir_shader *shader, nir_jump_type type);
 
 nir_load_const_instr *nir_load_const_instr_create(nir_shader *shader,
@@ -2088,17 +2213,6 @@
                                                 unsigned num_components,
                                                 unsigned bit_size);
 
-nir_deref_var *nir_deref_var_create(void *mem_ctx, nir_variable *var);
-nir_deref_array *nir_deref_array_create(void *mem_ctx);
-nir_deref_struct *nir_deref_struct_create(void *mem_ctx, unsigned field_index);
-
-typedef bool (*nir_deref_foreach_leaf_cb)(nir_deref_var *deref, void *state);
-bool nir_deref_foreach_leaf(nir_deref_var *deref,
-                            nir_deref_foreach_leaf_cb cb, void *state);
-
-nir_load_const_instr *
-nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref);
-
 nir_const_value nir_alu_binop_identity(nir_op binop, unsigned bit_size);
 
 /**
@@ -2330,8 +2444,6 @@
 void nir_if_rewrite_condition(nir_if *if_stmt, nir_src new_src);
 void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest,
                             nir_dest new_dest);
-void nir_instr_rewrite_deref(nir_instr *instr, nir_deref_var **deref,
-                             nir_deref_var *new_deref);
 
 void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
                        unsigned num_components, unsigned bit_size,
@@ -2352,7 +2464,7 @@
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
                                     nir_instr *after_me);
 
-uint8_t nir_ssa_def_components_read(const nir_ssa_def *def);
+nir_component_mask_t nir_ssa_def_components_read(const nir_ssa_def *def);
 
 /*
  * finds the next basic block in source-code order, returns NULL if there is
@@ -2426,8 +2538,6 @@
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
 nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
 nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader);
-nir_deref *nir_deref_clone(const nir_deref *deref, void *mem_ctx);
-nir_deref_var *nir_deref_var_clone(const nir_deref_var *deref, void *mem_ctx);
 
 nir_shader *nir_shader_serialize_deserialize(void *mem_ctx, nir_shader *s);
 
@@ -2526,6 +2636,7 @@
 int nir_gs_count_vertices(const nir_shader *shader);
 
 bool nir_split_var_copies(nir_shader *shader);
+bool nir_split_per_member_structs(nir_shader *shader);
 
 bool nir_lower_returns_impl(nir_function_impl *impl);
 bool nir_lower_returns(nir_shader *shader);
@@ -2535,8 +2646,12 @@
 bool nir_propagate_invariant(nir_shader *shader);
 
 void nir_lower_var_copy_instr(nir_intrinsic_instr *copy, nir_shader *shader);
+void nir_lower_deref_copy_instr(struct nir_builder *b,
+                                nir_intrinsic_instr *copy);
 bool nir_lower_var_copies(nir_shader *shader);
 
+void nir_fixup_deref_modes(nir_shader *shader);
+
 bool nir_lower_global_vars_to_local(nir_shader *shader);
 
 bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes);
@@ -2573,19 +2688,22 @@
 
 bool nir_is_per_vertex_io(const nir_variable *var, gl_shader_stage stage);
 
-void nir_lower_io_types(nir_shader *shader);
 bool nir_lower_regs_to_ssa_impl(nir_function_impl *impl);
 bool nir_lower_regs_to_ssa(nir_shader *shader);
 bool nir_lower_vars_to_ssa(nir_shader *shader);
 
+bool nir_remove_dead_derefs(nir_shader *shader);
+bool nir_remove_dead_derefs_impl(nir_function_impl *impl);
 bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode modes);
 bool nir_lower_constant_initializers(nir_shader *shader,
                                      nir_variable_mode modes);
 
+bool nir_move_load_const(nir_shader *shader);
 bool nir_move_vec_src_uses_to_dest(nir_shader *shader);
 bool nir_lower_vec_to_movs(nir_shader *shader);
 void nir_lower_alpha_test(nir_shader *shader, enum compare_func func,
                           bool alpha_to_one);
+bool nir_lower_alu(nir_shader *shader);
 bool nir_lower_alu_to_scalar(nir_shader *shader);
 bool nir_lower_load_const_to_scalar(nir_shader *shader);
 bool nir_lower_read_invocation_to_scalar(nir_shader *shader);
@@ -2596,11 +2714,6 @@
 void nir_lower_io_to_scalar(nir_shader *shader, nir_variable_mode mask);
 void nir_lower_io_to_scalar_early(nir_shader *shader, nir_variable_mode mask);
 
-bool nir_lower_samplers(nir_shader *shader,
-                        const struct gl_shader_program *shader_program);
-bool nir_lower_samplers_as_deref(nir_shader *shader,
-                                 const struct gl_shader_program *shader_program);
-
 typedef struct nir_lower_subgroups_options {
    uint8_t subgroup_size;
    uint8_t ballot_bit_size;
@@ -2721,7 +2834,8 @@
 bool nir_lower_clamp_color_outputs(nir_shader *shader);
 
 void nir_lower_passthrough_edgeflags(nir_shader *shader);
-void nir_lower_tes_patch_vertices(nir_shader *tes, unsigned patch_vertices);
+bool nir_lower_patch_vertices(nir_shader *nir, unsigned static_count,
+                              const gl_state_index16 *uniform_state_tokens);
 
 typedef struct nir_lower_wpos_ytransform_options {
    gl_state_index16 state_tokens[STATE_LENGTH];
@@ -2755,14 +2869,17 @@
 
 void nir_lower_bitmap(nir_shader *shader, const nir_lower_bitmap_options *options);
 
-bool nir_lower_atomics(nir_shader *shader,
-                       const struct gl_shader_program *shader_program,
-                       bool use_binding_as_idx);
 bool nir_lower_atomics_to_ssbo(nir_shader *shader, unsigned ssbo_offset);
 bool nir_lower_to_source_mods(nir_shader *shader);
 
 bool nir_lower_gs_intrinsics(nir_shader *shader);
 
+typedef unsigned (*nir_lower_bit_size_callback)(const nir_alu_instr *, void *);
+
+bool nir_lower_bit_size(nir_shader *shader,
+                        nir_lower_bit_size_callback callback,
+                        void *callback_data);
+
 typedef enum {
    nir_lower_imul64 = (1 << 0),
    nir_lower_isign64 = (1 << 1),
@@ -2785,7 +2902,7 @@
 } nir_lower_doubles_options;
 
 bool nir_lower_doubles(nir_shader *shader, nir_lower_doubles_options options);
-bool nir_lower_64bit_pack(nir_shader *shader);
+bool nir_lower_pack(nir_shader *shader);
 
 bool nir_normalize_cubemap_coords(nir_shader *shader);
 
@@ -2809,6 +2926,7 @@
 
 bool nir_lower_phis_to_regs_block(nir_block *block);
 bool nir_lower_ssa_defs_to_regs_block(nir_block *block);
+bool nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl);
 
 bool nir_opt_algebraic(nir_shader *shader);
 bool nir_opt_algebraic_before_ffma(nir_shader *shader);
@@ -2833,6 +2951,10 @@
 
 bool nir_opt_intrinsics(nir_shader *shader);
 
+bool nir_opt_large_constants(nir_shader *shader,
+                             glsl_type_size_align_func size_align,
+                             unsigned threshold);
+
 bool nir_opt_loop_unroll(nir_shader *shader, nir_variable_mode indirect_mask);
 
 bool nir_opt_move_comparisons(nir_shader *shader);
@@ -2841,6 +2963,7 @@
 
 bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
 
+bool nir_opt_remove_phis_impl(nir_function_impl *impl);
 bool nir_opt_remove_phis(nir_shader *shader);
 
 bool nir_opt_shrink_load(nir_shader *shader);
diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index d6784df..a84c41a 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -25,6 +25,7 @@
 
 from __future__ import print_function
 import ast
+from collections import OrderedDict
 import itertools
 import struct
 import sys
@@ -55,7 +56,7 @@
    def __getitem__(self, name):
       if name not in self.names:
          assert not self.immutable, "Unknown replacement variable: " + name
-         self.names[name] = self.ids.next()
+         self.names[name] = next(self.ids)
 
       return self.names[name]
 
@@ -78,7 +79,7 @@
 static const ${val.c_type} ${val.name} = {
    { ${val.type_enum}, ${val.bit_size} },
 % if isinstance(val, Constant):
-   ${val.type()}, { ${hex(val)} /* ${val.value} */ },
+   ${val.type()}, { ${val.hex()} /* ${val.value} */ },
 % elif isinstance(val, Variable):
    ${val.index}, /* ${val.var_name} */
    ${'true' if val.is_constant else 'false'},
@@ -132,13 +133,22 @@
          assert self.bit_size == 0 or self.bit_size == 32
          self.bit_size = 32
 
-   def __hex__(self):
+   def hex(self):
       if isinstance(self.value, (bool)):
          return 'NIR_TRUE' if self.value else 'NIR_FALSE'
       if isinstance(self.value, (int, long)):
          return hex(self.value)
       elif isinstance(self.value, float):
-         return hex(struct.unpack('Q', struct.pack('d', self.value))[0])
+         i = struct.unpack('Q', struct.pack('d', self.value))[0]
+         h = hex(i)
+
+         # On Python 2 this 'L' suffix is automatically added, but not on
+         # Python 3.  Adding it explicitly makes the generated file identical,
+         # regardless of the Python version running this script.
+         if h[-1] != 'L' and i > sys.maxsize:
+            h += 'L'
+
+         return h
       else:
          assert False
 
@@ -467,7 +477,7 @@
 
 class SearchAndReplace(object):
    def __init__(self, transform):
-      self.id = _optimization_ids.next()
+      self.id = next(_optimization_ids)
 
       search = transform[0]
       replace = transform[1]
@@ -511,7 +521,7 @@
 
 #endif
 
-% for (opcode, xform_list) in xform_dict.iteritems():
+% for (opcode, xform_list) in xform_dict.items():
 % for xform in xform_list:
    ${xform.search.render()}
    ${xform.replace.render()}
@@ -601,7 +611,7 @@
 
 class AlgebraicPass(object):
    def __init__(self, pass_name, transforms):
-      self.xform_dict = {}
+      self.xform_dict = OrderedDict()
       self.pass_name = pass_name
 
       error = False
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 02a9dbf..8c883f2 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -361,7 +361,8 @@
     * scalar value was passed into a multiply with a vector).
     */
    for (unsigned i = 0; i < op_info->num_inputs; i++) {
-      for (unsigned j = instr->src[i].src.ssa->num_components; j < 4; j++) {
+      for (unsigned j = instr->src[i].src.ssa->num_components;
+           j < NIR_MAX_VEC_COMPONENTS; j++) {
          instr->src[i].swizzle[j] = instr->src[i].src.ssa->num_components - 1;
       }
    }
@@ -430,12 +431,13 @@
  * Construct an fmov or imov that reswizzles the source's components.
  */
 static inline nir_ssa_def *
-nir_swizzle(nir_builder *build, nir_ssa_def *src, const unsigned swiz[4],
+nir_swizzle(nir_builder *build, nir_ssa_def *src, const unsigned *swiz,
             unsigned num_components, bool use_fmov)
 {
+   assert(num_components <= NIR_MAX_VEC_COMPONENTS);
    nir_alu_src alu_src = { NIR_SRC_INIT };
    alu_src.src = nir_src_for_ssa(src);
-   for (unsigned i = 0; i < num_components; i++)
+   for (unsigned i = 0; i < num_components && i < NIR_MAX_VEC_COMPONENTS; i++)
       alu_src.swizzle[i] = swiz[i];
 
    return use_fmov ? nir_fmov_alu(build, alu_src, num_components) :
@@ -481,16 +483,15 @@
 static inline nir_ssa_def *
 nir_channel(nir_builder *b, nir_ssa_def *def, unsigned c)
 {
-   unsigned swizzle[4] = {c, c, c, c};
-   return nir_swizzle(b, def, swizzle, 1, false);
+   return nir_swizzle(b, def, &c, 1, false);
 }
 
 static inline nir_ssa_def *
-nir_channels(nir_builder *b, nir_ssa_def *def, unsigned mask)
+nir_channels(nir_builder *b, nir_ssa_def *def, nir_component_mask_t mask)
 {
-   unsigned num_channels = 0, swizzle[4] = { 0, 0, 0, 0 };
+   unsigned num_channels = 0, swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
 
-   for (unsigned i = 0; i < 4; i++) {
+   for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
       if ((mask & (1 << i)) == 0)
          continue;
       swizzle[num_channels++] = i;
@@ -526,7 +527,9 @@
 static inline nir_ssa_def *
 nir_ssa_for_alu_src(nir_builder *build, nir_alu_instr *instr, unsigned srcn)
 {
-   static uint8_t trivial_swizzle[4] = { 0, 1, 2, 3 };
+   static uint8_t trivial_swizzle[NIR_MAX_VEC_COMPONENTS];
+   for (int i = 0; i < NIR_MAX_VEC_COMPONENTS; ++i)
+      trivial_swizzle[i] = i;
    nir_alu_src *src = &instr->src[srcn];
    unsigned num_components = nir_ssa_alu_instr_src_components(instr, srcn);
 
@@ -538,6 +541,160 @@
    return nir_imov_alu(build, *src, num_components);
 }
 
+static inline nir_deref_instr *
+nir_build_deref_var(nir_builder *build, nir_variable *var)
+{
+   nir_deref_instr *deref =
+      nir_deref_instr_create(build->shader, nir_deref_type_var);
+
+   deref->mode = var->data.mode;
+   deref->type = var->type;
+   deref->var = var;
+
+   nir_ssa_dest_init(&deref->instr, &deref->dest, 1, 32, NULL);
+
+   nir_builder_instr_insert(build, &deref->instr);
+
+   return deref;
+}
+
+static inline nir_deref_instr *
+nir_build_deref_array(nir_builder *build, nir_deref_instr *parent,
+                      nir_ssa_def *index)
+{
+   assert(glsl_type_is_array(parent->type) ||
+          glsl_type_is_matrix(parent->type) ||
+          glsl_type_is_vector(parent->type));
+
+   nir_deref_instr *deref =
+      nir_deref_instr_create(build->shader, nir_deref_type_array);
+
+   deref->mode = parent->mode;
+   deref->type = glsl_get_array_element(parent->type);
+   deref->parent = nir_src_for_ssa(&parent->dest.ssa);
+   deref->arr.index = nir_src_for_ssa(index);
+
+   nir_ssa_dest_init(&deref->instr, &deref->dest,
+                     parent->dest.ssa.num_components,
+                     parent->dest.ssa.bit_size, NULL);
+
+   nir_builder_instr_insert(build, &deref->instr);
+
+   return deref;
+}
+
+static inline nir_deref_instr *
+nir_build_deref_array_wildcard(nir_builder *build, nir_deref_instr *parent)
+{
+   assert(glsl_type_is_array(parent->type) ||
+          glsl_type_is_matrix(parent->type));
+
+   nir_deref_instr *deref =
+      nir_deref_instr_create(build->shader, nir_deref_type_array_wildcard);
+
+   deref->mode = parent->mode;
+   deref->type = glsl_get_array_element(parent->type);
+   deref->parent = nir_src_for_ssa(&parent->dest.ssa);
+
+   nir_ssa_dest_init(&deref->instr, &deref->dest,
+                     parent->dest.ssa.num_components,
+                     parent->dest.ssa.bit_size, NULL);
+
+   nir_builder_instr_insert(build, &deref->instr);
+
+   return deref;
+}
+
+static inline nir_deref_instr *
+nir_build_deref_struct(nir_builder *build, nir_deref_instr *parent,
+                       unsigned index)
+{
+   assert(glsl_type_is_struct(parent->type));
+
+   nir_deref_instr *deref =
+      nir_deref_instr_create(build->shader, nir_deref_type_struct);
+
+   deref->mode = parent->mode;
+   deref->type = glsl_get_struct_field(parent->type, index);
+   deref->parent = nir_src_for_ssa(&parent->dest.ssa);
+   deref->strct.index = index;
+
+   nir_ssa_dest_init(&deref->instr, &deref->dest,
+                     parent->dest.ssa.num_components,
+                     parent->dest.ssa.bit_size, NULL);
+
+   nir_builder_instr_insert(build, &deref->instr);
+
+   return deref;
+}
+
+static inline nir_deref_instr *
+nir_build_deref_cast(nir_builder *build, nir_ssa_def *parent,
+                     nir_variable_mode mode, const struct glsl_type *type)
+{
+   nir_deref_instr *deref =
+      nir_deref_instr_create(build->shader, nir_deref_type_cast);
+
+   deref->mode = mode;
+   deref->type = type;
+   deref->parent = nir_src_for_ssa(parent);
+
+   nir_ssa_dest_init(&deref->instr, &deref->dest,
+                     parent->num_components, parent->bit_size, NULL);
+
+   nir_builder_instr_insert(build, &deref->instr);
+
+   return deref;
+}
+
+/** Returns a deref that follows another but starting from the given parent
+ *
+ * The new deref will be the same type and take the same array or struct index
+ * as the leader deref but it may have a different parent.  This is very
+ * useful for walking deref paths.
+ */
+static inline nir_deref_instr *
+nir_build_deref_follower(nir_builder *b, nir_deref_instr *parent,
+                         nir_deref_instr *leader)
+{
+   /* If the derefs would have the same parent, don't make a new one */
+   assert(leader->parent.is_ssa);
+   if (leader->parent.ssa == &parent->dest.ssa)
+      return leader;
+
+   UNUSED nir_deref_instr *leader_parent = nir_src_as_deref(leader->parent);
+
+   switch (leader->deref_type) {
+   case nir_deref_type_var:
+      unreachable("A var dereference cannot have a parent");
+      break;
+
+   case nir_deref_type_array:
+   case nir_deref_type_array_wildcard:
+      assert(glsl_type_is_matrix(parent->type) ||
+             glsl_type_is_array(parent->type));
+      assert(glsl_get_length(parent->type) ==
+             glsl_get_length(leader_parent->type));
+
+      if (leader->deref_type == nir_deref_type_array) {
+         assert(leader->arr.index.is_ssa);
+         return nir_build_deref_array(b, parent, leader->arr.index.ssa);
+      } else {
+         return nir_build_deref_array_wildcard(b, parent);
+      }
+
+   case nir_deref_type_struct:
+      assert(glsl_type_is_struct(parent->type));
+      assert(glsl_get_length(parent->type) ==
+             glsl_get_length(leader_parent->type));
+
+      return nir_build_deref_struct(b, parent, leader->strct.index);
+
+   default:
+      unreachable("Invalid deref instruction type");
+   }
+}
+
 static inline nir_ssa_def *
 nir_load_reg(nir_builder *build, nir_register *reg)
 {
@@ -545,88 +702,76 @@
 }
 
 static inline nir_ssa_def *
-nir_load_var(nir_builder *build, nir_variable *var)
+nir_load_deref(nir_builder *build, nir_deref_instr *deref)
 {
-   const unsigned num_components = glsl_get_vector_elements(var->type);
-
    nir_intrinsic_instr *load =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
-   load->num_components = num_components;
-   load->variables[0] = nir_deref_var_create(load, var);
-   nir_ssa_dest_init(&load->instr, &load->dest, num_components,
-                     glsl_get_bit_size(var->type), NULL);
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_deref);
+   load->num_components = glsl_get_vector_elements(deref->type);
+   load->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components,
+                     glsl_get_bit_size(deref->type), NULL);
    nir_builder_instr_insert(build, &load->instr);
    return &load->dest.ssa;
 }
 
-static inline nir_ssa_def *
-nir_load_deref_var(nir_builder *build, nir_deref_var *deref)
+static inline void
+nir_store_deref(nir_builder *build, nir_deref_instr *deref,
+                nir_ssa_def *value, unsigned writemask)
 {
-   const struct glsl_type *type = nir_deref_tail(&deref->deref)->type;
-   const unsigned num_components = glsl_get_vector_elements(type);
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_deref);
+   store->num_components = glsl_get_vector_elements(deref->type);
+   store->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+   store->src[1] = nir_src_for_ssa(value);
+   nir_intrinsic_set_write_mask(store,
+                                writemask & ((1 << store->num_components) - 1));
+   nir_builder_instr_insert(build, &store->instr);
+}
 
-   nir_intrinsic_instr *load =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
-   load->num_components = num_components;
-   load->variables[0] = nir_deref_var_clone(deref, load);
-   nir_ssa_dest_init(&load->instr, &load->dest, num_components,
-                     glsl_get_bit_size(type), NULL);
-   nir_builder_instr_insert(build, &load->instr);
-   return &load->dest.ssa;
+static inline void
+nir_copy_deref(nir_builder *build, nir_deref_instr *dest, nir_deref_instr *src)
+{
+   nir_intrinsic_instr *copy =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_deref);
+   copy->src[0] = nir_src_for_ssa(&dest->dest.ssa);
+   copy->src[1] = nir_src_for_ssa(&src->dest.ssa);
+   nir_builder_instr_insert(build, &copy->instr);
+}
+
+static inline nir_ssa_def *
+nir_load_var(nir_builder *build, nir_variable *var)
+{
+   return nir_load_deref(build, nir_build_deref_var(build, var));
 }
 
 static inline void
 nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
               unsigned writemask)
 {
-   const unsigned num_components = glsl_get_vector_elements(var->type);
-
-   nir_intrinsic_instr *store =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
-   store->num_components = num_components;
-   nir_intrinsic_set_write_mask(store, writemask);
-   store->variables[0] = nir_deref_var_create(store, var);
-   store->src[0] = nir_src_for_ssa(value);
-   nir_builder_instr_insert(build, &store->instr);
-}
-
-static inline void
-nir_store_deref_var(nir_builder *build, nir_deref_var *deref,
-                    nir_ssa_def *value, unsigned writemask)
-{
-   const unsigned num_components =
-      glsl_get_vector_elements(nir_deref_tail(&deref->deref)->type);
-
-   nir_intrinsic_instr *store =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
-   store->num_components = num_components;
-   store->const_index[0] = writemask & ((1 << num_components) - 1);
-   store->variables[0] = nir_deref_var_clone(deref, store);
-   store->src[0] = nir_src_for_ssa(value);
-   nir_builder_instr_insert(build, &store->instr);
-}
-
-static inline void
-nir_copy_deref_var(nir_builder *build, nir_deref_var *dest, nir_deref_var *src)
-{
-   assert(nir_deref_tail(&dest->deref)->type ==
-          nir_deref_tail(&src->deref)->type);
-
-   nir_intrinsic_instr *copy =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
-   copy->variables[0] = nir_deref_var_clone(dest, copy);
-   copy->variables[1] = nir_deref_var_clone(src, copy);
-   nir_builder_instr_insert(build, &copy->instr);
+   nir_store_deref(build, nir_build_deref_var(build, var), value, writemask);
 }
 
 static inline void
 nir_copy_var(nir_builder *build, nir_variable *dest, nir_variable *src)
 {
-   nir_intrinsic_instr *copy =
-      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
-   copy->variables[0] = nir_deref_var_create(copy, dest);
-   copy->variables[1] = nir_deref_var_create(copy, src);
-   nir_builder_instr_insert(build, &copy->instr);
+   nir_copy_deref(build, nir_build_deref_var(build, dest),
+                         nir_build_deref_var(build, src));
+}
+
+static inline nir_ssa_def *
+nir_load_param(nir_builder *build, uint32_t param_idx)
+{
+   assert(param_idx < build->impl->function->num_params);
+   nir_parameter *param = &build->impl->function->params[param_idx];
+
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_param);
+   nir_intrinsic_set_param_idx(load, param_idx);
+   load->num_components = param->num_components;
+   nir_ssa_dest_init(&load->instr, &load->dest,
+                     param->num_components, param->bit_size, NULL);
+   nir_builder_instr_insert(build, &load->instr);
+   return &load->dest.ssa;
 }
 
 #include "nir_builder_opcodes.h"
diff --git a/src/compiler/nir/nir_builder_opcodes_h.py b/src/compiler/nir/nir_builder_opcodes_h.py
index 4a41e60..e600093 100644
--- a/src/compiler/nir/nir_builder_opcodes_h.py
+++ b/src/compiler/nir/nir_builder_opcodes_h.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 template = """\
 /* Copyright (C) 2015 Broadcom
@@ -33,7 +34,7 @@
    return ', '.join('src' + str(i) if i < num_srcs else 'NULL' for i in range(4))
 %>
 
-% for name, opcode in sorted(opcodes.iteritems()):
+% for name, opcode in sorted(opcodes.items()):
 static inline nir_ssa_def *
 nir_${name}(nir_builder *build, ${src_decl_list(opcode.num_inputs)})
 {
@@ -54,7 +55,7 @@
    return &load->dest.ssa;
 }
 
-% for name, opcode in filter(lambda v: v[1].sysval, sorted(INTR_OPCODES.iteritems())):
+% for name, opcode in filter(lambda v: v[1].sysval, sorted(INTR_OPCODES.items())):
 static inline nir_ssa_def *
 nir_${name}(nir_builder *build)
 {
@@ -68,4 +69,4 @@
 from nir_intrinsics import INTR_OPCODES
 from mako.template import Template
 
-print Template(template).render(opcodes=opcodes, INTR_OPCODES=INTR_OPCODES)
+print(Template(template).render(opcodes=opcodes, INTR_OPCODES=INTR_OPCODES))
diff --git a/src/compiler/nir/nir_builtin_builder.c b/src/compiler/nir/nir_builtin_builder.c
new file mode 100644
index 0000000..252a769
--- /dev/null
+++ b/src/compiler/nir/nir_builtin_builder.c
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2018 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builtin_builder.h"
+
+nir_ssa_def*
+nir_cross(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+   unsigned yzx[3] = { 1, 2, 0 };
+   unsigned zxy[3] = { 2, 0, 1 };
+
+   return nir_fsub(b, nir_fmul(b, nir_swizzle(b, x, yzx, 3, true),
+                                  nir_swizzle(b, y, zxy, 3, true)),
+                      nir_fmul(b, nir_swizzle(b, x, zxy, 3, true),
+                                  nir_swizzle(b, y, yzx, 3, true)));
+}
+
+nir_ssa_def*
+nir_fast_length(nir_builder *b, nir_ssa_def *vec)
+{
+   switch (vec->num_components) {
+   case 1: return nir_fsqrt(b, nir_fmul(b, vec, vec));
+   case 2: return nir_fsqrt(b, nir_fdot2(b, vec, vec));
+   case 3: return nir_fsqrt(b, nir_fdot3(b, vec, vec));
+   case 4: return nir_fsqrt(b, nir_fdot4(b, vec, vec));
+   default:
+      unreachable("Invalid number of components");
+   }
+}
+
+nir_ssa_def*
+nir_smoothstep(nir_builder *b, nir_ssa_def *edge0, nir_ssa_def *edge1, nir_ssa_def *x)
+{
+   nir_ssa_def *f2 = nir_imm_floatN_t(b, 2.0, x->bit_size);
+   nir_ssa_def *f3 = nir_imm_floatN_t(b, 3.0, x->bit_size);
+
+   /* t = clamp((x - edge0) / (edge1 - edge0), 0, 1) */
+   nir_ssa_def *t =
+      nir_fsat(b, nir_fdiv(b, nir_fsub(b, x, edge0),
+                              nir_fsub(b, edge1, edge0)));
+
+   /* result = t * t * (3 - 2 * t) */
+   return nir_fmul(b, t, nir_fmul(b, t, nir_fsub(b, f3, nir_fmul(b, f2, t))));
+}
diff --git a/src/compiler/nir/nir_builtin_builder.h b/src/compiler/nir/nir_builtin_builder.h
new file mode 100644
index 0000000..0e5b9db
--- /dev/null
+++ b/src/compiler/nir/nir_builtin_builder.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright © 2018 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NIR_BUILTIN_BUILDER_H
+#define NIR_BUILTIN_BUILDER_H
+
+#include "nir/nir_builder.h"
+
+/*
+ * Functions are sorted alphabetically, ignoring any type prefix and the
+ * "fast" prefix.  Declarations of functions defined in the C file come first.
+ */
+
+nir_ssa_def* nir_cross(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y);
+nir_ssa_def* nir_fast_length(nir_builder *b, nir_ssa_def *vec);
+nir_ssa_def* nir_smoothstep(nir_builder *b, nir_ssa_def *edge0,
+                            nir_ssa_def *edge1, nir_ssa_def *x);
+
+static inline nir_ssa_def *
+nir_fclamp(nir_builder *b,
+           nir_ssa_def *x, nir_ssa_def *min_val, nir_ssa_def *max_val)
+{
+   return nir_fmin(b, nir_fmax(b, x, min_val), max_val);
+}
+
+static inline nir_ssa_def *
+nir_iclamp(nir_builder *b,
+           nir_ssa_def *x, nir_ssa_def *min_val, nir_ssa_def *max_val)
+{
+   return nir_imin(b, nir_imax(b, x, min_val), max_val);
+}
+
+static inline nir_ssa_def *
+nir_uclamp(nir_builder *b,
+           nir_ssa_def *x, nir_ssa_def *min_val, nir_ssa_def *max_val)
+{
+   return nir_umin(b, nir_umax(b, x, min_val), max_val);
+}
+
+static inline nir_ssa_def *
+nir_degrees(nir_builder *b, nir_ssa_def *val)
+{
+   return nir_fmul(b, val, nir_imm_float(b, 57.2957795131));
+}
+
+static inline nir_ssa_def *
+nir_fast_distance(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
+{
+   return nir_fast_length(b, nir_fsub(b, x, y));
+}
+
+static inline nir_ssa_def*
+nir_fast_normalize(nir_builder *b, nir_ssa_def *vec)
+{
+   return nir_fdiv(b, vec, nir_fast_length(b, vec));
+}
+
+static inline nir_ssa_def *
+nir_radians(nir_builder *b, nir_ssa_def *val)
+{
+   return nir_fmul(b, val, nir_imm_float(b, 0.01745329251));
+}
+
+#endif /* NIR_BUILTIN_BUILDER_H */
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index bcfdaa7..989c505 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -161,6 +161,14 @@
    }
    nvar->interface_type = var->interface_type;
 
+   nvar->num_members = var->num_members;
+   if (var->num_members) {
+      nvar->members = ralloc_array(nvar, struct nir_variable_data,
+                                   var->num_members);
+      memcpy(nvar->members, var->members,
+             var->num_members * sizeof(*var->members));
+   }
+
    return nvar;
 }
 
@@ -258,73 +266,6 @@
    }
 }
 
-static nir_deref *clone_deref(clone_state *state, const nir_deref *deref,
-                              nir_instr *ninstr, nir_deref *parent);
-
-static nir_deref_var *
-clone_deref_var(clone_state *state, const nir_deref_var *dvar,
-                nir_instr *ninstr)
-{
-   nir_variable *nvar = remap_var(state, dvar->var);
-   nir_deref_var *ndvar = nir_deref_var_create(ninstr, nvar);
-
-   if (dvar->deref.child)
-      ndvar->deref.child = clone_deref(state, dvar->deref.child,
-                                       ninstr, &ndvar->deref);
-
-   return ndvar;
-}
-
-static nir_deref_array *
-clone_deref_array(clone_state *state, const nir_deref_array *darr,
-                  nir_instr *ninstr, nir_deref *parent)
-{
-   nir_deref_array *ndarr = nir_deref_array_create(parent);
-
-   ndarr->deref.type = darr->deref.type;
-   if (darr->deref.child)
-      ndarr->deref.child = clone_deref(state, darr->deref.child,
-                                       ninstr, &ndarr->deref);
-
-   ndarr->deref_array_type = darr->deref_array_type;
-   ndarr->base_offset = darr->base_offset;
-   if (ndarr->deref_array_type == nir_deref_array_type_indirect)
-      __clone_src(state, ninstr, &ndarr->indirect, &darr->indirect);
-
-   return ndarr;
-}
-
-static nir_deref_struct *
-clone_deref_struct(clone_state *state, const nir_deref_struct *dstr,
-                   nir_instr *ninstr, nir_deref *parent)
-{
-   nir_deref_struct *ndstr = nir_deref_struct_create(parent, dstr->index);
-
-   ndstr->deref.type = dstr->deref.type;
-   if (dstr->deref.child)
-      ndstr->deref.child = clone_deref(state, dstr->deref.child,
-                                       ninstr, &ndstr->deref);
-
-   return ndstr;
-}
-
-static nir_deref *
-clone_deref(clone_state *state, const nir_deref *dref,
-            nir_instr *ninstr, nir_deref *parent)
-{
-   switch (dref->deref_type) {
-   case nir_deref_type_array:
-      return &clone_deref_array(state, nir_deref_as_array(dref),
-                                ninstr, parent)->deref;
-   case nir_deref_type_struct:
-      return &clone_deref_struct(state, nir_deref_as_struct(dref),
-                                 ninstr, parent)->deref;
-   default:
-      unreachable("bad deref type");
-      return NULL;
-   }
-}
-
 static nir_alu_instr *
 clone_alu(clone_state *state, const nir_alu_instr *alu)
 {
@@ -346,13 +287,52 @@
    return nalu;
 }
 
+static nir_deref_instr *
+clone_deref_instr(clone_state *state, const nir_deref_instr *deref)
+{
+   nir_deref_instr *nderef =
+      nir_deref_instr_create(state->ns, deref->deref_type);
+
+   __clone_dst(state, &nderef->instr, &nderef->dest, &deref->dest);
+
+   nderef->mode = deref->mode;
+   nderef->type = deref->type;
+
+   if (deref->deref_type == nir_deref_type_var) {
+      nderef->var = remap_var(state, deref->var);
+      return nderef;
+   }
+
+   __clone_src(state, &nderef->instr, &nderef->parent, &deref->parent);
+
+   switch (deref->deref_type) {
+   case nir_deref_type_struct:
+      nderef->strct.index = deref->strct.index;
+      break;
+
+   case nir_deref_type_array:
+      __clone_src(state, &nderef->instr,
+                  &nderef->arr.index, &deref->arr.index);
+      break;
+
+   case nir_deref_type_array_wildcard:
+   case nir_deref_type_cast:
+      /* Nothing to do */
+      break;
+
+   default:
+      unreachable("Invalid instruction deref type");
+   }
+
+   return nderef;
+}
+
 static nir_intrinsic_instr *
 clone_intrinsic(clone_state *state, const nir_intrinsic_instr *itr)
 {
    nir_intrinsic_instr *nitr =
       nir_intrinsic_instr_create(state->ns, itr->intrinsic);
 
-   unsigned num_variables = nir_intrinsic_infos[itr->intrinsic].num_variables;
    unsigned num_srcs = nir_intrinsic_infos[itr->intrinsic].num_srcs;
 
    if (nir_intrinsic_infos[itr->intrinsic].has_dest)
@@ -361,11 +341,6 @@
    nitr->num_components = itr->num_components;
    memcpy(nitr->const_index, itr->const_index, sizeof(nitr->const_index));
 
-   for (unsigned i = 0; i < num_variables; i++) {
-      nitr->variables[i] = clone_deref_var(state, itr->variables[i],
-                                           &nitr->instr);
-   }
-
    for (unsigned i = 0; i < num_srcs; i++)
       __clone_src(state, &nitr->instr, &nitr->src[i], &itr->src[i]);
 
@@ -418,13 +393,8 @@
    ntex->component = tex->component;
 
    ntex->texture_index = tex->texture_index;
-   if (tex->texture)
-      ntex->texture = clone_deref_var(state, tex->texture, &ntex->instr);
    ntex->texture_array_size = tex->texture_array_size;
-
    ntex->sampler_index = tex->sampler_index;
-   if (tex->sampler)
-      ntex->sampler = clone_deref_var(state, tex->sampler, &ntex->instr);
 
    return ntex;
 }
@@ -488,10 +458,7 @@
    nir_call_instr *ncall = nir_call_instr_create(state->ns, ncallee);
 
    for (unsigned i = 0; i < ncall->num_params; i++)
-      ncall->params[i] = clone_deref_var(state, call->params[i], &ncall->instr);
-
-   ncall->return_deref = clone_deref_var(state, call->return_deref,
-                                         &ncall->instr);
+      __clone_src(state, ncall, &ncall->params[i], &call->params[i]);
 
    return ncall;
 }
@@ -502,6 +469,8 @@
    switch (instr->type) {
    case nir_instr_type_alu:
       return &clone_alu(state, nir_instr_as_alu(instr))->instr;
+   case nir_instr_type_deref:
+      return &clone_deref_instr(state, nir_instr_as_deref(instr))->instr;
    case nir_instr_type_intrinsic:
       return &clone_intrinsic(state, nir_instr_as_intrinsic(instr))->instr;
    case nir_instr_type_load_const:
@@ -671,14 +640,6 @@
    clone_reg_list(state, &nfi->registers, &fi->registers);
    nfi->reg_alloc = fi->reg_alloc;
 
-   nfi->num_params = fi->num_params;
-   nfi->params = ralloc_array(state->ns, nir_variable *, fi->num_params);
-   for (unsigned i = 0; i < fi->num_params; i++) {
-      nfi->params[i] = clone_variable(state, fi->params[i]);
-   }
-   if (fi->return_var)
-      nfi->return_var = clone_variable(state, fi->return_var);
-
    assert(list_empty(&state->phi_srcs));
 
    clone_cf_list(state, &nfi->body, &fi->body);
@@ -720,8 +681,6 @@
    nfxn->params = ralloc_array(state->ns, nir_parameter, fxn->num_params);
    memcpy(nfxn->params, fxn->params, sizeof(nir_parameter) * fxn->num_params);
 
-   nfxn->return_type = fxn->return_type;
-
    /* At first glance, it looks like we should clone the function_impl here.
     * However, call instructions need to be able to reference at least the
     * function and those will get processed as we clone the function_impls.
@@ -775,6 +734,12 @@
    ns->num_outputs = s->num_outputs;
    ns->num_shared = s->num_shared;
 
+   ns->constant_data_size = s->constant_data_size;
+   if (s->constant_data_size > 0) {
+      ns->constant_data = ralloc_size(ns, s->constant_data_size);
+      memcpy(ns->constant_data, s->constant_data, s->constant_data_size);
+   }
+
    free_clone_state(&state);
 
    return ns;
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index db5bde2..118af9f 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 import re
 
@@ -386,7 +387,7 @@
    % endif
 </%def>
 
-% for name, op in sorted(opcodes.iteritems()):
+% for name, op in sorted(opcodes.items()):
 static nir_const_value
 evaluate_${name}(MAYBE_UNUSED unsigned num_components,
                  ${"UNUSED" if op_bit_sizes(op) is None else ""} unsigned bit_size,
@@ -419,7 +420,7 @@
                       unsigned bit_width, nir_const_value *src)
 {
    switch (op) {
-% for name in sorted(opcodes.iterkeys()):
+% for name in sorted(opcodes.keys()):
    case nir_op_${name}:
       return evaluate_${name}(num_components, bit_width, src);
 % endfor
@@ -431,8 +432,8 @@
 from nir_opcodes import opcodes
 from mako.template import Template
 
-print Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
+print(Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
                                 type_has_size=type_has_size,
                                 type_add_size=type_add_size,
                                 op_bit_sizes=op_bit_sizes,
-                                get_const_field=get_const_field)
+                                get_const_field=get_const_field))
diff --git a/src/compiler/nir/nir_control_flow.c b/src/compiler/nir/nir_control_flow.c
index 1622b35..0617c6a 100644
--- a/src/compiler/nir/nir_control_flow.c
+++ b/src/compiler/nir/nir_control_flow.c
@@ -444,6 +444,23 @@
    return nir_cf_node_as_loop(node);
 }
 
+static void
+remove_phi_src(nir_block *block, nir_block *pred)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_phi)
+         break;
+
+      nir_phi_instr *phi = nir_instr_as_phi(instr);
+      nir_foreach_phi_src_safe(src, phi) {
+         if (src->pred == pred) {
+            list_del(&src->src.use_link);
+            exec_node_remove(&src->node);
+         }
+      }
+   }
+}
+
 /*
  * update the CFG after a jump instruction has been added to the end of a block
  */
@@ -454,6 +471,10 @@
    nir_instr *instr = nir_block_last_instr(block);
    nir_jump_instr *jump_instr = nir_instr_as_jump(instr);
 
+   if (block->successors[0])
+      remove_phi_src(block->successors[0], block);
+   if (block->successors[1])
+      remove_phi_src(block->successors[1], block);
    unlink_block_successors(block);
 
    nir_function_impl *impl = nir_cf_node_get_function(&block->cf_node);
@@ -477,23 +498,6 @@
    }
 }
 
-static void
-remove_phi_src(nir_block *block, nir_block *pred)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_phi)
-         break;
-
-      nir_phi_instr *phi = nir_instr_as_phi(instr);
-      nir_foreach_phi_src_safe(src, phi) {
-         if (src->pred == pred) {
-            list_del(&src->src.use_link);
-            exec_node_remove(&src->node);
-         }
-      }
-   }
-}
-
 /* Removes the successor of a block with a jump. Note that the jump to be
  * eliminated may be free-floating.
  */
diff --git a/src/compiler/nir/nir_deref.c b/src/compiler/nir/nir_deref.c
new file mode 100644
index 0000000..6f788ad
--- /dev/null
+++ b/src/compiler/nir/nir_deref.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_deref.h"
+#include "util/hash_table.h"
+
+void
+nir_deref_path_init(nir_deref_path *path,
+                    nir_deref_instr *deref, void *mem_ctx)
+{
+   assert(deref != NULL);
+
+   /* The length of the short path is at most ARRAY_SIZE - 1 because we need
+    * room for the NULL terminator.
+    */
+   static const int max_short_path_len = ARRAY_SIZE(path->_short_path) - 1;
+
+   int count = 0;
+
+   nir_deref_instr **tail = &path->_short_path[max_short_path_len];
+   nir_deref_instr **head = tail;
+
+   *tail = NULL;
+   for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
+      count++;
+      if (count <= max_short_path_len)
+         *(--head) = d;
+   }
+
+   if (count <= max_short_path_len) {
+      /* If we're under max_short_path_len, just use the short path. */
+      path->path = head;
+      goto done;
+   }
+
+#ifndef NDEBUG
+   /* Just in case someone uses short_path by accident */
+   for (unsigned i = 0; i < ARRAY_SIZE(path->_short_path); i++)
+      path->_short_path[i] = (void *)0xdeadbeef;
+#endif
+
+   path->path = ralloc_array(mem_ctx, nir_deref_instr *, count + 1);
+   head = tail = path->path + count;
+   *tail = NULL;
+   for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d))
+      *(--head) = d;
+
+done:
+   assert(head == path->path);
+   assert(tail == head + count);
+   assert((*head)->deref_type == nir_deref_type_var);
+   assert(*tail == NULL);
+}
+
+void
+nir_deref_path_finish(nir_deref_path *path)
+{
+   if (path->path < &path->_short_path[0] ||
+       path->path > &path->_short_path[ARRAY_SIZE(path->_short_path) - 1])
+      ralloc_free(path->path);
+}
+
+/**
+ * Recursively removes unused deref instructions
+ */
+bool
+nir_deref_instr_remove_if_unused(nir_deref_instr *instr)
+{
+   bool progress = false;
+
+   for (nir_deref_instr *d = instr; d; d = nir_deref_instr_parent(d)) {
+      /* If anyone is using this deref, leave it alone */
+      assert(d->dest.is_ssa);
+      if (!list_empty(&d->dest.ssa.uses))
+         break;
+
+      nir_instr_remove(&d->instr);
+      progress = true;
+   }
+
+   return progress;
+}
+
+bool
+nir_deref_instr_has_indirect(nir_deref_instr *instr)
+{
+   while (instr->deref_type != nir_deref_type_var) {
+      /* Consider casts to be indirects */
+      if (instr->deref_type == nir_deref_type_cast)
+         return true;
+
+      if (instr->deref_type == nir_deref_type_array &&
+          !nir_src_as_const_value(instr->arr.index))
+         return true;
+
+      instr = nir_deref_instr_parent(instr);
+   }
+
+   return false;
+}
+
+static unsigned
+type_get_array_stride(const struct glsl_type *elem_type,
+                      glsl_type_size_align_func size_align)
+{
+   unsigned elem_size, elem_align;
+   size_align(elem_type, &elem_size, &elem_align); /* caller-chosen layout */
+   return ALIGN_POT(elem_size, elem_align);
+}
+
+static unsigned
+struct_type_get_field_offset(const struct glsl_type *struct_type,
+                             glsl_type_size_align_func size_align,
+                             unsigned field_idx)
+{
+   assert(glsl_type_is_struct(struct_type));
+   unsigned offset = 0;
+   for (unsigned i = 0; i <= field_idx; i++) {
+      unsigned elem_size, elem_align;
+      size_align(glsl_get_struct_field(struct_type, i),
+                 &elem_size, &elem_align); /* caller's layout, not natural */
+      offset = ALIGN_POT(offset, elem_align);
+      if (i < field_idx)
+         offset += elem_size;
+   }
+   return offset;
+}
+
+unsigned
+nir_deref_instr_get_const_offset(nir_deref_instr *deref,
+                                 glsl_type_size_align_func size_align)
+{
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+
+   unsigned offset = 0;
+   for (nir_deref_instr **p = &path.path[1]; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_array) {
+         offset += nir_src_as_const_value((*p)->arr.index)->u32[0] *
+                   type_get_array_stride((*p)->type, size_align);
+      } else if ((*p)->deref_type == nir_deref_type_struct) {
+         /* p starts at path[1], so this is safe */
+         nir_deref_instr *parent = *(p - 1);
+         offset += struct_type_get_field_offset(parent->type, size_align,
+                                                (*p)->strct.index);
+      } else {
+         unreachable("Unsupported deref type");
+      }
+   }
+
+   nir_deref_path_finish(&path);
+
+   return offset;
+}
+
+nir_ssa_def *
+nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref,
+                       glsl_type_size_align_func size_align)
+{
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+
+   nir_ssa_def *offset = nir_imm_int(b, 0);
+   for (nir_deref_instr **p = &path.path[1]; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_array) {
+         nir_ssa_def *index = nir_ssa_for_src(b, (*p)->arr.index, 1);
+         nir_ssa_def *stride =
+            nir_imm_int(b, type_get_array_stride((*p)->type, size_align));
+         offset = nir_iadd(b, offset, nir_imul(b, index, stride));
+      } else if ((*p)->deref_type == nir_deref_type_struct) {
+         /* p starts at path[1], so p - 1 is valid; add this field's offset */
+         nir_deref_instr *parent = *(p - 1);
+         unsigned field_offset =
+            struct_type_get_field_offset(parent->type, size_align,
+                                         (*p)->strct.index);
+         offset = nir_iadd(b, offset, nir_imm_int(b, field_offset));
+      } else {
+         unreachable("Unsupported deref type");
+      }
+   }
+
+   nir_deref_path_finish(&path);
+
+   return offset;
+}
+
+bool
+nir_remove_dead_derefs_impl(nir_function_impl *impl)
+{
+   bool progress = false;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type == nir_instr_type_deref &&
+             nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr)))
+            progress = true;
+      }
+   }
+
+   if (progress)
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+
+   return progress;
+}
+
+bool
+nir_remove_dead_derefs(nir_shader *shader)
+{
+   bool progress = false;
+   nir_foreach_function(function, shader) {
+      if (function->impl && nir_remove_dead_derefs_impl(function->impl))
+         progress = true;
+   }
+
+   return progress;
+}
+
+void
+nir_fixup_deref_modes(nir_shader *shader)
+{
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_deref)
+               continue;
+
+            nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+            nir_variable_mode parent_mode;
+            if (deref->deref_type == nir_deref_type_var) {
+               parent_mode = deref->var->data.mode;
+            } else {
+               assert(deref->parent.is_ssa);
+               nir_deref_instr *parent =
+                  nir_instr_as_deref(deref->parent.ssa->parent_instr);
+               parent_mode = parent->mode;
+            }
+
+            deref->mode = parent_mode;
+         }
+      }
+   }
+}
+
+struct rematerialize_deref_state {
+   bool progress;
+   nir_builder builder;
+   nir_block *block;
+   struct hash_table *cache;
+};
+
+static nir_deref_instr *
+rematerialize_deref_in_block(nir_deref_instr *deref,
+                             struct rematerialize_deref_state *state)
+{
+   if (deref->instr.block == state->block)
+      return deref;
+
+   if (!state->cache) {
+      state->cache = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
+                                             _mesa_key_pointer_equal);
+   }
+
+   struct hash_entry *cached = _mesa_hash_table_search(state->cache, deref);
+   if (cached)
+      return cached->data;
+
+   nir_builder *b = &state->builder;
+   nir_deref_instr *new_deref =
+      nir_deref_instr_create(b->shader, deref->deref_type);
+   new_deref->mode = deref->mode;
+   new_deref->type = deref->type;
+
+   if (deref->deref_type == nir_deref_type_var) {
+      new_deref->var = deref->var;
+   } else {
+      nir_deref_instr *parent = nir_src_as_deref(deref->parent);
+      if (parent) {
+         parent = rematerialize_deref_in_block(parent, state);
+         new_deref->parent = nir_src_for_ssa(&parent->dest.ssa);
+      } else {
+         nir_src_copy(&new_deref->parent, &deref->parent, new_deref);
+      }
+   }
+
+   switch (deref->deref_type) {
+   case nir_deref_type_var:
+   case nir_deref_type_array_wildcard:
+   case nir_deref_type_cast:
+      /* Nothing more to do */
+      break;
+
+   case nir_deref_type_array:
+      assert(!nir_src_as_deref(deref->arr.index));
+      nir_src_copy(&new_deref->arr.index, &deref->arr.index, new_deref);
+      break;
+
+   case nir_deref_type_struct:
+      new_deref->strct.index = deref->strct.index;
+      break;
+
+   default:
+      unreachable("Invalid deref instruction type");
+   }
+
+   nir_ssa_dest_init(&new_deref->instr, &new_deref->dest,
+                     deref->dest.ssa.num_components, deref->dest.ssa.bit_size,
+                     deref->dest.ssa.name);
+   nir_builder_instr_insert(b, &new_deref->instr);
+   /* Record the copy so later uses of deref in this block can share it. */
+   _mesa_hash_table_insert(state->cache, deref, new_deref);
+   return new_deref;
+}
+
+static bool
+rematerialize_deref_src(nir_src *src, void *_state)
+{
+   struct rematerialize_deref_state *state = _state;
+
+   nir_deref_instr *deref = nir_src_as_deref(*src);
+   if (!deref)
+      return true;
+
+   nir_deref_instr *block_deref = rematerialize_deref_in_block(deref, state);
+   if (block_deref != deref) {
+      nir_instr_rewrite_src(src->parent_instr, src,
+                            nir_src_for_ssa(&block_deref->dest.ssa));
+      nir_deref_instr_remove_if_unused(deref);
+      state->progress = true;
+   }
+
+   return true;
+}
+
+/** Re-materialize derefs in every block
+ *
+ * This pass re-materializes each deref instruction in every block in which
+ * it is used.  After this pass has been run, every use of a deref will be of
+ * a deref in the same block as the use.  Also, all unused derefs will be
+ * deleted as a side-effect.
+ */
+bool
+nir_rematerialize_derefs_in_use_blocks_impl(nir_function_impl *impl)
+{
+   struct rematerialize_deref_state state = { 0 };
+   nir_builder_init(&state.builder, impl);
+
+   nir_foreach_block(block, impl) {
+      state.block = block;
+
+      /* Start each block with a fresh cache */
+      if (state.cache)
+         _mesa_hash_table_clear(state.cache, NULL);
+
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type == nir_instr_type_deref) {
+            nir_deref_instr_remove_if_unused(nir_instr_as_deref(instr));
+            continue;
+         }
+
+         state.builder.cursor = nir_before_instr(instr);
+         nir_foreach_src(instr, rematerialize_deref_src, &state);
+      }
+
+#ifndef NDEBUG
+      nir_if *following_if = nir_block_get_following_if(block);
+      if (following_if)
+         assert(!nir_src_as_deref(following_if->condition));
+#endif
+   }
+
+   _mesa_hash_table_destroy(state.cache, NULL);
+
+   return state.progress;
+}
diff --git a/src/compiler/nir/nir_deref.h b/src/compiler/nir/nir_deref.h
new file mode 100644
index 0000000..6f4141a
--- /dev/null
+++ b/src/compiler/nir/nir_deref.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef NIR_DEREF_H
+#define NIR_DEREF_H
+
+#include "nir.h"
+#include "nir_builder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+   /** Short path so we can keep it on the stack most of the time. */
+   nir_deref_instr *_short_path[7];
+
+   /** A null-terminated array view of a deref chain
+    *
+    * The first element of this array will be the variable dereference
+    * followed by every deref_instr on the path to the final one.  The last
+    * element in the array is a NULL pointer which acts as a terminator.
+    */
+   nir_deref_instr **path;
+} nir_deref_path;
+
+void nir_deref_path_init(nir_deref_path *path,
+                         nir_deref_instr *deref, void *mem_ctx);
+void nir_deref_path_finish(nir_deref_path *path);
+
+unsigned nir_deref_instr_get_const_offset(nir_deref_instr *deref,
+                                          glsl_type_size_align_func size_align);
+
+nir_ssa_def *nir_build_deref_offset(nir_builder *b, nir_deref_instr *deref,
+                                    glsl_type_size_align_func size_align);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* NIR_DEREF_H */
diff --git a/src/compiler/nir/nir_format_convert.h b/src/compiler/nir/nir_format_convert.h
new file mode 100644
index 0000000..e5cc653
--- /dev/null
+++ b/src/compiler/nir/nir_format_convert.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_builder.h"
+
+#include "util/format_rgb9e5.h"
+
+static inline nir_ssa_def *
+nir_shift(nir_builder *b, nir_ssa_def *value, int left_shift)
+{
+   if (left_shift > 0)
+      return nir_ishl(b, value, nir_imm_int(b, left_shift));
+   else if (left_shift < 0)
+      return nir_ushr(b, value, nir_imm_int(b, -left_shift));
+   else
+      return value;
+}
+
+static inline nir_ssa_def *
+nir_mask_shift(struct nir_builder *b, nir_ssa_def *src,
+               uint32_t mask, int left_shift)
+{
+   return nir_shift(b, nir_iand(b, src, nir_imm_int(b, mask)), left_shift);
+}
+
+static inline nir_ssa_def *
+nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
+                  uint32_t src_mask, int src_left_shift)
+{
+   return nir_ior(b, nir_mask_shift(b, src, src_mask, src_left_shift), dst);
+}
+
+static inline nir_ssa_def *
+nir_format_unpack_uint(nir_builder *b, nir_ssa_def *packed,
+                       const unsigned *bits, unsigned num_components)
+{
+   assert(num_components >= 1 && num_components <= 4);
+   nir_ssa_def *comps[4];
+
+   if (bits[0] >= packed->bit_size) {
+      assert(bits[0] == packed->bit_size);
+      assert(num_components == 1);
+      return packed;
+   }
+
+   unsigned offset = 0;
+   for (unsigned i = 0; i < num_components; i++) {
+      assert(bits[i] < 32);
+      nir_ssa_def *mask = nir_imm_int(b, (1u << bits[i]) - 1);
+      comps[i] = nir_iand(b, nir_shift(b, packed, -offset), mask);
+      offset += bits[i];
+   }
+   assert(offset <= packed->bit_size);
+
+   return nir_vec(b, comps, num_components);
+}
+
+static inline nir_ssa_def *
+nir_format_pack_uint_unmasked(nir_builder *b, nir_ssa_def *color,
+                              const unsigned *bits, unsigned num_components)
+{
+   assert(num_components >= 1 && num_components <= 4);
+   nir_ssa_def *packed = nir_imm_int(b, 0);
+   unsigned offset = 0;
+   for (unsigned i = 0; i < num_components; i++) {
+      packed = nir_ior(b, packed, nir_shift(b, nir_channel(b, color, i),
+                                               offset));
+      offset += bits[i];
+   }
+   assert(offset <= packed->bit_size);
+
+   return packed;
+}
+
+static inline nir_ssa_def *
+nir_format_pack_uint(nir_builder *b, nir_ssa_def *color,
+                     const unsigned *bits, unsigned num_components)
+{
+   nir_const_value mask;
+   for (unsigned i = 0; i < num_components; i++) {
+      assert(bits[i] < 32);
+      mask.u32[i] = (1u << bits[i]) - 1;
+   }
+   nir_ssa_def *mask_imm = nir_build_imm(b, num_components, 32, mask);
+
+   return nir_format_pack_uint_unmasked(b, nir_iand(b, color, mask_imm),
+                                        bits, num_components);
+}
+
+static inline nir_ssa_def *
+nir_format_bitcast_uint_vec_unmasked(nir_builder *b, nir_ssa_def *src,
+                                     unsigned src_bits, unsigned dst_bits)
+{
+   assert(src_bits == 8 || src_bits == 16 || src_bits == 32);
+   assert(dst_bits == 8 || dst_bits == 16 || dst_bits == 32);
+
+   if (src_bits == dst_bits)
+      return src;
+
+   const unsigned dst_components =
+      DIV_ROUND_UP(src->num_components * src_bits, dst_bits);
+   assert(dst_components <= 4);
+
+   nir_ssa_def *dst_chan[4] = {0};
+   if (dst_bits > src_bits) {
+      unsigned shift = 0;
+      unsigned dst_idx = 0;
+      for (unsigned i = 0; i < src->num_components; i++) {
+         nir_ssa_def *shifted = nir_ishl(b, nir_channel(b, src, i),
+                                            nir_imm_int(b, shift));
+         if (shift == 0) {
+            dst_chan[dst_idx] = shifted;
+         } else {
+            dst_chan[dst_idx] = nir_ior(b, dst_chan[dst_idx], shifted);
+         }
+
+         shift += src_bits;
+         if (shift >= dst_bits) {
+            dst_idx++;
+            shift = 0;
+         }
+      }
+   } else {
+      nir_ssa_def *mask = nir_imm_int(b, ~0u >> (32 - dst_bits));
+
+      unsigned src_idx = 0;
+      unsigned shift = 0;
+      for (unsigned i = 0; i < dst_components; i++) {
+         dst_chan[i] = nir_iand(b, nir_ushr(b, nir_channel(b, src, src_idx),
+                                               nir_imm_int(b, shift)),
+                                   mask);
+         shift += dst_bits;
+         if (shift >= src_bits) {
+            src_idx++;
+            shift = 0;
+         }
+      }
+   }
+
+   return nir_vec(b, dst_chan, dst_components);
+}
+
+static inline nir_ssa_def *
+nir_format_linear_to_srgb(nir_builder *b, nir_ssa_def *c)
+{
+   nir_ssa_def *linear = nir_fmul(b, c, nir_imm_float(b, 12.92f));
+   nir_ssa_def *curved =
+      nir_fsub(b, nir_fmul(b, nir_imm_float(b, 1.055f),
+                              nir_fpow(b, c, nir_imm_float(b, 1.0 / 2.4))),
+                  nir_imm_float(b, 0.055f));
+
+   return nir_fsat(b, nir_bcsel(b, nir_flt(b, c, nir_imm_float(b, 0.0031308f)),
+                                   linear, curved));
+}
+
+static inline nir_ssa_def *
+nir_format_srgb_to_linear(nir_builder *b, nir_ssa_def *c)
+{
+   /* sRGB decode: c/12.92 up to the cutoff, ((c+0.055)/1.055)^2.4 above. */
+   nir_ssa_def *lo = nir_fdiv(b, c, nir_imm_float(b, 12.92f));
+   nir_ssa_def *hi =
+      nir_fpow(b, nir_fdiv(b, nir_fadd(b, c, nir_imm_float(b, 0.055f)),
+                              nir_imm_float(b, 1.055f)),
+                  nir_imm_float(b, 2.4f));
+   nir_ssa_def *is_lo = nir_fge(b, nir_imm_float(b, 0.04045f), c);
+   return nir_fsat(b, nir_bcsel(b, is_lo, lo, hi));
+}
+
+static inline nir_ssa_def *
+nir_format_unpack_11f11f10f(nir_builder *b, nir_ssa_def *packed)
+{
+   nir_ssa_def *rgb[3];
+   rgb[0] = nir_mask_shift(b, packed, 0x000007ff, 4);
+   rgb[1] = nir_mask_shift(b, packed, 0x003ff800, -7);
+   rgb[2] = nir_mask_shift(b, packed, 0xffc00000, -17);
+
+   /* Each isolated field is then decoded with unpack_half_2x16_split_x. */
+   for (unsigned c = 0; c < 3; ++c)
+      rgb[c] = nir_unpack_half_2x16_split_x(b, rgb[c]);
+   return nir_vec(b, rgb, 3);
+}
+
+static inline nir_ssa_def *
+nir_format_pack_r11g11b10f(nir_builder *b, nir_ssa_def *color)
+{
+   /* The packed floats carry no sign bit, so negatives are clamped to zero. */
+   nir_ssa_def *nonneg = nir_fmax(b, color, nir_imm_float(b, 0));
+
+   nir_ssa_def *pad = nir_ssa_undef(b, 1, color->bit_size);
+   nir_ssa_def *rg = nir_pack_half_2x16_split(b, nir_channel(b, nonneg, 0),
+                                                 nir_channel(b, nonneg, 1));
+   nir_ssa_def *bx = nir_pack_half_2x16_split(b, nir_channel(b, nonneg, 2),
+                                                 pad);
+
+   /* Half floats share the exponent layout of the 10- and 11-bit formats;
+    * those merely have fewer mantissa bits and lack a sign.  Packing is
+    * therefore just masking off the unwanted bits and shifting each field.
+    */
+   nir_ssa_def *result = nir_imm_int(b, 0);
+   result = nir_mask_shift_or(b, result, rg, 0x00007ff0, -4);
+   result = nir_mask_shift_or(b, result, rg, 0x7ff00000, -9);
+   result = nir_mask_shift_or(b, result, bx, 0x00007fe0, 17);
+
+   return result;
+}
+
+static inline nir_ssa_def *
+nir_format_pack_r9g9b9e5(nir_builder *b, nir_ssa_def *color)
+{
+   /* See also float3_to_rgb9e5 (presumably the source of the C quoted below) */
+
+   /* First, clamp every channel from above to MAX_RGB9E5. */
+   nir_ssa_def *clamped = nir_fmin(b, color, nir_imm_float(b, MAX_RGB9E5));
+
+   /* Unsigned bits above 0x7f800000 (+Inf) are negative or NaN: use 0 */
+   clamped = nir_bcsel(b, nir_ult(b, nir_imm_int(b, 0x7f800000), color),
+                          nir_imm_float(b, 0), clamped);
+
+   /* maxrgb.u = MAX3(rc.u, gc.u, bc.u); */
+   nir_ssa_def *maxu = nir_umax(b, nir_channel(b, clamped, 0),
+                       nir_umax(b, nir_channel(b, clamped, 1),
+                                   nir_channel(b, clamped, 2)));
+
+   /* maxrgb.u += maxrgb.u & (1 << (23-9)); */
+   maxu = nir_iadd(b, maxu, nir_iand(b, maxu, nir_imm_int(b, 1 << 14)));
+
+   /* exp_shared = MAX2((maxrgb.u >> 23), -RGB9E5_EXP_BIAS - 1 + 127) +
+    *              1 + RGB9E5_EXP_BIAS - 127;
+    */
+   nir_ssa_def *exp_shared =
+      nir_iadd(b, nir_umax(b, nir_ushr(b, maxu, nir_imm_int(b, 23)),
+                              nir_imm_int(b, -RGB9E5_EXP_BIAS - 1 + 127)),
+                  nir_imm_int(b, 1 + RGB9E5_EXP_BIAS - 127));
+
+   /* revdenom_biasedexp = 127 - (exp_shared - RGB9E5_EXP_BIAS -
+    *                             RGB9E5_MANTISSA_BITS) + 1;
+    */
+   nir_ssa_def *revdenom_biasedexp =
+      nir_isub(b, nir_imm_int(b, 127 + RGB9E5_EXP_BIAS +
+                                 RGB9E5_MANTISSA_BITS + 1),
+                  exp_shared);
+
+   /* revdenom.u = revdenom_biasedexp << 23; */
+   nir_ssa_def *revdenom =
+      nir_ishl(b, revdenom_biasedexp, nir_imm_int(b, 23));
+
+   /* rm = (int) (rc.f * revdenom.f);
+    * gm = (int) (gc.f * revdenom.f);
+    * bm = (int) (bc.f * revdenom.f);
+    */
+   nir_ssa_def *mantissa =
+      nir_f2i32(b, nir_fmul(b, clamped, revdenom));
+
+   /* rm = (rm & 1) + (rm >> 1);
+    * gm = (gm & 1) + (gm >> 1);
+    * bm = (bm & 1) + (bm >> 1);
+    */
+   mantissa = nir_iadd(b, nir_iand(b, mantissa, nir_imm_int(b, 1)),
+                          nir_ushr(b, mantissa, nir_imm_int(b, 1)));
+   /* Pack: mantissas shifted by 0, 9 and 18, shared exponent by 27. */
+   nir_ssa_def *packed = nir_channel(b, mantissa, 0);
+   packed = nir_mask_shift_or(b, packed, nir_channel(b, mantissa, 1), ~0, 9);
+   packed = nir_mask_shift_or(b, packed, nir_channel(b, mantissa, 2), ~0, 18);
+   packed = nir_mask_shift_or(b, packed, exp_shared, ~0, 27);
+
+   return packed;
+}
diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c
index 1aa3550..413807f 100644
--- a/src/compiler/nir/nir_from_ssa.c
+++ b/src/compiler/nir/nir_from_ssa.c
@@ -974,6 +974,12 @@
          mov->dest.dest = nir_dest_for_reg(reg);
          mov->dest.write_mask = (1 << reg->num_components) - 1;
          nir_instr_insert(nir_after_instr(&load->instr), &mov->instr);
+      } else if (instr->type == nir_instr_type_deref) {
+         /* Derefs should always be SSA values, don't rewrite them. */
+         nir_deref_instr *deref = nir_instr_as_deref(instr);
+         nir_foreach_use_safe(use, &deref->dest.ssa)
+            assert(use->parent_instr->block == block);
+         assert(list_empty(&deref->dest.ssa.if_uses));
       } else {
          nir_foreach_dest(instr, dest_replace_ssa_with_reg, &state);
       }
diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c
index a6a699a..d0b656e 100644
--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -113,23 +113,19 @@
 }
 
 static unsigned
-get_io_offset(nir_deref_var *deref, bool is_vertex_input)
+get_io_offset(nir_deref_instr *deref, bool is_vertex_input)
 {
    unsigned offset = 0;
 
-   nir_deref *tail = &deref->deref;
-   while (tail->child != NULL) {
-      tail = tail->child;
+   for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
+      if (d->deref_type == nir_deref_type_array) {
+         nir_const_value *const_index = nir_src_as_const_value(d->arr.index);
 
-      if (tail->deref_type == nir_deref_type_array) {
-         nir_deref_array *deref_array = nir_deref_as_array(tail);
-
-         if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+         if (!const_index)
             return -1;
-         }
 
-         offset += glsl_count_attribute_slots(tail->type, is_vertex_input) *
-            deref_array->base_offset;
+         offset += glsl_count_attribute_slots(d->type, is_vertex_input) *
+            const_index->u32[0];
       }
       /* TODO: we can get the offset for structs here see nir_lower_io() */
    }
@@ -145,9 +141,9 @@
  * occurs, then nothing will be marked and false will be returned.
  */
 static bool
-try_mask_partial_io(nir_shader *shader, nir_deref_var *deref, bool is_output_read)
+try_mask_partial_io(nir_shader *shader, nir_variable *var,
+                    nir_deref_instr *deref, bool is_output_read)
 {
-   nir_variable *var = deref->var;
    const struct glsl_type *type = var->type;
 
    if (nir_is_per_vertex_io(var, shader->info.stage)) {
@@ -219,7 +215,8 @@
 }
 
 static void
-gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader)
+gather_intrinsic_info(nir_intrinsic_instr *instr, nir_shader *shader,
+                      void *dead_ctx)
 {
    switch (instr->intrinsic) {
    case nir_intrinsic_discard:
@@ -228,21 +225,22 @@
       shader->info.fs.uses_discard = true;
       break;
 
-   case nir_intrinsic_interp_var_at_centroid:
-   case nir_intrinsic_interp_var_at_sample:
-   case nir_intrinsic_interp_var_at_offset:
-   case nir_intrinsic_load_var:
-   case nir_intrinsic_store_var: {
-      nir_variable *var = instr->variables[0]->var;
+   case nir_intrinsic_interp_deref_at_centroid:
+   case nir_intrinsic_interp_deref_at_sample:
+   case nir_intrinsic_interp_deref_at_offset:
+   case nir_intrinsic_load_deref:
+   case nir_intrinsic_store_deref:{
+      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+      nir_variable *var = nir_deref_instr_get_variable(deref);
 
       if (var->data.mode == nir_var_shader_in ||
           var->data.mode == nir_var_shader_out) {
          bool is_output_read = false;
          if (var->data.mode == nir_var_shader_out &&
-             instr->intrinsic == nir_intrinsic_load_var)
+             instr->intrinsic == nir_intrinsic_load_deref)
             is_output_read = true;
 
-         if (!try_mask_partial_io(shader, instr->variables[0], is_output_read))
+         if (!try_mask_partial_io(shader, var, deref, is_output_read))
             mark_whole_variable(shader, var, is_output_read);
 
          /* We need to track which input_reads bits correspond to a
@@ -266,6 +264,7 @@
    case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
    case nir_intrinsic_load_first_vertex:
+   case nir_intrinsic_load_is_indexed_draw:
    case nir_intrinsic_load_base_instance:
    case nir_intrinsic_load_instance_id:
    case nir_intrinsic_load_sample_id:
@@ -289,6 +288,11 @@
    case nir_intrinsic_end_primitive_with_counter:
       assert(shader->info.stage == MESA_SHADER_GEOMETRY);
       shader->info.gs.uses_end_primitive = 1;
+      /* fall through -- NOTE(review): looks intentional, please confirm */
+   case nir_intrinsic_emit_vertex:
+      if (nir_intrinsic_stream_id(instr) > 0)
+         shader->info.gs.uses_streams = true;
+
       break;
 
    default:
@@ -329,7 +333,7 @@
 }
 
 static void
-gather_info_block(nir_block *block, nir_shader *shader)
+gather_info_block(nir_block *block, nir_shader *shader, void *dead_ctx)
 {
    nir_foreach_instr(instr, block) {
       switch (instr->type) {
@@ -337,7 +341,7 @@
          gather_alu_info(nir_instr_as_alu(instr), shader);
          break;
       case nir_instr_type_intrinsic:
-         gather_intrinsic_info(nir_instr_as_intrinsic(instr), shader);
+         gather_intrinsic_info(nir_instr_as_intrinsic(instr), shader, dead_ctx);
          break;
       case nir_instr_type_tex:
          gather_tex_info(nir_instr_as_tex(instr), shader);
@@ -351,24 +355,56 @@
    }
 }
 
+/* Count the samplers in a type: array types multiply the element count by
+ * glsl_get_aoa_size(), struct types sum over their fields.
+ */
+static unsigned
+glsl_type_get_sampler_count(const struct glsl_type *type)
+{
+   if (glsl_type_is_array(type)) {
+      return (glsl_get_aoa_size(type) *
+              glsl_type_get_sampler_count(glsl_without_array(type)));
+   }
+
+   if (glsl_type_is_struct(type)) {
+      unsigned count = 0;
+      for (unsigned i = 0; i < glsl_get_length(type); i++)
+         count += glsl_type_get_sampler_count(glsl_get_struct_field(type, i));
+      return count;
+   }
+
+   return glsl_type_is_sampler(type) ? 1 : 0;
+}
+
+/* Count the images in a type: array types multiply the element count by
+ * glsl_get_aoa_size(), struct types sum over their fields.
+ */
+static unsigned
+glsl_type_get_image_count(const struct glsl_type *type)
+{
+   if (glsl_type_is_array(type)) {
+      return (glsl_get_aoa_size(type) *
+              glsl_type_get_image_count(glsl_without_array(type)));
+   }
+
+   if (glsl_type_is_struct(type)) {
+      unsigned count = 0;
+      for (unsigned i = 0; i < glsl_get_length(type); i++)
+         count += glsl_type_get_image_count(glsl_get_struct_field(type, i));
+      return count;
+   }
+
+   return glsl_type_is_image(type) ? 1 : 0;
+}
+
 void
 nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint)
 {
    shader->info.num_textures = 0;
    shader->info.num_images = 0;
    nir_foreach_variable(var, &shader->uniforms) {
-      const struct glsl_type *type = var->type;
-      unsigned count = 1;
-      if (glsl_type_is_array(type)) {
-         count = glsl_get_aoa_size(type);
-         type = glsl_without_array(type);
-      }
-
-      if (glsl_type_is_image(type)) {
-         shader->info.num_images += count;
-      } else if (glsl_type_is_sampler(type)) {
-         shader->info.num_textures += count;
-      }
+      shader->info.num_textures += glsl_type_get_sampler_count(var->type);
+      shader->info.num_images += glsl_type_get_image_count(var->type);
    }
 
    shader->info.inputs_read = 0;
@@ -385,7 +421,10 @@
    if (shader->info.stage == MESA_SHADER_FRAGMENT) {
       shader->info.fs.uses_sample_qualifier = false;
    }
+
+   void *dead_ctx = ralloc_context(NULL);
    nir_foreach_block(block, entrypoint) {
-      gather_info_block(block, shader);
+      gather_info_block(block, shader, dead_ctx);
    }
+   ralloc_free(dead_ctx);
 }
diff --git a/src/compiler/nir/nir_inline_functions.c b/src/compiler/nir/nir_inline_functions.c
index b91e7bc..06c90d9 100644
--- a/src/compiler/nir/nir_inline_functions.c
+++ b/src/compiler/nir/nir_inline_functions.c
@@ -24,126 +24,10 @@
 #include "nir.h"
 #include "nir_builder.h"
 #include "nir_control_flow.h"
+#include "nir_vla.h"
 
 static bool inline_function_impl(nir_function_impl *impl, struct set *inlined);
 
-static void
-convert_deref_to_param_deref(nir_instr *instr, nir_deref_var **deref,
-                             nir_call_instr *call)
-{
-   /* This isn't a parameter, just return the deref */
-   if ((*deref)->var->data.mode != nir_var_param)
-      return;
-
-   int param_idx = (*deref)->var->data.location;
-
-   nir_deref_var *call_deref;
-   if (param_idx >= 0) {
-      assert(param_idx < call->callee->num_params);
-      call_deref = call->params[param_idx];
-   } else {
-      call_deref = call->return_deref;
-   }
-   assert(call_deref);
-
-   /* Now we make a new deref by concatenating the deref in the call's
-    * parameter with the deref we were given.
-    */
-   nir_deref_var *new_deref = nir_deref_var_clone(call_deref, instr);
-   nir_deref *new_tail = nir_deref_tail(&new_deref->deref);
-   new_tail->child = (*deref)->deref.child;
-   ralloc_steal(new_tail, new_tail->child);
-   *deref = new_deref;
-}
-
-static void
-rewrite_param_derefs(nir_instr *instr, nir_call_instr *call)
-{
-   switch (instr->type) {
-   case nir_instr_type_intrinsic: {
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      for (unsigned i = 0;
-           i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
-         convert_deref_to_param_deref(instr, &intrin->variables[i], call);
-      }
-      break;
-   }
-
-   case nir_instr_type_tex: {
-      nir_tex_instr *tex = nir_instr_as_tex(instr);
-      if (tex->texture)
-         convert_deref_to_param_deref(&tex->instr, &tex->texture, call);
-      if (tex->sampler)
-         convert_deref_to_param_deref(&tex->instr, &tex->sampler, call);
-      break;
-   }
-
-   default:
-      break; /* Nothing else has derefs */
-   }
-}
-
-static void
-lower_param_to_local(nir_variable *param, nir_function_impl *impl, bool write)
-{
-   if (param->data.mode != nir_var_param)
-      return;
-
-   nir_parameter_type param_type;
-   if (param->data.location >= 0) {
-      assert(param->data.location < impl->num_params);
-      param_type = impl->function->params[param->data.location].param_type;
-   } else {
-      /* Return variable */
-      param_type = nir_parameter_out;
-   }
-
-   if ((write && param_type == nir_parameter_in) ||
-       (!write && param_type == nir_parameter_out)) {
-      /* In this case, we need a shadow copy.  Turn it into a local */
-      param->data.mode = nir_var_local;
-      exec_list_push_tail(&impl->locals, &param->node);
-   }
-}
-
-static bool
-lower_params_to_locals_block(nir_block *block, nir_function_impl *impl)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      switch (intrin->intrinsic) {
-      case nir_intrinsic_store_var:
-         lower_param_to_local(intrin->variables[0]->var, impl, true);
-         break;
-
-      case nir_intrinsic_copy_var:
-         lower_param_to_local(intrin->variables[0]->var, impl, true);
-         lower_param_to_local(intrin->variables[1]->var, impl, false);
-         break;
-
-      case nir_intrinsic_load_var:
-         /* All other intrinsics which access variables (image_load_store)
-          * do so in a read-only fasion.
-          */
-         for (unsigned i = 0;
-              i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
-            lower_param_to_local(intrin->variables[i]->var, impl, false);
-         }
-         break;
-
-      default:
-         continue;
-      }
-   }
-
-   return true;
-}
-
 static bool
 inline_functions_block(nir_block *block, nir_builder *b,
                        struct set *inlined)
@@ -171,69 +55,50 @@
          nir_function_impl_clone(call->callee->impl);
       callee_copy->function = call->callee;
 
-      /* Add copies of all in parameters */
-      assert(call->num_params == callee_copy->num_params);
-
       exec_list_append(&b->impl->locals, &callee_copy->locals);
       exec_list_append(&b->impl->registers, &callee_copy->registers);
 
       b->cursor = nir_before_instr(&call->instr);
 
-      /* We now need to tie the two functions together using the
-       * parameters.  There are two ways we do this: One is to turn the
-       * parameter into a local variable and do a shadow-copy.  The other
-       * is to treat the parameter as a "proxy" and rewrite derefs to use
-       * the actual variable that comes from the call instruction.  We
-       * implement both schemes.  The first is needed in the case where we
-       * have an in parameter that we write or similar.  The second case is
-       * needed for handling things such as images and uniforms properly.
+      /* Rewrite all of the uses of the callee's parameters to use the call
+       * instruction's sources.  In order to ensure that the "load" happens
+       * here and not later (for register sources), we make sure to convert it
+       * to an SSA value first.
        */
-
-      /* Figure out when we need to lower to a shadow local */
-      nir_foreach_block(block, callee_copy) {
-         lower_params_to_locals_block(block, callee_copy);
+      const unsigned num_params = call->num_params;
+      NIR_VLA(nir_ssa_def *, params, num_params);
+      for (unsigned i = 0; i < num_params; i++) {
+         params[i] = nir_ssa_for_src(b, call->params[i],
+                                     call->callee->params[i].num_components);
       }
 
-      for (unsigned i = 0; i < callee_copy->num_params; i++) {
-         nir_variable *param = callee_copy->params[i];
+      nir_foreach_block(block, callee_copy) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
 
-         if (param->data.mode == nir_var_local &&
-             call->callee->params[i].param_type != nir_parameter_out) {
-            nir_copy_deref_var(b, nir_deref_var_create(b->shader, param),
-                                  call->params[i]);
+            nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+            if (load->intrinsic != nir_intrinsic_load_param)
+               continue;
+
+            unsigned param_idx = nir_intrinsic_param_idx(load);
+            assert(param_idx < num_params);
+            assert(load->dest.is_ssa);
+            nir_ssa_def_rewrite_uses(&load->dest.ssa,
+                                     nir_src_for_ssa(params[param_idx]));
+
+            /* Remove any left-over load_param intrinsics because they're soon
+             * to be in another function and therefore no longer valid.
+             */
+            nir_instr_remove(&load->instr);
          }
       }
 
-      nir_foreach_block(block, callee_copy) {
-         nir_foreach_instr(instr, block)
-            rewrite_param_derefs(instr, call);
-      }
-
       /* Pluck the body out of the function and place it here */
       nir_cf_list body;
       nir_cf_list_extract(&body, &callee_copy->body);
       nir_cf_reinsert(&body, b->cursor);
 
-      b->cursor = nir_before_instr(&call->instr);
-
-      /* Add copies of all out parameters and the return */
-      assert(call->num_params == callee_copy->num_params);
-      for (unsigned i = 0; i < callee_copy->num_params; i++) {
-         nir_variable *param = callee_copy->params[i];
-
-         if (param->data.mode == nir_var_local &&
-             call->callee->params[i].param_type != nir_parameter_in) {
-            nir_copy_deref_var(b, call->params[i],
-                                  nir_deref_var_create(b->shader, param));
-         }
-      }
-      if (!glsl_type_is_void(call->callee->return_type) &&
-          callee_copy->return_var->data.mode == nir_var_local) {
-         nir_copy_deref_var(b, call->return_deref,
-                               nir_deref_var_create(b->shader,
-                                                    callee_copy->return_var));
-      }
-
       nir_instr_remove(&call->instr);
    }
 
diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c
index 9cb9ed4..19771fc 100644
--- a/src/compiler/nir/nir_instr_set.c
+++ b/src/compiler/nir/nir_instr_set.c
@@ -79,6 +79,40 @@
 }
 
 static uint32_t
+hash_deref(uint32_t hash, const nir_deref_instr *instr)
+{
+   hash = HASH(hash, instr->deref_type);
+   hash = HASH(hash, instr->mode);
+   hash = HASH(hash, instr->type);
+   /* Variable derefs are keyed by their variable; no parent is hashed. */
+   if (instr->deref_type == nir_deref_type_var)
+      return HASH(hash, instr->var);
+   /* Every other deref kind folds in its parent source. */
+   hash = hash_src(hash, &instr->parent);
+
+   switch (instr->deref_type) {
+   case nir_deref_type_struct:
+      hash = HASH(hash, instr->strct.index);
+      break;
+
+   case nir_deref_type_array:
+      hash = hash_src(hash, &instr->arr.index);
+      break;
+
+   case nir_deref_type_var:
+   case nir_deref_type_array_wildcard:
+   case nir_deref_type_cast:
+      /* No extra fields to hash here (var derefs already returned above) */
+      break;
+
+   default:
+      unreachable("Invalid instruction deref type");
+   }
+
+   return hash;
+}
+
+static uint32_t
 hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
 {
    hash = HASH(hash, instr->def.num_components);
@@ -131,8 +165,6 @@
       hash = HASH(hash, instr->dest.ssa.bit_size);
    }
 
-   assert(info->num_variables == 0);
-
    hash = _mesa_fnv32_1a_accumulate_block(hash, instr->const_index,
                                           info->num_indices
                                              * sizeof(instr->const_index[0]));
@@ -161,8 +193,6 @@
    hash = HASH(hash, instr->texture_array_size);
    hash = HASH(hash, instr->sampler_index);
 
-   assert(!instr->texture && !instr->sampler);
-
    return hash;
 }
 
@@ -182,6 +212,9 @@
    case nir_instr_type_alu:
       hash = hash_alu(hash, nir_instr_as_alu(instr));
       break;
+   case nir_instr_type_deref:
+      hash = hash_deref(hash, nir_instr_as_deref(instr));
+      break;
    case nir_instr_type_load_const:
       hash = hash_load_const(hash, nir_instr_as_load_const(instr));
       break;
@@ -289,6 +322,43 @@
       }
       return true;
    }
+   case nir_instr_type_deref: {
+      nir_deref_instr *deref1 = nir_instr_as_deref(instr1);
+      nir_deref_instr *deref2 = nir_instr_as_deref(instr2);
+
+      if (deref1->deref_type != deref2->deref_type ||
+          deref1->mode != deref2->mode ||
+          deref1->type != deref2->type)
+         return false;
+
+      if (deref1->deref_type == nir_deref_type_var)
+         return deref1->var == deref2->var;
+
+      if (!nir_srcs_equal(deref1->parent, deref2->parent))
+         return false;
+
+      switch (deref1->deref_type) {
+      case nir_deref_type_struct:
+         if (deref1->strct.index != deref2->strct.index)
+            return false;
+         break;
+
+      case nir_deref_type_array:
+         if (!nir_srcs_equal(deref1->arr.index, deref2->arr.index))
+            return false;
+         break;
+
+      case nir_deref_type_var:
+      case nir_deref_type_array_wildcard:
+      case nir_deref_type_cast:
+         /* Nothing to do */
+         break;
+
+      default:
+         unreachable("Invalid instruction deref type");
+      }
+      return true;
+   }
    case nir_instr_type_tex: {
       nir_tex_instr *tex1 = nir_instr_as_tex(instr1);
       nir_tex_instr *tex2 = nir_instr_as_tex(instr2);
@@ -317,10 +387,6 @@
          return false;
       }
 
-      /* Don't support un-lowered sampler derefs currently. */
-      assert(!tex1->texture && !tex1->sampler &&
-             !tex2->texture && !tex2->sampler);
-
       return true;
    }
    case nir_instr_type_load_const: {
@@ -379,8 +445,6 @@
             return false;
       }
 
-      assert(info->num_variables == 0);
-
       for (unsigned i = 0; i < info->num_indices; i++) {
          if (intrinsic1->const_index[i] != intrinsic2->const_index[i])
             return false;
@@ -396,7 +460,7 @@
       unreachable("Invalid instruction type");
    }
 
-   return false;
+   unreachable("All cases in the above switch should return");
 }
 
 static bool
@@ -430,24 +494,16 @@
 
    switch (instr->type) {
    case nir_instr_type_alu:
+   case nir_instr_type_deref:
+   case nir_instr_type_tex:
    case nir_instr_type_load_const:
    case nir_instr_type_phi:
       return true;
-   case nir_instr_type_tex: {
-      nir_tex_instr *tex = nir_instr_as_tex(instr);
-
-      /* Don't support un-lowered sampler derefs currently. */
-      if (tex->texture || tex->sampler)
-         return false;
-
-      return true;
-   }
    case nir_instr_type_intrinsic: {
       const nir_intrinsic_info *info =
          &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];
       return (info->flags & NIR_INTRINSIC_CAN_ELIMINATE) &&
-             (info->flags & NIR_INTRINSIC_CAN_REORDER) &&
-             info->num_variables == 0; /* not implemented yet */
+             (info->flags & NIR_INTRINSIC_CAN_REORDER);
    }
    case nir_instr_type_call:
    case nir_instr_type_jump:
@@ -468,6 +524,9 @@
    case nir_instr_type_alu:
       assert(nir_instr_as_alu(instr)->dest.dest.is_ssa);
       return &nir_instr_as_alu(instr)->dest.dest.ssa;
+   case nir_instr_type_deref:
+      assert(nir_instr_as_deref(instr)->dest.is_ssa);
+      return &nir_instr_as_deref(instr)->dest.ssa;
    case nir_instr_type_load_const:
       return &nir_instr_as_load_const(instr)->def;
    case nir_instr_type_phi:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index f26aaf3..d688a57 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -31,7 +31,7 @@
    """Class that represents all the information about an intrinsic opcode.
    NOTE: this must be kept in sync with nir_intrinsic_info.
    """
-   def __init__(self, name, src_components, dest_components, num_variables,
+   def __init__(self, name, src_components, dest_components,
                 indices, flags, sysval):
        """Parameters:
 
@@ -42,7 +42,6 @@
        - dest_components: number of destination components, -1 means no
          dest, 0 means number of components given in num_components field
          in nir_intrinsic_instr.
-       - num_variables: the number of variables
        - indices: list of constant indicies
        - flags: list of semantic flags
        - sysval: is this a system-value intrinsic
@@ -52,7 +51,6 @@
        if src_components:
            assert isinstance(src_components[0], int)
        assert isinstance(dest_components, int)
-       assert isinstance(num_variables, int)
        assert isinstance(indices, list)
        if indices:
            assert isinstance(indices[0], str)
@@ -66,7 +64,6 @@
        self.src_components = src_components
        self.has_dest = (dest_components >= 0)
        self.dest_components = dest_components
-       self.num_variables = num_variables
        self.num_indices = len(indices)
        self.indices = indices
        self.flags = flags
@@ -102,6 +99,8 @@
 REDUCTION_OP = "NIR_INTRINSIC_REDUCTION_OP"
 # Cluster size for reduction operations
 CLUSTER_SIZE = "NIR_INTRINSIC_CLUSTER_SIZE"
+# Parameter index for a load_param intrinsic
+PARAM_IDX = "NIR_INTRINSIC_PARAM_IDX"
 
 #
 # Possible flags:
@@ -112,29 +111,31 @@
 
 INTR_OPCODES = {}
 
-def intrinsic(name, src_comp=[], dest_comp=-1, num_vars=0, indices=[],
+def intrinsic(name, src_comp=[], dest_comp=-1, indices=[],
               flags=[], sysval=False):
     assert name not in INTR_OPCODES
-    INTR_OPCODES[name] = Intrinsic(name, src_comp, dest_comp, num_vars,
+    INTR_OPCODES[name] = Intrinsic(name, src_comp, dest_comp,
                                    indices, flags, sysval)
 
 intrinsic("nop", flags=[CAN_ELIMINATE])
 
-intrinsic("load_var", dest_comp=0, num_vars=1, flags=[CAN_ELIMINATE])
-intrinsic("store_var", src_comp=[0], num_vars=1, indices=[WRMASK])
-intrinsic("copy_var", num_vars=2)
+intrinsic("load_param", dest_comp=0, indices=[PARAM_IDX], flags=[CAN_ELIMINATE])
 
-# Interpolation of input.  The interp_var_at* intrinsics are similar to the
-# load_var intrinsic acting on a shader input except that they interpolate
-# the input differently.  The at_sample and at_offset intrinsics take an
+intrinsic("load_deref", dest_comp=0, src_comp=[1], flags=[CAN_ELIMINATE])
+intrinsic("store_deref", src_comp=[1, 0], indices=[WRMASK])
+intrinsic("copy_deref", src_comp=[1, 1])
+
+# Interpolation of input.  The interp_deref_at* intrinsics are similar to the
+# load_deref intrinsic acting on a shader input except that they interpolate the
+# input differently.  The at_sample and at_offset intrinsics take an
 # additional source that is an integer sample id or a vec2 position offset
 # respectively.
 
-intrinsic("interp_var_at_centroid", dest_comp=0, num_vars=1,
+intrinsic("interp_deref_at_centroid", dest_comp=0, src_comp=[1],
           flags=[ CAN_ELIMINATE, CAN_REORDER])
-intrinsic("interp_var_at_sample", src_comp=[1], dest_comp=0, num_vars=1,
+intrinsic("interp_deref_at_sample", src_comp=[1, 1], dest_comp=0,
           flags=[CAN_ELIMINATE, CAN_REORDER])
-intrinsic("interp_var_at_offset", src_comp=[2], dest_comp=0, num_vars=1,
+intrinsic("interp_deref_at_offset", src_comp=[1, 2], dest_comp=0,
           flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Ask the driver for the size of a given buffer. It takes the buffer index
@@ -188,6 +189,8 @@
 barrier("memory_barrier_buffer")
 barrier("memory_barrier_image")
 barrier("memory_barrier_shared")
+barrier("begin_invocation_interlock")
+barrier("end_invocation_interlock")
 
 # A conditional discard, with a single boolean source.
 intrinsic("discard_if", src_comp=[1])
@@ -255,19 +258,20 @@
 # lowered, variants take a constant buffer index and register offset.
 
 def atomic(name, flags=[]):
-    intrinsic(name + "_var", dest_comp=1, num_vars=1, flags=flags)
+    intrinsic(name + "_deref", src_comp=[1], dest_comp=1, flags=flags)
     intrinsic(name, src_comp=[1], dest_comp=1, indices=[BASE], flags=flags)
 
 def atomic2(name):
-    intrinsic(name + "_var", src_comp=[1], dest_comp=1, num_vars=1)
+    intrinsic(name + "_deref", src_comp=[1, 1], dest_comp=1)
     intrinsic(name, src_comp=[1, 1], dest_comp=1, indices=[BASE])
 
 def atomic3(name):
-    intrinsic(name + "_var", src_comp=[1, 1], dest_comp=1, num_vars=1)
+    intrinsic(name + "_deref", src_comp=[1, 1, 1], dest_comp=1)
     intrinsic(name, src_comp=[1, 1, 1], dest_comp=1, indices=[BASE])
 
 atomic("atomic_counter_inc")
-atomic("atomic_counter_dec")
+atomic("atomic_counter_pre_dec")
+atomic("atomic_counter_post_dec")
 atomic("atomic_counter_read", flags=[CAN_ELIMINATE])
 atomic2("atomic_counter_add")
 atomic2("atomic_counter_min")
@@ -280,30 +284,31 @@
 
 # Image load, store and atomic intrinsics.
 #
-# All image intrinsics take an image target passed as a nir_variable.  Image
-# variables contain a number of memory and layout qualifiers that influence
-# the semantics of the intrinsic.
+# All image intrinsics take an image target passed as a nir_variable.  The
+# variable is passed in using a chain of nir_deref_instr as the first
+# source of the image intrinsic.  Image variables contain a number of memory
+# and layout qualifiers that influence the semantics of the intrinsic.
 #
 # All image intrinsics take a four-coordinate vector and a sample index as
-# first two sources, determining the location within the image that will be
+# 2nd and 3rd sources, determining the location within the image that will be
 # accessed by the intrinsic.  Components not applicable to the image target
 # in use are undefined.  Image store takes an additional four-component
 # argument with the value to be written, and image atomic operations take
 # either one or two additional scalar arguments with the same meaning as in
 # the ARB_shader_image_load_store specification.
-intrinsic("image_var_load", src_comp=[4, 1], dest_comp=4, num_vars=1,
+intrinsic("image_deref_load", src_comp=[1, 4, 1], dest_comp=4,
           flags=[CAN_ELIMINATE])
-intrinsic("image_var_store", src_comp=[4, 1, 4], num_vars=1)
-intrinsic("image_var_atomic_add",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_min",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_max",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_and",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_or",   src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_xor",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_exchange",  src_comp=[4, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_atomic_comp_swap", src_comp=[4, 1, 1, 1], dest_comp=1, num_vars=1)
-intrinsic("image_var_size",    dest_comp=0, num_vars=1, flags=[CAN_ELIMINATE, CAN_REORDER])
-intrinsic("image_var_samples", dest_comp=1, num_vars=1, flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_store", src_comp=[1, 4, 1, 4])
+intrinsic("image_deref_atomic_add",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_min",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_max",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_and",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_or",   src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_xor",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_exchange",  src_comp=[1, 4, 1, 1], dest_comp=1)
+intrinsic("image_deref_atomic_comp_swap", src_comp=[1, 4, 1, 1, 1], dest_comp=1)
+intrinsic("image_deref_size",    src_comp=[1], dest_comp=0, flags=[CAN_ELIMINATE, CAN_REORDER])
+intrinsic("image_deref_samples", src_comp=[1], dest_comp=1, flags=[CAN_ELIMINATE, CAN_REORDER])
 
 # Vulkan descriptor set intrinsics
 #
@@ -335,24 +340,23 @@
 # compute a new value using one of the operations below, write the new value
 # to memory, and return the original value read.
 #
-# All operations take 1 source except CompSwap that takes 2. These sources
+# All operations take 2 sources except CompSwap that takes 3. These sources
 # represent:
 #
-# 0: The data parameter to the atomic function (i.e. the value to add
+# 0: A deref to the memory on which to perform the atomic
+# 1: The data parameter to the atomic function (i.e. the value to add
 #    in shared_atomic_add, etc).
-# 1: For CompSwap only: the second data parameter.
-#
-# All operations take 1 variable deref.
-intrinsic("var_atomic_add",  src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_imin", src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_umin", src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_imax", src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_umax", src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_and",  src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_or",   src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_xor",  src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_exchange", src_comp=[1], dest_comp=1, num_vars=1)
-intrinsic("var_atomic_comp_swap", src_comp=[1, 1], dest_comp=1, num_vars=1)
+# 2: For CompSwap only: the second data parameter.
+intrinsic("deref_atomic_add",  src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_imin", src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_umin", src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_imax", src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_umax", src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_and",  src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_or",   src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_xor",  src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_exchange", src_comp=[1, 1], dest_comp=1)
+intrinsic("deref_atomic_comp_swap", src_comp=[1, 1, 1], dest_comp=1)
 
 # SSBO atomic intrinsics
 #
@@ -406,7 +410,7 @@
 intrinsic("shared_atomic_comp_swap", src_comp=[1, 1, 1], dest_comp=1, indices=[BASE])
 
 def system_value(name, dest_comp, indices=[]):
-    intrinsic("load_" + name, [], dest_comp, 0, indices,
+    intrinsic("load_" + name, [], dest_comp, indices,
               flags=[CAN_ELIMINATE, CAN_REORDER], sysval=True)
 
 system_value("frag_coord", 4)
@@ -414,11 +418,15 @@
 system_value("vertex_id", 1)
 system_value("vertex_id_zero_base", 1)
 system_value("first_vertex", 1)
+system_value("is_indexed_draw", 1)
 system_value("base_vertex", 1)
 system_value("instance_id", 1)
 system_value("base_instance", 1)
 system_value("draw_id", 1)
 system_value("sample_id", 1)
+# sample_id_no_per_sample is like sample_id but does not imply per-
+# sample shading.  See the lower_helper_invocation option.
+system_value("sample_id_no_per_sample", 1)
 system_value("sample_pos", 2)
 system_value("sample_mask_in", 1)
 system_value("primitive_id", 1)
@@ -446,6 +454,8 @@
 system_value("num_subgroups", 1)
 system_value("subgroup_id", 1)
 system_value("local_group_size", 3)
+system_value("global_invocation_id", 3)
+system_value("work_dim", 1)
 
 # Blend constant color values.  Float values are clamped.#
 system_value("blend_const_color_r_float", 1)
@@ -527,6 +537,8 @@
 load("shared", 1, [BASE], [CAN_ELIMINATE])
 # src[] = { offset }. const_index[] = { base, range }
 load("push_constant", 1, [BASE, RANGE], [CAN_ELIMINATE, CAN_REORDER])
+# src[] = { offset }. const_index[] = { base, range }
+load("constant", 1, [BASE, RANGE], [CAN_ELIMINATE, CAN_REORDER])
 
 # Stores work the same way as loads, except now the first source is the value
 # to store and the second (and possibly third) source specify where to store
diff --git a/src/compiler/nir/nir_intrinsics_c.py b/src/compiler/nir/nir_intrinsics_c.py
index 21c8b69..ccf0a03 100755
--- a/src/compiler/nir/nir_intrinsics_c.py
+++ b/src/compiler/nir/nir_intrinsics_c.py
@@ -25,7 +25,7 @@
 #include "nir.h"
 
 const nir_intrinsic_info nir_intrinsic_infos[nir_num_intrinsics] = {
-% for name, opcode in sorted(INTR_OPCODES.iteritems()):
+% for name, opcode in sorted(INTR_OPCODES.items()):
 {
    .name = "${name}",
    .num_srcs = ${opcode.num_srcs},
@@ -36,7 +36,6 @@
 % endif
    .has_dest = ${"true" if opcode.has_dest else "false"},
    .dest_components = ${max(opcode.dest_components, 0)},
-   .num_variables = ${opcode.num_variables},
    .num_indices = ${opcode.num_indices},
 % if opcode.indices:
    .index_map = {
diff --git a/src/compiler/nir/nir_linking_helpers.c b/src/compiler/nir/nir_linking_helpers.c
index 2b0a266..6e6655d 100644
--- a/src/compiler/nir/nir_linking_helpers.c
+++ b/src/compiler/nir/nir_linking_helpers.c
@@ -62,29 +62,30 @@
 tcs_add_output_reads(nir_shader *shader, uint64_t *read, uint64_t *patches_read)
 {
    nir_foreach_function(function, shader) {
-      if (function->impl) {
-         nir_foreach_block(block, function->impl) {
-            nir_foreach_instr(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
+      if (!function->impl)
+         continue;
 
-               nir_intrinsic_instr *intrin_instr =
-                  nir_instr_as_intrinsic(instr);
-               if (intrin_instr->intrinsic == nir_intrinsic_load_var &&
-                   intrin_instr->variables[0]->var->data.mode ==
-                   nir_var_shader_out) {
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
 
-                  nir_variable *var = intrin_instr->variables[0]->var;
-                  if (var->data.patch) {
-                     patches_read[var->data.location_frac] |=
-                        get_variable_io_mask(intrin_instr->variables[0]->var,
-                                             shader->info.stage);
-                  } else {
-                     read[var->data.location_frac] |=
-                        get_variable_io_mask(intrin_instr->variables[0]->var,
-                                             shader->info.stage);
-                  }
-               }
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_deref)
+               continue;
+
+            nir_variable *var =
+               nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
+
+            if (var->data.mode != nir_var_shader_out)
+               continue;
+
+            if (var->data.patch) {
+               patches_read[var->data.location_frac] |=
+                  get_variable_io_mask(var, shader->info.stage);
+            } else {
+               read[var->data.location_frac] |=
+                  get_variable_io_mask(var, shader->info.stage);
             }
          }
       }
@@ -175,9 +176,12 @@
 }
 
 static uint8_t
-get_interp_type(nir_variable *var, bool default_to_smooth_interp)
+get_interp_type(nir_variable *var, const struct glsl_type *type,
+                bool default_to_smooth_interp)
 {
-   if (var->data.interpolation != INTERP_MODE_NONE)
+   if (glsl_type_is_integer(type))
+      return INTERP_MODE_FLAT;
+   else if (var->data.interpolation != INTERP_MODE_NONE)
       return var->data.interpolation;
    else if (default_to_smooth_interp)
       return INTERP_MODE_SMOOTH;
@@ -232,7 +236,7 @@
          unsigned comps_slot2 = 0;
          for (unsigned i = 0; i < slots; i++) {
             interp_type[location + i] =
-               get_interp_type(var, default_to_smooth_interp);
+               get_interp_type(var, type, default_to_smooth_interp);
             interp_loc[location + i] = get_interp_loc(var);
 
             if (dual_slot) {
@@ -404,7 +408,7 @@
             continue;
 
          bool found_new_offset = false;
-         uint8_t interp = get_interp_type(var, default_to_smooth_interp);
+         uint8_t interp = get_interp_type(var, type, default_to_smooth_interp);
          for (; cursor[interp] < 32; cursor[interp]++) {
             uint8_t cursor_used_comps = comps[cursor[interp]];
 
diff --git a/src/compiler/nir/nir_loop_analyze.c b/src/compiler/nir/nir_loop_analyze.c
index aa6877f..d564296 100644
--- a/src/compiler/nir/nir_loop_analyze.c
+++ b/src/compiler/nir/nir_loop_analyze.c
@@ -290,17 +290,6 @@
    return true;
 }
 
-static inline bool
-ends_in_break(nir_block *block)
-{
-   if (exec_list_is_empty(&block->instr_list))
-      return false;
-
-   nir_instr *instr = nir_block_last_instr(block);
-   return instr->type == nir_instr_type_jump &&
-      nir_instr_as_jump(instr)->type == nir_jump_break;
-}
-
 static bool
 find_loop_terminators(loop_info_state *state)
 {
@@ -315,11 +304,11 @@
 
          nir_block *last_then = nir_if_last_then_block(nif);
          nir_block *last_else = nir_if_last_else_block(nif);
-         if (ends_in_break(last_then)) {
+         if (nir_block_ends_in_break(last_then)) {
             break_blk = last_then;
             continue_from_blk = last_else;
             continue_from_then = false;
-         } else if (ends_in_break(last_else)) {
+         } else if (nir_block_ends_in_break(last_else)) {
             break_blk = last_else;
             continue_from_blk = last_then;
          }
@@ -630,45 +619,31 @@
    state->loop->info->limiting_terminator = limiting_terminator;
 }
 
-/* Checks if we should force the loop to be unrolled regardless of size
- * due to array access heuristics.
- */
 static bool
 force_unroll_array_access(loop_info_state *state, nir_shader *ns,
-                          nir_deref_var *variable)
+                          nir_deref_instr *deref)
 {
-   nir_deref *tail = &variable->deref;
+   for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
+      if (d->deref_type != nir_deref_type_array)
+         continue;
 
-   while (tail->child != NULL) {
-      tail = tail->child;
+      assert(d->arr.index.is_ssa);
+      nir_loop_variable *array_index = get_loop_var(d->arr.index.ssa, state);
 
-      if (tail->deref_type == nir_deref_type_array) {
+      if (array_index->type != basic_induction)
+         continue;
 
-         nir_deref_array *deref_array = nir_deref_as_array(tail);
-         if (deref_array->deref_array_type != nir_deref_array_type_indirect)
-            continue;
+      nir_deref_instr *parent = nir_deref_instr_parent(d);
+      assert(glsl_type_is_array(parent->type) ||
+             glsl_type_is_matrix(parent->type));
+      if (glsl_get_length(parent->type) == state->loop->info->trip_count) {
+         state->loop->info->force_unroll = true;
+         return true;
+      }
 
-         nir_loop_variable *array_index =
-            get_loop_var(deref_array->indirect.ssa, state);
-
-         if (array_index->type != basic_induction)
-            continue;
-
-         /* If an array is indexed by a loop induction variable, and the
-          * array size is exactly the number of loop iterations, this is
-          * probably a simple for-loop trying to access each element in
-          * turn; the application may expect it to be unrolled.
-          */
-         if (glsl_get_length(variable->deref.type) ==
-             state->loop->info->trip_count) {
-            state->loop->info->force_unroll = true;
-            return state->loop->info->force_unroll;
-         }
-
-         if (variable->var->data.mode & state->indirect_mask) {
-            state->loop->info->force_unroll = true;
-            return state->loop->info->force_unroll;
-         }
+      if (deref->mode & state->indirect_mask) {
+         state->loop->info->force_unroll = true;
+         return true;
       }
    }
 
@@ -688,15 +663,17 @@
       /* Check for arrays variably-indexed by a loop induction variable.
        * Unrolling the loop may convert that access into constant-indexing.
        */
-      if (intrin->intrinsic == nir_intrinsic_load_var ||
-          intrin->intrinsic == nir_intrinsic_store_var ||
-          intrin->intrinsic == nir_intrinsic_copy_var) {
-         unsigned num_vars =
-            nir_intrinsic_infos[intrin->intrinsic].num_variables;
-         for (unsigned i = 0; i < num_vars; i++) {
-            if (force_unroll_array_access(state, ns, intrin->variables[i]))
-               return true;
-         }
+      if (intrin->intrinsic == nir_intrinsic_load_deref ||
+          intrin->intrinsic == nir_intrinsic_store_deref ||
+          intrin->intrinsic == nir_intrinsic_copy_deref) {
+         if (force_unroll_array_access(state, ns,
+                                       nir_src_as_deref(intrin->src[0])))
+            return true;
+
+         if (intrin->intrinsic == nir_intrinsic_copy_deref &&
+             force_unroll_array_access(state, ns,
+                                       nir_src_as_deref(intrin->src[1])))
+            return true;
       }
    }
 
diff --git a/src/compiler/nir/nir_loop_analyze.h b/src/compiler/nir/nir_loop_analyze.h
index 18c2305..7b4ed66 100644
--- a/src/compiler/nir/nir_loop_analyze.h
+++ b/src/compiler/nir/nir_loop_analyze.h
@@ -92,4 +92,15 @@
    return true;
 }
 
+/* Reports whether a block's final instruction is a break jump; an empty
+ * block has no final instruction and therefore never ends in a break. */
+static inline bool
+nir_block_ends_in_break(nir_block *block)
+{
+   nir_instr *last = exec_list_is_empty(&block->instr_list) ?
+      NULL : nir_block_last_instr(block);
+   return last != NULL && last->type == nir_instr_type_jump &&
+          nir_instr_as_jump(last)->type == nir_jump_break;
+}
+
 #endif /* NIR_LOOP_ANALYZE_H */
diff --git a/src/compiler/nir/nir_lower_64bit_packing.c b/src/compiler/nir/nir_lower_64bit_packing.c
deleted file mode 100644
index abae173..0000000
--- a/src/compiler/nir/nir_lower_64bit_packing.c
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright © 2015 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-
-/*
- * lowers:
- *
- * packDouble2x32(foo) -> packDouble2x32Split(foo.x, foo.y)
- * unpackDouble2x32(foo) -> vec2(unpackDouble2x32_x(foo), unpackDouble2x32_y(foo))
- * packInt2x32(foo) -> packInt2x32Split(foo.x, foo.y)
- * unpackInt2x32(foo) -> vec2(unpackInt2x32_x(foo), unpackInt2x32_y(foo))
- */
-
-static nir_ssa_def *
-lower_pack_64(nir_builder *b, nir_ssa_def *src)
-{
-   return nir_pack_64_2x32_split(b, nir_channel(b, src, 0),
-                                    nir_channel(b, src, 1));
-}
-
-static nir_ssa_def *
-lower_unpack_64(nir_builder *b, nir_ssa_def *src)
-{
-   return nir_vec2(b, nir_unpack_64_2x32_split_x(b, src),
-                      nir_unpack_64_2x32_split_y(b, src));
-}
-
-static bool
-lower_64bit_pack_impl(nir_function_impl *impl)
-{
-   nir_builder b;
-   nir_builder_init(&b, impl);
-   bool progress = false;
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr_safe(instr, block) {
-         if (instr->type != nir_instr_type_alu)
-            continue;
-
-         nir_alu_instr *alu_instr = (nir_alu_instr *) instr;
-
-         if (alu_instr->op != nir_op_pack_64_2x32 &&
-             alu_instr->op != nir_op_unpack_64_2x32)
-            continue;
-
-         b.cursor = nir_before_instr(&alu_instr->instr);
-
-         nir_ssa_def *src = nir_ssa_for_alu_src(&b, alu_instr, 0);
-         nir_ssa_def *dest;
-
-         switch (alu_instr->op) {
-         case nir_op_pack_64_2x32:
-            dest = lower_pack_64(&b, src);
-            break;
-         case nir_op_unpack_64_2x32:
-            dest = lower_unpack_64(&b, src);
-            break;
-         default:
-            unreachable("Impossible opcode");
-         }
-
-         nir_ssa_def_rewrite_uses(&alu_instr->dest.dest.ssa, nir_src_for_ssa(dest));
-         nir_instr_remove(&alu_instr->instr);
-         nir_metadata_preserve(impl, nir_metadata_block_index |
-                                     nir_metadata_dominance);
-         progress = true;
-      }
-   }
-
-   return progress;
-}
-
-bool
-nir_lower_64bit_pack(nir_shader *shader)
-{
-   bool progress = false;
-
-   nir_foreach_function(function, shader) {
-      if (function->impl)
-         progress |= lower_64bit_pack_impl(function->impl);
-   }
-
-   return false;
-}
diff --git a/src/compiler/nir/nir_lower_alpha_test.c b/src/compiler/nir/nir_lower_alpha_test.c
index 6bf9ff1..ddd8157 100644
--- a/src/compiler/nir/nir_lower_alpha_test.c
+++ b/src/compiler/nir/nir_lower_alpha_test.c
@@ -55,8 +55,8 @@
                nir_variable *out = NULL;
 
                switch (intr->intrinsic) {
-               case nir_intrinsic_store_var:
-                  out = intr->variables[0]->var;
+               case nir_intrinsic_store_deref:
+                  out = nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
                   break;
                case nir_intrinsic_store_output:
                   /* already had i/o lowered.. lookup the matching output var: */
@@ -85,6 +85,9 @@
                nir_ssa_def *alpha;
                if (alpha_to_one) {
                   alpha = nir_imm_float(&b, 1.0);
+               } else if (intr->intrinsic == nir_intrinsic_store_deref) {
+                  alpha = nir_channel(&b, nir_ssa_for_src(&b, intr->src[1], 4),
+                                      3);
                } else {
                   alpha = nir_channel(&b, nir_ssa_for_src(&b, intr->src[0], 4),
                                       3);
diff --git a/src/compiler/nir/nir_lower_alu.c b/src/compiler/nir/nir_lower_alu.c
new file mode 100644
index 0000000..8d1a1d3
--- /dev/null
+++ b/src/compiler/nir/nir_lower_alu.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ * Copyright © 2018 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/** nir_lower_alu.c
+ *
+ * NIR's home for miscellaneous ALU operation lowering implementations.
+ *
+ * Most NIR ALU lowering occurs in nir_opt_algebraic.py, since it's generally
+ * easy to write them there.  However, if terms appear multiple times in the
+ * lowered code, it can get very verbose and cause a lot of work for CSE, so
+ * it may end up being easier to write out in C code.
+ *
+ * The shader must be in SSA for this pass.
+ */
+
+/**
+ * Lowers one ALU instruction if the matching b->shader->options flag is set.
+ *
+ * Handles bitfield_reverse (lower_bitfield_reverse), bit_count
+ * (lower_bit_count) and imul_high/umul_high (lower_mul_high).  The
+ * replacement is emitted right before \p instr, inheriting its "exact" flag;
+ * every use of the old destination is then rewritten and \p instr removed.
+ *
+ * \return true if \p instr was replaced, false if it was left untouched.
+ */
+static bool
+lower_alu_instr(nir_alu_instr *instr, nir_builder *b)
+{
+   nir_ssa_def *lowered = NULL;
+
+   assert(instr->dest.dest.is_ssa);
+
+   b->cursor = nir_before_instr(&instr->instr);
+   b->exact = instr->exact;
+
+   switch (instr->op) {
+   case nir_op_bitfield_reverse:
+      if (b->shader->options->lower_bitfield_reverse) {
+         /* For more details, see:
+          *
+          * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+          */
+         nir_ssa_def *c1 = nir_imm_int(b, 1);
+         nir_ssa_def *c2 = nir_imm_int(b, 2);
+         nir_ssa_def *c4 = nir_imm_int(b, 4);
+         nir_ssa_def *c8 = nir_imm_int(b, 8);
+         nir_ssa_def *c16 = nir_imm_int(b, 16);
+         nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333);
+         nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555);
+         nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f);
+         nir_ssa_def *c00ff00ff = nir_imm_int(b, 0x00ff00ff);
+
+         lowered = nir_ssa_for_alu_src(b, instr, 0);
+
+         /* Swap odd and even bits. */
+         lowered = nir_ior(b,
+                           nir_iand(b, nir_ushr(b, lowered, c1), c55555555),
+                           nir_ishl(b, nir_iand(b, lowered, c55555555), c1));
+
+         /* Swap consecutive pairs. */
+         lowered = nir_ior(b,
+                           nir_iand(b, nir_ushr(b, lowered, c2), c33333333),
+                           nir_ishl(b, nir_iand(b, lowered, c33333333), c2));
+
+         /* Swap nibbles. */
+         lowered = nir_ior(b,
+                           nir_iand(b, nir_ushr(b, lowered, c4), c0f0f0f0f),
+                           nir_ishl(b, nir_iand(b, lowered, c0f0f0f0f), c4));
+
+         /* Swap bytes. */
+         lowered = nir_ior(b,
+                           nir_iand(b, nir_ushr(b, lowered, c8), c00ff00ff),
+                           nir_ishl(b, nir_iand(b, lowered, c00ff00ff), c8));
+
+         /* Swap the two 16-bit halves. */
+         lowered = nir_ior(b,
+                           nir_ushr(b, lowered, c16),
+                           nir_ishl(b, lowered, c16));
+      }
+      break;
+
+   case nir_op_bit_count:
+      if (b->shader->options->lower_bit_count) {
+         /* For more details, see:
+          *
+          * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+          */
+         nir_ssa_def *c1 = nir_imm_int(b, 1);
+         nir_ssa_def *c2 = nir_imm_int(b, 2);
+         nir_ssa_def *c4 = nir_imm_int(b, 4);
+         nir_ssa_def *c24 = nir_imm_int(b, 24);
+         nir_ssa_def *c33333333 = nir_imm_int(b, 0x33333333);
+         nir_ssa_def *c55555555 = nir_imm_int(b, 0x55555555);
+         nir_ssa_def *c0f0f0f0f = nir_imm_int(b, 0x0f0f0f0f);
+         nir_ssa_def *c01010101 = nir_imm_int(b, 0x01010101);
+
+         lowered = nir_ssa_for_alu_src(b, instr, 0);
+
+         /* Each 2-bit field becomes the number of bits set in it. */
+         lowered = nir_isub(b, lowered,
+                            nir_iand(b, nir_ushr(b, lowered, c1), c55555555));
+
+         /* Add neighbouring 2-bit counts into 4-bit counts. */
+         lowered = nir_iadd(b,
+                            nir_iand(b, lowered, c33333333),
+                            nir_iand(b, nir_ushr(b, lowered, c2), c33333333));
+
+         /* Add neighbouring nibbles into per-byte counts, then let the
+          * multiply sum the four bytes into the top byte and shift it down.
+          */
+         lowered = nir_ushr(b,
+                            nir_imul(b,
+                                     nir_iand(b,
+                                              nir_iadd(b,
+                                                       lowered,
+                                                       nir_ushr(b, lowered, c4)),
+                                              c0f0f0f0f),
+                                     c01010101),
+                            c24);
+      }
+      break;
+
+   case nir_op_imul_high:
+   case nir_op_umul_high:
+      if (b->shader->options->lower_mul_high) {
+         nir_ssa_def *c1 = nir_imm_int(b, 1);
+         nir_ssa_def *c16 = nir_imm_int(b, 16);
+
+         nir_ssa_def *src0 = nir_ssa_for_alu_src(b, instr, 0);
+         nir_ssa_def *src1 = nir_ssa_for_alu_src(b, instr, 1);
+         nir_ssa_def *different_signs = NULL;
+         if (instr->op == nir_op_imul_high) {
+            /* Multiply the magnitudes; the sign is fixed up at the end. */
+            nir_ssa_def *c0 = nir_imm_int(b, 0);
+            different_signs = nir_ixor(b,
+                                       nir_ilt(b, src0, c0),
+                                       nir_ilt(b, src1, c0));
+            src0 = nir_iabs(b, src0);
+            src1 = nir_iabs(b, src1);
+         }
+
+         /*   ABCD
+          * * EFGH
+          * ======
+          * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
+          *
+          * Start by splitting into the 4 multiplies.
+          */
+         nir_ssa_def *src0l = nir_iand(b, src0, nir_imm_int(b, 0xffff));
+         nir_ssa_def *src1l = nir_iand(b, src1, nir_imm_int(b, 0xffff));
+         nir_ssa_def *src0h = nir_ushr(b, src0, c16);
+         nir_ssa_def *src1h = nir_ushr(b, src1, c16);
+
+         nir_ssa_def *lo = nir_imul(b, src0l, src1l);
+         nir_ssa_def *m1 = nir_imul(b, src0l, src1h);
+         nir_ssa_def *m2 = nir_imul(b, src0h, src1l);
+         nir_ssa_def *hi = nir_imul(b, src0h, src1h);
+
+         nir_ssa_def *tmp;
+
+         /* Fold both cross terms into hi:lo, carrying out of the low word. */
+         tmp = nir_ishl(b, m1, c16);
+         hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1));
+         lo = nir_iadd(b, lo, tmp);
+         hi = nir_iadd(b, hi, nir_ushr(b, m1, c16));
+
+         tmp = nir_ishl(b, m2, c16);
+         hi = nir_iadd(b, hi, nir_iand(b, nir_uadd_carry(b, lo, tmp), c1));
+         lo = nir_iadd(b, lo, tmp);
+         hi = nir_iadd(b, hi, nir_ushr(b, m2, c16));
+
+         if (instr->op == nir_op_imul_high) {
+            /* For channels where different_signs is set we have to perform a
+             * 64-bit negation.  This is *not* the same as just negating the
+             * high 32-bits.  Consider -3 * 2.  The high 32-bits is 0, but the
+             * desired result is -1, not -0!  Recall -x == ~x + 1.
+             */
+            hi = nir_bcsel(b, different_signs,
+                           nir_iadd(b,
+                                    nir_inot(b, hi),
+                                    nir_iand(b,
+                                             nir_uadd_carry(b,
+                                                            nir_inot(b, lo),
+                                                            c1),
+                                             c1)),
+                           hi);
+         }
+
+         lowered = hi;
+      }
+      break;
+
+   default:
+      break;
+   }
+
+   if (lowered) {
+      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(lowered));
+      nir_instr_remove(&instr->instr);
+      return true;
+   } else {
+      return false;
+   }
+}
+
+/**
+ * Runs lower_alu_instr() over every ALU instruction in the shader.
+ *
+ * \return true if any instruction was lowered.
+ */
+bool
+nir_lower_alu(nir_shader *shader)
+{
+   bool progress = false;
+
+   /* Skip the walk when none of the lowerings handled here is requested.
+    * lower_bit_count has to be part of this test, otherwise a driver that
+    * only asks for bit_count lowering would never get it.
+    */
+   if (!shader->options->lower_bitfield_reverse &&
+       !shader->options->lower_bit_count &&
+       !shader->options->lower_mul_high)
+      return false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      bool impl_progress = false;
+      nir_builder builder;
+      nir_builder_init(&builder, function->impl);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type == nir_instr_type_alu &&
+                lower_alu_instr(nir_instr_as_alu(instr), &builder))
+               impl_progress = true;
+         }
+      }
+
+      /* Only touch the metadata of impls that were actually modified. */
+      if (impl_progress) {
+         nir_metadata_preserve(function->impl,
+                               nir_metadata_block_index |
+                               nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index a0377dc..7ef032c 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -107,11 +107,11 @@
       if (!b->shader->options->lower_pack_half_2x16)
          return false;
 
+      nir_ssa_def *src_vec2 = nir_ssa_for_alu_src(b, instr, 0);
+
       nir_ssa_def *val =
-         nir_pack_half_2x16_split(b, nir_channel(b, instr->src[0].src.ssa,
-                                                 instr->src[0].swizzle[0]),
-                                     nir_channel(b, instr->src[0].src.ssa,
-                                                 instr->src[0].swizzle[1]));
+         nir_pack_half_2x16_split(b, nir_channel(b, src_vec2, 0),
+                                     nir_channel(b, src_vec2, 1));
 
       nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
       nir_instr_remove(&instr->instr);
@@ -130,9 +130,11 @@
       if (!b->shader->options->lower_unpack_half_2x16)
          return false;
 
+      nir_ssa_def *packed = nir_ssa_for_alu_src(b, instr, 0);
+
       nir_ssa_def *comps[2];
-      comps[0] = nir_unpack_half_2x16_split_x(b, instr->src[0].src.ssa);
-      comps[1] = nir_unpack_half_2x16_split_y(b, instr->src[0].src.ssa);
+      comps[0] = nir_unpack_half_2x16_split_x(b, packed);
+      comps[1] = nir_unpack_half_2x16_split_y(b, packed);
       nir_ssa_def *vec = nir_vec(b, comps, 2);
 
       nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec));
@@ -144,8 +146,8 @@
       assert(b->shader->options->lower_pack_snorm_2x16 ||
              b->shader->options->lower_pack_unorm_2x16);
 
-      nir_ssa_def *word =
-         nir_extract_u16(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+      nir_ssa_def *word = nir_extract_u16(b, nir_ssa_for_alu_src(b, instr, 0),
+                                             nir_imm_int(b, 0));
       nir_ssa_def *val =
          nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
                                 nir_channel(b, word, 0));
@@ -159,8 +161,8 @@
       assert(b->shader->options->lower_pack_snorm_4x8 ||
              b->shader->options->lower_pack_unorm_4x8);
 
-      nir_ssa_def *byte =
-         nir_extract_u8(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
+      nir_ssa_def *byte = nir_extract_u8(b, nir_ssa_for_alu_src(b, instr, 0),
+                                            nir_imm_int(b, 0));
       nir_ssa_def *val =
          nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
                                nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),
@@ -173,14 +175,15 @@
    }
 
    case nir_op_fdph: {
+      nir_ssa_def *src0_vec = nir_ssa_for_alu_src(b, instr, 0);
+      nir_ssa_def *src1_vec = nir_ssa_for_alu_src(b, instr, 1);
+
       nir_ssa_def *sum[4];
       for (unsigned i = 0; i < 3; i++) {
-         sum[i] = nir_fmul(b, nir_channel(b, instr->src[0].src.ssa,
-                                          instr->src[0].swizzle[i]),
-                              nir_channel(b, instr->src[1].src.ssa,
-                                          instr->src[1].swizzle[i]));
+         sum[i] = nir_fmul(b, nir_channel(b, src0_vec, i),
+                              nir_channel(b, src1_vec, i));
       }
-      sum[3] = nir_channel(b, instr->src[1].src.ssa, instr->src[1].swizzle[3]);
+      sum[3] = nir_channel(b, src1_vec, 3);
 
       nir_ssa_def *val = nir_fadd(b, nir_fadd(b, sum[0], sum[1]),
                                      nir_fadd(b, sum[2], sum[3]));
@@ -191,6 +194,7 @@
    }
 
    case nir_op_unpack_64_2x32:
+   case nir_op_unpack_32_2x16:
       return false;
 
       LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd);
@@ -209,9 +213,9 @@
       return false;
 
    unsigned num_components = instr->dest.dest.ssa.num_components;
-   nir_ssa_def *comps[] = { NULL, NULL, NULL, NULL };
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { NULL };
 
-   for (chan = 0; chan < 4; chan++) {
+   for (chan = 0; chan < NIR_MAX_VEC_COMPONENTS; chan++) {
       if (!(instr->dest.write_mask & (1 << chan)))
          continue;
 
@@ -225,7 +229,7 @@
                               0 : chan);
 
          nir_alu_src_copy(&lower->src[i], &instr->src[i], lower);
-         for (int j = 0; j < 4; j++)
+         for (int j = 0; j < NIR_MAX_VEC_COMPONENTS; j++)
             lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan];
       }
 
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
deleted file mode 100644
index 383e323..0000000
--- a/src/compiler/nir/nir_lower_atomics.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- *    Connor Abbott (cwabbott0@gmail.com)
- *
- */
-
-#include "compiler/glsl/ir_uniform.h"
-#include "nir.h"
-#include "main/config.h"
-#include "main/mtypes.h"
-#include <assert.h>
-
-/*
- * replace atomic counter intrinsics that use a variable with intrinsics
- * that directly store the buffer index and byte offset
- */
-
-static bool
-lower_instr(nir_intrinsic_instr *instr,
-            const struct gl_shader_program *shader_program,
-            nir_shader *shader, bool use_binding_as_idx)
-{
-   nir_intrinsic_op op;
-   switch (instr->intrinsic) {
-   case nir_intrinsic_atomic_counter_read_var:
-      op = nir_intrinsic_atomic_counter_read;
-      break;
-
-   case nir_intrinsic_atomic_counter_inc_var:
-      op = nir_intrinsic_atomic_counter_inc;
-      break;
-
-   case nir_intrinsic_atomic_counter_dec_var:
-      op = nir_intrinsic_atomic_counter_dec;
-      break;
-
-   case nir_intrinsic_atomic_counter_add_var:
-      op = nir_intrinsic_atomic_counter_add;
-      break;
-
-   case nir_intrinsic_atomic_counter_min_var:
-      op = nir_intrinsic_atomic_counter_min;
-      break;
-
-   case nir_intrinsic_atomic_counter_max_var:
-      op = nir_intrinsic_atomic_counter_max;
-      break;
-
-   case nir_intrinsic_atomic_counter_and_var:
-      op = nir_intrinsic_atomic_counter_and;
-      break;
-
-   case nir_intrinsic_atomic_counter_or_var:
-      op = nir_intrinsic_atomic_counter_or;
-      break;
-
-   case nir_intrinsic_atomic_counter_xor_var:
-      op = nir_intrinsic_atomic_counter_xor;
-      break;
-
-   case nir_intrinsic_atomic_counter_exchange_var:
-      op = nir_intrinsic_atomic_counter_exchange;
-      break;
-
-   case nir_intrinsic_atomic_counter_comp_swap_var:
-      op = nir_intrinsic_atomic_counter_comp_swap;
-      break;
-
-   default:
-      return false;
-   }
-
-   if (instr->variables[0]->var->data.mode != nir_var_uniform &&
-       instr->variables[0]->var->data.mode != nir_var_shader_storage &&
-       instr->variables[0]->var->data.mode != nir_var_shared)
-      return false; /* atomics passed as function arguments can't be lowered */
-
-   void *mem_ctx = ralloc_parent(instr);
-   unsigned uniform_loc = instr->variables[0]->var->data.location;
-
-   unsigned idx = use_binding_as_idx ?
-      instr->variables[0]->var->data.binding :
-      shader_program->data->UniformStorage[uniform_loc].opaque[shader->info.stage].index;
-
-   nir_intrinsic_instr *new_instr = nir_intrinsic_instr_create(mem_ctx, op);
-   nir_intrinsic_set_base(new_instr, idx);
-
-   nir_load_const_instr *offset_const =
-      nir_load_const_instr_create(mem_ctx, 1, 32);
-   offset_const->value.u32[0] = instr->variables[0]->var->data.offset;
-
-   nir_instr_insert_before(&instr->instr, &offset_const->instr);
-
-   nir_ssa_def *offset_def = &offset_const->def;
-
-   nir_deref *tail = &instr->variables[0]->deref;
-   while (tail->child != NULL) {
-      nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-      tail = tail->child;
-
-      unsigned child_array_elements = tail->child != NULL ?
-         glsl_get_aoa_size(tail->type) : 1;
-
-      offset_const->value.u32[0] += deref_array->base_offset *
-         child_array_elements * ATOMIC_COUNTER_SIZE;
-
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         nir_load_const_instr *atomic_counter_size =
-            nir_load_const_instr_create(mem_ctx, 1, 32);
-         atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
-         nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
-
-         nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
-         mul->dest.write_mask = 0x1;
-         nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul);
-         mul->src[1].src.is_ssa = true;
-         mul->src[1].src.ssa = &atomic_counter_size->def;
-         nir_instr_insert_before(&instr->instr, &mul->instr);
-
-         nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd);
-         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
-         add->dest.write_mask = 0x1;
-         add->src[0].src.is_ssa = true;
-         add->src[0].src.ssa = &mul->dest.dest.ssa;
-         add->src[1].src.is_ssa = true;
-         add->src[1].src.ssa = offset_def;
-         nir_instr_insert_before(&instr->instr, &add->instr);
-
-         offset_def = &add->dest.dest.ssa;
-      }
-   }
-
-   new_instr->src[0].is_ssa = true;
-   new_instr->src[0].ssa = offset_def;
-
-   /* Copy the other sources, if any, from the original instruction to the new
-    * instruction.
-    */
-   for (unsigned i = 0; i < nir_intrinsic_infos[instr->intrinsic].num_srcs; i++)
-      nir_src_copy(&new_instr->src[i + 1], &instr->src[i], new_instr);
-
-   if (instr->dest.is_ssa) {
-      nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
-                        instr->dest.ssa.num_components, 32, NULL);
-      nir_ssa_def_rewrite_uses(&instr->dest.ssa,
-                               nir_src_for_ssa(&new_instr->dest.ssa));
-   } else {
-      nir_dest_copy(&new_instr->dest, &instr->dest, mem_ctx);
-   }
-
-   nir_instr_insert_before(&instr->instr, &new_instr->instr);
-   nir_instr_remove(&instr->instr);
-
-   return true;
-}
-
-bool
-nir_lower_atomics(nir_shader *shader,
-                  const struct gl_shader_program *shader_program,
-                  bool use_binding_as_idx)
-{
-   bool progress = false;
-
-   nir_foreach_function(function, shader) {
-      if (!function->impl)
-         continue;
-
-      bool impl_progress = false;
-
-      nir_foreach_block(block, function->impl) {
-         nir_foreach_instr_safe(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
-
-            impl_progress |= lower_instr(nir_instr_as_intrinsic(instr),
-                                         shader_program, shader,
-                                         use_binding_as_idx);
-         }
-      }
-
-      if (impl_progress) {
-         nir_metadata_preserve(function->impl, nir_metadata_block_index |
-                                               nir_metadata_dominance);
-         progress = true;
-      }
-   }
-
-   return progress;
-}
diff --git a/src/compiler/nir/nir_lower_atomics_to_ssbo.c b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
index 934ae81..6ebd363 100644
--- a/src/compiler/nir/nir_lower_atomics_to_ssbo.c
+++ b/src/compiler/nir/nir_lower_atomics_to_ssbo.c
@@ -71,7 +71,8 @@
       return true;
    case nir_intrinsic_atomic_counter_inc:
    case nir_intrinsic_atomic_counter_add:
-   case nir_intrinsic_atomic_counter_dec:
+   case nir_intrinsic_atomic_counter_pre_dec:
+   case nir_intrinsic_atomic_counter_post_dec:
       /* inc and dec get remapped to add: */
       op = nir_intrinsic_ssbo_atomic_add;
       break;
@@ -119,7 +120,8 @@
       nir_src_copy(&new_instr->src[1], &instr->src[0], new_instr);
       new_instr->src[2] = nir_src_for_ssa(temp);
       break;
-   case nir_intrinsic_atomic_counter_dec:
+   case nir_intrinsic_atomic_counter_pre_dec:
+   case nir_intrinsic_atomic_counter_post_dec:
       /* remapped to ssbo_atomic_add: { buffer_idx, offset, -1 } */
       /* NOTE semantic difference so we adjust the return value below */
       temp = nir_imm_int(b, -1);
@@ -148,7 +150,7 @@
    nir_instr_insert_before(&instr->instr, &new_instr->instr);
    nir_instr_remove(&instr->instr);
 
-   if (instr->intrinsic == nir_intrinsic_atomic_counter_dec) {
+   if (instr->intrinsic == nir_intrinsic_atomic_counter_pre_dec) {
       b->cursor = nir_after_instr(&new_instr->instr);
       nir_ssa_def *result = nir_iadd(b, &new_instr->dest.ssa, temp);
       nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(result));
diff --git a/src/compiler/nir/nir_lower_bit_size.c b/src/compiler/nir/nir_lower_bit_size.c
new file mode 100644
index 0000000..531e6aa
--- /dev/null
+++ b/src/compiler/nir/nir_lower_bit_size.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_builder.h"
+
+/**
+ * Some ALU operations may not be supported in hardware in specific bit-sizes.
+ * This pass allows implementations to selectively lower such operations to
+ * a bit-size that is supported natively and then converts the result back to
+ * the original bit-size.
+ */
+/* Emit a conversion of src to the base type of `type` at bit_size bits. */
+static nir_ssa_def *
+convert_to_bit_size(nir_builder *bld, nir_ssa_def *src,
+                    nir_alu_type type, unsigned bit_size)
+{
+   /* Keep the base type of `type` and OR in the requested size. */
+   const nir_alu_type dst_type = nir_alu_type_get_base_type(type) | bit_size;
+
+   const nir_op conv =
+      nir_type_conversion_op(type, dst_type, nir_rounding_mode_undef);
+
+   /* Only one source is supplied; the remaining slots are NULL. */
+   return nir_build_alu(bld, conv, src, NULL, NULL, NULL);
+}
+
+/* Re-emit `alu` at bit_size and convert its result back to the old size. */
+static void
+lower_instr(nir_builder *bld, nir_alu_instr *alu, unsigned bit_size)
+{
+   const nir_op op = alu->op;
+   const unsigned orig_bit_size = alu->dest.dest.ssa.bit_size;
+   nir_ssa_def *srcs[4] = { NULL, NULL, NULL, NULL };
+
+   /* All new instructions go immediately before the original one. */
+   bld->cursor = nir_before_instr(&alu->instr);
+
+   /* Sources whose type size is reported as 0 are converted to bit_size;
+    * every other source is passed through unchanged.
+    */
+   for (unsigned s = 0; s < nir_op_infos[op].num_inputs; s++) {
+      const nir_alu_type src_type = nir_op_infos[op].input_types[s];
+      nir_ssa_def *def = nir_ssa_for_alu_src(bld, alu, s);
+      srcs[s] = nir_alu_type_get_type_size(src_type) == 0 ?
+         convert_to_bit_size(bld, def, src_type, bit_size) : def;
+   }
+
+   /* Same opcode on the new sources, then convert back to the old size. */
+   nir_ssa_def *lowered =
+      nir_build_alu(bld, op, srcs[0], srcs[1], srcs[2], srcs[3]);
+   nir_ssa_def *result = convert_to_bit_size(bld, lowered,
+                                             nir_op_infos[op].output_type,
+                                             orig_bit_size);
+   nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(result));
+}
+
+/* Lower every ALU instruction in impl for which callback returns non-zero. */
+static bool
+lower_impl(nir_function_impl *impl,
+           nir_lower_bit_size_callback callback,
+           void *callback_data)
+{
+   bool changed = false;
+   nir_builder bld;
+
+   nir_builder_init(&bld, impl);
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type == nir_instr_type_alu) {
+            nir_alu_instr *alu = nir_instr_as_alu(instr);
+            assert(alu->dest.dest.is_ssa);
+
+            /* The callback picks the bit size; 0 leaves alu untouched. */
+            const unsigned new_size = callback(alu, callback_data);
+            if (new_size != 0) {
+               /* Asking for the size alu already has is a callback bug. */
+               assert(new_size != alu->dest.dest.ssa.bit_size);
+               lower_instr(&bld, alu, new_size);
+               changed = true;
+            }
+         }
+      }
+   }
+
+   if (changed) {
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   }
+
+   return changed;
+}
+
+/* Run the bit-size lowering on each function body; true if any changed. */
+bool
+nir_lower_bit_size(nir_shader *shader,
+                   nir_lower_bit_size_callback callback,
+                   void *callback_data)
+{
+   bool any_changed = false;
+
+   nir_foreach_function(func, shader) {
+      if (func->impl && lower_impl(func->impl, callback, callback_data))
+         any_changed = true;
+   }
+   return any_changed;
+}
diff --git a/src/compiler/nir/nir_lower_clamp_color_outputs.c b/src/compiler/nir/nir_lower_clamp_color_outputs.c
index 55becbf..32f8556 100644
--- a/src/compiler/nir/nir_lower_clamp_color_outputs.c
+++ b/src/compiler/nir/nir_lower_clamp_color_outputs.c
@@ -47,13 +47,8 @@
       }
       break;
    case MESA_SHADER_FRAGMENT:
-      switch (out->data.location) {
-      case FRAG_RESULT_COLOR:
-         return true;
-      default:
-         return false;
-      }
-      break;
+      return (out->data.location == FRAG_RESULT_COLOR ||
+              out->data.location >= FRAG_RESULT_DATA0);
    default:
       return false;
    }
@@ -67,8 +62,8 @@
    nir_ssa_def *s;
 
    switch (intr->intrinsic) {
-   case nir_intrinsic_store_var:
-      out = intr->variables[0]->var;
+   case nir_intrinsic_store_deref:
+      out = nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
       break;
    case nir_intrinsic_store_output:
       /* already had i/o lowered.. lookup the matching output var: */
@@ -90,9 +85,10 @@
 
    if (is_color_output(state, out)) {
       b->cursor = nir_before_instr(&intr->instr);
-      s = nir_ssa_for_src(b, intr->src[0], intr->num_components);
+      int src = intr->intrinsic == nir_intrinsic_store_deref ? 1 : 0;
+      s = nir_ssa_for_src(b, intr->src[src], intr->num_components);
       s = nir_fsat(b, s);
-      nir_instr_rewrite_src(&intr->instr, &intr->src[0], nir_src_for_ssa(s));
+      nir_instr_rewrite_src(&intr->instr, &intr->src[src], nir_src_for_ssa(s));
    }
 
    return true;
diff --git a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
index 95eda82..86ce5fb 100644
--- a/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
+++ b/src/compiler/nir/nir_lower_clip_cull_distance_arrays.c
@@ -70,55 +70,61 @@
    var->type = type;
 }
 
-/**
- * Rewrite any clip/cull distances to refer to the new combined array.
- */
 static void
-rewrite_references(nir_instr *instr,
+rewrite_clip_cull_deref(nir_builder *b,
+                        nir_deref_instr *deref,
+                        const struct glsl_type *type,
+                        unsigned tail_offset)
+{
+   deref->type = type;
+
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *child_type = glsl_get_array_element(type);
+      nir_foreach_use(src, &deref->dest.ssa) {
+         rewrite_clip_cull_deref(b, nir_instr_as_deref(src->parent_instr),
+                                 child_type, tail_offset);
+      }
+   } else {
+      assert(glsl_type_is_scalar(type));
+
+      /* This is the end of the line.  Add the tail offset if needed */
+      if (tail_offset > 0) {
+         b->cursor = nir_before_instr(&deref->instr);
+         assert(deref->deref_type == nir_deref_type_array);
+         nir_ssa_def *index = nir_iadd(b, deref->arr.index.ssa,
+                                          nir_imm_int(b, tail_offset));
+         nir_instr_rewrite_src(&deref->instr, &deref->arr.index,
+                               nir_src_for_ssa(index));
+      }
+   }
+}
+
+static void
+rewrite_references(nir_builder *b,
+                   nir_instr *instr,
                    nir_variable *combined,
                    unsigned cull_offset)
 {
-   if (instr->type != nir_instr_type_intrinsic)
+   if (instr->type != nir_instr_type_deref)
       return;
 
-   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-   /* copy_var needs to be lowered to load/store before calling this pass */
-   assert(intrin->intrinsic != nir_intrinsic_copy_var);
-
-   if (intrin->intrinsic != nir_intrinsic_load_var &&
-       intrin->intrinsic != nir_intrinsic_store_var)
+   nir_deref_instr *deref = nir_instr_as_deref(instr);
+   if (deref->deref_type != nir_deref_type_var)
       return;
 
-   nir_deref_var *var_ref = intrin->variables[0];
-   if (var_ref->var->data.mode != combined->data.mode)
+   if (deref->var->data.mode != combined->data.mode)
       return;
 
-   if (var_ref->var->data.location != VARYING_SLOT_CLIP_DIST0 &&
-       var_ref->var->data.location != VARYING_SLOT_CULL_DIST0)
+   const unsigned location = deref->var->data.location;
+   if (location != VARYING_SLOT_CLIP_DIST0 &&
+       location != VARYING_SLOT_CULL_DIST0)
       return;
 
-   /* Update types along the deref chain */
-   const struct glsl_type *type = combined->type;
-   nir_deref *deref = &var_ref->deref;
-   while (deref) {
-      deref->type = type;
-      deref = deref->child;
-      type = glsl_get_array_element(type);
-   }
-
-   /* For cull distances, add an offset to the array index */
-   if (var_ref->var->data.location == VARYING_SLOT_CULL_DIST0) {
-      nir_deref *tail = nir_deref_tail(&intrin->variables[0]->deref);
-      nir_deref_array *array_ref = nir_deref_as_array(tail);
-
-      array_ref->base_offset += cull_offset;
-   }
-
-   /* Point the deref at the combined array */
-   var_ref->var = combined;
-
-   /* There's no need to update writemasks; it's a scalar array. */
+   deref->var = combined;
+   if (location == VARYING_SLOT_CULL_DIST0)
+      rewrite_clip_cull_deref(b, deref, combined->type, cull_offset);
+   else
+      rewrite_clip_cull_deref(b, deref, combined->type, 0);
 }
 
 static bool
@@ -163,9 +169,12 @@
          /* Rewrite CullDistance to reference the combined array */
          nir_foreach_function(function, nir) {
             if (function->impl) {
+               nir_builder b;
+               nir_builder_init(&b, function->impl);
+
                nir_foreach_block(block, function->impl) {
                   nir_foreach_instr(instr, block) {
-                     rewrite_references(instr, clip, clip_array_size);
+                     rewrite_references(&b, instr, clip, clip_array_size);
                   }
                }
             }
diff --git a/src/compiler/nir/nir_lower_constant_initializers.c b/src/compiler/nir/nir_lower_constant_initializers.c
index f4d4d70..4e9cea4 100644
--- a/src/compiler/nir/nir_lower_constant_initializers.c
+++ b/src/compiler/nir/nir_lower_constant_initializers.c
@@ -24,18 +24,44 @@
 #include "nir.h"
 #include "nir_builder.h"
 
-static bool
-deref_apply_constant_initializer(nir_deref_var *deref, void *state)
+static void
+build_constant_load(nir_builder *b, nir_deref_instr *deref, nir_constant *c)
 {
-   struct nir_builder *b = state;
-
-   nir_load_const_instr *initializer =
-      nir_deref_get_const_initializer_load(b->shader, deref);
-   nir_builder_instr_insert(b, &initializer->instr);
-
-   nir_store_deref_var(b, deref, &initializer->def, 0xf);
-
-   return true;
+   if (glsl_type_is_vector_or_scalar(deref->type)) {
+      nir_load_const_instr *load =
+         nir_load_const_instr_create(b->shader,
+                                     glsl_get_vector_elements(deref->type),
+                                     glsl_get_bit_size(deref->type));
+      load->value = c->values[0];
+      nir_builder_instr_insert(b, &load->instr);
+      nir_store_deref(b, deref, &load->def, ~0);
+   } else if (glsl_type_is_matrix(deref->type)) {
+      unsigned cols = glsl_get_matrix_columns(deref->type);
+      unsigned rows = glsl_get_vector_elements(deref->type);
+      unsigned bit_size = glsl_get_bit_size(deref->type);
+      for (unsigned i = 0; i < cols; i++) {
+         nir_load_const_instr *load =
+            nir_load_const_instr_create(b->shader, rows, bit_size);
+         load->value = c->values[i];
+         nir_builder_instr_insert(b, &load->instr);
+         nir_store_deref(b, nir_build_deref_array(b, deref, nir_imm_int(b, i)),
+                         &load->def, ~0);
+      }
+   } else if (glsl_type_is_struct(deref->type)) {
+      unsigned len = glsl_get_length(deref->type);
+      for (unsigned i = 0; i < len; i++) {
+         build_constant_load(b, nir_build_deref_struct(b, deref, i),
+                             c->elements[i]);
+      }
+   } else {
+      assert(glsl_type_is_array(deref->type));
+      unsigned len = glsl_get_length(deref->type);
+      for (unsigned i = 0; i < len; i++) {
+         build_constant_load(b,
+                             nir_build_deref_array(b, deref, nir_imm_int(b, i)),
+                             c->elements[i]);
+      }
+   }
 }
 
 static bool
@@ -51,13 +77,8 @@
 
       progress = true;
 
-      nir_deref_var deref;
-      deref.deref.deref_type = nir_deref_type_var,
-      deref.deref.child = NULL;
-      deref.deref.type = var->type,
-      deref.var = var;
-
-      nir_deref_foreach_leaf(&deref, deref_apply_constant_initializer, b);
+      build_constant_load(b, nir_build_deref_var(b, var),
+                          var->constant_initializer);
 
       var->constant_initializer = NULL;
    }
diff --git a/src/compiler/nir/nir_lower_drawpixels.c b/src/compiler/nir/nir_lower_drawpixels.c
index 5cc358d..462b9c3 100644
--- a/src/compiler/nir/nir_lower_drawpixels.c
+++ b/src/compiler/nir/nir_lower_drawpixels.c
@@ -151,9 +151,6 @@
    }
 
    if (state->options->pixel_maps) {
-      static const unsigned swiz_xy[4] = {0,1};
-      static const unsigned swiz_zw[4] = {2,3};
-
       /* do four pixel map look-ups with two TEX instructions: */
       nir_ssa_def *def_xy, *def_zw;
 
@@ -166,7 +163,7 @@
       tex->texture_index = state->options->pixelmap_sampler;
       tex->dest_type = nir_type_float;
       tex->src[0].src_type = nir_tex_src_coord;
-      tex->src[0].src = nir_src_for_ssa(nir_swizzle(b, def, swiz_xy, 2, true));
+      tex->src[0].src = nir_src_for_ssa(nir_channels(b, def, 0x3));
 
       nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &tex->instr);
@@ -180,7 +177,7 @@
       tex->sampler_index = state->options->pixelmap_sampler;
       tex->dest_type = nir_type_float;
       tex->src[0].src_type = nir_tex_src_coord;
-      tex->src[0].src = nir_src_for_ssa(nir_swizzle(b, def, swiz_zw, 2, true));
+      tex->src[0].src = nir_src_for_ssa(nir_channels(b, def, 0xc));
 
       nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &tex->instr);
@@ -212,17 +209,17 @@
    nir_foreach_instr_safe(instr, block) {
       if (instr->type == nir_instr_type_intrinsic) {
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-         if (intr->intrinsic == nir_intrinsic_load_var) {
-            nir_deref_var *dvar = intr->variables[0];
-            nir_variable *var = dvar->var;
+         if (intr->intrinsic == nir_intrinsic_load_deref) {
+            nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+            nir_variable *var = nir_deref_instr_get_variable(deref);
 
             if (var->data.location == VARYING_SLOT_COL0) {
                /* gl_Color should not have array/struct derefs: */
-               assert(dvar->deref.child == NULL);
+               assert(deref->deref_type == nir_deref_type_var);
                lower_color(state, intr);
             } else if (var->data.location == VARYING_SLOT_TEX0) {
                /* gl_TexCoord should not have array/struct derefs: */
-               assert(dvar->deref.child == NULL);
+               assert(deref->deref_type == nir_deref_type_var);
                lower_texcoord(state, intr);
             }
          }
diff --git a/src/compiler/nir/nir_lower_global_vars_to_local.c b/src/compiler/nir/nir_lower_global_vars_to_local.c
index c8fdfde..d441215 100644
--- a/src/compiler/nir/nir_lower_global_vars_to_local.c
+++ b/src/compiler/nir/nir_lower_global_vars_to_local.c
@@ -32,31 +32,33 @@
 
 #include "nir.h"
 
+/* Record that impl uses var; a var seen from two impls gets NULL data. */
+static void
+register_var_use(nir_variable *var, nir_function_impl *impl,
+                 struct hash_table *var_func_table)
+{
+   struct hash_entry *seen;
+
+   if (var->data.mode != nir_var_global)
+      return; /* only global-mode variables are tracked */
+
+   /* One entry per variable, keyed by the variable pointer. */
+   seen = _mesa_hash_table_search(var_func_table, var);
+   if (!seen)
+      _mesa_hash_table_insert(var_func_table, var, impl); /* first use */
+   else if (seen->data != impl)
+      seen->data = NULL; /* used from a different impl than recorded */
+}
+
 static bool
 mark_global_var_uses_block(nir_block *block, nir_function_impl *impl,
                            struct hash_table *var_func_table)
 {
    nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-      unsigned num_vars = nir_intrinsic_infos[intrin->intrinsic].num_variables;
-
-      for (unsigned i = 0; i < num_vars; i++) {
-         nir_variable *var = intrin->variables[i]->var;
-         if (var->data.mode != nir_var_global)
-            continue;
-
-         struct hash_entry *entry =
-            _mesa_hash_table_search(var_func_table, var);
-
-         if (entry) {
-            if (entry->data != impl)
-               entry->data = NULL;
-         } else {
-            _mesa_hash_table_insert(var_func_table, var, impl);
-         }
+      if (instr->type ==  nir_instr_type_deref) {
+         nir_deref_instr *deref = nir_instr_as_deref(instr);
+         if (deref->deref_type == nir_deref_type_var)
+            register_var_use(deref->var, impl, var_func_table);
       }
    }
 
@@ -103,5 +105,8 @@
 
    _mesa_hash_table_destroy(var_func_table, NULL);
 
+   if (progress)
+      nir_fixup_deref_modes(shader);
+
    return progress;
 }
diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c
index 02f202d..c1f3cf8 100644
--- a/src/compiler/nir/nir_lower_indirect_derefs.c
+++ b/src/compiler/nir/nir_lower_indirect_derefs.c
@@ -23,44 +23,41 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 
 static void
-emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
-                nir_deref_var *deref, nir_deref *tail,
-                nir_ssa_def **dest, nir_ssa_def *src);
+emit_load_store_deref(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                      nir_deref_instr *parent,
+                      nir_deref_instr **deref_arr,
+                      nir_ssa_def **dest, nir_ssa_def *src);
 
 static void
-emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
-                         nir_deref_var *deref, nir_deref *arr_parent,
-                         int start, int end,
-                         nir_ssa_def **dest, nir_ssa_def *src)
+emit_indirect_load_store_deref(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                               nir_deref_instr *parent,
+                               nir_deref_instr **deref_arr,
+                               int start, int end,
+                               nir_ssa_def **dest, nir_ssa_def *src)
 {
-   nir_deref_array *arr = nir_deref_as_array(arr_parent->child);
-   assert(arr->deref_array_type == nir_deref_array_type_indirect);
-   assert(arr->indirect.is_ssa);
-
    assert(start < end);
    if (start == end - 1) {
-      /* Base case.  Just emit the load/store op */
-      nir_deref_array direct = *arr;
-      direct.deref_array_type = nir_deref_array_type_direct;
-      direct.base_offset += start;
-      direct.indirect = NIR_SRC_INIT;
-
-      arr_parent->child = &direct.deref;
-      emit_load_store(b, orig_instr, deref, &direct.deref, dest, src);
-      arr_parent->child = &arr->deref;
+      nir_ssa_def *index = nir_imm_int(b, start);
+      emit_load_store_deref(b, orig_instr,
+                            nir_build_deref_array(b, parent, index),
+                            deref_arr + 1, dest, src);
    } else {
       int mid = start + (end - start) / 2;
 
       nir_ssa_def *then_dest, *else_dest;
 
-      nir_push_if(b, nir_ilt(b, arr->indirect.ssa, nir_imm_int(b, mid)));
-      emit_indirect_load_store(b, orig_instr, deref, arr_parent,
-                               start, mid, &then_dest, src);
+      nir_deref_instr *deref = *deref_arr;
+      assert(deref->deref_type == nir_deref_type_array);
+
+      nir_push_if(b, nir_ilt(b, deref->arr.index.ssa, nir_imm_int(b, mid)));
+      emit_indirect_load_store_deref(b, orig_instr, parent, deref_arr,
+                                     start, mid, &then_dest, src);
       nir_push_else(b, NULL);
-      emit_indirect_load_store(b, orig_instr, deref, arr_parent,
-                               mid, end, &else_dest, src);
+      emit_indirect_load_store_deref(b, orig_instr, parent, deref_arr,
+                                     mid, end, &else_dest, src);
       nir_pop_if(b, NULL);
 
       if (src == NULL)
@@ -69,76 +66,55 @@
 }
 
 static void
-emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
-                nir_deref_var *deref, nir_deref *tail,
-                nir_ssa_def **dest, nir_ssa_def *src)
+emit_load_store_deref(nir_builder *b, nir_intrinsic_instr *orig_instr,
+                      nir_deref_instr *parent,
+                      nir_deref_instr **deref_arr,
+                      nir_ssa_def **dest, nir_ssa_def *src)
 {
-   for (; tail->child; tail = tail->child) {
-      if (tail->child->deref_type != nir_deref_type_array)
-         continue;
+   for (; *deref_arr; deref_arr++) {
+      nir_deref_instr *deref = *deref_arr;
+      if (deref->deref_type == nir_deref_type_array &&
+          nir_src_as_const_value(deref->arr.index) == NULL) {
+         int length = glsl_get_length(parent->type);
 
-      nir_deref_array *arr = nir_deref_as_array(tail->child);
-      if (arr->deref_array_type != nir_deref_array_type_indirect)
-         continue;
+         emit_indirect_load_store_deref(b, orig_instr, parent, deref_arr,
+                                        0, length, dest, src);
+         return;
+      }
 
-      int length = glsl_get_length(tail->type);
-
-      emit_indirect_load_store(b, orig_instr, deref, tail, -arr->base_offset,
-                               length - arr->base_offset, dest, src);
-      return;
+      parent = nir_build_deref_follower(b, parent, deref);
    }
 
-   assert(tail && tail->child == NULL);
-
    /* We reached the end of the deref chain.  Emit the instruction */
+   assert(*deref_arr == NULL);
 
    if (src == NULL) {
       /* This is a load instruction */
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(b->shader, orig_instr->intrinsic);
       load->num_components = orig_instr->num_components;
-      load->variables[0] = nir_deref_var_clone(deref, load);
 
-      /* Copy over any sources.  This is needed for interp_var_at */
-      for (unsigned i = 0;
+      load->src[0] = nir_src_for_ssa(&parent->dest.ssa);
+
+      /* Copy over any other sources.  This is needed for interp_deref_at */
+      for (unsigned i = 1;
            i < nir_intrinsic_infos[orig_instr->intrinsic].num_srcs; i++)
          nir_src_copy(&load->src[i], &orig_instr->src[i], load);
 
-      unsigned bit_size = orig_instr->dest.ssa.bit_size;
       nir_ssa_dest_init(&load->instr, &load->dest,
-                        load->num_components, bit_size, NULL);
+                        orig_instr->dest.ssa.num_components,
+                        orig_instr->dest.ssa.bit_size, NULL);
       nir_builder_instr_insert(b, &load->instr);
       *dest = &load->dest.ssa;
    } else {
-      /* This is a store instruction */
-      nir_intrinsic_instr *store =
-         nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
-      store->num_components = orig_instr->num_components;
-      nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(orig_instr));
-      store->variables[0] = nir_deref_var_clone(deref, store);
-      store->src[0] = nir_src_for_ssa(src);
-      nir_builder_instr_insert(b, &store->instr);
+      assert(orig_instr->intrinsic == nir_intrinsic_store_deref);
+      nir_store_deref(b, parent, src, nir_intrinsic_write_mask(orig_instr));
    }
 }
 
 static bool
-deref_has_indirect(nir_deref_var *deref)
-{
-   for (nir_deref *tail = deref->deref.child; tail; tail = tail->child) {
-      if (tail->deref_type != nir_deref_type_array)
-         continue;
-
-      nir_deref_array *arr = nir_deref_as_array(tail);
-      if (arr->deref_array_type == nir_deref_array_type_indirect)
-         return true;
-   }
-
-   return false;
-}
-
-static bool
-lower_indirect_block(nir_block *block, nir_builder *b,
-                     nir_variable_mode modes)
+lower_indirect_derefs_block(nir_block *block, nir_builder *b,
+                            nir_variable_mode modes)
 {
    bool progress = false;
 
@@ -147,37 +123,55 @@
          continue;
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-      if (intrin->intrinsic != nir_intrinsic_load_var &&
-          intrin->intrinsic != nir_intrinsic_interp_var_at_centroid &&
-          intrin->intrinsic != nir_intrinsic_interp_var_at_sample &&
-          intrin->intrinsic != nir_intrinsic_interp_var_at_offset &&
-          intrin->intrinsic != nir_intrinsic_store_var)
+      if (intrin->intrinsic != nir_intrinsic_load_deref &&
+          intrin->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
+          intrin->intrinsic != nir_intrinsic_interp_deref_at_sample &&
+          intrin->intrinsic != nir_intrinsic_interp_deref_at_offset &&
+          intrin->intrinsic != nir_intrinsic_store_deref)
          continue;
 
-      if (!deref_has_indirect(intrin->variables[0]))
+      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+      /* Walk the deref chain back to the base and look for indirects */
+      bool has_indirect = false;
+      nir_deref_instr *base = deref;
+      while (base->deref_type != nir_deref_type_var) {
+         if (base->deref_type == nir_deref_type_array &&
+             nir_src_as_const_value(base->arr.index) == NULL)
+            has_indirect = true;
+
+         base = nir_deref_instr_parent(base);
+      }
+
+      if (!has_indirect)
          continue;
 
       /* Only lower variables whose mode is in the mask, or compact
        * array variables.  (We can't handle indirects on tightly packed
        * scalar arrays, so we need to lower them regardless.)
        */
-      if (!(modes & intrin->variables[0]->var->data.mode) &&
-          !intrin->variables[0]->var->data.compact)
+      if (!(modes & base->var->data.mode) && !base->var->data.compact)
          continue;
 
-      b->cursor = nir_before_instr(&intrin->instr);
+      b->cursor = nir_instr_remove(&intrin->instr);
 
-      if (intrin->intrinsic != nir_intrinsic_store_var) {
-         nir_ssa_def *result;
-         emit_load_store(b, intrin, intrin->variables[0],
-                         &intrin->variables[0]->deref, &result, NULL);
-         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(result));
+      nir_deref_path path;
+      nir_deref_path_init(&path, deref, NULL);
+      assert(path.path[0] == base);
+
+      if (intrin->intrinsic == nir_intrinsic_store_deref) {
+         assert(intrin->src[1].is_ssa);
+         emit_load_store_deref(b, intrin, base, &path.path[1],
+                               NULL, intrin->src[1].ssa);
       } else {
-         assert(intrin->src[0].is_ssa);
-         emit_load_store(b, intrin, intrin->variables[0],
-                         &intrin->variables[0]->deref, NULL, intrin->src[0].ssa);
+         nir_ssa_def *result;
+         emit_load_store_deref(b, intrin, base, &path.path[1],
+                               &result, NULL);
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(result));
       }
-      nir_instr_remove(&intrin->instr);
+
+      nir_deref_path_finish(&path);
+
       progress = true;
    }
 
@@ -192,7 +186,7 @@
    bool progress = false;
 
    nir_foreach_block_safe(block, impl) {
-      progress |= lower_indirect_block(block, &builder, modes);
+      progress |= lower_indirect_derefs_block(block, &builder, modes);
    }
 
    if (progress)
@@ -211,6 +205,9 @@
 {
    bool progress = false;
 
+   if (modes == 0)
+      return false;
+
    nir_foreach_function(function, shader) {
       if (function->impl)
          progress = lower_indirects_impl(function->impl, modes) || progress;
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index df91feb..9500a30 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -33,8 +33,10 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 
 struct lower_io_state {
+   void *dead_ctx;
    nir_builder builder;
    int (*type_size)(const struct glsl_type *type);
    nir_variable_mode modes;
@@ -85,35 +87,35 @@
 }
 
 static nir_ssa_def *
-get_io_offset(nir_builder *b, nir_deref_var *deref,
+get_io_offset(nir_builder *b, nir_deref_instr *deref,
               nir_ssa_def **vertex_index,
               int (*type_size)(const struct glsl_type *),
               unsigned *component)
 {
-   nir_deref *tail = &deref->deref;
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+   nir_deref_instr **p = &path.path[1];
 
    /* For per-vertex input arrays (i.e. geometry shader inputs), keep the
     * outermost array index separate.  Process the rest normally.
     */
    if (vertex_index != NULL) {
-      tail = tail->child;
-      nir_deref_array *deref_array = nir_deref_as_array(tail);
-
-      nir_ssa_def *vtx = nir_imm_int(b, deref_array->base_offset);
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         vtx = nir_iadd(b, vtx, nir_ssa_for_src(b, deref_array->indirect, 1));
-      }
-      *vertex_index = vtx;
+      assert((*p)->deref_type == nir_deref_type_array);
+      *vertex_index = nir_ssa_for_src(b, (*p)->arr.index, 1);
+      p++;
    }
 
-   if (deref->var->data.compact) {
-      assert(tail->child->deref_type == nir_deref_type_array);
-      assert(glsl_type_is_scalar(glsl_without_array(deref->var->type)));
-      nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-      /* We always lower indirect dereferences for "compact" array vars. */
-      assert(deref_array->deref_array_type == nir_deref_array_type_direct);
+   if (path.path[0]->var->data.compact) {
+      assert((*p)->deref_type == nir_deref_type_array);
+      assert(glsl_type_is_scalar((*p)->type));
 
-      const unsigned total_offset = *component + deref_array->base_offset;
+      /* We always lower indirect dereferences for "compact" array vars. */
+      nir_const_value *const_index = nir_src_as_const_value((*p)->arr.index);
+      assert(const_index);
+      const unsigned total_offset = *component + const_index->u32[0];
+      nir_deref_path_finish(&path); /* this branch returns early; p is dead */
       const unsigned slot_offset = total_offset / 4;
       *component = total_offset % 4;
       return nir_imm_int(b, type_size(glsl_vec4_type()) * slot_offset);
@@ -122,45 +124,40 @@
    /* Just emit code and let constant-folding go to town */
    nir_ssa_def *offset = nir_imm_int(b, 0);
 
-   while (tail->child != NULL) {
-      const struct glsl_type *parent_type = tail->type;
-      tail = tail->child;
+   for (; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_array) {
+         unsigned size = type_size((*p)->type);
 
-      if (tail->deref_type == nir_deref_type_array) {
-         nir_deref_array *deref_array = nir_deref_as_array(tail);
-         unsigned size = type_size(tail->type);
+         nir_ssa_def *mul =
+            nir_imul(b, nir_imm_int(b, size),
+                     nir_ssa_for_src(b, (*p)->arr.index, 1));
 
-         offset = nir_iadd(b, offset,
-                           nir_imm_int(b, size * deref_array->base_offset));
-
-         if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-            nir_ssa_def *mul =
-               nir_imul(b, nir_imm_int(b, size),
-                        nir_ssa_for_src(b, deref_array->indirect, 1));
-
-            offset = nir_iadd(b, offset, mul);
-         }
-      } else if (tail->deref_type == nir_deref_type_struct) {
-         nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
+         offset = nir_iadd(b, offset, mul);
+      } else if ((*p)->deref_type == nir_deref_type_struct) {
+         /* p starts at path[1], so this is safe */
+         nir_deref_instr *parent = *(p - 1);
 
          unsigned field_offset = 0;
-         for (unsigned i = 0; i < deref_struct->index; i++) {
-            field_offset += type_size(glsl_get_struct_field(parent_type, i));
+         for (unsigned i = 0; i < (*p)->strct.index; i++) {
+            field_offset += type_size(glsl_get_struct_field(parent->type, i));
          }
          offset = nir_iadd(b, offset, nir_imm_int(b, field_offset));
+      } else {
+         unreachable("Unsupported deref type");
       }
    }
 
+   nir_deref_path_finish(&path);
+
    return offset;
 }
 
 static nir_intrinsic_instr *
 lower_load(nir_intrinsic_instr *intrin, struct lower_io_state *state,
-           nir_ssa_def *vertex_index, nir_ssa_def *offset,
+           nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset,
            unsigned component)
 {
    const nir_shader *nir = state->builder.shader;
-   nir_variable *var = intrin->variables[0]->var;
    nir_variable_mode mode = var->data.mode;
    nir_ssa_def *barycentric = NULL;
 
@@ -229,10 +226,9 @@
 
 static nir_intrinsic_instr *
 lower_store(nir_intrinsic_instr *intrin, struct lower_io_state *state,
-            nir_ssa_def *vertex_index, nir_ssa_def *offset,
+            nir_ssa_def *vertex_index, nir_variable *var, nir_ssa_def *offset,
             unsigned component)
 {
-   nir_variable *var = intrin->variables[0]->var;
    nir_variable_mode mode = var->data.mode;
 
    nir_intrinsic_op op;
@@ -248,7 +244,7 @@
       nir_intrinsic_instr_create(state->builder.shader, op);
    store->num_components = intrin->num_components;
 
-   nir_src_copy(&store->src[0], &intrin->src[0], store);
+   nir_src_copy(&store->src[0], &intrin->src[1], store);
 
    nir_intrinsic_set_base(store, var->data.driver_location);
 
@@ -267,15 +263,13 @@
 
 static nir_intrinsic_instr *
 lower_atomic(nir_intrinsic_instr *intrin, struct lower_io_state *state,
-             nir_ssa_def *offset)
+             nir_variable *var, nir_ssa_def *offset)
 {
-   nir_variable *var = intrin->variables[0]->var;
-
    assert(var->data.mode == nir_var_shared);
 
    nir_intrinsic_op op;
    switch (intrin->intrinsic) {
-#define OP(O) case nir_intrinsic_var_##O: op = nir_intrinsic_shared_##O; break;
+#define OP(O) case nir_intrinsic_deref_##O: op = nir_intrinsic_shared_##O; break;
    OP(atomic_exchange)
    OP(atomic_comp_swap)
    OP(atomic_add)
@@ -297,8 +291,10 @@
    nir_intrinsic_set_base(atomic, var->data.driver_location);
 
    atomic->src[0] = nir_src_for_ssa(offset);
-   for (unsigned i = 0; i < nir_intrinsic_infos[intrin->intrinsic].num_srcs; i++) {
-      nir_src_copy(&atomic->src[i+1], &intrin->src[i], atomic);
+   assert(nir_intrinsic_infos[intrin->intrinsic].num_srcs ==
+          nir_intrinsic_infos[op].num_srcs);
+   for (unsigned i = 1; i < nir_intrinsic_infos[op].num_srcs; i++) {
+      nir_src_copy(&atomic->src[i], &intrin->src[i], atomic);
    }
 
    return atomic;
@@ -306,27 +302,25 @@
 
 static nir_intrinsic_instr *
 lower_interpolate_at(nir_intrinsic_instr *intrin, struct lower_io_state *state,
-                     nir_ssa_def *offset, unsigned component)
+                     nir_variable *var, nir_ssa_def *offset, unsigned component)
 {
-   nir_variable *var = intrin->variables[0]->var;
-
    assert(var->data.mode == nir_var_shader_in);
 
    /* Ignore interpolateAt() for flat variables - flat is flat. */
    if (var->data.interpolation == INTERP_MODE_FLAT)
-      return lower_load(intrin, state, NULL, offset, component);
+      return lower_load(intrin, state, NULL, var, offset, component);
 
    nir_intrinsic_op bary_op;
    switch (intrin->intrinsic) {
-   case nir_intrinsic_interp_var_at_centroid:
+   case nir_intrinsic_interp_deref_at_centroid:
       bary_op = (state->options & nir_lower_io_force_sample_interpolation) ?
                 nir_intrinsic_load_barycentric_sample :
                 nir_intrinsic_load_barycentric_centroid;
       break;
-   case nir_intrinsic_interp_var_at_sample:
+   case nir_intrinsic_interp_deref_at_sample:
       bary_op = nir_intrinsic_load_barycentric_at_sample;
       break;
-   case nir_intrinsic_interp_var_at_offset:
+   case nir_intrinsic_interp_deref_at_offset:
       bary_op = nir_intrinsic_load_barycentric_at_offset;
       break;
    default:
@@ -339,8 +333,9 @@
    nir_ssa_dest_init(&bary_setup->instr, &bary_setup->dest, 2, 32, NULL);
    nir_intrinsic_set_interp_mode(bary_setup, var->data.interpolation);
 
-   if (intrin->intrinsic != nir_intrinsic_interp_var_at_centroid)
-      nir_src_copy(&bary_setup->src[0], &intrin->src[0], bary_setup);
+   if (intrin->intrinsic == nir_intrinsic_interp_deref_at_sample ||
+       intrin->intrinsic == nir_intrinsic_interp_deref_at_offset)
+      nir_src_copy(&bary_setup->src[0], &intrin->src[1], bary_setup);
 
    nir_builder_instr_insert(&state->builder, &bary_setup->instr);
 
@@ -373,23 +368,23 @@
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
       switch (intrin->intrinsic) {
-      case nir_intrinsic_load_var:
-      case nir_intrinsic_store_var:
-      case nir_intrinsic_var_atomic_add:
-      case nir_intrinsic_var_atomic_imin:
-      case nir_intrinsic_var_atomic_umin:
-      case nir_intrinsic_var_atomic_imax:
-      case nir_intrinsic_var_atomic_umax:
-      case nir_intrinsic_var_atomic_and:
-      case nir_intrinsic_var_atomic_or:
-      case nir_intrinsic_var_atomic_xor:
-      case nir_intrinsic_var_atomic_exchange:
-      case nir_intrinsic_var_atomic_comp_swap:
+      case nir_intrinsic_load_deref:
+      case nir_intrinsic_store_deref:
+      case nir_intrinsic_deref_atomic_add:
+      case nir_intrinsic_deref_atomic_imin:
+      case nir_intrinsic_deref_atomic_umin:
+      case nir_intrinsic_deref_atomic_imax:
+      case nir_intrinsic_deref_atomic_umax:
+      case nir_intrinsic_deref_atomic_and:
+      case nir_intrinsic_deref_atomic_or:
+      case nir_intrinsic_deref_atomic_xor:
+      case nir_intrinsic_deref_atomic_exchange:
+      case nir_intrinsic_deref_atomic_comp_swap:
          /* We can lower the io for this nir instrinsic */
          break;
-      case nir_intrinsic_interp_var_at_centroid:
-      case nir_intrinsic_interp_var_at_sample:
-      case nir_intrinsic_interp_var_at_offset:
+      case nir_intrinsic_interp_deref_at_centroid:
+      case nir_intrinsic_interp_deref_at_sample:
+      case nir_intrinsic_interp_deref_at_offset:
          /* We can optionally lower these to load_interpolated_input */
          if (options->use_interpolated_input_intrinsics)
             break;
@@ -398,7 +393,9 @@
          continue;
       }
 
-      nir_variable *var = intrin->variables[0]->var;
+      nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+
+      nir_variable *var = nir_deref_instr_get_variable(deref);
       nir_variable_mode mode = var->data.mode;
 
       if ((state->modes & mode) == 0)
@@ -418,42 +415,41 @@
       nir_ssa_def *vertex_index = NULL;
       unsigned component_offset = var->data.location_frac;
 
-      offset = get_io_offset(b, intrin->variables[0],
-                             per_vertex ? &vertex_index : NULL,
+      offset = get_io_offset(b, deref, per_vertex ? &vertex_index : NULL,
                              state->type_size, &component_offset);
 
       nir_intrinsic_instr *replacement;
 
       switch (intrin->intrinsic) {
-      case nir_intrinsic_load_var:
-         replacement = lower_load(intrin, state, vertex_index, offset,
+      case nir_intrinsic_load_deref:
+         replacement = lower_load(intrin, state, vertex_index, var, offset,
                                   component_offset);
          break;
 
-      case nir_intrinsic_store_var:
-         replacement = lower_store(intrin, state, vertex_index, offset,
+      case nir_intrinsic_store_deref:
+         replacement = lower_store(intrin, state, vertex_index, var, offset,
                                    component_offset);
          break;
 
-      case nir_intrinsic_var_atomic_add:
-      case nir_intrinsic_var_atomic_imin:
-      case nir_intrinsic_var_atomic_umin:
-      case nir_intrinsic_var_atomic_imax:
-      case nir_intrinsic_var_atomic_umax:
-      case nir_intrinsic_var_atomic_and:
-      case nir_intrinsic_var_atomic_or:
-      case nir_intrinsic_var_atomic_xor:
-      case nir_intrinsic_var_atomic_exchange:
-      case nir_intrinsic_var_atomic_comp_swap:
+      case nir_intrinsic_deref_atomic_add:
+      case nir_intrinsic_deref_atomic_imin:
+      case nir_intrinsic_deref_atomic_umin:
+      case nir_intrinsic_deref_atomic_imax:
+      case nir_intrinsic_deref_atomic_umax:
+      case nir_intrinsic_deref_atomic_and:
+      case nir_intrinsic_deref_atomic_or:
+      case nir_intrinsic_deref_atomic_xor:
+      case nir_intrinsic_deref_atomic_exchange:
+      case nir_intrinsic_deref_atomic_comp_swap:
          assert(vertex_index == NULL);
-         replacement = lower_atomic(intrin, state, offset);
+         replacement = lower_atomic(intrin, state, var, offset);
          break;
 
-      case nir_intrinsic_interp_var_at_centroid:
-      case nir_intrinsic_interp_var_at_sample:
-      case nir_intrinsic_interp_var_at_offset:
+      case nir_intrinsic_interp_deref_at_centroid:
+      case nir_intrinsic_interp_deref_at_sample:
+      case nir_intrinsic_interp_deref_at_offset:
          assert(vertex_index == NULL);
-         replacement = lower_interpolate_at(intrin, state, offset,
+         replacement = lower_interpolate_at(intrin, state, var, offset,
                                             component_offset);
          break;
 
@@ -491,6 +487,7 @@
    bool progress = false;
 
    nir_builder_init(&state.builder, impl);
+   state.dead_ctx = ralloc_context(NULL);
    state.modes = modes;
    state.type_size = type_size;
    state.options = options;
@@ -499,6 +496,8 @@
       progress |= nir_lower_io_block(block, &state);
    }
 
+   ralloc_free(state.dead_ctx);
+
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
    return progress;
diff --git a/src/compiler/nir/nir_lower_io_arrays_to_elements.c b/src/compiler/nir/nir_lower_io_arrays_to_elements.c
index 9a5eec8..16f6233 100644
--- a/src/compiler/nir/nir_lower_io_arrays_to_elements.c
+++ b/src/compiler/nir/nir_lower_io_arrays_to_elements.c
@@ -23,6 +23,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 
 /** @file nir_lower_io_arrays_to_elements.c
  *
@@ -32,45 +33,51 @@
  */
 
 static unsigned
-get_io_offset(nir_builder *b, nir_deref_var *deref, nir_variable *var,
-              unsigned *element_index)
+get_io_offset(nir_builder *b, nir_deref_instr *deref, nir_variable *var,
+              unsigned *element_index, nir_ssa_def **vertex_index)
 {
    bool vs_in = (b->shader->info.stage == MESA_SHADER_VERTEX) &&
                 (var->data.mode == nir_var_shader_in);
 
-   nir_deref *tail = &deref->deref;
+   nir_deref_path path;
+   nir_deref_path_init(&path, deref, NULL);
+
+   assert(path.path[0]->deref_type == nir_deref_type_var);
+   nir_deref_instr **p = &path.path[1];
 
    /* For per-vertex input arrays (i.e. geometry shader inputs), skip the
     * outermost array index.  Process the rest normally.
     */
    if (nir_is_per_vertex_io(var, b->shader->info.stage)) {
-      tail = tail->child;
+      *vertex_index = nir_ssa_for_src(b, (*p)->arr.index, 1);
+      p++;
    }
 
    unsigned offset = 0;
-   while (tail->child != NULL) {
-      tail = tail->child;
+   for (; *p; p++) {
+      if ((*p)->deref_type == nir_deref_type_array) {
+         nir_const_value *c = nir_src_as_const_value((*p)->arr.index);
 
-      if (tail->deref_type == nir_deref_type_array) {
-         nir_deref_array *deref_array = nir_deref_as_array(tail);
-         assert(deref_array->deref_array_type != nir_deref_array_type_indirect);
+         assert(c);     /* must not be indirect dereference */
 
-         unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
-         offset += size * deref_array->base_offset;
+         unsigned size = glsl_count_attribute_slots((*p)->type, vs_in);
+         offset += size * c->u32[0];
 
-         unsigned num_elements = glsl_type_is_array(tail->type) ?
-            glsl_get_aoa_size(tail->type) : 1;
+         unsigned num_elements = glsl_type_is_array((*p)->type) ?
+            glsl_get_aoa_size((*p)->type) : 1;
 
-         num_elements *= glsl_type_is_matrix(glsl_without_array(tail->type)) ?
-            glsl_get_matrix_columns(glsl_without_array(tail->type)) : 1;
+         num_elements *= glsl_type_is_matrix(glsl_without_array((*p)->type)) ?
+            glsl_get_matrix_columns(glsl_without_array((*p)->type)) : 1;
 
-         *element_index += num_elements * deref_array->base_offset;
-      } else if (tail->deref_type == nir_deref_type_struct) {
+         *element_index += num_elements * c->u32[0];
+      } else if ((*p)->deref_type == nir_deref_type_struct) {
          /* TODO: we could also add struct splitting support to this pass */
          break;
       }
    }
 
+   nir_deref_path_finish(&path);
+
    return offset;
 }
 
@@ -103,27 +110,6 @@
 }
 
 static void
-create_array_deref(nir_intrinsic_instr *arr_intr,
-                   nir_intrinsic_instr *element_intr)
-{
-   assert(arr_intr->variables[0]->deref.child);
-
-   nir_deref *parent = &element_intr->variables[0]->deref;
-   nir_deref_array *darr =
-            nir_deref_as_array(arr_intr->variables[0]->deref.child);
-   nir_deref_array *ndarr = nir_deref_array_create(parent);
-
-   ndarr->deref.type = glsl_get_array_element(parent->type);
-   ndarr->deref_array_type = darr->deref_array_type;
-   ndarr->base_offset = darr->base_offset;
-
-   if (ndarr->deref_array_type == nir_deref_array_type_indirect)
-      nir_src_copy(&ndarr->indirect, &darr->indirect, parent);
-
-   element_intr->variables[0]->deref.child = &ndarr->deref;
-}
-
-static void
 lower_array(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var,
             struct hash_table *varyings)
 {
@@ -132,9 +118,10 @@
    nir_variable **elements =
       get_array_elements(varyings, var, b->shader->info.stage);
 
+   nir_ssa_def *vertex_index = NULL;
    unsigned elements_index = 0;
-   unsigned io_offset = get_io_offset(b, intr->variables[0], var,
-                                      &elements_index);
+   unsigned io_offset = get_io_offset(b, nir_src_as_deref(intr->src[0]),
+                                      var, &elements_index, &vertex_index);
 
    nir_variable *element = elements[elements_index];
    if (!element) {
@@ -160,18 +147,25 @@
          nir_shader_add_variable(b->shader, element);
    }
 
+   nir_deref_instr *element_deref = nir_build_deref_var(b, element);
+
+   if (nir_is_per_vertex_io(var, b->shader->info.stage)) {
+      assert(vertex_index);
+      element_deref = nir_build_deref_array(b, element_deref, vertex_index);
+   }
+
    nir_intrinsic_instr *element_intr =
       nir_intrinsic_instr_create(b->shader, intr->intrinsic);
    element_intr->num_components = intr->num_components;
-   element_intr->variables[0] = nir_deref_var_create(element_intr, element);
+   element_intr->src[0] = nir_src_for_ssa(&element_deref->dest.ssa);
 
-   if (intr->intrinsic != nir_intrinsic_store_var) {
+   if (intr->intrinsic != nir_intrinsic_store_deref) {
       nir_ssa_dest_init(&element_intr->instr, &element_intr->dest,
                         intr->num_components, intr->dest.ssa.bit_size, NULL);
 
-      if (intr->intrinsic == nir_intrinsic_interp_var_at_offset ||
-          intr->intrinsic == nir_intrinsic_interp_var_at_sample) {
-         nir_src_copy(&element_intr->src[0], &intr->src[0],
+      if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset ||
+          intr->intrinsic == nir_intrinsic_interp_deref_at_sample) {
+         nir_src_copy(&element_intr->src[1], &intr->src[1],
                       &element_intr->instr);
       }
 
@@ -180,14 +174,10 @@
    } else {
       nir_intrinsic_set_write_mask(element_intr,
                                    nir_intrinsic_write_mask(intr));
-      nir_src_copy(&element_intr->src[0], &intr->src[0],
+      nir_src_copy(&element_intr->src[1], &intr->src[1],
                    &element_intr->instr);
    }
 
-   if (nir_is_per_vertex_io(var, b->shader->info.stage)) {
-      create_array_deref(intr, element_intr);
-   }
-
    nir_builder_instr_insert(b, &element_intr->instr);
 
    /* Remove the old load intrinsic */
@@ -195,20 +185,20 @@
 }
 
 static bool
-deref_has_indirect(nir_builder *b, nir_variable *var, nir_deref_var *deref)
+deref_has_indirect(nir_builder *b, nir_variable *var, nir_deref_path *path)
 {
-   nir_deref *tail = &deref->deref;
+   assert(path->path[0]->deref_type == nir_deref_type_var);
+   nir_deref_instr **p = &path->path[1];
 
    if (nir_is_per_vertex_io(var, b->shader->info.stage)) {
-      tail = tail->child;
+      p++;
    }
 
-   for (tail = tail->child; tail; tail = tail->child) {
-      if (tail->deref_type != nir_deref_type_array)
+   for (; *p; p++) {
+      if ((*p)->deref_type != nir_deref_type_array)
          continue;
 
-      nir_deref_array *arr = nir_deref_as_array(tail);
-      if (arr->deref_array_type == nir_deref_array_type_indirect)
+      if (!nir_src_as_const_value((*p)->arr.index))
          return true;
    }
 
@@ -235,26 +225,32 @@
 
                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-               if (intr->intrinsic != nir_intrinsic_load_var &&
-                   intr->intrinsic != nir_intrinsic_store_var &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_centroid &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_sample &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_offset)
+               if (intr->intrinsic != nir_intrinsic_load_deref &&
+                   intr->intrinsic != nir_intrinsic_store_deref &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_sample &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_offset)
                   continue;
 
-               nir_variable *var = intr->variables[0]->var;
+               nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+               nir_variable *var = nir_deref_instr_get_variable(deref);
 
                if (var->data.mode != mode)
                   continue;
 
+               nir_deref_path path;
+               nir_deref_path_init(&path, deref, NULL);
+
                uint64_t loc_mask = ((uint64_t)1) << var->data.location;
                if (var->data.patch) {
-                  if (deref_has_indirect(&b, var, intr->variables[0]))
+                  if (deref_has_indirect(&b, var, &path))
                      patch_indirects[var->data.location_frac] |= loc_mask;
                } else {
-                  if (deref_has_indirect(&b, var, intr->variables[0]))
+                  if (deref_has_indirect(&b, var, &path))
                      indirects[var->data.location_frac] |= loc_mask;
                }
+
+               nir_deref_path_finish(&path);
             }
          }
       }
@@ -279,14 +275,15 @@
 
                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
 
-               if (intr->intrinsic != nir_intrinsic_load_var &&
-                   intr->intrinsic != nir_intrinsic_store_var &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_centroid &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_sample &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_offset)
+               if (intr->intrinsic != nir_intrinsic_load_deref &&
+                   intr->intrinsic != nir_intrinsic_store_deref &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_sample &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_offset)
                   continue;
 
-               nir_variable *var = intr->variables[0]->var;
+               nir_variable *var =
+                  nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
 
                /* Skip indirects */
                uint64_t loc_mask = ((uint64_t)1) << var->data.location;
@@ -327,11 +324,11 @@
                   continue;
 
                switch (intr->intrinsic) {
-               case nir_intrinsic_interp_var_at_centroid:
-               case nir_intrinsic_interp_var_at_sample:
-               case nir_intrinsic_interp_var_at_offset:
-               case nir_intrinsic_load_var:
-               case nir_intrinsic_store_var:
+               case nir_intrinsic_interp_deref_at_centroid:
+               case nir_intrinsic_interp_deref_at_sample:
+               case nir_intrinsic_interp_deref_at_offset:
+               case nir_intrinsic_load_deref:
+               case nir_intrinsic_store_deref:
                   if ((mask & nir_var_shader_in && mode == nir_var_shader_in) ||
                       (mask & nir_var_shader_out && mode == nir_var_shader_out))
                      lower_array(&b, intr, var, varyings);
@@ -386,6 +383,8 @@
 
    _mesa_hash_table_destroy(split_inputs, NULL);
    _mesa_hash_table_destroy(split_outputs, NULL);
+
+   nir_remove_dead_derefs(shader);
 }
 
 void
@@ -429,4 +428,7 @@
 
    _mesa_hash_table_destroy(split_inputs, NULL);
    _mesa_hash_table_destroy(split_outputs, NULL);
+
+   nir_remove_dead_derefs(producer);
+   nir_remove_dead_derefs(consumer);
 }
diff --git a/src/compiler/nir/nir_lower_io_to_scalar.c b/src/compiler/nir/nir_lower_io_to_scalar.c
index 7774c2d..f0c2a6a 100644
--- a/src/compiler/nir/nir_lower_io_to_scalar.c
+++ b/src/compiler/nir/nir_lower_io_to_scalar.c
@@ -23,6 +23,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 
 /** @file nir_lower_io_to_scalar.c
  *
@@ -37,7 +38,7 @@
 
    assert(intr->dest.is_ssa);
 
-   nir_ssa_def *loads[4];
+   nir_ssa_def *loads[NIR_MAX_VEC_COMPONENTS];
 
    for (unsigned i = 0; i < intr->num_components; i++) {
       nir_intrinsic_instr *chan_intr =
@@ -144,28 +145,27 @@
 }
 
 /*
- * This function differs from nir_deref_clone() in that it gets its type from
- * the parent deref rather than our source deref. This is useful when splitting
- * vectors because we want to use the scalar type of the new parent rather than
- * then the old vector type.
+ * Note that the src deref that we are cloning is the head of the
+ * chain of deref instructions from the original intrinsic, but
+ * the dst we are cloning to is the tail (because chains of deref
+ * instructions are created back to front)
  */
-static nir_deref_array *
-clone_deref_array(const nir_deref_array *darr, nir_deref *parent)
+
+static nir_deref_instr *
+clone_deref_array(nir_builder *b, nir_deref_instr *dst_tail,
+                  const nir_deref_instr *src_head)
 {
-   nir_deref_array *ndarr = nir_deref_array_create(parent);
+   const nir_deref_instr *parent = nir_deref_instr_parent(src_head);
 
-   ndarr->deref.type = glsl_get_array_element(parent->type);
-   if (darr->deref.child)
-      ndarr->deref.child =
-         &clone_deref_array(nir_deref_as_array(darr->deref.child),
-                            &ndarr->deref)->deref;
+   if (!parent)
+      return dst_tail;
 
-   ndarr->deref_array_type = darr->deref_array_type;
-   ndarr->base_offset = darr->base_offset;
-   if (ndarr->deref_array_type == nir_deref_array_type_indirect)
-     nir_src_copy(&ndarr->indirect, &darr->indirect, parent);
+   assert(src_head->deref_type == nir_deref_type_array);
 
-   return ndarr;
+   dst_tail = clone_deref_array(b, dst_tail, parent);
+
+   return nir_build_deref_array(b, dst_tail,
+                                nir_ssa_for_src(b, src_head->arr.index, 1));
 }
 
 static void
@@ -177,7 +177,7 @@
 
    assert(intr->dest.is_ssa);
 
-   nir_ssa_def *loads[4];
+   nir_ssa_def *loads[NIR_MAX_VEC_COMPONENTS];
 
    nir_variable **chan_vars;
    if (var->data.mode == nir_var_shader_in) {
@@ -203,17 +203,16 @@
       nir_ssa_dest_init(&chan_intr->instr, &chan_intr->dest,
                         1, intr->dest.ssa.bit_size, NULL);
       chan_intr->num_components = 1;
-      chan_intr->variables[0] = nir_deref_var_create(chan_intr, chan_var);
 
-      if (intr->variables[0]->deref.child) {
-         chan_intr->variables[0]->deref.child =
-            &clone_deref_array(nir_deref_as_array(intr->variables[0]->deref.child),
-                               &chan_intr->variables[0]->deref)->deref;
-      }
+      nir_deref_instr *deref = nir_build_deref_var(b, chan_var);
 
-      if (intr->intrinsic == nir_intrinsic_interp_var_at_offset ||
-          intr->intrinsic == nir_intrinsic_interp_var_at_sample)
-         nir_src_copy(chan_intr->src, intr->src, &chan_intr->instr);
+      deref = clone_deref_array(b, deref, nir_src_as_deref(intr->src[0]));
+
+      chan_intr->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+
+      if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset ||
+          intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
+         nir_src_copy(&chan_intr->src[1], &intr->src[1], &chan_intr->instr);
 
       nir_builder_instr_insert(b, &chan_intr->instr);
 
@@ -235,7 +234,7 @@
 {
    b->cursor = nir_before_instr(&intr->instr);
 
-   nir_ssa_def *value = nir_ssa_for_src(b, intr->src[0], intr->num_components);
+   nir_ssa_def *value = nir_ssa_for_src(b, intr->src[1], intr->num_components);
 
    nir_variable **chan_vars = get_channel_variables(split_outputs, var);
    for (unsigned i = 0; i < intr->num_components; i++) {
@@ -259,14 +258,12 @@
 
       nir_intrinsic_set_write_mask(chan_intr, 0x1);
 
-      chan_intr->variables[0] = nir_deref_var_create(chan_intr, chan_var);
-      chan_intr->src[0] = nir_src_for_ssa(nir_channel(b, value, i));
+      nir_deref_instr *deref = nir_build_deref_var(b, chan_var);
 
-      if (intr->variables[0]->deref.child) {
-         chan_intr->variables[0]->deref.child =
-            &clone_deref_array(nir_deref_as_array(intr->variables[0]->deref.child),
-                               &chan_intr->variables[0]->deref)->deref;
-      }
+      deref = clone_deref_array(b, deref, nir_src_as_deref(intr->src[0]));
+
+      chan_intr->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+      chan_intr->src[1] = nir_src_for_ssa(nir_channel(b, value, i));
 
       nir_builder_instr_insert(b, &chan_intr->instr);
    }
@@ -304,14 +301,15 @@
                if (intr->num_components == 1)
                   continue;
 
-               if (intr->intrinsic != nir_intrinsic_load_var &&
-                   intr->intrinsic != nir_intrinsic_store_var &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_centroid &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_sample &&
-                   intr->intrinsic != nir_intrinsic_interp_var_at_offset)
+               if (intr->intrinsic != nir_intrinsic_load_deref &&
+                   intr->intrinsic != nir_intrinsic_store_deref &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_centroid &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_sample &&
+                   intr->intrinsic != nir_intrinsic_interp_deref_at_offset)
                   continue;
 
-               nir_variable *var = intr->variables[0]->var;
+               nir_variable *var =
+                  nir_deref_instr_get_variable(nir_src_as_deref(intr->src[0]));
                nir_variable_mode mode = var->data.mode;
 
                /* TODO: add patch support */
@@ -338,16 +336,16 @@
                  continue;
 
                switch (intr->intrinsic) {
-               case nir_intrinsic_interp_var_at_centroid:
-               case nir_intrinsic_interp_var_at_sample:
-               case nir_intrinsic_interp_var_at_offset:
-               case nir_intrinsic_load_var:
+               case nir_intrinsic_interp_deref_at_centroid:
+               case nir_intrinsic_interp_deref_at_sample:
+               case nir_intrinsic_interp_deref_at_offset:
+               case nir_intrinsic_load_deref:
                   if ((mask & nir_var_shader_in && mode == nir_var_shader_in) ||
                       (mask & nir_var_shader_out && mode == nir_var_shader_out))
                      lower_load_to_scalar_early(&b, intr, var, split_inputs,
                                                 split_outputs);
                   break;
-               case nir_intrinsic_store_var:
+               case nir_intrinsic_store_deref:
                   if (mask & nir_var_shader_out &&
                       mode == nir_var_shader_out)
                      lower_store_output_to_scalar_early(&b, intr, var,
@@ -380,4 +378,6 @@
 
    _mesa_hash_table_destroy(split_inputs, NULL);
    _mesa_hash_table_destroy(split_outputs, NULL);
+
+   nir_remove_dead_derefs(shader);
 }
diff --git a/src/compiler/nir/nir_lower_io_to_temporaries.c b/src/compiler/nir/nir_lower_io_to_temporaries.c
index 3dae522..d93e20e 100644
--- a/src/compiler/nir/nir_lower_io_to_temporaries.c
+++ b/src/compiler/nir/nir_lower_io_to_temporaries.c
@@ -195,4 +195,6 @@
 
    exec_list_append(&shader->globals, &state.old_inputs);
    exec_list_append(&shader->globals, &state.old_outputs);
+
+   nir_fixup_deref_modes(shader);
 }
diff --git a/src/compiler/nir/nir_lower_io_types.c b/src/compiler/nir/nir_lower_io_types.c
deleted file mode 100644
index 795bbd8..0000000
--- a/src/compiler/nir/nir_lower_io_types.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright © 2016 Red Hat
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-
-/* Lower complex (struct/array/mat) input and output vars to primitive types
- * (vec4) for linking.  All indirect input/output access should already be
- * lowered (ie. nir_lower_io_to_temporaries).
- */
-
-struct lower_io_types_state {
-   nir_shader *shader;
-   struct exec_list new_ins;
-   struct exec_list new_outs;
-};
-
-static nir_variable *
-get_new_var(struct lower_io_types_state *state, nir_variable *var,
-            const struct glsl_type *deref_type, unsigned off)
-{
-   struct exec_list *list;
-
-   if (var->data.mode == nir_var_shader_in) {
-      list = &state->new_ins;
-   } else {
-      assert(var->data.mode == nir_var_shader_out);
-      list = &state->new_outs;
-   }
-
-   nir_foreach_variable(nvar, list) {
-      if (nvar->data.location == (var->data.location + off))
-         return nvar;
-   }
-
-   /* doesn't already exist, so we need to create a new one: */
-   /* TODO figure out if we need to fixup interpolation mode for int vs float
-    * components of a struct, etc..
-    */
-   nir_variable *nvar = nir_variable_create(state->shader, var->data.mode,
-                                            deref_type, NULL);
-
-   nvar->name = ralloc_asprintf(nvar, "%s@%u", var->name, off);
-   nvar->data = var->data;
-   nvar->data.location += off;
-
-   /* nir_variable_create is too clever for its own good: */
-   exec_node_remove(&nvar->node);
-   exec_node_self_link(&nvar->node);      /* no delinit() :-( */
-
-   exec_list_push_tail(list, &nvar->node);
-
-   /* remove existing var from input/output list: */
-   exec_node_remove(&var->node);
-   exec_node_self_link(&var->node);
-
-   return nvar;
-}
-
-static unsigned
-get_deref_offset(struct lower_io_types_state *state, nir_deref *tail, bool vs_in)
-{
-   unsigned offset = 0;
-
-   while (tail->child != NULL) {
-      const struct glsl_type *parent_type = tail->type;
-      tail = tail->child;
-
-      if (tail->deref_type == nir_deref_type_array) {
-         nir_deref_array *deref_array = nir_deref_as_array(tail);
-
-         /* indirect inputs/outputs should already be lowered! */
-         assert(deref_array->deref_array_type == nir_deref_array_type_direct);
-
-         unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
-
-         offset += size * deref_array->base_offset;
-      } else if (tail->deref_type == nir_deref_type_struct) {
-         nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
-
-         for (unsigned i = 0; i < deref_struct->index; i++) {
-            const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
-            offset += glsl_count_attribute_slots(ft, vs_in);
-         }
-      }
-   }
-
-   return offset;
-}
-
-static bool
-lower_io_types_block(struct lower_io_types_state *state, nir_block *block)
-{
-   nir_foreach_instr(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-
-      if ((intr->intrinsic != nir_intrinsic_load_var) &&
-          (intr->intrinsic != nir_intrinsic_store_var))
-         continue;
-
-      nir_variable *var = intr->variables[0]->var;
-
-      if ((var->data.mode != nir_var_shader_in) &&
-          (var->data.mode != nir_var_shader_out))
-         continue;
-
-      bool vs_in = (state->shader->info.stage == MESA_SHADER_VERTEX) &&
-                   (var->data.mode == nir_var_shader_in);
-      if (glsl_count_attribute_slots(var->type, vs_in) == 1)
-         continue;
-
-      unsigned off = get_deref_offset(state, &intr->variables[0]->deref, vs_in);
-      const struct glsl_type *deref_type =
-         nir_deref_tail(&intr->variables[0]->deref)->type;
-      nir_variable *nvar = get_new_var(state, var, deref_type, off);
-
-      /* and then re-write the load/store_var deref: */
-      intr->variables[0] = nir_deref_var_create(intr, nvar);
-   }
-
-   return true;
-}
-
-static void
-lower_io_types_impl(nir_function_impl *impl, struct lower_io_types_state *state)
-{
-   nir_foreach_block(block, impl) {
-      lower_io_types_block(state, block);
-   }
-
-   nir_metadata_preserve(impl, nir_metadata_block_index |
-                               nir_metadata_dominance);
-}
-
-
-void
-nir_lower_io_types(nir_shader *shader)
-{
-   struct lower_io_types_state state;
-
-   state.shader = shader;
-   exec_list_make_empty(&state.new_ins);
-   exec_list_make_empty(&state.new_outs);
-
-   nir_foreach_function(function, shader) {
-      if (function->impl)
-         lower_io_types_impl(function->impl, &state);
-   }
-
-   /* move new in/out vars to shader's lists: */
-   exec_list_append(&shader->inputs, &state.new_ins);
-   exec_list_append(&shader->outputs, &state.new_outs);
-}
diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c
index 39447d4..b2e055f 100644
--- a/src/compiler/nir/nir_lower_load_const_to_scalar.c
+++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c
@@ -46,7 +46,7 @@
    b.cursor = nir_before_instr(&lower->instr);
 
    /* Emit the individual loads. */
-   nir_ssa_def *loads[4];
+   nir_ssa_def *loads[NIR_MAX_VEC_COMPONENTS];
    for (unsigned i = 0; i < lower->def.num_components; i++) {
       nir_load_const_instr *load_comp =
          nir_load_const_instr_create(b.shader, 1, lower->def.bit_size);
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index d0667bc..ec71f3d 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -26,10 +26,10 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
 
 struct locals_to_regs_state {
-   nir_shader *shader;
-   nir_function_impl *impl;
+   nir_builder builder;
 
    /* A hash table mapping derefs to registers */
    struct hash_table *regs_table;
@@ -47,54 +47,61 @@
 {
    uint32_t hash = _mesa_fnv32_1a_offset_bias;
 
-   const nir_deref_var *deref_var = void_deref;
-   hash = _mesa_fnv32_1a_accumulate(hash, deref_var->var);
+   for (const nir_deref_instr *deref = void_deref; deref;
+        deref = nir_deref_instr_parent(deref)) {
+      switch (deref->deref_type) {
+      case nir_deref_type_var:
+         return _mesa_fnv32_1a_accumulate(hash, deref->var);
 
-   for (const nir_deref *deref = deref_var->deref.child;
-        deref; deref = deref->child) {
-      if (deref->deref_type == nir_deref_type_struct) {
-         const nir_deref_struct *deref_struct = nir_deref_as_struct(deref);
-         hash = _mesa_fnv32_1a_accumulate(hash, deref_struct->index);
+      case nir_deref_type_array:
+         continue; /* Do nothing */
+
+      case nir_deref_type_struct:
+         hash = _mesa_fnv32_1a_accumulate(hash, deref->strct.index);
+         continue;
+
+      default:
+         unreachable("Invalid deref type");
       }
    }
 
-   return hash;
+   unreachable("We should have hit a variable dereference");
 }
 
 static bool
 derefs_equal(const void *void_a, const void *void_b)
 {
-   const nir_deref_var *a_var = void_a;
-   const nir_deref_var *b_var = void_b;
-
-   if (a_var->var != b_var->var)
-      return false;
-
-   for (const nir_deref *a = a_var->deref.child, *b = b_var->deref.child;
-        a != NULL; a = a->child, b = b->child) {
+   for (const nir_deref_instr *a = void_a, *b = void_b; a || b;
+        a = nir_deref_instr_parent(a), b = nir_deref_instr_parent(b)) {
       if (a->deref_type != b->deref_type)
          return false;
 
-      if (a->deref_type == nir_deref_type_struct) {
-         if (nir_deref_as_struct(a)->index != nir_deref_as_struct(b)->index)
-            return false;
-      }
-      /* Do nothing for arrays.  They're all the same. */
+      switch (a->deref_type) {
+      case nir_deref_type_var:
+         return a->var == b->var;
 
-      assert((a->child == NULL) == (b->child == NULL));
-      if((a->child == NULL) != (b->child == NULL))
-         return false;
+      case nir_deref_type_array:
+         continue; /* Do nothing */
+
+      case nir_deref_type_struct:
+         if (a->strct.index != b->strct.index)
+            return false;
+         continue;
+
+      default:
+         unreachable("Invalid deref type");
+      }
    }
 
-   return true;
+   unreachable("We should have hit a variable dereference");
 }
 
 static nir_register *
-get_reg_for_deref(nir_deref_var *deref, struct locals_to_regs_state *state)
+get_reg_for_deref(nir_deref_instr *deref, struct locals_to_regs_state *state)
 {
    uint32_t hash = hash_deref(deref);
 
-   assert(deref->var->constant_initializer == NULL);
+   assert(nir_deref_instr_get_variable(deref)->constant_initializer == NULL);
 
    struct hash_entry *entry =
       _mesa_hash_table_search_pre_hashed(state->regs_table, hash, deref);
@@ -102,19 +109,17 @@
       return entry->data;
 
    unsigned array_size = 1;
-   nir_deref *tail = &deref->deref;
-   while (tail->child) {
-      if (tail->child->deref_type == nir_deref_type_array)
-         array_size *= glsl_get_length(tail->type);
-      tail = tail->child;
+   for (nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
+      if (d->deref_type == nir_deref_type_array)
+         array_size *= glsl_get_length(nir_deref_instr_parent(d)->type);
    }
 
-   assert(glsl_type_is_vector(tail->type) || glsl_type_is_scalar(tail->type));
+   assert(glsl_type_is_vector_or_scalar(deref->type));
 
-   nir_register *reg = nir_local_reg_create(state->impl);
-   reg->num_components = glsl_get_vector_elements(tail->type);
+   nir_register *reg = nir_local_reg_create(state->builder.impl);
+   reg->num_components = glsl_get_vector_elements(deref->type);
    reg->num_array_elems = array_size > 1 ? array_size : 0;
-   reg->bit_size = glsl_get_bit_size(tail->type);
+   reg->bit_size = glsl_get_bit_size(deref->type);
 
    _mesa_hash_table_insert_pre_hashed(state->regs_table, hash, deref, reg);
 
@@ -122,9 +127,10 @@
 }
 
 static nir_src
-get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
-                  struct locals_to_regs_state *state)
+get_deref_reg_src(nir_deref_instr *deref, struct locals_to_regs_state *state)
 {
+   nir_builder *b = &state->builder;
+
    nir_src src;
 
    src.is_ssa = false;
@@ -140,55 +146,32 @@
    if (src.reg.reg->num_array_elems == 0)
       return src;
 
-   nir_deref *tail = &deref->deref;
-   while (tail->child != NULL) {
-      const struct glsl_type *parent_type = tail->type;
-      tail = tail->child;
-
-      if (tail->deref_type != nir_deref_type_array)
+   unsigned inner_array_size = 1;
+   for (const nir_deref_instr *d = deref; d; d = nir_deref_instr_parent(d)) {
+      if (d->deref_type != nir_deref_type_array)
          continue;
 
-      nir_deref_array *deref_array = nir_deref_as_array(tail);
-
-      src.reg.base_offset *= glsl_get_length(parent_type);
-      src.reg.base_offset += deref_array->base_offset;
-
-      if (src.reg.indirect) {
-         nir_load_const_instr *load_const =
-            nir_load_const_instr_create(state->shader, 1, 32);
-         load_const->value.u32[0] = glsl_get_length(parent_type);
-         nir_instr_insert_before(instr, &load_const->instr);
-
-         nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul);
-         mul->src[0].src = *src.reg.indirect;
-         mul->src[1].src.is_ssa = true;
-         mul->src[1].src.ssa = &load_const->def;
-         mul->dest.write_mask = 1;
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
-         nir_instr_insert_before(instr, &mul->instr);
-
-         src.reg.indirect->is_ssa = true;
-         src.reg.indirect->ssa = &mul->dest.dest.ssa;
-      }
-
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         if (src.reg.indirect == NULL) {
-            src.reg.indirect = ralloc(state->shader, nir_src);
-            nir_src_copy(src.reg.indirect, &deref_array->indirect,
-                         state->shader);
+      nir_const_value *const_index = nir_src_as_const_value(d->arr.index);
+      if (const_index && !src.reg.indirect) {
+         src.reg.base_offset += const_index->u32[0] * inner_array_size;
+      } else {
+         if (src.reg.indirect) {
+            assert(src.reg.base_offset == 0);
          } else {
-            nir_alu_instr *add = nir_alu_instr_create(state->shader,
-                                                      nir_op_iadd);
-            add->src[0].src = *src.reg.indirect;
-            nir_src_copy(&add->src[1].src, &deref_array->indirect, add);
-            add->dest.write_mask = 1;
-            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
-            nir_instr_insert_before(instr, &add->instr);
-
-            src.reg.indirect->is_ssa = true;
-            src.reg.indirect->ssa = &add->dest.dest.ssa;
+            src.reg.indirect = ralloc(b->shader, nir_src);
+            *src.reg.indirect =
+               nir_src_for_ssa(nir_imm_int(b, src.reg.base_offset));
+            src.reg.base_offset = 0;
          }
+
+         assert(src.reg.indirect->is_ssa);
+         src.reg.indirect->ssa =
+            nir_iadd(b, src.reg.indirect->ssa,
+                        nir_imul(b, nir_ssa_for_src(b, d->arr.index, 1),
+                                    nir_imm_int(b, inner_array_size)));
       }
+
+      inner_array_size *= glsl_get_length(nir_deref_instr_parent(d)->type);
    }
 
    return src;
@@ -198,6 +181,8 @@
 lower_locals_to_regs_block(nir_block *block,
                            struct locals_to_regs_state *state)
 {
+   nir_builder *b = &state->builder;
+
    nir_foreach_instr_safe(instr, block) {
       if (instr->type != nir_instr_type_intrinsic)
          continue;
@@ -205,13 +190,15 @@
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
       switch (intrin->intrinsic) {
-      case nir_intrinsic_load_var: {
-         if (intrin->variables[0]->var->data.mode != nir_var_local)
+      case nir_intrinsic_load_deref: {
+         nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+         if (deref->mode != nir_var_local)
             continue;
 
-         nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov);
-         mov->src[0].src = get_deref_reg_src(intrin->variables[0],
-                                             &intrin->instr, state);
+         b->cursor = nir_before_instr(&intrin->instr);
+
+         nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_imov);
+         mov->src[0].src = get_deref_reg_src(deref, state);
          mov->dest.write_mask = (1 << intrin->num_components) - 1;
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
@@ -222,36 +209,38 @@
          } else {
             nir_dest_copy(&mov->dest.dest, &intrin->dest, &mov->instr);
          }
-         nir_instr_insert_before(&intrin->instr, &mov->instr);
+         nir_builder_instr_insert(b, &mov->instr);
 
          nir_instr_remove(&intrin->instr);
          state->progress = true;
          break;
       }
 
-      case nir_intrinsic_store_var: {
-         if (intrin->variables[0]->var->data.mode != nir_var_local)
+      case nir_intrinsic_store_deref: {
+         nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+         if (deref->mode != nir_var_local)
             continue;
 
-         nir_src reg_src = get_deref_reg_src(intrin->variables[0],
-                                             &intrin->instr, state);
+         b->cursor = nir_before_instr(&intrin->instr);
 
-         nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov);
-         nir_src_copy(&mov->src[0].src, &intrin->src[0], mov);
+         nir_src reg_src = get_deref_reg_src(deref, state);
+
+         nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_imov);
+         nir_src_copy(&mov->src[0].src, &intrin->src[1], mov);
          mov->dest.write_mask = nir_intrinsic_write_mask(intrin);
          mov->dest.dest.is_ssa = false;
          mov->dest.dest.reg.reg = reg_src.reg.reg;
          mov->dest.dest.reg.base_offset = reg_src.reg.base_offset;
          mov->dest.dest.reg.indirect = reg_src.reg.indirect;
 
-         nir_instr_insert_before(&intrin->instr, &mov->instr);
+         nir_builder_instr_insert(b, &mov->instr);
 
          nir_instr_remove(&intrin->instr);
          state->progress = true;
          break;
       }
 
-      case nir_intrinsic_copy_var:
+      case nir_intrinsic_copy_deref:
          unreachable("There should be no copies whatsoever at this point");
          break;
 
@@ -268,8 +257,7 @@
 {
    struct locals_to_regs_state state;
 
-   state.shader = impl->function->shader;
-   state.impl = impl;
+   nir_builder_init(&state.builder, impl);
    state.progress = false;
    state.regs_table = _mesa_hash_table_create(NULL, hash_deref, derefs_equal);
 
diff --git a/src/compiler/nir/nir_lower_packing.c b/src/compiler/nir/nir_lower_packing.c
new file mode 100644
index 0000000..ba9f4bc
--- /dev/null
+++ b/src/compiler/nir/nir_lower_packing.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/*
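+ * lowers the following (the pack/unpack 32_2x16 and 64_4x16 opcodes are
+ * rewritten in terms of the *_split opcodes in the same way):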
+ * packDouble2x32(foo) -> packDouble2x32Split(foo.x, foo.y)
+ * unpackDouble2x32(foo) -> vec2(unpackDouble2x32_x(foo), unpackDouble2x32_y(foo))
+ * packInt2x32(foo) -> packInt2x32Split(foo.x, foo.y)
+ * unpackInt2x32(foo) -> vec2(unpackInt2x32_x(foo), unpackInt2x32_y(foo))
+ */
+
+static nir_ssa_def *
+lower_pack_64_from_32(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_pack_64_2x32_split(b, nir_channel(b, src, 0),
+                                    nir_channel(b, src, 1));
+}
+
+static nir_ssa_def *
+lower_unpack_64_to_32(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_vec2(b, nir_unpack_64_2x32_split_x(b, src),
+                      nir_unpack_64_2x32_split_y(b, src));
+}
+
+static nir_ssa_def *
+lower_pack_32_from_16(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_pack_32_2x16_split(b, nir_channel(b, src, 0),
+                                    nir_channel(b, src, 1));
+}
+
+static nir_ssa_def *
+lower_unpack_32_to_16(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_vec2(b, nir_unpack_32_2x16_split_x(b, src),
+                      nir_unpack_32_2x16_split_y(b, src));
+}
+
+static nir_ssa_def *
+lower_pack_64_from_16(nir_builder *b, nir_ssa_def *src)
+{
+   nir_ssa_def *xy = nir_pack_32_2x16_split(b, nir_channel(b, src, 0),
+                                               nir_channel(b, src, 1));
+
+   nir_ssa_def *zw = nir_pack_32_2x16_split(b, nir_channel(b, src, 2),
+                                               nir_channel(b, src, 3));
+
+   return nir_pack_64_2x32_split(b, xy, zw);
+}
+
+static nir_ssa_def *
+lower_unpack_64_to_16(nir_builder *b, nir_ssa_def *src)
+{
+   nir_ssa_def *xy = nir_unpack_64_2x32_split_x(b, src);
+   nir_ssa_def *zw = nir_unpack_64_2x32_split_y(b, src);
+
+   return nir_vec4(b, nir_unpack_32_2x16_split_x(b, xy),
+                      nir_unpack_32_2x16_split_y(b, xy),
+                      nir_unpack_32_2x16_split_x(b, zw),
+                      nir_unpack_32_2x16_split_y(b, zw));
+}
+
+static bool
+lower_pack_impl(nir_function_impl *impl)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   bool progress = false;
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_alu)
+            continue;
+
+         nir_alu_instr *alu_instr = (nir_alu_instr *) instr;
+
+         if (alu_instr->op != nir_op_pack_64_2x32 &&
+             alu_instr->op != nir_op_unpack_64_2x32 &&
+             alu_instr->op != nir_op_pack_64_4x16 &&
+             alu_instr->op != nir_op_unpack_64_4x16 &&
+             alu_instr->op != nir_op_pack_32_2x16 &&
+             alu_instr->op != nir_op_unpack_32_2x16)
+            continue;
+
+         b.cursor = nir_before_instr(&alu_instr->instr);
+
+         nir_ssa_def *src = nir_ssa_for_alu_src(&b, alu_instr, 0);
+         nir_ssa_def *dest;
+
+         switch (alu_instr->op) {
+         case nir_op_pack_64_2x32:
+            dest = lower_pack_64_from_32(&b, src);
+            break;
+         case nir_op_unpack_64_2x32:
+            dest = lower_unpack_64_to_32(&b, src);
+            break;
+         case nir_op_pack_64_4x16:
+            dest = lower_pack_64_from_16(&b, src);
+            break;
+         case nir_op_unpack_64_4x16:
+            dest = lower_unpack_64_to_16(&b, src);
+            break;
+         case nir_op_pack_32_2x16:
+            dest = lower_pack_32_from_16(&b, src);
+            break;
+         case nir_op_unpack_32_2x16:
+            dest = lower_unpack_32_to_16(&b, src);
+            break;
+         default:
+            unreachable("Impossible opcode");
+         }
+
+         nir_ssa_def_rewrite_uses(&alu_instr->dest.dest.ssa, nir_src_for_ssa(dest));
+         nir_instr_remove(&alu_instr->instr);
+         nir_metadata_preserve(impl, nir_metadata_block_index |
+                                     nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+bool
+nir_lower_pack(nir_shader *shader)
+{
+   /* Set once any function implementation has a pack/unpack op lowered. */
+   bool progress = false;
+   nir_foreach_function(function, shader) {
+      if (function->impl)
+         progress |= lower_pack_impl(function->impl);
+   }
+
+   return progress;
+}
diff --git a/src/compiler/nir/nir_lower_patch_vertices.c b/src/compiler/nir/nir_lower_patch_vertices.c
index d196576..8e89268 100644
--- a/src/compiler/nir/nir_lower_patch_vertices.c
+++ b/src/compiler/nir/nir_lower_patch_vertices.c
@@ -22,11 +22,52 @@
  */
 
 #include "nir_builder.h"
+#include "program/prog_instruction.h"
 
-void
-nir_lower_tes_patch_vertices(nir_shader *tes_nir, unsigned patch_vertices)
+static nir_variable *
+make_uniform(nir_shader *nir, const gl_state_index16 *tokens)
 {
-   nir_foreach_function(function, tes_nir) {
+   /* Note: name must be prefixed with "gl_" to trigger slot based
+    * special handling in uniform setup.
+    */
+   nir_variable *var =
+      nir_variable_create(nir, nir_var_uniform, glsl_int_type(),
+                          "gl_PatchVerticesIn");
+   var->num_state_slots = 1;
+   var->state_slots = ralloc_array(var, nir_state_slot, var->num_state_slots);
+   memcpy(var->state_slots[0].tokens, tokens, sizeof(*tokens) * STATE_LENGTH);
+   var->state_slots[0].swizzle = SWIZZLE_XXXX;
+
+   return var;
+}
+
+/**
+ * This pass lowers the load_patch_vertices_in intrinsic.
+ *
+ * - If we statically know the value, we lower it to a constant.
+ *   (If a TES is linked against a TCS, the TCS tells us the TES input count.)
+ *
+ * - If not, and we're given Mesa state slots, we lower it to a uniform.
+ *
+ * - Otherwise, we leave it as a system value.
+ *
+ * This pass must be run after nir_lower_system_values().
+ */
+bool
+nir_lower_patch_vertices(nir_shader *nir,
+                         unsigned static_count,
+                         const gl_state_index16 *uniform_state_tokens)
+{
+   bool progress = false;
+   nir_variable *var = NULL;
+
+   /* If there's no static count and we don't want uniforms, there's no
+    * lowering to do...just bail early.
+    */
+   if (static_count == 0 && !uniform_state_tokens)
+      return false;
+
+   nir_foreach_function(function, nir) {
       if (function->impl) {
          nir_foreach_block(block, function->impl) {
             nir_builder b;
@@ -38,7 +79,18 @@
                      continue;
 
                   b.cursor = nir_before_instr(&intr->instr);
-                  nir_ssa_def *val = nir_imm_int(&b, patch_vertices);
+
+                  nir_ssa_def *val = NULL;
+                  if (static_count) {
+                     val = nir_imm_int(&b, static_count);
+                  } else {
+                     if (!var)
+                        var = make_uniform(nir, uniform_state_tokens);
+
+                     val = nir_load_var(&b, var);
+                  }
+
+                  progress = true;
                   nir_ssa_def_rewrite_uses(&intr->dest.ssa,
                                            nir_src_for_ssa(val));
                   nir_instr_remove(instr);
@@ -46,8 +98,12 @@
             }
          }
 
-         nir_metadata_preserve(function->impl, nir_metadata_block_index |
-                                               nir_metadata_dominance);
+         if (progress) {
+            nir_metadata_preserve(function->impl, nir_metadata_block_index |
+                                                  nir_metadata_dominance);
+         }
       }
    }
+
+   return progress;
 }
diff --git a/src/compiler/nir/nir_lower_phis_to_scalar.c b/src/compiler/nir/nir_lower_phis_to_scalar.c
index b12718f..904eff0 100644
--- a/src/compiler/nir/nir_lower_phis_to_scalar.c
+++ b/src/compiler/nir/nir_lower_phis_to_scalar.c
@@ -83,13 +83,15 @@
       nir_intrinsic_instr *src_intrin = nir_instr_as_intrinsic(src_instr);
 
       switch (src_intrin->intrinsic) {
-      case nir_intrinsic_load_var:
-         return src_intrin->variables[0]->var->data.mode == nir_var_shader_in ||
-                src_intrin->variables[0]->var->data.mode == nir_var_uniform;
+      case nir_intrinsic_load_deref: {
+         nir_deref_instr *deref = nir_src_as_deref(src_intrin->src[0]);
+         return deref->mode == nir_var_shader_in ||
+                deref->mode == nir_var_uniform;
+      }
 
-      case nir_intrinsic_interp_var_at_centroid:
-      case nir_intrinsic_interp_var_at_sample:
-      case nir_intrinsic_interp_var_at_offset:
+      case nir_intrinsic_interp_deref_at_centroid:
+      case nir_intrinsic_interp_deref_at_sample:
+      case nir_intrinsic_interp_deref_at_offset:
       case nir_intrinsic_load_uniform:
       case nir_intrinsic_load_ubo:
       case nir_intrinsic_load_ssbo:
diff --git a/src/compiler/nir/nir_lower_samplers.c b/src/compiler/nir/nir_lower_samplers.c
deleted file mode 100644
index 7690665..0000000
--- a/src/compiler/nir/nir_lower_samplers.c
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
- * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
- * Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-#include "compiler/glsl/ir_uniform.h"
-
-#include "main/compiler.h"
-#include "main/mtypes.h"
-
-/* Calculate the sampler index based on array indicies and also
- * calculate the base uniform location for struct members.
- */
-static void
-calc_sampler_offsets(nir_deref *tail, nir_tex_instr *instr,
-                     unsigned *array_elements, nir_ssa_def **indirect,
-                     nir_builder *b, unsigned *location)
-{
-   if (tail->child == NULL)
-      return;
-
-   switch (tail->child->deref_type) {
-   case nir_deref_type_array: {
-      nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-
-      assert(deref_array->deref_array_type != nir_deref_array_type_wildcard);
-
-      calc_sampler_offsets(tail->child, instr, array_elements,
-                           indirect, b, location);
-      instr->texture_index += deref_array->base_offset * *array_elements;
-
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         nir_ssa_def *mul =
-            nir_imul(b, nir_imm_int(b, *array_elements),
-                     nir_ssa_for_src(b, deref_array->indirect, 1));
-
-         nir_instr_rewrite_src(&instr->instr, &deref_array->indirect,
-                               NIR_SRC_INIT);
-
-         if (*indirect) {
-            *indirect = nir_iadd(b, *indirect, mul);
-         } else {
-            *indirect = mul;
-         }
-      }
-
-      *array_elements *= glsl_get_length(tail->type);
-       break;
-   }
-
-   case nir_deref_type_struct: {
-      nir_deref_struct *deref_struct = nir_deref_as_struct(tail->child);
-      *location += glsl_get_record_location_offset(tail->type, deref_struct->index);
-      calc_sampler_offsets(tail->child, instr, array_elements,
-                           indirect, b, location);
-      break;
-   }
-
-   default:
-      unreachable("Invalid deref type");
-      break;
-   }
-}
-
-static bool
-lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_program,
-              gl_shader_stage stage, nir_builder *b)
-{
-   if (instr->texture == NULL)
-      return false;
-
-   /* In GLSL, we only fill out the texture field.  The sampler is inferred */
-   assert(instr->sampler == NULL);
-
-   instr->texture_index = 0;
-   unsigned location = instr->texture->var->data.location;
-   unsigned array_elements = 1;
-   nir_ssa_def *indirect = NULL;
-
-   b->cursor = nir_before_instr(&instr->instr);
-   calc_sampler_offsets(&instr->texture->deref, instr, &array_elements,
-                        &indirect, b, &location);
-
-   if (indirect) {
-      assert(array_elements >= 1);
-      indirect = nir_umin(b, indirect, nir_imm_int(b, array_elements - 1));
-
-      nir_tex_instr_add_src(instr, nir_tex_src_texture_offset,
-                            nir_src_for_ssa(indirect));
-      nir_tex_instr_add_src(instr, nir_tex_src_sampler_offset,
-                            nir_src_for_ssa(indirect));
-
-      instr->texture_array_size = array_elements;
-   }
-
-   assert(location < shader_program->data->NumUniformStorage &&
-          shader_program->data->UniformStorage[location].opaque[stage].active);
-
-   instr->texture_index +=
-      shader_program->data->UniformStorage[location].opaque[stage].index;
-
-   instr->sampler_index = instr->texture_index;
-
-   instr->texture = NULL;
-
-   return true;
-}
-
-static bool
-lower_impl(nir_function_impl *impl, const struct gl_shader_program *shader_program,
-           gl_shader_stage stage)
-{
-   nir_builder b;
-   nir_builder_init(&b, impl);
-   bool progress = false;
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type == nir_instr_type_tex)
-            progress |= lower_sampler(nir_instr_as_tex(instr),
-                                      shader_program, stage, &b);
-      }
-   }
-
-   return progress;
-}
-
-bool
-nir_lower_samplers(nir_shader *shader,
-                   const struct gl_shader_program *shader_program)
-{
-   bool progress = false;
-
-   nir_foreach_function(function, shader) {
-      if (function->impl)
-         progress |= lower_impl(function->impl, shader_program,
-                                shader->info.stage);
-   }
-
-   return progress;
-}
diff --git a/src/compiler/nir/nir_lower_samplers_as_deref.c b/src/compiler/nir/nir_lower_samplers_as_deref.c
deleted file mode 100644
index cb0c827..0000000
--- a/src/compiler/nir/nir_lower_samplers_as_deref.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright (C) 2005-2007  Brian Paul   All Rights Reserved.
- * Copyright (C) 2008  VMware, Inc.   All Rights Reserved.
- * Copyright © 2014 Intel Corporation
- * Copyright © 2017 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * \file
- *
- * Lower sampler and image references of (non-bindless) uniforms by removing
- * struct dereferences, and synthesizing new uniform variables without structs
- * if required.
- *
- * This will allow backends to have a simple, uniform treatment of bindless and
- * non-bindless samplers and images.
- *
- * Example:
- *
- *   struct S {
- *      sampler2D tex[2];
- *      sampler2D other;
- *   };
- *   uniform S s[2];
- *
- *   tmp = texture(s[n].tex[m], coord);
- *
- * Becomes:
- *
- *   decl_var uniform INTERP_MODE_NONE sampler2D[2][2] lower@s.tex (...)
- *
- *   vec1 32 ssa_idx = $(2 * n + m)
- *   vec4 32 ssa_out = tex ssa_coord (coord), lower@s.tex[n][m] (texture), lower@s.tex[n][m] (sampler)
- *
- * and lower@s.tex has var->data.binding set to the base index as defined by
- * the opaque uniform mapping.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-#include "compiler/glsl/ir_uniform.h"
-
-#include "main/compiler.h"
-#include "main/mtypes.h"
-
-struct lower_samplers_as_deref_state {
-   nir_shader *shader;
-   const struct gl_shader_program *shader_program;
-   struct hash_table *remap_table;
-};
-
-static void
-remove_struct_derefs(nir_deref *tail,
-                     struct lower_samplers_as_deref_state *state,
-                     nir_builder *b, char **path, unsigned *location)
-{
-   if (!tail->child)
-      return;
-
-   switch (tail->child->deref_type) {
-   case nir_deref_type_array: {
-      unsigned length = glsl_get_length(tail->type);
-
-      remove_struct_derefs(tail->child, state, b, path, location);
-
-      tail->type = glsl_get_array_instance(tail->child->type, length);
-      break;
-   }
-
-   case nir_deref_type_struct: {
-      nir_deref_struct *deref_struct = nir_deref_as_struct(tail->child);
-
-      *location += glsl_get_record_location_offset(tail->type, deref_struct->index);
-      ralloc_asprintf_append(path, ".%s",
-                             glsl_get_struct_elem_name(tail->type, deref_struct->index));
-
-      remove_struct_derefs(tail->child, state, b, path, location);
-
-      /* Drop the struct deref and re-parent. */
-      ralloc_steal(tail, tail->child->child);
-      tail->type = tail->child->type;
-      tail->child = tail->child->child;
-      break;
-   }
-
-   default:
-      unreachable("Invalid deref type");
-      break;
-   }
-}
-
-static void
-lower_deref(nir_deref_var *deref,
-            struct lower_samplers_as_deref_state *state,
-            nir_builder *b)
-{
-   nir_variable *var = deref->var;
-   gl_shader_stage stage = state->shader->info.stage;
-   unsigned location = var->data.location;
-   unsigned binding;
-   const struct glsl_type *orig_type = deref->deref.type;
-   char *path;
-
-   assert(var->data.mode == nir_var_uniform);
-
-   path = ralloc_asprintf(state->remap_table, "lower@%s", var->name);
-   remove_struct_derefs(&deref->deref, state, b, &path, &location);
-
-   assert(location < state->shader_program->data->NumUniformStorage &&
-          state->shader_program->data->UniformStorage[location].opaque[stage].active);
-
-   binding = state->shader_program->data->UniformStorage[location].opaque[stage].index;
-
-   if (orig_type == deref->deref.type) {
-      /* Fast path: We did not encounter any struct derefs. */
-      var->data.binding = binding;
-      return;
-   }
-
-   uint32_t hash = _mesa_key_hash_string(path);
-   struct hash_entry *h =
-      _mesa_hash_table_search_pre_hashed(state->remap_table, hash, path);
-
-   if (h) {
-      var = (nir_variable *)h->data;
-   } else {
-      var = nir_variable_create(state->shader, nir_var_uniform, deref->deref.type, path);
-      var->data.binding = binding;
-      _mesa_hash_table_insert_pre_hashed(state->remap_table, hash, path, var);
-   }
-
-   deref->var = var;
-}
-
-static bool
-lower_sampler(nir_tex_instr *instr, struct lower_samplers_as_deref_state *state,
-              nir_builder *b)
-{
-   if (!instr->texture || instr->texture->var->data.bindless ||
-       instr->texture->var->data.mode != nir_var_uniform)
-      return false;
-
-   /* In GLSL, we only fill out the texture field.  The sampler is inferred */
-   assert(instr->sampler == NULL);
-
-   b->cursor = nir_before_instr(&instr->instr);
-   lower_deref(instr->texture, state, b);
-
-   if (instr->op != nir_texop_txf_ms &&
-       instr->op != nir_texop_txf_ms_mcs &&
-       instr->op != nir_texop_samples_identical) {
-      nir_instr_rewrite_deref(&instr->instr, &instr->sampler,
-                              nir_deref_var_clone(instr->texture, instr));
-   } else {
-      assert(!instr->sampler);
-   }
-
-   return true;
-}
-
-static bool
-lower_intrinsic(nir_intrinsic_instr *instr,
-                struct lower_samplers_as_deref_state *state,
-                nir_builder *b)
-{
-   if (instr->intrinsic == nir_intrinsic_image_var_load ||
-       instr->intrinsic == nir_intrinsic_image_var_store ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_add ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_min ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_max ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_and ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_or ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_xor ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_exchange ||
-       instr->intrinsic == nir_intrinsic_image_var_atomic_comp_swap ||
-       instr->intrinsic == nir_intrinsic_image_var_size) {
-      b->cursor = nir_before_instr(&instr->instr);
-
-      if (instr->variables[0]->var->data.bindless ||
-          instr->variables[0]->var->data.mode != nir_var_uniform)
-         return false;
-
-      lower_deref(instr->variables[0], state, b);
-      return true;
-   }
-
-   return false;
-}
-
-static bool
-lower_impl(nir_function_impl *impl, struct lower_samplers_as_deref_state *state)
-{
-   nir_builder b;
-   nir_builder_init(&b, impl);
-   bool progress = false;
-
-   nir_foreach_block(block, impl) {
-      nir_foreach_instr(instr, block) {
-         if (instr->type == nir_instr_type_tex)
-            progress |= lower_sampler(nir_instr_as_tex(instr), state, &b);
-         else if (instr->type == nir_instr_type_intrinsic)
-            progress |= lower_intrinsic(nir_instr_as_intrinsic(instr), state, &b);
-      }
-   }
-
-   return progress;
-}
-
-bool
-nir_lower_samplers_as_deref(nir_shader *shader,
-                            const struct gl_shader_program *shader_program)
-{
-   bool progress = false;
-   struct lower_samplers_as_deref_state state;
-
-   state.shader = shader;
-   state.shader_program = shader_program;
-   state.remap_table = _mesa_hash_table_create(NULL, _mesa_key_hash_string,
-                                               _mesa_key_string_equal);
-
-   nir_foreach_function(function, shader) {
-      if (function->impl)
-         progress |= lower_impl(function->impl, &state);
-   }
-
-   /* keys are freed automatically by ralloc */
-   _mesa_hash_table_destroy(state.remap_table, NULL);
-
-   return progress;
-}
diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c
index 47709e9..2820dcd 100644
--- a/src/compiler/nir/nir_lower_system_values.c
+++ b/src/compiler/nir/nir_lower_system_values.c
@@ -28,6 +28,17 @@
 #include "nir.h"
 #include "nir_builder.h"
 
+static nir_ssa_def*
+build_local_group_size(nir_builder *b)
+{
+   nir_const_value local_size;
+   memset(&local_size, 0, sizeof(local_size));
+   local_size.u32[0] = b->shader->info.cs.local_size[0];
+   local_size.u32[1] = b->shader->info.cs.local_size[1];
+   local_size.u32[2] = b->shader->info.cs.local_size[2];
+   return nir_build_imm(b, 3, 32, local_size);
+}
+
 static bool
 convert_block(nir_block *block, nir_builder *b)
 {
@@ -37,16 +48,26 @@
       if (instr->type != nir_instr_type_intrinsic)
          continue;
 
-      nir_intrinsic_instr *load_var = nir_instr_as_intrinsic(instr);
-
-      if (load_var->intrinsic != nir_intrinsic_load_var)
+      nir_intrinsic_instr *load_deref = nir_instr_as_intrinsic(instr);
+      if (load_deref->intrinsic != nir_intrinsic_load_deref)
          continue;
 
-      nir_variable *var = load_var->variables[0]->var;
-      if (var->data.mode != nir_var_system_value)
+      nir_deref_instr *deref = nir_src_as_deref(load_deref->src[0]);
+      if (deref->mode != nir_var_system_value)
          continue;
 
-      b->cursor = nir_after_instr(&load_var->instr);
+      if (deref->deref_type != nir_deref_type_var) {
+         /* The only system value that is an array is gl_SampleMask,
+          * which is always an array of one element.
+          */
+          */
+         assert(deref->deref_type == nir_deref_type_array);
+         deref = nir_deref_instr_parent(deref);
+         assert(deref->deref_type == nir_deref_type_var);
+         assert(deref->var->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN);
+      }
+      nir_variable *var = deref->var;
+
+      b->cursor = nir_after_instr(&load_deref->instr);
 
       nir_ssa_def *sysval = NULL;
       switch (var->data.location) {
@@ -56,19 +77,11 @@
           *    "The value of gl_GlobalInvocationID is equal to
           *    gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID"
           */
-
-         nir_const_value local_size;
-         memset(&local_size, 0, sizeof(local_size));
-         local_size.u32[0] = b->shader->info.cs.local_size[0];
-         local_size.u32[1] = b->shader->info.cs.local_size[1];
-         local_size.u32[2] = b->shader->info.cs.local_size[2];
-
+         nir_ssa_def *group_size = build_local_group_size(b);
          nir_ssa_def *group_id = nir_load_work_group_id(b);
          nir_ssa_def *local_id = nir_load_local_invocation_id(b);
 
-         sysval = nir_iadd(b, nir_imul(b, group_id,
-                                       nir_build_imm(b, 3, 32, local_size)),
-                              local_id);
+         sysval = nir_iadd(b, nir_imul(b, group_id, group_size), local_id);
          break;
       }
 
@@ -102,12 +115,7 @@
       }
 
       case SYSTEM_VALUE_LOCAL_GROUP_SIZE: {
-         nir_const_value local_size;
-         memset(&local_size, 0, sizeof(local_size));
-         local_size.u32[0] = b->shader->info.cs.local_size[0];
-         local_size.u32[1] = b->shader->info.cs.local_size[1];
-         local_size.u32[2] = b->shader->info.cs.local_size[2];
-         sysval = nir_build_imm(b, 3, 32, local_size);
+         sysval = build_local_group_size(b);
          break;
       }
 
@@ -121,6 +129,38 @@
          }
          break;
 
+      case SYSTEM_VALUE_BASE_VERTEX:
+         /**
+          * From the OpenGL 4.6 (11.1.3.9 Shader Inputs) specification:
+          *
+          * "gl_BaseVertex holds the integer value passed to the baseVertex
+          * parameter to the command that resulted in the current shader
+          * invocation. In the case where the command has no baseVertex
+          * parameter, the value of gl_BaseVertex is zero."
+          */
+         if (b->shader->options->lower_base_vertex)
+            sysval = nir_iand(b,
+                              nir_load_is_indexed_draw(b),
+                              nir_load_first_vertex(b));
+         break;
+
+      case SYSTEM_VALUE_HELPER_INVOCATION:
+         if (b->shader->options->lower_helper_invocation) {
+            nir_ssa_def *tmp;
+
+            tmp = nir_ishl(b,
+                           nir_imm_int(b, 1),
+                           nir_load_sample_id_no_per_sample(b));
+
+            tmp = nir_iand(b,
+                           nir_load_sample_mask_in(b),
+                           tmp);
+
+            sysval = nir_inot(b, nir_i2b(b, tmp));
+         }
+
+         break;
+
       case SYSTEM_VALUE_INSTANCE_INDEX:
          sysval = nir_iadd(b,
                            nir_load_instance_id(b),
@@ -148,6 +188,13 @@
             sysval = nir_imm_int(b, 0);
          break;
 
+      case SYSTEM_VALUE_GLOBAL_GROUP_SIZE: {
+         nir_ssa_def *group_size = build_local_group_size(b);
+         nir_ssa_def *num_work_groups = nir_load_num_work_groups(b);
+         sysval = nir_imul(b, group_size, num_work_groups);
+         break;
+      }
+
       default:
          break;
       }
@@ -158,8 +205,8 @@
          sysval = nir_load_system_value(b, sysval_op, 0);
       }
 
-      nir_ssa_def_rewrite_uses(&load_var->dest.ssa, nir_src_for_ssa(sysval));
-      nir_instr_remove(&load_var->instr);
+      nir_ssa_def_rewrite_uses(&load_deref->dest.ssa, nir_src_for_ssa(sysval));
+      nir_instr_remove(&load_deref->instr);
 
       progress = true;
    }
@@ -193,6 +240,11 @@
          progress = convert_impl(function->impl) || progress;
    }
 
+   /* We're going to delete the variables so we need to clean up all those
+    * derefs we left lying around.
+    */
+   nir_remove_dead_derefs(shader);
+
    exec_list_make_empty(&shader->system_values);
 
    return progress;
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index 1062afd..dc40d82 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -37,6 +37,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_format_convert.h"
 
 static void
 project_src(nir_builder *b, nir_tex_instr *tex)
@@ -108,21 +109,39 @@
 
    nir_tex_instr *txs;
 
-   txs = nir_tex_instr_create(b->shader, 1);
+   unsigned num_srcs = 1; /* One for the LOD */
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      if (tex->src[i].src_type == nir_tex_src_texture_deref ||
+          tex->src[i].src_type == nir_tex_src_sampler_deref ||
+          tex->src[i].src_type == nir_tex_src_texture_offset ||
+          tex->src[i].src_type == nir_tex_src_sampler_offset)
+         num_srcs++;
+   }
+
+   txs = nir_tex_instr_create(b->shader, num_srcs);
    txs->op = nir_texop_txs;
    txs->sampler_dim = tex->sampler_dim;
    txs->is_array = tex->is_array;
    txs->is_shadow = tex->is_shadow;
    txs->is_new_style_shadow = tex->is_new_style_shadow;
    txs->texture_index = tex->texture_index;
-   txs->texture = nir_deref_var_clone(tex->texture, txs);
    txs->sampler_index = tex->sampler_index;
-   txs->sampler = nir_deref_var_clone(tex->sampler, txs);
    txs->dest_type = nir_type_int;
 
-   /* only single src, the lod: */
-   txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
-   txs->src[0].src_type = nir_tex_src_lod;
+   unsigned idx = 0;
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      if (tex->src[i].src_type == nir_tex_src_texture_deref ||
+          tex->src[i].src_type == nir_tex_src_sampler_deref ||
+          tex->src[i].src_type == nir_tex_src_texture_offset ||
+          tex->src[i].src_type == nir_tex_src_sampler_offset) {
+         nir_src_copy(&txs->src[idx].src, &tex->src[i].src, txs);
+         txs->src[idx].src_type = tex->src[i].src_type;
+         idx++;
+      }
+   }
+   /* Add in an LOD because some back-ends require it */
+   txs->src[idx].src = nir_src_for_ssa(nir_imm_int(b, 0));
+   txs->src[idx].src_type = nir_tex_src_lod;
 
    nir_ssa_dest_init(&txs->instr, &txs->dest,
                      nir_tex_instr_dest_size(txs), 32, NULL);
@@ -217,20 +236,21 @@
    assert(tex->op == nir_texop_tex);
    assert(tex->coord_components == 2);
 
-   nir_tex_instr *plane_tex = nir_tex_instr_create(b->shader, 2);
-   nir_src_copy(&plane_tex->src[0].src, &tex->src[0].src, plane_tex);
-   plane_tex->src[0].src_type = nir_tex_src_coord;
-   plane_tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, plane));
-   plane_tex->src[1].src_type = nir_tex_src_plane;
+   nir_tex_instr *plane_tex =
+      nir_tex_instr_create(b->shader, tex->num_srcs + 1);
+   for (unsigned i = 0; i < tex->num_srcs; i++) {
+      nir_src_copy(&plane_tex->src[i].src, &tex->src[i].src, plane_tex);
+      plane_tex->src[i].src_type = tex->src[i].src_type;
+   }
+   plane_tex->src[tex->num_srcs].src = nir_src_for_ssa(nir_imm_int(b, plane));
+   plane_tex->src[tex->num_srcs].src_type = nir_tex_src_plane;
    plane_tex->op = nir_texop_tex;
    plane_tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
    plane_tex->dest_type = nir_type_float;
    plane_tex->coord_components = 2;
 
    plane_tex->texture_index = tex->texture_index;
-   plane_tex->texture = nir_deref_var_clone(tex->texture, plane_tex);
    plane_tex->sampler_index = tex->sampler_index;
-   plane_tex->sampler = nir_deref_var_clone(tex->sampler, plane_tex);
 
    nir_ssa_dest_init(&plane_tex->instr, &plane_tex->dest, 4, 32, NULL);
 
@@ -344,8 +364,6 @@
    txl->is_shadow = tex->is_shadow;
    txl->is_new_style_shadow = tex->is_new_style_shadow;
    txl->sampler_index = tex->sampler_index;
-   txl->texture = nir_deref_var_clone(tex->texture, txl);
-   txl->sampler = nir_deref_var_clone(tex->sampler, txl);
    txl->coord_components = tex->coord_components;
 
    nir_ssa_dest_init(&txl->instr, &txl->dest, 4, 32, NULL);
@@ -462,8 +480,8 @@
    nir_ssa_def *cond_z = nir_fge(b, abs_p_z, nir_fmax(b, abs_p_x, abs_p_y));
    nir_ssa_def *cond_y = nir_fge(b, abs_p_y, nir_fmax(b, abs_p_x, abs_p_z));
 
-   unsigned yzx[4] = { 1, 2, 0, 0 };
-   unsigned xzy[4] = { 0, 2, 1, 0 };
+   unsigned yzx[3] = { 1, 2, 0 };
+   unsigned xzy[3] = { 0, 2, 1 };
 
    Q = nir_bcsel(b, cond_z,
                  p,
@@ -491,16 +509,15 @@
     */
    nir_ssa_def *rcp_Q_z = nir_frcp(b, nir_channel(b, Q, 2));
 
-   unsigned xy[4] = { 0, 1, 0, 0 };
-   nir_ssa_def *Q_xy = nir_swizzle(b, Q, xy, 2, false);
+   nir_ssa_def *Q_xy = nir_channels(b, Q, 0x3);
    nir_ssa_def *tmp = nir_fmul(b, Q_xy, rcp_Q_z);
 
-   nir_ssa_def *dQdx_xy = nir_swizzle(b, dQdx, xy, 2, false);
+   nir_ssa_def *dQdx_xy = nir_channels(b, dQdx, 0x3);
    nir_ssa_def *dQdx_z = nir_channel(b, dQdx, 2);
    nir_ssa_def *dx =
       nir_fmul(b, rcp_Q_z, nir_fsub(b, dQdx_xy, nir_fmul(b, tmp, dQdx_z)));
 
-   nir_ssa_def *dQdy_xy = nir_swizzle(b, dQdy, xy, 2, false);
+   nir_ssa_def *dQdy_xy = nir_channels(b, dQdy, 0x3);
    nir_ssa_def *dQdy_z = nir_channel(b, dQdy, 2);
    nir_ssa_def *dy =
       nir_fmul(b, rcp_Q_z, nir_fsub(b, dQdy_xy, nir_fmul(b, tmp, dQdy_z)));
@@ -694,24 +711,8 @@
 
    b->cursor = nir_after_instr(&tex->instr);
 
-   static const unsigned swiz[4] = {0, 1, 2, 0};
-   nir_ssa_def *comp = nir_swizzle(b, &tex->dest.ssa, swiz, 3, true);
-
-   /* Formula is:
-    *    (comp <= 0.04045) ?
-    *          (comp / 12.92) :
-    *          pow((comp + 0.055) / 1.055, 2.4)
-    */
-   nir_ssa_def *low  = nir_fmul(b, comp, nir_imm_float(b, 1.0 / 12.92));
-   nir_ssa_def *high = nir_fpow(b,
-                                nir_fmul(b,
-                                         nir_fadd(b,
-                                                  comp,
-                                                  nir_imm_float(b, 0.055)),
-                                         nir_imm_float(b, 1.0 / 1.055)),
-                                nir_imm_float(b, 2.4));
-   nir_ssa_def *cond = nir_fge(b, nir_imm_float(b, 0.04045), comp);
-   nir_ssa_def *rgb  = nir_bcsel(b, cond, low, high);
+   nir_ssa_def *rgb =
+      nir_format_srgb_to_linear(b, nir_channels(b, &tex->dest.ssa, 0x7));
 
    /* alpha is untouched: */
    nir_ssa_def *result = nir_vec4(b,
diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c
index 6288bdc..e72f7ef 100644
--- a/src/compiler/nir/nir_lower_var_copies.c
+++ b/src/compiler/nir/nir_lower_var_copies.c
@@ -26,152 +26,115 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
+#include "nir_deref.h"
 #include "compiler/nir_types.h"
 
 /*
  * Lowers all copy intrinsics to sequences of load/store intrinsics.
  */
 
-/* Walks down the deref chain and returns the next deref in the chain whose
- * child is a wildcard.  In other words, given the chain  a[1].foo[*].bar,
- * this function will return the deref to foo.  Calling it a second time
- * with the [*].bar, it will return NULL.
- */
-static nir_deref *
-deref_next_wildcard_parent(nir_deref *deref)
+static nir_deref_instr *
+build_deref_to_next_wildcard(nir_builder *b,
+                             nir_deref_instr *parent,
+                             nir_deref_instr ***deref_arr)
 {
-   for (nir_deref *tail = deref; tail->child; tail = tail->child) {
-      if (tail->child->deref_type != nir_deref_type_array)
-         continue;
+   for (; **deref_arr; (*deref_arr)++) {
+      if ((**deref_arr)->deref_type == nir_deref_type_array_wildcard)
+         return parent;
 
-      nir_deref_array *arr = nir_deref_as_array(tail->child);
-
-      if (arr->deref_array_type == nir_deref_array_type_wildcard)
-         return tail;
+      parent = nir_build_deref_follower(b, parent, **deref_arr);
    }
 
-   return NULL;
+   assert(**deref_arr == NULL);
+   *deref_arr = NULL;
+   return parent;
 }
 
-/* This function recursively walks the given deref chain and replaces the
- * given copy instruction with an equivalent sequence load/store
- * operations.
- *
- * @copy_instr    The copy instruction to replace; new instructions will be
- *                inserted before this one
- *
- * @dest_head     The head of the destination variable deref chain
- *
- * @src_head      The head of the source variable deref chain
- *
- * @dest_tail     The current tail of the destination variable deref chain;
- *                this is used for recursion and external callers of this
- *                function should call it with tail == head
- *
- * @src_tail      The current tail of the source variable deref chain;
- *                this is used for recursion and external callers of this
- *                function should call it with tail == head
- *
- * @state         The current variable lowering state
- */
 static void
-emit_copy_load_store(nir_intrinsic_instr *copy_instr,
-                     nir_deref_var *dest_head, nir_deref_var *src_head,
-                     nir_deref *dest_tail, nir_deref *src_tail,
-                     nir_shader *shader)
+emit_deref_copy_load_store(nir_builder *b,
+                           nir_deref_instr *dst_deref,
+                           nir_deref_instr **dst_deref_arr,
+                           nir_deref_instr *src_deref,
+                           nir_deref_instr **src_deref_arr)
 {
-   /* Find the next pair of wildcards */
-   nir_deref *src_arr_parent = deref_next_wildcard_parent(src_tail);
-   nir_deref *dest_arr_parent = deref_next_wildcard_parent(dest_tail);
+   if (dst_deref_arr || src_deref_arr) {
+      assert(dst_deref_arr && src_deref_arr);
+      dst_deref = build_deref_to_next_wildcard(b, dst_deref, &dst_deref_arr);
+      src_deref = build_deref_to_next_wildcard(b, src_deref, &src_deref_arr);
+   }
 
-   if (src_arr_parent || dest_arr_parent) {
-      /* Wildcards had better come in matched pairs */
-      assert(src_arr_parent && dest_arr_parent);
+   if (dst_deref_arr || src_deref_arr) {
+      assert(dst_deref_arr && src_deref_arr);
+      assert((*dst_deref_arr)->deref_type == nir_deref_type_array_wildcard);
+      assert((*src_deref_arr)->deref_type == nir_deref_type_array_wildcard);
 
-      nir_deref_array *src_arr = nir_deref_as_array(src_arr_parent->child);
-      nir_deref_array *dest_arr = nir_deref_as_array(dest_arr_parent->child);
-
-      unsigned length = glsl_get_length(src_arr_parent->type);
+      unsigned length = glsl_get_length(src_deref->type);
       /* The wildcards should represent the same number of elements */
-      assert(length == glsl_get_length(dest_arr_parent->type));
+      assert(length == glsl_get_length(dst_deref->type));
       assert(length > 0);
 
-      /* Walk over all of the elements that this wildcard refers to and
-       * call emit_copy_load_store on each one of them */
-      src_arr->deref_array_type = nir_deref_array_type_direct;
-      dest_arr->deref_array_type = nir_deref_array_type_direct;
       for (unsigned i = 0; i < length; i++) {
-         src_arr->base_offset = i;
-         dest_arr->base_offset = i;
-         emit_copy_load_store(copy_instr, dest_head, src_head,
-                              &dest_arr->deref, &src_arr->deref, shader);
+         nir_ssa_def *index = nir_imm_int(b, i);
+         emit_deref_copy_load_store(b,
+                                    nir_build_deref_array(b, dst_deref, index),
+                                    dst_deref_arr + 1,
+                                    nir_build_deref_array(b, src_deref, index),
+                                    src_deref_arr + 1);
       }
-      src_arr->deref_array_type = nir_deref_array_type_wildcard;
-      dest_arr->deref_array_type = nir_deref_array_type_wildcard;
    } else {
-      /* In this case, we have no wildcards anymore, so all we have to do
-       * is just emit the load and store operations. */
-      src_tail = nir_deref_tail(src_tail);
-      dest_tail = nir_deref_tail(dest_tail);
+      assert(dst_deref->type == src_deref->type);
+      assert(glsl_type_is_vector_or_scalar(dst_deref->type));
 
-      assert(src_tail->type == dest_tail->type);
-
-      unsigned num_components = glsl_get_vector_elements(src_tail->type);
-      unsigned bit_size = glsl_get_bit_size(src_tail->type);
-
-      nir_intrinsic_instr *load =
-         nir_intrinsic_instr_create(shader, nir_intrinsic_load_var);
-      load->num_components = num_components;
-      load->variables[0] = nir_deref_var_clone(src_head, load);
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size,
-                        NULL);
-
-      nir_instr_insert_before(&copy_instr->instr, &load->instr);
-
-      nir_intrinsic_instr *store =
-         nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
-      store->num_components = num_components;
-      nir_intrinsic_set_write_mask(store, (1 << num_components) - 1);
-      store->variables[0] = nir_deref_var_clone(dest_head, store);
-
-      store->src[0].is_ssa = true;
-      store->src[0].ssa = &load->dest.ssa;
-
-      nir_instr_insert_before(&copy_instr->instr, &store->instr);
+      nir_store_deref(b, dst_deref, nir_load_deref(b, src_deref), ~0);
    }
 }
 
-/* Lowers a copy instruction to a sequence of load/store instructions
- *
- * The new instructions are placed before the copy instruction in the IR.
- */
 void
-nir_lower_var_copy_instr(nir_intrinsic_instr *copy, nir_shader *shader)
+nir_lower_deref_copy_instr(nir_builder *b, nir_intrinsic_instr *copy)
 {
-   assert(copy->intrinsic == nir_intrinsic_copy_var);
-   emit_copy_load_store(copy, copy->variables[0], copy->variables[1],
-                        &copy->variables[0]->deref,
-                        &copy->variables[1]->deref, shader);
+   /* Unfortunately, there's just no good way to handle wildcards except to
+    * flip the chain around and walk the list from variable to final pointer.
+    */
+   assert(copy->src[0].is_ssa && copy->src[1].is_ssa);
+   nir_deref_instr *dst = nir_instr_as_deref(copy->src[0].ssa->parent_instr);
+   nir_deref_instr *src = nir_instr_as_deref(copy->src[1].ssa->parent_instr);
+
+   nir_deref_path dst_path, src_path;
+   nir_deref_path_init(&dst_path, dst, NULL);
+   nir_deref_path_init(&src_path, src, NULL);
+
+   b->cursor = nir_before_instr(&copy->instr);
+   emit_deref_copy_load_store(b, dst_path.path[0], &dst_path.path[1],
+                                 src_path.path[0], &src_path.path[1]);
+
+   nir_deref_path_finish(&dst_path);
+   nir_deref_path_finish(&src_path);
 }
 
 static bool
 lower_var_copies_impl(nir_function_impl *impl)
 {
-   nir_shader *shader = impl->function->shader;
    bool progress = false;
 
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
    nir_foreach_block(block, impl) {
       nir_foreach_instr_safe(instr, block) {
          if (instr->type != nir_instr_type_intrinsic)
             continue;
 
          nir_intrinsic_instr *copy = nir_instr_as_intrinsic(instr);
-         if (copy->intrinsic != nir_intrinsic_copy_var)
+         if (copy->intrinsic != nir_intrinsic_copy_deref)
             continue;
 
-         nir_lower_var_copy_instr(copy, shader);
+         nir_lower_deref_copy_instr(&b, copy);
 
          nir_instr_remove(&copy->instr);
+         nir_deref_instr_remove_if_unused(nir_src_as_deref(copy->src[0]));
+         nir_deref_instr_remove_if_unused(nir_src_as_deref(copy->src[1]));
+
          progress = true;
          ralloc_free(copy);
       }
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index 8bc847f..3f37aca 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -27,6 +27,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 #include "nir_phi_builder.h"
 #include "nir_vla.h"
 
@@ -38,10 +39,10 @@
    bool lower_to_ssa;
 
    /* Only valid for things that end up in the direct list.
-    * Note that multiple nir_deref_vars may correspond to this node, but they
-    * will all be equivalent, so any is as good as the other.
+    * Note that multiple nir_deref_instrs may correspond to this node, but
+    * they will all be equivalent, so any is as good as the other.
     */
-   nir_deref_var *deref;
+   nir_deref_path path;
    struct exec_node direct_derefs_link;
 
    struct set *loads;
@@ -50,6 +51,11 @@
 
    struct nir_phi_builder_value *pb_value;
 
+   /* True if this node is fully direct.  If set, it must be in the children
+    * array of its parent.
+    */
+   bool is_direct;
+
    struct deref_node *wildcard;
    struct deref_node *indirect;
    struct deref_node *children[0];
@@ -92,16 +98,17 @@
 
 static struct deref_node *
 deref_node_create(struct deref_node *parent,
-                  const struct glsl_type *type, nir_shader *shader)
+                  const struct glsl_type *type,
+                  bool is_direct, void *mem_ctx)
 {
    size_t size = sizeof(struct deref_node) +
                  glsl_get_length(type) * sizeof(struct deref_node *);
 
-   struct deref_node *node = rzalloc_size(shader, size);
+   struct deref_node *node = rzalloc_size(mem_ctx, size);
    node->type = type;
    node->parent = parent;
-   node->deref = NULL;
    exec_node_init(&node->direct_derefs_link);
+   node->is_direct = is_direct;
 
    return node;
 }
@@ -120,7 +127,7 @@
    if (var_entry) {
       return var_entry->data;
    } else {
-      node = deref_node_create(NULL, var->type, state->dead_ctx);
+      node = deref_node_create(NULL, var->type, true, state->dead_ctx);
       _mesa_hash_table_insert(state->deref_var_nodes, var, node);
       return node;
    }
@@ -132,82 +139,84 @@
  * table of of fully-qualified direct derefs.
  */
 static struct deref_node *
-get_deref_node(nir_deref_var *deref, struct lower_variables_state *state)
+get_deref_node_recur(nir_deref_instr *deref,
+                     struct lower_variables_state *state)
 {
-   bool is_direct = true;
+   if (deref->deref_type == nir_deref_type_var)
+      return get_deref_node_for_var(deref->var, state);
 
-   /* Start at the base of the chain. */
-   struct deref_node *node = get_deref_node_for_var(deref->var, state);
-   assert(deref->deref.type == node->type);
+   struct deref_node *parent =
+      get_deref_node_recur(nir_deref_instr_parent(deref), state);
 
-   for (nir_deref *tail = deref->deref.child; tail; tail = tail->child) {
-      switch (tail->deref_type) {
-      case nir_deref_type_struct: {
-         nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
+   switch (deref->deref_type) {
+   case nir_deref_type_struct:
+      assert(glsl_type_is_struct(parent->type));
+      assert(deref->strct.index < glsl_get_length(parent->type));
 
-         assert(deref_struct->index < glsl_get_length(node->type));
-
-         if (node->children[deref_struct->index] == NULL)
-            node->children[deref_struct->index] =
-               deref_node_create(node, tail->type, state->dead_ctx);
-
-         node = node->children[deref_struct->index];
-         break;
+      if (parent->children[deref->strct.index] == NULL) {
+         parent->children[deref->strct.index] =
+            deref_node_create(parent, deref->type, parent->is_direct,
+                              state->dead_ctx);
       }
 
-      case nir_deref_type_array: {
-         nir_deref_array *arr = nir_deref_as_array(tail);
+      return parent->children[deref->strct.index];
 
-         switch (arr->deref_array_type) {
-         case nir_deref_array_type_direct:
-            /* This is possible if a loop unrolls and generates an
-             * out-of-bounds offset.  We need to handle this at least
-             * somewhat gracefully.
-             */
-            if (arr->base_offset >= glsl_get_length(node->type))
-               return NULL;
+   case nir_deref_type_array: {
+      nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+      if (const_index) {
+         uint32_t index = const_index->u32[0];
+         /* This is possible if a loop unrolls and generates an
+          * out-of-bounds offset.  We need to handle this at least
+          * somewhat gracefully.
+          */
+         if (index >= glsl_get_length(parent->type))
+            return NULL;
 
-            if (node->children[arr->base_offset] == NULL)
-               node->children[arr->base_offset] =
-                  deref_node_create(node, tail->type, state->dead_ctx);
-
-            node = node->children[arr->base_offset];
-            break;
-
-         case nir_deref_array_type_indirect:
-            if (node->indirect == NULL)
-               node->indirect = deref_node_create(node, tail->type,
-                                                  state->dead_ctx);
-
-            node = node->indirect;
-            is_direct = false;
-            break;
-
-         case nir_deref_array_type_wildcard:
-            if (node->wildcard == NULL)
-               node->wildcard = deref_node_create(node, tail->type,
-                                                  state->dead_ctx);
-
-            node = node->wildcard;
-            is_direct = false;
-            break;
-
-         default:
-            unreachable("Invalid array deref type");
+         if (parent->children[index] == NULL) {
+            parent->children[index] =
+               deref_node_create(parent, deref->type, parent->is_direct,
+                                 state->dead_ctx);
          }
-         break;
+
+         return parent->children[index];
+      } else {
+         if (parent->indirect == NULL) {
+            parent->indirect =
+               deref_node_create(parent, deref->type, false, state->dead_ctx);
+         }
+
+         return parent->indirect;
       }
-      default:
-         unreachable("Invalid deref type");
-      }
+      break;
    }
 
-   assert(node);
+   case nir_deref_type_array_wildcard:
+      if (parent->wildcard == NULL) {
+         parent->wildcard =
+            deref_node_create(parent, deref->type, false, state->dead_ctx);
+      }
 
-   /* Only insert if it isn't already in the list. */
-   if (is_direct && state->add_to_direct_deref_nodes &&
+      return parent->wildcard;
+
+   default:
+      unreachable("Invalid deref type");
+   }
+}
+
+static struct deref_node *
+get_deref_node(nir_deref_instr *deref, struct lower_variables_state *state)
+{
+   struct deref_node *node = get_deref_node_recur(deref, state);
+   if (!node)
+      return NULL;
+
+   /* Insert the node in the direct derefs list.  We only do this if it's not
+    * already in the list and we only bother for deref nodes which are used
+    * directly in a load or store.
+    */
+   if (node->is_direct && state->add_to_direct_deref_nodes &&
        node->direct_derefs_link.next == NULL) {
-      node->deref = deref;
+      nir_deref_path_init(&node->path, deref, state->dead_ctx);
       assert(deref->var != NULL);
       exec_list_push_tail(&state->direct_deref_nodes,
                           &node->direct_derefs_link);
@@ -218,41 +227,43 @@
 
 /* \sa foreach_deref_node_match */
 static void
-foreach_deref_node_worker(struct deref_node *node, nir_deref *deref,
+foreach_deref_node_worker(struct deref_node *node, nir_deref_instr **path,
                           void (* cb)(struct deref_node *node,
                                       struct lower_variables_state *state),
                           struct lower_variables_state *state)
 {
-   if (deref->child == NULL) {
+   if (*path == NULL) {
       cb(node, state);
       return;
    }
 
-   switch (deref->child->deref_type) {
+   switch ((*path)->deref_type) {
+   case nir_deref_type_struct:
+      if (node->children[(*path)->strct.index]) {
+         foreach_deref_node_worker(node->children[(*path)->strct.index],
+                                   path + 1, cb, state);
+      }
+      return;
+
    case nir_deref_type_array: {
-      nir_deref_array *arr = nir_deref_as_array(deref->child);
-      assert(arr->deref_array_type == nir_deref_array_type_direct);
+      nir_const_value *const_index = nir_src_as_const_value((*path)->arr.index);
+      assert(const_index);
+      uint32_t index = const_index->u32[0];
 
-      if (node->children[arr->base_offset]) {
-         foreach_deref_node_worker(node->children[arr->base_offset],
-                                   deref->child, cb, state);
+      if (node->children[index]) {
+         foreach_deref_node_worker(node->children[index],
+                                   path + 1, cb, state);
       }
-      if (node->wildcard)
-         foreach_deref_node_worker(node->wildcard, deref->child, cb, state);
-      break;
-   }
 
-   case nir_deref_type_struct: {
-      nir_deref_struct *str = nir_deref_as_struct(deref->child);
-      if (node->children[str->index]) {
-         foreach_deref_node_worker(node->children[str->index],
-                                   deref->child, cb, state);
+      if (node->wildcard) {
+         foreach_deref_node_worker(node->wildcard,
+                                   path + 1, cb, state);
       }
-      break;
+      return;
    }
 
    default:
-      unreachable("Invalid deref child type");
+      unreachable("Unsupported deref type");
    }
 }
 
@@ -269,67 +280,62 @@
  * or indirects) deref chain.
  */
 static void
-foreach_deref_node_match(nir_deref_var *deref,
+foreach_deref_node_match(nir_deref_path *path,
                          void (* cb)(struct deref_node *node,
                                      struct lower_variables_state *state),
                          struct lower_variables_state *state)
 {
-   nir_deref_var var_deref = *deref;
-   var_deref.deref.child = NULL;
-   struct deref_node *node = get_deref_node(&var_deref, state);
+   assert(path->path[0]->deref_type == nir_deref_type_var);
+   struct deref_node *node = get_deref_node_for_var(path->path[0]->var, state);
 
    if (node == NULL)
       return;
 
-   foreach_deref_node_worker(node, &deref->deref, cb, state);
+   foreach_deref_node_worker(node, &path->path[1], cb, state);
 }
 
 /* \sa deref_may_be_aliased */
 static bool
-deref_may_be_aliased_node(struct deref_node *node, nir_deref *deref,
-                          struct lower_variables_state *state)
+path_may_be_aliased_node(struct deref_node *node, nir_deref_instr **path,
+                         struct lower_variables_state *state)
 {
-   if (deref->child == NULL) {
+   if (*path == NULL)
       return false;
-   } else {
-      switch (deref->child->deref_type) {
-      case nir_deref_type_array: {
-         nir_deref_array *arr = nir_deref_as_array(deref->child);
 
-         /* This is a child of one of the derefs in direct_deref_nodes,
-          * so we know it is direct.
-          */
-         assert(arr->deref_array_type == nir_deref_array_type_direct);
-
-         /* If there is an indirect at this level, we're aliased. */
-         if (node->indirect)
-            return true;
-
-         if (node->children[arr->base_offset] &&
-             deref_may_be_aliased_node(node->children[arr->base_offset],
-                                       deref->child, state))
-            return true;
-
-         if (node->wildcard &&
-             deref_may_be_aliased_node(node->wildcard, deref->child, state))
-            return true;
-
+   switch ((*path)->deref_type) {
+   case nir_deref_type_struct:
+      if (node->children[(*path)->strct.index]) {
+         return path_may_be_aliased_node(node->children[(*path)->strct.index],
+                                         path + 1, state);
+      } else {
          return false;
       }
 
-      case nir_deref_type_struct: {
-         nir_deref_struct *str = nir_deref_as_struct(deref->child);
-         if (node->children[str->index]) {
-             return deref_may_be_aliased_node(node->children[str->index],
-                                              deref->child, state);
-         } else {
-            return false;
-         }
-      }
+   case nir_deref_type_array: {
+      nir_const_value *const_index = nir_src_as_const_value((*path)->arr.index);
+      if (!const_index)
+         return true;
 
-      default:
-         unreachable("Invalid nir_deref child type");
-      }
+      uint32_t index = const_index->u32[0];
+
+      /* If there is an indirect at this level, we're aliased. */
+      if (node->indirect)
+         return true;
+
+      if (node->children[index] &&
+          path_may_be_aliased_node(node->children[index],
+                                   path + 1, state))
+         return true;
+
+      if (node->wildcard &&
+          path_may_be_aliased_node(node->wildcard, path + 1, state))
+         return true;
+
+      return false;
+   }
+
+   default:
+      unreachable("Unsupported deref type");
    }
 }
 
@@ -348,18 +354,22 @@
  * references.
  */
 static bool
-deref_may_be_aliased(nir_deref_var *deref,
-                     struct lower_variables_state *state)
+path_may_be_aliased(nir_deref_path *path,
+                    struct lower_variables_state *state)
 {
-   return deref_may_be_aliased_node(get_deref_node_for_var(deref->var, state),
-                                    &deref->deref, state);
+   assert(path->path[0]->deref_type == nir_deref_type_var);
+   nir_variable *var = path->path[0]->var;
+
+   return path_may_be_aliased_node(get_deref_node_for_var(var, state),
+                                   &path->path[1], state);
 }
 
 static void
 register_load_instr(nir_intrinsic_instr *load_instr,
                     struct lower_variables_state *state)
 {
-   struct deref_node *node = get_deref_node(load_instr->variables[0], state);
+   nir_deref_instr *deref = nir_src_as_deref(load_instr->src[0]);
+   struct deref_node *node = get_deref_node(deref, state);
    if (node == NULL)
       return;
 
@@ -374,7 +384,8 @@
 register_store_instr(nir_intrinsic_instr *store_instr,
                      struct lower_variables_state *state)
 {
-   struct deref_node *node = get_deref_node(store_instr->variables[0], state);
+   nir_deref_instr *deref = nir_src_as_deref(store_instr->src[0]);
+   struct deref_node *node = get_deref_node(deref, state);
    if (node == NULL)
       return;
 
@@ -390,9 +401,8 @@
                     struct lower_variables_state *state)
 {
    for (unsigned idx = 0; idx < 2; idx++) {
-      struct deref_node *node =
-         get_deref_node(copy_instr->variables[idx], state);
-
+      nir_deref_instr *deref = nir_src_as_deref(copy_instr->src[idx]);
+      struct deref_node *node = get_deref_node(deref, state);
       if (node == NULL)
          continue;
 
@@ -416,15 +426,15 @@
          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
          switch (intrin->intrinsic) {
-         case nir_intrinsic_load_var:
+         case nir_intrinsic_load_deref:
             register_load_instr(intrin, state);
             break;
 
-         case nir_intrinsic_store_var:
+         case nir_intrinsic_store_deref:
             register_store_instr(intrin, state);
             break;
 
-         case nir_intrinsic_copy_var:
+         case nir_intrinsic_copy_deref:
             register_copy_instr(intrin, state);
             break;
 
@@ -445,15 +455,18 @@
    if (!node->copies)
       return;
 
+   nir_builder b;
+   nir_builder_init(&b, state->impl);
+
    struct set_entry *copy_entry;
    set_foreach(node->copies, copy_entry) {
       nir_intrinsic_instr *copy = (void *)copy_entry->key;
 
-      nir_lower_var_copy_instr(copy, state->shader);
+      nir_lower_deref_copy_instr(&b, copy);
 
       for (unsigned i = 0; i < 2; ++i) {
-         struct deref_node *arg_node =
-            get_deref_node(copy->variables[i], state);
+         nir_deref_instr *arg_deref = nir_src_as_deref(copy->src[i]);
+         struct deref_node *arg_node = get_deref_node(arg_deref, state);
 
          /* Only bother removing copy entries for other nodes */
          if (arg_node == NULL || arg_node == node)
@@ -491,10 +504,9 @@
          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
          switch (intrin->intrinsic) {
-         case nir_intrinsic_load_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
-
+         case nir_intrinsic_load_deref: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            struct deref_node *node = get_deref_node(deref, state);
             if (node == NULL) {
                /* If we hit this path then we are referencing an invalid
                 * value.  Most likely, we unrolled something and are
@@ -539,9 +551,12 @@
             break;
          }
 
-         case nir_intrinsic_store_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
+         case nir_intrinsic_store_deref: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            struct deref_node *node = get_deref_node(deref, state);
+
+            assert(intrin->src[1].is_ssa);
+            nir_ssa_def *value = intrin->src[1].ssa;
 
             if (node == NULL) {
                /* Probably an out-of-bounds array store.  That should be a
@@ -556,22 +571,20 @@
             assert(intrin->num_components ==
                    glsl_get_vector_elements(node->type));
 
-            assert(intrin->src[0].is_ssa);
-
             nir_ssa_def *new_def;
             b.cursor = nir_before_instr(&intrin->instr);
 
             unsigned wrmask = nir_intrinsic_write_mask(intrin);
             if (wrmask == (1 << intrin->num_components) - 1) {
                /* Whole variable store - just copy the source.  Note that
-                * intrin->num_components and intrin->src[0].ssa->num_components
+                * intrin->num_components and value->num_components
                 * may differ.
                 */
                unsigned swiz[4];
                for (unsigned i = 0; i < 4; i++)
                   swiz[i] = i < intrin->num_components ? i : 0;
 
-               new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
+               new_def = nir_swizzle(&b, value, swiz,
                                      intrin->num_components, false);
             } else {
                nir_ssa_def *old_def =
@@ -583,7 +596,7 @@
                nir_ssa_def *srcs[4];
                for (unsigned i = 0; i < intrin->num_components; i++) {
                   if (wrmask & (1 << i)) {
-                     srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
+                     srcs[i] = nir_channel(&b, value, i);
                   } else {
                      srcs[i] = nir_channel(&b, old_def, i);
                   }
@@ -660,14 +673,17 @@
 
    foreach_list_typed_safe(struct deref_node, node, direct_derefs_link,
                            &state.direct_deref_nodes) {
-      nir_deref_var *deref = node->deref;
+      nir_deref_path *path = &node->path;
 
-      if (deref->var->data.mode != nir_var_local) {
+      assert(path->path[0]->deref_type == nir_deref_type_var);
+      nir_variable *var = path->path[0]->var;
+
+      if (var->data.mode != nir_var_local) {
          exec_node_remove(&node->direct_derefs_link);
          continue;
       }
 
-      if (deref_may_be_aliased(deref, &state)) {
+      if (path_may_be_aliased(path, &state)) {
          exec_node_remove(&node->direct_derefs_link);
          continue;
       }
@@ -675,7 +691,7 @@
       node->lower_to_ssa = true;
       progress = true;
 
-      foreach_deref_node_match(deref, lower_copies_to_load_store, &state);
+      foreach_deref_node_match(path, lower_copies_to_load_store, &state);
    }
 
    if (!progress)
@@ -702,7 +718,7 @@
       memset(store_blocks, 0,
              BITSET_WORDS(state.impl->num_blocks) * sizeof(*store_blocks));
 
-      assert(node->deref->var->constant_initializer == NULL);
+      assert(node->path.path[0]->var->constant_initializer == NULL);
 
       if (node->stores) {
          struct set_entry *store_entry;
diff --git a/src/compiler/nir/nir_lower_wpos_center.c b/src/compiler/nir/nir_lower_wpos_center.c
index dca810d..b6f3529 100644
--- a/src/compiler/nir/nir_lower_wpos_center.c
+++ b/src/compiler/nir/nir_lower_wpos_center.c
@@ -81,14 +81,14 @@
    nir_foreach_instr(instr, block) {
       if (instr->type == nir_instr_type_intrinsic) {
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-         if (intr->intrinsic == nir_intrinsic_load_var) {
-            nir_deref_var *dvar = intr->variables[0];
-            nir_variable *var = dvar->var;
+         if (intr->intrinsic == nir_intrinsic_load_deref) {
+            nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+            nir_variable *var = nir_deref_instr_get_variable(deref);
 
             if (var->data.mode == nir_var_shader_in &&
                 var->data.location == VARYING_SLOT_POS) {
                /* gl_FragCoord should not have array/struct derefs: */
-               assert(dvar->deref.child == NULL);
+               assert(deref->deref_type == nir_deref_type_var);
                update_fragcoord(b, intr, for_sample_shading);
                progress = true;
             }
diff --git a/src/compiler/nir/nir_lower_wpos_ytransform.c b/src/compiler/nir/nir_lower_wpos_ytransform.c
index 62166e7..fc61beb 100644
--- a/src/compiler/nir/nir_lower_wpos_ytransform.c
+++ b/src/compiler/nir/nir_lower_wpos_ytransform.c
@@ -77,11 +77,10 @@
 /* see emit_wpos_adjustment() in st_mesa_to_tgsi.c */
 static void
 emit_wpos_adjustment(lower_wpos_ytransform_state *state,
-                     nir_intrinsic_instr *intr,
+                     nir_intrinsic_instr *intr, nir_variable *fragcoord,
                      bool invert, float adjX, float adjY[2])
 {
    nir_builder *b = &state->b;
-   nir_variable *fragcoord = intr->variables[0]->var;
    nir_ssa_def *wpostrans, *wpos_temp, *wpos_temp_y, *wpos_input;
 
    assert(intr->dest.is_ssa);
@@ -144,10 +143,10 @@
 }
 
 static void
-lower_fragcoord(lower_wpos_ytransform_state *state, nir_intrinsic_instr *intr)
+lower_fragcoord(lower_wpos_ytransform_state *state,
+                nir_intrinsic_instr *intr, nir_variable *fragcoord)
 {
    const nir_lower_wpos_ytransform_options *options = state->options;
-   nir_variable *fragcoord = intr->variables[0]->var;
    float adjX = 0.0f;
    float adjY[2] = { 0.0f, 0.0f };
    bool invert = false;
@@ -229,7 +228,32 @@
       }
    }
 
-   emit_wpos_adjustment(state, intr, invert, adjX, adjY);
+   emit_wpos_adjustment(state, intr, fragcoord, invert, adjX, adjY);
+}
+
+static void
+lower_load_pointcoord(lower_wpos_ytransform_state *state,
+                      nir_intrinsic_instr *intr)
+{
+   nir_builder *b = &state->b;
+   b->cursor = nir_after_instr(&intr->instr);
+
+   nir_ssa_def *pntc = &intr->dest.ssa;
+   nir_ssa_def *transform = get_transform(state);
+   nir_ssa_def *y = nir_channel(b, pntc, 1);
+   /* offset = max(transform.z, 0.0): 1 if we're flipping, 0 otherwise. */
+   nir_ssa_def *offset = nir_fmax(b, nir_channel(b, transform, 2),
+                                  nir_imm_float(b, 0.0));
+   /* scaled = y * transform.x: flips the sign of y if we're flipping. */
+   nir_ssa_def *scaled = nir_fmul(b, y, nir_channel(b, transform, 0));
+
+   /* Reassemble the vector as (pntc.x, offset + scaled). */
+   nir_ssa_def *flipped_pntc = nir_vec2(b,
+                                        nir_channel(b, pntc, 0),
+                                        nir_fadd(b, offset, scaled));
+
+   nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, nir_src_for_ssa(flipped_pntc),
+                                  flipped_pntc->parent_instr);
 }
 
 /* turns 'fddy(p)' into 'fddy(fmul(p, transform.x))' */
@@ -253,9 +277,9 @@
       fddy->src[0].swizzle[i] = MIN2(i, pt->num_components - 1);
 }
 
-/* Multiply interp_var_at_offset's offset by transform.x to flip it. */
+/* Multiply interp_deref_at_offset's offset by transform.x to flip it. */
 static void
-lower_interp_var_at_offset(lower_wpos_ytransform_state *state,
+lower_interp_deref_at_offset(lower_wpos_ytransform_state *state,
                            nir_intrinsic_instr *interp)
 {
    nir_builder *b = &state->b;
@@ -264,10 +288,10 @@
 
    b->cursor = nir_before_instr(&interp->instr);
 
-   offset = nir_ssa_for_src(b, interp->src[0], 2);
+   offset = nir_ssa_for_src(b, interp->src[1], 2);
    flip_y = nir_fmul(b, nir_channel(b, offset, 1),
                         nir_channel(b, get_transform(state), 0));
-   nir_instr_rewrite_src(&interp->instr, &interp->src[0],
+   nir_instr_rewrite_src(&interp->instr, &interp->src[1],
                          nir_src_for_ssa(nir_vec2(b, nir_channel(b, offset, 0),
                                                      flip_y)));
 }
@@ -298,28 +322,30 @@
    nir_foreach_instr_safe(instr, block) {
       if (instr->type == nir_instr_type_intrinsic) {
          nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-         if (intr->intrinsic == nir_intrinsic_load_var) {
-            nir_deref_var *dvar = intr->variables[0];
-            nir_variable *var = dvar->var;
+         if (intr->intrinsic == nir_intrinsic_load_deref) {
+            nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+            nir_variable *var = nir_deref_instr_get_variable(deref);
 
             if ((var->data.mode == nir_var_shader_in &&
                  var->data.location == VARYING_SLOT_POS) ||
                 (var->data.mode == nir_var_system_value &&
                  var->data.location == SYSTEM_VALUE_FRAG_COORD)) {
                /* gl_FragCoord should not have array/struct derefs: */
-               assert(dvar->deref.child == NULL);
-               lower_fragcoord(state, intr);
+               lower_fragcoord(state, intr, var);
             } else if (var->data.mode == nir_var_system_value &&
                        var->data.location == SYSTEM_VALUE_SAMPLE_POS) {
-               assert(dvar->deref.child == NULL);
                lower_load_sample_pos(state, intr);
+            } else if (var->data.mode == nir_var_shader_in &&
+                       var->data.location == VARYING_SLOT_PNTC &&
+                       state->shader->options->lower_wpos_pntc) {
+               lower_load_pointcoord(state, intr);
             }
          } else if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
-            lower_fragcoord(state, intr);
+            lower_fragcoord(state, intr, NULL);
          } else if (intr->intrinsic == nir_intrinsic_load_sample_pos) {
             lower_load_sample_pos(state, intr);
-         } else if (intr->intrinsic == nir_intrinsic_interp_var_at_offset) {
-            lower_interp_var_at_offset(state, intr);
+         } else if (intr->intrinsic == nir_intrinsic_interp_deref_at_offset) {
+            lower_interp_deref_at_offset(state, intr);
          }
       } else if (instr->type == nir_instr_type_alu) {
          nir_alu_instr *alu = nir_instr_as_alu(instr);
diff --git a/src/compiler/nir/nir_move_load_const.c b/src/compiler/nir/nir_move_load_const.c
new file mode 100644
index 0000000..abc53fd
--- /dev/null
+++ b/src/compiler/nir/nir_move_load_const.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018 Red Hat
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robdclark@gmail.com>
+ *
+ */
+
+#include "nir.h"
+
+
+/*
+ * A simple pass that moves load_const's into consuming block if
+ * they are only consumed in a single block, to try to counter-
+ * act nir's tendency to move all load_const to the top of the
+ * first block.
+ */
+
+/* Walk an SSA def's uses and pick a better block for it using the
+ * dominance tree.  In short, if all of the uses are contained in a
+ * single block, that block is returned; otherwise the result is the
+ * least common ancestor block of all the uses.  Returns NULL when the
+ * def has any if-uses or has no uses at all.
+ */
+static nir_block *
+get_preferred_block(nir_ssa_def *def)
+{
+   nir_block *lca = NULL;
+
+   /* Defs consumed directly as an if-condition are left alone: */
+   if (!list_empty(&def->if_uses))
+      return NULL;
+
+   nir_foreach_use(use, def) {
+      nir_instr *instr = use->parent_instr;
+      nir_block *use_block = instr->block;
+
+      /*
+       * Kind of an ugly special-case, but phi instructions
+       * need to appear first in the block, so by definition
+       * we can't move a load_const into a block where it is
+       * consumed by a phi instruction.  Instead, treat the use
+       * as the dominance LCA of the phi's predecessor blocks.
+       */
+      if (instr->type == nir_instr_type_phi) {
+         nir_phi_instr *phi = nir_instr_as_phi(instr);
+         nir_block *phi_lca = NULL;
+         nir_foreach_phi_src(src, phi)
+            phi_lca = nir_dominance_lca(phi_lca, src->pred);
+         use_block = phi_lca;
+      }
+
+      lca = nir_dominance_lca(lca, use_block);
+   }
+
+   return lca;
+}
+
+/* Link instr into block just ahead of its first non-phi instruction. */
+static void
+insert_after_phi(nir_instr *instr, nir_block *block)
+{
+   nir_foreach_instr(instr2, block) {
+      if (instr2->type == nir_instr_type_phi)
+         continue;
+
+      exec_node_insert_node_before(&instr2->node,
+                                   &instr->node);
+
+      return;
+   }
+
+   /* No non-phi instruction was found (the block is empty or holds
+    * nothing but phis), so append the instruction at the tail.
+    */
+   exec_list_push_tail(&block->instr_list, &instr->node);
+}
+
+/* Move each load_const to the block chosen by get_preferred_block().
+ * Returns true if at least one instruction was moved.
+ */
+bool
+nir_move_load_const(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+
+      /* This pass only relinks instructions and reports both kinds of
+       * metadata as preserved below, so request them once per impl.
+       */
+      nir_metadata_require(function->impl,
+                           nir_metadata_block_index | nir_metadata_dominance);
+
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_load_const)
+               continue;
+
+            nir_load_const_instr *load = nir_instr_as_load_const(instr);
+            nir_block *use_block = get_preferred_block(&load->def);
+
+            /* No candidate block, or the load already lives there. */
+            if (!use_block || use_block == load->instr.block)
+               continue;
+
+            /* Unlink, re-insert after any phis, then fix the owner. */
+            exec_node_remove(&load->instr.node);
+            insert_after_phi(&load->instr, use_block);
+            load->instr.block = use_block;
+            progress = true;
+         }
+      }
+
+      nir_metadata_preserve(function->impl,
+                            nir_metadata_block_index | nir_metadata_dominance);
+   }
+
+   return progress;
+}
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 89a6c6b..ed8e0ae 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -91,6 +91,7 @@
 tint = "int"
 tbool = "bool32"
 tuint = "uint"
+tuint16 = "uint16"
 tfloat32 = "float32"
 tint32 = "int32"
 tuint32 = "uint32"
@@ -180,9 +181,9 @@
          bit_sizes = [8, 16, 32, 64]
       for bit_size in bit_sizes:
           if bit_size == 16 and dst_t == tfloat and src_t == tfloat:
-              rnd_modes = ['rtne', 'rtz', 'undef']
+              rnd_modes = ['_rtne', '_rtz', '']
               for rnd_mode in rnd_modes:
-                  unop_convert("{0}2{1}{2}_{3}".format(src_t[0], dst_t[0],
+                  unop_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                        bit_size, rnd_mode),
                                dst_t + str(bit_size), src_t, "src0")
           else:
@@ -282,12 +283,24 @@
         (src0.w << 24);
 """)
 
+unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
+           "dst.x = src0.x | ((uint32_t)src0.y << 16);")
+
 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
 
+unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
+           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
+
 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
            "dst.x = src0.x; dst.y = src0.x >> 32;")
 
+unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
+           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
+
+unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
+           "dst.x = src0.x; dst.y = src0.x >> 16;")
+
 # Lowered floating point unpacking operations.
 
 
@@ -296,6 +309,9 @@
 unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 
+unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
+unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
+
 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
 
@@ -351,8 +367,8 @@
 """)
 
 
-for i in xrange(1, 5):
-   for j in xrange(1, 5):
+for i in range(1, 5):
+   for j in range(1, 5):
       unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")
 
 
@@ -608,6 +624,9 @@
 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
               "src0 | ((uint64_t)src1 << 32)")
 
+binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
+              "src0 | ((uint32_t)src1 << 16)")
+
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 # if either of its arguments are 32.
diff --git a/src/compiler/nir/nir_opcodes_c.py b/src/compiler/nir/nir_opcodes_c.py
index c191855..8bfcda6 100644
--- a/src/compiler/nir/nir_opcodes_c.py
+++ b/src/compiler/nir/nir_opcodes_c.py
@@ -23,6 +23,8 @@
 # Authors:
 #    Connor Abbott (cwabbott0@gmail.com)
 
+from __future__ import print_function
+
 from nir_opcodes import opcodes
 from mako.template import Template
 
@@ -62,14 +64,19 @@
 %                 endif
 %              endif
                switch (dst_bit_size) {
-%                 for dst_bits in [16, 32, 64]:
+%                 if dst_t == 'float':
+<%                    bit_sizes = [16, 32, 64] %>
+%                 else:
+<%                    bit_sizes = [8, 16, 32, 64] %>
+%                 endif
+%                 for dst_bits in bit_sizes:
                   case ${dst_bits}:
 %                    if src_t == 'float' and dst_t == 'float' and dst_bits == 16:
                      switch(rnd) {
-%                       for rnd_t in ['rtne', 'rtz', 'undef']:
-                        case nir_rounding_mode_${rnd_t}:
-                           return ${'nir_op_{0}2{1}{2}_{3}'.format(src_t[0], dst_t[0],
-                                                                   dst_bits, rnd_t)};
+%                       for rnd_t in [('rtne', '_rtne'), ('rtz', '_rtz'), ('undef', '')]:
+                        case nir_rounding_mode_${rnd_t[0]}:
+                           return ${'nir_op_{0}2{1}{2}{3}'.format(src_t[0], dst_t[0],
+                                                                   dst_bits, rnd_t[1])};
 %                       endfor
                         default:
                            unreachable("Invalid 16-bit nir rounding mode");
@@ -109,7 +116,7 @@
 }
 
 const nir_op_info nir_op_infos[nir_num_opcodes] = {
-% for name, opcode in sorted(opcodes.iteritems()):
+% for name, opcode in sorted(opcodes.items()):
 {
    .name = "${name}",
    .num_inputs = ${opcode.num_inputs},
@@ -130,4 +137,4 @@
 };
 """)
 
-print template.render(opcodes=opcodes)
+print(template.render(opcodes=opcodes))
diff --git a/src/compiler/nir/nir_opcodes_h.py b/src/compiler/nir/nir_opcodes_h.py
index c9538e4..6b4e2fe 100644
--- a/src/compiler/nir/nir_opcodes_h.py
+++ b/src/compiler/nir/nir_opcodes_h.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 template = """\
 /* Copyright (C) 2014 Connor Abbott
@@ -28,7 +29,7 @@
 #ifndef _NIR_OPCODES_
 #define _NIR_OPCODES_
 
-<% opcode_names = sorted(opcodes.iterkeys()) %>
+<% opcode_names = sorted(opcodes.keys()) %>
 
 typedef enum {
 % for name in opcode_names:
@@ -43,4 +44,4 @@
 from nir_opcodes import opcodes
 from mako.template import Template
 
-print Template(template).render(opcodes=opcodes)
+print(Template(template).render(opcodes=opcodes))
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 5a69364..19526d8 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -23,6 +23,9 @@
 # Authors:
 #    Jason Ekstrand (jason@jlekstrand.net)
 
+from __future__ import print_function
+
+from collections import OrderedDict
 import nir_algebraic
 import itertools
 
@@ -149,7 +152,9 @@
    (('~inot', ('feq', a, b)), ('fne', a, b)),
    (('~inot', ('fne', a, b)), ('feq', a, b)),
    (('inot', ('ilt', a, b)), ('ige', a, b)),
+   (('inot', ('ult', a, b)), ('uge', a, b)),
    (('inot', ('ige', a, b)), ('ilt', a, b)),
+   (('inot', ('uge', a, b)), ('ult', a, b)),
    (('inot', ('ieq', a, b)), ('ine', a, b)),
    (('inot', ('ine', a, b)), ('ieq', a, b)),
 
@@ -224,6 +229,8 @@
    (('imax', a, a), a),
    (('umin', a, a), a),
    (('umax', a, a), a),
+   (('fmax', a, ('fneg', a)), ('fabs', a)),
+   (('imax', a, ('ineg', a)), ('iabs', a)),
    (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))),
    (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))),
    (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))),
@@ -280,6 +287,11 @@
    (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))),
    (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)),
 
+   (('ior', 'a@bool', ('ieq', a, False)), True),
+   (('ior', 'a@bool', ('inot', a)), True),
+
+   (('iand', ('ieq', 'a@32', 0), ('ieq', 'b@32', 0)), ('ieq', ('ior', 'a@32', 'b@32'), 0)),
+
    # These patterns can result when (a < b || a < c) => (a < min(b, c))
    # transformations occur before constant propagation and loop-unrolling.
    (('~flt', a, ('fmax', b, a)), ('flt', a, b)),
@@ -288,6 +300,8 @@
    (('~fge', ('fmax', a, b), a), True),
    (('~flt', a, ('fmin', b, a)), False),
    (('~flt', ('fmax', a, b), a), False),
+   (('~fge', a, ('fmax', b, a)), ('fge', a, b)),
+   (('~fge', ('fmin', a, b), a), ('fge', b, a)),
 
    (('ilt', a, ('imax', b, a)), ('ilt', a, b)),
    (('ilt', ('imin', a, b), a), ('ilt', b, a)),
@@ -297,7 +311,23 @@
    (('ult', ('umin', a, b), a), ('ult', b, a)),
    (('uge', a, ('umin', b, a)), True),
    (('uge', ('umax', a, b), a), True),
+   (('ilt', a, ('imin', b, a)), False),
+   (('ilt', ('imax', a, b), a), False),
+   (('ige', a, ('imax', b, a)), ('ige', a, b)),
+   (('ige', ('imin', a, b), a), ('ige', b, a)),
+   (('ult', a, ('umin', b, a)), False),
+   (('ult', ('umax', a, b), a), False),
+   (('uge', a, ('umax', b, a)), ('uge', a, b)),
+   (('uge', ('umin', a, b), a), ('uge', b, a)),
 
+   (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))),
+   (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))),
+   (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))),
+   (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))),
+   (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))),
+   (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))),
+   (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))),
+   (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))),
    (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))),
    (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))),
    (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))),
@@ -321,7 +351,7 @@
    (('imul', ('b2i', a), ('b2i', b)), ('b2i', ('iand', a, b))),
    (('fmul', ('b2f', a), ('b2f', b)), ('b2f', ('iand', a, b))),
    (('fsat', ('fadd', ('b2f', a), ('b2f', b))), ('b2f', ('ior', a, b))),
-   (('iand', 'a@bool', 1.0), ('b2f', a)),
+   (('iand', 'a@bool', 1.0), ('b2f', a), '!options->lower_b2f'),
    # True/False are ~0 and 0 in NIR.  b2i of True is 1, and -1 is ~0 (True).
    (('ineg', ('b2i@32', a)), a),
    (('flt', ('fneg', ('b2f', a)), 0), a), # Generated by TGSI KILL_IF.
@@ -347,6 +377,10 @@
    (('ixor', a, a), 0),
    (('ixor', a, 0), a),
    (('inot', ('inot', a)), a),
+   (('ior', ('iand', a, b), b), b),
+   (('ior', ('ior', a, b), b), ('ior', a, b)),
+   (('iand', ('ior', a, b), b), b),
+   (('iand', ('iand', a, b), b), ('iand', a, b)),
    # DeMorgan's Laws
    (('iand', ('inot', a), ('inot', b)), ('inot', ('ior',  a, b))),
    (('ior',  ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
@@ -419,6 +453,7 @@
    (('i2b', ('iabs', a)), ('i2b', a)),
    (('fabs', ('b2f', a)), ('b2f', a)),
    (('iabs', ('b2i', a)), ('b2i', a)),
+   (('inot', ('f2b', a)), ('feq', a, 0.0)),
 
    # Packing and then unpacking does nothing
    (('unpack_64_2x32_split_x', ('pack_64_2x32_split', a, b)), a),
@@ -492,6 +527,20 @@
               ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
     'options->lower_bitfield_insert'),
 
+   # Alternative lowering that doesn't rely on bfi.
+   (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
+    ('bcsel', ('ilt', 31, 'bits'),
+     'insert',
+     ('ior',
+      ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))),
+      ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))),
+    'options->lower_bitfield_insert_to_shifts'),
+
+   # bfm lowering -- note that the NIR opcode is undefined if either arg is 32.
+   (('bfm', 'bits', 'offset'),
+    ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'),
+    'options->lower_bfm'),
+
    (('ibitfield_extract', 'value', 'offset', 'bits'),
     ('bcsel', ('ilt', 31, 'bits'), 'value',
               ('ibfe', 'value', 'offset', 'bits')),
@@ -502,6 +551,30 @@
               ('ubfe', 'value', 'offset', 'bits')),
     'options->lower_bitfield_extract'),
 
+   (('ibitfield_extract', 'value', 'offset', 'bits'),
+    ('bcsel', ('ieq', 0, 'bits'),
+     0,
+     ('ishr',
+       ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')),
+       ('isub', 32, 'bits'))),
+    'options->lower_bitfield_extract_to_shifts'),
+
+   (('ubitfield_extract', 'value', 'offset', 'bits'),
+    ('iand',
+     ('ushr', 'value', 'offset'),
+     ('bcsel', ('ieq', 'bits', 32),
+      0xffffffff,
+      ('bfm', 'bits', 0))),
+    'options->lower_bitfield_extract_to_shifts'),
+
+   (('ifind_msb', 'value'),
+    ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')),
+    'options->lower_ifind_msb'),
+
+   (('find_lsb', 'value'),
+    ('ufind_msb', ('iand', 'value', ('ineg', 'value'))),
+    'options->lower_find_lsb'),
+
    (('extract_i8', a, 'b@32'),
     ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
     'options->lower_extract_byte'),
@@ -567,7 +640,7 @@
      'options->lower_unpack_snorm_4x8'),
 ]
 
-invert = {'feq': 'fne', 'fne': 'feq', 'fge': 'flt', 'flt': 'fge' }
+invert = OrderedDict([('feq', 'fne'), ('fne', 'feq'), ('fge', 'flt'), ('flt', 'fge')])
 
 for left, right in list(itertools.combinations(invert.keys(), 2)) + zip(invert.keys(), invert.keys()):
    optimizations.append((('inot', ('ior(is_used_once)', (left, a, b), (right, c, d))),
@@ -724,10 +797,13 @@
    # we do these late so that we don't get in the way of creating ffmas
    (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))),
    (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))),
+
+   # Lowered for backends without a dedicated b2f instruction
+   (('b2f@32', a), ('iand', a, 1.0), 'options->lower_b2f'),
 ]
 
-print nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()
-print nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
-                                  before_ffma_optimizations).render()
-print nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
-                                  late_optimizations).render()
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma",
+                                  before_ffma_optimizations).render())
+print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
+                                  late_optimizations).render())
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index d6be807..e2920e6 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -41,7 +41,7 @@
 static bool
 constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
 {
-   nir_const_value src[4];
+   nir_const_value src[NIR_MAX_VEC_COMPONENTS];
 
    if (!instr->dest.dest.is_ssa)
       return false;
@@ -76,10 +76,20 @@
 
       for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
            j++) {
-         if (load_const->def.bit_size == 64)
+         switch(load_const->def.bit_size) {
+         case 64:
             src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]];
-         else
+            break;
+         case 32:
             src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]];
+            break;
+         case 16:
+            src[i].u16[j] = load_const->value.u16[instr->src[i].swizzle[j]];
+            break;
+         case 8:
+            src[i].u8[j] = load_const->value.u8[instr->src[i].swizzle[j]];
+            break;
+         }
       }
 
       /* We shouldn't have any source modifiers in the optimization loop. */
@@ -115,49 +125,28 @@
 }
 
 static bool
-constant_fold_deref(nir_instr *instr, nir_deref_var *deref)
-{
-   bool progress = false;
-
-   for (nir_deref *tail = deref->deref.child; tail; tail = tail->child) {
-      if (tail->deref_type != nir_deref_type_array)
-         continue;
-
-      nir_deref_array *arr = nir_deref_as_array(tail);
-
-      if (arr->deref_array_type == nir_deref_array_type_indirect &&
-          arr->indirect.is_ssa &&
-          arr->indirect.ssa->parent_instr->type == nir_instr_type_load_const) {
-         nir_load_const_instr *indirect =
-            nir_instr_as_load_const(arr->indirect.ssa->parent_instr);
-
-         arr->base_offset += indirect->value.u32[0];
-
-         /* Clear out the source */
-         nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL));
-
-         arr->deref_array_type = nir_deref_array_type_direct;
-
-         progress = true;
-      }
-   }
-
-   return progress;
-}
-
-static bool
 constant_fold_intrinsic_instr(nir_intrinsic_instr *instr)
 {
    bool progress = false;
 
-   unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
-   for (unsigned i = 0; i < num_vars; i++) {
-      progress |= constant_fold_deref(&instr->instr, instr->variables[i]);
-   }
-
    if (instr->intrinsic == nir_intrinsic_discard_if) {
       nir_const_value *src_val = nir_src_as_const_value(instr->src[0]);
-      if (src_val && src_val->u32[0] == 0) {
+      if (src_val && src_val->u32[0] == NIR_FALSE) {
+         nir_instr_remove(&instr->instr);
+         progress = true;
+      } else if (src_val && src_val->u32[0] == NIR_TRUE) {
+         /* This method of getting a nir_shader * from a nir_instr is
+          * admittedly gross, but given the rarity of hitting this case I think
+          * it's preferable to plumbing an otherwise unused nir_shader *
+          * parameter through four functions to get here.
+          */
+         nir_cf_node *cf_node = &instr->instr.block->cf_node;
+         nir_function_impl *impl = nir_cf_node_get_function(cf_node);
+         nir_shader *shader = impl->function->shader;
+
+         nir_intrinsic_instr *discard =
+            nir_intrinsic_instr_create(shader, nir_intrinsic_discard);
+         nir_instr_insert_before(&instr->instr, &discard->instr);
          nir_instr_remove(&instr->instr);
          progress = true;
       }
@@ -167,20 +156,6 @@
 }
 
 static bool
-constant_fold_tex_instr(nir_tex_instr *instr)
-{
-   bool progress = false;
-
-   if (instr->texture)
-      progress |= constant_fold_deref(&instr->instr, instr->texture);
-
-   if (instr->sampler)
-      progress |= constant_fold_deref(&instr->instr, instr->sampler);
-
-   return progress;
-}
-
-static bool
 constant_fold_block(nir_block *block, void *mem_ctx)
 {
    bool progress = false;
@@ -194,9 +169,6 @@
          progress |=
             constant_fold_intrinsic_instr(nir_instr_as_intrinsic(instr));
          break;
-      case nir_instr_type_tex:
-         progress |= constant_fold_tex_instr(nir_instr_as_tex(instr));
-         break;
       default:
          /* Don't know how to constant fold */
          break;
diff --git a/src/compiler/nir/nir_opt_copy_prop_vars.c b/src/compiler/nir/nir_opt_copy_prop_vars.c
index cc8f00f..d8c4cab 100644
--- a/src/compiler/nir/nir_opt_copy_prop_vars.c
+++ b/src/compiler/nir/nir_opt_copy_prop_vars.c
@@ -23,6 +23,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_deref.h"
 
 #include "util/bitscan.h"
 
@@ -40,7 +41,7 @@
  *  2) Dead code elimination of store_var and copy_var intrinsics based on
  *     killed destination values.
  *
- *  3) Removal of redundant load_var intrinsics.  We can't trust regular CSE
+ *  3) Removal of redundant load_deref intrinsics.  We can't trust regular CSE
  *     to do this because it isn't aware of variable writes that may alias the
  *     value and make the former load invalid.
  *
@@ -56,7 +57,7 @@
    bool is_ssa;
    union {
       nir_ssa_def *ssa[4];
-      nir_deref_var *deref;
+      nir_deref_instr *deref;
    };
 };
 
@@ -68,7 +69,7 @@
    unsigned comps_may_be_read;
    struct value src;
 
-   nir_deref_var *dst;
+   nir_deref_instr *dst;
 };
 
 struct copy_prop_var_state {
@@ -88,7 +89,7 @@
 
 static struct copy_entry *
 copy_entry_create(struct copy_prop_var_state *state,
-                  nir_deref_var *dst_deref)
+                  nir_deref_instr *dst_deref)
 {
    struct copy_entry *entry;
    if (!list_empty(&state->copy_free_list)) {
@@ -127,9 +128,10 @@
  * ever needs it.
  */
 static enum deref_compare_result
-compare_derefs(nir_deref_var *a, nir_deref_var *b)
+compare_deref_paths(nir_deref_path *a_path,
+                    nir_deref_path *b_path)
 {
-   if (a->var != b->var)
+   if (a_path->path[0]->var != b_path->path[0]->var)
       return 0;
 
    /* Start off assuming they fully compare.  We ignore equality for now.  In
@@ -139,62 +141,54 @@
                                       derefs_a_contains_b_bit |
                                       derefs_b_contains_a_bit;
 
-   nir_deref *a_tail = &a->deref;
-   nir_deref *b_tail = &b->deref;
-   while (a_tail->child && b_tail->child) {
-      a_tail = a_tail->child;
-      b_tail = b_tail->child;
+   nir_deref_instr **a_p = &a_path->path[1];
+   nir_deref_instr **b_p = &b_path->path[1];
+   while (*a_p != NULL && *b_p != NULL) {
+      nir_deref_instr *a_tail = *(a_p++);
+      nir_deref_instr *b_tail = *(b_p++);
 
-      assert(a_tail->deref_type == b_tail->deref_type);
       switch (a_tail->deref_type) {
-      case nir_deref_type_array: {
-         nir_deref_array *a_arr = nir_deref_as_array(a_tail);
-         nir_deref_array *b_arr = nir_deref_as_array(b_tail);
+      case nir_deref_type_array:
+      case nir_deref_type_array_wildcard: {
+         assert(b_tail->deref_type == nir_deref_type_array ||
+                b_tail->deref_type == nir_deref_type_array_wildcard);
 
-         if (a_arr->deref_array_type == nir_deref_array_type_direct &&
-             b_arr->deref_array_type == nir_deref_array_type_direct) {
-            /* If they're both direct and have different offsets, they
-             * don't even alias much less anything else.
-             */
-            if (a_arr->base_offset != b_arr->base_offset)
-               return 0;
-         } else if (a_arr->deref_array_type == nir_deref_array_type_wildcard) {
-            if (b_arr->deref_array_type != nir_deref_array_type_wildcard)
+         if (a_tail->deref_type == nir_deref_type_array_wildcard) {
+            if (b_tail->deref_type != nir_deref_type_array_wildcard)
                result &= ~derefs_b_contains_a_bit;
-         } else if (b_arr->deref_array_type == nir_deref_array_type_wildcard) {
-            if (a_arr->deref_array_type != nir_deref_array_type_wildcard)
+         } else if (b_tail->deref_type == nir_deref_type_array_wildcard) {
+            if (a_tail->deref_type != nir_deref_type_array_wildcard)
                result &= ~derefs_a_contains_b_bit;
-         } else if (a_arr->deref_array_type == nir_deref_array_type_indirect &&
-                    b_arr->deref_array_type == nir_deref_array_type_indirect) {
-            assert(a_arr->indirect.is_ssa && b_arr->indirect.is_ssa);
-            if (a_arr->indirect.ssa == b_arr->indirect.ssa) {
-               /* If they're different constant offsets from the same indirect
-                * then they don't alias at all.
+         } else {
+            assert(a_tail->deref_type == nir_deref_type_array &&
+                   b_tail->deref_type == nir_deref_type_array);
+            assert(a_tail->arr.index.is_ssa && b_tail->arr.index.is_ssa);
+
+            nir_const_value *a_index_const =
+               nir_src_as_const_value(a_tail->arr.index);
+            nir_const_value *b_index_const =
+               nir_src_as_const_value(b_tail->arr.index);
+            if (a_index_const && b_index_const) {
+               /* If they're both direct and have different offsets, they
+                * don't even alias much less anything else.
                 */
-               if (a_arr->base_offset != b_arr->base_offset)
+               if (a_index_const->u32[0] != b_index_const->u32[0])
                   return 0;
-               /* Otherwise the indirect and base both match */
+            } else if (a_tail->arr.index.ssa == b_tail->arr.index.ssa) {
+               /* They're the same indirect, continue on */
             } else {
-               /* If they're have different indirect offsets then we can't
-                * prove anything about containment.
+               /* They're not the same index so we can't prove anything about
+                * containment.
                 */
                result &= ~(derefs_a_contains_b_bit | derefs_b_contains_a_bit);
             }
-         } else {
-            /* In this case, one is indirect and the other direct so we can't
-             * prove anything about containment.
-             */
-            result &= ~(derefs_a_contains_b_bit | derefs_b_contains_a_bit);
          }
          break;
       }
 
       case nir_deref_type_struct: {
-         nir_deref_struct *a_struct = nir_deref_as_struct(a_tail);
-         nir_deref_struct *b_struct = nir_deref_as_struct(b_tail);
-
          /* If they're different struct members, they don't even alias */
-         if (a_struct->index != b_struct->index)
+         if (a_tail->strct.index != b_tail->strct.index)
             return 0;
          break;
       }
@@ -205,9 +199,9 @@
    }
 
    /* If a is longer than b, then it can't contain b */
-   if (a_tail->child)
+   if (*a_p != NULL)
       result &= ~derefs_a_contains_b_bit;
-   if (b_tail->child)
+   if (*b_p != NULL)
       result &= ~derefs_b_contains_a_bit;
 
    /* If a contains b and b contains a they must be equal. */
@@ -217,6 +211,28 @@
    return result;
 }
 
+static enum deref_compare_result
+compare_derefs(nir_deref_instr *a, nir_deref_instr *b)
+{  /* Compares two deref instructions; returns a mask of derefs_*_bit flags. */
+   if (a == b) {  /* Same instruction: every relation holds, no paths needed. */
+      return derefs_equal_bit | derefs_may_alias_bit |
+             derefs_a_contains_b_bit | derefs_b_contains_a_bit;
+   }
+   /* Otherwise expand both derefs into paths and compare those. */
+   nir_deref_path a_path, b_path;
+   nir_deref_path_init(&a_path, a, NULL);
+   nir_deref_path_init(&b_path, b, NULL);
+   assert(a_path.path[0]->deref_type == nir_deref_type_var);
+   assert(b_path.path[0]->deref_type == nir_deref_type_var);
+   /* compare_deref_paths() reads path[0]->var, hence the var-deref asserts. */
+   enum deref_compare_result result = compare_deref_paths(&a_path, &b_path);
+   /* Release both paths now that the result has been computed. */
+   nir_deref_path_finish(&a_path);
+   nir_deref_path_finish(&b_path);
+
+   return result;
+}
+
 static void
 remove_dead_writes(struct copy_prop_var_state *state,
                    struct copy_entry *entry, unsigned write_mask)
@@ -257,7 +273,7 @@
 
 static struct copy_entry *
 lookup_entry_for_deref(struct copy_prop_var_state *state,
-                       nir_deref_var *deref,
+                       nir_deref_instr *deref,
                        enum deref_compare_result allowed_comparisons)
 {
    list_for_each_entry(struct copy_entry, iter, &state->copies, link) {
@@ -270,7 +286,7 @@
 
 static void
 mark_aliased_entries_as_read(struct copy_prop_var_state *state,
-                             nir_deref_var *deref, unsigned components)
+                             nir_deref_instr *deref, unsigned components)
 {
    list_for_each_entry(struct copy_entry, iter, &state->copies, link) {
       if (compare_derefs(iter->dst, deref) & derefs_may_alias_bit)
@@ -280,7 +296,7 @@
 
 static struct copy_entry *
 get_entry_and_kill_aliases(struct copy_prop_var_state *state,
-                           nir_deref_var *deref,
+                           nir_deref_instr *deref,
                            unsigned write_mask)
 {
    struct copy_entry *entry = NULL;
@@ -319,8 +335,12 @@
                         nir_variable_mode modes)
 {
    list_for_each_entry_safe(struct copy_entry, iter, &state->copies, link) {
-      if ((iter->dst->var->data.mode & modes) ||
-          (!iter->src.is_ssa && (iter->src.deref->var->data.mode & modes)))
+      nir_variable *dst_var = nir_deref_instr_get_variable(iter->dst);
+      nir_variable *src_var = iter->src.is_ssa ? NULL :
+         nir_deref_instr_get_variable(iter->src.deref);
+
+      if ((dst_var->data.mode & modes) ||
+          (src_var && (src_var->data.mode & modes)))
          copy_entry_remove(state, iter);
    }
 }
@@ -366,10 +386,10 @@
    *value = entry->src;
    assert(value->is_ssa);
 
-   const struct glsl_type *type = nir_deref_tail(&entry->dst->deref)->type;
+   const struct glsl_type *type = entry->dst->type;
    unsigned num_components = glsl_get_vector_elements(type);
 
-   uint8_t available = 0;
+   nir_component_mask_t available = 0;
    bool all_same = true;
    for (unsigned i = 0; i < num_components; i++) {
       if (value->ssa[i])
@@ -387,11 +407,11 @@
    }
 
    if (available != (1 << num_components) - 1 &&
-       intrin->intrinsic == nir_intrinsic_load_var &&
+       intrin->intrinsic == nir_intrinsic_load_deref &&
        (available & nir_ssa_def_components_read(&intrin->dest.ssa)) == 0) {
       /* If none of the components read are available as SSA values, then we
        * should just bail.  Otherwise, we would end up replacing the uses of
-       * the load_var a vecN() that just gathers up its components.
+       * the load_deref with a vecN() that just gathers up its components.
        */
       return false;
    }
@@ -399,10 +419,10 @@
    b->cursor = nir_after_instr(&intrin->instr);
 
    nir_ssa_def *load_def =
-      intrin->intrinsic == nir_intrinsic_load_var ? &intrin->dest.ssa : NULL;
+      intrin->intrinsic == nir_intrinsic_load_deref ? &intrin->dest.ssa : NULL;
 
    bool keep_intrin = false;
-   nir_ssa_def *comps[4];
+   nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS];
    for (unsigned i = 0; i < num_components; i++) {
       if (value->ssa[i]) {
          comps[i] = nir_channel(b, value->ssa[i], i);
@@ -411,7 +431,7 @@
           * list.  Just re-use a channel from the load.
           */
          if (load_def == NULL)
-            load_def = nir_load_deref_var(b, entry->dst);
+            load_def = nir_load_deref(b, entry->dst);
 
          if (load_def->parent_instr == &intrin->instr)
             keep_intrin = true;
@@ -445,79 +465,39 @@
  * process is guided by \param guide which references the same type as \param
  * specific but has the same wildcard array lengths as \param deref.
  */
-static nir_deref_var *
-specialize_wildcards(nir_deref_var *deref,
-                     nir_deref_var *guide,
-                     nir_deref_var *specific,
-                     void *mem_ctx)
+static nir_deref_instr *
+specialize_wildcards(nir_builder *b,
+                     nir_deref_path *deref,
+                     nir_deref_path *guide,
+                     nir_deref_path *specific)
 {
-   nir_deref_var *ret = nir_deref_var_create(mem_ctx, deref->var);
-
-   nir_deref *deref_tail = deref->deref.child;
-   nir_deref *guide_tail = &guide->deref;
-   nir_deref *spec_tail = &specific->deref;
-   nir_deref *ret_tail = &ret->deref;
-   while (deref_tail) {
-      switch (deref_tail->deref_type) {
-      case nir_deref_type_array: {
-         nir_deref_array *deref_arr = nir_deref_as_array(deref_tail);
-
-         nir_deref_array *ret_arr = nir_deref_array_create(ret_tail);
-         ret_arr->deref.type = deref_arr->deref.type;
-         ret_arr->deref_array_type = deref_arr->deref_array_type;
-
-         switch (deref_arr->deref_array_type) {
-         case nir_deref_array_type_direct:
-            ret_arr->base_offset = deref_arr->base_offset;
-            break;
-         case nir_deref_array_type_indirect:
-            ret_arr->base_offset = deref_arr->base_offset;
-            assert(deref_arr->indirect.is_ssa);
-            ret_arr->indirect = deref_arr->indirect;
-            break;
-         case nir_deref_array_type_wildcard:
-            /* This is where things get tricky.  We have to search through
-             * the entry deref to find its corresponding wildcard and fill
-             * this slot in with the value from the src.
-             */
-            while (guide_tail->child) {
-               guide_tail = guide_tail->child;
-               spec_tail = spec_tail->child;
-
-               if (guide_tail->deref_type == nir_deref_type_array &&
-                   nir_deref_as_array(guide_tail)->deref_array_type ==
-                   nir_deref_array_type_wildcard)
-                  break;
-            }
-
-            nir_deref_array *spec_arr = nir_deref_as_array(spec_tail);
-            ret_arr->deref_array_type = spec_arr->deref_array_type;
-            ret_arr->base_offset = spec_arr->base_offset;
-            ret_arr->indirect = spec_arr->indirect;
+   nir_deref_instr **deref_p = &deref->path[1];
+   nir_deref_instr **guide_p = &guide->path[1];
+   nir_deref_instr **spec_p = &specific->path[1];
+   nir_deref_instr *ret_tail = deref->path[0];
+   for (; *deref_p; deref_p++) {
+      if ((*deref_p)->deref_type == nir_deref_type_array_wildcard) {
+         /* This is where things get tricky.  We have to search through
+          * the entry deref to find its corresponding wildcard and fill
+          * this slot in with the value from the src.
+          */
+         while (*guide_p &&
+                (*guide_p)->deref_type != nir_deref_type_array_wildcard) {
+            guide_p++;
+            spec_p++;
          }
+         assert(*guide_p && *spec_p);
 
-         ret_tail->child = &ret_arr->deref;
-         break;
+         ret_tail = nir_build_deref_follower(b, ret_tail, *spec_p);
+
+         guide_p++;
+         spec_p++;
+      } else {
+         ret_tail = nir_build_deref_follower(b, ret_tail, *deref_p);
       }
-      case nir_deref_type_struct: {
-         nir_deref_struct *deref_struct = nir_deref_as_struct(deref_tail);
-
-         nir_deref_struct *ret_struct =
-            nir_deref_struct_create(ret_tail, deref_struct->index);
-         ret_struct->deref.type = deref_struct->deref.type;
-
-         ret_tail->child = &ret_struct->deref;
-         break;
-      }
-      case nir_deref_type_var:
-         unreachable("Invalid deref type");
-      }
-
-      deref_tail = deref_tail->child;
-      ret_tail = ret_tail->child;
    }
 
-   return ret;
+   return ret_tail;
 }
 
 /* Do a "load" from an deref-based entry return it in "value" as a value.  The
@@ -529,57 +509,55 @@
 load_from_deref_entry_value(struct copy_prop_var_state *state,
                             struct copy_entry *entry,
                             nir_builder *b, nir_intrinsic_instr *intrin,
-                            nir_deref_var *src, struct value *value)
+                            nir_deref_instr *src, struct value *value)
 {
    *value = entry->src;
 
-   /* Walk the deref to get the two tails and also figure out if we need to
-    * specialize any wildcards.
-    */
+   b->cursor = nir_instr_remove(&intrin->instr);
+
+   nir_deref_path entry_dst_path, src_path;
+   nir_deref_path_init(&entry_dst_path, entry->dst, state->mem_ctx);
+   nir_deref_path_init(&src_path, src, state->mem_ctx);
+
    bool need_to_specialize_wildcards = false;
-   nir_deref *entry_tail = &entry->dst->deref;
-   nir_deref *src_tail = &src->deref;
-   while (entry_tail->child && src_tail->child) {
-      assert(src_tail->child->deref_type == entry_tail->child->deref_type);
-      if (src_tail->child->deref_type == nir_deref_type_array) {
-         nir_deref_array *entry_arr = nir_deref_as_array(entry_tail->child);
-         nir_deref_array *src_arr = nir_deref_as_array(src_tail->child);
+   nir_deref_instr **entry_p = &entry_dst_path.path[1];
+   nir_deref_instr **src_p = &src_path.path[1];
+   while (*entry_p && *src_p) {
+      nir_deref_instr *entry_tail = *entry_p++;
+      nir_deref_instr *src_tail = *src_p++;
 
-         if (src_arr->deref_array_type != nir_deref_array_type_wildcard &&
-             entry_arr->deref_array_type == nir_deref_array_type_wildcard)
-            need_to_specialize_wildcards = true;
-      }
-
-      entry_tail = entry_tail->child;
-      src_tail = src_tail->child;
+      if (src_tail->deref_type == nir_deref_type_array &&
+          entry_tail->deref_type == nir_deref_type_array_wildcard)
+         need_to_specialize_wildcards = true;
    }
 
    /* If the entry deref is longer than the source deref then it refers to a
     * smaller type and we can't source from it.
     */
-   assert(entry_tail->child == NULL);
+   assert(*entry_p == NULL);
 
    if (need_to_specialize_wildcards) {
       /* The entry has some wildcards that are not in src.  This means we need
        * to construct a new deref based on the entry but using the wildcards
        * from the source and guided by the entry dst.  Oof.
        */
-      value->deref = specialize_wildcards(entry->src.deref, entry->dst, src,
-                                          state->mem_ctx);
-   } else {
-      /* We're going to need to make a copy in case we modify it below */
-      value->deref = nir_deref_var_clone(value->deref, state->mem_ctx);
+      nir_deref_path entry_src_path;
+      nir_deref_path_init(&entry_src_path, entry->src.deref, state->mem_ctx);
+      value->deref = specialize_wildcards(b, &entry_src_path,
+                                          &entry_dst_path, &src_path);
+      nir_deref_path_finish(&entry_src_path);
    }
 
-   if (src_tail->child) {
-      /* If our source deref is longer than the entry deref, that's ok because
-       * it just means the entry deref needs to be extended a bit.
-       */
-      nir_deref *value_tail = nir_deref_tail(&value->deref->deref);
-      value_tail->child = nir_deref_clone(src_tail->child, value_tail);
+   /* If our source deref is longer than the entry deref, that's ok because
+    * it just means the entry deref needs to be extended a bit.
+    */
+   while (*src_p) {
+      nir_deref_instr *src_tail = *src_p++;
+      value->deref = nir_build_deref_follower(b, value->deref, src_tail);
    }
 
-   b->cursor = nir_instr_remove(&intrin->instr);
+   nir_deref_path_finish(&entry_dst_path);
+   nir_deref_path_finish(&src_path);
 
    return true;
 }
@@ -587,7 +565,7 @@
 static bool
 try_load_from_entry(struct copy_prop_var_state *state, struct copy_entry *entry,
                     nir_builder *b, nir_intrinsic_instr *intrin,
-                    nir_deref_var *src, struct value *value)
+                    nir_deref_instr *src, struct value *value)
 {
    if (entry == NULL)
       return false;
@@ -628,8 +606,8 @@
          apply_barrier_for_modes(state, nir_var_shader_out);
          break;
 
-      case nir_intrinsic_load_var: {
-         nir_deref_var *src = intrin->variables[0];
+      case nir_intrinsic_load_deref: {
+         nir_deref_instr *src = nir_src_as_deref(intrin->src[0]);
 
          uint8_t comps_read = nir_ssa_def_components_read(&intrin->dest.ssa);
          mark_aliased_entries_as_read(state, src, comps_read);
@@ -658,8 +636,7 @@
                }
             } else {
                /* We're turning it into a load of a different variable */
-               ralloc_steal(intrin, value.deref);
-               intrin->variables[0] = value.deref;
+               intrin->src[0] = nir_src_for_ssa(&value.deref->dest.ssa);
 
                /* Put it back in again. */
                nir_builder_instr_insert(b, instr);
@@ -695,15 +672,15 @@
          break;
       }
 
-      case nir_intrinsic_store_var: {
+      case nir_intrinsic_store_deref: {
          struct value value = {
             .is_ssa = true
          };
 
          for (unsigned i = 0; i < intrin->num_components; i++)
-            value.ssa[i] = intrin->src[0].ssa;
+            value.ssa[i] = intrin->src[1].ssa;
 
-         nir_deref_var *dst = intrin->variables[0];
+         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
          unsigned wrmask = nir_intrinsic_write_mask(intrin);
          struct copy_entry *entry =
             get_entry_and_kill_aliases(state, dst, wrmask);
@@ -711,9 +688,9 @@
          break;
       }
 
-      case nir_intrinsic_copy_var: {
-         nir_deref_var *dst = intrin->variables[0];
-         nir_deref_var *src = intrin->variables[1];
+      case nir_intrinsic_copy_deref: {
+         nir_deref_instr *dst = nir_src_as_deref(intrin->src[0]);
+         nir_deref_instr *src = nir_src_as_deref(intrin->src[1]);
 
          if (compare_derefs(src, dst) & derefs_equal_bit) {
             /* This is a no-op self-copy.  Get rid of it */
@@ -728,7 +705,7 @@
          struct value value;
          if (try_load_from_entry(state, src_entry, b, intrin, src, &value)) {
             if (value.is_ssa) {
-               nir_store_deref_var(b, dst, value.ssa[0], 0xf);
+               nir_store_deref(b, dst, value.ssa[0], 0xf);
                intrin = nir_instr_as_intrinsic(nir_builder_last_instr(b));
             } else {
                /* If this would be a no-op self-copy, don't bother. */
@@ -736,8 +713,7 @@
                   continue;
 
                /* Just turn it into a copy of a different deref */
-               ralloc_steal(intrin, value.deref);
-               intrin->variables[1] = value.deref;
+               intrin->src[1] = nir_src_for_ssa(&value.deref->dest.ssa);
 
                /* Put it back in again. */
                nir_builder_instr_insert(b, instr);
diff --git a/src/compiler/nir/nir_opt_copy_propagate.c b/src/compiler/nir/nir_opt_copy_propagate.c
index 3cd476a..189d544 100644
--- a/src/compiler/nir/nir_opt_copy_propagate.c
+++ b/src/compiler/nir/nir_opt_copy_propagate.c
@@ -99,6 +99,22 @@
 }
 
 static bool
+is_trivial_deref_cast(nir_deref_instr *cast)
+{  /* True if `cast` is a cast deref that changes nothing about its parent. */
+   if (cast->deref_type != nir_deref_type_cast)
+      return false;
+   /* Only a cast whose parent is itself a deref can be compared below. */
+   nir_deref_instr *parent = nir_src_as_deref(cast->parent);
+   if (!parent)
+      return false;
+   /* Trivial = identical mode, type, num_components and bit_size. */
+   return cast->mode == parent->mode &&
+          cast->type == parent->type &&
+          cast->dest.ssa.num_components == parent->dest.ssa.num_components &&
+          cast->dest.ssa.bit_size == parent->dest.ssa.bit_size;
+}
+
+static bool
 copy_prop_src(nir_src *src, nir_instr *parent_instr, nir_if *parent_if,
               unsigned num_components)
 {
@@ -109,23 +125,31 @@
    }
 
    nir_instr *src_instr = src->ssa->parent_instr;
-   if (src_instr->type != nir_instr_type_alu)
-      return false;
+   nir_ssa_def *copy_def;
+   if (src_instr->type == nir_instr_type_alu) {
+      nir_alu_instr *alu_instr = nir_instr_as_alu(src_instr);
+      if (!is_swizzleless_move(alu_instr))
+         return false;
 
-   nir_alu_instr *alu_instr = nir_instr_as_alu(src_instr);
-   if (!is_swizzleless_move(alu_instr))
-      return false;
+      if (alu_instr->src[0].src.ssa->num_components != num_components)
+         return false;
 
-   if (alu_instr->src[0].src.ssa->num_components != num_components)
+      copy_def= alu_instr->src[0].src.ssa;
+   } else if (src_instr->type == nir_instr_type_deref) {
+      nir_deref_instr *deref_instr = nir_instr_as_deref(src_instr);
+      if (!is_trivial_deref_cast(deref_instr))
+         return false;
+
+      copy_def = deref_instr->parent.ssa;
+   } else {
       return false;
+   }
 
    if (parent_instr) {
-      nir_instr_rewrite_src(parent_instr, src,
-                            nir_src_for_ssa(alu_instr->src[0].src.ssa));
+      nir_instr_rewrite_src(parent_instr, src, nir_src_for_ssa(copy_def));
    } else {
       assert(src == &parent_if->condition);
-      nir_if_rewrite_condition(parent_if,
-                               nir_src_for_ssa(alu_instr->src[0].src.ssa));
+      nir_if_rewrite_condition(parent_if, nir_src_for_ssa(copy_def));
    }
 
    return true;
@@ -195,28 +219,6 @@
 }
 
 static bool
-copy_prop_deref_var(nir_instr *instr, nir_deref_var *deref_var)
-{
-   if (!deref_var)
-      return false;
-
-   bool progress = false;
-   for (nir_deref *deref = deref_var->deref.child;
-        deref; deref = deref->child) {
-      if (deref->deref_type != nir_deref_type_array)
-         continue;
-
-      nir_deref_array *arr = nir_deref_as_array(deref);
-      if (arr->deref_array_type != nir_deref_array_type_indirect)
-         continue;
-
-      while (copy_prop_src(&arr->indirect, instr, NULL, 1))
-         progress = true;
-   }
-   return progress;
-}
-
-static bool
 copy_prop_instr(nir_instr *instr)
 {
    bool progress = false;
@@ -234,6 +236,24 @@
       return progress;
    }
 
+   case nir_instr_type_deref: {
+      nir_deref_instr *deref = nir_instr_as_deref(instr);
+
+      if (deref->deref_type != nir_deref_type_var) {
+         assert(deref->dest.is_ssa);
+         const unsigned comps = deref->dest.ssa.num_components;
+         while (copy_prop_src(&deref->parent, instr, NULL, comps))
+            progress = true;
+      }
+
+      if (deref->deref_type == nir_deref_type_array) {
+         while (copy_prop_src(&deref->arr.index, instr, NULL, 1))
+            progress = true;
+      }
+
+      return progress;
+   }
+
    case nir_instr_type_tex: {
       nir_tex_instr *tex = nir_instr_as_tex(instr);
       for (unsigned i = 0; i < tex->num_srcs; i++) {
@@ -242,11 +262,6 @@
             progress = true;
       }
 
-      if (copy_prop_deref_var(instr, tex->texture))
-         progress = true;
-      if (copy_prop_deref_var(instr, tex->sampler))
-         progress = true;
-
       while (copy_prop_dest(&tex->dest, instr))
          progress = true;
 
@@ -263,12 +278,6 @@
             progress = true;
       }
 
-      for (unsigned i = 0;
-           i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
-         if (copy_prop_deref_var(instr, intrin->variables[i]))
-            progress = true;
-      }
-
       if (nir_intrinsic_infos[intrin->intrinsic].has_dest) {
          while (copy_prop_dest(&intrin->dest, instr))
             progress = true;
diff --git a/src/compiler/nir/nir_opt_dce.c b/src/compiler/nir/nir_opt_dce.c
index 570e430..70532be 100644
--- a/src/compiler/nir/nir_opt_dce.c
+++ b/src/compiler/nir/nir_opt_dce.c
@@ -52,6 +52,7 @@
 init_instr(nir_instr *instr, nir_instr_worklist *worklist)
 {
    nir_alu_instr *alu_instr;
+   nir_deref_instr *deref_instr;
    nir_intrinsic_instr *intrin_instr;
    nir_tex_instr *tex_instr;
 
@@ -73,6 +74,12 @@
          mark_and_push(worklist, instr);
       break;
 
+   case nir_instr_type_deref:
+      deref_instr = nir_instr_as_deref(instr);
+      if (!deref_instr->dest.is_ssa)
+         mark_and_push(worklist, instr);
+      break;
+
    case nir_instr_type_intrinsic:
       intrin_instr = nir_instr_as_intrinsic(instr);
       if (nir_intrinsic_infos[intrin_instr->intrinsic].flags &
@@ -122,8 +129,7 @@
       init_block(block, worklist);
    }
 
-   nir_instr *instr = NULL;
-   nir_instr_worklist_foreach(worklist, instr)
+   nir_foreach_instr_in_worklist(instr, worklist)
       nir_foreach_src(instr, mark_live_cb, worklist);
 
    nir_instr_worklist_destroy(worklist);
diff --git a/src/compiler/nir/nir_opt_if.c b/src/compiler/nir/nir_opt_if.c
index ac5ed13..d8e03d6 100644
--- a/src/compiler/nir/nir_opt_if.c
+++ b/src/compiler/nir/nir_opt_if.c
@@ -22,7 +22,9 @@
  */
 
 #include "nir.h"
+#include "nir/nir_builder.h"
 #include "nir_control_flow.h"
+#include "nir_loop_analyze.h"
 
 /**
  * Gets the single block that jumps back to the loop header. Already assumes
@@ -178,6 +180,13 @@
       }
    }
 
+   /* We're about to re-arrange a bunch of blocks so make sure that we don't
+    * have deref uses which cross block boundaries.  We don't want a deref
+    * accidentally ending up in a phi.
+    */
+   nir_rematerialize_derefs_in_use_blocks_impl(
+      nir_cf_node_get_function(&loop->cf_node));
+
    /* Before we do anything, convert the loop to LCSSA.  We're about to
     * replace a bunch of SSA defs with registers and this will prevent any of
     * it from leaking outside the loop.
@@ -222,7 +231,153 @@
 }
 
 static bool
-opt_if_cf_list(struct exec_list *cf_list)
+is_block_empty(nir_block *block)
+{  /* True when nir_cf_node_is_last() holds and the block has no instructions. */
+   return nir_cf_node_is_last(&block->cf_node) && /* presumably: nothing follows it in its CF list */
+          exec_list_is_empty(&block->instr_list); /* instruction list is empty */
+}
+
+/**
+ * Flips an if-statement whose then-branch is empty, turning:
+ *
+ *     if (cond) {
+ *     } else {
+ *         do_work();
+ *     }
+ *
+ * into:
+ *
+ *     if (!cond) {
+ *         do_work();
+ *     } else {
+ *     }
+ * Returns true if the if-statement was rewritten, false if left untouched. */
+static bool
+opt_if_simplification(nir_builder *b, nir_if *nif)
+{
+   /* Only simplify if the then block is empty and the else block is not. */
+   if (!is_block_empty(nir_if_first_then_block(nif)) ||
+       is_block_empty(nir_if_first_else_block(nif)))
+      return false;
+   /* NOTE(review): condition.ssa is read without an is_ssa check; assumes SSA. */
+   /* Make sure the condition is a comparison operation. */
+   nir_instr *src_instr = nif->condition.ssa->parent_instr;
+   if (src_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *alu_instr = nir_instr_as_alu(src_instr);
+   if (!nir_alu_instr_is_comparison(alu_instr))
+      return false;
+
+   /* Emit inot(cond) right after the comparison and make the if use it. */
+   b->cursor = nir_after_instr(&alu_instr->instr);
+
+   nir_ssa_def *new_condition =
+      nir_inot(b, &alu_instr->dest.dest.ssa);
+
+   nir_if_rewrite_condition(nif, nir_src_for_ssa(new_condition));
+
+   /* Grab pointers to the last then/else blocks for fixing up the phis. */
+   nir_block *then_block = nir_if_last_then_block(nif);
+   nir_block *else_block = nir_if_last_else_block(nif);
+
+   /* Walk all the phis in the block immediately following the if statement and
+    * exchange their then/else predecessors, since the branches trade places.
+    */
+   nir_block *after_if_block =
+      nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node));
+
+   nir_foreach_instr(instr, after_if_block) {
+      if (instr->type != nir_instr_type_phi)
+         continue;
+
+      nir_phi_instr *phi = nir_instr_as_phi(instr);
+
+      foreach_list_typed(nir_phi_src, src, node, &phi->srcs) {
+         if (src->pred == else_block) {
+            src->pred = then_block;
+         } else if (src->pred == then_block) {
+            src->pred = else_block;
+         }
+      }
+   }
+
+   /* Finally, move the contents of the else list into the then list. */
+   nir_cf_list tmp;
+   nir_cf_extract(&tmp, nir_before_cf_list(&nif->else_list),
+                        nir_after_cf_list(&nif->else_list));
+   nir_cf_reinsert(&tmp, nir_before_cf_list(&nif->then_list));
+
+   return true;
+}
+
+/**
+ * This optimization simplifies potential loop terminators which then allows
+ * other passes such as opt_if_simplification() and loop unrolling to progress
+ * further.  The break may be in either branch; the other branch is the one
+ * that gets moved.  Returns true if anything was moved.  Example:
+ *     if (cond) {
+ *        ... then block instructions ...
+ *     } else {
+ *        ...
+ *        break;
+ *     }
+ *
+ * into:
+ *
+ *     if (cond) {
+ *     } else {
+ *        ...
+ *        break;
+ *     }
+ *     ... then block instructions ...
+ */
+static bool
+opt_if_loop_terminator(nir_if *nif)
+{
+   nir_block *break_blk = NULL;
+   nir_block *continue_from_blk = NULL;
+   bool continue_from_then = true;
+   /* Find which branch ends in a break; the other is the one to move out. */
+   nir_block *last_then = nir_if_last_then_block(nif);
+   nir_block *last_else = nir_if_last_else_block(nif);
+
+   if (nir_block_ends_in_break(last_then)) {
+      break_blk = last_then;
+      continue_from_blk = last_else;
+      continue_from_then = false;
+   } else if (nir_block_ends_in_break(last_else)) {
+      break_blk = last_else;
+      continue_from_blk = last_then;
+   }
+
+   /* Bail if neither branch of the if-statement ends in a break */
+   if (!break_blk)
+      return false;
+
+   /* If the continue from block is empty then return as there is nothing to
+    * move.
+    */
+   nir_block *first_continue_from_blk = continue_from_then ?
+      nir_if_first_then_block(nif) :
+      nir_if_first_else_block(nif);
+   if (is_block_empty(first_continue_from_blk))
+      return false;
+   /* NOTE(review): criteria of nir_is_trivial_loop_if() are not visible here. */
+   if (!nir_is_trivial_loop_if(nif, break_blk))
+      return false;
+
+   /* Finally, move the continue-from branch to just after the if-statement. */
+   nir_cf_list tmp;
+   nir_cf_extract(&tmp, nir_before_block(first_continue_from_blk),
+                        nir_after_block(continue_from_blk));
+   nir_cf_reinsert(&tmp, nir_after_cf_node(&nif->cf_node));
+
+   return true;
+}
+
+static bool
+opt_if_cf_list(nir_builder *b, struct exec_list *cf_list)
 {
    bool progress = false;
    foreach_list_typed(nir_cf_node, cf_node, node, cf_list) {
@@ -232,14 +387,16 @@
 
       case nir_cf_node_if: {
          nir_if *nif = nir_cf_node_as_if(cf_node);
-         progress |= opt_if_cf_list(&nif->then_list);
-         progress |= opt_if_cf_list(&nif->else_list);
+         progress |= opt_if_cf_list(b, &nif->then_list);
+         progress |= opt_if_cf_list(b, &nif->else_list);
+         progress |= opt_if_loop_terminator(nif);
+         progress |= opt_if_simplification(b, nif);
          break;
       }
 
       case nir_cf_node_loop: {
          nir_loop *loop = nir_cf_node_as_loop(cf_node);
-         progress |= opt_if_cf_list(&loop->body);
+         progress |= opt_if_cf_list(b, &loop->body);
          progress |= opt_peel_loop_initial_if(loop);
          break;
       }
@@ -261,7 +418,10 @@
       if (function->impl == NULL)
          continue;
 
-      if (opt_if_cf_list(&function->impl->body)) {
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+
+      if (opt_if_cf_list(&b, &function->impl->body)) {
          nir_metadata_preserve(function->impl, nir_metadata_none);
 
          /* If that made progress, we're no longer really in SSA form.  We
@@ -269,6 +429,7 @@
           * that don't dominate their uses.
           */
          nir_lower_regs_to_ssa_impl(function->impl);
+
          progress = true;
       }
    }
diff --git a/src/compiler/nir/nir_opt_large_constants.c b/src/compiler/nir/nir_opt_large_constants.c
new file mode 100644
index 0000000..25a9219
--- /dev/null
+++ b/src/compiler/nir/nir_opt_large_constants.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_deref.h"
+
+struct var_info {
+   bool is_constant;
+   bool found_read;
+};
+
+static nir_ssa_def *
+build_constant_load(nir_builder *b, nir_deref_instr *deref,
+                    glsl_type_size_align_func size_align)
+{
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   const unsigned bit_size = glsl_get_bit_size(deref->type);
+   const unsigned num_components = glsl_get_vector_elements(deref->type);
+
+   UNUSED unsigned var_size, var_align;
+   size_align(var->type, &var_size, &var_align);
+   assert(var->data.location % var_align == 0);
+
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_constant);
+   load->num_components = num_components;
+   nir_intrinsic_set_base(load, var->data.location);
+   nir_intrinsic_set_range(load, var_size);
+   load->src[0] = nir_src_for_ssa(nir_build_deref_offset(b, deref, size_align));
+   nir_ssa_dest_init(&load->instr, &load->dest,
+                     num_components, bit_size, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+
+   return &load->dest.ssa;
+}
+
+static void
+handle_constant_store(nir_builder *b, nir_intrinsic_instr *store,
+                      glsl_type_size_align_func size_align)
+{
+   nir_deref_instr *deref = nir_src_as_deref(store->src[0]);
+   assert(!nir_deref_instr_has_indirect(deref));
+
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   const unsigned bit_size = glsl_get_bit_size(deref->type);
+   const unsigned num_components = glsl_get_vector_elements(deref->type);
+
+   char *dst = (char *)b->shader->constant_data +
+               var->data.location +
+               nir_deref_instr_get_const_offset(deref, size_align);
+
+   nir_const_value *val = nir_src_as_const_value(store->src[1]);
+   switch (bit_size) {
+   case 8:
+      for (unsigned i = 0; i < num_components; i++)
+         ((uint8_t *)dst)[i] = val->u8[i];
+      break;
+
+   case 16:
+      for (unsigned i = 0; i < num_components; i++)
+         ((uint16_t *)dst)[i] = val->u16[i];
+      break;
+
+   case 32:
+      for (unsigned i = 0; i < num_components; i++)
+         ((uint32_t *)dst)[i] = val->u32[i];
+      break;
+
+   case 64:
+      for (unsigned i = 0; i < num_components; i++)
+         ((uint64_t *)dst)[i] = val->u64[i];
+      break;
+
+   default:
+      unreachable("Invalid bit size");
+   }
+}
+
+/** Lower large constant variables to shader constant data
+ *
+ * This pass looks for large (type_size(var->type) > threshold) variables
+ * which are statically constant and moves them into shader constant data.
+ * This is especially useful when large tables are baked into the shader
+ * source code because they can be moved into a UBO by the driver to reduce
+ * register pressure and make indirect access cheaper.
+ */
+bool
+nir_opt_large_constants(nir_shader *shader,
+                        glsl_type_size_align_func size_align,
+                        unsigned threshold)
+{
+   /* Default to a natural alignment if none is provided */
+   if (size_align == NULL)
+      size_align = glsl_get_natural_size_align_bytes;
+
+   /* This only works with a single entrypoint */
+   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
+
+   /* This pass can only be run once */
+   assert(shader->constant_data == NULL && shader->constant_data_size == 0);
+
+   /* The index parameter is unused for local variables so we'll use it for
+    * indexing into our array of variable metadata.
+    */
+   unsigned num_locals = 0;
+   nir_foreach_variable(var, &impl->locals)
+      var->data.index = num_locals++;
+
+   struct var_info *var_infos = malloc(num_locals * sizeof(struct var_info));
+   for (unsigned i = 0; i < num_locals; i++) {
+      var_infos[i] = (struct var_info) {
+         .is_constant = true,
+         .found_read = false,
+      };
+   }
+
+   /* First, walk through the shader and figure out what variables we can
+    * lower to the constant blob.
+    */
+   bool first_block = true;
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+         bool src_is_const = false;
+         nir_deref_instr *src_deref = NULL, *dst_deref = NULL;
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_store_deref:
+            dst_deref = nir_src_as_deref(intrin->src[0]);
+            src_is_const = nir_src_as_const_value(intrin->src[1]);
+            break;
+
+         case nir_intrinsic_load_deref:
+            src_deref = nir_src_as_deref(intrin->src[0]);
+            break;
+
+         case nir_intrinsic_copy_deref:
+            /* We always assume the src and therefore the dst are not
+             * constants here. Copy and constant propagation passes should
+             * have taken care of this in most cases anyway.
+             */
+            dst_deref = nir_src_as_deref(intrin->src[0]);
+            src_deref = nir_src_as_deref(intrin->src[1]);
+            src_is_const = false;
+            break;
+
+         default:
+            continue;
+         }
+
+         if (dst_deref && dst_deref->mode == nir_var_local) {
+            nir_variable *var = nir_deref_instr_get_variable(dst_deref);
+            assert(var->data.mode == nir_var_local);
+
+            /* We only consider variables constant if they only have constant
+             * stores, all the stores come before any reads, and all stores
+             * come in the first block.  We also can't handle indirect stores.
+             */
+            struct var_info *info = &var_infos[var->data.index];
+            if (!src_is_const || info->found_read || !first_block ||
+                nir_deref_instr_has_indirect(dst_deref))
+               info->is_constant = false;
+         }
+
+         if (src_deref && src_deref->mode == nir_var_local) {
+            nir_variable *var = nir_deref_instr_get_variable(src_deref);
+            assert(var->data.mode == nir_var_local);
+
+            var_infos[var->data.index].found_read = true;
+         }
+      }
+      first_block = false;
+   }
+
+   shader->constant_data_size = 0;
+   nir_foreach_variable(var, &impl->locals) {
+      struct var_info *info = &var_infos[var->data.index];
+      if (!info->is_constant)
+         continue;
+
+      unsigned var_size, var_align;
+      size_align(var->type, &var_size, &var_align);
+      if (var_size <= threshold || !info->found_read) {
+         /* Don't bother lowering small stuff or data that's never read */
+         info->is_constant = false;
+         continue;
+      }
+
+      var->data.location = ALIGN_POT(shader->constant_data_size, var_align);
+      shader->constant_data_size = var->data.location + var_size;
+   }
+
+   if (shader->constant_data_size == 0) {
+      free(var_infos);
+      return false;
+   }
+
+   shader->constant_data = rzalloc_size(shader, shader->constant_data_size);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block(block, impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_deref: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            if (deref->mode != nir_var_local)
+               continue;
+
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+            struct var_info *info = &var_infos[var->data.index];
+            if (info->is_constant) {
+               b.cursor = nir_after_instr(&intrin->instr);
+               nir_ssa_def *val = build_constant_load(&b, deref, size_align);
+               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                        nir_src_for_ssa(val));
+               nir_instr_remove(&intrin->instr);
+               nir_deref_instr_remove_if_unused(deref);
+            }
+            break;
+         }
+
+         case nir_intrinsic_store_deref: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]);
+            if (deref->mode != nir_var_local)
+               continue;
+
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+            struct var_info *info = &var_infos[var->data.index];
+            if (info->is_constant) {
+               b.cursor = nir_after_instr(&intrin->instr);
+               handle_constant_store(&b, intrin, size_align);
+               nir_instr_remove(&intrin->instr);
+               nir_deref_instr_remove_if_unused(deref);
+            }
+            break;
+         }
+
+         case nir_intrinsic_copy_deref: {
+            nir_deref_instr *deref = nir_src_as_deref(intrin->src[1]);
+            if (deref->mode != nir_var_local)
+               continue;
+
+            nir_variable *var = nir_deref_instr_get_variable(deref);
+            struct var_info *info = &var_infos[var->data.index];
+            if (info->is_constant) {
+               b.cursor = nir_after_instr(&intrin->instr);
+               nir_ssa_def *val = build_constant_load(&b, deref, size_align);
+               nir_store_deref(&b, nir_src_as_deref(intrin->src[0]), val, ~0);
+               nir_instr_remove(&intrin->instr);
+               nir_deref_instr_remove_if_unused(deref);
+            }
+            break;
+         }
+
+         default:
+            continue;
+         }
+      }
+   }
+
+   /* Clean up the now unused variables */
+   nir_foreach_variable_safe(var, &impl->locals) {
+      if (var_infos[var->data.index].is_constant)
+         exec_node_remove(&var->node);
+   }
+
+   free(var_infos);
+
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+   return true;
+}
diff --git a/src/compiler/nir/nir_opt_loop_unroll.c b/src/compiler/nir/nir_opt_loop_unroll.c
index b8efbb5..161c4ba 100644
--- a/src/compiler/nir/nir_opt_loop_unroll.c
+++ b/src/compiler/nir/nir_opt_loop_unroll.c
@@ -49,6 +49,9 @@
 static void
 loop_prepare_for_unroll(nir_loop *loop)
 {
+   nir_rematerialize_derefs_in_use_blocks_impl(
+      nir_cf_node_get_function(&loop->cf_node));
+
    nir_convert_loop_to_lcssa(loop);
 
    /* Lower phis at the top level of the loop body */
@@ -578,6 +581,10 @@
    return progress;
 }
 
+/**
+ * indirect_mask specifies which type of indirectly accessed variables
+ * should force loop unrolling.
+ */
 bool
 nir_opt_loop_unroll(nir_shader *shader, nir_variable_mode indirect_mask)
 {
diff --git a/src/compiler/nir/nir_opt_move_comparisons.c b/src/compiler/nir/nir_opt_move_comparisons.c
index 617c2ca..5da57dc 100644
--- a/src/compiler/nir/nir_opt_move_comparisons.c
+++ b/src/compiler/nir/nir_opt_move_comparisons.c
@@ -51,30 +51,6 @@
  */
 
 static bool
-is_comparison(nir_op op)
-{
-   switch (op) {
-   case nir_op_flt:
-   case nir_op_fge:
-   case nir_op_feq:
-   case nir_op_fne:
-   case nir_op_ilt:
-   case nir_op_ult:
-   case nir_op_ige:
-   case nir_op_uge:
-   case nir_op_ieq:
-   case nir_op_ine:
-   case nir_op_i2b:
-   case nir_op_f2b:
-   case nir_op_inot:
-   case nir_op_fnot:
-      return true;
-   default:
-      return false;
-   }
-}
-
-static bool
 move_comparison_source(nir_src *src, nir_block *block, nir_instr *before)
 {
    if (!src->is_ssa)
@@ -84,7 +60,7 @@
 
    if (src_instr->block == block &&
        src_instr->type == nir_instr_type_alu &&
-       is_comparison(nir_instr_as_alu(src_instr)->op)) {
+       nir_alu_instr_is_comparison(nir_instr_as_alu(src_instr))) {
 
       exec_node_remove(&src_instr->node);
 
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index 4ca4f80..ad9d0ab 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -66,8 +66,8 @@
          nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
          switch (intrin->intrinsic) {
-         case nir_intrinsic_load_var:
-            switch (intrin->variables[0]->var->data.mode) {
+         case nir_intrinsic_load_deref:
+            switch (nir_src_as_deref(intrin->src[0])->mode) {
             case nir_var_shader_in:
             case nir_var_uniform:
                break;
@@ -89,6 +89,7 @@
          break;
       }
 
+      case nir_instr_type_deref:
       case nir_instr_type_load_const:
          break;
 
diff --git a/src/compiler/nir/nir_opt_remove_phis.c b/src/compiler/nir/nir_opt_remove_phis.c
index b20ff72..e2d3994 100644
--- a/src/compiler/nir/nir_opt_remove_phis.c
+++ b/src/compiler/nir/nir_opt_remove_phis.c
@@ -139,8 +139,8 @@
    return progress;
 }
 
-static bool
-remove_phis_impl(nir_function_impl *impl)
+bool
+nir_opt_remove_phis_impl(nir_function_impl *impl)
 {
    bool progress = false;
    nir_builder bld;
@@ -165,7 +165,7 @@
 
    nir_foreach_function(function, shader)
       if (function->impl)
-         progress = remove_phis_impl(function->impl) || progress;
+         progress = nir_opt_remove_phis_impl(function->impl) || progress;
 
    return progress;
 }
diff --git a/src/compiler/nir/nir_opt_undef.c b/src/compiler/nir/nir_opt_undef.c
index 8d3210c..c26158d 100644
--- a/src/compiler/nir/nir_opt_undef.c
+++ b/src/compiler/nir/nir_opt_undef.c
@@ -107,19 +107,23 @@
 static bool
 opt_undef_store(nir_intrinsic_instr *intrin)
 {
+   int arg_index;
    switch (intrin->intrinsic) {
-   case nir_intrinsic_store_var:
+   case nir_intrinsic_store_deref:
+      arg_index = 1;
+      break;
    case nir_intrinsic_store_output:
    case nir_intrinsic_store_per_vertex_output:
    case nir_intrinsic_store_ssbo:
    case nir_intrinsic_store_shared:
+      arg_index =  0;
       break;
    default:
       return false;
    }
 
-   if (!intrin->src[0].is_ssa ||
-       intrin->src[0].ssa->parent_instr->type != nir_instr_type_ssa_undef)
+   if (!intrin->src[arg_index].is_ssa ||
+       intrin->src[arg_index].ssa->parent_instr->type != nir_instr_type_ssa_undef)
       return false;
 
    nir_instr_remove(&intrin->instr);
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 452e8cd..7cb16ab 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -27,6 +27,7 @@
 
 #include "nir.h"
 #include "compiler/shader_enums.h"
+#include "util/half_float.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <inttypes.h> /* for PRIx64 macro */
@@ -185,9 +186,9 @@
    print_src(&instr->src[src].src, state);
 
    bool print_swizzle = false;
-   unsigned used_channels = 0;
+   nir_component_mask_t used_channels = 0;
 
-   for (unsigned i = 0; i < 4; i++) {
+   for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
       if (!nir_alu_instr_channel_used(instr, src, i))
          continue;
 
@@ -199,13 +200,11 @@
       }
    }
 
-   unsigned live_channels = instr->src[src].src.is_ssa
-      ? instr->src[src].src.ssa->num_components
-      : instr->src[src].src.reg.reg->num_components;
+   unsigned live_channels = nir_src_num_components(instr->src[src].src);
 
    if (print_swizzle || used_channels != live_channels) {
       fprintf(fp, ".");
-      for (unsigned i = 0; i < 4; i++) {
+      for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
          if (!nir_alu_instr_channel_used(instr, src, i))
             continue;
 
@@ -228,7 +227,7 @@
    if (!dest->dest.is_ssa &&
        dest->write_mask != (1 << dest->dest.reg.reg->num_components) - 1) {
       fprintf(fp, ".");
-      for (unsigned i = 0; i < 4; i++)
+      for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
          if ((dest->write_mask >> i) & 1)
             fprintf(fp, "%c", "xyzw"[i]);
    }
@@ -299,6 +298,28 @@
    unsigned i, j;
 
    switch (glsl_get_base_type(type)) {
+   case GLSL_TYPE_UINT8:
+   case GLSL_TYPE_INT8:
+      /* Only float base types can be matrices. */
+      assert(cols == 1);
+
+      for (i = 0; i < rows; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "0x%02x", c->values[0].u8[i]);
+      }
+      break;
+
+   case GLSL_TYPE_UINT16:
+   case GLSL_TYPE_INT16:
+      /* Only float base types can be matrices. */
+      assert(cols == 1);
+
+      for (i = 0; i < rows; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "0x%04x", c->values[0].u16[i]);
+      }
+      break;
+
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
    case GLSL_TYPE_BOOL:
@@ -311,6 +332,15 @@
       }
       break;
 
+   case GLSL_TYPE_FLOAT16:
+      for (i = 0; i < cols; i++) {
+         for (j = 0; j < rows; j++) {
+            if (i + j > 0) fprintf(fp, ", ");
+            fprintf(fp, "%f", _mesa_half_to_float(c->values[i].u16[j]));
+         }
+      }
+      break;
+
    case GLSL_TYPE_FLOAT:
       for (i = 0; i < cols; i++) {
          for (j = 0; j < rows; j++) {
@@ -364,7 +394,7 @@
 }
 
 static const char *
-get_variable_mode_str(nir_variable_mode mode)
+get_variable_mode_str(nir_variable_mode mode, bool want_local_global_mode)
 {
    switch (mode) {
    case nir_var_shader_in:
@@ -379,9 +409,10 @@
       return "system";
    case nir_var_shared:
       return "shared";
-   case nir_var_param:
    case nir_var_global:
+      return want_local_global_mode ? "global" : "";
    case nir_var_local:
+      return want_local_global_mode ? "local" : "";
    default:
       return "";
    }
@@ -399,7 +430,7 @@
    const char *const patch = (var->data.patch) ? "patch " : "";
    const char *const inv = (var->data.invariant) ? "invariant " : "";
    fprintf(fp, "%s%s%s%s%s %s ",
-           cent, samp, patch, inv, get_variable_mode_str(var->data.mode),
+           cent, samp, patch, inv, get_variable_mode_str(var->data.mode, false),
            glsl_interp_mode_name(var->data.interpolation));
 
    const char *const coher = (var->data.image.coherent) ? "coherent " : "";
@@ -489,85 +520,124 @@
 }
 
 static void
-print_var(nir_variable *var, print_state *state)
+print_deref_link(nir_deref_instr *instr, bool whole_chain, print_state *state)
 {
    FILE *fp = state->fp;
-   fprintf(fp, "%s", get_var_name(var, state));
-}
 
-static void
-print_arg(nir_variable *var, print_state *state)
-{
-   FILE *fp = state->fp;
-   fprintf(fp, "%s %s", glsl_get_type_name(var->type),
-           get_var_name(var, state));
-}
+   if (instr->deref_type == nir_deref_type_var) {
+      fprintf(fp, "%s", get_var_name(instr->var, state));
+      return;
+   } else if (instr->deref_type == nir_deref_type_cast) {
+      fprintf(fp, "(%s *)", glsl_get_type_name(instr->type));
+      print_src(&instr->parent, state);
+      return;
+   }
 
-static void
-print_deref_var(nir_deref_var *deref, print_state *state)
-{
-   print_var(deref->var, state);
-}
+   assert(instr->parent.is_ssa);
+   nir_deref_instr *parent =
+      nir_instr_as_deref(instr->parent.ssa->parent_instr);
 
-static void
-print_deref_array(nir_deref_array *deref, print_state *state)
-{
-   FILE *fp = state->fp;
-   fprintf(fp, "[");
-   switch (deref->deref_array_type) {
-   case nir_deref_array_type_direct:
-      fprintf(fp, "%u", deref->base_offset);
-      break;
-   case nir_deref_array_type_indirect:
-      if (deref->base_offset != 0)
-         fprintf(fp, "%u + ", deref->base_offset);
-      print_src(&deref->indirect, state);
-      break;
-   case nir_deref_array_type_wildcard:
+   /* Is the parent we're going to print a bare cast? */
+   const bool is_parent_cast =
+      whole_chain && parent->deref_type == nir_deref_type_cast;
+
+   /* If we're not printing the whole chain, the parent we print will be an SSA
+    * value that represents a pointer.  The only deref type that naturally
+    * gives a pointer is a cast.
+    */
+   const bool is_parent_pointer =
+      !whole_chain || parent->deref_type == nir_deref_type_cast;
+
+   /* Struct derefs have a nice syntax that works on pointers, array derefs
+    * do not.
+    */
+   const bool need_deref =
+      is_parent_pointer && instr->deref_type != nir_deref_type_struct;
+
+   /* Casts need extra parens and so do * dereferences */
+   if (is_parent_cast || need_deref)
+      fprintf(fp, "(");
+
+   if (need_deref)
       fprintf(fp, "*");
+
+   if (whole_chain) {
+      print_deref_link(parent, whole_chain, state);
+   } else {
+      print_src(&instr->parent, state);
+   }
+
+   if (is_parent_cast || need_deref)
+      fprintf(fp, ")");
+
+   switch (instr->deref_type) {
+   case nir_deref_type_struct:
+      fprintf(fp, "%s%s", is_parent_pointer ? "->" : ".",
+              glsl_get_struct_elem_name(parent->type, instr->strct.index));
+      break;
+
+   case nir_deref_type_array: {
+      nir_const_value *const_index = nir_src_as_const_value(instr->arr.index);
+      if (const_index) {
+         fprintf(fp, "[%u]", const_index->u32[0]);
+      } else {
+         fprintf(fp, "[");
+         print_src(&instr->arr.index, state);
+         fprintf(fp, "]");
+      }
       break;
    }
-   fprintf(fp, "]");
+
+   case nir_deref_type_array_wildcard:
+      fprintf(fp, "[*]");
+      break;
+
+   default:
+      unreachable("Invalid deref instruction type");
+   }
 }
 
 static void
-print_deref_struct(nir_deref_struct *deref, const struct glsl_type *parent_type,
-                   print_state *state)
+print_deref_instr(nir_deref_instr *instr, print_state *state)
 {
    FILE *fp = state->fp;
-   fprintf(fp, ".%s", glsl_get_struct_elem_name(parent_type, deref->index));
-}
 
-static void
-print_deref(nir_deref_var *deref, print_state *state)
-{
-   nir_deref *tail = &deref->deref;
-   nir_deref *pretail = NULL;
-   while (tail != NULL) {
-      switch (tail->deref_type) {
-      case nir_deref_type_var:
-         assert(pretail == NULL);
-         assert(tail == &deref->deref);
-         print_deref_var(deref, state);
-         break;
+   print_dest(&instr->dest, state);
 
-      case nir_deref_type_array:
-         assert(pretail != NULL);
-         print_deref_array(nir_deref_as_array(tail), state);
-         break;
+   switch (instr->deref_type) {
+   case nir_deref_type_var:
+      fprintf(fp, " = deref_var ");
+      break;
+   case nir_deref_type_array:
+   case nir_deref_type_array_wildcard:
+      fprintf(fp, " = deref_array ");
+      break;
+   case nir_deref_type_struct:
+      fprintf(fp, " = deref_struct ");
+      break;
+   case nir_deref_type_cast:
+      fprintf(fp, " = deref_cast ");
+      break;
+   default:
+      unreachable("Invalid deref instruction type");
+   }
 
-      case nir_deref_type_struct:
-         assert(pretail != NULL);
-         print_deref_struct(nir_deref_as_struct(tail),
-                            pretail->type, state);
-         break;
+   /* Only casts naturally return a pointer type */
+   if (instr->deref_type != nir_deref_type_cast)
+      fprintf(fp, "&");
 
-      default:
-         unreachable("Invalid deref type");
-      }
+   print_deref_link(instr, false, state);
 
-      pretail = tail;
-      tail = pretail->child;
+   fprintf(fp, " (%s %s) ",
+           get_variable_mode_str(instr->mode, true),
+           glsl_get_type_name(instr->type));
+
+   if (instr->deref_type != nir_deref_type_var &&
+       instr->deref_type != nir_deref_type_cast) {
+      /* Print the entire chain as a comment */
+      fprintf(fp, "/* &");
+      print_deref_link(instr, true, state);
+      fprintf(fp, " */");
    }
 }
 
@@ -594,15 +664,6 @@
 
    fprintf(fp, ") (");
 
-   for (unsigned i = 0; i < info->num_variables; i++) {
-      if (i != 0)
-         fprintf(fp, ", ");
-
-      print_deref(instr->variables[i], state);
-   }
-
-   fprintf(fp, ") (");
-
    for (unsigned i = 0; i < info->num_indices; i++) {
       if (i != 0)
          fprintf(fp, ", ");
@@ -624,6 +685,7 @@
       [NIR_INTRINSIC_INTERP_MODE] = "interp_mode",
       [NIR_INTRINSIC_REDUCTION_OP] = "reduction_op",
       [NIR_INTRINSIC_CLUSTER_SIZE] = "cluster_size",
+      [NIR_INTRINSIC_PARAM_IDX] = "param_idx",
    };
    for (unsigned idx = 1; idx < NIR_INTRINSIC_NUM_INDEX_FLAGS; idx++) {
       if (!info->index_map[idx])
@@ -734,6 +796,7 @@
       break;
    }
 
+   bool has_texture_deref = false, has_sampler_deref = false;
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       print_src(&instr->src[i].src, state);
 
@@ -770,6 +833,14 @@
       case nir_tex_src_ddy:
          fprintf(fp, "(ddy)");
          break;
+      case nir_tex_src_texture_deref:
+         has_texture_deref = true;
+         fprintf(fp, "(texture_deref)");
+         break;
+      case nir_tex_src_sampler_deref:
+         has_sampler_deref = true;
+         fprintf(fp, "(sampler_deref)");
+         break;
       case nir_tex_src_texture_offset:
          fprintf(fp, "(texture_offset)");
          break;
@@ -792,17 +863,12 @@
       fprintf(fp, "%u (gather_component), ", instr->component);
    }
 
-   if (instr->texture) {
-      print_deref(instr->texture, state);
-      fprintf(fp, " (texture)");
-      if (instr->sampler) {
-         print_deref(instr->sampler, state);
-         fprintf(fp, " (sampler)");
-      }
-   } else {
-      assert(instr->sampler == NULL);
-      fprintf(fp, "%u (texture) %u (sampler)",
-              instr->texture_index, instr->sampler_index);
+   if (!has_texture_deref) {
+      fprintf(fp, "%u (texture), ", instr->texture_index);
+   }
+
+   if (!has_sampler_deref) {
+      fprintf(fp, "%u (sampler), ", instr->sampler_index);
    }
 }
 
@@ -817,14 +883,7 @@
       if (i != 0)
          fprintf(fp, ", ");
 
-      print_deref(instr->params[i], state);
-   }
-
-   if (instr->return_deref != NULL) {
-      if (instr->num_params != 0)
-         fprintf(fp, ", ");
-      fprintf(fp, "returning ");
-      print_deref(instr->return_deref, state);
+      print_src(&instr->params[i], state);
    }
 }
 
@@ -847,11 +906,22 @@
        * and then print the float in a comment for readability.
        */
 
-      if (instr->def.bit_size == 64)
+      switch (instr->def.bit_size) {
+      case 64:
          fprintf(fp, "0x%16" PRIx64 " /* %f */", instr->value.u64[i],
                  instr->value.f64[i]);
-      else
+         break;
+      case 32:
          fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
+         break;
+      case 16:
+         fprintf(fp, "0x%04x /* %f */", instr->value.u16[i],
+                 _mesa_half_to_float(instr->value.u16[i]));
+         break;
+      case 8:
+         fprintf(fp, "0x%02x", instr->value.u8[i]);
+         break;
+      }
    }
 
    fprintf(fp, ")");
@@ -925,6 +995,10 @@
       print_alu_instr(nir_instr_as_alu(instr), state);
       break;
 
+   case nir_instr_type_deref:
+      print_deref_instr(nir_instr_as_deref(instr), state);
+      break;
+
    case nir_instr_type_call:
       print_call_instr(nir_instr_as_call(instr), state);
       break;
@@ -1084,20 +1158,6 @@
 
    fprintf(fp, "\nimpl %s ", impl->function->name);
 
-   for (unsigned i = 0; i < impl->num_params; i++) {
-      if (i != 0)
-         fprintf(fp, ", ");
-
-      print_arg(impl->params[i], state);
-   }
-
-   if (impl->return_var != NULL) {
-      if (impl->num_params != 0)
-         fprintf(fp, ", ");
-      fprintf(fp, "returning ");
-      print_arg(impl->return_var, state);
-   }
-
    fprintf(fp, "{\n");
 
    nir_foreach_variable(var, &impl->locals) {
@@ -1124,34 +1184,8 @@
 {
    FILE *fp = state->fp;
 
-   fprintf(fp, "decl_function %s ", function->name);
-
-   for (unsigned i = 0; i < function->num_params; i++) {
-      if (i != 0)
-         fprintf(fp, ", ");
-
-      switch (function->params[i].param_type) {
-      case nir_parameter_in:
-         fprintf(fp, "in ");
-         break;
-      case nir_parameter_out:
-         fprintf(fp, "out ");
-         break;
-      case nir_parameter_inout:
-         fprintf(fp, "inout ");
-         break;
-      default:
-         unreachable("Invalid parameter type");
-      }
-
-      fprintf(fp, "%s", glsl_get_type_name(function->params[i].type));
-   }
-
-   if (function->return_type != NULL) {
-      if (function->num_params != 0)
-         fprintf(fp, ", ");
-      fprintf(fp, "returning %s", glsl_get_type_name(function->return_type));
-   }
+   fprintf(fp, "decl_function %s (%d params)", function->name,
+           function->num_params);
 
    fprintf(fp, "\n");
 
diff --git a/src/compiler/nir/nir_propagate_invariant.c b/src/compiler/nir/nir_propagate_invariant.c
index 7b5bd6c..eb858f5 100644
--- a/src/compiler/nir/nir_propagate_invariant.c
+++ b/src/compiler/nir/nir_propagate_invariant.c
@@ -98,20 +98,20 @@
    case nir_instr_type_intrinsic: {
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       switch (intrin->intrinsic) {
-      case nir_intrinsic_copy_var:
+      case nir_intrinsic_copy_deref:
          /* If the destination is invariant then so is the source */
-         if (var_is_invariant(intrin->variables[0]->var, invariants))
-            add_var(intrin->variables[1]->var, invariants);
+         if (var_is_invariant(nir_intrinsic_get_var(intrin, 0), invariants))
+            add_var(nir_intrinsic_get_var(intrin, 1), invariants);
          break;
 
-      case nir_intrinsic_load_var:
+      case nir_intrinsic_load_deref:
          if (dest_is_invariant(&intrin->dest, invariants))
-            add_var(intrin->variables[0]->var, invariants);
+            add_var(nir_intrinsic_get_var(intrin, 0), invariants);
          break;
 
-      case nir_intrinsic_store_var:
-         if (var_is_invariant(intrin->variables[0]->var, invariants))
-            add_src(&intrin->src[0], invariants);
+      case nir_intrinsic_store_deref:
+         if (var_is_invariant(nir_intrinsic_get_var(intrin, 0), invariants))
+            add_src(&intrin->src[1], invariants);
          break;
 
       default:
@@ -120,6 +120,7 @@
       }
    }
 
+   case nir_instr_type_deref:
    case nir_instr_type_jump:
    case nir_instr_type_ssa_undef:
    case nir_instr_type_load_const:
diff --git a/src/compiler/nir/nir_remove_dead_variables.c b/src/compiler/nir/nir_remove_dead_variables.c
index eff66f9..fadc51a 100644
--- a/src/compiler/nir/nir_remove_dead_variables.c
+++ b/src/compiler/nir/nir_remove_dead_variables.c
@@ -27,69 +27,53 @@
 
 #include "nir.h"
 
-static void
-add_var_use_intrinsic(nir_intrinsic_instr *instr, struct set *live,
-                      nir_variable_mode modes)
+static bool
+deref_used_for_not_store(nir_deref_instr *deref)
 {
-   unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
+   nir_foreach_use(src, &deref->dest.ssa) {
+      switch (src->parent_instr->type) {
+      case nir_instr_type_deref:
+         if (deref_used_for_not_store(nir_instr_as_deref(src->parent_instr)))
+            return true;
+         break;
 
-   switch (instr->intrinsic) {
-   case nir_intrinsic_copy_var:
-      _mesa_set_add(live, instr->variables[1]->var);
-      /* Fall through */
-   case nir_intrinsic_store_var: {
-      /* The first source in both copy_var and store_var is the destination.
-       * If the variable is a local that never escapes the shader, then we
-       * don't mark it as live for just a store.
-       */
-      nir_variable_mode mode = instr->variables[0]->var->data.mode;
-      if (!(mode & (nir_var_local | nir_var_global | nir_var_shared)))
-         _mesa_set_add(live, instr->variables[0]->var);
-      break;
-   }
-
-   /* This pass can't be used on I/O variables after they've been lowered. */
-   case nir_intrinsic_load_input:
-      assert(!(modes & nir_var_shader_in));
-      break;
-   case nir_intrinsic_store_output:
-      assert(!(modes & nir_var_shader_out));
-      break;
-
-   default:
-      for (unsigned i = 0; i < num_vars; i++) {
-         _mesa_set_add(live, instr->variables[i]->var);
+      case nir_instr_type_intrinsic: {
+         nir_intrinsic_instr *intrin =
+            nir_instr_as_intrinsic(src->parent_instr);
+         /* The first source of copy and store intrinsics is the deref to
+          * write.  Don't record those.
+          */
+         if ((intrin->intrinsic != nir_intrinsic_store_deref &&
+              intrin->intrinsic != nir_intrinsic_copy_deref) ||
+             src != &intrin->src[0])
+            return true;
+         break;
       }
-      break;
+
+      default:
+         /* If it's used by any other instruction type (most likely a texture
+          * or call instruction), consider it used.
+          */
+         return true;
+      }
    }
+
+   return false;
 }
 
 static void
-add_var_use_call(nir_call_instr *instr, struct set *live)
+add_var_use_deref(nir_deref_instr *deref, struct set *live)
 {
-   if (instr->return_deref != NULL) {
-      nir_variable *var = instr->return_deref->var;
-      _mesa_set_add(live, var);
-   }
+   if (deref->deref_type != nir_deref_type_var)
+      return;
 
-   for (unsigned i = 0; i < instr->num_params; i++) {
-      nir_variable *var = instr->params[i]->var;
-      _mesa_set_add(live, var);
-   }
-}
-
-static void
-add_var_use_tex(nir_tex_instr *instr, struct set *live)
-{
-   if (instr->texture != NULL) {
-      nir_variable *var = instr->texture->var;
-      _mesa_set_add(live, var);
-   }
-
-   if (instr->sampler != NULL) {
-      nir_variable *var = instr->sampler->var;
-      _mesa_set_add(live, var);
-   }
+   /* If it's not a local that never escapes the shader, then any access at
+    * all means we need to keep it alive.
+    */
+   assert(deref->mode == deref->var->data.mode);
+   if (!(deref->mode & (nir_var_local | nir_var_global | nir_var_shared)) ||
+       deref_used_for_not_store(deref))
+      _mesa_set_add(live, deref->var);
 }
 
 static void
@@ -99,23 +83,8 @@
       if (function->impl) {
          nir_foreach_block(block, function->impl) {
             nir_foreach_instr(instr, block) {
-               switch(instr->type) {
-               case nir_instr_type_intrinsic:
-                  add_var_use_intrinsic(nir_instr_as_intrinsic(instr), live,
-                                        modes);
-                  break;
-
-               case nir_instr_type_call:
-                  add_var_use_call(nir_instr_as_call(instr), live);
-                  break;
-
-               case nir_instr_type_tex:
-                  add_var_use_tex(nir_instr_as_tex(instr), live);
-                  break;
-
-               default:
-                  break;
-               }
+               if (instr->type == nir_instr_type_deref)
+                  add_var_use_deref(nir_instr_as_deref(instr), live);
             }
          }
       }
@@ -131,17 +100,40 @@
 
       nir_foreach_block(block, function->impl) {
          nir_foreach_instr_safe(instr, block) {
-            if (instr->type != nir_instr_type_intrinsic)
-               continue;
+            switch (instr->type) {
+            case nir_instr_type_deref: {
+               nir_deref_instr *deref = nir_instr_as_deref(instr);
 
-            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-            if (intrin->intrinsic != nir_intrinsic_copy_var &&
-                intrin->intrinsic != nir_intrinsic_store_var)
-               continue;
+               nir_variable_mode parent_mode;
+               if (deref->deref_type == nir_deref_type_var)
+                  parent_mode = deref->var->data.mode;
+               else
+                  parent_mode = nir_deref_instr_parent(deref)->mode;
 
-            /* Stores to dead variables need to be removed */
-            if (intrin->variables[0]->var->data.mode == 0)
-               nir_instr_remove(instr);
+               /* If the parent mode is 0, then it references a dead variable.
+                * Flag this deref as dead and remove it.
+                */
+               if (parent_mode == 0) {
+                  deref->mode = 0;
+                  nir_instr_remove(&deref->instr);
+               }
+               break;
+            }
+
+            case nir_instr_type_intrinsic: {
+               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+               if (intrin->intrinsic != nir_intrinsic_copy_deref &&
+                   intrin->intrinsic != nir_intrinsic_store_deref)
+                  break;
+
+               if (nir_src_as_deref(intrin->src[0])->mode == 0)
+                  nir_instr_remove(instr);
+               break;
+            }
+
+            default:
+               break; /* Nothing to do */
+            }
          }
       }
    }
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 28b36b2..21fcbe7 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -41,7 +41,7 @@
                  unsigned num_components, const uint8_t *swizzle,
                  struct match_state *state);
 
-static const uint8_t identity_swizzle[] = { 0, 1, 2, 3 };
+static const uint8_t identity_swizzle[NIR_MAX_VEC_COMPONENTS] = { 0, 1, 2, 3 };
 
 /**
  * Check if a source produces a value of the given type.
@@ -97,7 +97,7 @@
             unsigned num_components, const uint8_t *swizzle,
             struct match_state *state)
 {
-   uint8_t new_swizzle[4];
+   uint8_t new_swizzle[NIR_MAX_VEC_COMPONENTS];
 
    /* Searching only works on SSA values because, if it's not SSA, we can't
     * know if the value changed between one instance of that value in the
@@ -167,7 +167,7 @@
          state->variables[var->variable].abs = false;
          state->variables[var->variable].negate = false;
 
-         for (unsigned i = 0; i < 4; ++i) {
+         for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; ++i) {
             if (i < num_components)
                state->variables[var->variable].swizzle[i] = new_swizzle[i];
             else
@@ -606,7 +606,7 @@
 nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                   const nir_search_value *replace, void *mem_ctx)
 {
-   uint8_t swizzle[4] = { 0, 0, 0, 0 };
+   uint8_t swizzle[NIR_MAX_VEC_COMPONENTS] = { 0 };
 
    for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i)
       swizzle[i] = i;
diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c
index cf77e75..4301631 100644
--- a/src/compiler/nir/nir_serialize.c
+++ b/src/compiler/nir/nir_serialize.c
@@ -149,6 +149,11 @@
    blob_write_uint32(ctx->blob, !!(var->interface_type));
    if (var->interface_type)
       encode_type_to_blob(ctx->blob, var->interface_type);
+   blob_write_uint32(ctx->blob, var->num_members);
+   if (var->num_members > 0) {
+      blob_write_bytes(ctx->blob, (uint8_t *) var->members,
+                       var->num_members * sizeof(*var->members));
+   }
 }
 
 static nir_variable *
@@ -180,6 +185,13 @@
       var->interface_type = decode_type_from_blob(ctx->blob);
    else
       var->interface_type = NULL;
+   var->num_members = blob_read_uint32(ctx->blob);
+   if (var->num_members > 0) {
+      var->members = ralloc_array(var, struct nir_variable_data,
+                                  var->num_members);
+      blob_copy_bytes(ctx->blob, (uint8_t *) var->members,
+                      var->num_members * sizeof(*var->members));
+   }
 
    return var;
 }
@@ -358,81 +370,6 @@
 }
 
 static void
-write_deref_chain(write_ctx *ctx, const nir_deref_var *deref_var)
-{
-   write_object(ctx, deref_var->var);
-
-   uint32_t len = 0;
-   for (const nir_deref *d = deref_var->deref.child; d; d = d->child)
-      len++;
-   blob_write_uint32(ctx->blob, len);
-
-   for (const nir_deref *d = deref_var->deref.child; d; d = d->child) {
-      blob_write_uint32(ctx->blob, d->deref_type);
-      switch (d->deref_type) {
-      case nir_deref_type_array: {
-         const nir_deref_array *deref_array = nir_deref_as_array(d);
-         blob_write_uint32(ctx->blob, deref_array->deref_array_type);
-         blob_write_uint32(ctx->blob, deref_array->base_offset);
-         if (deref_array->deref_array_type == nir_deref_array_type_indirect)
-            write_src(ctx, &deref_array->indirect);
-         break;
-      }
-      case nir_deref_type_struct: {
-         const nir_deref_struct *deref_struct = nir_deref_as_struct(d);
-         blob_write_uint32(ctx->blob, deref_struct->index);
-         break;
-      }
-      case nir_deref_type_var:
-         unreachable("Invalid deref type");
-      }
-
-      encode_type_to_blob(ctx->blob, d->type);
-   }
-}
-
-static nir_deref_var *
-read_deref_chain(read_ctx *ctx, void *mem_ctx)
-{
-   nir_variable *var = read_object(ctx);
-   nir_deref_var *deref_var = nir_deref_var_create(mem_ctx, var);
-
-   uint32_t len = blob_read_uint32(ctx->blob);
-
-   nir_deref *tail = &deref_var->deref;
-   for (uint32_t i = 0; i < len; i++) {
-      nir_deref_type deref_type = blob_read_uint32(ctx->blob);
-      nir_deref *deref = NULL;
-      switch (deref_type) {
-      case nir_deref_type_array: {
-         nir_deref_array *deref_array = nir_deref_array_create(tail);
-         deref_array->deref_array_type = blob_read_uint32(ctx->blob);
-         deref_array->base_offset = blob_read_uint32(ctx->blob);
-         if (deref_array->deref_array_type == nir_deref_array_type_indirect)
-            read_src(ctx, &deref_array->indirect, mem_ctx);
-         deref = &deref_array->deref;
-         break;
-      }
-      case nir_deref_type_struct: {
-         uint32_t index = blob_read_uint32(ctx->blob);
-         nir_deref_struct *deref_struct = nir_deref_struct_create(tail, index);
-         deref = &deref_struct->deref;
-         break;
-      }
-      case nir_deref_type_var:
-         unreachable("Invalid deref type");
-      }
-
-      deref->type = decode_type_from_blob(ctx->blob);
-
-      tail->child = deref;
-      tail = deref;
-   }
-
-   return deref_var;
-}
-
-static void
 write_alu(write_ctx *ctx, const nir_alu_instr *alu)
 {
    blob_write_uint32(ctx->blob, alu->op);
@@ -479,11 +416,85 @@
 }
 
 static void
+write_deref(write_ctx *ctx, const nir_deref_instr *deref)
+{
+   /* Fields are written in the order read_deref() reads them back */
+   blob_write_uint32(ctx->blob, deref->deref_type);
+   blob_write_uint32(ctx->blob, deref->mode);
+   encode_type_to_blob(ctx->blob, deref->type);
+
+   write_dest(ctx, &deref->dest);
+
+   if (deref->deref_type == nir_deref_type_var) {
+      write_object(ctx, deref->var);
+      return;
+   }
+
+   write_src(ctx, &deref->parent);
+
+   switch (deref->deref_type) {
+   case nir_deref_type_struct:
+      blob_write_uint32(ctx->blob, deref->strct.index);
+      break;
+
+   case nir_deref_type_array:
+      write_src(ctx, &deref->arr.index);
+      break;
+
+   case nir_deref_type_array_wildcard:
+   case nir_deref_type_cast:
+      /* Nothing to do */
+      break;
+
+   default:
+      unreachable("Invalid deref type");
+   }
+}
+
+static nir_deref_instr *
+read_deref(read_ctx *ctx)
+{
+   /* Fields are read in the order write_deref() wrote them */
+   nir_deref_type deref_type = blob_read_uint32(ctx->blob);
+   nir_deref_instr *deref = nir_deref_instr_create(ctx->nir, deref_type);
+   deref->mode = blob_read_uint32(ctx->blob);
+   deref->type = decode_type_from_blob(ctx->blob);
+
+   read_dest(ctx, &deref->dest, &deref->instr);
+
+   if (deref_type == nir_deref_type_var) {
+      deref->var = read_object(ctx);
+      return deref;
+   }
+
+   read_src(ctx, &deref->parent, &deref->instr);
+
+   switch (deref->deref_type) {
+   case nir_deref_type_struct:
+      deref->strct.index = blob_read_uint32(ctx->blob);
+      break;
+
+   case nir_deref_type_array:
+      read_src(ctx, &deref->arr.index, &deref->instr);
+      break;
+
+   case nir_deref_type_array_wildcard:
+   case nir_deref_type_cast:
+      /* Nothing to do */
+      break;
+
+   default:
+      unreachable("Invalid deref type");
+   }
+
+   return deref;
+}
+
+static void
 write_intrinsic(write_ctx *ctx, const nir_intrinsic_instr *intrin)
 {
    blob_write_uint32(ctx->blob, intrin->intrinsic);
 
-   unsigned num_variables = nir_intrinsic_infos[intrin->intrinsic].num_variables;
    unsigned num_srcs = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
    unsigned num_indices = nir_intrinsic_infos[intrin->intrinsic].num_indices;
 
@@ -492,9 +503,6 @@
    if (nir_intrinsic_infos[intrin->intrinsic].has_dest)
       write_dest(ctx, &intrin->dest);
 
-   for (unsigned i = 0; i < num_variables; i++)
-      write_deref_chain(ctx, intrin->variables[i]);
-
    for (unsigned i = 0; i < num_srcs; i++)
       write_src(ctx, &intrin->src[i]);
 
@@ -509,7 +517,6 @@
 
    nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(ctx->nir, op);
 
-   unsigned num_variables = nir_intrinsic_infos[op].num_variables;
    unsigned num_srcs = nir_intrinsic_infos[op].num_srcs;
    unsigned num_indices = nir_intrinsic_infos[op].num_indices;
 
@@ -518,9 +525,6 @@
    if (nir_intrinsic_infos[op].has_dest)
       read_dest(ctx, &intrin->dest, &intrin->instr);
 
-   for (unsigned i = 0; i < num_variables; i++)
-      intrin->variables[i] = read_deref_chain(ctx, &intrin->instr);
-
    for (unsigned i = 0; i < num_srcs; i++)
       read_src(ctx, &intrin->src[i], &intrin->instr);
 
@@ -584,8 +588,6 @@
       unsigned is_shadow:1;
       unsigned is_new_style_shadow:1;
       unsigned component:2;
-      unsigned has_texture_deref:1;
-      unsigned has_sampler_deref:1;
       unsigned unused:10; /* Mark unused for valgrind. */
    } u;
 };
@@ -608,8 +610,6 @@
       .u.is_shadow = tex->is_shadow,
       .u.is_new_style_shadow = tex->is_new_style_shadow,
       .u.component = tex->component,
-      .u.has_texture_deref = tex->texture != NULL,
-      .u.has_sampler_deref = tex->sampler != NULL,
    };
    blob_write_uint32(ctx->blob, packed.u32);
 
@@ -618,11 +618,6 @@
       blob_write_uint32(ctx->blob, tex->src[i].src_type);
       write_src(ctx, &tex->src[i].src);
    }
-
-   if (tex->texture)
-      write_deref_chain(ctx, tex->texture);
-   if (tex->sampler)
-      write_deref_chain(ctx, tex->sampler);
 }
 
 static nir_tex_instr *
@@ -652,11 +647,6 @@
       read_src(ctx, &tex->src[i].src, &tex->instr);
    }
 
-   tex->texture = packed.u.has_texture_deref ?
-                  read_deref_chain(ctx, &tex->instr) : NULL;
-   tex->sampler = packed.u.has_sampler_deref ?
-                  read_deref_chain(ctx, &tex->instr) : NULL;
-
    return tex;
 }
 
@@ -776,9 +766,7 @@
    blob_write_intptr(ctx->blob, write_lookup_object(ctx, call->callee));
 
    for (unsigned i = 0; i < call->num_params; i++)
-      write_deref_chain(ctx, call->params[i]);
-
-   write_deref_chain(ctx, call->return_deref);
+      write_src(ctx, &call->params[i]);
 }
 
 static nir_call_instr *
@@ -788,9 +776,7 @@
    nir_call_instr *call = nir_call_instr_create(ctx->nir, callee);
 
    for (unsigned i = 0; i < call->num_params; i++)
-      call->params[i] = read_deref_chain(ctx, &call->instr);
-
-   call->return_deref = read_deref_chain(ctx, &call->instr);
+      read_src(ctx, &call->params[i], call);
 
    return call;
 }
@@ -803,6 +789,9 @@
    case nir_instr_type_alu:
       write_alu(ctx, nir_instr_as_alu(instr));
       break;
+   case nir_instr_type_deref:
+      write_deref(ctx, nir_instr_as_deref(instr));
+      break;
    case nir_instr_type_intrinsic:
       write_intrinsic(ctx, nir_instr_as_intrinsic(instr));
       break;
@@ -840,6 +829,9 @@
    case nir_instr_type_alu:
       instr = &read_alu(ctx)->instr;
       break;
+   case nir_instr_type_deref:
+      instr = &read_deref(ctx)->instr;
+      break;
    case nir_instr_type_intrinsic:
       instr = &read_intrinsic(ctx)->instr;
       break;
@@ -1009,15 +1001,6 @@
    write_reg_list(ctx, &fi->registers);
    blob_write_uint32(ctx->blob, fi->reg_alloc);
 
-   blob_write_uint32(ctx->blob, fi->num_params);
-   for (unsigned i = 0; i < fi->num_params; i++) {
-      write_variable(ctx, fi->params[i]);
-   }
-
-   blob_write_uint32(ctx->blob, !!(fi->return_var));
-   if (fi->return_var)
-      write_variable(ctx, fi->return_var);
-
    write_cf_list(ctx, &fi->body);
    write_fixup_phis(ctx);
 }
@@ -1032,17 +1015,6 @@
    read_reg_list(ctx, &fi->registers);
    fi->reg_alloc = blob_read_uint32(ctx->blob);
 
-   fi->num_params = blob_read_uint32(ctx->blob);
-   for (unsigned i = 0; i < fi->num_params; i++) {
-      fi->params[i] = read_variable(ctx);
-   }
-
-   bool has_return = blob_read_uint32(ctx->blob);
-   if (has_return)
-      fi->return_var = read_variable(ctx);
-   else
-      fi->return_var = NULL;
-
    read_cf_list(ctx, &fi->body);
    read_fixup_phis(ctx);
 
@@ -1062,12 +1034,12 @@
 
    blob_write_uint32(ctx->blob, fxn->num_params);
    for (unsigned i = 0; i < fxn->num_params; i++) {
-      blob_write_uint32(ctx->blob, fxn->params[i].param_type);
-      encode_type_to_blob(ctx->blob, fxn->params[i].type);
+      uint32_t val =
+         ((uint32_t)fxn->params[i].num_components) |
+         ((uint32_t)fxn->params[i].bit_size) << 8;
+      blob_write_uint32(ctx->blob, val);
    }
 
-   encode_type_to_blob(ctx->blob, fxn->return_type);
-
    /* At first glance, it looks like we should write the function_impl here.
     * However, call instructions need to be able to reference at least the
     * function and those will get processed as we write the function_impls.
@@ -1086,12 +1058,12 @@
    read_add_object(ctx, fxn);
 
    fxn->num_params = blob_read_uint32(ctx->blob);
+   fxn->params = ralloc_array(fxn, nir_parameter, fxn->num_params);
    for (unsigned i = 0; i < fxn->num_params; i++) {
-      fxn->params[i].param_type = blob_read_uint32(ctx->blob);
-      fxn->params[i].type = decode_type_from_blob(ctx->blob);
+      uint32_t val = blob_read_uint32(ctx->blob);
+      fxn->params[i].num_components = val & 0xff;
+      fxn->params[i].bit_size = (val >> 8) & 0xff;
    }
-
-   fxn->return_type = decode_type_from_blob(ctx->blob);
 }
 
 void
@@ -1144,6 +1116,10 @@
       write_function_impl(&ctx, fxn->impl);
    }
 
+   blob_write_uint32(blob, nir->constant_data_size);
+   if (nir->constant_data_size > 0)
+      blob_write_bytes(blob, nir->constant_data, nir->constant_data_size);
+
    *(uintptr_t *)(blob->data + idx_size_offset) = ctx.next_idx;
 
    _mesa_hash_table_destroy(ctx.remap_table, NULL);
@@ -1197,6 +1173,14 @@
    nir_foreach_function(fxn, ctx.nir)
       fxn->impl = read_function_impl(&ctx, fxn);
 
+   ctx.nir->constant_data_size = blob_read_uint32(blob);
+   if (ctx.nir->constant_data_size > 0) {
+      ctx.nir->constant_data =
+         ralloc_size(ctx.nir, ctx.nir->constant_data_size);
+      blob_copy_bytes(blob, ctx.nir->constant_data,
+                      ctx.nir->constant_data_size);
+   }
+
    free(ctx.idx_table);
 
    return ctx.nir;
diff --git a/src/compiler/nir/nir_split_per_member_structs.c b/src/compiler/nir/nir_split_per_member_structs.c
new file mode 100644
index 0000000..9bad351
--- /dev/null
+++ b/src/compiler/nir/nir_split_per_member_structs.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_deref.h"
+
+struct split_struct_state {
+   void *dead_ctx;
+   /* NOTE(review): this struct appears unused in this file -- confirm */
+   struct hash_table *var_to_member_map;
+};
+
+/* Returns the variable split out of member `member` of var, or NULL if var
+ * has no entry in var_to_member_map. */
+static nir_variable *
+find_var_member(struct nir_variable *var, unsigned member,
+                struct hash_table *var_to_member_map)
+{
+   struct hash_entry *he = _mesa_hash_table_search(var_to_member_map, var);
+   if (!he)
+      return NULL;
+
+   assert(member < var->num_members);
+   return ((nir_variable **)he->data)[member];
+}
+
+/* Field `index`'s type, re-wrapped in the array levels enclosing the struct */
+static const struct glsl_type *
+member_type(const struct glsl_type *type, unsigned index)
+{
+   if (!glsl_type_is_array(type)) {
+      assert(glsl_type_is_struct(type));
+      assert(index < glsl_get_length(type));
+      return glsl_get_struct_field(type, index);
+   }
+   const struct glsl_type *inner = glsl_get_array_element(type);
+   return glsl_get_array_instance(member_type(inner, index),
+                                  glsl_get_length(type));
+}
+
+/* Creates one variable per member of var, copying each member's data, and
+ * records the resulting array in var_to_member_map under var. */
+static void
+split_variable(struct nir_variable *var, nir_shader *shader,
+               struct hash_table *var_to_member_map, void *dead_ctx)
+{
+   assert(var->state_slots == NULL);
+   /* Constant initializers are currently not handled */
+   assert(var->constant_initializer == NULL);
+
+   nir_variable **members =
+      ralloc_array(dead_ctx, nir_variable *, var->num_members);
+
+   for (unsigned i = 0; i < var->num_members; i++) {
+      char *member_name = NULL;
+      if (var->name) {
+         /* Calculate a reasonable variable name */
+         member_name = ralloc_strdup(dead_ctx, var->name);
+         const struct glsl_type *t = var->type;
+         while (glsl_type_is_array(t)) {
+            ralloc_strcat(&member_name, "[*]");
+            t = glsl_get_array_element(t);
+         }
+         const char *field_name = glsl_get_struct_elem_name(t, i);
+         if (field_name) {
+            member_name = ralloc_asprintf(dead_ctx, "%s.%s",
+                                          member_name, field_name);
+         } else {
+            member_name = ralloc_asprintf(dead_ctx, "%s.@%u", member_name, i);
+         }
+      }
+
+      members[i] = nir_variable_create(shader, var->members[i].mode,
+                                       member_type(var->type, i), member_name);
+      if (var->interface_type) {
+         members[i]->interface_type =
+            glsl_get_struct_field(var->interface_type, i);
+      }
+      members[i]->data = var->members[i];
+   }
+
+   _mesa_hash_table_insert(var_to_member_map, var, members);
+}
+
+/* Splits each variable in var_list that has members and unlinks the original
+ * from the list.  Returns true if any variable was split. */
+static bool
+split_variables_in_list(struct exec_list *var_list, nir_shader *shader,
+                        struct hash_table *var_to_member_map, void *dead_ctx)
+{
+   bool any_split = false;
+   nir_foreach_variable_safe(var, var_list) {
+      if (var->num_members != 0) {
+         split_variable(var, shader, var_to_member_map, dead_ctx);
+         exec_node_remove(&var->node);
+         any_split = true;
+      }
+   }
+
+   return any_split;
+}
+
+/* Rebuilds the deref chain ending at deref on top of a var deref of member */
+static nir_deref_instr *
+build_member_deref(nir_builder *b, nir_deref_instr *deref, nir_variable *member)
+{
+   if (deref->deref_type == nir_deref_type_var)
+      return nir_build_deref_var(b, member);
+
+   nir_deref_instr *new_parent =
+      build_member_deref(b, nir_deref_instr_parent(deref), member);
+   return nir_build_deref_follower(b, new_parent, deref);
+}
+
+/* Replaces a top-level struct deref on a split variable with a deref chain
+ * rooted at the matching per-member variable. */
+static void
+rewrite_deref_instr(nir_builder *b, nir_deref_instr *deref,
+                    struct hash_table *var_to_member_map)
+{
+   /* Only struct derefs select a member */
+   if (deref->deref_type != nir_deref_type_struct)
+      return;
+
+   /* Walk up to the variable; a struct nested inside another is left alone */
+   nir_deref_instr *root = nir_deref_instr_parent(deref);
+   while (root && root->deref_type != nir_deref_type_var) {
+      if (root->deref_type == nir_deref_type_struct)
+         return;
+      root = nir_deref_instr_parent(root);
+   }
+
+   /* Variables without members were never split */
+   if (!root || root->var->num_members == 0)
+      return;
+
+   nir_variable *member =
+      find_var_member(root->var, deref->strct.index, var_to_member_map);
+   assert(member);
+
+   b->cursor = nir_before_instr(&deref->instr);
+   nir_deref_instr *replacement =
+      build_member_deref(b, nir_deref_instr_parent(deref), member);
+   nir_ssa_def_rewrite_uses(&deref->dest.ssa,
+                            nir_src_for_ssa(&replacement->dest.ssa));
+
+   /* The variable it referenced is no longer valid; drop the deref if unused */
+   nir_deref_instr_remove_if_unused(deref);
+}
+
+/* Splits inputs, outputs and system values that have per-member data into one
+ * variable per member and fixes up derefs.  Returns true on any change. */
+bool
+nir_split_per_member_structs(nir_shader *shader)
+{
+   void *dead_ctx = ralloc_context(NULL);
+   struct hash_table *var_to_member_map =
+      _mesa_hash_table_create(dead_ctx, _mesa_hash_pointer,
+                              _mesa_key_pointer_equal);
+
+   bool progress = split_variables_in_list(&shader->inputs, shader,
+                                           var_to_member_map, dead_ctx);
+   progress |= split_variables_in_list(&shader->outputs, shader,
+                                       var_to_member_map, dead_ctx);
+   progress |= split_variables_in_list(&shader->system_values, shader,
+                                       var_to_member_map, dead_ctx);
+   if (!progress)
+      goto out; /* nothing to rewrite, but dead_ctx must still be freed */
+
+   nir_foreach_function(function, shader) {
+      if (!function->impl)
+         continue;
+      nir_builder b;
+      nir_builder_init(&b, function->impl);
+      nir_foreach_block(block, function->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type == nir_instr_type_deref) {
+               rewrite_deref_instr(&b, nir_instr_as_deref(instr),
+                                   var_to_member_map);
+            }
+         }
+      }
+   }
+
+out:
+   ralloc_free(dead_ctx);
+   return progress;
+}
diff --git a/src/compiler/nir/nir_split_var_copies.c b/src/compiler/nir/nir_split_var_copies.c
index bc3ceed..5ac1c33 100644
--- a/src/compiler/nir/nir_split_var_copies.c
+++ b/src/compiler/nir/nir_split_var_copies.c
@@ -26,6 +26,7 @@
  */
 
 #include "nir.h"
+#include "nir_builder.h"
 
 /*
  * Implements "copy splitting" which is similar to structure splitting only
@@ -61,225 +62,60 @@
  * possibly a few wildcard array dereferences.
  */
 
-struct split_var_copies_state {
-   nir_shader *shader;
-   void *dead_ctx;
-   bool progress;
-};
-
-/* Recursively constructs deref chains to split a copy instruction into
- * multiple (if needed) copy instructions with full-length deref chains.
- * External callers of this function should pass the tail and head of the
- * deref chains found as the source and destination of the copy instruction
- * into this function.
- *
- * \param  old_copy  The copy instruction we are splitting
- * \param  dest_head The head of the destination deref chain we are building
- * \param  src_head  The head of the source deref chain we are building
- * \param  dest_tail The tail of the destination deref chain we are building
- * \param  src_tail  The tail of the source deref chain we are building
- * \param  state     The current split_var_copies_state object
- */
 static void
-split_var_copy_instr(nir_intrinsic_instr *old_copy,
-                     nir_deref_var *dest_head, nir_deref_var *src_head,
-                     nir_deref *dest_tail, nir_deref *src_tail,
-                     struct split_var_copies_state *state)
+split_deref_copy_instr(nir_builder *b,
+                       nir_deref_instr *dst, nir_deref_instr *src)
 {
-   assert(src_tail->type == dest_tail->type);
-
-   /* Make sure these really are the tails of the deref chains */
-   assert(dest_tail->child == NULL);
-   assert(src_tail->child == NULL);
-
-   switch (glsl_get_base_type(src_tail->type)) {
-   case GLSL_TYPE_ARRAY: {
-      /* Make a wildcard dereference */
-      nir_deref_array *deref = nir_deref_array_create(state->dead_ctx);
-      deref->deref.type = glsl_get_array_element(src_tail->type);
-      deref->deref_array_type = nir_deref_array_type_wildcard;
-
-      /* Set the tail of both as the newly created wildcard deref.  It is
-       * safe to use the same wildcard in both places because a) we will be
-       * copying it before we put it in an actual instruction and b)
-       * everything that will potentially add another link in the deref
-       * chain will also add the same thing to both chains.
-       */
-      src_tail->child = &deref->deref;
-      dest_tail->child = &deref->deref;
-
-      split_var_copy_instr(old_copy, dest_head, src_head,
-                           dest_tail->child, src_tail->child, state);
-
-      /* Set it back to the way we found it */
-      src_tail->child = NULL;
-      dest_tail->child = NULL;
-      break;
-   }
-
-   case GLSL_TYPE_STRUCT:
-      /* This is the only part that actually does any interesting
-       * splitting.  For array types, we just use wildcards and resolve
-       * them later.  For structure types, we need to emit one copy
-       * instruction for every structure element.  Because we may have
-       * structs inside structs, we just recurse and let the next level
-       * take care of any additional structures.
-       */
-      for (unsigned i = 0; i < glsl_get_length(src_tail->type); i++) {
-         nir_deref_struct *deref = nir_deref_struct_create(state->dead_ctx, i);
-         deref->deref.type = glsl_get_struct_field(src_tail->type, i);
-
-         /* Set the tail of both as the newly created structure deref.  It
-          * is safe to use the same wildcard in both places because a) we
-          * will be copying it before we put it in an actual instruction
-          * and b) everything that will potentially add another link in the
-          * deref chain will also add the same thing to both chains.
-          */
-         src_tail->child = &deref->deref;
-         dest_tail->child = &deref->deref;
-
-         split_var_copy_instr(old_copy, dest_head, src_head,
-                              dest_tail->child, src_tail->child, state);
+   assert(dst->type == src->type);
+   if (glsl_type_is_vector_or_scalar(src->type)) {
+      nir_copy_deref(b, dst, src);
+   } else if (glsl_type_is_struct(src->type)) {
+      for (unsigned i = 0; i < glsl_get_length(src->type); i++) {
+         split_deref_copy_instr(b, nir_build_deref_struct(b, dst, i),
+                                   nir_build_deref_struct(b, src, i));
       }
-      /* Set it back to the way we found it */
-      src_tail->child = NULL;
-      dest_tail->child = NULL;
-      break;
-
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_UINT16:
-   case GLSL_TYPE_UINT64:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_INT16:
-   case GLSL_TYPE_INT64:
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_FLOAT16:
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_BOOL:
-      if (glsl_type_is_matrix(src_tail->type)) {
-         nir_deref_array *deref = nir_deref_array_create(state->dead_ctx);
-         deref->deref.type = glsl_get_column_type(src_tail->type);
-         deref->deref_array_type = nir_deref_array_type_wildcard;
-
-         /* Set the tail of both as the newly created wildcard deref.  It
-          * is safe to use the same wildcard in both places because a) we
-          * will be copying it before we put it in an actual instruction
-          * and b) everything that will potentially add another link in the
-          * deref chain will also add the same thing to both chains.
-          */
-         src_tail->child = &deref->deref;
-         dest_tail->child = &deref->deref;
-
-         split_var_copy_instr(old_copy, dest_head, src_head,
-                              dest_tail->child, src_tail->child, state);
-
-         /* Set it back to the way we found it */
-         src_tail->child = NULL;
-         dest_tail->child = NULL;
-      } else {
-         /* At this point, we have fully built our deref chains and can
-          * actually add the new copy instruction.
-          */
-         nir_intrinsic_instr *new_copy =
-            nir_intrinsic_instr_create(state->shader, nir_intrinsic_copy_var);
-
-         /* We need to make copies because a) this deref chain actually
-          * belongs to the copy instruction and b) the deref chains may
-          * have some of the same links due to the way we constructed them
-          */
-         new_copy->variables[0] = nir_deref_var_clone(dest_head, new_copy);
-         new_copy->variables[1] = nir_deref_var_clone(src_head, new_copy);
-
-         /* Emit the copy instruction after the old instruction.  We'll
-          * remove the old one later.
-          */
-         nir_instr_insert_after(&old_copy->instr, &new_copy->instr);
-         state->progress = true;
-      }
-      break;
-
-   case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
-   case GLSL_TYPE_ATOMIC_UINT:
-   case GLSL_TYPE_INTERFACE:
-   default:
-      unreachable("Cannot copy these types");
+   } else {
+      assert(glsl_type_is_matrix(src->type) || glsl_type_is_array(src->type));
+      split_deref_copy_instr(b, nir_build_deref_array_wildcard(b, dst),
+                                nir_build_deref_array_wildcard(b, src));
    }
 }
 
 static bool
-split_var_copies_block(nir_block *block, struct split_var_copies_state *state)
-{
-   nir_foreach_instr_safe(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
-      if (intrinsic->intrinsic != nir_intrinsic_copy_var)
-         continue;
-
-      nir_deref_var *dest_head = intrinsic->variables[0];
-      nir_deref_var *src_head = intrinsic->variables[1];
-      nir_deref *dest_tail = nir_deref_tail(&dest_head->deref);
-      nir_deref *src_tail = nir_deref_tail(&src_head->deref);
-
-      switch (glsl_get_base_type(src_tail->type)) {
-      case GLSL_TYPE_ARRAY:
-      case GLSL_TYPE_STRUCT:
-         split_var_copy_instr(intrinsic, dest_head, src_head,
-                              dest_tail, src_tail, state);
-         nir_instr_remove(&intrinsic->instr);
-         ralloc_steal(state->dead_ctx, instr);
-         break;
-      case GLSL_TYPE_FLOAT:
-      case GLSL_TYPE_FLOAT16:
-      case GLSL_TYPE_DOUBLE:
-         if (glsl_type_is_matrix(src_tail->type)) {
-            split_var_copy_instr(intrinsic, dest_head, src_head,
-                                 dest_tail, src_tail, state);
-            nir_instr_remove(&intrinsic->instr);
-            ralloc_steal(state->dead_ctx, instr);
-         }
-         break;
-      case GLSL_TYPE_INT:
-      case GLSL_TYPE_UINT:
-      case GLSL_TYPE_INT16:
-      case GLSL_TYPE_UINT16:
-      case GLSL_TYPE_INT64:
-      case GLSL_TYPE_UINT64:
-      case GLSL_TYPE_BOOL:
-         assert(!glsl_type_is_matrix(src_tail->type));
-         break;
-      default:
-         unreachable("Invalid type");
-         break;
-      }
-   }
-
-   return true;
-}
-
-static bool
 split_var_copies_impl(nir_function_impl *impl)
 {
-   struct split_var_copies_state state;
+   bool progress = false;
 
-   state.shader = impl->function->shader;
-   state.dead_ctx = ralloc_context(NULL);
-   state.progress = false;
+   nir_builder b;
+   nir_builder_init(&b, impl);
 
    nir_foreach_block(block, impl) {
-      split_var_copies_block(block, &state);
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
+            continue;
+
+         nir_intrinsic_instr *copy = nir_instr_as_intrinsic(instr);
+         if (copy->intrinsic != nir_intrinsic_copy_deref)
+            continue;
+
+         b.cursor = nir_instr_remove(&copy->instr);
+
+         nir_deref_instr *dst =
+            nir_instr_as_deref(copy->src[0].ssa->parent_instr);
+         nir_deref_instr *src =
+            nir_instr_as_deref(copy->src[1].ssa->parent_instr);
+         split_deref_copy_instr(&b, dst, src);
+
+         progress = true;
+      }
    }
 
-   ralloc_free(state.dead_ctx);
-
-   if (state.progress) {
+   if (progress) {
       nir_metadata_preserve(impl, nir_metadata_block_index |
                                   nir_metadata_dominance);
    }
 
-   return state.progress;
+   return progress;
 }
 
 bool
diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c
index 0f1debc..aab6413 100644
--- a/src/compiler/nir/nir_sweep.c
+++ b/src/compiler/nir/nir_sweep.c
@@ -118,10 +118,6 @@
 {
    ralloc_steal(nir, impl);
 
-   ralloc_steal(nir, impl->params);
-   for (unsigned i = 0; i < impl->num_params; i++)
-      ralloc_steal(nir, impl->params[i]);
-   ralloc_steal(nir, impl->return_var);
    steal_list(nir, nir_variable, &impl->locals);
    steal_list(nir, nir_register, &impl->registers);
 
@@ -171,6 +167,8 @@
       sweep_function(nir, func);
    }
 
+   ralloc_steal(nir, nir->constant_data);
+
    /* Free everything we didn't steal back. */
    ralloc_free(rubbish);
 }
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 4a60b7d..8e6f5bf 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -228,17 +228,11 @@
 {
    nir_alu_src *src = &instr->src[index];
 
-   unsigned num_components;
-   if (src->src.is_ssa) {
-      num_components = src->src.ssa->num_components;
-   } else {
-      if (src->src.reg.reg->is_packed)
-         num_components = 4; /* can't check anything */
-      else
-         num_components = src->src.reg.reg->num_components;
-   }
-   for (unsigned i = 0; i < 4; i++) {
-      validate_assert(state, src->swizzle[i] < 4);
+   unsigned num_components = nir_src_num_components(src->src);
+   if (!src->src.is_ssa && src->src.reg.reg->is_packed)
+      num_components = NIR_MAX_VEC_COMPONENTS; /* can't check anything */
+   for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) {
+      validate_assert(state, src->swizzle[i] < NIR_MAX_VEC_COMPONENTS);
 
       if (nir_alu_instr_channel_used(instr, index, i))
          validate_assert(state, src->swizzle[i] < num_components);
@@ -333,9 +327,7 @@
 {
    nir_alu_dest *dest = &instr->dest;
 
-   unsigned dest_size =
-      dest->dest.is_ssa ? dest->dest.ssa.num_components
-                        : dest->dest.reg.reg->num_components;
+   unsigned dest_size = nir_dest_num_components(dest->dest);
    bool is_packed = !dest->dest.is_ssa && dest->dest.reg.reg->is_packed;
    /*
     * validate that the instruction doesn't write to components not in the
@@ -401,16 +393,62 @@
 }
 
 static void
-validate_deref_chain(nir_deref *deref, nir_variable_mode mode,
-                     validate_state *state)
+validate_var_use(nir_variable *var, validate_state *state)
 {
-   validate_assert(state, deref->child == NULL || ralloc_parent(deref->child) == deref);
+   struct hash_entry *entry = _mesa_hash_table_search(state->var_defs, var);
+   validate_assert(state, entry); /* a used variable must be in var_defs */
+   if (entry && var->data.mode == nir_var_local) /* entry is NULL if unknown */
+      validate_assert(state, (nir_function_impl *) entry->data == state->impl);
+}
 
-   nir_deref *parent = NULL;
-   while (deref != NULL) {
-      switch (deref->deref_type) {
+static void
+validate_deref_instr(nir_deref_instr *instr, validate_state *state)
+{
+   if (instr->deref_type == nir_deref_type_var) {
+      /* Variable dereferences are stupid simple. */
+      validate_assert(state, instr->mode == instr->var->data.mode);
+      validate_assert(state, instr->type == instr->var->type);
+      validate_var_use(instr->var, state);
+   } else if (instr->deref_type == nir_deref_type_cast) {
+      /* For cast, we simply have to trust the instruction.  It's up to
+       * lowering passes and front/back-ends to make them sane.
+       */
+      validate_src(&instr->parent, state, 0, 0);
+
+      /* We just validate that the type and mode are there */
+      validate_assert(state, instr->mode);
+      validate_assert(state, instr->type);
+   } else {
+      /* We require the parent to be SSA.  This may be lifted in the future */
+      validate_assert(state, instr->parent.is_ssa);
+
+      /* The parent pointer value must have the same number of components
+       * as the destination.
+       */
+      validate_src(&instr->parent, state, nir_dest_bit_size(instr->dest),
+                   nir_dest_num_components(instr->dest));
+
+      nir_instr *parent_instr = instr->parent.ssa->parent_instr;
+
+      /* The parent must come from another deref instruction */
+      validate_assert(state, parent_instr->type == nir_instr_type_deref);
+
+      nir_deref_instr *parent = nir_instr_as_deref(parent_instr);
+
+      validate_assert(state, instr->mode == parent->mode);
+
+      switch (instr->deref_type) {
+      case nir_deref_type_struct:
+         validate_assert(state, glsl_type_is_struct(parent->type));
+         validate_assert(state,
+            instr->strct.index < glsl_get_length(parent->type));
+         validate_assert(state, instr->type ==
+            glsl_get_struct_field(parent->type, instr->strct.index));
+         break;
+
       case nir_deref_type_array:
-         if (mode == nir_var_shared) {
+      case nir_deref_type_array_wildcard:
+         if (instr->mode == nir_var_shared) {
             /* Shared variables have a bit more relaxed rules because we need
              * to be able to handle array derefs on vectors.  Fortunately,
              * nir_lower_io handles these just fine.
@@ -423,62 +461,76 @@
             validate_assert(state, glsl_type_is_array(parent->type) ||
                                    glsl_type_is_matrix(parent->type));
          }
-         validate_assert(state, deref->type == glsl_get_array_element(parent->type));
-         if (nir_deref_as_array(deref)->deref_array_type ==
-             nir_deref_array_type_indirect)
-            validate_src(&nir_deref_as_array(deref)->indirect, state, 32, 1);
-         break;
+         validate_assert(state,
+            instr->type == glsl_get_array_element(parent->type));
 
-      case nir_deref_type_struct:
-         assume(parent); /* cannot happen: deref change starts w/ nir_deref_var */
-         validate_assert(state, deref->type ==
-                glsl_get_struct_field(parent->type,
-                                      nir_deref_as_struct(deref)->index));
-         break;
-
-      case nir_deref_type_var:
+         if (instr->deref_type == nir_deref_type_array)
+            validate_src(&instr->arr.index, state, 32, 1);
          break;
 
       default:
-         validate_assert(state, !"Invalid deref type");
-         break;
+         unreachable("Invalid deref instruction type");
       }
-
-      parent = deref;
-      deref = deref->child;
    }
-}
 
-static void
-validate_var_use(nir_variable *var, validate_state *state)
-{
-   struct hash_entry *entry = _mesa_hash_table_search(state->var_defs, var);
-   validate_assert(state, entry);
-   if (var->data.mode == nir_var_local)
-      validate_assert(state, (nir_function_impl *) entry->data == state->impl);
-}
-
-static void
-validate_deref_var(void *parent_mem_ctx, nir_deref_var *deref, validate_state *state)
-{
-   validate_assert(state, deref != NULL);
-   validate_assert(state, ralloc_parent(deref) == parent_mem_ctx);
-   validate_assert(state, deref->deref.type == deref->var->type);
-
-   validate_var_use(deref->var, state);
-
-   validate_deref_chain(&deref->deref, deref->var->data.mode, state);
+   /* We intentionally don't validate the size of the destination because we
+    * want to let other compiler components such as SPIR-V decide how big
+    * pointers should be.
+    */
+   validate_dest(&instr->dest, state, 0, 0);
 }
 
 static void
 validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 {
-   unsigned bit_size = 0;
-   if (instr->intrinsic == nir_intrinsic_load_var ||
-       instr->intrinsic == nir_intrinsic_store_var) {
-      const struct glsl_type *type =
-         nir_deref_tail(&instr->variables[0]->deref)->type;
-      bit_size = glsl_get_bit_size(type);
+   unsigned dest_bit_size = 0;
+   unsigned src_bit_sizes[NIR_INTRINSIC_MAX_INPUTS] = { 0, };
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_param: {
+      unsigned param_idx = nir_intrinsic_param_idx(instr);
+      validate_assert(state, param_idx < state->impl->function->num_params);
+      nir_parameter *param = &state->impl->function->params[param_idx];
+      validate_assert(state, instr->num_components == param->num_components);
+      dest_bit_size = param->bit_size;
+      break;
+   }
+
+   case nir_intrinsic_load_deref: {
+      nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
+      validate_assert(state, glsl_type_is_vector_or_scalar(src->type) ||
+                      (src->mode == nir_var_uniform &&
+                       glsl_get_base_type(src->type) == GLSL_TYPE_SUBROUTINE));
+      validate_assert(state, instr->num_components ==
+                             glsl_get_vector_elements(src->type));
+      dest_bit_size = glsl_get_bit_size(src->type);
+      break;
+   }
+
+   case nir_intrinsic_store_deref: {
+      nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
+      validate_assert(state, glsl_type_is_vector_or_scalar(dst->type));
+      validate_assert(state, instr->num_components ==
+                             glsl_get_vector_elements(dst->type));
+      src_bit_sizes[1] = glsl_get_bit_size(dst->type);
+      validate_assert(state, (dst->mode & (nir_var_shader_in |
+                                           nir_var_uniform |
+                                           nir_var_shader_storage)) == 0);
+      validate_assert(state, (nir_intrinsic_write_mask(instr) & ~((1 << instr->num_components) - 1)) == 0);
+      break;
+   }
+
+   case nir_intrinsic_copy_deref: {
+      nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
+      nir_deref_instr *src = nir_src_as_deref(instr->src[1]);
+      validate_assert(state, dst->type == src->type);
+      validate_assert(state, (dst->mode & (nir_var_shader_in |
+                                           nir_var_uniform |
+                                           nir_var_shader_storage)) == 0);
+      break;
+   }
+
+   default:
+      break;
    }
 
    unsigned num_srcs = nir_intrinsic_infos[instr->intrinsic].num_srcs;
@@ -487,12 +539,7 @@
 
       validate_assert(state, components_read > 0);
 
-      validate_src(&instr->src[i], state, bit_size, components_read);
-   }
-
-   unsigned num_vars = nir_intrinsic_infos[instr->intrinsic].num_variables;
-   for (unsigned i = 0; i < num_vars; i++) {
-      validate_deref_var(instr, instr->variables[i], state);
+      validate_src(&instr->src[i], state, src_bit_sizes[i], components_read);
    }
 
    if (nir_intrinsic_infos[instr->intrinsic].has_dest) {
@@ -500,41 +547,7 @@
 
       validate_assert(state, components_written > 0);
 
-      validate_dest(&instr->dest, state, bit_size, components_written);
-   }
-
-   switch (instr->intrinsic) {
-   case nir_intrinsic_load_var: {
-      const struct glsl_type *type =
-         nir_deref_tail(&instr->variables[0]->deref)->type;
-      validate_assert(state, glsl_type_is_vector_or_scalar(type) ||
-             (instr->variables[0]->var->data.mode == nir_var_uniform &&
-              glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE));
-      validate_assert(state, instr->num_components == glsl_get_vector_elements(type));
-      break;
-   }
-   case nir_intrinsic_store_var: {
-      const struct glsl_type *type =
-         nir_deref_tail(&instr->variables[0]->deref)->type;
-      validate_assert(state, glsl_type_is_vector_or_scalar(type) ||
-             (instr->variables[0]->var->data.mode == nir_var_uniform &&
-              glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE));
-      validate_assert(state, instr->num_components == glsl_get_vector_elements(type));
-      validate_assert(state, instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform &&
-             instr->variables[0]->var->data.mode != nir_var_shader_storage);
-      validate_assert(state, (nir_intrinsic_write_mask(instr) & ~((1 << instr->num_components) - 1)) == 0);
-      break;
-   }
-   case nir_intrinsic_copy_var:
-      validate_assert(state, nir_deref_tail(&instr->variables[0]->deref)->type ==
-             nir_deref_tail(&instr->variables[1]->deref)->type);
-      validate_assert(state, instr->variables[0]->var->data.mode != nir_var_shader_in &&
-             instr->variables[0]->var->data.mode != nir_var_uniform &&
-             instr->variables[0]->var->data.mode != nir_var_shader_storage);
-      break;
-   default:
-      break;
+      validate_dest(&instr->dest, state, dest_bit_size, components_written);
    }
 }
 
@@ -552,32 +565,18 @@
                    0, nir_tex_instr_src_size(instr, i));
    }
 
-   if (instr->texture != NULL)
-      validate_deref_var(instr, instr->texture, state);
-
-   if (instr->sampler != NULL)
-      validate_deref_var(instr, instr->sampler, state);
-
    validate_dest(&instr->dest, state, 0, nir_tex_instr_dest_size(instr));
 }
 
 static void
 validate_call_instr(nir_call_instr *instr, validate_state *state)
 {
-   if (instr->return_deref == NULL) {
-      validate_assert(state, glsl_type_is_void(instr->callee->return_type));
-   } else {
-      validate_assert(state, instr->callee->return_type ==
-                             nir_deref_tail(&instr->return_deref->deref)->type);
-      validate_deref_var(instr, instr->return_deref, state);
-   }
-
    validate_assert(state, instr->num_params == instr->callee->num_params);
 
    for (unsigned i = 0; i < instr->num_params; i++) {
-      validate_assert(state, instr->callee->params[i].type ==
-                             nir_deref_tail(&instr->params[i]->deref)->type);
-      validate_deref_var(instr, instr->params[i], state);
+      validate_src(&instr->params[i], state,
+                   instr->callee->params[i].bit_size,
+                   instr->callee->params[i].num_components);
    }
 }
 
@@ -620,6 +619,10 @@
       validate_alu_instr(nir_instr_as_alu(instr), state);
       break;
 
+   case nir_instr_type_deref:
+      validate_deref_instr(nir_instr_as_deref(instr), state);
+      break;
+
    case nir_instr_type_call:
       validate_call_instr(nir_instr_as_call(instr), state);
       break;
@@ -993,6 +996,13 @@
       }
    }
 
+   if (var->num_members > 0) {
+      const struct glsl_type *without_array = glsl_without_array(var->type);
+      validate_assert(state, glsl_type_is_struct(without_array));
+      validate_assert(state, var->num_members == glsl_get_length(without_array));
+      validate_assert(state, var->members != NULL);
+   }
+
    /*
     * TODO validate some things ir_validate.cpp does (requires more GLSL type
     * support)
@@ -1053,23 +1063,6 @@
    validate_assert(state, impl->function->impl == impl);
    validate_assert(state, impl->cf_node.parent == NULL);
 
-   validate_assert(state, impl->num_params == impl->function->num_params);
-   for (unsigned i = 0; i < impl->num_params; i++) {
-      validate_assert(state, impl->params[i]->type == impl->function->params[i].type);
-      validate_assert(state, impl->params[i]->data.mode == nir_var_param);
-      validate_assert(state, impl->params[i]->data.location == i);
-      validate_var_decl(impl->params[i], false, state);
-   }
-
-   if (glsl_type_is_void(impl->function->return_type)) {
-      validate_assert(state, impl->return_var == NULL);
-   } else {
-      validate_assert(state, impl->return_var->type == impl->function->return_type);
-      validate_assert(state, impl->return_var->data.mode == nir_var_param);
-      validate_assert(state, impl->return_var->data.location == -1);
-      validate_var_decl(impl->return_var, false, state);
-   }
-
    validate_assert(state, exec_list_is_empty(&impl->end_block->instr_list));
    validate_assert(state, impl->end_block->successors[0] == NULL);
    validate_assert(state, impl->end_block->successors[1] == NULL);
diff --git a/src/compiler/nir/nir_worklist.h b/src/compiler/nir/nir_worklist.h
index 3fb391f..05aa757 100644
--- a/src/compiler/nir/nir_worklist.h
+++ b/src/compiler/nir/nir_worklist.h
@@ -154,8 +154,8 @@
    return *vec_instr;
 }
 
-#define nir_instr_worklist_foreach(wl, instr)                    \
-   while ((instr = nir_instr_worklist_pop_head(wl)))
+#define nir_foreach_instr_in_worklist(instr, wl) /* pops wl; declares instr */ \
+   for (nir_instr *instr; (instr = nir_instr_worklist_pop_head(wl));)
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index 78b6680..e002739 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -81,6 +81,12 @@
    return type->column_type();
 }
 
+GLenum
+glsl_get_gl_type(const struct glsl_type *type)
+{
+   return type->gl_type;
+}
+
 enum glsl_base_type
 glsl_get_base_type(const struct glsl_type *type)
 {
@@ -124,6 +130,12 @@
    return type->count_attribute_slots(is_vertex_input);
 }
 
+unsigned
+glsl_get_component_slots(const struct glsl_type *type)
+{
+   return type->component_slots();
+}
+
 const char *
 glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index)
 {
@@ -145,6 +157,13 @@
 }
 
 unsigned
+glsl_get_sampler_target(const struct glsl_type *type)
+{
+   assert(glsl_type_is_sampler(type));
+   return type->sampler_index();
+}
+
+unsigned
 glsl_get_record_location_offset(const struct glsl_type *type,
                                 unsigned length)
 {
@@ -152,6 +171,12 @@
 }
 
 bool
+glsl_type_is_16bit(const glsl_type *type)
+{
+   return type->is_16bit();
+}
+
+bool
 glsl_type_is_64bit(const glsl_type *type)
 {
    return type->is_64bit();
@@ -254,6 +279,11 @@
 {
    return type->is_boolean();
 }
+bool
+glsl_type_is_integer(const struct glsl_type *type)
+{
+   return type->is_integer();
+}
 
 const glsl_type *
 glsl_void_type(void)
@@ -460,7 +490,87 @@
       return glsl_uint64_t_type();
    case GLSL_TYPE_INT64:
       return glsl_int64_t_type();
+   case GLSL_TYPE_FLOAT16:
+      return glsl_float16_t_type();
+   case GLSL_TYPE_UINT16:
+      return glsl_uint16_t_type();
+   case GLSL_TYPE_INT16:
+      return glsl_int16_t_type();
    default:
       unreachable("Unhandled base type glsl_channel_type()");
    }
 }
+
+void
+glsl_get_natural_size_align_bytes(const struct glsl_type *type,
+                                  unsigned *size, unsigned *align)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT8:
+   case GLSL_TYPE_INT8:
+   case GLSL_TYPE_UINT16:
+   case GLSL_TYPE_INT16:
+   case GLSL_TYPE_FLOAT16:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64: {
+      unsigned N = glsl_get_bit_size(type) / 8; /* bytes per component */
+      *size = N * type->components();
+      *align = N; /* aligned to one component, not to the whole vector */
+      break;
+   }
+
+   case GLSL_TYPE_ARRAY: {
+      unsigned elem_size, elem_align;
+      glsl_get_natural_size_align_bytes(type->fields.array,
+                                        &elem_size, &elem_align);
+      *align = elem_align;
+      *size = type->length * ALIGN_POT(elem_size, elem_align); /* padded stride */
+      break;
+   }
+
+   case GLSL_TYPE_STRUCT:
+      *size = 0;
+      *align = 0; /* stays 0 when the struct has no members */
+      for (unsigned i = 0; i < type->length; i++) {
+         unsigned elem_size, elem_align;
+         glsl_get_natural_size_align_bytes(type->fields.structure[i].type,
+                                           &elem_size, &elem_align);
+         *align = MAX2(*align, elem_align); /* largest member alignment wins */
+         *size = ALIGN_POT(*size, elem_align) + elem_size; /* no tail padding */
+      }
+      break;
+
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_ATOMIC_UINT:
+   case GLSL_TYPE_SUBROUTINE:
+   case GLSL_TYPE_IMAGE:
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("type does not have a natural size");
+   }
+}
+
+const glsl_type *
+glsl_atomic_uint_type(void)
+{
+   return glsl_type::atomic_uint_type;
+}
+
+unsigned
+glsl_atomic_size(const struct glsl_type *type)
+{
+   return type->atomic_size();
+}
+
+bool
+glsl_contains_atomic(const struct glsl_type *type)
+{
+   return type->contains_atomic();
+}
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 5b441af..7db32e3 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -59,6 +59,8 @@
 const struct glsl_function_param *
 glsl_get_function_param(const struct glsl_type *type, unsigned index);
 
+GLenum glsl_get_gl_type(const struct glsl_type *type);
+
 enum glsl_base_type glsl_get_base_type(const struct glsl_type *type);
 
 unsigned glsl_get_vector_elements(const struct glsl_type *type);
@@ -73,16 +75,20 @@
 
 unsigned glsl_count_attribute_slots(const struct glsl_type *type,
                                     bool is_vertex_input);
+unsigned glsl_get_component_slots(const struct glsl_type *type);
 
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
 
 enum glsl_sampler_dim glsl_get_sampler_dim(const struct glsl_type *type);
 enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
+unsigned glsl_get_sampler_target(const struct glsl_type *type);
 
 unsigned glsl_get_record_location_offset(const struct glsl_type *type,
                                          unsigned length);
 
+unsigned glsl_atomic_size(const struct glsl_type *type);
+
 static inline unsigned
 glsl_get_bit_size(const struct glsl_type *type)
 {
@@ -117,6 +123,7 @@
    return 0;
 }
 
+bool glsl_type_is_16bit(const struct glsl_type *type);
 bool glsl_type_is_64bit(const struct glsl_type *type);
 bool glsl_type_is_void(const struct glsl_type *type);
 bool glsl_type_is_error(const struct glsl_type *type);
@@ -132,8 +139,10 @@
 bool glsl_type_is_dual_slot(const struct glsl_type *type);
 bool glsl_type_is_numeric(const struct glsl_type *type);
 bool glsl_type_is_boolean(const struct glsl_type *type);
+bool glsl_type_is_integer(const struct glsl_type *type);
 bool glsl_sampler_type_is_shadow(const struct glsl_type *type);
 bool glsl_sampler_type_is_array(const struct glsl_type *type);
+bool glsl_contains_atomic(const struct glsl_type *type);
 
 const struct glsl_type *glsl_void_type(void);
 const struct glsl_type *glsl_float_type(void);
@@ -182,6 +191,14 @@
 
 const struct glsl_type *glsl_channel_type(const struct glsl_type *type);
 
+typedef void (*glsl_type_size_align_func)(const struct glsl_type *type,
+                                          unsigned *size, unsigned *align);
+
+void glsl_get_natural_size_align_bytes(const struct glsl_type *type,
+                                       unsigned *size, unsigned *align);
+
+const struct glsl_type *glsl_atomic_uint_type(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/compiler/shader_enums.c b/src/compiler/shader_enums.c
index ebee076..a874083 100644
--- a/src/compiler/shader_enums.c
+++ b/src/compiler/shader_enums.c
@@ -216,15 +216,18 @@
      ENUM(SYSTEM_VALUE_INSTANCE_ID),
      ENUM(SYSTEM_VALUE_INSTANCE_INDEX),
      ENUM(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE),
-     ENUM(SYSTEM_VALUE_FIRST_VERTEX),
      ENUM(SYSTEM_VALUE_BASE_VERTEX),
+     ENUM(SYSTEM_VALUE_FIRST_VERTEX),
+     ENUM(SYSTEM_VALUE_IS_INDEXED_DRAW),
      ENUM(SYSTEM_VALUE_BASE_INSTANCE),
      ENUM(SYSTEM_VALUE_DRAW_ID),
      ENUM(SYSTEM_VALUE_INVOCATION_ID),
+     ENUM(SYSTEM_VALUE_FRAG_COORD),
      ENUM(SYSTEM_VALUE_FRONT_FACE),
      ENUM(SYSTEM_VALUE_SAMPLE_ID),
      ENUM(SYSTEM_VALUE_SAMPLE_POS),
      ENUM(SYSTEM_VALUE_SAMPLE_MASK_IN),
+     ENUM(SYSTEM_VALUE_HELPER_INVOCATION),
      ENUM(SYSTEM_VALUE_TESS_COORD),
      ENUM(SYSTEM_VALUE_VERTICES_IN),
      ENUM(SYSTEM_VALUE_PRIMITIVE_ID),
@@ -235,6 +238,9 @@
      ENUM(SYSTEM_VALUE_GLOBAL_INVOCATION_ID),
      ENUM(SYSTEM_VALUE_WORK_GROUP_ID),
      ENUM(SYSTEM_VALUE_NUM_WORK_GROUPS),
+     ENUM(SYSTEM_VALUE_LOCAL_GROUP_SIZE),
+     ENUM(SYSTEM_VALUE_GLOBAL_GROUP_SIZE),
+     ENUM(SYSTEM_VALUE_WORK_DIM),
      ENUM(SYSTEM_VALUE_DEVICE_INDEX),
      ENUM(SYSTEM_VALUE_VIEW_INDEX),
      ENUM(SYSTEM_VALUE_VERTEX_CNT),
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index 8a277a1..f8e2292 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -518,6 +518,13 @@
    SYSTEM_VALUE_FIRST_VERTEX,
 
    /**
+    * Whether the draw command used to start the rendering was an indexed
+    * draw (~0) or not (0). Useful to calculate \c SYSTEM_VALUE_BASE_VERTEX as
+    * \c SYSTEM_VALUE_IS_INDEXED_DRAW & \c SYSTEM_VALUE_FIRST_VERTEX.
+    */
+   SYSTEM_VALUE_IS_INDEXED_DRAW,
+
+   /**
     * Value of \c baseinstance passed to instanced draw entry points
     *
     * \sa SYSTEM_VALUE_INSTANCE_ID
@@ -578,6 +585,8 @@
    SYSTEM_VALUE_WORK_GROUP_ID,
    SYSTEM_VALUE_NUM_WORK_GROUPS,
    SYSTEM_VALUE_LOCAL_GROUP_SIZE,
+   SYSTEM_VALUE_GLOBAL_GROUP_SIZE,
+   SYSTEM_VALUE_WORK_DIM,
    /*@}*/
 
    /** Required for VK_KHR_device_group */
diff --git a/src/compiler/shader_info.h b/src/compiler/shader_info.h
index 53a0ef2..dab15b5 100644
--- a/src/compiler/shader_info.h
+++ b/src/compiler/shader_info.h
@@ -44,6 +44,7 @@
    bool multiview;
    bool variable_pointers;
    bool storage_16bit;
+   bool int16;
    bool shader_viewport_index_layer;
    bool subgroup_arithmetic;
    bool subgroup_ballot;
@@ -55,6 +56,12 @@
    bool trinary_minmax;
    bool descriptor_array_dynamic_indexing;
    bool runtime_descriptor_array;
+   bool stencil_export;
+   bool atomic_storage;
+   bool storage_8bit;
+   bool post_depth_coverage;
+   bool transform_feedback;
+   bool geometry_streams;
 };
 
 typedef struct shader_info {
@@ -178,6 +185,11 @@
 
          bool pixel_center_integer;
 
+         bool pixel_interlock_ordered;
+         bool pixel_interlock_unordered;
+         bool sample_interlock_ordered;
+         bool sample_interlock_unordered;
+
          /** gl_FragDepth layout for ARB_conservative_depth. */
          enum gl_frag_depth_layout depth_layout;
       } fs;
diff --git a/src/compiler/spirv/OpenCL.std.h b/src/compiler/spirv/OpenCL.std.h
new file mode 100644
index 0000000..1e9e7fc
--- /dev/null
+++ b/src/compiler/spirv/OpenCL.std.h
@@ -0,0 +1,211 @@
+/*
+** Copyright (c) 2015-2017 The Khronos Group Inc.
+**
+** Permission is hereby granted, free of charge, to any person obtaining a copy
+** of this software and/or associated documentation files (the "Materials"),
+** to deal in the Materials without restriction, including without limitation
+** the rights to use, copy, modify, merge, publish, distribute, sublicense,
+** and/or sell copies of the Materials, and to permit persons to whom the
+** Materials are furnished to do so, subject to the following conditions:
+**
+** The above copyright notice and this permission notice shall be included in
+** all copies or substantial portions of the Materials.
+**
+** MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS
+** STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND
+** HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ 
+**
+** THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+** OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+** FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+** THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+** LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+** FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS
+** IN THE MATERIALS.
+*/
+
+#ifndef OpenCLstd_H
+#define OpenCLstd_H
+
+enum OpenCLstd {
+
+    // Section 2.1: Math extended instructions
+    Acos = 0,
+    Acosh = 1,
+    Acospi = 2,
+    Asin = 3,
+    Asinh = 4,
+    Asinpi = 5,
+    Atan = 6,
+    Atan2 = 7,
+    Atanh = 8,
+    Atanpi = 9,
+    Atan2pi = 10,
+    Cbrt = 11,
+    Ceil = 12,
+    Copysign = 13,
+    Cos = 14,
+    Cosh = 15,
+    Cospi = 16,
+    Erfc = 17,
+    Erf = 18,
+    Exp = 19,
+    Exp2 = 20,
+    Exp10 = 21,
+    Expm1 = 22,
+    Fabs = 23,
+    Fdim = 24,
+    Floor = 25,
+    Fma = 26,
+    Fmax = 27,
+    Fmin = 28,
+    Fmod = 29,
+    Fract = 30, 
+    Frexp = 31,
+    Hypot = 32,
+    Ilogb = 33,
+    Ldexp = 34,
+    Lgamma = 35,
+    Lgamma_r = 36,
+    Log = 37,
+    Log2 = 38,
+    Log10 = 39,
+    Log1p = 40,
+    Logb = 41,
+    Mad = 42,
+    Maxmag = 43,
+    Minmag = 44,
+    Modf = 45,
+    Nan = 46,
+    Nextafter = 47,
+    Pow = 48,
+    Pown = 49,
+    Powr = 50,
+    Remainder = 51,
+    Remquo = 52,
+    Rint = 53,
+    Rootn = 54,
+    Round = 55,
+    Rsqrt = 56,
+    Sin = 57,
+    Sincos = 58,
+    Sinh = 59,
+    Sinpi = 60,
+    Sqrt = 61,
+    Tan = 62,
+    Tanh = 63,
+    Tanpi = 64,
+    Tgamma = 65,
+    Trunc = 66,
+    Half_cos = 67,
+    Half_divide = 68,
+    Half_exp = 69,
+    Half_exp2 = 70,
+    Half_exp10 = 71,
+    Half_log = 72,
+    Half_log2 = 73,
+    Half_log10 = 74,
+    Half_powr = 75,
+    Half_recip = 76,
+    Half_rsqrt = 77,
+    Half_sin = 78,
+    Half_sqrt = 79,
+    Half_tan = 80,
+    Native_cos = 81,
+    Native_divide = 82,
+    Native_exp = 83,
+    Native_exp2 = 84,
+    Native_exp10 = 85,
+    Native_log = 86,
+    Native_log2 = 87,
+    Native_log10 = 88,
+    Native_powr = 89,
+    Native_recip = 90,
+    Native_rsqrt = 91,
+    Native_sin = 92,
+    Native_sqrt = 93,
+    Native_tan = 94,
+    
+    // Section 2.2: Integer instructions
+    SAbs = 141,
+    SAbs_diff = 142,
+    SAdd_sat = 143,
+    UAdd_sat = 144,
+    SHadd = 145,
+    UHadd = 146,
+    SRhadd = 147,
+    URhadd = 148,
+    SClamp = 149,
+    UClamp = 150, 
+    Clz = 151,
+    Ctz = 152,    
+    SMad_hi = 153,
+    UMad_sat = 154,
+    SMad_sat = 155,
+    SMax = 156,
+    UMax = 157,
+    SMin = 158,
+    UMin = 159,
+    SMul_hi = 160,
+    Rotate = 161,
+    SSub_sat = 162,
+    USub_sat = 163,
+    U_Upsample = 164,
+    S_Upsample = 165,
+    Popcount = 166,
+    SMad24 = 167,
+    UMad24 = 168,
+    SMul24 = 169,
+    UMul24 = 170,
+    UAbs = 201,
+    UAbs_diff = 202,
+    UMul_hi = 203,
+    UMad_hi = 204,
+
+    // Section 2.3: Common instructions
+    FClamp = 95,
+    Degrees = 96,
+    FMax_common = 97,
+    FMin_common = 98, 
+    Mix = 99,
+    Radians = 100,
+    Step = 101,
+    Smoothstep = 102,
+    Sign = 103,
+
+    // Section 2.4: Geometric instructions
+    Cross = 104,
+    Distance = 105, 
+    Length = 106,
+    Normalize = 107,
+    Fast_distance = 108,
+    Fast_length = 109,
+    Fast_normalize = 110,
+
+    // Section 2.5: Relational instructions
+    Bitselect = 186,
+    Select = 187,
+
+    // Section 2.6: Vector Data Load and Store instructions
+    Vloadn = 171,
+    Vstoren = 172,
+    Vload_half = 173,
+    Vload_halfn = 174,
+    Vstore_half = 175,
+    Vstore_half_r = 176,
+    Vstore_halfn = 177,
+    Vstore_halfn_r = 178,
+    Vloada_halfn = 179,
+    Vstorea_halfn = 180,
+    Vstorea_halfn_r = 181,
+
+    // Section 2.7: Miscellaneous Vector instructions
+    Shuffle = 182,
+    Shuffle2 = 183,
+
+    // Section 2.8: Misc instructions 
+    Printf = 184,
+    Prefetch = 185,
+};
+
+#endif
diff --git a/src/compiler/spirv/spirv.core.grammar.json b/src/compiler/spirv/spirv.core.grammar.json
index a03c024..cb64142 100644
--- a/src/compiler/spirv/spirv.core.grammar.json
+++ b/src/compiler/spirv/spirv.core.grammar.json
@@ -3914,7 +3914,7 @@
         { "kind" : "IdRef",         "name" : "'Target'" },
         { "kind" : "Decoration" }
       ],
-      "extensions" : [ "SPV_GOOGLE_decorate_string" ],
+      "extensions" : [ "SPV_GOOGLE_decorate_string", "SPV_GOOGLE_hlsl_functionality1" ],
       "version" : "None"
     },
     {
@@ -3925,7 +3925,7 @@
         { "kind" : "LiteralInteger", "name" : "'Member'" },
         { "kind" : "Decoration" }
       ],
-      "extensions" : [ "SPV_GOOGLE_decorate_string" ],
+      "extensions" : [ "SPV_GOOGLE_decorate_string", "SPV_GOOGLE_hlsl_functionality1" ],
       "version" : "None"
     },
     {
@@ -3991,6 +3991,7 @@
         {
           "enumerant" : "ConstOffsets",
           "value" : "0x0020",
+          "capabilities" : [ "ImageGatherExtended" ],
           "parameters" : [
             { "kind" : "IdRef" }
           ]
@@ -5550,12 +5551,14 @@
           "enumerant" : "OverrideCoverageNV",
           "value" : 5248,
           "capabilities" : [ "SampleMaskOverrideCoverageNV" ],
+          "extensions" : [ "SPV_NV_sample_mask_override_coverage" ],
           "version" : "None"
         },
         {
           "enumerant" : "PassthroughNV",
           "value" : 5250,
           "capabilities" : [ "GeometryShaderPassthroughNV" ],
+          "extensions" : [ "SPV_NV_geometry_shader_passthrough" ],
           "version" : "None"
         },
         {
@@ -5568,6 +5571,7 @@
           "enumerant" : "SecondaryViewportRelativeNV",
           "value" : 5256,
           "capabilities" : [ "ShaderStereoViewNV" ],
+          "extensions" : [ "SPV_NV_stereo_view_rendering" ],
           "version" : "None",
           "parameters" : [
             { "kind" : "LiteralInteger", "name" : "'Offset'" }
@@ -5960,12 +5964,14 @@
           "enumerant" : "SecondaryPositionNV",
           "value" : 5257,
           "capabilities" : [ "ShaderStereoViewNV" ],
+          "extensions" : [ "SPV_NV_stereo_view_rendering" ],
           "version" : "None"
         },
         {
           "enumerant" : "SecondaryViewportMaskNV",
           "value" : 5258,
           "capabilities" : [ "ShaderStereoViewNV" ],
+          "extensions" : [ "SPV_NV_stereo_view_rendering" ],
           "version" : "None"
         },
         {
@@ -6043,17 +6049,23 @@
         {
           "enumerant" : "PartitionedReduceNV",
           "value" : 6,
-          "capabilities" : [ "GroupNonUniformPartitionedNV" ]
+          "capabilities" : [ "GroupNonUniformPartitionedNV" ],
+          "extensions" : [ "SPV_NV_shader_subgroup_partitioned" ],
+          "version" : "None"
         },
         {
           "enumerant" : "PartitionedInclusiveScanNV",
           "value" : 7,
-          "capabilities" : [ "GroupNonUniformPartitionedNV" ]
+          "capabilities" : [ "GroupNonUniformPartitionedNV" ],
+          "extensions" : [ "SPV_NV_shader_subgroup_partitioned" ],
+          "version" : "None"
         },
         {
           "enumerant" : "PartitionedExclusiveScanNV",
           "value" : 8,
-          "capabilities" : [ "GroupNonUniformPartitionedNV" ]
+          "capabilities" : [ "GroupNonUniformPartitionedNV" ],
+          "extensions" : [ "SPV_NV_shader_subgroup_partitioned" ],
+          "version" : "None"
         }
       ]
     },
@@ -6260,8 +6272,7 @@
         },
         {
           "enumerant" : "Int8",
-          "value" : 39,
-          "capabilities" : [ "Kernel" ]
+          "value" : 39
         },
         {
           "enumerant" : "InputAttachment",
@@ -6519,6 +6530,25 @@
           "version" : "None"
         },
         {
+          "enumerant" : "StorageBuffer8BitAccess",
+          "value" : 4448,
+          "extensions" : [ "SPV_KHR_8bit_storage" ],
+          "version" : "None"
+        },
+        {
+          "enumerant" : "UniformAndStorageBuffer8BitAccess",
+          "value" : 4449,
+          "capabilities" : [ "StorageBuffer8BitAccess" ],
+          "extensions" : [ "SPV_KHR_8bit_storage" ],
+          "version" : "None"
+        },
+        {
+          "enumerant" : "StoragePushConstant8",
+          "value" : 4450,
+          "extensions" : [ "SPV_KHR_8bit_storage" ],
+          "version" : "None"
+        },
+        {
           "enumerant" : "Float16ImageAMD",
           "value" : 5008,
           "capabilities" : [ "Shader" ],
diff --git a/src/compiler/spirv/spirv.h b/src/compiler/spirv/spirv.h
index e0a0330..4c90c93 100644
--- a/src/compiler/spirv/spirv.h
+++ b/src/compiler/spirv/spirv.h
@@ -683,6 +683,9 @@
     SpvCapabilityVariablePointers = 4442,
     SpvCapabilityAtomicStorageOps = 4445,
     SpvCapabilitySampleMaskPostDepthCoverage = 4447,
+    SpvCapabilityStorageBuffer8BitAccess = 4448,
+    SpvCapabilityUniformAndStorageBuffer8BitAccess = 4449,
+    SpvCapabilityStoragePushConstant8 = 4450,
     SpvCapabilityFloat16ImageAMD = 5008,
     SpvCapabilityImageGatherBiasLodAMD = 5009,
     SpvCapabilityFragmentMaskAMD = 5010,
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index f62b025..beadb1c 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -29,6 +29,7 @@
 #include "nir/nir_vla.h"
 #include "nir/nir_control_flow.h"
 #include "nir/nir_constant_expressions.h"
+#include "nir/nir_deref.h"
 #include "spirv_info.h"
 
 #include <stdio.h>
@@ -130,6 +131,18 @@
 }
 
 void
+_vtn_err(struct vtn_builder *b, const char *file, unsigned line,
+          const char *fmt, ...)
+{
+   va_list args;
+
+   va_start(args, fmt);
+   vtn_log_err(b, NIR_SPIRV_DEBUG_LEVEL_ERROR, "SPIR-V ERROR:\n",
+               file, line, fmt, args);
+   va_end(args);
+}
+
+void
 _vtn_fail(struct vtn_builder *b, const char *file, unsigned line,
           const char *fmt, ...)
 {
@@ -370,19 +383,20 @@
 vtn_handle_extension(struct vtn_builder *b, SpvOp opcode,
                      const uint32_t *w, unsigned count)
 {
+   const char *ext = (const char *)&w[2];
    switch (opcode) {
    case SpvOpExtInstImport: {
       struct vtn_value *val = vtn_push_value(b, w[1], vtn_value_type_extension);
-      if (strcmp((const char *)&w[2], "GLSL.std.450") == 0) {
+      if (strcmp(ext, "GLSL.std.450") == 0) {
          val->ext_handler = vtn_handle_glsl450_instruction;
-      } else if ((strcmp((const char *)&w[2], "SPV_AMD_gcn_shader") == 0)
+      } else if ((strcmp(ext, "SPV_AMD_gcn_shader") == 0)
                 && (b->options && b->options->caps.gcn_shader)) {
          val->ext_handler = vtn_handle_amd_gcn_shader_instruction;
-      } else if ((strcmp((const char *)&w[2], "SPV_AMD_shader_trinary_minmax") == 0)
+      } else if ((strcmp(ext, "SPV_AMD_shader_trinary_minmax") == 0)
                 && (b->options && b->options->caps.trinary_minmax)) {
          val->ext_handler = vtn_handle_amd_shader_trinary_minmax_instruction;
       } else {
-         vtn_fail("Unsupported extension");
+         vtn_fail("Unsupported extension: %s", ext);
       }
       break;
    }
@@ -833,7 +847,6 @@
    case SpvDecorationNonWritable:
    case SpvDecorationNonReadable:
    case SpvDecorationUniform:
-   case SpvDecorationStream:
    case SpvDecorationLocation:
    case SpvDecorationComponent:
    case SpvDecorationOffset:
@@ -843,6 +856,14 @@
                spirv_decoration_to_string(dec->decoration));
       break;
 
+   case SpvDecorationStream:
+      /* We don't need to do anything here, as stream is filled up when
+       * applying the decoration to a variable; just check that if it is not a
+       * struct member, it should be a struct.
+       */
+      vtn_assert(type->base_type == vtn_base_type_struct);
+      break;
+
    case SpvDecorationRelaxedPrecision:
    case SpvDecorationSpecId:
    case SpvDecorationInvariant:
@@ -1169,6 +1190,13 @@
          val->type->type = glsl_vector_type(GLSL_TYPE_UINT, 2);
       }
 
+      if (storage_class == SpvStorageClassPushConstant) {
+         /* These can actually be stored to nir_variables and used as SSA
+          * values so they need a real glsl_type.
+          */
+         val->type->type = glsl_uint_type();
+      }
+
       if (storage_class == SpvStorageClassWorkgroup &&
           b->options->lower_workgroup_access_to_offsets) {
          uint32_t size, align;
@@ -1473,8 +1501,19 @@
                   spirv_op_to_string(opcode), elem_count, val->type->length);
 
       nir_constant **elems = ralloc_array(b, nir_constant *, elem_count);
-      for (unsigned i = 0; i < elem_count; i++)
-         elems[i] = vtn_value(b, w[i + 3], vtn_value_type_constant)->constant;
+      for (unsigned i = 0; i < elem_count; i++) {
+         struct vtn_value *val = vtn_untyped_value(b, w[i + 3]);
+
+         if (val->value_type == vtn_value_type_constant) {
+            elems[i] = val->constant;
+         } else {
+            vtn_fail_if(val->value_type != vtn_value_type_undef,
+                        "only constants or undefs allowed for "
+                        "SpvOpConstantComposite");
+            /* to make it easier, just insert a NULL constant for now */
+            elems[i] = vtn_null_constant(b, val->type->type);
+         }
+      }
 
       switch (val->type->base_type) {
       case vtn_base_type_vector: {
@@ -1732,11 +1771,37 @@
          nir_const_value src[4];
 
          for (unsigned i = 0; i < count - 4; i++) {
-            nir_constant *c =
-               vtn_value(b, w[4 + i], vtn_value_type_constant)->constant;
+            struct vtn_value *src_val =
+               vtn_value(b, w[4 + i], vtn_value_type_constant);
+
+            /* If this is an unsized source, pull the bit size from the
+             * source; otherwise, we'll use the bit size from the destination.
+             */
+            if (!nir_alu_type_get_type_size(nir_op_infos[op].input_types[i]))
+               bit_size = glsl_get_bit_size(src_val->type->type);
 
             unsigned j = swap ? 1 - i : i;
-            src[j] = c->values[0];
+            src[j] = src_val->constant->values[0];
+         }
+
+         /* fix up fixed size sources */
+         switch (op) {
+         case nir_op_ishl:
+         case nir_op_ishr:
+         case nir_op_ushr: {
+            if (bit_size == 32)
+               break;
+            for (unsigned i = 0; i < num_components; ++i) {
+               switch (bit_size) {
+               case 64: src[1].u32[i] = src[1].u64[i]; break;
+               case 16: src[1].u32[i] = src[1].u16[i]; break;
+               case  8: src[1].u32[i] = src[1].u8[i];  break;
+               }
+            }
+            break;
+         }
+         default:
+            break;
          }
 
          val->constant->values[0] =
@@ -1775,39 +1840,54 @@
    vtn_callee->referenced = true;
 
    nir_call_instr *call = nir_call_instr_create(b->nb.shader, callee);
-   for (unsigned i = 0; i < call->num_params; i++) {
+
+   unsigned param_idx = 0;
+
+   nir_deref_instr *ret_deref = NULL;
+   struct vtn_type *ret_type = vtn_callee->type->return_type;
+   if (ret_type->base_type != vtn_base_type_void) {
+      nir_variable *ret_tmp =
+         nir_local_variable_create(b->nb.impl, ret_type->type, "return_tmp");
+      ret_deref = nir_build_deref_var(&b->nb, ret_tmp);
+      call->params[param_idx++] = nir_src_for_ssa(&ret_deref->dest.ssa);
+   }
+
+   for (unsigned i = 0; i < vtn_callee->type->length; i++) {
+      struct vtn_type *arg_type = vtn_callee->type->params[i];
       unsigned arg_id = w[4 + i];
-      struct vtn_value *arg = vtn_untyped_value(b, arg_id);
-      if (arg->value_type == vtn_value_type_pointer &&
-          arg->pointer->ptr_type->type == NULL) {
-         nir_deref_var *d = vtn_pointer_to_deref(b, arg->pointer);
-         call->params[i] = nir_deref_var_clone(d, call);
+
+      if (arg_type->base_type == vtn_base_type_sampled_image) {
+         struct vtn_sampled_image *sampled_image =
+            vtn_value(b, arg_id, vtn_value_type_sampled_image)->sampled_image;
+
+         call->params[param_idx++] =
+            nir_src_for_ssa(&sampled_image->image->deref->dest.ssa);
+         call->params[param_idx++] =
+            nir_src_for_ssa(&sampled_image->sampler->deref->dest.ssa);
+      } else if (arg_type->base_type == vtn_base_type_pointer ||
+                 arg_type->base_type == vtn_base_type_image ||
+                 arg_type->base_type == vtn_base_type_sampler) {
+         struct vtn_pointer *pointer =
+            vtn_value(b, arg_id, vtn_value_type_pointer)->pointer;
+         call->params[param_idx++] =
+            nir_src_for_ssa(vtn_pointer_to_ssa(b, pointer));
       } else {
-         struct vtn_ssa_value *arg_ssa = vtn_ssa_value(b, arg_id);
-
-         /* Make a temporary to store the argument in */
+         /* This is a regular SSA value and we need a temporary */
          nir_variable *tmp =
-            nir_local_variable_create(b->nb.impl, arg_ssa->type, "arg_tmp");
-         call->params[i] = nir_deref_var_create(call, tmp);
-
-         vtn_local_store(b, arg_ssa, call->params[i]);
+            nir_local_variable_create(b->nb.impl, arg_type->type, "arg_tmp");
+         nir_deref_instr *tmp_deref = nir_build_deref_var(&b->nb, tmp);
+         vtn_local_store(b, vtn_ssa_value(b, arg_id), tmp_deref);
+         call->params[param_idx++] = nir_src_for_ssa(&tmp_deref->dest.ssa);
       }
    }
-
-   nir_variable *out_tmp = NULL;
-   vtn_assert(res_type->type == callee->return_type);
-   if (!glsl_type_is_void(callee->return_type)) {
-      out_tmp = nir_local_variable_create(b->nb.impl, callee->return_type,
-                                          "out_tmp");
-      call->return_deref = nir_deref_var_create(call, out_tmp);
-   }
+   assert(param_idx == call->num_params);
 
    nir_builder_instr_insert(&b->nb, &call->instr);
 
-   if (glsl_type_is_void(callee->return_type)) {
+   if (ret_type->base_type == vtn_base_type_void) {
       vtn_push_value(b, w[2], vtn_value_type_undef);
    } else {
-      vtn_push_ssa(b, w[2], res_type, vtn_local_load(b, call->return_deref));
+      vtn_push_ssa(b, w[2], res_type, vtn_local_load(b, ret_deref));
    }
 }
 
@@ -1960,9 +2040,41 @@
       vtn_fail("Unhandled opcode");
    }
 
-   nir_tex_src srcs[8]; /* 8 should be enough */
+   nir_tex_src srcs[10]; /* 10 should be enough */
    nir_tex_src *p = srcs;
 
+   nir_deref_instr *sampler = vtn_pointer_to_deref(b, sampled.sampler);
+   nir_deref_instr *texture =
+      sampled.image ? vtn_pointer_to_deref(b, sampled.image) : sampler;
+
+   p->src = nir_src_for_ssa(&texture->dest.ssa);
+   p->src_type = nir_tex_src_texture_deref;
+   p++;
+
+   switch (texop) {
+   case nir_texop_tex:
+   case nir_texop_txb:
+   case nir_texop_txl:
+   case nir_texop_txd:
+   case nir_texop_tg4:
+      /* These operations require a sampler */
+      p->src = nir_src_for_ssa(&sampler->dest.ssa);
+      p->src_type = nir_tex_src_sampler_deref;
+      p++;
+      break;
+   case nir_texop_txf:
+   case nir_texop_txf_ms:
+   case nir_texop_txs:
+   case nir_texop_lod:
+   case nir_texop_query_levels:
+   case nir_texop_texture_samples:
+   case nir_texop_samples_identical:
+      /* These don't */
+      break;
+   case nir_texop_txf_ms_mcs:
+      vtn_fail("unexpected nir_texop_txf_ms_mcs");
+   }
+
    unsigned idx = 4;
 
    struct nir_ssa_def *coord;
@@ -2123,40 +2235,6 @@
       vtn_fail("Invalid base type for sampler result");
    }
 
-   nir_deref_var *sampler = vtn_pointer_to_deref(b, sampled.sampler);
-   nir_deref_var *texture;
-   if (sampled.image) {
-      nir_deref_var *image = vtn_pointer_to_deref(b, sampled.image);
-      texture = image;
-   } else {
-      texture = sampler;
-   }
-
-   instr->texture = nir_deref_var_clone(texture, instr);
-
-   switch (instr->op) {
-   case nir_texop_tex:
-   case nir_texop_txb:
-   case nir_texop_txl:
-   case nir_texop_txd:
-   case nir_texop_tg4:
-      /* These operations require a sampler */
-      instr->sampler = nir_deref_var_clone(sampler, instr);
-      break;
-   case nir_texop_txf:
-   case nir_texop_txf_ms:
-   case nir_texop_txs:
-   case nir_texop_lod:
-   case nir_texop_query_levels:
-   case nir_texop_texture_samples:
-   case nir_texop_samples_identical:
-      /* These don't */
-      instr->sampler = NULL;
-      break;
-   case nir_texop_txf_ms_mcs:
-      vtn_fail("unexpected nir_texop_txf_ms_mcs");
-   }
-
    nir_ssa_dest_init(&instr->instr, &instr->dest,
                      nir_tex_instr_dest_size(instr), 32, NULL);
 
@@ -2181,8 +2259,6 @@
          instrs[i]->is_new_style_shadow = instr->is_new_style_shadow;
          instrs[i]->component = instr->component;
          instrs[i]->dest_type = instr->dest_type;
-         instrs[i]->texture = nir_deref_var_clone(texture, instrs[i]);
-         instrs[i]->sampler = NULL;
 
          memcpy(instrs[i]->src, srcs, instr->num_srcs * sizeof(*instr->src));
 
@@ -2369,7 +2445,7 @@
 
    nir_intrinsic_op op;
    switch (opcode) {
-#define OP(S, N) case SpvOp##S: op = nir_intrinsic_image_var_##N; break;
+#define OP(S, N) case SpvOp##S: op = nir_intrinsic_image_deref_##N; break;
    OP(ImageQuerySize,         size)
    OP(ImageRead,              load)
    OP(ImageWrite,             store)
@@ -2395,16 +2471,16 @@
 
    nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
 
-   nir_deref_var *image_deref = vtn_pointer_to_deref(b, image.image);
-   intrin->variables[0] = nir_deref_var_clone(image_deref, intrin);
+   nir_deref_instr *image_deref = vtn_pointer_to_deref(b, image.image);
+   intrin->src[0] = nir_src_for_ssa(&image_deref->dest.ssa);
 
    /* ImageQuerySize doesn't take any extra parameters */
    if (opcode != SpvOpImageQuerySize) {
       /* The image coordinate is always 4 components but we may not have that
        * many.  Swizzle to compensate.
        */
-      intrin->src[0] = nir_src_for_ssa(expand_to_vec4(&b->nb, image.coord));
-      intrin->src[1] = nir_src_for_ssa(image.sample);
+      intrin->src[1] = nir_src_for_ssa(expand_to_vec4(&b->nb, image.coord));
+      intrin->src[2] = nir_src_for_ssa(image.sample);
    }
 
    switch (opcode) {
@@ -2417,7 +2493,7 @@
       const uint32_t value_id = opcode == SpvOpAtomicStore ? w[4] : w[3];
       nir_ssa_def *value = vtn_ssa_value(b, value_id)->def;
       /* nir_intrinsic_image_deref_store always takes a vec4 value */
-      intrin->src[2] = nir_src_for_ssa(expand_to_vec4(&b->nb, value));
+      intrin->src[3] = nir_src_for_ssa(expand_to_vec4(&b->nb, value));
       break;
    }
 
@@ -2434,7 +2510,7 @@
    case SpvOpAtomicAnd:
    case SpvOpAtomicOr:
    case SpvOpAtomicXor:
-      fill_common_atomic_sources(b, opcode, w, &intrin->src[2]);
+      fill_common_atomic_sources(b, opcode, w, &intrin->src[3]);
       break;
 
    default:
@@ -2493,6 +2569,35 @@
 }
 
 static nir_intrinsic_op
+get_uniform_nir_atomic_op(struct vtn_builder *b, SpvOp opcode)
+{
+   switch (opcode) {
+#define OP(S, N) case SpvOp##S: return nir_intrinsic_atomic_counter_ ##N;
+   OP(AtomicLoad,             read_deref)
+   OP(AtomicExchange,         exchange)
+   OP(AtomicCompareExchange,  comp_swap)
+   OP(AtomicIIncrement,       inc_deref)
+   OP(AtomicIDecrement,       post_dec_deref)
+   OP(AtomicIAdd,             add_deref)
+   OP(AtomicISub,             add_deref)
+   OP(AtomicUMin,             min_deref)
+   OP(AtomicUMax,             max_deref)
+   OP(AtomicAnd,              and_deref)
+   OP(AtomicOr,               or_deref)
+   OP(AtomicXor,              xor_deref)
+#undef OP
+   default:
+      /* We left the following out: AtomicStore, AtomicSMin and
+       * AtomicSMax. Right now there are no nir intrinsics for them. At this
+       * moment Atomic Counter support is needed for ARB_spirv support, so we
+       * only need to support GLSL Atomic Counters that are uints and don't
+       * allow direct storage.
+       */
+      unreachable("Invalid uniform atomic");
+   }
+}
+
+static nir_intrinsic_op
 get_shared_nir_atomic_op(struct vtn_builder *b, SpvOp opcode)
 {
    switch (opcode) {
@@ -2519,12 +2624,12 @@
 }
 
 static nir_intrinsic_op
-get_var_nir_atomic_op(struct vtn_builder *b, SpvOp opcode)
+get_deref_nir_atomic_op(struct vtn_builder *b, SpvOp opcode)
 {
    switch (opcode) {
-   case SpvOpAtomicLoad:      return nir_intrinsic_load_var;
-   case SpvOpAtomicStore:     return nir_intrinsic_store_var;
-#define OP(S, N) case SpvOp##S: return nir_intrinsic_var_##N;
+   case SpvOpAtomicLoad:      return nir_intrinsic_load_deref;
+   case SpvOpAtomicStore:     return nir_intrinsic_store_deref;
+#define OP(S, N) case SpvOp##S: return nir_intrinsic_deref_##N;
    OP(AtomicExchange,         atomic_exchange)
    OP(AtomicCompareExchange,  atomic_comp_swap)
    OP(AtomicIIncrement,       atomic_add)
@@ -2544,9 +2649,12 @@
    }
 }
 
+/*
+ * Handles shared atomics, ssbo atomics and atomic counters.
+ */
 static void
-vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
-                                 const uint32_t *w, unsigned count)
+vtn_handle_atomics(struct vtn_builder *b, SpvOp opcode,
+                   const uint32_t *w, unsigned count)
 {
    struct vtn_pointer *ptr;
    nir_intrinsic_instr *atomic;
@@ -2583,13 +2691,18 @@
    SpvMemorySemanticsMask semantics = w[5];
    */
 
-   if (ptr->mode == vtn_variable_mode_workgroup &&
-       !b->options->lower_workgroup_access_to_offsets) {
-      nir_deref_var *deref = vtn_pointer_to_deref(b, ptr);
-      const struct glsl_type *deref_type = nir_deref_tail(&deref->deref)->type;
-      nir_intrinsic_op op = get_var_nir_atomic_op(b, opcode);
+   /* uniform as "atomic counter uniform" */
+   if (ptr->mode == vtn_variable_mode_uniform) {
+      nir_deref_instr *deref = vtn_pointer_to_deref(b, ptr);
+      const struct glsl_type *deref_type = deref->type;
+      nir_intrinsic_op op = get_uniform_nir_atomic_op(b, opcode);
       atomic = nir_intrinsic_instr_create(b->nb.shader, op);
-      atomic->variables[0] = nir_deref_var_clone(deref, atomic);
+      atomic->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+
+      /* SSBO needs to initialize index/offset. In this case we don't need to,
+       * as that info is already stored on the ptr->var->var nir_variable (see
+       * vtn_create_variable)
+       */
 
       switch (opcode) {
       case SpvOpAtomicLoad:
@@ -2599,7 +2712,6 @@
       case SpvOpAtomicStore:
          atomic->num_components = glsl_get_vector_elements(deref_type);
          nir_intrinsic_set_write_mask(atomic, (1 << atomic->num_components) - 1);
-         atomic->src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[4])->def);
          break;
 
       case SpvOpAtomicExchange:
@@ -2616,7 +2728,49 @@
       case SpvOpAtomicAnd:
       case SpvOpAtomicOr:
       case SpvOpAtomicXor:
-         fill_common_atomic_sources(b, opcode, w, &atomic->src[0]);
+         /* Nothing: we don't need to call fill_common_atomic_sources here, as
+          * atomic counter uniforms don't have sources
+          */
+         break;
+
+      default:
+         unreachable("Invalid SPIR-V atomic");
+
+      }
+   } else if (ptr->mode == vtn_variable_mode_workgroup &&
+              !b->options->lower_workgroup_access_to_offsets) {
+      nir_deref_instr *deref = vtn_pointer_to_deref(b, ptr);
+      const struct glsl_type *deref_type = deref->type;
+      nir_intrinsic_op op = get_deref_nir_atomic_op(b, opcode);
+      atomic = nir_intrinsic_instr_create(b->nb.shader, op);
+      atomic->src[0] = nir_src_for_ssa(&deref->dest.ssa);
+
+      switch (opcode) {
+      case SpvOpAtomicLoad:
+         atomic->num_components = glsl_get_vector_elements(deref_type);
+         break;
+
+      case SpvOpAtomicStore:
+         atomic->num_components = glsl_get_vector_elements(deref_type);
+         nir_intrinsic_set_write_mask(atomic, (1 << atomic->num_components) - 1);
+         atomic->src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[4])->def);
+         break;
+
+      case SpvOpAtomicExchange:
+      case SpvOpAtomicCompareExchange:
+      case SpvOpAtomicCompareExchangeWeak:
+      case SpvOpAtomicIIncrement:
+      case SpvOpAtomicIDecrement:
+      case SpvOpAtomicIAdd:
+      case SpvOpAtomicISub:
+      case SpvOpAtomicSMin:
+      case SpvOpAtomicUMin:
+      case SpvOpAtomicSMax:
+      case SpvOpAtomicUMax:
+      case SpvOpAtomicAnd:
+      case SpvOpAtomicOr:
+      case SpvOpAtomicXor:
+         fill_common_atomic_sources(b, opcode, w, &atomic->src[1]);
          break;
 
       default:
@@ -2625,7 +2779,7 @@
       }
    } else {
       nir_ssa_def *offset, *index;
-      offset = vtn_pointer_to_offset(b, ptr, &index, NULL);
+      offset = vtn_pointer_to_offset(b, ptr, &index);
 
       nir_intrinsic_op op;
       if (ptr->mode == vtn_variable_mode_ssbo) {
@@ -2702,7 +2856,7 @@
 {
    nir_op op;
    switch (num_components) {
-   case 1: op = nir_op_fmov; break;
+   case 1: op = nir_op_imov; break;
    case 2: op = nir_op_vec2; break;
    case 3: op = nir_op_vec3; break;
    case 4: op = nir_op_vec4; break;
@@ -2750,8 +2904,7 @@
 nir_ssa_def *
 vtn_vector_extract(struct vtn_builder *b, nir_ssa_def *src, unsigned index)
 {
-   unsigned swiz[4] = { index };
-   return nir_swizzle(&b->nb, src, swiz, 1, true);
+   return nir_channel(&b->nb, src, index);
 }
 
 nir_ssa_def *
@@ -2967,7 +3120,7 @@
       unsigned elems = count - 3;
       assume(elems >= 1);
       if (glsl_type_is_vector_or_scalar(type)) {
-         nir_ssa_def *srcs[4];
+         nir_ssa_def *srcs[NIR_MAX_VEC_COMPONENTS];
          for (unsigned i = 0; i < elems; i++)
             srcs[i] = vtn_ssa_value(b, w[3 + i])->def;
          val->ssa->def =
@@ -3091,9 +3244,12 @@
 
       switch (opcode) {
       case SpvOpEmitStreamVertex:
-      case SpvOpEndStreamPrimitive:
-         nir_intrinsic_set_stream_id(intrin, w[1]);
+      case SpvOpEndStreamPrimitive: {
+         unsigned stream = vtn_constant_value(b, w[1])->values[0].u32[0];
+         nir_intrinsic_set_stream_id(intrin, stream);
          break;
+      }
+
       default:
          break;
       }
@@ -3287,29 +3443,40 @@
       case SpvCapabilityStorageImageExtendedFormats:
          break;
 
-      case SpvCapabilityGeometryStreams:
       case SpvCapabilityLinkage:
       case SpvCapabilityVector16:
       case SpvCapabilityFloat16Buffer:
       case SpvCapabilityFloat16:
       case SpvCapabilityInt64Atomics:
-      case SpvCapabilityAtomicStorage:
-      case SpvCapabilityInt16:
       case SpvCapabilityStorageImageMultisample:
       case SpvCapabilityInt8:
       case SpvCapabilitySparseResidency:
       case SpvCapabilityMinLod:
-      case SpvCapabilityTransformFeedback:
          vtn_warn("Unsupported SPIR-V capability: %s",
                   spirv_capability_to_string(cap));
          break;
 
+      case SpvCapabilityAtomicStorage:
+         spv_check_supported(atomic_storage, cap);
+         break;
+
       case SpvCapabilityFloat64:
          spv_check_supported(float64, cap);
          break;
       case SpvCapabilityInt64:
          spv_check_supported(int64, cap);
          break;
+      case SpvCapabilityInt16:
+         spv_check_supported(int16, cap);
+         break;
+
+      case SpvCapabilityTransformFeedback:
+         spv_check_supported(transform_feedback, cap);
+         break;
+
+      case SpvCapabilityGeometryStreams:
+         spv_check_supported(geometry_streams, cap);
+         break;
 
       case SpvCapabilityAddresses:
       case SpvCapabilityKernel:
@@ -3397,6 +3564,12 @@
          spv_check_supported(shader_viewport_index_layer, cap);
          break;
 
+      case SpvCapabilityStorageBuffer8BitAccess:
+      case SpvCapabilityUniformAndStorageBuffer8BitAccess:
+      case SpvCapabilityStoragePushConstant8:
+         spv_check_supported(storage_8bit, cap);
+         break;
+
       case SpvCapabilityInputAttachmentArrayDynamicIndexingEXT:
       case SpvCapabilityUniformTexelBufferArrayDynamicIndexingEXT:
       case SpvCapabilityStorageTexelBufferArrayDynamicIndexingEXT:
@@ -3407,6 +3580,14 @@
          spv_check_supported(runtime_descriptor_array, cap);
          break;
 
+      case SpvCapabilityStencilExportEXT:
+         spv_check_supported(stencil_export, cap);
+         break;
+
+      case SpvCapabilitySampleMaskPostDepthCoverage:
+         spv_check_supported(post_depth_coverage, cap);
+         break;
+
       default:
          vtn_fail("Unhandled capability");
       }
@@ -3474,6 +3655,11 @@
       b->shader->info.fs.early_fragment_tests = true;
       break;
 
+   case SpvExecutionModePostDepthCoverage:
+      vtn_assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
+      b->shader->info.fs.post_depth_coverage = true;
+      break;
+
    case SpvExecutionModeInvocations:
       vtn_assert(b->shader->info.stage == MESA_SHADER_GEOMETRY);
       b->shader->info.gs.invocations = MAX2(1, mode->literals[0]);
@@ -3577,13 +3763,17 @@
       break;
 
    case SpvExecutionModeXfb:
-      vtn_fail("Unhandled execution mode");
+      b->shader->info.has_transform_feedback_varyings = true;
       break;
 
    case SpvExecutionModeVecTypeHint:
    case SpvExecutionModeContractionOff:
       break; /* OpenCL */
 
+   case SpvExecutionModeStencilRefReplacingEXT:
+      vtn_assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
+      break;
+
    default:
       vtn_fail("Unhandled execution mode");
    }
@@ -3733,10 +3923,10 @@
    case SpvOpImageQuerySize: {
       struct vtn_pointer *image =
          vtn_value(b, w[3], vtn_value_type_pointer)->pointer;
-      if (image->mode == vtn_variable_mode_image) {
+      if (glsl_type_is_image(image->type->type)) {
          vtn_handle_image(b, opcode, w, count);
       } else {
-         vtn_assert(image->mode == vtn_variable_mode_sampler);
+         vtn_assert(glsl_type_is_sampler(image->type->type));
          vtn_handle_texture(b, opcode, w, count);
       }
       break;
@@ -3762,7 +3952,7 @@
          vtn_handle_image(b, opcode, w, count);
       } else {
          vtn_assert(pointer->value_type == vtn_value_type_pointer);
-         vtn_handle_ssbo_or_shared_atomic(b, opcode, w, count);
+         vtn_handle_atomics(b, opcode, w, count);
       }
       break;
    }
@@ -3773,7 +3963,7 @@
          vtn_handle_image(b, opcode, w, count);
       } else {
          vtn_assert(pointer->value_type == vtn_value_type_pointer);
-         vtn_handle_ssbo_or_shared_atomic(b, opcode, w, count);
+         vtn_handle_atomics(b, opcode, w, count);
       }
       break;
    }
@@ -4022,19 +4212,36 @@
    b->entry_point_name = entry_point_name;
    b->options = options;
 
-   /* Handle the SPIR-V header (first 4 dwords)  */
-   vtn_assert(word_count > 5);
+   /*
+    * Handle the SPIR-V header (first 5 dwords).
+    * Can't use vtn_assert() as the setjmp(3) target isn't initialized yet.
+    */
+   if (word_count <= 5)
+      goto fail;
 
-   vtn_assert(words[0] == SpvMagicNumber);
-   vtn_assert(words[1] >= 0x10000);
+   if (words[0] != SpvMagicNumber) {
+      vtn_err("words[0] was 0x%x, want 0x%x", words[0], SpvMagicNumber);
+      goto fail;
+   }
+   if (words[1] < 0x10000) {
+      vtn_err("words[1] was 0x%x, want >= 0x10000", words[1]);
+      goto fail;
+   }
+
    /* words[2] == generator magic */
    unsigned value_id_bound = words[3];
-   vtn_assert(words[4] == 0);
+   if (words[4] != 0) {
+      vtn_err("words[4] was %u, want 0", words[4]);
+      goto fail;
+   }
 
    b->value_id_bound = value_id_bound;
    b->values = rzalloc_array(b, struct vtn_value, value_id_bound);
 
    return b;
+ fail:
+   ralloc_free(b);
+   return NULL;
 }
 
 nir_function *
@@ -4111,6 +4318,11 @@
       }
    } while (progress);
 
+   /* We sometimes generate bogus derefs that, while never used, give the
+    * validator a bit of heartburn.  Run dead code elimination to remove them.
+    */
+   nir_opt_dce(b->shader);
+
    vtn_assert(b->entry_point->value_type == vtn_value_type_function);
    nir_function *entry_point = b->entry_point->func->impl->function;
    vtn_assert(entry_point);
diff --git a/src/compiler/spirv/vtn_alu.c b/src/compiler/spirv/vtn_alu.c
index 71e743c..b7987c7 100644
--- a/src/compiler/spirv/vtn_alu.c
+++ b/src/compiler/spirv/vtn_alu.c
@@ -246,15 +246,22 @@
    unsigned dest_components = glsl_get_vector_elements(dest->type);
    vtn_assert(src_bit_size * src_components == dest_bit_size * dest_components);
 
-   nir_ssa_def *dest_chan[4];
+   nir_ssa_def *dest_chan[NIR_MAX_VEC_COMPONENTS];
    if (src_bit_size > dest_bit_size) {
       vtn_assert(src_bit_size % dest_bit_size == 0);
       unsigned divisor = src_bit_size / dest_bit_size;
       for (unsigned comp = 0; comp < src_components; comp++) {
-         vtn_assert(src_bit_size == 64);
-         vtn_assert(dest_bit_size == 32);
-         nir_ssa_def *split =
-            nir_unpack_64_2x32(&b->nb, nir_channel(&b->nb, src, comp));
+         nir_ssa_def *split;
+         if (src_bit_size == 64) {
+            assert(dest_bit_size == 32 || dest_bit_size == 16);
+            split = dest_bit_size == 32 ?
+               nir_unpack_64_2x32(&b->nb, nir_channel(&b->nb, src, comp)) :
+               nir_unpack_64_4x16(&b->nb, nir_channel(&b->nb, src, comp));
+         } else {
+            vtn_assert(src_bit_size == 32);
+            vtn_assert(dest_bit_size == 16);
+            split = nir_unpack_32_2x16(&b->nb, nir_channel(&b->nb, src, comp));
+         }
          for (unsigned i = 0; i < divisor; i++)
             dest_chan[divisor * comp + i] = nir_channel(&b->nb, split, i);
       }
@@ -263,11 +270,17 @@
       unsigned divisor = dest_bit_size / src_bit_size;
       for (unsigned comp = 0; comp < dest_components; comp++) {
          unsigned channels = ((1 << divisor) - 1) << (comp * divisor);
-         nir_ssa_def *src_chan =
-            nir_channels(&b->nb, src, channels);
-         vtn_assert(dest_bit_size == 64);
-         vtn_assert(src_bit_size == 32);
-         dest_chan[comp] = nir_pack_64_2x32(&b->nb, src_chan);
+         nir_ssa_def *src_chan = nir_channels(&b->nb, src, channels);
+         if (dest_bit_size == 64) {
+            assert(src_bit_size == 32 || src_bit_size == 16);
+            dest_chan[comp] = src_bit_size == 32 ?
+               nir_pack_64_2x32(&b->nb, src_chan) :
+               nir_pack_64_4x16(&b->nb, src_chan);
+         } else {
+            vtn_assert(dest_bit_size == 32);
+            vtn_assert(src_bit_size == 16);
+            dest_chan[comp] = nir_pack_32_2x16(&b->nb, src_chan);
+         }
       }
    }
    dest->def = nir_vec(&b->nb, dest_chan, dest_components);
@@ -402,7 +415,7 @@
    case SpvOpDPdyCoarse:   return nir_op_fddy_coarse;
 
    default:
-      vtn_fail("No NIR equivalent");
+      vtn_fail("No NIR equivalent: %u", opcode);
    }
 }
 
@@ -597,23 +610,18 @@
       break;
    }
 
-   case SpvOpFOrdEqual:
-   case SpvOpFOrdNotEqual:
-   case SpvOpFOrdLessThan:
-   case SpvOpFOrdGreaterThan:
-   case SpvOpFOrdLessThanEqual:
-   case SpvOpFOrdGreaterThanEqual: {
+   case SpvOpFOrdNotEqual: {
+      /* For all the SpvOpFOrd* comparisons apart from NotEqual, the value
+       * from the ALU will probably already be false if the operands are not
+       * ordered so we don’t need to handle it specially.
+       */
       bool swap;
       unsigned src_bit_size = glsl_get_bit_size(vtn_src[0]->type);
       unsigned dst_bit_size = glsl_get_bit_size(type);
       nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap,
                                                   src_bit_size, dst_bit_size);
 
-      if (swap) {
-         nir_ssa_def *tmp = src[0];
-         src[0] = src[1];
-         src[1] = tmp;
-      }
+      assert(!swap);
 
       val->ssa->def =
          nir_iand(&b->nb,
@@ -640,6 +648,41 @@
       break;
    }
 
+   case SpvOpBitFieldInsert:
+   case SpvOpBitFieldSExtract:
+   case SpvOpBitFieldUExtract:
+   case SpvOpShiftLeftLogical:
+   case SpvOpShiftRightArithmetic:
+   case SpvOpShiftRightLogical: {
+      bool swap;
+      unsigned src0_bit_size = glsl_get_bit_size(vtn_src[0]->type);
+      unsigned dst_bit_size = glsl_get_bit_size(type);
+      nir_op op = vtn_nir_alu_op_for_spirv_opcode(b, opcode, &swap,
+                                                  src0_bit_size, dst_bit_size);
+
+      assert (op == nir_op_ushr || op == nir_op_ishr || op == nir_op_ishl ||
+              op == nir_op_bitfield_insert || op == nir_op_ubitfield_extract ||
+              op == nir_op_ibitfield_extract);
+
+      for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) {
+         unsigned src_bit_size =
+            nir_alu_type_get_type_size(nir_op_infos[op].input_types[i]);
+         if (src_bit_size == 0)
+            continue;
+         if (src_bit_size != src[i]->bit_size) {
+            assert(src_bit_size == 32);
+            /* Convert the Shift, Offset and Count operands to 32 bits, which is the bitsize
+             * supported by the NIR instructions. See discussion here:
+             *
+             * https://lists.freedesktop.org/archives/mesa-dev/2018-April/193026.html
+             */
+            src[i] = nir_u2u32(&b->nb, src[i]);
+         }
+      }
+      val->ssa->def = nir_build_alu(&b->nb, op, src[0], src[1], src[2], src[3]);
+      break;
+   }
+
    default: {
       bool swap;
       unsigned src_bit_size = glsl_get_bit_size(vtn_src[0]->type);
@@ -653,6 +696,17 @@
          src[1] = tmp;
       }
 
+      switch (op) {
+      case nir_op_ishl:
+      case nir_op_ishr:
+      case nir_op_ushr:
+         if (src[1]->bit_size != 32)
+            src[1] = nir_u2u32(&b->nb, src[1]);
+         break;
+      default:
+         break;
+      }
+
       val->ssa->def = nir_build_alu(&b->nb, op, src[0], src[1], src[2], src[3]);
       break;
    } /* default */
diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c
index ad43741..ed1ab5d 100644
--- a/src/compiler/spirv/vtn_cfg.c
+++ b/src/compiler/spirv/vtn_cfg.c
@@ -25,18 +25,21 @@
 #include "nir/nir_vla.h"
 
 static struct vtn_pointer *
-vtn_pointer_for_image_or_sampler_variable(struct vtn_builder *b,
-                                          struct vtn_variable *var)
+vtn_load_param_pointer(struct vtn_builder *b,
+                       struct vtn_type *param_type,
+                       uint32_t param_idx)
 {
-   assert(var->type->base_type == vtn_base_type_image ||
-          var->type->base_type == vtn_base_type_sampler);
+   struct vtn_type *ptr_type = param_type;
+   if (param_type->base_type != vtn_base_type_pointer) {
+      assert(param_type->base_type == vtn_base_type_image ||
+             param_type->base_type == vtn_base_type_sampler);
+      ptr_type = rzalloc(b, struct vtn_type);
+      ptr_type->base_type = vtn_base_type_pointer;
+      ptr_type->deref = param_type;
+      ptr_type->storage_class = SpvStorageClassUniformConstant;
+   }
 
-   struct vtn_type *ptr_type = rzalloc(b, struct vtn_type);
-   ptr_type->base_type = vtn_base_type_pointer;
-   ptr_type->storage_class = SpvStorageClassUniformConstant;
-   ptr_type->deref = var->type;
-
-   return vtn_pointer_for_variable(b, var, ptr_type);
+   return vtn_pointer_from_ssa(b, nir_load_param(&b->nb, param_idx), ptr_type);
 }
 
 static bool
@@ -56,48 +59,72 @@
       struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_function);
       val->func = b->func;
 
-      const struct vtn_type *func_type =
-         vtn_value(b, w[4], vtn_value_type_type)->type;
+      b->func->type = vtn_value(b, w[4], vtn_value_type_type)->type;
+      const struct vtn_type *func_type = b->func->type;
 
       vtn_assert(func_type->return_type->type == result_type);
 
       nir_function *func =
          nir_function_create(b->shader, ralloc_strdup(b->shader, val->name));
 
-      func->num_params = func_type->length;
-      func->params = ralloc_array(b->shader, nir_parameter, func->num_params);
-      unsigned np = 0;
+      unsigned num_params = func_type->length;
       for (unsigned i = 0; i < func_type->length; i++) {
-         if (func_type->params[i]->base_type == vtn_base_type_pointer &&
-             func_type->params[i]->type == NULL) {
-            func->params[np].type = func_type->params[i]->deref->type;
-            func->params[np].param_type = nir_parameter_inout;
-            np++;
-         } else if (func_type->params[i]->base_type ==
-                    vtn_base_type_sampled_image) {
-            /* Sampled images are actually two parameters */
-            func->params = reralloc(b->shader, func->params,
-                                    nir_parameter, func->num_params++);
-            func->params[np].type = func_type->params[i]->type;
-            func->params[np].param_type = nir_parameter_in;
-            np++;
-            func->params[np].type = glsl_bare_sampler_type();
-            func->params[np].param_type = nir_parameter_in;
-            np++;
+         /* Sampled images are actually two parameters */
+         if (func_type->params[i]->base_type == vtn_base_type_sampled_image)
+            num_params++;
+      }
+
+      /* Add one parameter for the function return value */
+      if (func_type->return_type->base_type != vtn_base_type_void)
+         num_params++;
+
+      func->num_params = num_params;
+      func->params = ralloc_array(b->shader, nir_parameter, num_params);
+
+      unsigned idx = 0;
+      if (func_type->return_type->base_type != vtn_base_type_void) {
+         /* The return value is a regular pointer */
+         func->params[idx++] = (nir_parameter) {
+            .num_components = 1, .bit_size = 32,
+         };
+      }
+
+      for (unsigned i = 0; i < func_type->length; i++) {
+         if (func_type->params[i]->base_type == vtn_base_type_sampled_image) {
+            /* Sampled images are two pointer parameters */
+            func->params[idx++] = (nir_parameter) {
+               .num_components = 1, .bit_size = 32,
+            };
+            func->params[idx++] = (nir_parameter) {
+               .num_components = 1, .bit_size = 32,
+            };
+         } else if (func_type->params[i]->base_type == vtn_base_type_pointer &&
+                    func_type->params[i]->type != NULL) {
+            /* Pointers with an actual storage type get passed by-value */
+            assert(glsl_type_is_vector_or_scalar(func_type->params[i]->type));
+            func->params[idx++] = (nir_parameter) {
+               .num_components =
+                  glsl_get_vector_elements(func_type->params[i]->type),
+               .bit_size = glsl_get_bit_size(func_type->params[i]->type),
+            };
          } else {
-            func->params[np].type = func_type->params[i]->type;
-            func->params[np].param_type = nir_parameter_in;
-            np++;
+            /* Everything else is a regular pointer */
+            func->params[idx++] = (nir_parameter) {
+               .num_components = 1, .bit_size = 32,
+            };
          }
       }
-      assert(np == func->num_params);
-
-      func->return_type = func_type->return_type->type;
+      assert(idx == num_params);
 
       b->func->impl = nir_function_impl_create(func);
+      nir_builder_init(&b->nb, func->impl);
       b->nb.cursor = nir_before_cf_list(&b->func->impl->body);
 
       b->func_param_idx = 0;
+
+      /* The return value is the first parameter */
+      if (func_type->return_type->base_type != vtn_base_type_void)
+         b->func_param_idx++;
       break;
    }
 
@@ -109,92 +136,46 @@
    case SpvOpFunctionParameter: {
       struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
 
-      vtn_assert(b->func_param_idx < b->func->impl->num_params);
-      nir_variable *param = b->func->impl->params[b->func_param_idx++];
+      vtn_assert(b->func_param_idx < b->func->impl->function->num_params);
 
-      if (type->base_type == vtn_base_type_pointer && type->type == NULL) {
-         struct vtn_variable *vtn_var = rzalloc(b, struct vtn_variable);
-         vtn_var->type = type->deref;
-         vtn_var->var = param;
+      if (type->base_type == vtn_base_type_sampled_image) {
+         /* Sampled images are actually two parameters.  The first is the
+          * image and the second is the sampler.
+          */
+         struct vtn_value *val =
+            vtn_push_value(b, w[2], vtn_value_type_sampled_image);
 
-         vtn_assert(vtn_var->type->type == param->type);
+         val->sampled_image = ralloc(b, struct vtn_sampled_image);
+         val->sampled_image->type = type;
 
-         struct vtn_type *without_array = vtn_var->type;
-         while(glsl_type_is_array(without_array->type))
-            without_array = without_array->array_element;
+         struct vtn_type *sampler_type = rzalloc(b, struct vtn_type);
+         sampler_type->base_type = vtn_base_type_sampler;
+         sampler_type->type = glsl_bare_sampler_type();
 
-         if (glsl_type_is_image(without_array->type)) {
-            vtn_var->mode = vtn_variable_mode_image;
-            param->interface_type = without_array->type;
-         } else if (glsl_type_is_sampler(without_array->type)) {
-            vtn_var->mode = vtn_variable_mode_sampler;
-            param->interface_type = without_array->type;
-         } else {
-            vtn_var->mode = vtn_variable_mode_param;
-         }
-
+         val->sampled_image->image =
+            vtn_load_param_pointer(b, type, b->func_param_idx++);
+         val->sampled_image->sampler =
+            vtn_load_param_pointer(b, sampler_type, b->func_param_idx++);
+      } else if (type->base_type == vtn_base_type_pointer &&
+                 type->type != NULL) {
+         /* This is a pointer with an actual storage type */
          struct vtn_value *val =
             vtn_push_value(b, w[2], vtn_value_type_pointer);
-
-         /* Name the parameter so it shows up nicely in NIR */
-         param->name = ralloc_strdup(param, val->name);
-
-         val->pointer = vtn_pointer_for_variable(b, vtn_var, type);
-      } else if (type->base_type == vtn_base_type_image ||
-                 type->base_type == vtn_base_type_sampler ||
-                 type->base_type == vtn_base_type_sampled_image) {
-         struct vtn_variable *vtn_var = rzalloc(b, struct vtn_variable);
-         vtn_var->type = type;
-         vtn_var->var = param;
-         param->interface_type = param->type;
-
-         if (type->base_type == vtn_base_type_sampled_image) {
-            /* Sampled images are actually two parameters.  The first is the
-             * image and the second is the sampler.
-             */
-            struct vtn_value *val =
-               vtn_push_value(b, w[2], vtn_value_type_sampled_image);
-
-            /* Name the parameter so it shows up nicely in NIR */
-            param->name = ralloc_strdup(param, val->name);
-
-            /* Adjust the type of the image variable to the image type */
-            vtn_var->type = type->image;
-
-            /* Now get the sampler parameter and set up its variable */
-            param = b->func->impl->params[b->func_param_idx++];
-            struct vtn_variable *sampler_var = rzalloc(b, struct vtn_variable);
-            sampler_var->type = rzalloc(b, struct vtn_type);
-            sampler_var->type->base_type = vtn_base_type_sampler;
-            sampler_var->type->type = glsl_bare_sampler_type();
-            sampler_var->var = param;
-            param->interface_type = param->type;
-            param->name = ralloc_strdup(param, val->name);
-
-            val->sampled_image = ralloc(b, struct vtn_sampled_image);
-            val->sampled_image->type = type;
-            val->sampled_image->image =
-               vtn_pointer_for_image_or_sampler_variable(b, vtn_var);
-            val->sampled_image->sampler =
-               vtn_pointer_for_image_or_sampler_variable(b, sampler_var);
-         } else {
-            struct vtn_value *val =
-               vtn_push_value(b, w[2], vtn_value_type_pointer);
-
-            /* Name the parameter so it shows up nicely in NIR */
-            param->name = ralloc_strdup(param, val->name);
-
-            val->pointer =
-               vtn_pointer_for_image_or_sampler_variable(b, vtn_var);
-         }
+         nir_ssa_def *ssa_ptr = nir_load_param(&b->nb, b->func_param_idx++);
+         val->pointer = vtn_pointer_from_ssa(b, ssa_ptr, type);
+      } else if (type->base_type == vtn_base_type_pointer ||
+                 type->base_type == vtn_base_type_image ||
+                 type->base_type == vtn_base_type_sampler) {
+         struct vtn_value *val =
+            vtn_push_value(b, w[2], vtn_value_type_pointer);
+         val->pointer =
+            vtn_load_param_pointer(b, type, b->func_param_idx++);
       } else {
          /* We're a regular SSA value. */
-         struct vtn_ssa_value *param_ssa =
-            vtn_local_load(b, nir_deref_var_create(b, param));
-         struct vtn_value *val = vtn_push_ssa(b, w[2], type, param_ssa);
-
-         /* Name the parameter so it shows up nicely in NIR */
-         param->name = ralloc_strdup(param, val->name);
+         nir_ssa_def *param_val = nir_load_param(&b->nb, b->func_param_idx++);
+         nir_deref_instr *deref =
+            nir_build_deref_cast(&b->nb, param_val, nir_var_local, type->type);
+         vtn_push_ssa(b, w[2], type, vtn_local_load(b, deref));
       }
       break;
    }
@@ -643,7 +624,7 @@
    _mesa_hash_table_insert(b->phi_table, w, phi_var);
 
    vtn_push_ssa(b, w[2], type,
-                vtn_local_load(b, nir_deref_var_create(b, phi_var)));
+                vtn_local_load(b, nir_build_deref_var(&b->nb, phi_var)));
 
    return true;
 }
@@ -667,7 +648,7 @@
 
       struct vtn_ssa_value *src = vtn_ssa_value(b, w[i]);
 
-      vtn_local_store(b, src, nir_deref_var_create(b, phi_var));
+      vtn_local_store(b, src, nir_build_deref_var(&b->nb, phi_var));
    }
 
    return true;
@@ -728,9 +709,14 @@
          nir_builder_instr_insert(&b->nb, &block->end_nop->instr);
 
          if ((*block->branch & SpvOpCodeMask) == SpvOpReturnValue) {
+            vtn_fail_if(b->func->type->return_type->base_type ==
+                        vtn_base_type_void,
+                        "Return with a value from a function returning void");
             struct vtn_ssa_value *src = vtn_ssa_value(b, block->branch[1]);
-            vtn_local_store(b, src,
-                            nir_deref_var_create(b, b->nb.impl->return_var));
+            nir_deref_instr *ret_deref =
+               nir_build_deref_cast(&b->nb, nir_load_param(&b->nb, 0),
+                                    nir_var_local, src->type);
+            vtn_local_store(b, src, ret_deref);
          }
 
          if (block->branch_type != vtn_branch_type_none) {
@@ -891,6 +877,7 @@
                   vtn_instruction_handler instruction_handler)
 {
    nir_builder_init(&b->nb, func->impl);
+   b->func = func;
    b->nb.cursor = nir_after_cf_list(&func->impl->body);
    b->has_loop_continue = false;
    b->phi_table = _mesa_hash_table_create(b, _mesa_hash_pointer,
diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c
index 6fa759b..06a49e4 100644
--- a/src/compiler/spirv/vtn_glsl450.c
+++ b/src/compiler/spirv/vtn_glsl450.c
@@ -26,6 +26,9 @@
  */
 
 #include <math.h>
+
+#include "nir/nir_builtin_builder.h"
+
 #include "vtn_private.h"
 #include "GLSL.std.450.h"
 
@@ -36,7 +39,7 @@
 static nir_ssa_def *
 build_mat2_det(nir_builder *b, nir_ssa_def *col[2])
 {
-   unsigned swiz[4] = {1, 0, 0, 0};
+   unsigned swiz[2] = {1, 0 };
    nir_ssa_def *p = nir_fmul(b, col[0], nir_swizzle(b, col[1], swiz, 2, true));
    return nir_fsub(b, nir_channel(b, p, 0), nir_channel(b, p, 1));
 }
@@ -44,8 +47,8 @@
 static nir_ssa_def *
 build_mat3_det(nir_builder *b, nir_ssa_def *col[3])
 {
-   unsigned yzx[4] = {1, 2, 0, 0};
-   unsigned zxy[4] = {2, 0, 1, 0};
+   unsigned yzx[3] = {1, 2, 0 };
+   unsigned zxy[3] = {2, 0, 1 };
 
    nir_ssa_def *prod0 =
       nir_fmul(b, col[0],
@@ -168,26 +171,6 @@
    return val;
 }
 
-static nir_ssa_def*
-build_length(nir_builder *b, nir_ssa_def *vec)
-{
-   switch (vec->num_components) {
-   case 1: return nir_fsqrt(b, nir_fmul(b, vec, vec));
-   case 2: return nir_fsqrt(b, nir_fdot2(b, vec, vec));
-   case 3: return nir_fsqrt(b, nir_fdot3(b, vec, vec));
-   case 4: return nir_fsqrt(b, nir_fdot4(b, vec, vec));
-   default:
-      unreachable("Invalid number of components");
-   }
-}
-
-static inline nir_ssa_def *
-build_fclamp(nir_builder *b,
-             nir_ssa_def *x, nir_ssa_def *min_val, nir_ssa_def *max_val)
-{
-   return nir_fmin(b, nir_fmax(b, x, min_val), max_val);
-}
-
 /**
  * Return e^x.
  */
@@ -540,10 +523,10 @@
 
    switch (entrypoint) {
    case GLSLstd450Radians:
-      val->ssa->def = nir_fmul(nb, src[0], nir_imm_float(nb, 0.01745329251));
+      val->ssa->def = nir_radians(nb, src[0]);
       return;
    case GLSLstd450Degrees:
-      val->ssa->def = nir_fmul(nb, src[0], nir_imm_float(nb, 57.2957795131));
+      val->ssa->def = nir_degrees(nb, src[0]);
       return;
    case GLSLstd450Tan:
       val->ssa->def = nir_fdiv(nb, nir_fsin(nb, src[0]),
@@ -554,8 +537,8 @@
       nir_ssa_def *sign = nir_fsign(nb, src[0]);
       nir_ssa_def *abs = nir_fabs(nb, src[0]);
       val->ssa->def = nir_fmul(nb, sign, nir_ffract(nb, abs));
-      nir_store_deref_var(nb, vtn_nir_deref(b, w[6]),
-                          nir_fmul(nb, sign, nir_ffloor(nb, abs)), 0xf);
+      nir_store_deref(nb, vtn_nir_deref(b, w[6]),
+                      nir_fmul(nb, sign, nir_ffloor(nb, abs)), 0xf);
       return;
    }
 
@@ -573,13 +556,13 @@
       return;
 
    case GLSLstd450Length:
-      val->ssa->def = build_length(nb, src[0]);
+      val->ssa->def = nir_fast_length(nb, src[0]);
       return;
    case GLSLstd450Distance:
-      val->ssa->def = build_length(nb, nir_fsub(nb, src[0], src[1]));
+      val->ssa->def = nir_fast_distance(nb, src[0], src[1]);
       return;
    case GLSLstd450Normalize:
-      val->ssa->def = nir_fdiv(nb, src[0], build_length(nb, src[0]));
+      val->ssa->def = nir_fast_normalize(nb, src[0]);
       return;
 
    case GLSLstd450Exp:
@@ -592,37 +575,22 @@
 
    case GLSLstd450FClamp:
    case GLSLstd450NClamp:
-      val->ssa->def = build_fclamp(nb, src[0], src[1], src[2]);
+      val->ssa->def = nir_fclamp(nb, src[0], src[1], src[2]);
       return;
    case GLSLstd450UClamp:
-      val->ssa->def = nir_umin(nb, nir_umax(nb, src[0], src[1]), src[2]);
+      val->ssa->def = nir_uclamp(nb, src[0], src[1], src[2]);
       return;
    case GLSLstd450SClamp:
-      val->ssa->def = nir_imin(nb, nir_imax(nb, src[0], src[1]), src[2]);
+      val->ssa->def = nir_iclamp(nb, src[0], src[1], src[2]);
       return;
 
    case GLSLstd450Cross: {
-      unsigned yzx[4] = { 1, 2, 0, 0 };
-      unsigned zxy[4] = { 2, 0, 1, 0 };
-      val->ssa->def =
-         nir_fsub(nb, nir_fmul(nb, nir_swizzle(nb, src[0], yzx, 3, true),
-                                   nir_swizzle(nb, src[1], zxy, 3, true)),
-                      nir_fmul(nb, nir_swizzle(nb, src[0], zxy, 3, true),
-                                   nir_swizzle(nb, src[1], yzx, 3, true)));
+      val->ssa->def = nir_cross(nb, src[0], src[1]);
       return;
    }
 
    case GLSLstd450SmoothStep: {
-      /* t = clamp((x - edge0) / (edge1 - edge0), 0, 1) */
-      nir_ssa_def *t =
-         build_fclamp(nb, nir_fdiv(nb, nir_fsub(nb, src[2], src[0]),
-                                       nir_fsub(nb, src[1], src[0])),
-                          NIR_IMM_FP(nb, 0.0), NIR_IMM_FP(nb, 1.0));
-      /* result = t * t * (3 - 2 * t) */
-      val->ssa->def =
-         nir_fmul(nb, t, nir_fmul(nb, t,
-            nir_fsub(nb, NIR_IMM_FP(nb, 3.0),
-                         nir_fmul(nb, NIR_IMM_FP(nb, 2.0), t))));
+      val->ssa->def = nir_smoothstep(nb, src[0], src[1], src[2]);
       return;
    }
 
@@ -749,7 +717,7 @@
          val->ssa->def = build_frexp64(nb, src[0], &exponent);
       else
          val->ssa->def = build_frexp32(nb, src[0], &exponent);
-      nir_store_deref_var(nb, vtn_nir_deref(b, w[6]), exponent, 0xf);
+      nir_store_deref(nb, vtn_nir_deref(b, w[6]), exponent, 0xf);
       return;
    }
 
@@ -786,13 +754,13 @@
    nir_intrinsic_op op;
    switch (opcode) {
    case GLSLstd450InterpolateAtCentroid:
-      op = nir_intrinsic_interp_var_at_centroid;
+      op = nir_intrinsic_interp_deref_at_centroid;
       break;
    case GLSLstd450InterpolateAtSample:
-      op = nir_intrinsic_interp_var_at_sample;
+      op = nir_intrinsic_interp_deref_at_sample;
       break;
    case GLSLstd450InterpolateAtOffset:
-      op = nir_intrinsic_interp_var_at_offset;
+      op = nir_intrinsic_interp_deref_at_offset;
       break;
    default:
       vtn_fail("Invalid opcode");
@@ -800,27 +768,56 @@
 
    nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->nb.shader, op);
 
-   nir_deref_var *deref = vtn_nir_deref(b, w[5]);
-   intrin->variables[0] = nir_deref_var_clone(deref, intrin);
+   struct vtn_pointer *ptr =
+      vtn_value(b, w[5], vtn_value_type_pointer)->pointer;
+   nir_deref_instr *deref = vtn_pointer_to_deref(b, ptr);
+
+   /* If the value we are interpolating has an index into a vector then
+    * interpolate the vector and index the result of that instead. This is
+    * necessary because the index will get generated as a series of nir_bcsel
+    * instructions so it would no longer be an input variable.
+    */
+   const bool vec_array_deref = deref->deref_type == nir_deref_type_array &&
+      glsl_type_is_vector(nir_deref_instr_parent(deref)->type);
+
+   nir_deref_instr *vec_deref = NULL;
+   if (vec_array_deref) {
+      vec_deref = deref;
+      deref = nir_deref_instr_parent(deref);
+   }
+   intrin->src[0] = nir_src_for_ssa(&deref->dest.ssa);
 
    switch (opcode) {
    case GLSLstd450InterpolateAtCentroid:
       break;
    case GLSLstd450InterpolateAtSample:
    case GLSLstd450InterpolateAtOffset:
-      intrin->src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
+      intrin->src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
       break;
    default:
       vtn_fail("Invalid opcode");
    }
 
-   intrin->num_components = glsl_get_vector_elements(dest_type);
+   intrin->num_components = glsl_get_vector_elements(deref->type);
    nir_ssa_dest_init(&intrin->instr, &intrin->dest,
-                     glsl_get_vector_elements(dest_type),
-                     glsl_get_bit_size(dest_type), NULL);
-   val->ssa->def = &intrin->dest.ssa;
+                     glsl_get_vector_elements(deref->type),
+                     glsl_get_bit_size(deref->type), NULL);
 
    nir_builder_instr_insert(&b->nb, &intrin->instr);
+
+   if (vec_array_deref) {
+      assert(vec_deref);
+      nir_const_value *const_index = nir_src_as_const_value(vec_deref->arr.index);
+      if (const_index) {
+         val->ssa->def = vtn_vector_extract(b, &intrin->dest.ssa,
+                                            const_index->u32[0]);
+      } else {
+         val->ssa->def = vtn_vector_extract_dynamic(b, &intrin->dest.ssa,
+                                                    vec_deref->arr.index.ssa);
+      }
+   } else {
+      val->ssa->def = &intrin->dest.ssa;
+   }
 }
 
 bool
diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index b501bbf..b5199bd 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -51,6 +51,10 @@
                const char *fmt, ...) PRINTFLIKE(4, 5);
 #define vtn_warn(...) _vtn_warn(b, __FILE__, __LINE__, __VA_ARGS__)
 
+void _vtn_err(struct vtn_builder *b, const char *file, unsigned line,
+               const char *fmt, ...) PRINTFLIKE(4, 5);
+#define vtn_err(...) _vtn_err(b, __FILE__, __LINE__, __VA_ARGS__)
+
 /** Fail SPIR-V parsing
  *
  * This function logs an error and then bails out of the shader compile using
@@ -217,6 +221,8 @@
 struct vtn_function {
    struct exec_node node;
 
+   struct vtn_type *type;
+
    bool referenced;
    bool emitted;
 
@@ -401,12 +407,10 @@
 enum vtn_variable_mode {
    vtn_variable_mode_local,
    vtn_variable_mode_global,
-   vtn_variable_mode_param,
+   vtn_variable_mode_uniform,
    vtn_variable_mode_ubo,
    vtn_variable_mode_ssbo,
    vtn_variable_mode_push_constant,
-   vtn_variable_mode_image,
-   vtn_variable_mode_sampler,
    vtn_variable_mode_workgroup,
    vtn_variable_mode_input,
    vtn_variable_mode_output,
@@ -430,10 +434,17 @@
    /** The referenced variable, if known
     *
     * This field may be NULL if the pointer uses a (block_index, offset) pair
-    * instead of an access chain.
+    * instead of an access chain or if the access chain starts at a deref.
     */
    struct vtn_variable *var;
 
+   /** The deref at the base of the chain
+    *
+    * This field may be NULL if the pointer uses a (block_index, offset) pair
+    * instead of an access chain or if the access chain starts at a variable.
+    */
+   nir_deref_instr *deref;
+
    /** An access chain describing how to get from var to the referenced data
     *
     * This field may be NULL if the pointer references the entire variable or
@@ -453,11 +464,12 @@
 
    unsigned descriptor_set;
    unsigned binding;
+   bool explicit_binding;
+   unsigned offset;
    unsigned input_attachment_index;
    bool patch;
 
    nir_variable *var;
-   nir_variable **members;
 
    int shared_location;
 
@@ -664,22 +676,23 @@
 nir_ssa_def *vtn_vector_insert_dynamic(struct vtn_builder *b, nir_ssa_def *src,
                                        nir_ssa_def *insert, nir_ssa_def *index);
 
-nir_deref_var *vtn_nir_deref(struct vtn_builder *b, uint32_t id);
+nir_deref_instr *vtn_nir_deref(struct vtn_builder *b, uint32_t id);
 
 struct vtn_pointer *vtn_pointer_for_variable(struct vtn_builder *b,
                                              struct vtn_variable *var,
                                              struct vtn_type *ptr_type);
 
-nir_deref_var *vtn_pointer_to_deref(struct vtn_builder *b,
-                                    struct vtn_pointer *ptr);
+nir_deref_instr *vtn_pointer_to_deref(struct vtn_builder *b,
+                                      struct vtn_pointer *ptr);
 nir_ssa_def *
 vtn_pointer_to_offset(struct vtn_builder *b, struct vtn_pointer *ptr,
-                      nir_ssa_def **index_out, unsigned *end_idx_out);
+                      nir_ssa_def **index_out);
 
-struct vtn_ssa_value *vtn_local_load(struct vtn_builder *b, nir_deref_var *src);
+struct vtn_ssa_value *
+vtn_local_load(struct vtn_builder *b, nir_deref_instr *src);
 
 void vtn_local_store(struct vtn_builder *b, struct vtn_ssa_value *src,
-                     nir_deref_var *dest);
+                     nir_deref_instr *dest);
 
 struct vtn_ssa_value *
 vtn_variable_load(struct vtn_builder *b, struct vtn_pointer *src);
diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c
index fd8ab7f..8dab86a 100644
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -27,6 +27,7 @@
 
 #include "vtn_private.h"
 #include "spirv_info.h"
+#include "nir_deref.h"
 
 static struct vtn_access_chain *
 vtn_access_chain_create(struct vtn_builder *b, unsigned length)
@@ -63,6 +64,7 @@
 {
    return ptr->mode == vtn_variable_mode_ubo ||
           ptr->mode == vtn_variable_mode_ssbo ||
+          ptr->mode == vtn_variable_mode_push_constant ||
           (ptr->mode == vtn_variable_mode_workgroup &&
            b->options->lower_workgroup_access_to_offsets);
 }
@@ -110,6 +112,7 @@
    ptr->mode = base->mode;
    ptr->type = type;
    ptr->var = base->var;
+   ptr->deref = base->deref;
    ptr->chain = chain;
 
    return ptr;
@@ -268,6 +271,12 @@
          }
 
          offset = nir_imm_int(&b->nb, base->var->shared_location);
+      } else if (base->mode == vtn_variable_mode_push_constant) {
+         /* Push constants neither need nor have a block index */
+         vtn_assert(!block_index);
+
+         /* Start off at the start of the push constant block. */
+         offset = nir_imm_int(&b->nb, 0);
       } else {
          /* The code above should have ensured a block_index when needed. */
          vtn_assert(block_index);
@@ -349,22 +358,6 @@
    }
 }
 
-/* Crawls a chain of array derefs and rewrites the types so that the
- * lengths stay the same but the terminal type is the one given by
- * tail_type.  This is useful for split structures.
- */
-static void
-rewrite_deref_types(struct vtn_builder *b, nir_deref *deref,
-                    const struct glsl_type *type)
-{
-   deref->type = type;
-   if (deref->child) {
-      vtn_assert(deref->child->deref_type == nir_deref_type_array);
-      vtn_assert(glsl_type_is_array(deref->type));
-      rewrite_deref_types(b, deref->child, glsl_get_array_element(type));
-   }
-}
-
 struct vtn_pointer *
 vtn_pointer_for_variable(struct vtn_builder *b,
                          struct vtn_variable *var, struct vtn_type *ptr_type)
@@ -381,158 +374,99 @@
    return pointer;
 }
 
-nir_deref_var *
+/* Returns an atomic_uint type based on the original uint type. The returned
+ * type will be equivalent to the original one but will have an atomic_uint
+ * type as leaf instead of an uint.
+ *
+ * Manages uint scalars, arrays, and arrays of arrays of any nested depth.
+ */
+static const struct glsl_type *
+repair_atomic_type(const struct glsl_type *type)
+{
+   assert(glsl_get_base_type(glsl_without_array(type)) == GLSL_TYPE_UINT);
+   assert(glsl_type_is_scalar(glsl_without_array(type)));
+
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *atomic =
+         repair_atomic_type(glsl_get_array_element(type));
+
+      return glsl_array_type(atomic, glsl_get_length(type));
+   } else {
+      return glsl_atomic_uint_type();
+   }
+}
+
+nir_deref_instr *
 vtn_pointer_to_deref(struct vtn_builder *b, struct vtn_pointer *ptr)
 {
    /* Do on-the-fly copy propagation for samplers. */
-   if (ptr->var->copy_prop_sampler)
+   if (ptr->var && ptr->var->copy_prop_sampler)
       return vtn_pointer_to_deref(b, ptr->var->copy_prop_sampler);
 
-   nir_deref_var *deref_var;
-   if (ptr->var->var) {
-      deref_var = nir_deref_var_create(b, ptr->var->var);
-      /* Raw variable access */
-      if (!ptr->chain)
-         return deref_var;
+   nir_deref_instr *tail;
+   if (ptr->deref) {
+      tail = ptr->deref;
    } else {
-      vtn_assert(ptr->var->members);
-      /* Create the deref_var manually.  It will get filled out later. */
-      deref_var = rzalloc(b, nir_deref_var);
-      deref_var->deref.deref_type = nir_deref_type_var;
+      assert(ptr->var && ptr->var->var);
+      tail = nir_build_deref_var(&b->nb, ptr->var->var);
    }
 
+   /* Raw variable access */
+   if (!ptr->chain)
+      return tail;
+
    struct vtn_access_chain *chain = ptr->chain;
    vtn_assert(chain);
 
-   struct vtn_type *deref_type = ptr->var->type;
-   nir_deref *tail = &deref_var->deref;
-   nir_variable **members = ptr->var->members;
-
    for (unsigned i = 0; i < chain->length; i++) {
-      enum glsl_base_type base_type = glsl_get_base_type(deref_type->type);
-      switch (base_type) {
-      case GLSL_TYPE_UINT:
-      case GLSL_TYPE_INT:
-      case GLSL_TYPE_UINT16:
-      case GLSL_TYPE_INT16:
-      case GLSL_TYPE_UINT8:
-      case GLSL_TYPE_INT8:
-      case GLSL_TYPE_UINT64:
-      case GLSL_TYPE_INT64:
-      case GLSL_TYPE_FLOAT:
-      case GLSL_TYPE_FLOAT16:
-      case GLSL_TYPE_DOUBLE:
-      case GLSL_TYPE_BOOL:
-      case GLSL_TYPE_ARRAY: {
-         deref_type = deref_type->array_element;
-
-         nir_deref_array *deref_arr = nir_deref_array_create(b);
-         deref_arr->deref.type = deref_type->type;
-
-         if (chain->link[i].mode == vtn_access_mode_literal) {
-            deref_arr->deref_array_type = nir_deref_array_type_direct;
-            deref_arr->base_offset = chain->link[i].id;
-         } else {
-            vtn_assert(chain->link[i].mode == vtn_access_mode_id);
-            deref_arr->deref_array_type = nir_deref_array_type_indirect;
-            deref_arr->base_offset = 0;
-            deref_arr->indirect =
-               nir_src_for_ssa(vtn_ssa_value(b, chain->link[i].id)->def);
-         }
-         tail->child = &deref_arr->deref;
-         tail = tail->child;
-         break;
-      }
-
-      case GLSL_TYPE_STRUCT: {
+      if (glsl_type_is_struct(tail->type)) {
          vtn_assert(chain->link[i].mode == vtn_access_mode_literal);
          unsigned idx = chain->link[i].id;
-         deref_type = deref_type->members[idx];
-         if (members) {
-            /* This is a pre-split structure. */
-            deref_var->var = members[idx];
-            rewrite_deref_types(b, &deref_var->deref, members[idx]->type);
-            vtn_assert(tail->type == deref_type->type);
-            members = NULL;
+         tail = nir_build_deref_struct(&b->nb, tail, idx);
+      } else {
+         nir_ssa_def *index;
+         if (chain->link[i].mode == vtn_access_mode_literal) {
+            index = nir_imm_int(&b->nb, chain->link[i].id);
          } else {
-            nir_deref_struct *deref_struct = nir_deref_struct_create(b, idx);
-            deref_struct->deref.type = deref_type->type;
-            tail->child = &deref_struct->deref;
-            tail = tail->child;
+            vtn_assert(chain->link[i].mode == vtn_access_mode_id);
+            index = vtn_ssa_value(b, chain->link[i].id)->def;
          }
-         break;
-      }
-      default:
-         vtn_fail("Invalid type for deref");
+         tail = nir_build_deref_array(&b->nb, tail, index);
       }
    }
 
-   vtn_assert(members == NULL);
-   return deref_var;
+   return tail;
 }
 
 static void
-_vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref,
-                      nir_deref *tail, struct vtn_ssa_value *inout)
+_vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_instr *deref,
+                      struct vtn_ssa_value *inout)
 {
-   /* The deref tail may contain a deref to select a component of a vector (in
-    * other words, it might not be an actual tail) so we have to save it away
-    * here since we overwrite it later.
-    */
-   nir_deref *old_child = tail->child;
-
-   if (glsl_type_is_vector_or_scalar(tail->type)) {
-      /* Terminate the deref chain in case there is one more link to pick
-       * off a component of the vector.
-       */
-      tail->child = NULL;
-
-      nir_intrinsic_op op = load ? nir_intrinsic_load_var :
-                                   nir_intrinsic_store_var;
-
-      nir_intrinsic_instr *intrin = nir_intrinsic_instr_create(b->shader, op);
-      intrin->variables[0] = nir_deref_var_clone(deref, intrin);
-      intrin->num_components = glsl_get_vector_elements(tail->type);
-
+   if (glsl_type_is_vector_or_scalar(deref->type)) {
       if (load) {
-         nir_ssa_dest_init(&intrin->instr, &intrin->dest,
-                           intrin->num_components,
-                           glsl_get_bit_size(tail->type),
-                           NULL);
-         inout->def = &intrin->dest.ssa;
+         inout->def = nir_load_deref(&b->nb, deref);
       } else {
-         nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1);
-         intrin->src[0] = nir_src_for_ssa(inout->def);
+         nir_store_deref(&b->nb, deref, inout->def, ~0);
       }
-
-      nir_builder_instr_insert(&b->nb, &intrin->instr);
-   } else if (glsl_get_base_type(tail->type) == GLSL_TYPE_ARRAY ||
-              glsl_type_is_matrix(tail->type)) {
-      unsigned elems = glsl_get_length(tail->type);
-      nir_deref_array *deref_arr = nir_deref_array_create(b);
-      deref_arr->deref_array_type = nir_deref_array_type_direct;
-      deref_arr->deref.type = glsl_get_array_element(tail->type);
-      tail->child = &deref_arr->deref;
+   } else if (glsl_type_is_array(deref->type) ||
+              glsl_type_is_matrix(deref->type)) {
+      unsigned elems = glsl_get_length(deref->type);
       for (unsigned i = 0; i < elems; i++) {
-         deref_arr->base_offset = i;
-         _vtn_local_load_store(b, load, deref, tail->child, inout->elems[i]);
+         nir_deref_instr *child =
+            nir_build_deref_array(&b->nb, deref, nir_imm_int(&b->nb, i));
+         _vtn_local_load_store(b, load, child, inout->elems[i]);
       }
    } else {
-      vtn_assert(glsl_get_base_type(tail->type) == GLSL_TYPE_STRUCT);
-      unsigned elems = glsl_get_length(tail->type);
-      nir_deref_struct *deref_struct = nir_deref_struct_create(b, 0);
-      tail->child = &deref_struct->deref;
+      vtn_assert(glsl_type_is_struct(deref->type));
+      unsigned elems = glsl_get_length(deref->type);
       for (unsigned i = 0; i < elems; i++) {
-         deref_struct->index = i;
-         deref_struct->deref.type = glsl_get_struct_field(tail->type, i);
-         _vtn_local_load_store(b, load, deref, tail->child, inout->elems[i]);
+         nir_deref_instr *child = nir_build_deref_struct(&b->nb, deref, i);
+         _vtn_local_load_store(b, load, child, inout->elems[i]);
       }
    }
-
-   tail->child = old_child;
 }
 
-nir_deref_var *
+nir_deref_instr *
 vtn_nir_deref(struct vtn_builder *b, uint32_t id)
 {
    struct vtn_pointer *ptr = vtn_value(b, id, vtn_value_type_pointer)->pointer;
@@ -544,32 +478,35 @@
  * selecting which component due to OpAccessChain supporting per-component
  * indexing in SPIR-V.
  */
-static nir_deref *
-get_deref_tail(nir_deref_var *deref)
+static nir_deref_instr *
+get_deref_tail(nir_deref_instr *deref)
 {
-   nir_deref *cur = &deref->deref;
-   while (!glsl_type_is_vector_or_scalar(cur->type) && cur->child)
-      cur = cur->child;
+   if (deref->deref_type != nir_deref_type_array)
+      return deref;
 
-   return cur;
+   nir_deref_instr *parent =
+      nir_instr_as_deref(deref->parent.ssa->parent_instr);
+
+   if (glsl_type_is_vector(parent->type))
+      return parent;
+   else
+      return deref;
 }
 
 struct vtn_ssa_value *
-vtn_local_load(struct vtn_builder *b, nir_deref_var *src)
+vtn_local_load(struct vtn_builder *b, nir_deref_instr *src)
 {
-   nir_deref *src_tail = get_deref_tail(src);
+   nir_deref_instr *src_tail = get_deref_tail(src);
    struct vtn_ssa_value *val = vtn_create_ssa_value(b, src_tail->type);
-   _vtn_local_load_store(b, true, src, src_tail, val);
+   _vtn_local_load_store(b, true, src_tail, val);
 
-   if (src_tail->child) {
-      nir_deref_array *vec_deref = nir_deref_as_array(src_tail->child);
-      vtn_assert(vec_deref->deref.child == NULL);
-      val->type = vec_deref->deref.type;
-      if (vec_deref->deref_array_type == nir_deref_array_type_direct)
-         val->def = vtn_vector_extract(b, val->def, vec_deref->base_offset);
+   if (src_tail != src) {
+      val->type = src->type;
+      nir_const_value *const_index = nir_src_as_const_value(src->arr.index);
+      if (const_index)
+         val->def = vtn_vector_extract(b, val->def, const_index->u32[0]);
       else
-         val->def = vtn_vector_extract_dynamic(b, val->def,
-                                               vec_deref->indirect.ssa);
+         val->def = vtn_vector_extract_dynamic(b, val->def, src->arr.index.ssa);
    }
 
    return val;
@@ -577,93 +514,40 @@
 
 void
 vtn_local_store(struct vtn_builder *b, struct vtn_ssa_value *src,
-                nir_deref_var *dest)
+                nir_deref_instr *dest)
 {
-   nir_deref *dest_tail = get_deref_tail(dest);
+   nir_deref_instr *dest_tail = get_deref_tail(dest);
 
-   if (dest_tail->child) {
+   if (dest_tail != dest) {
       struct vtn_ssa_value *val = vtn_create_ssa_value(b, dest_tail->type);
-      _vtn_local_load_store(b, true, dest, dest_tail, val);
-      nir_deref_array *deref = nir_deref_as_array(dest_tail->child);
-      vtn_assert(deref->deref.child == NULL);
-      if (deref->deref_array_type == nir_deref_array_type_direct)
+      _vtn_local_load_store(b, true, dest_tail, val);
+
+      nir_const_value *const_index = nir_src_as_const_value(dest->arr.index);
+      if (const_index)
          val->def = vtn_vector_insert(b, val->def, src->def,
-                                      deref->base_offset);
+                                      const_index->u32[0]);
       else
          val->def = vtn_vector_insert_dynamic(b, val->def, src->def,
-                                              deref->indirect.ssa);
-      _vtn_local_load_store(b, false, dest, dest_tail, val);
+                                              dest->arr.index.ssa);
+      _vtn_local_load_store(b, false, dest_tail, val);
    } else {
-      _vtn_local_load_store(b, false, dest, dest_tail, src);
+      _vtn_local_load_store(b, false, dest_tail, src);
    }
 }
 
 nir_ssa_def *
 vtn_pointer_to_offset(struct vtn_builder *b, struct vtn_pointer *ptr,
-                      nir_ssa_def **index_out, unsigned *end_idx_out)
+                      nir_ssa_def **index_out)
 {
-   if (vtn_pointer_uses_ssa_offset(b, ptr)) {
-      if (!ptr->offset) {
-         struct vtn_access_chain chain = {
-            .length = 0,
-         };
-         ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
-      }
-      *index_out = ptr->block_index;
-      return ptr->offset;
+   assert(vtn_pointer_uses_ssa_offset(b, ptr));
+   if (!ptr->offset) {
+      struct vtn_access_chain chain = {
+         .length = 0,
+      };
+      ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
    }
-
-   vtn_assert(ptr->mode == vtn_variable_mode_push_constant);
-   *index_out = NULL;
-
-   unsigned idx = 0;
-   struct vtn_type *type = ptr->var->type;
-   nir_ssa_def *offset = nir_imm_int(&b->nb, 0);
-
-   if (ptr->chain) {
-      for (; idx < ptr->chain->length; idx++) {
-         enum glsl_base_type base_type = glsl_get_base_type(type->type);
-         switch (base_type) {
-         case GLSL_TYPE_UINT:
-         case GLSL_TYPE_INT:
-         case GLSL_TYPE_UINT16:
-         case GLSL_TYPE_INT16:
-         case GLSL_TYPE_UINT8:
-         case GLSL_TYPE_INT8:
-         case GLSL_TYPE_UINT64:
-         case GLSL_TYPE_INT64:
-         case GLSL_TYPE_FLOAT:
-         case GLSL_TYPE_FLOAT16:
-         case GLSL_TYPE_DOUBLE:
-         case GLSL_TYPE_BOOL:
-         case GLSL_TYPE_ARRAY:
-            offset = nir_iadd(&b->nb, offset,
-                              vtn_access_link_as_ssa(b, ptr->chain->link[idx],
-                                                     type->stride));
-
-            type = type->array_element;
-            break;
-
-         case GLSL_TYPE_STRUCT: {
-            vtn_assert(ptr->chain->link[idx].mode == vtn_access_mode_literal);
-            unsigned member = ptr->chain->link[idx].id;
-            offset = nir_iadd(&b->nb, offset,
-                              nir_imm_int(&b->nb, type->offsets[member]));
-            type = type->members[member];
-            break;
-         }
-
-         default:
-            vtn_fail("Invalid type for deref");
-         }
-      }
-   }
-
-   vtn_assert(type == ptr->type);
-   if (end_idx_out)
-      *end_idx_out = idx;
-
-   return offset;
+   *index_out = ptr->block_index;
+   return ptr->offset;
 }
 
 /* Tries to compute the size of an interface block based on the strides and
@@ -721,31 +605,6 @@
 }
 
 static void
-vtn_access_chain_get_offset_size(struct vtn_builder *b,
-                                 struct vtn_access_chain *chain,
-                                 struct vtn_type *type,
-                                 unsigned *access_offset,
-                                 unsigned *access_size)
-{
-   *access_offset = 0;
-
-   for (unsigned i = 0; i < chain->length; i++) {
-      if (chain->link[i].mode != vtn_access_mode_literal)
-         break;
-
-      if (glsl_type_is_struct(type->type)) {
-         *access_offset += type->offsets[chain->link[i].id];
-         type = type->members[chain->link[i].id];
-      } else {
-         *access_offset += type->stride * chain->link[i].id;
-         type = type->array_element;
-      }
-   }
-
-   *access_size = vtn_type_block_size(b, type);
-}
-
-static void
 _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load,
                      nir_ssa_def *index, nir_ssa_def *offset,
                      unsigned access_offset, unsigned access_size,
@@ -795,13 +654,9 @@
 _vtn_block_load_store(struct vtn_builder *b, nir_intrinsic_op op, bool load,
                       nir_ssa_def *index, nir_ssa_def *offset,
                       unsigned access_offset, unsigned access_size,
-                      struct vtn_access_chain *chain, unsigned chain_idx,
                       struct vtn_type *type, struct vtn_ssa_value **inout)
 {
-   if (chain && chain_idx >= chain->length)
-      chain = NULL;
-
-   if (load && chain == NULL && *inout == NULL)
+   if (load && *inout == NULL)
       *inout = vtn_create_ssa_value(b, type->type);
 
    enum glsl_base_type base_type = glsl_get_base_type(type->type);
@@ -903,7 +758,6 @@
             nir_iadd(&b->nb, offset, nir_imm_int(&b->nb, i * type->stride));
          _vtn_block_load_store(b, op, load, index, elem_off,
                                access_offset, access_size,
-                               NULL, 0,
                                type->array_element, &(*inout)->elems[i]);
       }
       return;
@@ -916,7 +770,6 @@
             nir_iadd(&b->nb, offset, nir_imm_int(&b->nb, type->offsets[i]));
          _vtn_block_load_store(b, op, load, index, elem_off,
                                access_offset, access_size,
-                               NULL, 0,
                                type->members[i], &(*inout)->elems[i]);
       }
       return;
@@ -941,8 +794,7 @@
       break;
    case vtn_variable_mode_push_constant:
       op = nir_intrinsic_load_push_constant;
-      vtn_access_chain_get_offset_size(b, src->chain, src->var->type,
-                                       &access_offset, &access_size);
+      access_size = b->shader->num_uniforms;
       break;
    case vtn_variable_mode_workgroup:
       op = nir_intrinsic_load_shared;
@@ -952,13 +804,12 @@
    }
 
    nir_ssa_def *offset, *index = NULL;
-   unsigned chain_idx;
-   offset = vtn_pointer_to_offset(b, src, &index, &chain_idx);
+   offset = vtn_pointer_to_offset(b, src, &index);
 
    struct vtn_ssa_value *value = NULL;
    _vtn_block_load_store(b, op, true, index, offset,
                          access_offset, access_size,
-                         src->chain, chain_idx, src->type, &value);
+                         src->type, &value);
    return value;
 }
 
@@ -979,11 +830,10 @@
    }
 
    nir_ssa_def *offset, *index = NULL;
-   unsigned chain_idx;
-   offset = vtn_pointer_to_offset(b, dst, &index, &chain_idx);
+   offset = vtn_pointer_to_offset(b, dst, &index);
 
    _vtn_block_load_store(b, op, false, index, offset,
-                         0, 0, dst->chain, chain_idx, dst->type, &src);
+                         0, 0, dst->type, &src);
 }
 
 static void
@@ -1354,76 +1204,88 @@
       *location = SYSTEM_VALUE_SUBGROUP_LT_MASK,
       set_mode_system_value(b, mode);
       break;
+   case SpvBuiltInFragStencilRefEXT:
+      *location = FRAG_RESULT_STENCIL;
+      vtn_assert(*mode == nir_var_shader_out);
+      break;
+   case SpvBuiltInWorkDim:
+      *location = SYSTEM_VALUE_WORK_DIM;
+      set_mode_system_value(b, mode);
+      break;
+   case SpvBuiltInGlobalSize:
+      *location = SYSTEM_VALUE_GLOBAL_GROUP_SIZE;
+      set_mode_system_value(b, mode);
+      break;
    default:
-      vtn_fail("unsupported builtin");
+      vtn_fail("unsupported builtin: %u", builtin);
    }
 }
 
 static void
-apply_var_decoration(struct vtn_builder *b, nir_variable *nir_var,
+apply_var_decoration(struct vtn_builder *b,
+                     struct nir_variable_data *var_data,
                      const struct vtn_decoration *dec)
 {
    switch (dec->decoration) {
    case SpvDecorationRelaxedPrecision:
       break; /* FIXME: Do nothing with this for now. */
    case SpvDecorationNoPerspective:
-      nir_var->data.interpolation = INTERP_MODE_NOPERSPECTIVE;
+      var_data->interpolation = INTERP_MODE_NOPERSPECTIVE;
       break;
    case SpvDecorationFlat:
-      nir_var->data.interpolation = INTERP_MODE_FLAT;
+      var_data->interpolation = INTERP_MODE_FLAT;
       break;
    case SpvDecorationCentroid:
-      nir_var->data.centroid = true;
+      var_data->centroid = true;
       break;
    case SpvDecorationSample:
-      nir_var->data.sample = true;
+      var_data->sample = true;
       break;
    case SpvDecorationInvariant:
-      nir_var->data.invariant = true;
+      var_data->invariant = true;
       break;
    case SpvDecorationConstant:
-      vtn_assert(nir_var->constant_initializer != NULL);
-      nir_var->data.read_only = true;
+      var_data->read_only = true;
       break;
    case SpvDecorationNonReadable:
-      nir_var->data.image.write_only = true;
+      var_data->image.write_only = true;
       break;
    case SpvDecorationNonWritable:
-      nir_var->data.read_only = true;
-      nir_var->data.image.read_only = true;
+      var_data->read_only = true;
+      var_data->image.read_only = true;
       break;
    case SpvDecorationRestrict:
-      nir_var->data.image.restrict_flag = true;
+      var_data->image.restrict_flag = true;
       break;
    case SpvDecorationVolatile:
-      nir_var->data.image._volatile = true;
+      var_data->image._volatile = true;
       break;
    case SpvDecorationCoherent:
-      nir_var->data.image.coherent = true;
+      var_data->image.coherent = true;
       break;
    case SpvDecorationComponent:
-      nir_var->data.location_frac = dec->literals[0];
+      var_data->location_frac = dec->literals[0];
       break;
    case SpvDecorationIndex:
-      nir_var->data.index = dec->literals[0];
+      var_data->index = dec->literals[0];
       break;
    case SpvDecorationBuiltIn: {
       SpvBuiltIn builtin = dec->literals[0];
 
-      nir_variable_mode mode = nir_var->data.mode;
-      vtn_get_builtin_location(b, builtin, &nir_var->data.location, &mode);
-      nir_var->data.mode = mode;
+      nir_variable_mode mode = var_data->mode;
+      vtn_get_builtin_location(b, builtin, &var_data->location, &mode);
+      var_data->mode = mode;
 
       switch (builtin) {
       case SpvBuiltInTessLevelOuter:
       case SpvBuiltInTessLevelInner:
-         nir_var->data.compact = true;
+         var_data->compact = true;
          break;
       case SpvBuiltInFragCoord:
-         nir_var->data.pixel_center_integer = b->pixel_center_integer;
+         var_data->pixel_center_integer = b->pixel_center_integer;
          /* fallthrough */
       case SpvBuiltInSamplePosition:
-         nir_var->data.origin_upper_left = b->origin_upper_left;
+         var_data->origin_upper_left = b->origin_upper_left;
          break;
       default:
          break;
@@ -1436,13 +1298,11 @@
    case SpvDecorationMatrixStride:
    case SpvDecorationAliased:
    case SpvDecorationUniform:
-   case SpvDecorationStream:
-   case SpvDecorationOffset:
    case SpvDecorationLinkageAttributes:
       break; /* Do nothing with these here */
 
    case SpvDecorationPatch:
-      nir_var->data.patch = true;
+      var_data->patch = true;
       break;
 
    case SpvDecorationLocation:
@@ -1464,9 +1324,20 @@
       break;
 
    case SpvDecorationXfbBuffer:
+      var_data->explicit_xfb_buffer = true;
+      var_data->xfb_buffer = dec->literals[0];
+      break;
    case SpvDecorationXfbStride:
-      vtn_warn("Vulkan does not have transform feedback: %s",
-               spirv_decoration_to_string(dec->decoration));
+      var_data->explicit_xfb_stride = true;
+      var_data->xfb_stride = dec->literals[0];
+      break;
+   case SpvDecorationOffset:
+      var_data->explicit_offset = true;
+      var_data->offset = dec->literals[0];
+      break;
+
+   case SpvDecorationStream:
+      var_data->stream = dec->literals[0];
       break;
 
    case SpvDecorationCPacked:
@@ -1503,6 +1374,7 @@
    switch (dec->decoration) {
    case SpvDecorationBinding:
       vtn_var->binding = dec->literals[0];
+      vtn_var->explicit_binding = true;
       return;
    case SpvDecorationDescriptorSet:
       vtn_var->descriptor_set = dec->literals[0];
@@ -1513,6 +1385,9 @@
    case SpvDecorationPatch:
       vtn_var->patch = true;
       break;
+   case SpvDecorationOffset:
+      vtn_var->offset = dec->literals[0];
+      break;
    default:
       break;
    }
@@ -1531,10 +1406,9 @@
     */
    if (dec->decoration == SpvDecorationLocation) {
       unsigned location = dec->literals[0];
-      bool is_vertex_input;
+      bool is_vertex_input = false;
       if (b->shader->info.stage == MESA_SHADER_FRAGMENT &&
           vtn_var->mode == vtn_variable_mode_output) {
-         is_vertex_input = false;
          location += FRAG_RESULT_DATA0;
       } else if (b->shader->info.stage == MESA_SHADER_VERTEX &&
                  vtn_var->mode == vtn_variable_mode_input) {
@@ -1542,43 +1416,42 @@
          location += VERT_ATTRIB_GENERIC0;
       } else if (vtn_var->mode == vtn_variable_mode_input ||
                  vtn_var->mode == vtn_variable_mode_output) {
-         is_vertex_input = false;
          location += vtn_var->patch ? VARYING_SLOT_PATCH0 : VARYING_SLOT_VAR0;
-      } else {
-         vtn_warn("Location must be on input or output variable");
+      } else if (vtn_var->mode != vtn_variable_mode_uniform) {
+         vtn_warn("Location must be on input, output, uniform, sampler or "
+                  "image variable");
          return;
       }
 
-      if (vtn_var->var) {
+      if (vtn_var->var->num_members == 0) {
          /* This handles the member and lone variable cases */
          vtn_var->var->data.location = location;
       } else {
          /* This handles the structure member case */
-         assert(vtn_var->members);
-         unsigned length =
-            glsl_get_length(glsl_without_array(vtn_var->type->type));
-         for (unsigned i = 0; i < length; i++) {
-            vtn_var->members[i]->data.location = location;
-            location +=
-               glsl_count_attribute_slots(vtn_var->members[i]->interface_type,
-                                          is_vertex_input);
+         assert(vtn_var->var->members);
+         for (unsigned i = 0; i < vtn_var->var->num_members; i++) {
+            vtn_var->var->members[i].location = location;
+            const struct glsl_type *member_type =
+               glsl_get_struct_field(vtn_var->var->interface_type, i);
+            location += glsl_count_attribute_slots(member_type,
+                                                   is_vertex_input);
          }
       }
       return;
    } else {
       if (vtn_var->var) {
-         assert(member == -1);
-         apply_var_decoration(b, vtn_var->var, dec);
-      } else if (vtn_var->members) {
-         if (member >= 0) {
+         if (vtn_var->var->num_members == 0) {
+            assert(member == -1);
+            apply_var_decoration(b, &vtn_var->var->data, dec);
+         } else if (member >= 0) {
             /* Member decorations must come from a type */
             assert(val->value_type == vtn_value_type_type);
-            apply_var_decoration(b, vtn_var->members[member], dec);
+            apply_var_decoration(b, &vtn_var->var->members[member], dec);
          } else {
             unsigned length =
                glsl_get_length(glsl_without_array(vtn_var->type->type));
             for (unsigned i = 0; i < length; i++)
-               apply_var_decoration(b, vtn_var->members[i], dec);
+               apply_var_decoration(b, &vtn_var->var->members[i], dec);
          }
       } else {
          /* A few variables, those with external storage, have no actual
@@ -1611,7 +1484,9 @@
          mode = vtn_variable_mode_ssbo;
          nir_mode = 0;
       } else {
-         vtn_fail("Invalid uniform variable type");
+         /* Default-block uniforms, coming from gl_spirv */
+         mode = vtn_variable_mode_uniform;
+         nir_mode = nir_var_uniform;
       }
       break;
    case SpvStorageClassStorageBuffer:
@@ -1619,15 +1494,8 @@
       nir_mode = 0;
       break;
    case SpvStorageClassUniformConstant:
-      if (glsl_type_is_image(interface_type->type)) {
-         mode = vtn_variable_mode_image;
-         nir_mode = nir_var_uniform;
-      } else if (glsl_type_is_sampler(interface_type->type)) {
-         mode = vtn_variable_mode_sampler;
-         nir_mode = nir_var_uniform;
-      } else {
-         vtn_fail("Invalid uniform constant variable type");
-      }
+      mode = vtn_variable_mode_uniform;
+      nir_mode = nir_var_uniform;
       break;
    case SpvStorageClassPushConstant:
       mode = vtn_variable_mode_push_constant;
@@ -1653,9 +1521,12 @@
       mode = vtn_variable_mode_workgroup;
       nir_mode = nir_var_shared;
       break;
+   case SpvStorageClassAtomicCounter:
+      mode = vtn_variable_mode_uniform;
+      nir_mode = nir_var_uniform;
+      break;
    case SpvStorageClassCrossWorkgroup:
    case SpvStorageClassGeneric:
-   case SpvStorageClassAtomicCounter:
    default:
       vtn_fail("Unhandled variable storage class");
    }
@@ -1669,30 +1540,34 @@
 nir_ssa_def *
 vtn_pointer_to_ssa(struct vtn_builder *b, struct vtn_pointer *ptr)
 {
-   /* This pointer needs to have a pointer type with actual storage */
-   vtn_assert(ptr->ptr_type);
-   vtn_assert(ptr->ptr_type->type);
+   if (vtn_pointer_uses_ssa_offset(b, ptr)) {
+      /* This pointer needs to have a pointer type with actual storage */
+      vtn_assert(ptr->ptr_type);
+      vtn_assert(ptr->ptr_type->type);
 
-   if (!ptr->offset) {
-      /* If we don't have an offset then we must be a pointer to the variable
-       * itself.
-       */
-      vtn_assert(!ptr->offset && !ptr->block_index);
+      if (!ptr->offset) {
+         /* If we don't have an offset then we must be a pointer to the variable
+          * itself.
+          */
+         vtn_assert(!ptr->offset && !ptr->block_index);
 
-      struct vtn_access_chain chain = {
-         .length = 0,
-      };
-      ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
-   }
+         struct vtn_access_chain chain = {
+            .length = 0,
+         };
+         ptr = vtn_ssa_offset_pointer_dereference(b, ptr, &chain);
+      }
 
-   vtn_assert(ptr->offset);
-   if (ptr->block_index) {
-      vtn_assert(ptr->mode == vtn_variable_mode_ubo ||
-                 ptr->mode == vtn_variable_mode_ssbo);
-      return nir_vec2(&b->nb, ptr->block_index, ptr->offset);
+      vtn_assert(ptr->offset);
+      if (ptr->block_index) {
+         vtn_assert(ptr->mode == vtn_variable_mode_ubo ||
+                    ptr->mode == vtn_variable_mode_ssbo);
+         return nir_vec2(&b->nb, ptr->block_index, ptr->offset);
+      } else {
+         vtn_assert(ptr->mode == vtn_variable_mode_workgroup);
+         return ptr->offset;
+      }
    } else {
-      vtn_assert(ptr->mode == vtn_variable_mode_workgroup);
-      return ptr->offset;
+      return &vtn_pointer_to_deref(b, ptr)->dest.ssa;
    }
 }
 
@@ -1702,27 +1577,35 @@
 {
    vtn_assert(ssa->num_components <= 2 && ssa->bit_size == 32);
    vtn_assert(ptr_type->base_type == vtn_base_type_pointer);
-   vtn_assert(ptr_type->deref->base_type != vtn_base_type_pointer);
-   /* This pointer type needs to have actual storage */
-   vtn_assert(ptr_type->type);
+
+   struct vtn_type *interface_type = ptr_type->deref;
+   while (interface_type->base_type == vtn_base_type_array)
+      interface_type = interface_type->array_element;
 
    struct vtn_pointer *ptr = rzalloc(b, struct vtn_pointer);
+   nir_variable_mode nir_mode;
    ptr->mode = vtn_storage_class_to_mode(b, ptr_type->storage_class,
-                                         ptr_type, NULL);
+                                         interface_type, &nir_mode);
    ptr->type = ptr_type->deref;
    ptr->ptr_type = ptr_type;
 
-   if (ssa->num_components > 1) {
+   if (ptr->mode == vtn_variable_mode_ubo ||
+       ptr->mode == vtn_variable_mode_ssbo) {
+      /* This pointer type needs to have actual storage */
+      vtn_assert(ptr_type->type);
       vtn_assert(ssa->num_components == 2);
-      vtn_assert(ptr->mode == vtn_variable_mode_ubo ||
-                 ptr->mode == vtn_variable_mode_ssbo);
       ptr->block_index = nir_channel(&b->nb, ssa, 0);
       ptr->offset = nir_channel(&b->nb, ssa, 1);
-   } else {
+   } else if (ptr->mode == vtn_variable_mode_workgroup ||
+              ptr->mode == vtn_variable_mode_push_constant) {
+      /* This pointer type needs to have actual storage */
+      vtn_assert(ptr_type->type);
       vtn_assert(ssa->num_components == 1);
-      vtn_assert(ptr->mode == vtn_variable_mode_workgroup);
       ptr->block_index = NULL;
       ptr->offset = ssa;
+   } else {
+      ptr->deref = nir_build_deref_cast(&b->nb, ssa, nir_mode,
+                                        ptr_type->deref->type);
    }
 
    return ptr;
@@ -1769,11 +1652,11 @@
    case vtn_variable_mode_ssbo:
       b->shader->info.num_ssbos++;
       break;
-   case vtn_variable_mode_image:
-      b->shader->info.num_images++;
-      break;
-   case vtn_variable_mode_sampler:
-      b->shader->info.num_textures++;
+   case vtn_variable_mode_uniform:
+      if (glsl_type_is_image(without_array->type))
+         b->shader->info.num_images++;
+      else if (glsl_type_is_sampler(without_array->type))
+         b->shader->info.num_textures++;
       break;
    case vtn_variable_mode_push_constant:
       b->shader->num_uniforms = vtn_type_block_size(b, type);
@@ -1793,23 +1676,23 @@
    switch (var->mode) {
    case vtn_variable_mode_local:
    case vtn_variable_mode_global:
-   case vtn_variable_mode_image:
-   case vtn_variable_mode_sampler:
+   case vtn_variable_mode_uniform:
       /* For these, we create the variable normally */
       var->var = rzalloc(b->shader, nir_variable);
       var->var->name = ralloc_strdup(var->var, val->name);
-      var->var->type = var->type->type;
-      var->var->data.mode = nir_mode;
 
-      switch (var->mode) {
-      case vtn_variable_mode_image:
-      case vtn_variable_mode_sampler:
-         var->var->interface_type = without_array->type;
-         break;
-      default:
-         var->var->interface_type = NULL;
-         break;
+      /* The nir type has to be adjusted here: vtn_handle_type has no access
+       * to the storage class, and the storage class is what tells us that
+       * the variable is an atomic uint.
+       */
+      if (storage_class == SpvStorageClassAtomicCounter) {
+         var->var->type = repair_atomic_type(var->type->type);
+      } else {
+         var->var->type = var->type->type;
       }
+      var->var->data.mode = nir_mode;
+      var->var->data.location = -1;
+      var->var->interface_type = NULL;
       break;
 
    case vtn_variable_mode_workgroup:
@@ -1859,7 +1742,6 @@
        * able to preserve that information.
        */
 
-      int array_length = -1;
       struct vtn_type *interface_type = var->type;
       if (is_per_vertex_inout(var, b->shader->info.stage)) {
          /* In Geometry shaders (and some tessellation), inputs come
@@ -1869,43 +1751,25 @@
           * check should be sufficient.
           */
          interface_type = var->type->array_element;
-         array_length = glsl_get_length(var->type->type);
       }
 
+      var->var = rzalloc(b->shader, nir_variable);
+      var->var->name = ralloc_strdup(var->var, val->name);
+      var->var->type = var->type->type;
+      var->var->interface_type = interface_type->type;
+      var->var->data.mode = nir_mode;
+      var->var->data.patch = var->patch;
+
       if (glsl_type_is_struct(interface_type->type)) {
-         /* It's a struct.  Split it. */
-         unsigned num_members = glsl_get_length(interface_type->type);
-         var->members = ralloc_array(b, nir_variable *, num_members);
+         /* It's a struct.  Set it up as per-member. */
+         var->var->num_members = glsl_get_length(interface_type->type);
+         var->var->members = rzalloc_array(var->var, struct nir_variable_data,
+                                           var->var->num_members);
 
-         for (unsigned i = 0; i < num_members; i++) {
-            const struct glsl_type *mtype = interface_type->members[i]->type;
-            if (array_length >= 0)
-               mtype = glsl_array_type(mtype, array_length);
-
-            var->members[i] = rzalloc(b->shader, nir_variable);
-            var->members[i]->name =
-               ralloc_asprintf(var->members[i], "%s.%d", val->name, i);
-            var->members[i]->type = mtype;
-            var->members[i]->interface_type =
-               interface_type->members[i]->type;
-            var->members[i]->data.mode = nir_mode;
-            var->members[i]->data.patch = var->patch;
-
-            if (initializer) {
-               assert(i < initializer->num_elements);
-               var->members[i]->constant_initializer =
-                  nir_constant_clone(initializer->elements[i], var->members[i]);
-            }
+         for (unsigned i = 0; i < var->var->num_members; i++) {
+            var->var->members[i].mode = nir_mode;
+            var->var->members[i].patch = var->patch;
          }
-
-         initializer = NULL;
-      } else {
-         var->var = rzalloc(b->shader, nir_variable);
-         var->var->name = ralloc_strdup(var->var, val->name);
-         var->var->type = var->type->type;
-         var->var->interface_type = interface_type->type;
-         var->var->data.mode = nir_mode;
-         var->var->data.patch = var->patch;
       }
 
       /* For inputs and outputs, we need to grab locations and builtin
@@ -1917,9 +1781,6 @@
       break;
    }
 
-   case vtn_variable_mode_param:
-      vtn_fail("Not created through OpVariable");
-
    case vtn_variable_mode_ubo:
    case vtn_variable_mode_ssbo:
    case vtn_variable_mode_push_constant:
@@ -1934,30 +1795,25 @@
 
    vtn_foreach_decoration(b, val, var_decoration_cb, var);
 
-   if (var->mode == vtn_variable_mode_image ||
-       var->mode == vtn_variable_mode_sampler) {
+   if (var->mode == vtn_variable_mode_uniform) {
       /* XXX: We still need the binding information in the nir_variable
        * for these. We should fix that.
        */
       var->var->data.binding = var->binding;
+      var->var->data.explicit_binding = var->explicit_binding;
       var->var->data.descriptor_set = var->descriptor_set;
       var->var->data.index = var->input_attachment_index;
+      var->var->data.offset = var->offset;
 
-      if (var->mode == vtn_variable_mode_image)
+      if (glsl_type_is_image(without_array->type))
          var->var->data.image.format = without_array->image_format;
    }
 
    if (var->mode == vtn_variable_mode_local) {
-      vtn_assert(var->members == NULL && var->var != NULL);
+      vtn_assert(var->var != NULL && var->var->members == NULL);
       nir_function_impl_add_variable(b->nb.impl, var->var);
    } else if (var->var) {
       nir_shader_add_variable(b->shader, var->var);
-   } else if (var->members) {
-      unsigned count = glsl_get_length(without_array->type);
-      for (unsigned i = 0; i < count; i++) {
-         vtn_assert(var->members[i]->data.mode != nir_var_local);
-         nir_shader_add_variable(b->shader, var->members[i]);
-      }
    } else {
       vtn_assert(vtn_pointer_is_external_block(b, val->pointer));
    }
@@ -2083,8 +1939,8 @@
 
       vtn_assert_types_equal(b, opcode, res_type, src_val->type->deref);
 
-      if (src->mode == vtn_variable_mode_image ||
-          src->mode == vtn_variable_mode_sampler) {
+      if (glsl_type_is_image(res_type->type) ||
+          glsl_type_is_sampler(res_type->type)) {
          vtn_push_value(b, w[2], vtn_value_type_pointer)->pointer = src;
          return;
       }
diff --git a/src/egl/Android.mk b/src/egl/Android.mk
index 1181869..11128de 100644
--- a/src/egl/Android.mk
+++ b/src/egl/Android.mk
@@ -44,6 +44,7 @@
 	-DHAVE_ANDROID_PLATFORM
 
 LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/include/drm-uapi \
 	$(MESA_TOP)/src/egl/main \
 	$(MESA_TOP)/src/egl/drivers/dri2
 
@@ -57,9 +58,13 @@
 	libhardware \
 	liblog \
 	libcutils \
-	libgralloc_drm \
 	libsync
 
+ifeq ($(BOARD_USES_DRM_GRALLOC),true)
+	LOCAL_CFLAGS += -DHAVE_DRM_GRALLOC
+	LOCAL_SHARED_LIBRARIES += libgralloc_drm
+endif
+
 ifeq ($(filter $(MESA_ANDROID_MAJOR_VERSION), 4 5 6 7),)
 LOCAL_SHARED_LIBRARIES += libnativewindow
 endif
diff --git a/src/egl/Makefile.am b/src/egl/Makefile.am
index 116ed4e..b43805d 100644
--- a/src/egl/Makefile.am
+++ b/src/egl/Makefile.am
@@ -27,6 +27,7 @@
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/include/drm-uapi \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/egl/main \
 	-I$(top_srcdir)/src/gbm/main \
@@ -85,6 +86,8 @@
 AM_CFLAGS += $(WAYLAND_CLIENT_CFLAGS)
 libEGL_common_la_LIBADD += $(WAYLAND_CLIENT_LIBS)
 libEGL_common_la_LIBADD += $(LIBDRM_LIBS)
+AM_CFLAGS += $(WAYLAND_EGL_CFLAGS)
+libEGL_common_la_LIBADD += $(WAYLAND_EGL_LIBS)
 AM_CFLAGS += $(WAYLAND_SERVER_CFLAGS)
 libEGL_common_la_LIBADD += $(top_builddir)/src/egl/wayland/wayland-drm/libwayland-drm.la
 libEGL_common_la_LIBADD += $(WAYLAND_SERVER_LIBS)
@@ -115,7 +118,6 @@
 	-I$(top_builddir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/egl/drivers/dri2 \
 	-I$(top_srcdir)/src/gbm/backends/dri \
-	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
 	-I$(top_builddir)/src/egl/wayland/wayland-drm \
 	-I$(top_srcdir)/src/egl/wayland/wayland-drm \
 	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
diff --git a/src/egl/SConscript b/src/egl/SConscript
index 927092d..153fdc0 100644
--- a/src/egl/SConscript
+++ b/src/egl/SConscript
@@ -8,6 +8,7 @@
 
 env.Append(CPPPATH = [
     '#/include',
+    '#/include/drm-uapi',
     '#/include/HaikuGL',
     '#/src/egl/main',
     '#/src',
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 2981e70..3c5381f 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -65,34 +65,6 @@
 #include "util/u_vector.h"
 #include "mapi/glapi/glapi.h"
 
-/* The kernel header drm_fourcc.h defines the DRM formats below.  We duplicate
- * some of the definitions here so that building Mesa won't bleeding-edge
- * kernel headers.
- */
-#ifndef DRM_FORMAT_R8
-#define DRM_FORMAT_R8            fourcc_code('R', '8', ' ', ' ') /* [7:0] R */
-#endif
-
-#ifndef DRM_FORMAT_RG88
-#define DRM_FORMAT_RG88          fourcc_code('R', 'G', '8', '8') /* [15:0] R:G 8:8 little endian */
-#endif
-
-#ifndef DRM_FORMAT_GR88
-#define DRM_FORMAT_GR88          fourcc_code('G', 'R', '8', '8') /* [15:0] G:R 8:8 little endian */
-#endif
-
-#ifndef DRM_FORMAT_R16
-#define DRM_FORMAT_R16           fourcc_code('R', '1', '6', ' ') /* [15:0] R 16 little endian */
-#endif
-
-#ifndef DRM_FORMAT_GR1616
-#define DRM_FORMAT_GR1616        fourcc_code('G', 'R', '3', '2') /* [31:0] R:G 16:16 little endian */
-#endif
-
-#ifndef DRM_FORMAT_MOD_INVALID
-#define DRM_FORMAT_MOD_INVALID ((1ULL<<56) - 1)
-#endif
-
 #define NUM_ATTRIBS 12
 
 static void
@@ -2308,7 +2280,7 @@
 {
    unsigned plane_n = dri2_num_fourcc_format_planes(attrs->DMABufFourCC.Value);
    if (plane_n == 0) {
-      _eglError(EGL_BAD_ATTRIBUTE, "invalid format");
+      _eglError(EGL_BAD_MATCH, "unknown drm fourcc format");
       return 0;
    }
 
@@ -2621,6 +2593,28 @@
    return EGL_TRUE;
 }
 
+/**
+ * Reports whether EGL_MESA_image_dma_buf_export can be honoured for img.
+ *
+ * The extension lets a driver refuse an export for essentially any reason
+ * and names no specific error conditions.  The only check made here is that
+ * the DRI image answers a DRM fourcc query; if it does not, we refuse.
+ */
+static bool
+dri2_can_export_dma_buf_image(_EGLDisplay *disp, _EGLImage *img)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   struct dri2_egl_image *dri2_img = dri2_egl_image(img);
+   EGLint fourcc;
+
+   /* Only the success of the query matters; the fourcc value is unused. */
+   if (dri2_dpy->image->queryImage(dri2_img->dri_image,
+                                   __DRI_IMAGE_ATTRIB_FOURCC, &fourcc))
+      return true;
+
+   return false;
+}
+
 static EGLBoolean
 dri2_export_dma_buf_image_query_mesa(_EGLDriver *drv, _EGLDisplay *disp,
                                      _EGLImage *img,
@@ -2632,6 +2626,8 @@
 
    (void) drv;
 
+   if (!dri2_can_export_dma_buf_image(disp, img))
+      return EGL_FALSE;
 
    if (nplanes)
       dri2_dpy->image->queryImage(dri2_img->dri_image,
@@ -2655,6 +2651,9 @@
 
    (void) drv;
 
+   if (!dri2_can_export_dma_buf_image(disp, img))
+      return EGL_FALSE;
+
    /* rework later to provide multiple fds/strides/offsets */
    if (fds)
       dri2_dpy->image->queryImage(dri2_img->dri_image,
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index d99714a..93b06a0 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -67,8 +67,6 @@
 
 #include <system/window.h>
 #include <hardware/gralloc.h>
-#include <gralloc_drm_handle.h>
-
 #endif /* HAVE_ANDROID_PLATFORM */
 
 #include "eglconfig.h"
@@ -414,6 +412,8 @@
 dri2_initialize_x11(_EGLDriver *drv, _EGLDisplay *disp);
 void
 dri2_teardown_x11(struct dri2_egl_display *dri2_dpy);
+unsigned int
+dri2_x11_get_red_mask_for_depth(struct dri2_egl_display *dri2_dpy, int depth);
 #else
 static inline EGLBoolean
 dri2_initialize_x11(_EGLDriver *drv, _EGLDisplay *disp)
@@ -422,6 +422,11 @@
 }
 static inline void
 dri2_teardown_x11(struct dri2_egl_display *dri2_dpy) {}
+static inline unsigned int
+dri2_x11_get_red_mask_for_depth(struct dri2_egl_display *dri2_dpy, int depth)
+{
+   return 0; /* #else stub: ignores both arguments and always reports 0 */
+}
 #endif
 
 #ifdef HAVE_DRM_PLATFORM
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 7f1a496..5c4e5ae 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -27,17 +27,25 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include <cutils/properties.h>
 #include <errno.h>
+#include <dirent.h>
 #include <dlfcn.h>
 #include <fcntl.h>
 #include <xf86drm.h>
 #include <stdbool.h>
+#include <stdio.h>
 #include <sync/sync.h>
+#include <sys/types.h>
 
 #include "loader.h"
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
+
+#ifdef HAVE_DRM_GRALLOC
+#include <gralloc_drm_handle.h>
 #include "gralloc_drm.h"
+#endif /* HAVE_DRM_GRALLOC */
 
 #define ALIGN(val, align)	(((val) + (align) - 1) & ~((align) - 1))
 
@@ -164,11 +172,13 @@
    return (handle && handle->numFds) ? handle->data[0] : -1;
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static int
 get_native_buffer_name(struct ANativeWindowBuffer *buf)
 {
    return gralloc_drm_get_gem_handle(buf->handle);
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static EGLBoolean
 droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf)
@@ -337,8 +347,10 @@
 
    config = dri2_get_dri_config(dri2_conf, type,
                                 dri2_surf->base.GLColorspace);
-   if (!config)
+   if (!config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
       goto cleanup_surface;
+   }
 
    if (dri2_dpy->image_driver)
       createNewDrawable = dri2_dpy->image_driver->createNewDrawable;
@@ -836,6 +848,7 @@
    return dri2_create_image_dma_buf(disp, ctx, NULL, attr_list);
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static _EGLImage *
 droid_create_image_from_name(_EGLDisplay *disp, _EGLContext *ctx,
                              struct ANativeWindowBuffer *buf)
@@ -879,6 +892,7 @@
 
    return &dri2_img->base;
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static EGLBoolean
 droid_query_surface(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf,
@@ -935,7 +949,11 @@
    if (fd >= 0)
       return droid_create_image_from_prime_fd(disp, ctx, buf, fd);
 
+#ifdef HAVE_DRM_GRALLOC
    return droid_create_image_from_name(disp, ctx, buf);
+#else
+   return NULL;
+#endif
 }
 
 static _EGLImage *
@@ -957,6 +975,7 @@
 {
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static int
 droid_get_buffers_parse_attachments(struct dri2_egl_surface *dri2_surf,
                                     unsigned int *attachments, int count)
@@ -1032,6 +1051,7 @@
 
    return dri2_surf->buffers;
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static unsigned
 droid_get_capability(void *loaderPrivate, enum dri_loader_cap cap)
@@ -1114,8 +1134,9 @@
    return (config_count != 0);
 }
 
+#ifdef HAVE_DRM_GRALLOC
 static int
-droid_open_device(struct dri2_egl_display *dri2_dpy)
+droid_open_device_drm_gralloc(struct dri2_egl_display *dri2_dpy)
 {
    int fd = -1, err = -EINVAL;
 
@@ -1130,6 +1151,7 @@
 
    return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1;
 }
+#endif /* HAVE_DRM_GRALLOC */
 
 static const struct dri2_egl_display_vtbl droid_display_vtbl = {
    .authenticate = NULL,
@@ -1156,6 +1178,7 @@
    .get_dri_drawable = dri2_surface_get_dri_drawable,
 };
 
+#ifdef HAVE_DRM_GRALLOC
 static const __DRIdri2LoaderExtension droid_dri2_loader_extension = {
    .base = { __DRI_DRI2_LOADER, 4 },
 
@@ -1164,6 +1187,7 @@
    .getBuffersWithFormat = droid_get_buffers_with_format,
    .getCapability        = droid_get_capability,
 };
+#endif /* HAVE_DRM_GRALLOC */
 
 static const __DRIimageLoaderExtension droid_image_loader_extension = {
    .base = { __DRI_IMAGE_LOADER, 2 },
@@ -1173,12 +1197,14 @@
    .getCapability       = droid_get_capability,
 };
 
+#ifdef HAVE_DRM_GRALLOC
 static const __DRIextension *droid_dri2_loader_extensions[] = {
    &droid_dri2_loader_extension.base,
    &image_lookup_extension.base,
    &use_invalidate.base,
    NULL,
 };
+#endif /* HAVE_DRM_GRALLOC */
 
 static const __DRIextension *droid_image_loader_extensions[] = {
    &droid_image_loader_extension.base,
@@ -1188,6 +1214,169 @@
 };
 
 EGLBoolean
+droid_load_driver(_EGLDisplay *disp)
+{
+   /* Looks up the driver name for dri2_dpy->fd and loads it with the loader
+    * extensions matching the node type; the name is freed again on failure. */
+   struct dri2_egl_display *dri2_dpy = disp->DriverData;
+   const char *err;
+
+   dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
+   if (dri2_dpy->driver_name == NULL)
+      return EGL_FALSE;
+
+   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
+   if (!dri2_dpy->is_render_node) {
+#ifdef HAVE_DRM_GRALLOC
+      /* Control nodes go through __DRI_DRI2_LOADER and GEM names, kept only
+       * for backwards compatibility with drm_gralloc. (Do not use on new
+       * systems.) */
+      dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
+      if (!dri2_load_driver(disp)) {
+         err = "DRI2: failed to load driver";
+         goto error;
+      }
+#else
+      err = "DRI2: handle is not for a render node";
+      goto error;
+#endif
+   } else {
+      dri2_dpy->loader_extensions = droid_image_loader_extensions;
+      if (!dri2_load_driver_dri3(disp)) {
+         err = "DRI3: failed to load driver";
+         goto error;
+      }
+   }
+   return EGL_TRUE;
+
+error:
+   _eglLog(_EGL_WARNING, "%s", err); /* report which step refused the device */
+   free(dri2_dpy->driver_name);
+   dri2_dpy->driver_name = NULL;
+   return EGL_FALSE;
+}
+
+/* Reports whether the loader can name a driver for fd.  The name itself is
+ * not needed by the caller, so it is released before returning. */
+static bool
+droid_probe_driver(int fd)
+{
+   char *name = loader_get_driver_for_fd(fd);
+   const bool found = name != NULL;
+
+   /* free(NULL) is a no-op, so the lookup-failed case needs no branch. */
+   free(name);
+   return found;
+}
+
+typedef enum {
+   probe_fail = -1,        /* no DRM version/name, or no driver for the fd */
+   probe_success = 0,      /* device passed every check */
+   probe_filtered_out = 1, /* DRM name differs from the requested vendor */
+} probe_ret_t;
+
+/* Classifies the DRM device behind fd: probe_fail when it has no version
+ * info, no name or no loadable driver; probe_filtered_out when vendor is
+ * non-NULL and differs from the DRM name; probe_success otherwise.  The
+ * disp argument is not used. */
+static probe_ret_t
+droid_probe_device(_EGLDisplay *disp, int fd, const char *vendor)
+{
+   drmVersionPtr version = drmGetVersion(fd);
+   probe_ret_t result;
+
+   if (!version)
+      return probe_fail;
+
+   /* Order matters: the name must exist before it is compared, and the
+    * driver lookup only runs for devices that pass the vendor filter. */
+   if (!version->name) {
+      result = probe_fail;
+   } else if (vendor &&
+              strncmp(vendor, version->name, PROPERTY_VALUE_MAX) != 0) {
+      result = probe_filtered_out;
+   } else if (!droid_probe_driver(fd)) {
+      result = probe_fail;
+   } else {
+      result = probe_success;
+   }
+
+   /* Every path that obtained a version must release it. */
+   drmFreeVersion(version);
+   return result;
+}
+
+/* Scans /dev/dri for "renderD*" nodes and returns an open fd for the first
+ * one that probes successfully.  A node rejected only by the
+ * drm.gpu.vendor_name filter is kept as fallback.  Negative on failure. */
+static int
+droid_open_device(_EGLDisplay *disp)
+{
+   int fd = -1, fallback_fd = -1;
+   char *vendor_name = NULL;
+   char vendor_buf[PROPERTY_VALUE_MAX];
+
+   if (property_get("drm.gpu.vendor_name", vendor_buf, NULL) > 0)
+      vendor_name = vendor_buf;
+
+   const char *drm_dir_name = "/dev/dri";
+   DIR *sysdir = opendir(drm_dir_name);
+   if (!sysdir)
+      return -errno;
+
+   struct dirent *dent;
+   while ((dent = readdir(sysdir))) {
+      char dev_path[128];
+      const char render_dev_prefix[] = "renderD";
+      size_t prefix_len = sizeof(render_dev_prefix) - 1;
+
+      if (strncmp(render_dev_prefix, dent->d_name, prefix_len) != 0)
+         continue;
+
+      snprintf(dev_path, sizeof(dev_path), "%s/%s", drm_dir_name, dent->d_name);
+      fd = loader_open_device(dev_path);
+      if (fd < 0) {
+         _eglLog(_EGL_WARNING, "%s() Failed to open DRM device %s",
+                 __func__, dev_path);
+         continue;
+      }
+
+      int ret = droid_probe_device(disp, fd, vendor_name);
+      switch (ret) {
+      case probe_success:
+         goto success;
+      case probe_filtered_out:
+         /* Set as fallback */
+         if (fallback_fd == -1)
+            fallback_fd = fd;
+         break;
+      case probe_fail:
+         break;
+      }
+
+      if (fallback_fd != fd)
+         close(fd);
+      fd = -1;
+   }
+
+success:
+   closedir(sysdir);
+
+   if (fallback_fd < 0 && fd < 0) {
+      _eglLog(_EGL_WARNING, "Failed to open any DRM device");
+      return -1;
+   }
+
+   if (fd < 0) {
+      _eglLog(_EGL_WARNING, "Failed to open desired DRM device, using fallback");
+      return fallback_fd;
+   }
+
+   if (fallback_fd >= 0) /* still -1 if no node failed the vendor filter */
+      close(fallback_fd);
+   return fd;
+}
+
+EGLBoolean
 dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp)
 {
    struct dri2_egl_display *dri2_dpy;
@@ -1214,36 +1403,21 @@
 
    disp->DriverData = (void *) dri2_dpy;
 
-   dri2_dpy->fd = droid_open_device(dri2_dpy);
+#ifdef HAVE_DRM_GRALLOC
+   dri2_dpy->fd = droid_open_device_drm_gralloc(dri2_dpy);
+#else
+   dri2_dpy->fd = droid_open_device(disp);
+#endif
    if (dri2_dpy->fd < 0) {
       err = "DRI2: failed to open device";
       goto cleanup;
    }
 
-   dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd);
-   if (dri2_dpy->driver_name == NULL) {
-      err = "DRI2: failed to get driver name";
+   if (!droid_load_driver(disp)) {
+      err = "DRI2: failed to load driver";
       goto cleanup;
    }
 
-   dri2_dpy->is_render_node = drmGetNodeTypeFromFd(dri2_dpy->fd) == DRM_NODE_RENDER;
-
-   /* render nodes cannot use Gem names, and thus do not support
-    * the __DRI_DRI2_LOADER extension */
-   if (!dri2_dpy->is_render_node) {
-      dri2_dpy->loader_extensions = droid_dri2_loader_extensions;
-      if (!dri2_load_driver(disp)) {
-         err = "DRI2: failed to load driver";
-         goto cleanup;
-      }
-   } else {
-      dri2_dpy->loader_extensions = droid_image_loader_extensions;
-      if (!dri2_load_driver_dri3(disp)) {
-         err = "DRI3: failed to load driver";
-         goto cleanup;
-      }
-   }
-
    if (!dri2_create_screen(disp)) {
       err = "DRI2: failed to create screen";
       goto cleanup;
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index dc4efea..35bc4b5 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -155,6 +155,11 @@
    config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
                                 dri2_surf->base.GLColorspace);
 
+   if (!config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
+      goto cleanup_surf;
+   }
+
    if (!dri2_drm_config_is_compatible(dri2_dpy, config, surface)) {
       _eglError(EGL_BAD_MATCH, "EGL config not compatible with GBM format");
       goto cleanup_surf;
diff --git a/src/egl/drivers/dri2/platform_surfaceless.c b/src/egl/drivers/dri2/platform_surfaceless.c
index 70b302c..bfc8fb9 100644
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -130,8 +130,10 @@
    config = dri2_get_dri_config(dri2_conf, type,
                                 dri2_surf->base.GLColorspace);
 
-   if (!config)
+   if (!config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
       goto cleanup_surface;
+   }
 
    dri2_surf->dri_drawable =
       dri2_dpy->image_driver->createNewDrawable(dri2_dpy->dri_screen, config,
@@ -258,6 +260,13 @@
    .flushFrontBuffer = surfaceless_flush_front_buffer,
 };
 
+static const __DRIswrastLoaderExtension swrast_loader_extension = {
+   .base            = { __DRI_SWRAST_LOADER, 1 },
+   .getDrawableInfo = NULL,
+   .putImage        = NULL,
+   .getImage        = NULL,
+};
+
 #define DRM_RENDER_DEV_NAME  "%s/renderD%d"
 
 static const __DRIextension *image_loader_extensions[] = {
@@ -267,6 +276,14 @@
    NULL,
 };
 
+static const __DRIextension *swrast_loader_extensions[] = {
+   &swrast_loader_extension.base,
+   &image_loader_extension.base,
+   &image_lookup_extension.base,
+   &use_invalidate.base,
+   NULL,
+};
+
 static bool
 surfaceless_probe_device(_EGLDisplay *dpy, bool swrast)
 {
@@ -276,6 +293,7 @@
    int fd;
    int i;
 
+   /* Attempt to find DRM device. */
    for (i = 0; i < limit; ++i) {
       char *card_path;
       if (asprintf(&card_path, DRM_RENDER_DEV_NAME, DRM_DIR_NAME, base + i) < 0)
@@ -286,10 +304,13 @@
       if (fd < 0)
          continue;
 
-      if (swrast)
+      if (swrast) {
          dri2_dpy->driver_name = strdup("kms_swrast");
-      else
+         dri2_dpy->loader_extensions = swrast_loader_extensions;
+      } else {
          dri2_dpy->driver_name = loader_get_driver_for_fd(fd);
+         dri2_dpy->loader_extensions = image_loader_extensions;
+      }
       if (!dri2_dpy->driver_name) {
          close(fd);
          continue;
@@ -303,6 +324,25 @@
       dri2_dpy->fd = -1;
       free(dri2_dpy->driver_name);
       dri2_dpy->driver_name = NULL;
+      dri2_dpy->loader_extensions = NULL;
+   }
+
+   /* No DRM device, so attempt to fall back to software path w/o DRM. */
+   if (swrast) {
+      _eglLog(_EGL_DEBUG, "Falling back to surfaceless swrast without DRM.");
+      dri2_dpy->fd = -1;
+      dri2_dpy->driver_name = strdup("swrast");
+      if (!dri2_dpy->driver_name) {
+         return false;
+      }
+
+      if (dri2_load_driver_swrast(dpy)) {
+         dri2_dpy->loader_extensions = swrast_loader_extensions;
+         return true;
+      }
+
+      free(dri2_dpy->driver_name);
+      dri2_dpy->driver_name = NULL;
    }
 
    return false;
@@ -336,8 +376,6 @@
       goto cleanup;
    }
 
-   dri2_dpy->loader_extensions = image_loader_extensions;
-
    if (!dri2_create_screen(disp)) {
       err = "DRI2: failed to create screen";
       goto cleanup;
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index df178b1..15eeee5 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -45,20 +45,11 @@
 #include "util/u_vector.h"
 #include "eglglobals.h"
 
+#include <wayland-egl-backend.h>
 #include <wayland-client.h>
 #include "wayland-drm-client-protocol.h"
 #include "linux-dmabuf-unstable-v1-client-protocol.h"
 
-#include "wayland/wayland-egl/wayland-egl-backend.h"
-
-#ifndef DRM_FORMAT_MOD_INVALID
-#define DRM_FORMAT_MOD_INVALID ((1ULL << 56) - 1)
-#endif
-
-#ifndef DRM_FORMAT_MOD_LINEAR
-#define DRM_FORMAT_MOD_LINEAR 0
-#endif
-
 /*
  * The index of entries in this table is used as a bitmask in
  * dri2_dpy->formats, which tracks the formats supported by our server.
@@ -84,6 +75,18 @@
      { 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 }
    },
    {
+     "XBGR2101010",
+     WL_DRM_FORMAT_XBGR2101010, WL_SHM_FORMAT_XBGR2101010,
+     __DRI_IMAGE_FORMAT_XBGR2101010, 32,
+     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0x00000000 }
+   },
+   {
+     "ABGR2101010",
+     WL_DRM_FORMAT_ABGR2101010, WL_SHM_FORMAT_ABGR2101010,
+     __DRI_IMAGE_FORMAT_ABGR2101010, 32,
+     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 }
+   },
+   {
      "XRGB8888",
      WL_DRM_FORMAT_XRGB8888, WL_SHM_FORMAT_XRGB8888,
      __DRI_IMAGE_FORMAT_XRGB8888, 32,
@@ -264,6 +267,11 @@
    config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
                                 dri2_surf->base.GLColorspace);
 
+   if (!config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
+      goto cleanup_surf;
+   }
+
    dri2_surf->base.Width = window->width;
    dri2_surf->base.Height = window->height;
 
@@ -310,7 +318,7 @@
                       dri2_surf->wl_queue);
 
    dri2_surf->wl_win = window;
-   dri2_surf->wl_win->private = dri2_surf;
+   dri2_surf->wl_win->driver_private = dri2_surf;
    dri2_surf->wl_win->destroy_window_callback = destroy_window_callback;
    if (dri2_dpy->flush)
       dri2_surf->wl_win->resize_callback = resize_callback;
@@ -396,7 +404,7 @@
       wl_callback_destroy(dri2_surf->throttle_callback);
 
    if (dri2_surf->wl_win) {
-      dri2_surf->wl_win->private = NULL;
+      dri2_surf->wl_win->driver_private = NULL;
       dri2_surf->wl_win->resize_callback = NULL;
       dri2_surf->wl_win->destroy_window_callback = NULL;
    }
@@ -1646,8 +1654,8 @@
    if (dri2_surf->back)
       return 0;
 
-   if (dri2_surf->base.Width != dri2_surf->wl_win->attached_width ||
-       dri2_surf->base.Height != dri2_surf->wl_win->attached_height) {
+   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
+       dri2_surf->base.Height != dri2_surf->wl_win->height) {
 
       dri2_wl_release_buffers(dri2_surf);
 
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index 60330b3..cfa5c4a 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -55,6 +55,9 @@
 dri2_x11_swap_interval(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
                        EGLint interval);
 
+uint32_t
+dri2_format_for_depth(struct dri2_egl_display *dri2_dpy, uint32_t depth);
+
 static void
 swrastCreateDrawable(struct dri2_egl_display * dri2_dpy,
                      struct dri2_egl_surface * dri2_surf)
@@ -209,6 +212,36 @@
     return NULL;
 }
 
+static xcb_visualtype_t *
+get_xcb_visualtype_for_depth(struct dri2_egl_display *dri2_dpy, int depth)
+{
+   xcb_visualtype_iterator_t visual_iter;
+   xcb_screen_t *screen = dri2_dpy->screen;
+   xcb_depth_iterator_t depth_iter = xcb_screen_allowed_depths_iterator(screen);
+
+   for (; depth_iter.rem; xcb_depth_next(&depth_iter)) {
+      if (depth_iter.data->depth != depth)
+         continue;
+
+      visual_iter = xcb_depth_visuals_iterator(depth_iter.data);
+      if (visual_iter.rem)
+         return visual_iter.data;
+   }
+
+   return NULL;
+}
+
+/* Get red channel mask for given depth. */
+unsigned int
+dri2_x11_get_red_mask_for_depth(struct dri2_egl_display *dri2_dpy, int depth)
+{
+   xcb_visualtype_t *visual = get_xcb_visualtype_for_depth(dri2_dpy, depth);
+
+   if (visual)
+      return visual->red_mask;
+
+   return 0;
+}
 
 /**
  * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
@@ -251,6 +284,11 @@
    config = dri2_get_dri_config(dri2_conf, type,
                                 dri2_surf->base.GLColorspace);
 
+   if (!config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
+      goto cleanup_pixmap;
+   }
+
    if (dri2_dpy->dri2) {
       dri2_surf->dri_drawable =
          dri2_dpy->dri2->createNewDrawable(dri2_dpy->dri_screen, config,
@@ -1001,6 +1039,27 @@
    return EGL_TRUE;
 }
 
+uint32_t
+dri2_format_for_depth(struct dri2_egl_display *dri2_dpy, uint32_t depth)
+{
+   switch (depth) {
+   case 16:
+      return __DRI_IMAGE_FORMAT_RGB565;
+   case 24:
+      return __DRI_IMAGE_FORMAT_XRGB8888;
+   case 30:
+      /* Different preferred formats for different hw */
+      if (dri2_x11_get_red_mask_for_depth(dri2_dpy, 30) == 0x3ff)
+         return __DRI_IMAGE_FORMAT_XBGR2101010;
+      else
+         return __DRI_IMAGE_FORMAT_XRGB2101010;
+   case 32:
+      return __DRI_IMAGE_FORMAT_ARGB8888;
+   default:
+      return __DRI_IMAGE_FORMAT_NONE;
+   }
+}
+
 static _EGLImage *
 dri2_create_image_khr_pixmap(_EGLDisplay *disp, _EGLContext *ctx,
 			     EGLClientBuffer buffer, const EGLint *attr_list)
@@ -1045,20 +1104,8 @@
       return NULL;
    }
 
-   switch (geometry_reply->depth) {
-   case 16:
-      format = __DRI_IMAGE_FORMAT_RGB565;
-      break;
-   case 24:
-      format = __DRI_IMAGE_FORMAT_XRGB8888;
-      break;
-   case 30:
-      format = __DRI_IMAGE_FORMAT_XRGB2101010;
-      break;
-   case 32:
-      format = __DRI_IMAGE_FORMAT_ARGB8888;
-      break;
-   default:
+   format = dri2_format_for_depth(dri2_dpy, geometry_reply->depth);
+   if (format == __DRI_IMAGE_FORMAT_NONE) {
       _eglError(EGL_BAD_PARAMETER,
 		"dri2_create_image_khr: unsupported pixmap depth");
       free(buffers_reply);
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.c b/src/egl/drivers/dri2/platform_x11_dri3.c
index 54305c2..e1967422 100644
--- a/src/egl/drivers/dri2/platform_x11_dri3.c
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -39,23 +39,6 @@
 #include "loader.h"
 #include "loader_dri3_helper.h"
 
-static uint32_t
-dri3_format_for_depth(uint32_t depth)
-{
-   switch (depth) {
-   case 16:
-      return __DRI_IMAGE_FORMAT_RGB565;
-   case 24:
-      return __DRI_IMAGE_FORMAT_XRGB8888;
-   case 30:
-      return __DRI_IMAGE_FORMAT_XRGB2101010;
-   case 32:
-      return __DRI_IMAGE_FORMAT_ARGB8888;
-   default:
-      return __DRI_IMAGE_FORMAT_NONE;
-   }
-}
-
 static struct dri3_egl_surface *
 loader_drawable_to_egl_surface(struct loader_dri3_drawable *draw) {
    size_t offset = offsetof(struct dri3_egl_surface, loader_drawable);
@@ -188,6 +171,11 @@
    dri_config = dri2_get_dri_config(dri2_conf, type,
                                     dri3_surf->surf.base.GLColorspace);
 
+   if (!dri_config) {
+      _eglError(EGL_BAD_MATCH, "Unsupported surfacetype/colorspace configuration");
+      goto cleanup_pixmap;
+   }
+
    if (loader_dri3_drawable_init(dri2_dpy->conn, drawable,
                                  dri2_dpy->dri_screen,
                                  dri2_dpy->is_different_gpu,
@@ -298,7 +286,7 @@
       return NULL;
    }
 
-   format = dri3_format_for_depth(bp_reply->depth);
+   format = dri2_format_for_depth(dri2_dpy, bp_reply->depth);
    if (format == __DRI_IMAGE_FORMAT_NONE) {
       _eglError(EGL_BAD_PARAMETER,
                 "dri3_create_image_khr: unsupported pixmap depth");
@@ -350,7 +338,7 @@
       return EGL_NO_IMAGE_KHR;
    }
 
-   format = dri3_format_for_depth(bp_reply->depth);
+   format = dri2_format_for_depth(dri2_dpy, bp_reply->depth);
    if (format == __DRI_IMAGE_FORMAT_NONE) {
       _eglError(EGL_BAD_PARAMETER,
                 "dri3_create_image_khr: unsupported pixmap depth");
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.h b/src/egl/drivers/dri2/platform_x11_dri3.h
index 96e7ee9..f60d1df 100644
--- a/src/egl/drivers/dri2/platform_x11_dri3.h
+++ b/src/egl/drivers/dri2/platform_x11_dri3.h
@@ -38,4 +38,7 @@
 EGLBoolean
 dri3_x11_connect(struct dri2_egl_display *dri2_dpy);
 
+uint32_t
+dri2_format_for_depth(struct dri2_egl_display *dri2_dpy, uint32_t depth);
+
 #endif
diff --git a/src/egl/generate/eglFunctionList.py b/src/egl/generate/eglFunctionList.py
index fb5b3c3..0f20573 100644
--- a/src/egl/generate/eglFunctionList.py
+++ b/src/egl/generate/eglFunctionList.py
@@ -196,6 +196,9 @@
     # EGL_ANDROID_native_fence_sync
     _eglFunc("eglDupNativeFenceFDANDROID",           "display"),
 
+    # EGL_ANDROID_blob_cache
+    _eglFunc("eglSetBlobCacheFuncsANDROID",          "display"),
+
     # EGL_EXT_image_dma_buf_import_modifiers
     _eglFunc("eglQueryDmaBufFormatsEXT",             "display"),
     _eglFunc("eglQueryDmaBufModifiersEXT",           "display"),
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index deb479b..fadb2b1 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -1208,6 +1208,9 @@
    if (_eglGetSurfaceHandle(surf) == EGL_NO_SURFACE)
       RETURN_EGL_ERROR(disp, EGL_BAD_SURFACE, EGL_FALSE);
 
+   if (surf->Type != EGL_WINDOW_BIT)
+      RETURN_EGL_EVAL(disp, EGL_TRUE);
+
    interval = CLAMP(interval,
                     surf->Config->MinSwapInterval,
                     surf->Config->MaxSwapInterval);
@@ -1243,6 +1246,9 @@
       RETURN_EGL_ERROR(disp, EGL_BAD_SURFACE, EGL_FALSE);
    #endif
 
+   if (surf->Type != EGL_WINDOW_BIT)
+      RETURN_EGL_EVAL(disp, EGL_TRUE);
+
    /* From the EGL 1.5 spec:
     *
     *    If eglSwapBuffers is called and the native window associated with
@@ -1282,6 +1288,9 @@
        surf != ctx->DrawSurface)
       RETURN_EGL_ERROR(disp, EGL_BAD_SURFACE, EGL_FALSE);
 
+   if (surf->Type != EGL_WINDOW_BIT)
+      RETURN_EGL_EVAL(disp, EGL_TRUE);
+
    if ((n_rects > 0 && rects == NULL) || n_rects < 0)
       RETURN_EGL_ERROR(disp, EGL_BAD_PARAMETER, EGL_FALSE);
 
diff --git a/src/egl/main/egldispatchstubs.c b/src/egl/main/egldispatchstubs.c
index e02abd7..96708ae 100644
--- a/src/egl/main/egldispatchstubs.c
+++ b/src/egl/main/egldispatchstubs.c
@@ -2,6 +2,7 @@
 #include "g_egldispatchstubs.h"
 
 #include <string.h>
+#include <stdlib.h>
 
 #include "eglcurrent.h"
 
@@ -10,26 +11,21 @@
 const int __EGL_DISPATCH_FUNC_COUNT = __EGL_DISPATCH_COUNT;
 int __EGL_DISPATCH_FUNC_INDICES[__EGL_DISPATCH_COUNT + 1];
 
+static int Compare(const void *l, const void *r)
+{
+    const char *s = *(const char **)r;
+    return strcmp(l, s);
+}
+
 static int FindProcIndex(const char *name)
 {
-    unsigned first = 0;
-    unsigned last = __EGL_DISPATCH_COUNT - 1;
+    const char **match = bsearch(name, __EGL_DISPATCH_FUNC_NAMES,
+            __EGL_DISPATCH_COUNT, sizeof(const char *), Compare);
 
-    while (first <= last) {
-        unsigned middle = (first + last) / 2;
-        int comp = strcmp(name,
-                          __EGL_DISPATCH_FUNC_NAMES[middle]);
+    if (match == NULL)
+        return __EGL_DISPATCH_COUNT;
 
-        if (comp > 0)
-            first = middle + 1;
-        else if (comp < 0)
-            last = middle - 1;
-        else
-            return middle;
-    }
-
-    /* Just point to the dummy entry at the end of the respective table */
-    return __EGL_DISPATCH_COUNT;
+    return match - __EGL_DISPATCH_FUNC_NAMES;
 }
 
 void __eglInitDispatchStubs(const __EGLapiExports *exportsTable)
@@ -63,6 +59,11 @@
     }
     if (func == NULL) {
         if (errorCode != EGL_SUCCESS) {
+            // Since we have no vendor, the follow-up eglGetError() call will
+            // end up using the GLVND error code. Set it here.
+            if (vendor == NULL) {
+                exports->setEGLError(errorCode);
+            }
             _eglError(errorCode, __EGL_DISPATCH_FUNC_NAMES[index]);
         }
         return NULL;
diff --git a/src/egl/main/egllog.c b/src/egl/main/egllog.c
index 64ff474..c223f49 100644
--- a/src/egl/main/egllog.c
+++ b/src/egl/main/egllog.c
@@ -47,7 +47,11 @@
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "EGL-MAIN"
+#if ANDROID_API_LEVEL >= 26
+#include <log/log.h>
+#else
 #include <cutils/log.h>
+#endif /* use log/log.h start from android 8 major version */
 
 #endif /* HAVE_ANDROID_PLATFORM */
 
diff --git a/src/egl/meson.build b/src/egl/meson.build
index fc01b95..1e0b1d3 100644
--- a/src/egl/meson.build
+++ b/src/egl/meson.build
@@ -24,7 +24,7 @@
 c_args_for_egl = []
 link_for_egl = []
 deps_for_egl = []
-incs_for_egl = [inc_include, inc_src, inc_egl]
+incs_for_egl = [inc_include, inc_drm_uapi, inc_src, inc_egl]
 
 files_egl = files(
   'main/eglapi.c',
@@ -117,7 +117,7 @@
   incs_for_egl += [inc_loader]
 endif
 if with_platform_wayland
-  deps_for_egl += [dep_wayland_client, dep_wayland_server]
+  deps_for_egl += [dep_wayland_client, dep_wayland_server, dep_wayland_egl_headers]
   link_for_egl += libwayland_drm
   files_egl += files('drivers/dri2/platform_wayland.c')
   files_egl += [
@@ -125,9 +125,7 @@
     linux_dmabuf_unstable_v1_client_protocol_h,
     wayland_drm_client_protocol_h,
   ]
-  incs_for_egl += include_directories(
-    'wayland/wayland-egl', 'wayland/wayland-drm',
-  )
+  incs_for_egl += include_directories('wayland/wayland-drm')
 endif
 if with_platform_android
   deps_for_egl += dep_android
@@ -144,8 +142,6 @@
   deps_for_egl += cpp.find_library('be')
 endif
 
-# TODO: glvnd
-
 if cc.has_function('mincore')
   c_args_for_egl += '-DHAVE_MINCORE'
 endif
@@ -200,10 +196,6 @@
   extra_cflags : gl_pkgconfig_c_flags,
 )
 
-if with_platform_wayland
-  subdir('wayland/wayland-egl')
-endif
-
 if with_tests
   if with_glvnd
     # TODO: add glvnd symbol check
diff --git a/src/egl/wayland/wayland-egl/Makefile.am b/src/egl/wayland/wayland-egl/Makefile.am
deleted file mode 100644
index 31dcca9..0000000
--- a/src/egl/wayland/wayland-egl/Makefile.am
+++ /dev/null
@@ -1,24 +0,0 @@
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = wayland-egl.pc
-
-AM_CFLAGS = $(DEFINES) \
-	    $(VISIBILITY_CFLAGS) \
-	    $(WAYLAND_CLIENT_CFLAGS)
-
-lib_LTLIBRARIES = libwayland-egl.la
-noinst_HEADERS = wayland-egl-backend.h
-libwayland_egl_la_SOURCES = wayland-egl.c
-libwayland_egl_la_LDFLAGS = \
-	-no-undefined \
-	-version-info 1 \
-	$(GC_SECTIONS) \
-	$(LD_NO_UNDEFINED)
-
-TESTS = wayland-egl-symbols-check \
-        wayland-egl-abi-check
-
-EXTRA_DIST = wayland-egl-symbols-check meson.build
-
-check_PROGRAMS = wayland-egl-abi-check
-
-include $(top_srcdir)/install-lib-links.mk
diff --git a/src/egl/wayland/wayland-egl/meson.build b/src/egl/wayland/wayland-egl/meson.build
deleted file mode 100644
index d0a7521d..0000000
--- a/src/egl/wayland/wayland-egl/meson.build
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright © 2017 Intel Corporation
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-
-
-libwayland_egl = shared_library(
-  'wayland-egl',
-  'wayland-egl.c',
-  c_args : [c_vis_args],
-  link_args : ld_args_gc_sections,
-  dependencies : dep_wayland_client,
-  version : '1.0.0',
-  install : true,
-)
-
-pkg.generate(
-  name : 'wayland-egl',
-  description : 'Mesa wayland-egl library',
-  libraries : libwayland_egl,
-  version : meson.project_version(),
-  requires : 'wayland-client',
-)
-
-if with_tests
-  test('wayland-egl-symbols-check',
-    find_program('wayland-egl-symbols-check'),
-    env : env_test,
-    args : libwayland_egl
-  )
-  test(
-    'wayland-egl-abi-check',
-    executable('wayland-egl-abi-check', 'wayland-egl-abi-check.c')
-  )
-endif
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c b/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c
deleted file mode 100644
index 62c51a2..0000000
--- a/src/egl/wayland/wayland-egl/wayland-egl-abi-check.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/*
- * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- */
-
-#include <stddef.h> /* offsetof */
-#include <stdio.h>  /* printf */
-
-#include "wayland-egl-backend.h" /* Current struct wl_egl_window implementation */
-
-/*
- * Following are previous implementations of wl_egl_window.
- *
- * DO NOT EVER CHANGE!
- */
-
-/* From: 214fc6e850 - Benjamin Franzke : egl: Implement libwayland-egl */
-struct wl_egl_window_v0 {
-    struct wl_surface *surface;
-
-    int width;
-    int height;
-    int dx;
-    int dy;
-
-    int attached_width;
-    int attached_height;
-};
-
-/* From: ca3ed3e024 - Ander Conselvan de Oliveira : egl/wayland: Don't invalidate drawable on swap buffers */
-struct wl_egl_window_v1 {
-    struct wl_surface *surface;
-
-    int width;
-    int height;
-    int dx;
-    int dy;
-
-    int attached_width;
-    int attached_height;
-
-    void *private;
-    void (*resize_callback)(struct wl_egl_window *, void *);
-};
-
-/* From: 690ead4a13 - Stencel, Joanna : egl/wayland-egl: Fix for segfault in dri2_wl_destroy_surface. */
-#define WL_EGL_WINDOW_VERSION_v2 2
-struct wl_egl_window_v2 {
-    struct wl_surface *surface;
-
-    int width;
-    int height;
-    int dx;
-    int dy;
-
-    int attached_width;
-    int attached_height;
-
-    void *private;
-    void (*resize_callback)(struct wl_egl_window *, void *);
-    void (*destroy_window_callback)(void *);
-};
-
-/* From: 2d5d61bc49 - Miguel A. Vico : wayland-egl: Make wl_egl_window a versioned struct */
-#define WL_EGL_WINDOW_VERSION_v3 3
-struct wl_egl_window_v3 {
-    const intptr_t version;
-
-    int width;
-    int height;
-    int dx;
-    int dy;
-
-    int attached_width;
-    int attached_height;
-
-    void *private;
-    void (*resize_callback)(struct wl_egl_window *, void *);
-    void (*destroy_window_callback)(void *);
-
-    struct wl_surface *surface;
-};
-
-
-/* This program checks we keep a backwards-compatible struct wl_egl_window
- * definition whenever it is modified in wayland-egl-backend.h.
- *
- * The previous definition should be added above as a new struct
- * wl_egl_window_vN, and the appropriate checks should be added below
- */
-
-#define MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
-
-#define CHECK_RENAMED_MEMBER(a_ver, b_ver, a_member, b_member)                      \
-    do {                                                                            \
-        if (offsetof(struct wl_egl_window ## a_ver, a_member) !=                    \
-            offsetof(struct wl_egl_window ## b_ver, b_member)) {                    \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "offsetof(struct wl_egl_window" #a_ver "::" #a_member ") != "    \
-                   "offsetof(struct wl_egl_window" #b_ver "::" #b_member ")\n");    \
-            return 1;                                                               \
-        }                                                                           \
-                                                                                    \
-        if (MEMBER_SIZE(struct wl_egl_window ## a_ver, a_member) !=                 \
-            MEMBER_SIZE(struct wl_egl_window ## b_ver, b_member)) {                 \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "MEMBER_SIZE(struct wl_egl_window" #a_ver "::" #a_member ") != " \
-                   "MEMBER_SIZE(struct wl_egl_window" #b_ver "::" #b_member ")\n"); \
-            return 1;                                                               \
-        }                                                                           \
-    } while (0)
-
-#define CHECK_MEMBER(a_ver, b_ver, member) CHECK_RENAMED_MEMBER(a_ver, b_ver, member, member)
-#define CHECK_MEMBER_CURRENT(a_ver, member) CHECK_MEMBER(a_ver,, member)
-
-#define CHECK_SIZE(a_ver, b_ver)                                                    \
-    do {                                                                            \
-        if (sizeof(struct wl_egl_window ## a_ver) >                                 \
-            sizeof(struct wl_egl_window ## b_ver)) {                                \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "sizeof(struct wl_egl_window" #a_ver ") > "                      \
-                   "sizeof(struct wl_egl_window" #b_ver ")\n");                     \
-            return 1;                                                               \
-        }                                                                           \
-    } while (0)
-
-#define CHECK_SIZE_CURRENT(a_ver)                                                   \
-    do {                                                                            \
-        if (sizeof(struct wl_egl_window ## a_ver) !=                                \
-            sizeof(struct wl_egl_window)) {                                         \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "sizeof(struct wl_egl_window" #a_ver ") != "                     \
-                   "sizeof(struct wl_egl_window)\n");                               \
-            return 1;                                                               \
-        }                                                                           \
-    } while (0)
-
-#define CHECK_VERSION(a_ver, b_ver)                                                 \
-    do {                                                                            \
-        if ((WL_EGL_WINDOW_VERSION ## a_ver) >=                                     \
-            (WL_EGL_WINDOW_VERSION ## b_ver)) {                                     \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "WL_EGL_WINDOW_VERSION" #a_ver " >= "                            \
-                   "WL_EGL_WINDOW_VERSION" #b_ver "\n");                            \
-            return 1;                                                               \
-        }                                                                           \
-    } while (0)
-
-#define CHECK_VERSION_CURRENT(a_ver)                                                \
-    do {                                                                            \
-        if ((WL_EGL_WINDOW_VERSION ## a_ver) !=                                     \
-            (WL_EGL_WINDOW_VERSION)) {                                              \
-            printf("Backards incompatible change detected!\n   "                    \
-                   "WL_EGL_WINDOW_VERSION" #a_ver " != "                            \
-                   "WL_EGL_WINDOW_VERSION\n");                                      \
-            return 1;                                                               \
-        }                                                                           \
-    } while (0)
-
-int main(int argc, char **argv)
-{
-    /* Check wl_egl_window_v1 ABI against wl_egl_window_v0 */
-    CHECK_MEMBER(_v0, _v1, surface);
-    CHECK_MEMBER(_v0, _v1, width);
-    CHECK_MEMBER(_v0, _v1, height);
-    CHECK_MEMBER(_v0, _v1, dx);
-    CHECK_MEMBER(_v0, _v1, dy);
-    CHECK_MEMBER(_v0, _v1, attached_width);
-    CHECK_MEMBER(_v0, _v1, attached_height);
-
-    CHECK_SIZE(_v0, _v1);
-
-    /* Check wl_egl_window_v2 ABI against wl_egl_window_v1 */
-    CHECK_MEMBER(_v1, _v2, surface);
-    CHECK_MEMBER(_v1, _v2, width);
-    CHECK_MEMBER(_v1, _v2, height);
-    CHECK_MEMBER(_v1, _v2, dx);
-    CHECK_MEMBER(_v1, _v2, dy);
-    CHECK_MEMBER(_v1, _v2, attached_width);
-    CHECK_MEMBER(_v1, _v2, attached_height);
-    CHECK_MEMBER(_v1, _v2, private);
-    CHECK_MEMBER(_v1, _v2, resize_callback);
-
-    CHECK_SIZE(_v1, _v2);
-
-    /* Check wl_egl_window_v3 ABI against wl_egl_window_v2 */
-    CHECK_RENAMED_MEMBER(_v2, _v3, surface, version);
-    CHECK_MEMBER        (_v2, _v3, width);
-    CHECK_MEMBER        (_v2, _v3, height);
-    CHECK_MEMBER        (_v2, _v3, dx);
-    CHECK_MEMBER        (_v2, _v3, dy);
-    CHECK_MEMBER        (_v2, _v3, attached_width);
-    CHECK_MEMBER        (_v2, _v3, attached_height);
-    CHECK_MEMBER        (_v2, _v3, private);
-    CHECK_MEMBER        (_v2, _v3, resize_callback);
-    CHECK_MEMBER        (_v2, _v3, destroy_window_callback);
-
-    CHECK_SIZE   (_v2, _v3);
-    CHECK_VERSION(_v2, _v3);
-
-    /* Check current wl_egl_window ABI against wl_egl_window_v3 */
-    CHECK_MEMBER_CURRENT(_v3, version);
-    CHECK_MEMBER_CURRENT(_v3, width);
-    CHECK_MEMBER_CURRENT(_v3, height);
-    CHECK_MEMBER_CURRENT(_v3, dx);
-    CHECK_MEMBER_CURRENT(_v3, dy);
-    CHECK_MEMBER_CURRENT(_v3, attached_width);
-    CHECK_MEMBER_CURRENT(_v3, attached_height);
-    CHECK_MEMBER_CURRENT(_v3, private);
-    CHECK_MEMBER_CURRENT(_v3, resize_callback);
-    CHECK_MEMBER_CURRENT(_v3, destroy_window_callback);
-    CHECK_MEMBER_CURRENT(_v3, surface);
-
-    CHECK_SIZE_CURRENT   (_v3);
-    CHECK_VERSION_CURRENT(_v3);
-
-    return 0;
-}
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-backend.h b/src/egl/wayland/wayland-egl/wayland-egl-backend.h
deleted file mode 100644
index 82f025c..0000000
--- a/src/egl/wayland/wayland-egl/wayland-egl-backend.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright © 2011 Benjamin Franzke
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Benjamin Franzke <benjaminfranzke@googlemail.com>
- */
-
-#ifndef _WAYLAND_EGL_PRIV_H
-#define _WAYLAND_EGL_PRIV_H
-
-#include <stdint.h>
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define WL_EGL_WINDOW_VERSION 3
-
-struct wl_surface;
-
-struct wl_egl_window {
-	const intptr_t version;
-
-	int width;
-	int height;
-	int dx;
-	int dy;
-
-	int attached_width;
-	int attached_height;
-
-	void *private;
-	void (*resize_callback)(struct wl_egl_window *, void *);
-	void (*destroy_window_callback)(void *);
-
-	struct wl_surface *surface;
-};
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-symbols-check b/src/egl/wayland/wayland-egl/wayland-egl-symbols-check
deleted file mode 100755
index a5fab77..0000000
--- a/src/egl/wayland/wayland-egl/wayland-egl-symbols-check
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/sh
-set -eu
-
-LIB=${1-.libs/libwayland-egl.so}
-
-if ! [ -f "$LIB" ]
-then
-  exit 1
-fi
-
-FUNCS=$($NM -D --defined-only $LIB | grep -o "T .*" | cut -c 3- | while read func; do
-( grep -q "^$func$" || echo $func )  <<EOF
-wl_egl_window_resize
-wl_egl_window_create
-wl_egl_window_destroy
-wl_egl_window_get_attached_size
-_fini
-_init
-EOF
-done)
-
-test ! -n "$FUNCS" || echo $FUNCS
-test ! -n "$FUNCS"
-
diff --git a/src/egl/wayland/wayland-egl/wayland-egl.c b/src/egl/wayland/wayland-egl/wayland-egl.c
deleted file mode 100644
index e7cea89..0000000
--- a/src/egl/wayland/wayland-egl/wayland-egl.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright © 2011 Kristian Høgsberg
- * Copyright © 2011 Benjamin Franzke
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Kristian Høgsberg <krh@bitplanet.net>
- *    Benjamin Franzke <benjaminfranzke@googlemail.com>
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "wayland-egl.h"
-#include "wayland-egl-backend.h"
-
-/* GCC visibility */
-#if defined(__GNUC__)
-#define WL_EGL_EXPORT __attribute__ ((visibility("default")))
-#else
-#define WL_EGL_EXPORT
-#endif
-
-WL_EGL_EXPORT void
-wl_egl_window_resize(struct wl_egl_window *egl_window,
-		     int width, int height,
-		     int dx, int dy)
-{
-	if (width <= 0 || height <= 0)
-		return;
-
-	egl_window->width  = width;
-	egl_window->height = height;
-	egl_window->dx     = dx;
-	egl_window->dy     = dy;
-
-	if (egl_window->resize_callback)
-		egl_window->resize_callback(egl_window, egl_window->private);
-}
-
-WL_EGL_EXPORT struct wl_egl_window *
-wl_egl_window_create(struct wl_surface *surface,
-		     int width, int height)
-{
-	struct wl_egl_window *egl_window;
-
-	if (width <= 0 || height <= 0)
-		return NULL;
-
-	egl_window = calloc(1, sizeof *egl_window);
-	if (!egl_window)
-		return NULL;
-
-	/* Cast away the constness to set the version number.
-	 *
-	 * We want the const notation since it gives an explicit
-	 * feedback to the backend implementation, should it try to
-	 * change it.
-	 *
-	 * The latter in itself is not too surprising as these days APIs
-	 * tend to provide bidirectional version field.
-	 */
-	intptr_t *version = (intptr_t *)&egl_window->version;
-	*version = WL_EGL_WINDOW_VERSION;
-
-	egl_window->surface = surface;
-
-	egl_window->width  = width;
-	egl_window->height = height;
-
-	return egl_window;
-}
-
-WL_EGL_EXPORT void
-wl_egl_window_destroy(struct wl_egl_window *egl_window)
-{
-	if (egl_window->destroy_window_callback)
-		egl_window->destroy_window_callback(egl_window->private);
-	free(egl_window);
-}
-
-WL_EGL_EXPORT void
-wl_egl_window_get_attached_size(struct wl_egl_window *egl_window,
-				int *width, int *height)
-{
-	if (width)
-		*width = egl_window->attached_width;
-	if (height)
-		*height = egl_window->attached_height;
-}
diff --git a/src/egl/wayland/wayland-egl/wayland-egl.pc.in b/src/egl/wayland/wayland-egl/wayland-egl.pc.in
deleted file mode 100644
index 8a40cfa..0000000
--- a/src/egl/wayland/wayland-egl/wayland-egl.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: wayland-egl
-Description: Mesa wayland-egl library
-Version: @VERSION@
-Requires: wayland-client
-Libs: -L${libdir} -lwayland-egl
-Cflags: -I${includedir}
diff --git a/src/gallium/Automake.inc b/src/gallium/Automake.inc
index 3e21aa7..329c883 100644
--- a/src/gallium/Automake.inc
+++ b/src/gallium/Automake.inc
@@ -59,6 +59,12 @@
 	$(LIBDRM_LIBS)
 endif
 
+if HAVE_PLATFORM_ANDROID
+GALLIUM_COMMON_LIB_DEPS += \
+	$(ANDROID_LIBS) \
+	$(BACKTRACE_LIBS)
+endif
+
 GALLIUM_WINSYS_CFLAGS = \
 	-I$(top_srcdir)/src \
 	-I$(top_srcdir)/include \
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index c153a5d..e75c186 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -88,9 +88,9 @@
 SUBDIRS += drivers/vc4 winsys/vc4/drm
 endif
 
-## vc5
-if HAVE_GALLIUM_VC5
-SUBDIRS += drivers/vc5 winsys/vc5/drm
+## v3d
+if HAVE_GALLIUM_V3D
+SUBDIRS += drivers/v3d winsys/v3d/drm
 endif
 
 ## virgl
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index 6b048b8..4bfa764 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -13,6 +13,7 @@
 	$(MSVC2013_COMPAT_CFLAGS)
 
 AM_CXXFLAGS = \
+	$(GALLIUM_CFLAGS) \
 	$(VISIBILITY_CXXFLAGS) \
 	$(MSVC2013_COMPAT_CXXFLAGS)
 
@@ -21,6 +22,14 @@
 	$(NIR_SOURCES) \
 	$(GENERATED_SOURCES)
 
+if HAVE_PLATFORM_ANDROID
+# Android's libbacktrace headers required C++11, but the Android toolchain (at
+# least in the Chrome OS SDK) does not enable C++11 by default.
+AM_CXXFLAGS += $(CXX11_CXXFLAGS)
+
+libgallium_la_SOURCES += util/u_debug_stack_android.cpp
+endif
+
 if HAVE_LIBDRM
 
 AM_CFLAGS += \
@@ -37,7 +46,6 @@
 	$(LLVM_CFLAGS)
 
 AM_CXXFLAGS += \
-	$(GALLIUM_CFLAGS) \
 	$(LLVM_CXXFLAGS)
 
 libgallium_la_SOURCES += \
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 066746f..626cde1 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -256,6 +256,8 @@
 	util/u_fifo.h \
 	util/u_format.c \
 	util/u_format.h \
+	util/u_format_bptc.c \
+	util/u_format_bptc.h \
 	util/u_format_etc.c \
 	util/u_format_etc.h \
 	util/u_format_latc.c \
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index b240c93..4c1a76d 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -87,7 +87,7 @@
    return hash;
 }
 
-static void delete_blend_state(void *state, void *data)
+static void delete_blend_state(void *state, UNUSED void *data)
 {
    struct cso_blend *cso = (struct cso_blend *)state;
    if (cso->delete_state)
@@ -95,7 +95,7 @@
    FREE(state);
 }
 
-static void delete_depth_stencil_state(void *state, void *data)
+static void delete_depth_stencil_state(void *state, UNUSED void *data)
 {
    struct cso_depth_stencil_alpha *cso = (struct cso_depth_stencil_alpha *)state;
    if (cso->delete_state)
@@ -103,7 +103,7 @@
    FREE(state);
 }
 
-static void delete_sampler_state(void *state, void *data)
+static void delete_sampler_state(void *state, UNUSED void *data)
 {
    struct cso_sampler *cso = (struct cso_sampler *)state;
    if (cso->delete_state)
@@ -111,7 +111,7 @@
    FREE(state);
 }
 
-static void delete_rasterizer_state(void *state, void *data)
+static void delete_rasterizer_state(void *state, UNUSED void *data)
 {
    struct cso_rasterizer *cso = (struct cso_rasterizer *)state;
    if (cso->delete_state)
@@ -119,7 +119,7 @@
    FREE(state);
 }
 
-static void delete_velements(void *state, void *data)
+static void delete_velements(void *state, UNUSED void *data)
 {
    struct cso_velements *cso = (struct cso_velements *)state;
    if (cso->delete_state)
@@ -163,7 +163,7 @@
 
 
 static inline void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type,
-                               int max_size, void *user_data)
+			       int max_size, UNUSED void *user_data)
 {
    /* if we're approach the maximum size, remove fourth of the entries
     * otherwise every subsequent call will go through the same */
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 3a3a63a..97cb77e 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -87,9 +87,8 @@
     */
    int max_sampler_seen;
 
-   struct pipe_vertex_buffer aux_vertex_buffer_current;
-   struct pipe_vertex_buffer aux_vertex_buffer_saved;
-   unsigned aux_vertex_buffer_index;
+   struct pipe_vertex_buffer vertex_buffer0_current;
+   struct pipe_vertex_buffer vertex_buffer0_saved;
 
    struct pipe_constant_buffer aux_constbuf_current[PIPE_SHADER_TYPES];
    struct pipe_constant_buffer aux_constbuf_saved[PIPE_SHADER_TYPES];
@@ -161,7 +160,7 @@
    return TRUE;
 }
 
-static boolean delete_sampler_state(struct cso_context *ctx, void *state)
+static boolean delete_sampler_state(UNUSED struct cso_context *ctx, void *state)
 {
    struct cso_sampler *cso = (struct cso_sampler *)state;
    if (cso->delete_state)
@@ -291,8 +290,7 @@
 
    /* Install u_vbuf if there is anything unsupported. */
    if (u_vbuf_get_caps(cso->pipe->screen, &caps, flags)) {
-      cso->vbuf = u_vbuf_create(cso->pipe, &caps,
-                                cso->aux_vertex_buffer_index);
+      cso->vbuf = u_vbuf_create(cso->pipe, &caps);
    }
 }
 
@@ -313,8 +311,6 @@
    ctx->pipe = pipe;
    ctx->sample_mask = ~0;
 
-   ctx->aux_vertex_buffer_index = 0; /* 0 for now */
-
    cso_init_vbuf(ctx, u_vbuf_flags);
 
    /* Enable for testing: */
@@ -417,8 +413,8 @@
    util_unreference_framebuffer_state(&ctx->fb);
    util_unreference_framebuffer_state(&ctx->fb_saved);
 
-   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_current);
-   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_saved);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_saved);
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       pipe_resource_reference(&ctx->aux_constbuf_current[i].buffer, NULL);
@@ -1151,6 +1147,9 @@
 {
    struct u_vbuf *vbuf = ctx->vbuf;
 
+   if (!count)
+      return;
+
    if (vbuf) {
       u_vbuf_set_vertex_buffers(vbuf, start_slot, count, buffers);
       return;
@@ -1158,15 +1157,12 @@
 
    /* Save what's in the auxiliary slot, so that we can save and restore it
     * for meta ops. */
-   if (start_slot <= ctx->aux_vertex_buffer_index &&
-       start_slot+count > ctx->aux_vertex_buffer_index) {
+   if (start_slot == 0) {
       if (buffers) {
-         const struct pipe_vertex_buffer *vb =
-               buffers + (ctx->aux_vertex_buffer_index - start_slot);
-
-         pipe_vertex_buffer_reference(&ctx->aux_vertex_buffer_current, vb);
+         pipe_vertex_buffer_reference(&ctx->vertex_buffer0_current,
+                                      buffers);
       } else {
-         pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_current);
+         pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_current);
       }
    }
 
@@ -1174,40 +1170,33 @@
 }
 
 static void
-cso_save_aux_vertex_buffer_slot(struct cso_context *ctx)
+cso_save_vertex_buffer0(struct cso_context *ctx)
 {
    struct u_vbuf *vbuf = ctx->vbuf;
 
    if (vbuf) {
-      u_vbuf_save_aux_vertex_buffer_slot(vbuf);
+      u_vbuf_save_vertex_buffer0(vbuf);
       return;
    }
 
-   pipe_vertex_buffer_reference(&ctx->aux_vertex_buffer_saved,
-                                &ctx->aux_vertex_buffer_current);
+   pipe_vertex_buffer_reference(&ctx->vertex_buffer0_saved,
+                                &ctx->vertex_buffer0_current);
 }
 
 static void
-cso_restore_aux_vertex_buffer_slot(struct cso_context *ctx)
+cso_restore_vertex_buffer0(struct cso_context *ctx)
 {
    struct u_vbuf *vbuf = ctx->vbuf;
 
    if (vbuf) {
-      u_vbuf_restore_aux_vertex_buffer_slot(vbuf);
+      u_vbuf_restore_vertex_buffer0(vbuf);
       return;
    }
 
-   cso_set_vertex_buffers(ctx, ctx->aux_vertex_buffer_index, 1,
-                          &ctx->aux_vertex_buffer_saved);
-   pipe_vertex_buffer_unreference(&ctx->aux_vertex_buffer_saved);
+   cso_set_vertex_buffers(ctx, 0, 1, &ctx->vertex_buffer0_saved);
+   pipe_vertex_buffer_unreference(&ctx->vertex_buffer0_saved);
 }
 
-unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx)
-{
-   return ctx->aux_vertex_buffer_index;
-}
-
-
 
 void
 cso_single_sampler(struct cso_context *ctx, enum pipe_shader_type shader_stage,
@@ -1595,7 +1584,7 @@
    cso->saved_state = state_mask;
 
    if (state_mask & CSO_BIT_AUX_VERTEX_BUFFER_SLOT)
-      cso_save_aux_vertex_buffer_slot(cso);
+      cso_save_vertex_buffer0(cso);
    if (state_mask & CSO_BIT_BLEND)
       cso_save_blend(cso);
    if (state_mask & CSO_BIT_DEPTH_STENCIL_ALPHA)
@@ -1650,7 +1639,7 @@
    assert(state_mask);
 
    if (state_mask & CSO_BIT_AUX_VERTEX_BUFFER_SLOT)
-      cso_restore_aux_vertex_buffer_slot(cso);
+      cso_restore_vertex_buffer0(cso);
    if (state_mask & CSO_BIT_BLEND)
       cso_restore_blend(cso);
    if (state_mask & CSO_BIT_DEPTH_STENCIL_ALPHA)
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 3a4e808..d3501fb 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -87,11 +87,6 @@
                             unsigned start_slot, unsigned count,
                             const struct pipe_vertex_buffer *buffers);
 
-/* One vertex buffer slot is provided with the save/restore functionality.
- * cso_context chooses the slot, it can be non-zero. */
-unsigned cso_get_aux_vertex_buffer_slot(struct cso_context *ctx);
-
-
 void cso_set_stream_outputs(struct cso_context *ctx,
                             unsigned num_targets,
                             struct pipe_stream_output_target **targets,
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 4cfa54b..2a9c944 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -47,11 +47,6 @@
 /** Set to 1 to enable printing of coords before/after clipping */
 #define DEBUG_CLIP 0
 
-
-#ifndef DIFFERENT_SIGNS
-#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F)
-#endif
-
 #define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1)
 
 
@@ -215,30 +210,6 @@
 }
 
 /**
- * Checks whether the specified triangle is empty and if it is returns
- * true, otherwise returns false.
- * Triangle is considered null/empty if its area is equal to zero.
- */
-static inline boolean
-is_tri_null(const struct clip_stage *clip, const struct prim_header *header)
-{
-   const unsigned pos_attr = clip->pos_attr;
-   float x1 = header->v[1]->data[pos_attr][0] - header->v[0]->data[pos_attr][0];
-   float y1 = header->v[1]->data[pos_attr][1] - header->v[0]->data[pos_attr][1];
-   float z1 = header->v[1]->data[pos_attr][2] - header->v[0]->data[pos_attr][2];
-
-   float x2 = header->v[2]->data[pos_attr][0] - header->v[0]->data[pos_attr][0];
-   float y2 = header->v[2]->data[pos_attr][1] - header->v[0]->data[pos_attr][1];
-   float z2 = header->v[2]->data[pos_attr][2] - header->v[0]->data[pos_attr][2];
-
-   float vx = y1 * z2 - z1 * y2;
-   float vy = x1 * z2 - z1 * x2;
-   float vz = x1 * y2 - y1 * x2;
-
-   return (vx*vx  + vy*vy + vz*vz) == 0.f;
-}
-
-/**
  * Emit a post-clip polygon to the next pipeline stage.  The polygon
  * will be convex and the provoking vertex will always be vertex[0].
  */
@@ -252,8 +223,6 @@
    struct prim_header header;
    unsigned i;
    ushort edge_first, edge_middle, edge_last;
-   boolean last_tri_was_null = FALSE;
-   boolean tri_was_not_null = FALSE;
 
    if (stage->draw->rasterizer->flatshade_first) {
       edge_first  = DRAW_PIPE_EDGE_FLAG_0;
@@ -275,7 +244,6 @@
    header.pad = 0;
 
    for (i = 2; i < n; i++, header.flags = edge_middle) {
-      boolean tri_null;
       /* order the triangle verts to respect the provoking vertex mode */
       if (stage->draw->rasterizer->flatshade_first) {
          header.v[0] = inlist[0];  /* the provoking vertex */
@@ -288,19 +256,6 @@
          header.v[2] = inlist[0];  /* the provoking vertex */
       }
 
-      tri_null = is_tri_null(clipper, &header);
-      /* If we generated a triangle with an area, aka. non-null triangle,
-       * or if the previous triangle was also null then skip all subsequent
-       * null triangles */
-      if ((tri_was_not_null && tri_null) || (last_tri_was_null && tri_null)) {
-         last_tri_was_null = tri_null;
-         continue;
-      }
-      last_tri_was_null = tri_null;
-      if (!tri_null) {
-         tri_was_not_null = TRUE;
-      }
-
       if (!edgeflags[i-1]) {
          header.flags &= ~edge_middle;
       }
@@ -480,6 +435,7 @@
       for (i = 1; i <= n; i++) {
          struct vertex_header *vert = inlist[i];
          boolean *edge = &inEdges[i];
+         boolean different_sign;
 
          float dp = getclipdist(clipper, vert, plane_idx);
 
@@ -492,9 +448,12 @@
                return;
             outEdges[outcount] = *edge_prev;
             outlist[outcount++] = vert_prev;
+            different_sign = dp < 0.0f;
+         } else {
+            different_sign = !(dp < 0.0f);
          }
 
-         if (DIFFERENT_SIGNS(dp, dp_prev)) {
+         if (different_sign) {
             struct vertex_header *new_vert;
             boolean *new_edge;
 
@@ -512,7 +471,7 @@
 
             if (dp < 0.0f) {
                /* Going out of bounds.  Avoid division by zero as we
-                * know dp != dp_prev from DIFFERENT_SIGNS, above.
+                * know dp != dp_prev from different_sign, above.
                 */
                float t = dp / (dp - dp_prev);
                interp( clipper, new_vert, t, vert, vert_prev, viewport_index );
diff --git a/src/gallium/auxiliary/driver_ddebug/dd_public.h b/src/gallium/auxiliary/driver_ddebug/dd_public.h
index e660765..31c139d 100644
--- a/src/gallium/auxiliary/driver_ddebug/dd_public.h
+++ b/src/gallium/auxiliary/driver_ddebug/dd_public.h
@@ -30,7 +30,15 @@
 
 struct pipe_screen;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen *
 ddebug_screen_create(struct pipe_screen *screen);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* DD_PUBLIC_H_ */
diff --git a/src/gallium/auxiliary/driver_ddebug/dd_screen.c b/src/gallium/auxiliary/driver_ddebug/dd_screen.c
index 5f922d8..a89af8a 100644
--- a/src/gallium/auxiliary/driver_ddebug/dd_screen.c
+++ b/src/gallium/auxiliary/driver_ddebug/dd_screen.c
@@ -147,12 +147,13 @@
                               enum pipe_format format,
                               enum pipe_texture_target target,
                               unsigned sample_count,
+                              unsigned storage_sample_count,
                               unsigned tex_usage)
 {
    struct pipe_screen *screen = dd_screen(_screen)->screen;
 
    return screen->is_format_supported(screen, format, target, sample_count,
-                                      tex_usage);
+                                      storage_sample_count, tex_usage);
 }
 
 static boolean
diff --git a/src/gallium/auxiliary/driver_noop/noop_pipe.c b/src/gallium/auxiliary/driver_noop/noop_pipe.c
index d1e795d..7de3e88 100644
--- a/src/gallium/auxiliary/driver_noop/noop_pipe.c
+++ b/src/gallium/auxiliary/driver_noop/noop_pipe.c
@@ -307,6 +307,11 @@
    return true;
 }
 
+static void noop_invalidate_resource(struct pipe_context *ctx,
+                                     struct pipe_resource *resource)
+{
+}
+
 static struct pipe_context *noop_create_context(struct pipe_screen *screen,
                                                 void *priv, unsigned flags)
 {
@@ -345,6 +350,7 @@
    ctx->transfer_unmap = noop_transfer_unmap;
    ctx->buffer_subdata = noop_buffer_subdata;
    ctx->texture_subdata = noop_texture_subdata;
+   ctx->invalidate_resource = noop_invalidate_resource;
    noop_init_state_functions(ctx);
 
    return ctx;
@@ -414,11 +420,13 @@
                                         enum pipe_format format,
                                         enum pipe_texture_target target,
                                         unsigned sample_count,
+                                        unsigned storage_sample_count,
                                         unsigned usage)
 {
    struct pipe_screen *screen = ((struct noop_pipe_screen*)pscreen)->oscreen;
 
-   return screen->is_format_supported(screen, format, target, sample_count, usage);
+   return screen->is_format_supported(screen, format, target, sample_count,
+                                      storage_sample_count, usage);
 }
 
 static uint64_t noop_get_timestamp(struct pipe_screen *pscreen)
diff --git a/src/gallium/auxiliary/driver_noop/noop_public.h b/src/gallium/auxiliary/driver_noop/noop_public.h
index 180ea59..46a7f40 100644
--- a/src/gallium/auxiliary/driver_noop/noop_public.h
+++ b/src/gallium/auxiliary/driver_noop/noop_public.h
@@ -23,7 +23,15 @@
 #ifndef NOOP_PUBLIC_H
 #define NOOP_PUBLIC_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen;
 struct pipe_screen *noop_screen_create(struct pipe_screen *screen);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/gallium/auxiliary/driver_rbug/rbug_screen.c b/src/gallium/auxiliary/driver_rbug/rbug_screen.c
index a1a77ad..693e7fa 100644
--- a/src/gallium/auxiliary/driver_rbug/rbug_screen.c
+++ b/src/gallium/auxiliary/driver_rbug/rbug_screen.c
@@ -124,6 +124,7 @@
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
                                 unsigned sample_count,
+                                unsigned storage_sample_count,
                                 unsigned tex_usage)
 {
    struct rbug_screen *rb_screen = rbug_screen(_screen);
@@ -133,6 +134,7 @@
                                       format,
                                       target,
                                       sample_count,
+                                      storage_sample_count,
                                       tex_usage);
 }
 
diff --git a/src/gallium/auxiliary/driver_trace/tr_context.c b/src/gallium/auxiliary/driver_trace/tr_context.c
index 6d918d4..dc091ae 100644
--- a/src/gallium/auxiliary/driver_trace/tr_context.c
+++ b/src/gallium/auxiliary/driver_trace/tr_context.c
@@ -1430,35 +1430,59 @@
        */
 
       struct pipe_resource *resource = transfer->resource;
-      unsigned level = transfer->level;
       unsigned usage = transfer->usage;
       const struct pipe_box *box = &transfer->box;
       unsigned stride = transfer->stride;
       unsigned layer_stride = transfer->layer_stride;
 
-      if (resource->target == PIPE_BUFFER)
+      if (resource->target == PIPE_BUFFER) {
+         unsigned offset = box->x;
+         unsigned size = box->width;
+
          trace_dump_call_begin("pipe_context", "buffer_subdata");
-      else
+
+         trace_dump_arg(ptr, context);
+         trace_dump_arg(ptr, resource);
+         trace_dump_arg(uint, usage);
+         trace_dump_arg(uint, offset);
+         trace_dump_arg(uint, size);
+
+         trace_dump_arg_begin("data");
+         trace_dump_box_bytes(tr_trans->map,
+                              resource,
+                              box,
+                              stride,
+                              layer_stride);
+         trace_dump_arg_end();
+
+         trace_dump_arg(uint, stride);
+         trace_dump_arg(uint, layer_stride);
+
+         trace_dump_call_end();
+      } else {
+         unsigned level = transfer->level;
+
          trace_dump_call_begin("pipe_context", "texture_subdata");
 
-      trace_dump_arg(ptr, context);
-      trace_dump_arg(ptr, resource);
-      trace_dump_arg(uint, level);
-      trace_dump_arg(uint, usage);
-      trace_dump_arg(box, box);
+         trace_dump_arg(ptr, context);
+         trace_dump_arg(ptr, resource);
+         trace_dump_arg(uint, level);
+         trace_dump_arg(uint, usage);
+         trace_dump_arg(box, box);
 
-      trace_dump_arg_begin("data");
-      trace_dump_box_bytes(tr_trans->map,
-                           resource,
-                           box,
-                           stride,
-                           layer_stride);
-      trace_dump_arg_end();
+         trace_dump_arg_begin("data");
+         trace_dump_box_bytes(tr_trans->map,
+                              resource,
+                              box,
+                              stride,
+                              layer_stride);
+         trace_dump_arg_end();
 
-      trace_dump_arg(uint, stride);
-      trace_dump_arg(uint, layer_stride);
+         trace_dump_arg(uint, stride);
+         trace_dump_arg(uint, layer_stride);
 
-      trace_dump_call_end();
+         trace_dump_call_end();
+      }
 
       tr_trans->map = NULL;
    }
diff --git a/src/gallium/auxiliary/driver_trace/tr_dump_state.c b/src/gallium/auxiliary/driver_trace/tr_dump_state.c
index e7e3223..46fa574 100644
--- a/src/gallium/auxiliary/driver_trace/tr_dump_state.c
+++ b/src/gallium/auxiliary/driver_trace/tr_dump_state.c
@@ -69,6 +69,7 @@
 
    trace_dump_member(uint, templat, last_level);
    trace_dump_member(uint, templat, nr_samples);
+   trace_dump_member(uint, templat, nr_storage_samples);
    trace_dump_member(uint, templat, usage);
    trace_dump_member(uint, templat, bind);
    trace_dump_member(uint, templat, flags);
@@ -724,7 +725,7 @@
    if (!trace_dumping_enabled_locked())
       return;
 
-   if(!state) {
+   if (!state || !state->resource) {
       trace_dump_null();
       return;
    }
diff --git a/src/gallium/auxiliary/driver_trace/tr_screen.c b/src/gallium/auxiliary/driver_trace/tr_screen.c
index 704b2a3..b5bd3e1 100644
--- a/src/gallium/auxiliary/driver_trace/tr_screen.c
+++ b/src/gallium/auxiliary/driver_trace/tr_screen.c
@@ -225,6 +225,7 @@
                                  enum pipe_format format,
                                  enum pipe_texture_target target,
                                  unsigned sample_count,
+                                 unsigned storage_sample_count,
                                  unsigned tex_usage)
 {
    struct trace_screen *tr_scr = trace_screen(_screen);
@@ -240,7 +241,7 @@
    trace_dump_arg(uint, tex_usage);
 
    result = screen->is_format_supported(screen, format, target, sample_count,
-                                        tex_usage);
+                                        storage_sample_count, tex_usage);
 
    trace_dump_ret(bool, result);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index dae9d01..1f0a01c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -137,13 +137,26 @@
    }
 
    if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) {
-      /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
-       * but there are more on SVN.
-       * TODO: Add more passes.
+      /*
+       * TODO: Evaluate passes some more - keeping in mind
+       * both quality of generated code and compile times.
+       */
+      /*
+       * NOTE: if you change this, don't forget to change the output
+       * with GALLIVM_DEBUG_DUMP_BC in gallivm_compile_module.
        */
       LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
-      LLVMAddLICMPass(gallivm->passmgr);
+      LLVMAddEarlyCSEPass(gallivm->passmgr);
       LLVMAddCFGSimplificationPass(gallivm->passmgr);
+      /*
+       * FIXME: LICM is potentially quite useful. However, for some
+       * rather crazy shaders the compile time can reach _hours_ per shader,
+       * due to licm implying lcssa (since llvm 3.5), which can take forever.
+       * Even for sane shaders, the cost of licm is rather high (and not just
+       * due to lcssa, licm itself too), though mostly only in cases when it
+       * can actually move things, so having to disable it is a pity.
+       * LLVMAddLICMPass(gallivm->passmgr);
+       */
       LLVMAddReassociatePass(gallivm->passmgr);
       LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr);
       LLVMAddConstantPropagationPass(gallivm->passmgr);
@@ -568,6 +581,22 @@
       gallivm->builder = NULL;
    }
 
+   /* Dump bitcode to a file */
+   if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) {
+      char filename[256];
+      assert(gallivm->module_name);
+      util_snprintf(filename, sizeof(filename), "ir_%s.bc", gallivm->module_name);
+      LLVMWriteBitcodeToFile(gallivm->module, filename);
+      debug_printf("%s written\n", filename);
+      debug_printf("Invoke as \"opt %s %s | llc -O%d %s%s\"\n",
+                   gallivm_debug & GALLIVM_DEBUG_NO_OPT ? "-mem2reg" :
+                   "-sroa -early-cse -simplifycfg -reassociate "
+                   "-mem2reg -constprop -instcombine -gvn",
+                   filename, gallivm_debug & GALLIVM_DEBUG_NO_OPT ? 0 : 2,
+                   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
+                   "[-mattr=<-mattr option(s)>]");
+   }
+
    if (gallivm_debug & GALLIVM_DEBUG_PERF)
       time_begin = os_time_get();
 
@@ -595,25 +624,12 @@
 
    if (gallivm_debug & GALLIVM_DEBUG_PERF) {
       int64_t time_end = os_time_get();
-      int time_msec = (int)(time_end - time_begin) / 1000;
+      int time_msec = (int)((time_end - time_begin) / 1000);
       assert(gallivm->module_name);
       debug_printf("optimizing module %s took %d msec\n",
                    gallivm->module_name, time_msec);
    }
 
-   /* Dump byte code to a file */
-   if (gallivm_debug & GALLIVM_DEBUG_DUMP_BC) {
-      char filename[256];
-      assert(gallivm->module_name);
-      util_snprintf(filename, sizeof(filename), "ir_%s.bc", gallivm->module_name);
-      LLVMWriteBitcodeToFile(gallivm->module, filename);
-      debug_printf("%s written\n", filename);
-      debug_printf("Invoke as \"llc %s%s -o - %s\"\n",
-                   (HAVE_LLVM >= 0x0305) ? "[-mcpu=<-mcpu option>] " : "",
-                   "[-mattr=<-mattr option(s)>]",
-                   filename);
-   }
-
    if (use_mcjit) {
       /* Setting the module's DataLayout to an empty string will cause the
        * ExecutionEngine to copy to the DataLayout string from its target
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index c7755bf..7b66b75 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -143,6 +143,8 @@
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_SCALAR_ISA:
+      return 1;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index e411f90..83d7dbe 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -741,7 +741,8 @@
 
    assert(lp_check_value(bld_store->type, val));
    assert(LLVMGetTypeKind(LLVMTypeOf(dst_ptr)) == LLVMPointerTypeKind);
-   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val));
+   assert(LLVMGetElementType(LLVMTypeOf(dst_ptr)) == LLVMTypeOf(val) ||
+          LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(dst_ptr))) == LLVMArrayTypeKind);
 
    if (exec_mask) {
       LLVMValueRef res, dst;
@@ -852,7 +853,14 @@
 
    if (bld->indirect_files & (1 << file)) {
       LLVMValueRef lindex = lp_build_const_int32(bld->bld_base.base.gallivm, index * 4 + chan);
-      return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+      if (LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(var_of_array))) == LLVMArrayTypeKind) {
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(bld->bld_base.base.gallivm, 0);
+         gep[1] = lindex;
+         return LLVMBuildGEP(builder, var_of_array, gep, 2, "");
+      } else {
+         return LLVMBuildGEP(builder, var_of_array, &lindex, 1, "");
+      }
    }
    else {
       assert(index <= bld->bld_base.info->file_max[file]);
@@ -1352,21 +1360,20 @@
          /* Gather values from the immediate register array */
          res = build_gather(bld_base, imms_array, index_vec, NULL, index_vec2);
       } else {
-         LLVMValueRef lindex = lp_build_const_int32(gallivm,
-                                        reg->Register.Index * 4 + swizzle);
-         LLVMValueRef imms_ptr =  LLVMBuildGEP(builder,
-                                                bld->imms_array, &lindex, 1, "");
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
+         gep[1] = lp_build_const_int32(gallivm, reg->Register.Index * 4 + swizzle);
+         LLVMValueRef imms_ptr = LLVMBuildGEP(builder,
+                                              bld->imms_array, gep, 2, "");
          res = LLVMBuildLoad(builder, imms_ptr, "");
 
          if (tgsi_type_is_64bit(stype)) {
-            LLVMValueRef lindex1;
             LLVMValueRef imms_ptr2;
             LLVMValueRef res2;
-
-            lindex1 = lp_build_const_int32(gallivm,
-                                           reg->Register.Index * 4 + swizzle + 1);
+            gep[1] = lp_build_const_int32(gallivm,
+                                          reg->Register.Index * 4 + swizzle + 1);
             imms_ptr2 = LLVMBuildGEP(builder,
-                                      bld->imms_array, &lindex1, 1, "");
+                                     bld->imms_array, gep, 2, "");
             res2 = LLVMBuildLoad(builder, imms_ptr2, "");
             res = emit_fetch_64bit(bld_base, stype, res, res2);
          }
@@ -2957,13 +2964,14 @@
       unsigned index = bld->num_immediates;
       struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
       LLVMBuilderRef builder = gallivm->builder;
+      LLVMValueRef gep[2];
+      gep[0] = lp_build_const_int32(gallivm, 0);
 
       assert(bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE));
       for (i = 0; i < 4; ++i ) {
-         LLVMValueRef lindex = lp_build_const_int32(
-                  bld->bld_base.base.gallivm, index * 4 + i);
+         gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
          LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
-                                             bld->imms_array, &lindex, 1, "");
+                                             bld->imms_array, gep, 2, "");
          LLVMBuildStore(builder, imms[i], imm_ptr);
       }
    } else {
@@ -2979,11 +2987,12 @@
          unsigned index = bld->num_immediates;
          struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
          LLVMBuilderRef builder = gallivm->builder;
+         LLVMValueRef gep[2];
+         gep[0] = lp_build_const_int32(gallivm, 0);
          for (i = 0; i < 4; ++i ) {
-            LLVMValueRef lindex = lp_build_const_int32(
-                     bld->bld_base.base.gallivm, index * 4 + i);
+            gep[1] = lp_build_const_int32(gallivm, index * 4 + i);
             LLVMValueRef imm_ptr = LLVMBuildGEP(builder,
-                                                bld->imms_array, &lindex, 1, "");
+                                                bld->imms_array, gep, 2, "");
             LLVMBuildStore(builder,
                            bld->immediates[index][i],
                            imm_ptr);
@@ -3649,12 +3658,10 @@
    struct gallivm_state * gallivm = bld_base->base.gallivm;
 
    if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4);
-      bld->temps_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
-                                              "temp_array");
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_TEMPORARY] * 4 + 4;
+      bld->temps_array = lp_build_alloca_undef(gallivm,
+                                               LLVMArrayType(bld_base->base.vec_type, array_size),
+                                               "temp_array");
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_OUTPUT)) {
@@ -3667,11 +3674,9 @@
    }
 
    if (bld->indirect_files & (1 << TGSI_FILE_IMMEDIATE)) {
-      LLVMValueRef array_size =
-         lp_build_const_int32(gallivm,
-                         bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4);
-      bld->imms_array = lp_build_array_alloca(gallivm,
-                                              bld_base->base.vec_type, array_size,
+      unsigned array_size = bld_base->info->file_max[TGSI_FILE_IMMEDIATE] * 4 + 4;
+      bld->imms_array = lp_build_alloca_undef(gallivm,
+                                              LLVMArrayType(bld_base->base.vec_type, array_size),
                                               "imms_array");
    }
 
diff --git a/src/gallium/auxiliary/hud/font.c b/src/gallium/auxiliary/hud/font.c
index 9fb9d7e..88b0349 100644
--- a/src/gallium/auxiliary/hud/font.c
+++ b/src/gallium/auxiliary/hud/font.c
@@ -390,7 +390,7 @@
 
    for (i = 0; i < ARRAY_SIZE(formats); i++) {
       if (screen->is_format_supported(screen, formats[i],
-                                   PIPE_TEXTURE_RECT, 0,
+                                   PIPE_TEXTURE_RECT, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW)) {
          tex_format = formats[i];
          break;
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 1baaabb..3dd7c10 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -90,8 +90,7 @@
    hud->constants.scale[1] = yscale;
    cso_set_constant_buffer(cso, PIPE_SHADER_VERTEX, 0, &hud->constbuf);
 
-   cso_set_vertex_buffers(cso, cso_get_aux_vertex_buffer_slot(cso),
-                          1, &hud->color_prims.vbuf);
+   cso_set_vertex_buffers(cso, 0, 1, &hud->color_prims.vbuf);
    cso_set_fragment_shader_handle(hud->cso, hud->fs_color);
    cso_draw_arrays(cso, prim, 0, num_vertices);
 
@@ -561,16 +560,14 @@
       hud->constants.scale[1] = 1;
 
       cso_set_constant_buffer(cso, PIPE_SHADER_VERTEX, 0, &hud->constbuf);
-      cso_set_vertex_buffers(cso, cso_get_aux_vertex_buffer_slot(cso), 1,
-                             &hud->bg.vbuf);
+      cso_set_vertex_buffers(cso, 0, 1, &hud->bg.vbuf);
       cso_draw_arrays(cso, PIPE_PRIM_QUADS, 0, hud->bg.num_vertices);
    }
    pipe_resource_reference(&hud->bg.vbuf.buffer.resource, NULL);
 
    /* draw accumulated vertices for text */
    if (hud->text.num_vertices) {
-      cso_set_vertex_buffers(cso, cso_get_aux_vertex_buffer_slot(cso), 1,
-                             &hud->text.vbuf);
+      cso_set_vertex_buffers(cso, 0, 1, &hud->text.vbuf);
       cso_set_fragment_shader_handle(hud->cso, hud->fs_text);
       cso_draw_arrays(cso, PIPE_PRIM_QUADS, 0, hud->text.num_vertices);
    }
@@ -598,8 +595,7 @@
    cso_set_constant_buffer(cso, PIPE_SHADER_VERTEX, 0, &hud->constbuf);
 
    if (hud->whitelines.num_vertices) {
-      cso_set_vertex_buffers(cso, cso_get_aux_vertex_buffer_slot(cso), 1,
-                             &hud->whitelines.vbuf);
+      cso_set_vertex_buffers(cso, 0, 1, &hud->whitelines.vbuf);
       cso_set_fragment_shader_handle(hud->cso, hud->fs_color);
       cso_draw_arrays(cso, PIPE_PRIM_LINES, 0, hud->whitelines.num_vertices);
    }
@@ -1218,6 +1214,8 @@
    }
 
    while ((num = parse_string(env, name_a)) != 0) {
+      bool added = true;
+
       env += num;
 
       /* check for explicit location, size and etc. settings */
@@ -1251,6 +1249,9 @@
       if (strcmp(name, "fps") == 0) {
          hud_fps_graph_install(pane);
       }
+      else if (strcmp(name, "frametime") == 0) {
+         hud_frametime_graph_install(pane);
+      }
       else if (strcmp(name, "cpu") == 0) {
          hud_cpu_graph_install(pane, ALL_CPUS);
       }
@@ -1387,6 +1388,7 @@
                                           screen, name)) {
                fprintf(stderr, "gallium_hud: unknown driver query '%s'\n", name);
                fflush(stderr);
+               added = false;
             }
          }
       }
@@ -1429,7 +1431,7 @@
          env += num;
 
          strip_hyphens(s);
-         if (!LIST_IS_EMPTY(&pane->graph_list)) {
+         if (added && !LIST_IS_EMPTY(&pane->graph_list)) {
             struct hud_graph *graph;
             graph = LIST_ENTRY(struct hud_graph, pane->graph_list.prev, head);
             strncpy(graph->name, s, sizeof(graph->name)-1);
@@ -1561,6 +1563,7 @@
    puts("");
    puts("  Available names:");
    puts("    fps");
+   puts("    frametime");
    puts("    cpu");
 
    for (i = 0; i < num_cpus; i++)
@@ -1839,7 +1842,7 @@
    hud->refcount = 1;
    hud->has_srgb = screen->is_format_supported(screen,
                                                PIPE_FORMAT_B8G8R8A8_SRGB,
-                                               PIPE_TEXTURE_2D, 0,
+                                               PIPE_TEXTURE_2D, 0, 0,
                                                PIPE_BIND_RENDER_TARGET) != 0;
 
    /* blend state */
@@ -1868,7 +1871,7 @@
    for (i = 0; i < 2; i++) {
       hud->velems[i].src_offset = i * 2 * sizeof(float);
       hud->velems[i].src_format = PIPE_FORMAT_R32G32_FLOAT;
-      hud->velems[i].vertex_buffer_index = cso_get_aux_vertex_buffer_slot(cso);
+      hud->velems[i].vertex_buffer_index = 0;
    }
 
    /* sampler state (for font drawing) */
diff --git a/src/gallium/auxiliary/hud/hud_fps.c b/src/gallium/auxiliary/hud/hud_fps.c
index c8438d0..29110f5 100644
--- a/src/gallium/auxiliary/hud/hud_fps.c
+++ b/src/gallium/auxiliary/hud/hud_fps.c
@@ -33,6 +33,7 @@
 #include "util/u_memory.h"
 
 struct fps_info {
+   boolean frametime;
    int frames;
    uint64_t last_time;
 };
@@ -46,7 +47,12 @@
    info->frames++;
 
    if (info->last_time) {
-      if (info->last_time + gr->pane->period <= now) {
+      if (info->frametime) {
+         double frametime = ((double)now - (double)info->last_time) / 1000.0;
+         hud_graph_add_value(gr, frametime);
+         info->last_time = now;
+      }
+      else if (info->last_time + gr->pane->period <= now) {
          double fps = ((uint64_t)info->frames) * 1000000 /
                       (double)(now - info->last_time);
          info->frames = 0;
@@ -80,6 +86,8 @@
       FREE(gr);
       return;
    }
+   struct fps_info *info = gr->query_data;
+   info->frametime = false;
 
    gr->query_new_value = query_fps;
 
@@ -90,3 +98,27 @@
 
    hud_pane_add_graph(pane, gr);
 }
+
+void
+hud_frametime_graph_install(struct hud_pane *pane)
+{
+   struct hud_graph *gr = CALLOC_STRUCT(hud_graph);
+   /* Adds a "frametime (ms)" graph to the pane; returns silently if allocation fails. */
+   if (!gr)
+      return;
+
+   strcpy(gr->name, "frametime (ms)");
+   gr->query_data = CALLOC_STRUCT(fps_info);
+   if (!gr->query_data) {
+      FREE(gr);   /* release the graph when its query data cannot be allocated */
+      return;
+   }
+   struct fps_info *info = gr->query_data;
+   info->frametime = true;   /* query_fps then adds the time since the previous sample */
+   /* Same callback the fps graph installs; the frametime flag above selects its mode. */
+   gr->query_new_value = query_fps;
+   /* Hook used to release query_data (free_query_data is defined elsewhere in hud_fps.c). */
+   gr->free_query_data = free_query_data;
+
+   hud_pane_add_graph(pane, gr);
+}
diff --git a/src/gallium/auxiliary/hud/hud_private.h b/src/gallium/auxiliary/hud/hud_private.h
index b64e29e..deed329 100644
--- a/src/gallium/auxiliary/hud/hud_private.h
+++ b/src/gallium/auxiliary/hud/hud_private.h
@@ -157,6 +157,7 @@
 int hud_get_num_cpus(void);
 
 void hud_fps_graph_install(struct hud_pane *pane);
+void hud_frametime_graph_install(struct hud_pane *pane);
 void hud_cpu_graph_install(struct hud_pane *pane, unsigned cpu_index);
 void hud_thread_busy_install(struct hud_pane *pane, const char *name, bool main);
 void hud_thread_counter_install(struct hud_pane *pane, const char *name,
diff --git a/src/gallium/auxiliary/indices/u_indices_gen.py b/src/gallium/auxiliary/indices/u_indices_gen.py
index 376348d..2d92978 100644
--- a/src/gallium/auxiliary/indices/u_indices_gen.py
+++ b/src/gallium/auxiliary/indices/u_indices_gen.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 copyright = '''
 /*
  * Copyright 2009 VMware, Inc.
@@ -69,9 +71,9 @@
 pr_idx = dict(prdisable='PR_DISABLE', prenable='PR_ENABLE')
 
 def prolog():
-    print '''/* File automatically generated by u_indices_gen.py */'''
-    print copyright
-    print r'''
+    print('''/* File automatically generated by u_indices_gen.py */''')
+    print(copyright)
+    print(r'''
 
 /**
  * @file
@@ -107,7 +109,7 @@
 static u_generate_func  generate[OUT_COUNT][PV_COUNT][PV_COUNT][PRIM_COUNT];
 
 
-'''
+''')
 
 def vert( intype, outtype, v0 ):
     if intype == GENERATE:
@@ -116,30 +118,30 @@
         return '(' + outtype + ')in[' + v0 + ']'
 
 def point( intype, outtype, ptr, v0 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
 
 def line( intype, outtype, ptr, v0, v1 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
-    print '      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
+    print('      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';')
 
 def tri( intype, outtype, ptr, v0, v1, v2 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
-    print '      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';'
-    print '      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
+    print('      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';')
+    print('      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';')
 
 def lineadj( intype, outtype, ptr, v0, v1, v2, v3 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
-    print '      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';'
-    print '      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';'
-    print '      (' + ptr + ')[3] = ' + vert( intype, outtype, v3 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
+    print('      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';')
+    print('      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';')
+    print('      (' + ptr + ')[3] = ' + vert( intype, outtype, v3 ) + ';')
 
 def triadj( intype, outtype, ptr, v0, v1, v2, v3, v4, v5 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
-    print '      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';'
-    print '      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';'
-    print '      (' + ptr + ')[3] = ' + vert( intype, outtype, v3 ) + ';'
-    print '      (' + ptr + ')[4] = ' + vert( intype, outtype, v4 ) + ';'
-    print '      (' + ptr + ')[5] = ' + vert( intype, outtype, v5 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
+    print('      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';')
+    print('      (' + ptr + ')[2] = ' + vert( intype, outtype, v2 ) + ';')
+    print('      (' + ptr + ')[3] = ' + vert( intype, outtype, v3 ) + ';')
+    print('      (' + ptr + ')[4] = ' + vert( intype, outtype, v4 ) + ';')
+    print('      (' + ptr + ')[5] = ' + vert( intype, outtype, v5 ) + ';')
 
 def do_point( intype, outtype, ptr, v0 ):
     point( intype, outtype, ptr, v0 )
@@ -186,231 +188,231 @@
         return 'translate_' + prim + '_' + intype + '2' + outtype + '_' + inpv + '2' + outpv + '_' + pr
 
 def preamble(intype, outtype, inpv, outpv, pr, prim):
-    print 'static void ' + name( intype, outtype, inpv, outpv, pr, prim ) + '('
+    print('static void ' + name( intype, outtype, inpv, outpv, pr, prim ) + '(')
     if intype != GENERATE:
-        print '    const void * _in,'
-    print '    unsigned start,'
+        print('    const void * _in,')
+    print('    unsigned start,')
     if intype != GENERATE:
-        print '    unsigned in_nr,'
-    print '    unsigned out_nr,'
+        print('    unsigned in_nr,')
+    print('    unsigned out_nr,')
     if intype != GENERATE:
-        print '    unsigned restart_index,'
-    print '    void *_out )'
-    print '{'
+        print('    unsigned restart_index,')
+    print('    void *_out )')
+    print('{')
     if intype != GENERATE:
-        print '  const ' + intype + '*in = (const ' + intype + '*)_in;'
-    print '  ' + outtype + ' *out = (' + outtype + '*)_out;'
-    print '  unsigned i, j;'
-    print '  (void)j;'
+        print('  const ' + intype + '*in = (const ' + intype + '*)_in;')
+    print('  ' + outtype + ' *out = (' + outtype + '*)_out;')
+    print('  unsigned i, j;')
+    print('  (void)j;')
 
 def postamble():
-    print '}'
+    print('}')
 
 
 def points(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='points')
-    print '  for (i = start; i < (out_nr+start); i++) { '
+    print('  for (i = start; i < (out_nr+start); i++) { ')
     do_point( intype, outtype, 'out+i',  'i' );
-    print '   }'
+    print('   }')
     postamble()
 
 def lines(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='lines')
-    print '  for (i = start; i < (out_nr+start); i+=2) { '
+    print('  for (i = start; i < (out_nr+start); i+=2) { ')
     do_line( intype, outtype, 'out+i',  'i', 'i+1', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 def linestrip(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='linestrip')
-    print '  for (i = start, j = 0; j < out_nr; j+=2, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=2, i++) { ')
     do_line( intype, outtype, 'out+j',  'i', 'i+1', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 def lineloop(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='lineloop')
-    print '  for (i = start, j = 0; j < out_nr - 2; j+=2, i++) { '
+    print('  for (i = start, j = 0; j < out_nr - 2; j+=2, i++) { ')
     do_line( intype, outtype, 'out+j',  'i', 'i+1', inpv, outpv );
-    print '   }'
+    print('   }')
     do_line( intype, outtype, 'out+j',  'i', 'start', inpv, outpv );
     postamble()
 
 def tris(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='tris')
-    print '  for (i = start; i < (out_nr+start); i+=3) { '
+    print('  for (i = start; i < (out_nr+start); i+=3) { ')
     do_tri( intype, outtype, 'out+i',  'i', 'i+1', 'i+2', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def tristrip(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='tristrip')
-    print '  for (i = start, j = 0; j < out_nr; j+=3, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=3, i++) { ')
     if inpv == FIRST:
         do_tri( intype, outtype, 'out+j',  'i', 'i+1+(i&1)', 'i+2-(i&1)', inpv, outpv );
     else:
         do_tri( intype, outtype, 'out+j',  'i+(i&1)', 'i+1-(i&1)', 'i+2', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def trifan(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='trifan')
-    print '  for (i = start, j = 0; j < out_nr; j+=3, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=3, i++) { ')
     do_tri( intype, outtype, 'out+j',  'start', 'i+1', 'i+2', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 
 def polygon(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='polygon')
-    print '  for (i = start, j = 0; j < out_nr; j+=3, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=3, i++) { ')
     if pr == PRENABLE:
-        print 'restart:'
-        print '      if (i + 3 > in_nr) {'
-        print '         (out+j+0)[0] = restart_index;'
-        print '         (out+j+0)[1] = restart_index;'
-        print '         (out+j+0)[2] = restart_index;'
-        print '         continue;'
-        print '      }'
-        print '      if (in[i + 0] == restart_index) {'
-        print '         i += 1;'
-        print '         start = i;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 1] == restart_index) {'
-        print '         i += 2;'
-        print '         start = i;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 2] == restart_index) {'
-        print '         i += 3;'
-        print '         start = i;'
-        print '         goto restart;'
-        print '      }'
+        print('restart:')
+        print('      if (i + 3 > in_nr) {')
+        print('         (out+j+0)[0] = restart_index;')
+        print('         (out+j+0)[1] = restart_index;')
+        print('         (out+j+0)[2] = restart_index;')
+        print('         continue;')
+        print('      }')
+        print('      if (in[i + 0] == restart_index) {')
+        print('         i += 1;')
+        print('         start = i;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 1] == restart_index) {')
+        print('         i += 2;')
+        print('         start = i;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 2] == restart_index) {')
+        print('         i += 3;')
+        print('         start = i;')
+        print('         goto restart;')
+        print('      }')
 
     if inpv == FIRST:
         do_tri( intype, outtype, 'out+j',  'start', 'i+1', 'i+2', inpv, outpv );
     else:
         do_tri( intype, outtype, 'out+j',  'i+1', 'i+2', 'start', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def quads(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='quads')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i+=4) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i+=4) { ')
     if pr == PRENABLE:
-        print 'restart:'
-        print '      if (i + 4 > in_nr) {'
-        print '         (out+j+0)[0] = restart_index;'
-        print '         (out+j+0)[1] = restart_index;'
-        print '         (out+j+0)[2] = restart_index;'
-        print '         (out+j+3)[0] = restart_index;'
-        print '         (out+j+3)[1] = restart_index;'
-        print '         (out+j+3)[2] = restart_index;'
-        print '         continue;'
-        print '      }'
-        print '      if (in[i + 0] == restart_index) {'
-        print '         i += 1;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 1] == restart_index) {'
-        print '         i += 2;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 2] == restart_index) {'
-        print '         i += 3;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 3] == restart_index) {'
-        print '         i += 4;'
-        print '         goto restart;'
-        print '      }'
+        print('restart:')
+        print('      if (i + 4 > in_nr) {')
+        print('         (out+j+0)[0] = restart_index;')
+        print('         (out+j+0)[1] = restart_index;')
+        print('         (out+j+0)[2] = restart_index;')
+        print('         (out+j+3)[0] = restart_index;')
+        print('         (out+j+3)[1] = restart_index;')
+        print('         (out+j+3)[2] = restart_index;')
+        print('         continue;')
+        print('      }')
+        print('      if (in[i + 0] == restart_index) {')
+        print('         i += 1;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 1] == restart_index) {')
+        print('         i += 2;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 2] == restart_index) {')
+        print('         i += 3;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 3] == restart_index) {')
+        print('         i += 4;')
+        print('         goto restart;')
+        print('      }')
 
     do_quad( intype, outtype, 'out+j', 'i+0', 'i+1', 'i+2', 'i+3', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def quadstrip(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='quadstrip')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i+=2) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i+=2) { ')
     if pr == PRENABLE:
-        print 'restart:'
-        print '      if (i + 4 > in_nr) {'
-        print '         (out+j+0)[0] = restart_index;'
-        print '         (out+j+0)[1] = restart_index;'
-        print '         (out+j+0)[2] = restart_index;'
-        print '         (out+j+3)[0] = restart_index;'
-        print '         (out+j+3)[1] = restart_index;'
-        print '         (out+j+3)[2] = restart_index;'
-        print '         continue;'
-        print '      }'
-        print '      if (in[i + 0] == restart_index) {'
-        print '         i += 1;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 1] == restart_index) {'
-        print '         i += 2;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 2] == restart_index) {'
-        print '         i += 3;'
-        print '         goto restart;'
-        print '      }'
-        print '      if (in[i + 3] == restart_index) {'
-        print '         i += 4;'
-        print '         goto restart;'
-        print '      }'
+        print('restart:')
+        print('      if (i + 4 > in_nr) {')
+        print('         (out+j+0)[0] = restart_index;')
+        print('         (out+j+0)[1] = restart_index;')
+        print('         (out+j+0)[2] = restart_index;')
+        print('         (out+j+3)[0] = restart_index;')
+        print('         (out+j+3)[1] = restart_index;')
+        print('         (out+j+3)[2] = restart_index;')
+        print('         continue;')
+        print('      }')
+        print('      if (in[i + 0] == restart_index) {')
+        print('         i += 1;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 1] == restart_index) {')
+        print('         i += 2;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 2] == restart_index) {')
+        print('         i += 3;')
+        print('         goto restart;')
+        print('      }')
+        print('      if (in[i + 3] == restart_index) {')
+        print('         i += 4;')
+        print('         goto restart;')
+        print('      }')
     if inpv == LAST:
         do_quad( intype, outtype, 'out+j', 'i+2', 'i+0', 'i+1', 'i+3', inpv, outpv );
     else:
         do_quad( intype, outtype, 'out+j', 'i+0', 'i+1', 'i+3', 'i+2', inpv, outpv );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def linesadj(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='linesadj')
-    print '  for (i = start; i < (out_nr+start); i+=4) { '
+    print('  for (i = start; i < (out_nr+start); i+=4) { ')
     do_lineadj( intype, outtype, 'out+i',  'i+0', 'i+1', 'i+2', 'i+3', inpv, outpv )
-    print '  }'
+    print('  }')
     postamble()
 
 
 def linestripadj(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='linestripadj')
-    print '  for (i = start, j = 0; j < out_nr; j+=4, i++) {'
+    print('  for (i = start, j = 0; j < out_nr; j+=4, i++) {')
     do_lineadj( intype, outtype, 'out+j',  'i+0', 'i+1', 'i+2', 'i+3', inpv, outpv )
-    print '  }'
+    print('  }')
     postamble()
 
 
 def trisadj(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='trisadj')
-    print '  for (i = start; i < (out_nr+start); i+=6) { '
+    print('  for (i = start; i < (out_nr+start); i+=6) { ')
     do_triadj( intype, outtype, 'out+i',  'i+0', 'i+1', 'i+2', 'i+3',
                'i+4', 'i+5', inpv, outpv )
-    print '  }'
+    print('  }')
     postamble()
 
 
 def tristripadj(intype, outtype, inpv, outpv, pr):
     preamble(intype, outtype, inpv, outpv, pr, prim='tristripadj')
-    print '  for (i = start, j = 0; j < out_nr; i+=2, j+=6) { '
-    print '    if (i % 4 == 0) {'
-    print '      /* even triangle */'
+    print('  for (i = start, j = 0; j < out_nr; i+=2, j+=6) { ')
+    print('    if (i % 4 == 0) {')
+    print('      /* even triangle */')
     do_triadj( intype, outtype, 'out+j',
                'i+0', 'i+1', 'i+2', 'i+3', 'i+4', 'i+5', inpv, outpv )
-    print '    } else {'
-    print '      /* odd triangle */'
+    print('    } else {')
+    print('      /* odd triangle */')
     do_triadj( intype, outtype, 'out+j',
                'i+2', 'i-2', 'i+0', 'i+3', 'i+4', 'i+6', inpv, outpv )
-    print '    }'
-    print '  }'
+    print('    }')
+    print('  }')
     postamble()
 
 
@@ -466,19 +468,19 @@
                             init(intype, outtype, inpv, outpv, pr, prim)
 
 def emit_init():
-    print 'void u_index_init( void )'
-    print '{'
-    print '  static int firsttime = 1;'
-    print '  if (!firsttime) return;'
-    print '  firsttime = 0;'
+    print('void u_index_init( void )')
+    print('{')
+    print('  static int firsttime = 1;')
+    print('  if (!firsttime) return;')
+    print('  firsttime = 0;')
     emit_all_inits()
-    print '}'
+    print('}')
 
 
     
 
 def epilog():
-    print '#include "indices/u_indices.c"'
+    print('#include "indices/u_indices.c"')
 
 
 def main():
diff --git a/src/gallium/auxiliary/indices/u_unfilled_gen.py b/src/gallium/auxiliary/indices/u_unfilled_gen.py
index 4780d98..4c7d7c6 100644
--- a/src/gallium/auxiliary/indices/u_unfilled_gen.py
+++ b/src/gallium/auxiliary/indices/u_unfilled_gen.py
@@ -1,3 +1,5 @@
+from __future__ import print_function
+
 copyright = '''
 /*
  * Copyright 2009 VMware, Inc.
@@ -53,9 +55,9 @@
 
 
 def prolog():
-    print '''/* File automatically generated by u_unfilled_gen.py */'''
-    print copyright
-    print r'''
+    print('''/* File automatically generated by u_unfilled_gen.py */''')
+    print(copyright)
+    print(r'''
 
 /**
  * @file
@@ -93,7 +95,7 @@
 static u_generate_func generate_line[OUT_COUNT][PRIM_COUNT];
 static u_translate_func translate_line[IN_COUNT][OUT_COUNT][PRIM_COUNT];
 
-'''
+''')
 
 def vert( intype, outtype, v0 ):
     if intype == GENERATE:
@@ -102,8 +104,8 @@
         return '(' + outtype + ')in[' + v0 + ']'
 
 def line( intype, outtype, ptr, v0, v1 ):
-    print '      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';'
-    print '      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';'
+    print('      (' + ptr + ')[0] = ' + vert( intype, outtype, v0 ) + ';')
+    print('      (' + ptr + ')[1] = ' + vert( intype, outtype, v1 ) + ';')
 
 # XXX: have the opportunity here to avoid over-drawing shared lines in
 # tristrips, fans, etc, by integrating this into the calling functions
@@ -127,89 +129,89 @@
         return 'translate_' + prim + '_' + intype + '2' + outtype
 
 def preamble(intype, outtype, prim):
-    print 'static void ' + name( intype, outtype, prim ) + '('
+    print('static void ' + name( intype, outtype, prim ) + '(')
     if intype != GENERATE:
-        print '    const void * _in,'
-    print '    unsigned start,'
+        print('    const void * _in,')
+    print('    unsigned start,')
     if intype != GENERATE:
-        print '    unsigned in_nr,'
-    print '    unsigned out_nr,'
+        print('    unsigned in_nr,')
+    print('    unsigned out_nr,')
     if intype != GENERATE:
-        print '    unsigned restart_index,'
-    print '    void *_out )'
-    print '{'
+        print('    unsigned restart_index,')
+    print('    void *_out )')
+    print('{')
     if intype != GENERATE:
-        print '  const ' + intype + '*in = (const ' + intype + '*)_in;'
-    print '  ' + outtype + ' *out = (' + outtype + '*)_out;'
-    print '  unsigned i, j;'
-    print '  (void)j;'
+        print('  const ' + intype + '*in = (const ' + intype + '*)_in;')
+    print('  ' + outtype + ' *out = (' + outtype + '*)_out;')
+    print('  unsigned i, j;')
+    print('  (void)j;')
 
 def postamble():
-    print '}'
+    print('}')
 
 
 def tris(intype, outtype):
     preamble(intype, outtype, prim='tris')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i+=3) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i+=3) { ')
     do_tri( intype, outtype, 'out+j',  'i', 'i+1', 'i+2' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def tristrip(intype, outtype):
     preamble(intype, outtype, prim='tristrip')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i++) { ')
     do_tri( intype, outtype, 'out+j',  'i', 'i+1/*+(i&1)*/', 'i+2/*-(i&1)*/' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def trifan(intype, outtype):
     preamble(intype, outtype, prim='trifan')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i++) { ')
     do_tri( intype, outtype, 'out+j',  '0', 'i+1', 'i+2' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 
 def polygon(intype, outtype):
     preamble(intype, outtype, prim='polygon')
-    print '  for (i = start, j = 0; j < out_nr; j+=2, i++) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=2, i++) { ')
     line( intype, outtype, 'out+j', 'i', '(i+1)%(out_nr/2)' )
-    print '   }'
+    print('   }')
     postamble()
 
 
 def quads(intype, outtype):
     preamble(intype, outtype, prim='quads')
-    print '  for (i = start, j = 0; j < out_nr; j+=8, i+=4) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=8, i+=4) { ')
     do_quad( intype, outtype, 'out+j', 'i+0', 'i+1', 'i+2', 'i+3' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def quadstrip(intype, outtype):
     preamble(intype, outtype, prim='quadstrip')
-    print '  for (i = start, j = 0; j < out_nr; j+=8, i+=2) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=8, i+=2) { ')
     do_quad( intype, outtype, 'out+j', 'i+2', 'i+0', 'i+1', 'i+3' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def trisadj(intype, outtype):
     preamble(intype, outtype, prim='trisadj')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i+=6) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i+=6) { ')
     do_tri( intype, outtype, 'out+j',  'i', 'i+2', 'i+4' );
-    print '   }'
+    print('   }')
     postamble()
 
 
 def tristripadj(intype, outtype):
     preamble(intype, outtype, prim='tristripadj')
-    print '  for (i = start, j = 0; j < out_nr; j+=6, i+=2) { '
+    print('  for (i = start, j = 0; j < out_nr; j+=6, i+=2) { ')
     do_tri( intype, outtype, 'out+j',  'i', 'i+2', 'i+4' );
-    print '   }'
+    print('   }')
     postamble()
 
 
@@ -227,16 +229,16 @@
 
 def init(intype, outtype, prim):
     if intype == GENERATE:
-        print ('generate_line[' + 
+        print('generate_line[' +
                outtype_idx[outtype] + 
                '][' + longprim[prim] + 
                '] = ' + name( intype, outtype, prim ) + ';')
     else:
-        print ('translate_line[' + 
+        print('translate_line[' +
                intype_idx[intype] + 
                '][' + outtype_idx[outtype] + 
                '][' + longprim[prim] + 
                '] = ' + name( intype, outtype, prim ) + ';')
 
 
 def emit_all_inits():
@@ -246,19 +248,19 @@
                 init(intype, outtype, prim)
 
 def emit_init():
-    print 'void u_unfilled_init( void )'
-    print '{'
-    print '  static int firsttime = 1;'
-    print '  if (!firsttime) return;'
-    print '  firsttime = 0;'
+    print('void u_unfilled_init( void )')
+    print('{')
+    print('  static int firsttime = 1;')
+    print('  if (!firsttime) return;')
+    print('  firsttime = 0;')
     emit_all_inits()
-    print '}'
+    print('}')
 
 
     
 
 def epilog():
-    print '#include "indices/u_unfilled_indices.c"'
+    print('#include "indices/u_unfilled_indices.c"')
 
 
 def main():
diff --git a/src/gallium/auxiliary/meson.build b/src/gallium/auxiliary/meson.build
index 1f75481..98542d7 100644
--- a/src/gallium/auxiliary/meson.build
+++ b/src/gallium/auxiliary/meson.build
@@ -276,6 +276,8 @@
   'util/u_fifo.h',
   'util/u_format.c',
   'util/u_format.h',
+  'util/u_format_bptc.c',
+  'util/u_format_bptc.h',
   'util/u_format_etc.c',
   'util/u_format_etc.h',
   'util/u_format_latc.c',
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index b5b4869..1b31b56 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -487,27 +487,15 @@
 /* generate either a constant or indirect deref chain for accessing an
  * array variable.
  */
-static nir_deref_var *
-ttn_array_deref(struct ttn_compile *c, nir_intrinsic_instr *instr,
-                nir_variable *var, unsigned offset,
+static nir_deref_instr *
+ttn_array_deref(struct ttn_compile *c, nir_variable *var, unsigned offset,
                 struct tgsi_ind_register *indirect)
 {
-   nir_deref_var *deref = nir_deref_var_create(instr, var);
-   nir_deref_array *arr = nir_deref_array_create(deref);
-
-   arr->base_offset = offset;
-   arr->deref.type = glsl_get_array_element(var->type);
-
-   if (indirect) {
-      arr->deref_array_type = nir_deref_array_type_indirect;
-      arr->indirect = nir_src_for_ssa(ttn_src_for_indirect(c, indirect));
-   } else {
-      arr->deref_array_type = nir_deref_array_type_direct;
-   }
-
-   deref->deref.child = &arr->deref;
-
-   return deref;
+   nir_deref_instr *deref = nir_build_deref_var(&c->build, var);
+   nir_ssa_def *index = nir_imm_int(&c->build, offset);
+   if (indirect)
+      index = nir_iadd(&c->build, index, ttn_src_for_indirect(c, indirect));
+   return nir_build_deref_array(&c->build, deref, index);
 }
 
 static nir_src
@@ -526,18 +514,10 @@
       if (c->temp_regs[index].var) {
          unsigned offset = c->temp_regs[index].offset;
          nir_variable *var = c->temp_regs[index].var;
-         nir_intrinsic_instr *load;
+         nir_ssa_def *load = nir_load_deref(&c->build,
+               ttn_array_deref(c, var, offset, indirect));
 
-         load = nir_intrinsic_instr_create(b->shader,
-                                           nir_intrinsic_load_var);
-         load->num_components = 4;
-         load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
-         nir_ssa_dest_init(&load->instr, &load->dest,
-                           4, 32, NULL);
-         nir_builder_instr_insert(b, &load->instr);
-
-         src = nir_src_for_ssa(&load->dest.ssa);
-
+         src = nir_src_for_ssa(load);
       } else {
          assert(!indirect);
          src.reg.reg = c->temp_regs[index].reg;
@@ -1829,17 +1809,11 @@
    if (var) {
       unsigned index = tgsi_dst->Register.Index;
       unsigned offset = c->temp_regs[index].offset;
-      nir_intrinsic_instr *store =
-         nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
       struct tgsi_ind_register *indirect = tgsi_dst->Register.Indirect ?
                                            &tgsi_dst->Indirect : NULL;
-
-      store->num_components = 4;
-      nir_intrinsic_set_write_mask(store, dest.write_mask);
-      store->variables[0] = ttn_array_deref(c, store, var, offset, indirect);
-      store->src[0] = nir_src_for_reg(dest.dest.reg.reg);
-
-      nir_builder_instr_insert(b, &store->instr);
+      nir_deref_instr *deref = ttn_array_deref(c, var, offset, indirect);
+      nir_src val = nir_src_for_reg(dest.dest.reg.reg);
+      nir_store_deref(b, deref, nir_ssa_for_src(b, val, 4), dest.write_mask);
    }
 }
 
diff --git a/src/gallium/auxiliary/os/os_process.c b/src/gallium/auxiliary/os/os_process.c
index 035bd22..766cf80 100644
--- a/src/gallium/auxiliary/os/os_process.c
+++ b/src/gallium/auxiliary/os/os_process.c
@@ -29,18 +29,13 @@
 #include "pipe/p_config.h"
 #include "os/os_process.h"
 #include "util/u_memory.h"
+#include "util/u_process.h"
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  include <windows.h>
-#elif defined(__GLIBC__) || defined(__CYGWIN__)
-#  include <errno.h>
-#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_ANDROID)
-#  include <stdlib.h>
 #elif defined(PIPE_OS_HAIKU)
 #  include <kernel/OS.h>
 #  include <kernel/image.h>
-#else
-#warning unexpected platform in os_process.c
 #endif
 
 #if defined(PIPE_OS_LINUX)
@@ -84,20 +79,13 @@
 
       name = lpProcessName;
 
-#elif defined(__GLIBC__) || defined(__CYGWIN__)
-      name = program_invocation_short_name;
-#elif defined(PIPE_OS_BSD) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_ANDROID)
-      /* *BSD and OS X */
-      name = getprogname();
 #elif defined(PIPE_OS_HAIKU)
       image_info info;
       get_image_info(B_CURRENT_TEAM, &info);
       name = info.name;
 #else
-#warning unexpected platform in os_process.c
-      return FALSE;
+      name = util_get_process_name();
 #endif
-
    }
 
    assert(size > 0);
diff --git a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
index 21dc599..284e073 100644
--- a/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
+++ b/src/gallium/auxiliary/pipe-loader/driinfo_gallium.h
@@ -23,11 +23,14 @@
    DRI_CONF_DISABLE_SHADER_BIT_ENCODING("false")
    DRI_CONF_FORCE_GLSL_VERSION(0)
    DRI_CONF_ALLOW_GLSL_EXTENSION_DIRECTIVE_MIDSHADER("false")
+   DRI_CONF_ALLOW_GLSL_BUILTIN_CONST_EXPRESSION("false")
+   DRI_CONF_ALLOW_GLSL_RELAXED_ES("false")
    DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION("false")
    DRI_CONF_ALLOW_GLSL_CROSS_STAGE_INTERPOLATION_MISMATCH("false")
    DRI_CONF_ALLOW_HIGHER_COMPAT_VERSION("false")
    DRI_CONF_FORCE_GLSL_ABS_SQRT("false")
    DRI_CONF_GLSL_CORRECT_DERIVATIVES_AFTER_DISCARD("false")
+   DRI_CONF_FORCE_COMPAT_PROFILE("false")
 DRI_CONF_SECTION_END
 
 DRI_CONF_SECTION_MISCELLANEOUS
diff --git a/src/gallium/auxiliary/pipe-loader/meson.build b/src/gallium/auxiliary/pipe-loader/meson.build
index 32e8188..c0b9a53 100644
--- a/src/gallium/auxiliary/pipe-loader/meson.build
+++ b/src/gallium/auxiliary/pipe-loader/meson.build
@@ -31,6 +31,9 @@
 if dep_libdrm.found()
   files_pipe_loader += files('pipe_loader_drm.c')
 endif
+if with_dri
+  libpipe_loader_defines += '-DHAVE_PIPE_LOADER_DRI'
+endif
 if with_gallium_drisw_kms
   libpipe_loader_defines += '-DHAVE_PIPE_LOADER_KMS'
 endif
@@ -42,10 +45,7 @@
     inc_util, inc_loader, inc_gallium, inc_include, inc_src, inc_gallium_aux,
     inc_gallium_winsys,
   ],
-  c_args : [
-    c_vis_args, '-DHAVE_PIPE_LOADER_DRI', '-DGALLIUM_STATIC_TARGETS=1',
-    libpipe_loader_defines,
-  ],
+  c_args : [c_vis_args, libpipe_loader_defines, '-DGALLIUM_STATIC_TARGETS=1'],
   link_with : [libloader, libxmlconfig],
   dependencies : [dep_libdrm],
   build_by_default : false,
@@ -59,7 +59,7 @@
     inc_gallium_winsys,
   ],
   c_args : [
-    c_vis_args, libpipe_loader_defines, '-DHAVE_PIPE_LOADER_DRI',
+    c_vis_args, libpipe_loader_defines,
     '-DPIPE_SEARCH_DIR="@0@"'.format(
       join_paths(get_option('prefix'), get_option('libdir'), 'gallium-pipe')
     )
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
index 3b959e53..6d2ed6e 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_drm.c
@@ -116,13 +116,13 @@
         .configuration = pipe_default_configuration_query,
     },
     {
-        .driver_name = "vc4",
-        .create_screen = pipe_vc4_create_screen,
+        .driver_name = "v3d",
+        .create_screen = pipe_v3d_create_screen,
         .configuration = pipe_default_configuration_query,
     },
     {
-        .driver_name = "vc5",
-        .create_screen = pipe_vc5_create_screen,
+        .driver_name = "vc4",
+        .create_screen = pipe_vc4_create_screen,
         .configuration = pipe_default_configuration_query,
     },
     {
diff --git a/src/gallium/auxiliary/postprocess/pp_init.c b/src/gallium/auxiliary/postprocess/pp_init.c
index b9eff78..2c830e8 100644
--- a/src/gallium/auxiliary/postprocess/pp_init.c
+++ b/src/gallium/auxiliary/postprocess/pp_init.c
@@ -279,7 +279,7 @@
    tmp_res.bind = PIPE_BIND_RENDER_TARGET;
 
    if (!p->screen->is_format_supported(p->screen, tmp_res.format,
-                                       tmp_res.target, 1, tmp_res.bind))
+                                       tmp_res.target, 1, 1, tmp_res.bind))
       pp_debug("Temp buffers' format fail\n");
 
    for (i = 0; i < ppq->n_tmp; i++) {
@@ -305,12 +305,12 @@
    tmp_res.format = p->surf.format = PIPE_FORMAT_S8_UINT_Z24_UNORM;
 
    if (!p->screen->is_format_supported(p->screen, tmp_res.format,
-                                       tmp_res.target, 1, tmp_res.bind)) {
+                                       tmp_res.target, 1, 1, tmp_res.bind)) {
 
       tmp_res.format = p->surf.format = PIPE_FORMAT_Z24_UNORM_S8_UINT;
 
       if (!p->screen->is_format_supported(p->screen, tmp_res.format,
-                                          tmp_res.target, 1, tmp_res.bind))
+                                          tmp_res.target, 1, 1, tmp_res.bind))
          pp_debug("Temp Sbuffer format fail\n");
    }
 
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.c b/src/gallium/auxiliary/postprocess/pp_mlaa.c
index 610cedb..fd5a55d 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.c
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.c
@@ -240,10 +240,10 @@
    res.width0 = res.height0 = 165;
    res.bind = PIPE_BIND_SAMPLER_VIEW;
    res.usage = PIPE_USAGE_DEFAULT;
-   res.depth0 = res.array_size = res.nr_samples = 1;
+   res.depth0 = res.array_size = res.nr_samples = res.nr_storage_samples = 1;
 
    if (!ppq->p->screen->is_format_supported(ppq->p->screen, res.format,
-                                            res.target, 1, res.bind))
+                                            res.target, 1, 1, res.bind))
       pp_debug("Areamap format not supported\n");
 
    ppq->areamaptex = ppq->p->screen->resource_create(ppq->p->screen, &res);
diff --git a/src/gallium/auxiliary/postprocess/pp_program.c b/src/gallium/auxiliary/postprocess/pp_program.c
index 811f1fb..cb06c8d 100644
--- a/src/gallium/auxiliary/postprocess/pp_program.c
+++ b/src/gallium/auxiliary/postprocess/pp_program.c
@@ -119,7 +119,7 @@
 
    if (!p->screen->is_format_supported(p->screen,
                                        PIPE_FORMAT_R32G32B32A32_FLOAT,
-                                       PIPE_BUFFER, 1,
+                                       PIPE_BUFFER, 1, 1,
                                        PIPE_BIND_VERTEX_BUFFER))
       pp_debug("Vertex buf format fail\n");
 
diff --git a/src/gallium/auxiliary/renderonly/renderonly.c b/src/gallium/auxiliary/renderonly/renderonly.c
index d31f458..f83910a 100644
--- a/src/gallium/auxiliary/renderonly/renderonly.c
+++ b/src/gallium/auxiliary/renderonly/renderonly.c
@@ -98,7 +98,7 @@
 
    /* fill in winsys handle */
    memset(out_handle, 0, sizeof(*out_handle));
-   out_handle->type = DRM_API_HANDLE_TYPE_FD;
+   out_handle->type = WINSYS_HANDLE_TYPE_FD;
    out_handle->stride = create_dumb.pitch;
 
    err = drmPrimeHandleToFD(ro->kms_fd, create_dumb.handle, O_CLOEXEC,
@@ -130,7 +130,7 @@
    boolean status;
    int fd, err;
    struct winsys_handle handle = {
-      .type = DRM_API_HANDLE_TYPE_FD
+      .type = WINSYS_HANDLE_TYPE_FD
    };
 
    scanout = CALLOC_STRUCT(renderonly_scanout);
diff --git a/src/gallium/auxiliary/renderonly/renderonly.h b/src/gallium/auxiliary/renderonly/renderonly.h
index 6a89c29..a8d6a68 100644
--- a/src/gallium/auxiliary/renderonly/renderonly.h
+++ b/src/gallium/auxiliary/renderonly/renderonly.h
@@ -85,7 +85,7 @@
    if (!scanout)
       return FALSE;
 
-   assert(handle->type == DRM_API_HANDLE_TYPE_KMS);
+   assert(handle->type == WINSYS_HANDLE_TYPE_KMS);
    handle->handle = scanout->handle;
    handle->stride = scanout->stride;
 
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper.h b/src/gallium/auxiliary/target-helpers/drm_helper.h
index 7dc2497..7eefa6e 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper.h
@@ -310,24 +310,24 @@
 
 #endif
 
-#ifdef GALLIUM_VC5
-#include "vc5/drm/vc5_drm_public.h"
+#ifdef GALLIUM_V3D
+#include "v3d/drm/v3d_drm_public.h"
 
 struct pipe_screen *
-pipe_vc5_create_screen(int fd, const struct pipe_screen_config *config)
+pipe_v3d_create_screen(int fd, const struct pipe_screen_config *config)
 {
    struct pipe_screen *screen;
 
-   screen = vc5_drm_screen_create(fd);
+   screen = v3d_drm_screen_create(fd);
    return screen ? debug_screen_wrap(screen) : NULL;
 }
 
 #else
 
 struct pipe_screen *
-pipe_vc5_create_screen(int fd, const struct pipe_screen_config *config)
+pipe_v3d_create_screen(int fd, const struct pipe_screen_config *config)
 {
-   fprintf(stderr, "vc5: driver missing\n");
+   fprintf(stderr, "v3d: driver missing\n");
    return NULL;
 }
 
diff --git a/src/gallium/auxiliary/target-helpers/drm_helper_public.h b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
index c1a7bf4..155c525 100644
--- a/src/gallium/auxiliary/target-helpers/drm_helper_public.h
+++ b/src/gallium/auxiliary/target-helpers/drm_helper_public.h
@@ -37,10 +37,10 @@
 pipe_virgl_create_screen(int fd, const struct pipe_screen_config *config);
 
 struct pipe_screen *
-pipe_vc4_create_screen(int fd, const struct pipe_screen_config *config);
+pipe_v3d_create_screen(int fd, const struct pipe_screen_config *config);
 
 struct pipe_screen *
-pipe_vc5_create_screen(int fd, const struct pipe_screen_config *config);
+pipe_vc4_create_screen(int fd, const struct pipe_screen_config *config);
 
 struct pipe_screen *
 pipe_pl111_create_screen(int fd, const struct pipe_screen_config *config);
diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
index 66d46de..ef22cac 100644
--- a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
+++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h
@@ -16,6 +16,10 @@
 #include "driver_rbug/rbug_public.h"
 #include "driver_noop/noop_public.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /*
  * TODO: Audit the following *screen_create() - all of
  * them should return the original screen on failuire.
@@ -35,3 +39,7 @@
 }
 
 #endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_aa_point.c b/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
index 4b14a2f..cdd4fef 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_aa_point.c
@@ -73,7 +73,7 @@
       ts->num_input++;
    }
    else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
-      ts->num_tmp = MAX2(ts->num_tmp, decl->Range.Last + 1);
+      ts->num_tmp = MAX2(ts->num_tmp, (unsigned)(decl->Range.Last + 1));
    }
 
    ctx->emit_declaration(ctx, decl);
@@ -217,7 +217,7 @@
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
       struct tgsi_full_dst_register *dst = &inst->Dst[i];
       if (dst->Register.File == TGSI_FILE_OUTPUT &&
-          dst->Register.Index == ts->color_out) {
+          dst->Register.Index == (int)ts->color_out) {
          dst->Register.File = TGSI_FILE_TEMPORARY;
          dst->Register.Index = ts->color_tmp;
       }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 36c36d9..3db117a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -607,7 +607,8 @@
    struct tgsi_header *header,
    unsigned maxsize )
 {
-   unsigned size = 0, i;
+   unsigned size = 0;
+   int i;
    struct tgsi_immediate *immediate;
 
    if( maxsize <= size )
@@ -712,7 +713,6 @@
 static struct tgsi_instruction_label
 tgsi_build_instruction_label(
    unsigned label,
-   struct tgsi_token  *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
 {
@@ -745,7 +745,6 @@
    unsigned texture,
    unsigned num_offsets,
    unsigned return_type,
-   struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
 {
@@ -780,7 +779,6 @@
    unsigned qualifier,
    unsigned texture,
    unsigned format,
-   struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
 {
@@ -815,7 +813,6 @@
 static struct tgsi_texture_offset
 tgsi_build_texture_offset(
    int index, int file, int swizzle_x, int swizzle_y, int swizzle_z,
-   struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
 {
@@ -1065,7 +1062,6 @@
    unsigned size = 0;
    unsigned i;
    struct tgsi_instruction *instruction;
-   struct tgsi_token *prev_token;
 
    if( maxsize <= size )
       return 0;
@@ -1078,7 +1074,6 @@
                                          full_inst->Instruction.NumDstRegs,
                                          full_inst->Instruction.NumSrcRegs,
                                          header);
-   prev_token = (struct tgsi_token  *) instruction;
 
    if (full_inst->Instruction.Label) {
       struct tgsi_instruction_label *instruction_label;
@@ -1091,10 +1086,8 @@
 
       *instruction_label = tgsi_build_instruction_label(
          full_inst->Label.Label,
-         prev_token,
          instruction,
          header );
-      prev_token = (struct tgsi_token  *) instruction_label;
    }
 
    if (full_inst->Instruction.Texture) {
@@ -1110,10 +1103,8 @@
          full_inst->Texture.Texture,
          full_inst->Texture.NumOffsets,
          full_inst->Texture.ReturnType,
-         prev_token,
          instruction,
          header   );
-      prev_token = (struct tgsi_token  *) instruction_texture;
 
       for (i = 0; i < full_inst->Texture.NumOffsets; i++) {
          struct tgsi_texture_offset *texture_offset;
@@ -1128,10 +1119,8 @@
             full_inst->TexOffsets[i].SwizzleX,
             full_inst->TexOffsets[i].SwizzleY,
             full_inst->TexOffsets[i].SwizzleZ,
-            prev_token,
             instruction,
             header);
-         prev_token = (struct tgsi_token *) texture_offset;
       }
    }
 
@@ -1148,10 +1137,8 @@
          full_inst->Memory.Qualifier,
          full_inst->Memory.Texture,
          full_inst->Memory.Format,
-         prev_token,
          instruction,
          header );
-      prev_token = (struct tgsi_token  *) instruction_memory;
    }
 
    for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
@@ -1376,7 +1363,8 @@
    struct tgsi_header *header,
    unsigned maxsize )
 {
-   unsigned size = 0, i;
+   unsigned size = 0;
+   int i;
    struct tgsi_property *property;
 
    if( maxsize <= size )
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index f6c8539..705d9f1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -480,7 +480,7 @@
    struct tgsi_iterate_context *iter,
    struct tgsi_full_property *prop )
 {
-   unsigned i;
+   int i;
    struct dump_ctx *ctx = (struct dump_ctx *)iter;
 
    TXT( "PROPERTY " );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 793c0da..59194eb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1477,7 +1477,6 @@
 
 static void
 fetch_src_file_channel(const struct tgsi_exec_machine *mach,
-                       const uint chan_index,
                        const uint file,
                        const uint swizzle,
                        const union tgsi_exec_channel *index,
@@ -1591,8 +1590,7 @@
 fetch_source_d(const struct tgsi_exec_machine *mach,
                union tgsi_exec_channel *chan,
                const struct tgsi_full_src_register *reg,
-               const uint chan_index,
-               enum tgsi_exec_datatype src_datatype)
+               const uint chan_index)
 {
    union tgsi_exec_channel index;
    union tgsi_exec_channel index2D;
@@ -1634,7 +1632,6 @@
       /* get current value of address register[swizzle] */
       swizzle = reg->Indirect.Swizzle;
       fetch_src_file_channel(mach,
-                             chan_index,
                              reg->Indirect.File,
                              swizzle,
                              &index2,
@@ -1694,7 +1691,6 @@
 
          swizzle = reg->DimIndirect.Swizzle;
          fetch_src_file_channel(mach,
-                                chan_index,
                                 reg->DimIndirect.File,
                                 swizzle,
                                 &index2,
@@ -1729,7 +1725,6 @@
 
    swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
    fetch_src_file_channel(mach,
-                          chan_index,
                           reg->Register.File,
                           swizzle,
                           &index,
@@ -1744,7 +1739,7 @@
              const uint chan_index,
              enum tgsi_exec_datatype src_datatype)
 {
-   fetch_source_d(mach, chan, reg, chan_index, src_datatype);
+   fetch_source_d(mach, chan, reg, chan_index);
 
    if (reg->Register.Absolute) {
       if (src_datatype == TGSI_EXEC_DATA_FLOAT) {
@@ -1767,7 +1762,6 @@
 store_dest_dstret(struct tgsi_exec_machine *mach,
                  const union tgsi_exec_channel *chan,
                  const struct tgsi_full_dst_register *reg,
-                 const struct tgsi_full_instruction *inst,
                  uint chan_index,
                  enum tgsi_exec_datatype dst_datatype)
 {
@@ -1808,7 +1802,6 @@
 
       /* fetch values from the address/indirection register */
       fetch_src_file_channel(mach,
-                             chan_index,
                              reg->Indirect.File,
                              swizzle,
                              &index,
@@ -1858,7 +1851,6 @@
 
          swizzle = reg->DimIndirect.Swizzle;
          fetch_src_file_channel(mach,
-                                chan_index,
                                 reg->DimIndirect.File,
                                 swizzle,
                                 &index2,
@@ -1937,7 +1929,6 @@
 store_dest_double(struct tgsi_exec_machine *mach,
                  const union tgsi_exec_channel *chan,
                  const struct tgsi_full_dst_register *reg,
-                 const struct tgsi_full_instruction *inst,
                  uint chan_index,
                  enum tgsi_exec_datatype dst_datatype)
 {
@@ -1945,8 +1936,7 @@
    const uint execmask = mach->ExecMask;
    int i;
 
-   dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
-			   dst_datatype);
+   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
    if (!dst)
       return;
 
@@ -1968,8 +1958,7 @@
    const uint execmask = mach->ExecMask;
    int i;
 
-   dst = store_dest_dstret(mach, chan, reg, inst, chan_index,
-                    dst_datatype);
+   dst = store_dest_dstret(mach, chan, reg, chan_index, dst_datatype);
    if (!dst)
       return;
 
@@ -2045,8 +2034,7 @@
  * Unconditional fragment kill/discard.
  */
 static void
-exec_kill(struct tgsi_exec_machine *mach,
-          const struct tgsi_full_instruction *inst)
+exec_kill(struct tgsi_exec_machine *mach)
 {
    uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
 
@@ -2156,11 +2144,11 @@
       union tgsi_exec_channel index;
       union tgsi_exec_channel offset[3];
       index.i[0] = index.i[1] = index.i[2] = index.i[3] = inst->TexOffsets[0].Index;
-      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
+      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
                              inst->TexOffsets[0].SwizzleX, &index, &ZeroVec, &offset[0]);
-      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
+      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
                              inst->TexOffsets[0].SwizzleY, &index, &ZeroVec, &offset[1]);
-      fetch_src_file_channel(mach, 0, inst->TexOffsets[0].File,
+      fetch_src_file_channel(mach, inst->TexOffsets[0].File,
                              inst->TexOffsets[0].SwizzleZ, &index, &ZeroVec, &offset[2]);
      offsets[0] = offset[0].i[0];
      offsets[1] = offset[1].i[0];
@@ -2213,7 +2201,6 @@
       index2.i[3] = reg->Indirect.Index;
 
       fetch_src_file_channel(mach,
-                             0,
                              reg->Indirect.File,
                              reg->Indirect.Swizzle,
                              &index2,
@@ -2264,7 +2251,7 @@
 
    assert(dim <= 4);
    if (shadow_ref >= 0)
-      assert(shadow_ref >= dim && shadow_ref < ARRAY_SIZE(args));
+      assert(shadow_ref >= dim && shadow_ref < (int)ARRAY_SIZE(args));
 
    /* fetch modifier to the last argument */
    if (modifier != TEX_MODIFIER_NONE) {
@@ -2300,7 +2287,7 @@
          control = TGSI_SAMPLER_GATHER;
    }
    else {
-      for (i = dim; i < ARRAY_SIZE(args); i++)
+      for (i = dim; i < (int)ARRAY_SIZE(args); i++)
          args[i] = &ZeroVec;
    }
 
@@ -2352,8 +2339,8 @@
           const struct tgsi_full_instruction *inst)
 {
    uint resource_unit, sampler_unit;
-   int dim;
-   int i;
+   unsigned dim;
+   unsigned i;
    union tgsi_exec_channel coords[4];
    const union tgsi_exec_channel *args[ARRAY_SIZE(coords)];
    union tgsi_exec_channel r[2];
@@ -3561,8 +3548,8 @@
    union tgsi_exec_channel src[2];
    uint i;
 
-   fetch_source_d(mach, &src[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
-   fetch_source_d(mach, &src[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
+   fetch_source_d(mach, &src[0], reg, chan_0);
+   fetch_source_d(mach, &src[1], reg, chan_1);
 
    for (i = 0; i < TGSI_QUAD_SIZE; i++) {
       chan->u[i][0] = src[0].u[i];
@@ -3611,9 +3598,9 @@
          }
    }
 
-   store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
-   if (chan_1 != -1)
-      store_dest_double(mach, &dst[1], reg, inst, chan_1, TGSI_EXEC_DATA_UINT);
+   store_dest_double(mach, &dst[0], reg, chan_0, TGSI_EXEC_DATA_UINT);
+   if (chan_1 != (unsigned)-1)
+      store_dest_double(mach, &dst[1], reg, chan_1, TGSI_EXEC_DATA_UINT);
 }
 
 static void
@@ -5146,7 +5133,7 @@
       break;
 
    case TGSI_OPCODE_KILL:
-      exec_kill (mach, inst);
+      exec_kill (mach);
       break;
 
    case TGSI_OPCODE_KILL_IF:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 0fac7ea..ed8b9e8 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -544,6 +544,8 @@
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_SCALAR_ISA:
+      return 1;
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return PIPE_MAX_SHADER_BUFFERS;
    case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 4aa6587..bbe1a21 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -153,6 +153,8 @@
    case TGSI_OPCODE_POPC:
    case TGSI_OPCODE_LSB:
    case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_IMG2HND:
+   case TGSI_OPCODE_SAMP2HND:
       return TGSI_TYPE_UNSIGNED;
    case TGSI_OPCODE_ARL:
    case TGSI_OPCODE_ARR:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
index 1b2803c..c3787c2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_info_opcodes.h
@@ -162,8 +162,8 @@
 OPCODE(1, 1, COMP, ISSG)
 OPCODE(1, 2, OTHR, LOAD)
 OPCODE(1, 2, OTHR, STORE, .is_store = 1)
-OPCODE_GAP(163) /* removed */
-OPCODE_GAP(164) /* removed */
+OPCODE(1, 1, OTHR, IMG2HND)
+OPCODE(1, 1, OTHR, SAMP2HND, .is_tex = 1)
 OPCODE_GAP(165) /* removed */
 OPCODE(0, 0, OTHR, BARRIER)
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
index 47aa3df..664cb3b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
@@ -1170,7 +1170,7 @@
    struct tgsi_full_declaration decl;
    struct tgsi_full_instruction new_inst;
    unsigned inbase, tmpbase;
-   int i;
+   unsigned i;
 
    inbase  = info->file_max[TGSI_FILE_INPUT] + 1;
    tmpbase = info->file_max[TGSI_FILE_TEMPORARY] + 1;
@@ -1254,7 +1254,7 @@
    struct tgsi_full_declaration decl;
    struct tgsi_full_immediate immed;
    unsigned tmpbase;
-   int i;
+   unsigned i;
 
    tmpbase = info->file_max[TGSI_FILE_TEMPORARY] + 1;
 
@@ -1308,7 +1308,7 @@
       struct tgsi_src_register *src = &inst->Src[i].Register;
       if (src->File == TGSI_FILE_INPUT) {
          for (j = 0; j < ctx->two_side_colors; j++) {
-            if (src->Index == ctx->two_side_idx[j]) {
+	    if (src->Index == (int)ctx->two_side_idx[j]) {
                src->File = TGSI_FILE_TEMPORARY;
                src->Index = ctx->color_base + j;
                break;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
index 9a13fa6..54a1ee1 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h
@@ -160,6 +160,9 @@
 OP11(IABS)
 OP11(ISSG)
 
+OP11(IMG2HND)
+OP11(SAMP2HND)
+
 OP12(IMUL_HI)
 OP12(UMUL_HI)
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index c706fc8..65f7e74 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -51,7 +51,7 @@
 
 void
 tgsi_parse_free(
-   struct tgsi_parse_context *ctx )
+   UNUSED struct tgsi_parse_context *ctx )
 {
 }
 
@@ -59,8 +59,12 @@
 tgsi_parse_end_of_tokens(
    struct tgsi_parse_context *ctx )
 {
-   return ctx->Position >=
-      ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
+   /* All values involved are unsigned, but the sum will be promoted to
+    * a signed value (at least on 64 bit). To capture a possible overflow
+    * make it a signed comparison.
+    */
+   return (int)ctx->Position >=
+	 ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c b/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
index f60a17c..67b4b0a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_point_sprite.c
@@ -114,6 +114,7 @@
              struct tgsi_full_declaration *decl)
 {
    struct psprite_transform_context *ts = psprite_transform_context(ctx);
+   unsigned range_end = decl->Range.Last + 1;
 
    if (decl->Declaration.File == TGSI_FILE_INPUT) {
       if (decl->Semantic.Name == TGSI_SEMANTIC_PSIZE) {
@@ -135,13 +136,13 @@
          ts->point_coord_decl |= 1 << decl->Semantic.Index;
          ts->max_generic = MAX2(ts->max_generic, (int)decl->Semantic.Index);
       }
-      ts->num_out = MAX2(ts->num_out, decl->Range.Last + 1);
+      ts->num_out = MAX2(ts->num_out, range_end);
    }
    else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
-      ts->num_tmp = MAX2(ts->num_tmp, decl->Range.Last + 1);
+      ts->num_tmp = MAX2(ts->num_tmp, range_end);
    }
    else if (decl->Declaration.File == TGSI_FILE_CONSTANT) {
-      ts->num_const = MAX2(ts->num_const, decl->Range.Last + 1);
+      ts->num_const = MAX2(ts->num_const, range_end);
    }
 
    ctx->emit_declaration(ctx, decl);
@@ -169,7 +170,7 @@
 {
    struct psprite_transform_context *ts = psprite_transform_context(ctx);
    unsigned point_coord_enable, en;
-   int i;
+   unsigned i;
 
    /* Replace output registers with temporary registers */
    for (i = 0; i < ts->num_out; i++) {
@@ -427,7 +428,7 @@
       psprite_emit_vertex_inst(ctx, inst);
    }
    else if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT &&
-            inst->Dst[0].Register.Index == ts->point_size_out) {
+	    inst->Dst[0].Register.Index == (int)ts->point_size_out) {
       /**
        * Replace point size output reg with tmp reg.
        * The tmp reg will be later used as a src reg for computing
@@ -451,7 +452,7 @@
                  TGSI_FILE_CONSTANT, ts->point_ivp, TGSI_SWIZZLE_W, false);
    }
    else if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT &&
-            inst->Dst[0].Register.Index == ts->point_pos_out) {
+	    inst->Dst[0].Register.Index == (int)ts->point_pos_out) {
       /**
        * Replace point pos output reg with tmp reg.
        */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 2c9ad99..47f426d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -313,7 +313,7 @@
    uint i;
 
    if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
-      if (ctx->index_of_END != ~0) {
+      if (ctx->index_of_END != ~0u) {
          report_error( ctx, "Too many END instructions" );
       }
       ctx->index_of_END = ctx->num_instructions;
@@ -514,7 +514,7 @@
 
    /* There must be an END instruction somewhere.
     */
-   if (ctx->index_of_END == ~0) {
+   if (ctx->index_of_END == ~0u) {
       report_error( ctx, "Missing END instruction" );
    }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index aeccb05..e13500a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -837,13 +837,12 @@
           procType == PIPE_SHADER_TESS_EVAL ||
           procType == PIPE_SHADER_COMPUTE);
    info->processor = procType;
+   info->num_tokens = tgsi_num_tokens(parse.Tokens);
 
    /**
     ** Loop over incoming program tokens/instructions
     */
    while (!tgsi_parse_end_of_tokens(&parse)) {
-      info->num_tokens++;
-
       tgsi_parse_token( &parse );
 
       switch( parse.FullToken.Token.Type ) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 4f28b49..4348712 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -185,6 +185,8 @@
    "SAMPLE",
 };
 
+const char *tgsi_invariant_name = "INVARIANT";
+
 const char *tgsi_primitive_names[PIPE_PRIM_MAX] =
 {
    "POINTS",
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.h b/src/gallium/auxiliary/tgsi/tgsi_strings.h
index bb2d345..20e3f71 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.h
@@ -52,6 +52,8 @@
 
 extern const char *tgsi_interpolate_locations[TGSI_INTERPOLATE_LOC_COUNT];
 
+extern const char *tgsi_invariant_name;
+
 extern const char *tgsi_primitive_names[PIPE_PRIM_MAX];
 
 extern const char *tgsi_fs_coord_origin_names[2];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 02241a6..9779f21 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -211,7 +211,7 @@
 static boolean parse_identifier( const char **pcur, char *ret, size_t len )
 {
    const char *cur = *pcur;
-   int i = 0;
+   size_t i = 0;
    if (is_alpha_underscore( cur )) {
       ret[i++] = *cur++;
       while (is_alpha_underscore( cur ) || is_digit( cur )) {
@@ -866,7 +866,7 @@
 
    eat_opt_white( &cur );
    if (*cur == '.') {
-      uint i;
+      int i;
 
       cur++;
       eat_opt_white( &cur );
@@ -1037,7 +1037,7 @@
    struct translate_ctx *ctx,
    boolean has_label )
 {
-   uint i;
+   int i;
    uint saturate = 0;
    uint precise = 0;
    const struct tgsi_opcode_info *info;
@@ -1586,10 +1586,6 @@
             break;
          }
       }
-      if (i == TGSI_INTERPOLATE_COUNT) {
-         report_error( ctx, "Expected semantic or interpolate attribute" );
-         return FALSE;
-      }
    }
 
    cur = ctx->cur;
@@ -1609,6 +1605,20 @@
       }
    }
 
+   cur = ctx->cur;
+   eat_opt_white( &cur );
+   if (*cur == ',' && !is_vs_input) {
+      cur++;
+      eat_opt_white( &cur );
+      if (str_match_nocase_whole( &cur, tgsi_invariant_name )) {
+         decl.Declaration.Invariant = 1;
+         ctx->cur = cur;
+      } else {
+         report_error( ctx, "Expected semantic, interpolate attribute, or invariant ");
+         return FALSE;
+      }
+   }
+
    advance = tgsi_build_full_declaration(
       &decl,
       ctx->tokens_cur,
@@ -1626,7 +1636,7 @@
 {
    struct tgsi_full_immediate imm;
    uint advance;
-   int type;
+   uint type;
 
    if (*ctx->cur == '[') {
       uint uindex;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c
index cd076c9..4b2b10f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c
@@ -140,10 +140,6 @@
       return -1;
    }
    procType = parse.FullHeader.Processor.Processor;
-   assert(procType == PIPE_SHADER_FRAGMENT ||
-          procType == PIPE_SHADER_VERTEX ||
-          procType == PIPE_SHADER_GEOMETRY);
-
 
    /**
     **  Setup output shader
diff --git a/src/gallium/auxiliary/tgsi/tgsi_two_side.c b/src/gallium/auxiliary/tgsi/tgsi_two_side.c
index 2406e28..53ac2a3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_two_side.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_two_side.c
@@ -72,6 +72,7 @@
            struct tgsi_full_declaration *decl)
 {
    struct two_side_transform_context *ts = two_side_transform_context(ctx);
+   unsigned range_end = decl->Range.Last + 1;
 
    if (decl->Declaration.File == TGSI_FILE_INPUT) {
       if (decl->Semantic.Name == TGSI_SEMANTIC_COLOR) {
@@ -83,10 +84,10 @@
       else if (decl->Semantic.Name == TGSI_SEMANTIC_FACE) {
          ts->face_input = decl->Range.First;
       }
-      ts->num_inputs = MAX2(ts->num_inputs, decl->Range.Last + 1);
+      ts->num_inputs = MAX2(ts->num_inputs, range_end);
    }
    else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) {
-      ts->num_temps = MAX2(ts->num_temps, decl->Range.Last + 1);
+      ts->num_temps = MAX2(ts->num_temps, range_end);
    }
 
    ctx->emit_declaration(ctx, decl);
@@ -181,7 +182,7 @@
    for (i = 0; i < info->num_src; i++) {
       if (inst->Src[i].Register.File == TGSI_FILE_INPUT) {
          for (j = 0; j < 2; j++) {
-            if (inst->Src[i].Register.Index == ts->front_color_input[j]) {
+	    if (inst->Src[i].Register.Index == (int)ts->front_color_input[j]) {
                /* replace color input with temp reg */
                inst->Src[i].Register.File = TGSI_FILE_TEMPORARY;
                inst->Src[i].Register.Index = ts->new_colors[j];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 393e015..92c98c7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 
@@ -140,6 +140,7 @@
       unsigned first;
       unsigned last;
       unsigned array_id;
+      boolean invariant;
    } output[UREG_MAX_OUTPUT];
    unsigned nr_outputs, nr_output_regs;
 
@@ -170,7 +171,7 @@
    struct {
       unsigned index;
       enum tgsi_texture_type target;
-      unsigned format;
+      enum pipe_format format;
       boolean wr;
       boolean raw;
    } image[PIPE_MAX_SHADER_IMAGES];
@@ -349,7 +350,7 @@
 }
 
 
-struct ureg_src 
+struct ureg_src
 ureg_DECL_vs_input( struct ureg_program *ureg,
                     unsigned index )
 {
@@ -427,7 +428,8 @@
                         unsigned index,
                         unsigned usage_mask,
                         unsigned array_id,
-                        unsigned array_size)
+                        unsigned array_size,
+                        boolean invariant)
 {
    unsigned i;
 
@@ -455,6 +457,7 @@
       ureg->output[i].first = index;
       ureg->output[i].last = index + array_size - 1;
       ureg->output[i].array_id = array_id;
+      ureg->output[i].invariant = invariant;
       ureg->nr_output_regs = MAX2(ureg->nr_output_regs, index + array_size);
       ureg->nr_outputs++;
    }
@@ -480,11 +483,12 @@
                         unsigned array_size)
 {
    return ureg_DECL_output_layout(ureg, name, index, 0,
-                                  ureg->nr_output_regs, usage_mask, array_id, array_size);
+                                  ureg->nr_output_regs, usage_mask, array_id,
+                                  array_size, FALSE);
 }
 
 
-struct ureg_dst 
+struct ureg_dst
 ureg_DECL_output(struct ureg_program *ureg,
                  enum tgsi_semantic name,
                  unsigned index)
@@ -718,9 +722,9 @@
    unsigned i;
 
    for (i = 0; i < ureg->nr_samplers; i++)
-      if (ureg->sampler[i].Index == nr)
+      if (ureg->sampler[i].Index == (int)nr)
          return ureg->sampler[i];
-   
+
    if (i < PIPE_MAX_SAMPLERS) {
       ureg->sampler[i] = ureg_src_register( TGSI_FILE_SAMPLER, nr );
       ureg->nr_samplers++;
@@ -773,7 +777,7 @@
 ureg_DECL_image(struct ureg_program *ureg,
                 unsigned index,
                 enum tgsi_texture_type target,
-                unsigned format,
+                enum pipe_format format,
                 boolean wr,
                 boolean raw)
 {
@@ -834,7 +838,6 @@
 
 static int
 match_or_expand_immediate64( const unsigned *v,
-                             int type,
                              unsigned nr,
                              unsigned *v2,
                              unsigned *pnr2,
@@ -886,7 +889,7 @@
    if (type == TGSI_IMM_FLOAT64 ||
        type == TGSI_IMM_UINT64 ||
        type == TGSI_IMM_INT64)
-      return match_or_expand_immediate64(v, type, nr, v2, pnr2, swizzle);
+      return match_or_expand_immediate64(v, nr, v2, pnr2, swizzle);
 
    *swizzle = 0;
 
@@ -1116,7 +1119,7 @@
 
    assert(src.File != TGSI_FILE_NULL);
    assert(src.File < TGSI_FILE_COUNT);
-   
+
    out[n].value = 0;
    out[n].src.File = src.File;
    out[n].src.SwizzleX = src.SwizzleX;
@@ -1170,7 +1173,7 @@
 }
 
 
-void 
+void
 ureg_emit_dst( struct ureg_program *ureg,
                struct ureg_dst dst )
 {
@@ -1192,7 +1195,7 @@
    out[n].dst.Indirect = dst.Indirect;
    out[n].dst.Index = dst.Index;
    n++;
-   
+
    if (dst.Indirect) {
       out[n].value = 0;
       out[n].ind.File = dst.IndirectFile;
@@ -1261,7 +1264,7 @@
    struct ureg_emit_insn_result result;
 
    validate( opcode, num_dst, num_src );
-   
+
    out = get_tokens( ureg, DOMAIN_INSN, count );
    out[0].insn = tgsi_default_instruction();
    out[0].insn.Opcode = opcode;
@@ -1356,15 +1359,14 @@
 
    out[0].value = 0;
    out[0].insn_texture_offset = *offset;
-   
 }
 
 void
 ureg_emit_memory(struct ureg_program *ureg,
                  unsigned extended_token,
                  unsigned qualifier,
-                 unsigned texture,
-                 unsigned format)
+                 enum tgsi_texture_type texture,
+                 enum pipe_format format)
 {
    union tgsi_any_token *out, *insn;
 
@@ -1478,8 +1480,8 @@
                  const struct ureg_src *src,
                  unsigned nr_src,
                  unsigned qualifier,
-                 unsigned texture,
-                 unsigned format)
+                 enum tgsi_texture_type texture,
+                 enum pipe_format format)
 {
    struct ureg_emit_insn_result insn;
    unsigned i;
@@ -1512,7 +1514,8 @@
                    unsigned semantic_index,
                    unsigned streams,
                    unsigned usage_mask,
-                   unsigned array_id)
+                   unsigned array_id,
+                   boolean invariant)
 {
    union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, array_id ? 4 : 3);
 
@@ -1523,6 +1526,7 @@
    out[0].decl.UsageMask = usage_mask;
    out[0].decl.Semantic = 1;
    out[0].decl.Array = array_id != 0;
+   out[0].decl.Invariant = invariant;
 
    out[1].value = 0;
    out[1].decl_range.First = first;
@@ -1719,7 +1723,7 @@
 emit_decl_image(struct ureg_program *ureg,
                 unsigned index,
                 enum tgsi_texture_type target,
-                unsigned format,
+                enum pipe_format format,
                 boolean wr,
                 boolean raw)
 {
@@ -1818,7 +1822,7 @@
    unsigned i,j;
 
    for (i = 0; i < ARRAY_SIZE(ureg->properties); i++)
-      if (ureg->properties[i] != ~0)
+      if (ureg->properties[i] != ~0u)
          emit_property(ureg, i, ureg->properties[i]);
 
    if (ureg->processor == PIPE_SHADER_VERTEX) {
@@ -1870,7 +1874,8 @@
                                ureg->input[i].semantic_index,
                                0,
                                TGSI_WRITEMASK_XYZW,
-                               ureg->input[i].array_id);
+                               ureg->input[i].array_id,
+                               FALSE);
          }
       }
       else {
@@ -1883,7 +1888,7 @@
                                   ureg->input[i].semantic_index +
                                   (j - ureg->input[i].first),
                                   0,
-                                  TGSI_WRITEMASK_XYZW, 0);
+                                  TGSI_WRITEMASK_XYZW, 0, FALSE);
             }
          }
       }
@@ -1897,7 +1902,7 @@
                          ureg->system_value[i].semantic_name,
                          ureg->system_value[i].semantic_index,
                          0,
-                         TGSI_WRITEMASK_XYZW, 0);
+                         TGSI_WRITEMASK_XYZW, 0, FALSE);
    }
 
    if (ureg->supports_any_inout_decl_range) {
@@ -1910,7 +1915,8 @@
                             ureg->output[i].semantic_index,
                             ureg->output[i].streams,
                             ureg->output[i].usage_mask,
-                            ureg->output[i].array_id);
+                            ureg->output[i].array_id,
+                            ureg->output[i].invariant);
       }
    }
    else {
@@ -1923,13 +1929,15 @@
                                ureg->output[i].semantic_index +
                                (j - ureg->output[i].first),
                                ureg->output[i].streams,
-                               ureg->output[i].usage_mask, 0);
+                               ureg->output[i].usage_mask,
+                               0,
+                               ureg->output[i].invariant);
          }
       }
    }
 
    for (i = 0; i < ureg->nr_samplers; i++) {
-      emit_decl_range( ureg, 
+      emit_decl_range( ureg,
                        TGSI_FILE_SAMPLER,
                        ureg->sampler[i].Index, 1 );
    }
@@ -2029,12 +2037,12 @@
 static void copy_instructions( struct ureg_program *ureg )
 {
    unsigned nr_tokens = ureg->domain[DOMAIN_INSN].count;
-   union tgsi_any_token *out = get_tokens( ureg, 
-                                           DOMAIN_DECL, 
+   union tgsi_any_token *out = get_tokens( ureg,
+                                           DOMAIN_DECL,
                                            nr_tokens );
 
-   memcpy(out, 
-          ureg->domain[DOMAIN_INSN].tokens, 
+   memcpy(out,
+          ureg->domain[DOMAIN_INSN].tokens,
           nr_tokens * sizeof out[0] );
 }
 
@@ -2081,7 +2089,7 @@
    emit_decls( ureg );
    copy_instructions( ureg );
    fixup_header_size( ureg );
-   
+
    if (ureg->domain[0].tokens == error_tokens ||
        ureg->domain[1].tokens == error_tokens) {
       debug_printf("%s: error in generated shader\n", __FUNCTION__);
@@ -2092,7 +2100,7 @@
    tokens = &ureg->domain[DOMAIN_DECL].tokens[0].token;
 
    if (0) {
-      debug_printf("%s: emitted shader %d tokens:\n", __FUNCTION__, 
+      debug_printf("%s: emitted shader %d tokens:\n", __FUNCTION__,
                    ureg->domain[DOMAIN_DECL].count);
       tgsi_dump( tokens, 0 );
    }
@@ -2105,7 +2113,7 @@
    }
 #endif
 
-   
+
    return tokens;
 }
 
@@ -2149,7 +2157,7 @@
 
    tokens = &ureg->domain[DOMAIN_DECL].tokens[0].token;
 
-   if (nr_tokens) 
+   if (nr_tokens)
       *nr_tokens = ureg->domain[DOMAIN_DECL].count;
 
    ureg->domain[DOMAIN_DECL].tokens = 0;
@@ -2178,7 +2186,7 @@
 ureg_create_with_screen(enum pipe_shader_type processor,
                         struct pipe_screen *screen)
 {
-   int i;
+   uint i;
    struct ureg_program *ureg = CALLOC_STRUCT( ureg_program );
    if (!ureg)
       goto no_ureg;
@@ -2239,7 +2247,7 @@
    unsigned i;
 
    for (i = 0; i < ARRAY_SIZE(ureg->domain); i++) {
-      if (ureg->domain[i].tokens && 
+      if (ureg->domain[i].tokens &&
           ureg->domain[i].tokens != error_tokens)
          FREE(ureg->domain[i].tokens);
    }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 7eef94a..c974ed0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -79,6 +79,7 @@
    unsigned DimIndirect     : 1;  /* BOOL */
    unsigned Dimension       : 1;  /* BOOL */
    unsigned Saturate        : 1;  /* BOOL */
+   unsigned Invariant       : 1;  /* BOOL */
    int      Index           : 16; /* SINT */
    int      IndirectIndex   : 16; /* SINT */
    unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
@@ -250,7 +251,8 @@
                         unsigned index,
                         unsigned usage_mask,
                         unsigned array_id,
-                        unsigned array_size);
+                        unsigned array_size,
+                        boolean invariant);
 
 struct ureg_dst
 ureg_DECL_output_masked(struct ureg_program *,
@@ -372,7 +374,7 @@
 ureg_DECL_image(struct ureg_program *ureg,
                 unsigned index,
                 enum tgsi_texture_type target,
-                unsigned format,
+                enum pipe_format format,
                 boolean wr,
                 boolean raw);
 
@@ -579,8 +581,8 @@
                  const struct ureg_src *src,
                  unsigned nr_src,
                  unsigned qualifier,
-                 unsigned texture,
-                 unsigned format);
+                 enum tgsi_texture_type texture,
+                 enum pipe_format format);
 
 /***********************************************************************
  * Internal instruction helpers, don't call these directly:
@@ -619,8 +621,8 @@
 ureg_emit_memory(struct ureg_program *ureg,
                  unsigned insn_token,
                  unsigned qualifier,
-                 unsigned texture,
-                 unsigned format);
+                 enum tgsi_texture_type texture,
+                 enum pipe_format format);
 
 void 
 ureg_emit_dst( struct ureg_program *ureg,
@@ -1017,6 +1019,7 @@
    dst.DimIndIndex = 0;
    dst.DimIndSwizzle = 0;
    dst.ArrayID = array_id;
+   dst.Invariant = 0;
 
    return dst;
 }
@@ -1048,6 +1051,7 @@
    dst.DimIndIndex = src.DimIndIndex;
    dst.DimIndSwizzle = src.DimIndSwizzle;
    dst.ArrayID = src.ArrayID;
+   dst.Invariant = 0;
 
    return dst;
 }
@@ -1139,6 +1143,7 @@
    dst.DimIndIndex = 0;
    dst.DimIndSwizzle = 0;
    dst.ArrayID = 0;
+   dst.Invariant = 0;
 
    return dst;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index a78172a..ebbd561 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -106,7 +106,7 @@
 unsigned
 tgsi_util_get_full_src_register_sign_mode(
    const struct tgsi_full_src_register *reg,
-   unsigned component)
+   UNUSED unsigned component)
 {
    unsigned sign_mode;
 
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index d1571cd1f5..ca3d221 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -112,7 +112,7 @@
    for (i = 0; i < 2; i++) {
       ctx->velem[i].src_offset = i * 4 * sizeof(float);
       ctx->velem[i].instance_divisor = 0;
-      ctx->velem[i].vertex_buffer_index = cso_get_aux_vertex_buffer_slot(cso);
+      ctx->velem[i].vertex_buffer_index = 0;
       ctx->velem[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
    }
 
@@ -551,6 +551,7 @@
    assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format,
                                                  PIPE_TEXTURE_2D,
                                                  dst->texture->nr_samples,
+                                                 dst->texture->nr_storage_samples,
                                                  PIPE_BIND_RENDER_TARGET));
 
    /* save state (restored below) */
@@ -631,8 +632,7 @@
                                   s0, t0, s1, t1,
                                   z);
 
-   util_draw_vertex_buffer(ctx->pipe, ctx->cso, ctx->vbuf,
-                           cso_get_aux_vertex_buffer_slot(ctx->cso),
+   util_draw_vertex_buffer(ctx->pipe, ctx->cso, ctx->vbuf, 0,
                            offset,
                            PIPE_PRIM_TRIANGLE_FAN,
                            4,  /* verts */
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 4748627..a9df711 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -1182,7 +1182,7 @@
             /* MSAA resolve shaders. */
             for (j = 2; j < 32; j++) {
                if (!screen->is_format_supported(screen, PIPE_FORMAT_R32_FLOAT,
-                                                target, j,
+                                                target, j, j,
                                                 PIPE_BIND_SAMPLER_VIEW)) {
                   continue;
                }
@@ -1258,8 +1258,20 @@
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    pipe->bind_vertex_elements_state(pipe, vertex_elements_cso);
    pipe->bind_vs_state(pipe, get_vs(&ctx->base));
-   util_draw_arrays_instanced(pipe, PIPE_PRIM_TRIANGLE_FAN, 0, 4,
-                              0, num_instances);
+
+   if (ctx->base.use_index_buffer) {
+      /* Note that for V3D,
+       * dEQP-GLES3.functional.fbo.blit.rect.nearest_consistency_* require
+       * that the last vert of the two tris be the same.
+       */
+      static uint8_t indices[6] = { 0, 1, 2, 0, 3, 2 };
+      util_draw_elements_instanced(pipe, indices, 1, 0,
+                                   PIPE_PRIM_TRIANGLES, 0, 6,
+                                   0, num_instances);
+   } else {
+      util_draw_arrays_instanced(pipe, PIPE_PRIM_TRIANGLE_FAN, 0, 4,
+                                 0, num_instances);
+   }
    pipe_resource_reference(&vb.buffer.resource, NULL);
 }
 
@@ -1527,7 +1539,8 @@
          bind = PIPE_BIND_RENDER_TARGET;
 
       if (!screen->is_format_supported(screen, dst_format, dst->target,
-                                       dst->nr_samples, bind)) {
+                                       dst->nr_samples, dst->nr_storage_samples,
+                                       bind)) {
          return false;
       }
    }
@@ -1538,7 +1551,8 @@
       }
 
       if (!screen->is_format_supported(screen, src_format, src->target,
-                                 src->nr_samples, PIPE_BIND_SAMPLER_VIEW)) {
+                                       src->nr_samples, src->nr_storage_samples,
+                                       PIPE_BIND_SAMPLER_VIEW)) {
          return false;
       }
 
@@ -1552,6 +1566,7 @@
             if (stencil_format != src_format &&
                 !screen->is_format_supported(screen, stencil_format,
                                              src->target, src->nr_samples,
+                                             src->nr_storage_samples,
                                              PIPE_BIND_SAMPLER_VIEW)) {
                return false;
             }
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 9e94598..9ea1dc9 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -100,6 +100,8 @@
    /* Whether the blitter is running. */
    bool running;
 
+   bool use_index_buffer;
+
    /* Private members, really. */
    struct pipe_context *pipe; /**< pipe context */
 
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 6a59f27..3c6ae4e 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -374,7 +374,7 @@
    }
 #elif defined(PIPE_OS_UNIX) && defined(_SC_NPROCESSORS_ONLN)
    util_cpu_caps.nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-   if (util_cpu_caps.nr_cpus == ~0u)
+   if (util_cpu_caps.nr_cpus == ~0)
       util_cpu_caps.nr_cpus = 1;
 #elif defined(PIPE_OS_BSD)
    {
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 4a34ac4d..7a63d55 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -46,7 +46,7 @@
 
 
 struct util_cpu_caps {
-   unsigned nr_cpus;
+   int nr_cpus;
 
    /* Feature flags */
    int x86_cpu_type;
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index d2ea89f..4c3b8ba 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -170,8 +170,8 @@
                         const char *file, 
                         unsigned line, 
                         const char *function)
-#ifdef __GNUC__
-   __attribute__((__noreturn__))
+#if defined(__GNUC__) && !defined(DEBUG)
+   __attribute__((noreturn))
 #endif
 ;
 
diff --git a/src/gallium/auxiliary/util/u_debug_memory.c b/src/gallium/auxiliary/util/u_debug_memory.c
index 1ba553c..0a52daa 100644
--- a/src/gallium/auxiliary/util/u_debug_memory.c
+++ b/src/gallium/auxiliary/util/u_debug_memory.c
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2008 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,17 +22,17 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 /**
  * @file
  * Memory debugging.
- * 
+ *
  * @author José Fonseca <jfonseca@vmware.com>
  */
 
-#include "pipe/p_config.h" 
+#include "pipe/p_config.h"
 
 #define DEBUG_MEMORY_IMPLEMENTATION
 
@@ -40,12 +40,12 @@
 #include "os/os_memory_debug.h"
 #include "os/os_thread.h"
 
-#include "util/u_debug.h" 
-#include "util/u_debug_stack.h" 
+#include "util/u_debug.h"
+#include "util/u_debug_stack.h"
 #include "util/list.h"
 
 
-#define DEBUG_MEMORY_MAGIC 0x6e34090aU 
+#define DEBUG_MEMORY_MAGIC 0x6e34090aU
 #define DEBUG_MEMORY_STACK 0 /* XXX: disabled until we have symbol lookup */
 
 /**
@@ -59,10 +59,10 @@
 #define DEBUG_FREED_BYTE 0x33
 
 
-struct debug_memory_header 
+struct debug_memory_header
 {
    struct list_head head;
-   
+
    unsigned long no;
    const char *file;
    unsigned line;
@@ -95,7 +95,7 @@
 static inline struct debug_memory_header *
 header_from_data(void *data)
 {
-   if(data)
+   if (data)
       return (struct debug_memory_header *)((char *)data - sizeof(struct debug_memory_header));
    else
       return NULL;
@@ -104,7 +104,7 @@
 static inline void *
 data_from_header(struct debug_memory_header *hdr)
 {
-   if(hdr)
+   if (hdr)
       return (void *)((char *)hdr + sizeof(struct debug_memory_header));
    else
       return NULL;
@@ -113,7 +113,7 @@
 static inline struct debug_memory_footer *
 footer_from_header(struct debug_memory_header *hdr)
 {
-   if(hdr)
+   if (hdr)
       return (struct debug_memory_footer *)((char *)hdr + sizeof(struct debug_memory_header) + hdr->size);
    else
       return NULL;
@@ -122,11 +122,11 @@
 
 void *
 debug_malloc(const char *file, unsigned line, const char *function,
-             size_t size) 
+             size_t size)
 {
    struct debug_memory_header *hdr;
    struct debug_memory_footer *ftr;
-   
+
    hdr = os_malloc(sizeof(*hdr) + size + sizeof(*ftr));
    if (!hdr) {
       debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
@@ -134,7 +134,7 @@
                    (long unsigned)size);
       return NULL;
    }
- 
+
    hdr->no = last_no++;
    hdr->file = file;
    hdr->line = line;
@@ -152,26 +152,26 @@
 
    ftr = footer_from_header(hdr);
    ftr->magic = DEBUG_MEMORY_MAGIC;
-   
+
    mtx_lock(&list_mutex);
    LIST_ADDTAIL(&hdr->head, &list);
    mtx_unlock(&list_mutex);
-   
+
    return data_from_header(hdr);
 }
 
 void
 debug_free(const char *file, unsigned line, const char *function,
-           void *ptr) 
+           void *ptr)
 {
    struct debug_memory_header *hdr;
    struct debug_memory_footer *ftr;
-   
+
    if (!ptr)
       return;
-   
+
    hdr = header_from_data(ptr);
-   if(hdr->magic != DEBUG_MEMORY_MAGIC) {
+   if (hdr->magic != DEBUG_MEMORY_MAGIC) {
       debug_printf("%s:%u:%s: freeing bad or corrupted memory %p\n",
                    file, line, function,
                    ptr);
@@ -180,7 +180,7 @@
    }
 
    ftr = footer_from_header(hdr);
-   if(ftr->magic != DEBUG_MEMORY_MAGIC) {
+   if (ftr->magic != DEBUG_MEMORY_MAGIC) {
       debug_printf("%s:%u:%s: buffer overflow %p\n",
                    hdr->file, hdr->line, hdr->function,
                    ptr);
@@ -203,7 +203,7 @@
    mtx_unlock(&list_mutex);
    hdr->magic = 0;
    ftr->magic = 0;
-   
+
    os_free(hdr);
 #endif
 }
@@ -225,26 +225,26 @@
    struct debug_memory_header *old_hdr, *new_hdr;
    struct debug_memory_footer *old_ftr, *new_ftr;
    void *new_ptr;
-   
+
    if (!old_ptr)
       return debug_malloc( file, line, function, new_size );
-   
-   if(!new_size) {
+
+   if (!new_size) {
       debug_free( file, line, function, old_ptr );
       return NULL;
    }
-   
+
    old_hdr = header_from_data(old_ptr);
-   if(old_hdr->magic != DEBUG_MEMORY_MAGIC) {
+   if (old_hdr->magic != DEBUG_MEMORY_MAGIC) {
       debug_printf("%s:%u:%s: reallocating bad or corrupted memory %p\n",
                    file, line, function,
                    old_ptr);
       debug_assert(0);
       return NULL;
    }
-   
+
    old_ftr = footer_from_header(old_hdr);
-   if(old_ftr->magic != DEBUG_MEMORY_MAGIC) {
+   if (old_ftr->magic != DEBUG_MEMORY_MAGIC) {
       debug_printf("%s:%u:%s: buffer overflow %p\n",
                    old_hdr->file, old_hdr->line, old_hdr->function,
                    old_ptr);
@@ -269,10 +269,10 @@
 #if DEBUG_FREED_MEMORY
    new_hdr->freed = FALSE;
 #endif
-   
+
    new_ftr = footer_from_header(new_hdr);
    new_ftr->magic = DEBUG_MEMORY_MAGIC;
-   
+
    mtx_lock(&list_mutex);
    LIST_REPLACE(&old_hdr->head, &new_hdr->head);
    mtx_unlock(&list_mutex);
@@ -295,13 +295,13 @@
    return last_no;
 }
 
-void 
+void
 debug_memory_end(unsigned long start_no)
 {
    size_t total_size = 0;
    struct list_head *entry;
 
-   if(start_no == last_no)
+   if (start_no == last_no)
       return;
 
    entry = list.prev;
@@ -314,25 +314,25 @@
       ptr = data_from_header(hdr);
       ftr = footer_from_header(hdr);
 
-      if(hdr->magic != DEBUG_MEMORY_MAGIC) {
+      if (hdr->magic != DEBUG_MEMORY_MAGIC) {
          debug_printf("%s:%u:%s: bad or corrupted memory %p\n",
                       hdr->file, hdr->line, hdr->function,
                       ptr);
          debug_assert(0);
       }
 
-      if((start_no <= hdr->no && hdr->no < last_no) ||
-	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
-	 debug_printf("%s:%u:%s: %lu bytes at %p not freed\n",
-		      hdr->file, hdr->line, hdr->function,
-		      (unsigned long) hdr->size, ptr);
+      if ((start_no <= hdr->no && hdr->no < last_no) ||
+          (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
+         debug_printf("%s:%u:%s: %lu bytes at %p not freed\n",
+                      hdr->file, hdr->line, hdr->function,
+                      (unsigned long) hdr->size, ptr);
 #if DEBUG_MEMORY_STACK
-	 debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK);
+         debug_backtrace_dump(hdr->backtrace, DEBUG_MEMORY_STACK);
 #endif
-	 total_size += hdr->size;
+         total_size += hdr->size;
       }
 
-      if(ftr->magic != DEBUG_MEMORY_MAGIC) {
+      if (ftr->magic != DEBUG_MEMORY_MAGIC) {
          debug_printf("%s:%u:%s: buffer overflow %p\n",
                       hdr->file, hdr->line, hdr->function,
                       ptr);
@@ -340,9 +340,9 @@
       }
    }
 
-   if(total_size) {
+   if (total_size) {
       debug_printf("Total of %lu KB of system memory apparently leaked\n",
-		   (unsigned long) (total_size + 1023)/1024);
+                   (unsigned long) (total_size + 1023)/1024);
    }
    else {
       debug_printf("No memory leaks detected.\n");
@@ -358,10 +358,10 @@
 debug_memory_tag(void *ptr, unsigned tag)
 {
    struct debug_memory_header *hdr;
-   
+
    if (!ptr)
       return;
-   
+
    hdr = header_from_data(ptr);
    if (hdr->magic != DEBUG_MEMORY_MAGIC) {
       debug_printf("%s corrupted memory at %p\n", __FUNCTION__, ptr);
@@ -380,10 +380,10 @@
 {
    struct debug_memory_header *hdr;
    struct debug_memory_footer *ftr;
-   
+
    if (!ptr)
       return;
-   
+
    hdr = header_from_data(ptr);
    ftr = footer_from_header(hdr);
 
@@ -406,7 +406,7 @@
  * We can periodically call this from elsewhere to do a basic sanity
  * check of the heap memory we've allocated.
  */
-void 
+void
 debug_memory_check(void)
 {
    struct list_head *entry;
diff --git a/src/gallium/auxiliary/util/u_debug_stack_android.cpp b/src/gallium/auxiliary/util/u_debug_stack_android.cpp
index b3d56ae..395a1fe 100644
--- a/src/gallium/auxiliary/util/u_debug_stack_android.cpp
+++ b/src/gallium/auxiliary/util/u_debug_stack_android.cpp
@@ -49,10 +49,10 @@
       backtrace_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                 _mesa_key_pointer_equal);
 
-   backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) tid);
+   backtrace_entry = _mesa_hash_table_search(backtrace_table, (void*) (uintptr_t)tid);
    if (!backtrace_entry) {
       backtrace = Backtrace::Create(getpid(), tid);
-      _mesa_hash_table_insert(backtrace_table, (void*) tid, backtrace);
+      _mesa_hash_table_insert(backtrace_table, (void*) (uintptr_t)tid, backtrace);
    } else {
       backtrace = (Backtrace *) backtrace_entry->data;
    }
diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h
index e8af140..d0955fa 100644
--- a/src/gallium/auxiliary/util/u_draw.h
+++ b/src/gallium/auxiliary/util/u_draw.h
@@ -67,7 +67,9 @@
 }
 
 static inline void
-util_draw_elements(struct pipe_context *pipe, unsigned index_size,
+util_draw_elements(struct pipe_context *pipe,
+                   void *indices,
+                   unsigned index_size,
                    int index_bias, enum pipe_prim_type mode,
                    uint start,
                    uint count)
@@ -75,6 +77,8 @@
    struct pipe_draw_info info;
 
    util_draw_init_info(&info);
+   info.index.user = indices;
+   info.has_user_indices = true;
    info.index_size = index_size;
    info.mode = mode;
    info.start = start;
@@ -108,6 +112,7 @@
 
 static inline void
 util_draw_elements_instanced(struct pipe_context *pipe,
+                             void *indices,
                              unsigned index_size,
                              int index_bias,
                              enum pipe_prim_type mode,
@@ -119,6 +124,8 @@
    struct pipe_draw_info info;
 
    util_draw_init_info(&info);
+   info.index.user = indices;
+   info.has_user_indices = true;
    info.index_size = index_size;
    info.mode = mode;
    info.start = start;
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index b68de13..286d5fa 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -319,6 +319,7 @@
 
    util_dump_member(stream, uint, state, last_level);
    util_dump_member(stream, uint, state, nr_samples);
+   util_dump_member(stream, uint, state, nr_storage_samples);
    util_dump_member(stream, uint, state, usage);
    util_dump_member(stream, uint, state, bind);
    util_dump_member(stream, uint, state, flags);
diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c
index 369b4c3..1dd724d 100644
--- a/src/gallium/auxiliary/util/u_format.c
+++ b/src/gallium/auxiliary/util/u_format.c
@@ -235,28 +235,6 @@
       desc->block.bits == 32;
 }
 
-boolean
-util_format_is_supported(enum pipe_format format, unsigned bind)
-{
-   if (format >= PIPE_FORMAT_COUNT) {
-      return FALSE;
-   }
-
-#ifndef TEXTURE_FLOAT_ENABLED
-   if ((bind & PIPE_BIND_RENDER_TARGET) &&
-       format != PIPE_FORMAT_R9G9B9E5_FLOAT &&
-       format != PIPE_FORMAT_R11G11B10_FLOAT &&
-       util_format_is_float(format)) {
-      return FALSE;
-   }
-#else
-   (void)bind;
-#endif
-
-   return TRUE;
-}
-
-
 /**
  * Calculates the MRD for the depth format. MRD is used in depth bias
  * for UNORM and unbound depth buffers. When the depth buffer is floating
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index e497b4b..f421222 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -718,13 +718,6 @@
                           const struct util_format_description *dst_desc);
 
 /**
- * Whether the format is supported by Gallium for the given bindings.
- * This covers S3TC textures and floating-point render targets.
- */
-boolean
-util_format_is_supported(enum pipe_format format, unsigned bind);
-
-/**
  * Whether this format is a rgab8 variant.
  *
  * That is, any format that matches the
diff --git a/src/gallium/auxiliary/util/u_format_bptc.c b/src/gallium/auxiliary/util/u_format_bptc.c
new file mode 100644
index 0000000..87ec413
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_bptc.c
@@ -0,0 +1,279 @@
+/**************************************************************************
+ *
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright (c) 2008 VMware, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "u_math.h"
+#include "u_format.h"
+#include "u_format_bptc.h"
+#include "util/format_srgb.h"
+
+#define BPTC_BLOCK_DECODE
+#include "../../../mesa/main/texcompress_bptc_tmp.h"
+
+void
+util_format_bptc_rgba_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                               const uint8_t *src_row, unsigned src_stride,
+                                               unsigned width, unsigned height)
+{
+  decompress_rgba_unorm(width, height,
+                        src_row, src_stride,
+                        dst_row, dst_stride);
+}
+
+void
+util_format_bptc_rgba_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height)
+{
+   compress_rgba_unorm(width, height,
+                       src_row, src_stride,
+                       dst_row, dst_stride);
+}
+
+void
+util_format_bptc_rgba_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height)
+{
+   /* Decode into a tightly packed RGBA8 staging buffer, then read it back as float. */
+   uint8_t *temp_block = malloc(width * height * 4 * sizeof(uint8_t));
+   if (!temp_block)
+      return; /* allocation failed: skip the unpack rather than write through NULL */
+   decompress_rgba_unorm(width, height, src_row, src_stride,
+                         temp_block, width * 4 * sizeof(uint8_t));
+   util_format_read_4f(PIPE_FORMAT_R8G8B8A8_UNORM, dst_row, dst_stride,
+                       temp_block, width * 4 * sizeof(uint8_t),
+                       0, 0, width, height);
+   free(temp_block);
+}
+
+void
+util_format_bptc_rgba_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                            const float *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height)
+{
+   /* Read the float source into a tightly packed RGBA8 staging buffer, then compress it. */
+   uint8_t *temp_block = malloc(width * height * 4 * sizeof(uint8_t));
+   if (!temp_block)
+      return; /* allocation failed: skip the pack rather than write through NULL */
+   util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT,
+                        temp_block, width * 4 * sizeof(uint8_t), src_row, src_stride,
+                        0, 0, width, height);
+   compress_rgba_unorm(width, height, temp_block, width * 4 * sizeof(uint8_t),
+                       dst_row, dst_stride);
+   free(temp_block);
+}
+
+void
+util_format_bptc_rgba_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                             unsigned width, unsigned height)
+{
+   uint8_t temp_block[4];
+
+   fetch_rgba_unorm_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16,
+                               temp_block, (width % 4) + (height % 4) * 4);
+
+   util_format_read_4f(PIPE_FORMAT_R8G8B8A8_UNORM,
+                       dst, 4 * sizeof(float),
+                       temp_block, 4 * sizeof(uint8_t),
+                       0, 0, 1, 1);
+}
+
+void
+util_format_bptc_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height)
+{
+   decompress_rgba_unorm(width, height,
+                         src_row, src_stride,
+                         dst_row, dst_stride);
+}
+
+void
+util_format_bptc_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height)
+{
+   compress_rgba_unorm(width, height,
+                       src_row, src_stride,
+                       dst_row, dst_stride);
+}
+
+void
+util_format_bptc_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height)
+{
+   /* Decode into a tightly packed RGBA8 staging buffer, then read it as R8G8B8A8_SRGB. */
+   uint8_t *temp_block = malloc(width * height * 4 * sizeof(uint8_t));
+   if (!temp_block)
+      return; /* allocation failed: skip the unpack rather than write through NULL */
+   decompress_rgba_unorm(width, height, src_row, src_stride,
+                         temp_block, width * 4 * sizeof(uint8_t));
+   util_format_read_4f(PIPE_FORMAT_R8G8B8A8_SRGB, dst_row, dst_stride,
+                       temp_block, width * 4 * sizeof(uint8_t),
+                       0, 0, width, height);
+   free(temp_block);
+}
+
+void
+util_format_bptc_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height)
+{
+   compress_rgb_float(width, height,
+                      src_row, src_stride,
+                      dst_row, dst_stride,
+                      true);
+}
+
+void
+util_format_bptc_srgba_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned width, unsigned height)
+{
+   uint8_t temp_block[4];
+
+   fetch_rgba_unorm_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16,
+                               temp_block, (width % 4) + (height % 4) * 4);
+   util_format_read_4f(PIPE_FORMAT_R8G8B8A8_SRGB,
+                       dst, 4 * sizeof(float),
+                       temp_block, 4 * sizeof(uint8_t), /* temp_block holds a single RGBA8 texel */
+                       0, 0, 1, 1);
+}
+
+void
+util_format_bptc_rgb_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height)
+{
+   /* Decode into a tightly packed RGBA float staging buffer, then read it back as 8-bit. */
+   float *temp_block = malloc(width * height * 4 * sizeof(float));
+   if (!temp_block)
+      return; /* allocation failed: skip the unpack rather than write through NULL */
+   decompress_rgb_float(width, height, src_row, src_stride,
+                        temp_block, width * 4 * sizeof(float),
+                        true);
+   util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT, dst_row, dst_stride,
+                        temp_block, width * 4 * sizeof(float),
+                        0, 0, width, height);
+   free(temp_block);
+}
+
+void
+util_format_bptc_rgb_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                            const uint8_t *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height)
+{
+   compress_rgba_unorm(width, height,
+                       src_row, src_stride,
+                       dst_row, dst_stride);
+}
+
+void
+util_format_bptc_rgb_float_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height)
+{
+   decompress_rgb_float(width, height,
+                        src_row, src_stride,
+                        dst_row, dst_stride,
+                        true);
+}
+
+void
+util_format_bptc_rgb_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                           const float *src_row, unsigned src_stride,
+                                           unsigned width, unsigned height)
+{
+   compress_rgb_float(width, height,
+                      src_row, src_stride,
+                      dst_row, dst_stride,
+                      true);
+}
+
+void
+util_format_bptc_rgb_float_fetch_rgba_float(float *dst, const uint8_t *src,
+                                            unsigned width, unsigned height)
+{
+   fetch_rgb_float_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16,
+                              dst, (width % 4) + (height % 4) * 4, true);
+}
+
+void
+util_format_bptc_rgb_ufloat_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                               const uint8_t *src_row, unsigned src_stride,
+                                               unsigned width, unsigned height)
+{
+   /* Decode into a tightly packed RGBA float staging buffer, then read it back as 8-bit. */
+   float *temp_block = malloc(width * height * 4 * sizeof(float));
+   if (!temp_block)
+      return; /* allocation failed: skip the unpack rather than write through NULL */
+   decompress_rgb_float(width, height, src_row, src_stride,
+                        temp_block, width * 4 * sizeof(float),
+                        false);
+   util_format_read_4ub(PIPE_FORMAT_R32G32B32A32_FLOAT, dst_row, dst_stride,
+                        temp_block, width * 4 * sizeof(float),
+                        0, 0, width, height);
+   free(temp_block);
+}
+
+void
+util_format_bptc_rgb_ufloat_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height)
+{
+   compress_rgba_unorm(width, height,
+                       src_row, src_stride,
+                       dst_row, dst_stride);
+}
+
+void
+util_format_bptc_rgb_ufloat_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height)
+{
+   decompress_rgb_float(width, height,
+                        src_row, src_stride,
+                        dst_row, dst_stride,
+                        false);
+}
+
+void
+util_format_bptc_rgb_ufloat_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                            const float *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height)
+{
+   compress_rgb_float(width, height,
+                      src_row, src_stride,
+                      dst_row, dst_stride,
+                      false);
+}
+
+void
+util_format_bptc_rgb_ufloat_fetch_rgba_float(float *dst, const uint8_t *src,
+                                             unsigned width, unsigned height)
+{
+   fetch_rgb_float_from_block(src + ((width * sizeof(uint8_t)) * (height / 4) + (width / 4)) * 16,
+                              dst, (width % 4) + (height % 4) * 4, false);
+}
diff --git a/src/gallium/auxiliary/util/u_format_bptc.h b/src/gallium/auxiliary/util/u_format_bptc.h
new file mode 100644
index 0000000..eaf3ec3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_format_bptc.h
@@ -0,0 +1,122 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#ifndef U_FORMAT_BPTC_H_
+#define U_FORMAT_BPTC_H_
+
+
+#include "pipe/p_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+util_format_bptc_rgba_unorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                               const uint8_t *src_row, unsigned src_stride,
+                                               unsigned width, unsigned height);
+void
+util_format_bptc_rgba_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height);
+void
+util_format_bptc_rgba_unorm_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height);
+void
+util_format_bptc_rgba_unorm_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                            const float *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height);
+void
+util_format_bptc_rgba_unorm_fetch_rgba_float(float *dst, const uint8_t *src,
+                                             unsigned width, unsigned height);
+
+void
+util_format_bptc_srgba_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                          const uint8_t *src_row, unsigned src_stride,
+                                          unsigned width, unsigned height);
+void
+util_format_bptc_srgba_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                        const uint8_t *src_row, unsigned src_stride,
+                                        unsigned width, unsigned height);
+void
+util_format_bptc_srgba_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                         const uint8_t *src_row, unsigned src_stride,
+                                         unsigned width, unsigned height);
+void
+util_format_bptc_srgba_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                       const float *src_row, unsigned src_stride,
+                                       unsigned width, unsigned height);
+void
+util_format_bptc_srgba_fetch_rgba_float(float *dst, const uint8_t *src,
+                                        unsigned width, unsigned height);
+
+void
+util_format_bptc_rgb_float_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height);
+void
+util_format_bptc_rgb_float_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                            const uint8_t *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height);
+void
+util_format_bptc_rgb_float_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height);
+void
+util_format_bptc_rgb_float_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                           const float *src_row, unsigned src_stride,
+                                           unsigned width, unsigned height);
+void
+util_format_bptc_rgb_float_fetch_rgba_float(float *dst, const uint8_t *src,
+                                            unsigned width, unsigned height);
+
+void
+util_format_bptc_rgb_ufloat_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                               const uint8_t *src_row, unsigned src_stride,
+                                               unsigned width, unsigned height);
+void
+util_format_bptc_rgb_ufloat_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride,
+                                             const uint8_t *src_row, unsigned src_stride,
+                                             unsigned width, unsigned height);
+void
+util_format_bptc_rgb_ufloat_unpack_rgba_float(float *dst_row, unsigned dst_stride,
+                                              const uint8_t *src_row, unsigned src_stride,
+                                              unsigned width, unsigned height);
+void
+util_format_bptc_rgb_ufloat_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride,
+                                            const float *src_row, unsigned src_stride,
+                                            unsigned width, unsigned height);
+void
+util_format_bptc_rgb_ufloat_fetch_rgba_float(float *dst, const uint8_t *src,
+                                             unsigned width, unsigned height);
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_FORMAT_BPTC_H_ */
diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py
index c9b8cd7..7a952a4 100644
--- a/src/gallium/auxiliary/util/u_format_pack.py
+++ b/src/gallium/auxiliary/util/u_format_pack.py
@@ -36,6 +36,8 @@
 '''
 
 
+from __future__ import print_function
+
 from u_format_parse import *
 
 
@@ -53,11 +55,11 @@
     if format.nr_channels() <= 1:
         func(format.le_channels, format.le_swizzles)
     else:
-        print '#ifdef PIPE_ARCH_BIG_ENDIAN'
+        print('#ifdef PIPE_ARCH_BIG_ENDIAN')
         func(format.be_channels, format.be_swizzles)
-        print '#else'
+        print('#else')
         func(format.le_channels, format.le_swizzles)
-        print '#endif'
+        print('#endif')
 
 def generate_format_type(format):
     '''Generate a structure that describes the format.'''
@@ -68,18 +70,18 @@
         for channel in channels:
             if channel.type == VOID:
                 if channel.size:
-                    print '      unsigned %s:%u;' % (channel.name, channel.size)
+                    print('      unsigned %s:%u;' % (channel.name, channel.size))
             elif channel.type == UNSIGNED:
-                print '      unsigned %s:%u;' % (channel.name, channel.size)
+                print('      unsigned %s:%u;' % (channel.name, channel.size))
             elif channel.type in (SIGNED, FIXED):
-                print '      int %s:%u;' % (channel.name, channel.size)
+                print('      int %s:%u;' % (channel.name, channel.size))
             elif channel.type == FLOAT:
                 if channel.size == 64:
-                    print '      double %s;' % (channel.name)
+                    print('      double %s;' % (channel.name))
                 elif channel.size == 32:
-                    print '      float %s;' % (channel.name)
+                    print('      float %s;' % (channel.name))
                 else:
-                    print '      unsigned %s:%u;' % (channel.name, channel.size)
+                    print('      unsigned %s:%u;' % (channel.name, channel.size))
             else:
                 assert 0
 
@@ -88,41 +90,41 @@
             assert channel.size % 8 == 0 and is_pot(channel.size)
             if channel.type == VOID:
                 if channel.size:
-                    print '      uint%u_t %s;' % (channel.size, channel.name)
+                    print('      uint%u_t %s;' % (channel.size, channel.name))
             elif channel.type == UNSIGNED:
-                print '      uint%u_t %s;' % (channel.size, channel.name)
+                print('      uint%u_t %s;' % (channel.size, channel.name))
             elif channel.type in (SIGNED, FIXED):
-                print '      int%u_t %s;' % (channel.size, channel.name)
+                print('      int%u_t %s;' % (channel.size, channel.name))
             elif channel.type == FLOAT:
                 if channel.size == 64:
-                    print '      double %s;' % (channel.name)
+                    print('      double %s;' % (channel.name))
                 elif channel.size == 32:
-                    print '      float %s;' % (channel.name)
+                    print('      float %s;' % (channel.name))
                 elif channel.size == 16:
-                    print '      uint16_t %s;' % (channel.name)
+                    print('      uint16_t %s;' % (channel.name))
                 else:
                     assert 0
             else:
                 assert 0
 
-    print 'union util_format_%s {' % format.short_name()
+    print('union util_format_%s {' % format.short_name())
     
     if format.block_size() in (8, 16, 32, 64):
-        print '   uint%u_t value;' % (format.block_size(),)
+        print('   uint%u_t value;' % (format.block_size(),))
 
     use_bitfields = False
     for channel in format.le_channels:
         if channel.size % 8 or not is_pot(channel.size):
             use_bitfields = True
 
-    print '   struct {'
+    print('   struct {')
     if use_bitfields:
         print_channels(format, generate_bitfields)
     else:
         print_channels(format, generate_full_fields)
-    print '   } chan;'
-    print '};'
-    print
+    print('   } chan;')
+    print('};')
+    print()
 
 
 def is_format_supported(format):
@@ -444,15 +446,15 @@
 
     def unpack_from_bitmask(channels, swizzles):
         depth = format.block_size()
-        print '         uint%u_t value = *(const uint%u_t *)src;' % (depth, depth) 
+        print('         uint%u_t value = *(const uint%u_t *)src;' % (depth, depth)) 
 
         # Declare the intermediate variables
         for i in range(format.nr_channels()):
             src_channel = channels[i]
             if src_channel.type == UNSIGNED:
-                print '         uint%u_t %s;' % (depth, src_channel.name)
+                print('         uint%u_t %s;' % (depth, src_channel.name))
             elif src_channel.type == SIGNED:
-                print '         int%u_t %s;' % (depth, src_channel.name)
+                print('         int%u_t %s;' % (depth, src_channel.name))
 
         # Compute the intermediate unshifted values 
         for i in range(format.nr_channels()):
@@ -479,7 +481,7 @@
                 value = None
                 
             if value is not None:
-                print '         %s = %s;' % (src_channel.name, value)
+                print('         %s = %s;' % (src_channel.name, value))
                 
         # Convert, swizzle, and store final values
         for i in range(4):
@@ -503,11 +505,11 @@
                 value = '0'
             else:
                 assert False
-            print '         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
+            print('         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i]))
         
     def unpack_from_union(channels, swizzles):
-        print '         union util_format_%s pixel;' % format.short_name()
-        print '         memcpy(&pixel, src, sizeof pixel);'
+        print('         union util_format_%s pixel;' % format.short_name())
+        print('         memcpy(&pixel, src, sizeof pixel);')
     
         for i in range(4):
             swizzle = swizzles[i]
@@ -530,7 +532,7 @@
                 value = '0'
             else:
                 assert False
-            print '         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i])
+            print('         dst[%u] = %s; /* %s */' % (i, value, 'rgba'[i]))
     
     if format.is_bitmask():
         print_channels(format, unpack_from_bitmask)
@@ -551,7 +553,7 @@
         inv_swizzle = inv_swizzles(swizzles)
 
         depth = format.block_size()
-        print '         uint%u_t value = 0;' % depth 
+        print('         uint%u_t value = 0;' % depth) 
 
         for i in range(4):
             dst_channel = channels[i]
@@ -577,14 +579,14 @@
                 else:
                     value = None
                 if value is not None:
-                    print '         value |= %s;' % (value)
+                    print('         value |= %s;' % (value))
                 
-        print '         *(uint%u_t *)dst = value;' % depth 
+        print('         *(uint%u_t *)dst = value;' % depth) 
 
     def pack_into_union(channels, swizzles):
         inv_swizzle = inv_swizzles(swizzles)
 
-        print '         union util_format_%s pixel;' % format.short_name()
+        print('         union util_format_%s pixel;' % format.short_name())
     
         for i in range(4):
             dst_channel = channels[i]
@@ -600,9 +602,9 @@
                                     dst_channel, dst_native_type, 
                                     value, 
                                     dst_colorspace = dst_colorspace)
-            print '         pixel.chan.%s = %s;' % (dst_channel.name, value)
+            print('         pixel.chan.%s = %s;' % (dst_channel.name, value))
     
-        print '         memcpy(dst, &pixel, sizeof pixel);'
+        print('         memcpy(dst, &pixel, sizeof pixel);')
     
     if format.is_bitmask():
         print_channels(format, pack_into_bitmask)
@@ -615,28 +617,28 @@
 
     name = format.short_name()
 
-    print 'static inline void'
-    print 'util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type)
-    print '{'
+    print('static inline void')
+    print('util_format_%s_unpack_%s(%s *dst_row, unsigned dst_stride, const uint8_t *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, dst_suffix, dst_native_type))
+    print('{')
 
     if is_format_supported(format):
-        print '   unsigned x, y;'
-        print '   for(y = 0; y < height; y += %u) {' % (format.block_height,)
-        print '      %s *dst = dst_row;' % (dst_native_type)
-        print '      const uint8_t *src = src_row;'
-        print '      for(x = 0; x < width; x += %u) {' % (format.block_width,)
+        print('   unsigned x, y;')
+        print('   for(y = 0; y < height; y += %u) {' % (format.block_height,))
+        print('      %s *dst = dst_row;' % (dst_native_type))
+        print('      const uint8_t *src = src_row;')
+        print('      for(x = 0; x < width; x += %u) {' % (format.block_width,))
         
         generate_unpack_kernel(format, dst_channel, dst_native_type)
     
-        print '         src += %u;' % (format.block_size() / 8,)
-        print '         dst += 4;'
-        print '      }'
-        print '      src_row += src_stride;'
-        print '      dst_row += dst_stride/sizeof(*dst_row);'
-        print '   }'
+        print('         src += %u;' % (format.block_size() / 8,))
+        print('         dst += 4;')
+        print('      }')
+        print('      src_row += src_stride;')
+        print('      dst_row += dst_stride/sizeof(*dst_row);')
+        print('   }')
 
-    print '}'
-    print
+    print('}')
+    print()
     
 
 def generate_format_pack(format, src_channel, src_native_type, src_suffix):
@@ -644,28 +646,28 @@
 
     name = format.short_name()
 
-    print 'static inline void'
-    print 'util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type)
-    print '{'
+    print('static inline void')
+    print('util_format_%s_pack_%s(uint8_t *dst_row, unsigned dst_stride, const %s *src_row, unsigned src_stride, unsigned width, unsigned height)' % (name, src_suffix, src_native_type))
+    print('{')
     
     if is_format_supported(format):
-        print '   unsigned x, y;'
-        print '   for(y = 0; y < height; y += %u) {' % (format.block_height,)
-        print '      const %s *src = src_row;' % (src_native_type)
-        print '      uint8_t *dst = dst_row;'
-        print '      for(x = 0; x < width; x += %u) {' % (format.block_width,)
+        print('   unsigned x, y;')
+        print('   for(y = 0; y < height; y += %u) {' % (format.block_height,))
+        print('      const %s *src = src_row;' % (src_native_type))
+        print('      uint8_t *dst = dst_row;')
+        print('      for(x = 0; x < width; x += %u) {' % (format.block_width,))
     
         generate_pack_kernel(format, src_channel, src_native_type)
             
-        print '         src += 4;'
-        print '         dst += %u;' % (format.block_size() / 8,)
-        print '      }'
-        print '      dst_row += dst_stride;'
-        print '      src_row += src_stride/sizeof(*src_row);'
-        print '   }'
+        print('         src += 4;')
+        print('         dst += %u;' % (format.block_size() / 8,))
+        print('      }')
+        print('      dst_row += dst_stride;')
+        print('      src_row += src_stride/sizeof(*src_row);')
+        print('   }')
         
-    print '}'
-    print
+    print('}')
+    print()
     
 
 def generate_format_fetch(format, dst_channel, dst_native_type, dst_suffix):
@@ -673,15 +675,15 @@
 
     name = format.short_name()
 
-    print 'static inline void'
-    print 'util_format_%s_fetch_%s(%s *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)' % (name, dst_suffix, dst_native_type)
-    print '{'
+    print('static inline void')
+    print('util_format_%s_fetch_%s(%s *dst, const uint8_t *src, UNUSED unsigned i, UNUSED unsigned j)' % (name, dst_suffix, dst_native_type))
+    print('{')
 
     if is_format_supported(format):
         generate_unpack_kernel(format, dst_channel, dst_native_type)
 
-    print '}'
-    print
+    print('}')
+    print()
 
 
 def is_format_hand_written(format):
@@ -689,16 +691,16 @@
 
 
 def generate(formats):
-    print
-    print '#include "pipe/p_compiler.h"'
-    print '#include "u_math.h"'
-    print '#include "u_half.h"'
-    print '#include "u_format.h"'
-    print '#include "u_format_other.h"'
-    print '#include "util/format_srgb.h"'
-    print '#include "u_format_yuv.h"'
-    print '#include "u_format_zs.h"'
-    print
+    print()
+    print('#include "pipe/p_compiler.h"')
+    print('#include "u_math.h"')
+    print('#include "u_half.h"')
+    print('#include "u_format.h"')
+    print('#include "u_format_other.h"')
+    print('#include "util/format_srgb.h"')
+    print('#include "u_format_yuv.h"')
+    print('#include "u_format_zs.h"')
+    print()
 
     for format in formats:
         if not is_format_hand_written(format):
diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py
index a09ae53..1a966c5 100644
--- a/src/gallium/auxiliary/util/u_format_table.py
+++ b/src/gallium/auxiliary/util/u_format_table.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 CopyRight = '''
 /**************************************************************************
@@ -79,22 +80,23 @@
 
 
 def write_format_table(formats):
-    print '/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */'
-    print
+    print('/* This file is autogenerated by u_format_table.py from u_format.csv. Do not edit directly. */')
+    print()
     # This will print the copyright message on the top of this file
-    print CopyRight.strip()
-    print
-    print '#include "u_format.h"'
-    print '#include "u_format_s3tc.h"'
-    print '#include "u_format_rgtc.h"'
-    print '#include "u_format_latc.h"'
-    print '#include "u_format_etc.h"'
-    print
+    print(CopyRight.strip())
+    print()
+    print('#include "u_format.h"')
+    print('#include "u_format_bptc.h"')
+    print('#include "u_format_s3tc.h"')
+    print('#include "u_format_rgtc.h"')
+    print('#include "u_format_latc.h"')
+    print('#include "u_format_etc.h"')
+    print()
     
     u_format_pack.generate(formats)
     
     def do_channel_array(channels, swizzles):
-        print "   {"
+        print("   {")
         for i in range(4):
             channel = channels[i]
             if i < 3:
@@ -102,13 +104,13 @@
             else:
                 sep = ""
             if channel.size:
-                print "      {%s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), channel.size, channel.shift, sep, "xyzw"[i], channel.name)
+                print("      {%s, %s, %s, %u, %u}%s\t/* %s = %s */" % (type_map[channel.type], bool_map(channel.norm), bool_map(channel.pure), channel.size, channel.shift, sep, "xyzw"[i], channel.name))
             else:
-                print "      {0, 0, 0, 0, 0}%s" % (sep,)
-        print "   },"
+                print("      {0, 0, 0, 0, 0}%s" % (sep,))
+        print("   },")
 
     def do_swizzle_array(channels, swizzles):
-        print "   {"
+        print("   {")
         for i in range(4):
             swizzle = swizzles[i]
             if i < 3:
@@ -119,102 +121,102 @@
                 comment = colorspace_channels_map[format.colorspace][i]
             except (KeyError, IndexError):
                 comment = 'ignored'
-            print "      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment)
-        print "   },"
+            print("      %s%s\t/* %s */" % (swizzle_map[swizzle], sep, comment))
+        print("   },")
 
     for format in formats:
-        print 'const struct util_format_description'
-        print 'util_format_%s_description = {' % (format.short_name(),)
-        print "   %s," % (format.name,)
-        print "   \"%s\"," % (format.name,)
-        print "   \"%s\"," % (format.short_name(),)
-        print "   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size())
-        print "   %s," % (layout_map(format.layout),)
-        print "   %u,\t/* nr_channels */" % (format.nr_channels(),)
-        print "   %s,\t/* is_array */" % (bool_map(format.is_array()),)
-        print "   %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),)
-        print "   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),)
+        print('const struct util_format_description')
+        print('util_format_%s_description = {' % (format.short_name(),))
+        print("   %s," % (format.name,))
+        print("   \"%s\"," % (format.name,))
+        print("   \"%s\"," % (format.short_name(),))
+        print("   {%u, %u, %u},\t/* block */" % (format.block_width, format.block_height, format.block_size()))
+        print("   %s," % (layout_map(format.layout),))
+        print("   %u,\t/* nr_channels */" % (format.nr_channels(),))
+        print("   %s,\t/* is_array */" % (bool_map(format.is_array()),))
+        print("   %s,\t/* is_bitmask */" % (bool_map(format.is_bitmask()),))
+        print("   %s,\t/* is_mixed */" % (bool_map(format.is_mixed()),))
         u_format_pack.print_channels(format, do_channel_array)
         u_format_pack.print_channels(format, do_swizzle_array)
-        print "   %s," % (colorspace_map(format.colorspace),)
+        print("   %s," % (colorspace_map(format.colorspace),))
         access = True
-        if format.layout in ('bptc', 'astc'):
+        if format.layout == 'astc':
             access = False
         if format.layout == 'etc' and format.short_name() != 'etc1_rgb8':
             access = False
         if format.colorspace != ZS and not format.is_pure_color() and access:
-            print "   &util_format_%s_unpack_rgba_8unorm," % format.short_name() 
-            print "   &util_format_%s_pack_rgba_8unorm," % format.short_name() 
+            print("   &util_format_%s_unpack_rgba_8unorm," % format.short_name())
+            print("   &util_format_%s_pack_rgba_8unorm," % format.short_name())
             if format.layout == 's3tc' or format.layout == 'rgtc':
-                print "   &util_format_%s_fetch_rgba_8unorm," % format.short_name()
+                print("   &util_format_%s_fetch_rgba_8unorm," % format.short_name())
             else:
-                print "   NULL, /* fetch_rgba_8unorm */" 
-            print "   &util_format_%s_unpack_rgba_float," % format.short_name() 
-            print "   &util_format_%s_pack_rgba_float," % format.short_name() 
-            print "   &util_format_%s_fetch_rgba_float," % format.short_name()
+                print("   NULL, /* fetch_rgba_8unorm */")
+            print("   &util_format_%s_unpack_rgba_float," % format.short_name())
+            print("   &util_format_%s_pack_rgba_float," % format.short_name())
+            print("   &util_format_%s_fetch_rgba_float," % format.short_name())
         else:
-            print "   NULL, /* unpack_rgba_8unorm */" 
-            print "   NULL, /* pack_rgba_8unorm */" 
-            print "   NULL, /* fetch_rgba_8unorm */" 
-            print "   NULL, /* unpack_rgba_float */" 
-            print "   NULL, /* pack_rgba_float */" 
-            print "   NULL, /* fetch_rgba_float */" 
+            print("   NULL, /* unpack_rgba_8unorm */")
+            print("   NULL, /* pack_rgba_8unorm */")
+            print("   NULL, /* fetch_rgba_8unorm */")
+            print("   NULL, /* unpack_rgba_float */")
+            print("   NULL, /* pack_rgba_float */")
+            print("   NULL, /* fetch_rgba_float */")
         if format.has_depth():
-            print "   &util_format_%s_unpack_z_32unorm," % format.short_name() 
-            print "   &util_format_%s_pack_z_32unorm," % format.short_name() 
-            print "   &util_format_%s_unpack_z_float," % format.short_name() 
-            print "   &util_format_%s_pack_z_float," % format.short_name() 
+            print("   &util_format_%s_unpack_z_32unorm," % format.short_name())
+            print("   &util_format_%s_pack_z_32unorm," % format.short_name())
+            print("   &util_format_%s_unpack_z_float," % format.short_name())
+            print("   &util_format_%s_pack_z_float," % format.short_name())
         else:
-            print "   NULL, /* unpack_z_32unorm */" 
-            print "   NULL, /* pack_z_32unorm */" 
-            print "   NULL, /* unpack_z_float */" 
-            print "   NULL, /* pack_z_float */" 
+            print("   NULL, /* unpack_z_32unorm */")
+            print("   NULL, /* pack_z_32unorm */")
+            print("   NULL, /* unpack_z_float */")
+            print("   NULL, /* pack_z_float */")
         if format.has_stencil():
-            print "   &util_format_%s_unpack_s_8uint," % format.short_name() 
-            print "   &util_format_%s_pack_s_8uint," % format.short_name() 
+            print("   &util_format_%s_unpack_s_8uint," % format.short_name())
+            print("   &util_format_%s_pack_s_8uint," % format.short_name())
         else:
-            print "   NULL, /* unpack_s_8uint */" 
-            print "   NULL, /* pack_s_8uint */"
+            print("   NULL, /* unpack_s_8uint */")
+            print("   NULL, /* pack_s_8uint */")
         if format.is_pure_unsigned():
-            print "   &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name() 
-            print "   &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name()
-            print "   &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name()
-            print "   &util_format_%s_pack_signed,  /* pack_rgba_sint */" % format.short_name()
-            print "   &util_format_%s_fetch_unsigned,  /* fetch_rgba_uint */" % format.short_name()
-            print "   NULL  /* fetch_rgba_sint */"
+            print("   &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name())
+            print("   &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name())
+            print("   &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name())
+            print("   &util_format_%s_pack_signed,  /* pack_rgba_sint */" % format.short_name())
+            print("   &util_format_%s_fetch_unsigned,  /* fetch_rgba_uint */" % format.short_name())
+            print("   NULL  /* fetch_rgba_sint */")
         elif format.is_pure_signed():
-            print "   &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name()
-            print "   &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name()
-            print "   &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name()
-            print "   &util_format_%s_pack_signed,  /* pack_rgba_sint */" % format.short_name()
-            print "   NULL,  /* fetch_rgba_uint */"
-            print "   &util_format_%s_fetch_signed  /* fetch_rgba_sint */" % format.short_name()
+            print("   &util_format_%s_unpack_unsigned, /* unpack_rgba_uint */" % format.short_name())
+            print("   &util_format_%s_pack_unsigned, /* pack_rgba_uint */" % format.short_name())
+            print("   &util_format_%s_unpack_signed, /* unpack_rgba_sint */" % format.short_name())
+            print("   &util_format_%s_pack_signed,  /* pack_rgba_sint */" % format.short_name())
+            print("   NULL,  /* fetch_rgba_uint */")
+            print("   &util_format_%s_fetch_signed  /* fetch_rgba_sint */" % format.short_name())
         else:
-            print "   NULL, /* unpack_rgba_uint */" 
-            print "   NULL, /* pack_rgba_uint */" 
-            print "   NULL, /* unpack_rgba_sint */" 
-            print "   NULL, /* pack_rgba_sint */"
-            print "   NULL, /* fetch_rgba_uint */"
-            print "   NULL  /* fetch_rgba_sint */"
-        print "};"
-        print
+            print("   NULL, /* unpack_rgba_uint */")
+            print("   NULL, /* pack_rgba_uint */")
+            print("   NULL, /* unpack_rgba_sint */")
+            print("   NULL, /* pack_rgba_sint */")
+            print("   NULL, /* fetch_rgba_uint */")
+            print("   NULL  /* fetch_rgba_sint */")
+        print("};")
+        print()
         
-    print "const struct util_format_description *"
-    print "util_format_description(enum pipe_format format)"
-    print "{"
-    print "   if (format >= PIPE_FORMAT_COUNT) {"
-    print "      return NULL;"
-    print "   }"
-    print
-    print "   switch (format) {"
+    print("const struct util_format_description *")
+    print("util_format_description(enum pipe_format format)")
+    print("{")
+    print("   if (format >= PIPE_FORMAT_COUNT) {")
+    print("      return NULL;")
+    print("   }")
+    print()
+    print("   switch (format) {")
     for format in formats:
-        print "   case %s:" % format.name
-        print "      return &util_format_%s_description;" % (format.short_name(),)
-    print "   default:"
-    print "      return NULL;"
-    print "   }"
-    print "}"
-    print
+        print("   case %s:" % format.name)
+        print("      return &util_format_%s_description;" % (format.short_name(),))
+    print("   default:")
+    print("      return NULL;")
+    print("   }")
+    print("}")
+    print()
 
 
 def main():
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index c2948a5..5bafddc 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -240,3 +240,33 @@
 
    return 1;
 }
+
+
+/**
+ * Flip the sample location state along the Y axis, in place.  \p locations
+ * is read as grid_height rows of grid_width * samples bytes (grid size from
+ * \p screen); rows are mirrored, then rotated by fb_height % grid_height.
+ */
+void
+util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height,
+                             unsigned samples, uint8_t *locations)
+{
+   unsigned row, shift, row_size, grid_width = 0, grid_height = 0;
+   uint8_t new_locations[PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE *
+                         PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32];
+
+   screen->get_sample_pixel_grid(screen, samples, &grid_width, &grid_height);
+   if (grid_height == 0)
+      return; /* no rows to flip; also avoids the modulo by zero below */
+
+   shift = fb_height % grid_height;
+   row_size = grid_width * samples;
+   for (row = 0; row < grid_height; row++) {
+      /* bias by grid_height (> shift) so the subtraction cannot wrap */
+      unsigned dest_row = (2 * grid_height - row - 1 - shift) % grid_height;
+      memcpy(&new_locations[dest_row * row_size],
+             &locations[row * row_size], row_size);
+   }
+
+   memcpy(locations, new_locations, grid_width * grid_height * samples);
+}
diff --git a/src/gallium/auxiliary/util/u_framebuffer.h b/src/gallium/auxiliary/util/u_framebuffer.h
index c73942c..877e6e3 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.h
+++ b/src/gallium/auxiliary/util/u_framebuffer.h
@@ -64,6 +64,11 @@
 util_framebuffer_get_num_samples(const struct pipe_framebuffer_state *fb);
 
 
+extern void
+util_sample_locations_flip_y(struct pipe_screen *screen, unsigned fb_height,
+                             unsigned samples, uint8_t *locations);
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 3c55d9f..06737c5 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -76,7 +76,7 @@
       return TRUE;
 
    if (!screen->is_format_supported(screen, format, pt->target,
-                                    pt->nr_samples,
+                                    pt->nr_samples, pt->nr_storage_samples,
                                     PIPE_BIND_SAMPLER_VIEW |
                                     (is_zs ? PIPE_BIND_DEPTH_STENCIL :
                                      PIPE_BIND_RENDER_TARGET))) {
diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c
index 77fa477..201b9a2 100644
--- a/src/gallium/auxiliary/util/u_hash_table.c
+++ b/src/gallium/auxiliary/util/u_hash_table.c
@@ -271,7 +271,7 @@
 
 
 static enum pipe_error
-util_hash_inc(void *k, void *v, void *d)
+util_hash_inc(UNUSED void *k, UNUSED void *v, void *d)
 {
    ++*(size_t *)d;
    return PIPE_OK;
diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h
index 4bd9b7e..dee6f8f 100644
--- a/src/gallium/auxiliary/util/u_inlines.h
+++ b/src/gallium/auxiliary/util/u_inlines.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,7 +22,7 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 #ifndef U_INLINES_H
@@ -70,8 +70,8 @@
  * \return TRUE if the object's refcount hits zero and should be destroyed.
  */
 static inline boolean
-pipe_reference_described(struct pipe_reference *ptr, 
-                         struct pipe_reference *reference, 
+pipe_reference_described(struct pipe_reference *ptr,
+                         struct pipe_reference *reference,
                          debug_reference_descriptor get_desc)
 {
    boolean destroy = FALSE;
@@ -99,8 +99,9 @@
 static inline boolean
 pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference)
 {
-   return pipe_reference_described(ptr, reference, 
-                                   (debug_reference_descriptor)debug_describe_reference);
+   return pipe_reference_described(ptr, reference,
+                                   (debug_reference_descriptor)
+                                   debug_describe_reference);
 }
 
 static inline void
@@ -108,8 +109,9 @@
 {
    struct pipe_surface *old_surf = *ptr;
 
-   if (pipe_reference_described(&(*ptr)->reference, &surf->reference, 
-                                (debug_reference_descriptor)debug_describe_surface))
+   if (pipe_reference_described(&(*ptr)->reference, &surf->reference,
+                                (debug_reference_descriptor)
+                                debug_describe_surface))
       old_surf->context->surface_destroy(old_surf->context, old_surf);
    *ptr = surf;
 }
@@ -124,7 +126,8 @@
 pipe_surface_release(struct pipe_context *pipe, struct pipe_surface **ptr)
 {
    if (pipe_reference_described(&(*ptr)->reference, NULL,
-                                (debug_reference_descriptor)debug_describe_surface))
+                                (debug_reference_descriptor)
+                                debug_describe_surface))
       pipe->surface_destroy(pipe, *ptr);
    *ptr = NULL;
 }
@@ -135,8 +138,9 @@
 {
    struct pipe_resource *old_tex = *ptr;
 
-   if (pipe_reference_described(&(*ptr)->reference, &tex->reference, 
-                                (debug_reference_descriptor)debug_describe_resource)) {
+   if (pipe_reference_described(&(*ptr)->reference, &tex->reference,
+                                (debug_reference_descriptor)
+                                debug_describe_resource)) {
       /* Avoid recursion, which would prevent inlining this function */
       do {
          struct pipe_resource *next = old_tex->next;
@@ -144,7 +148,8 @@
          old_tex->screen->resource_destroy(old_tex->screen, old_tex);
          old_tex = next;
       } while (pipe_reference_described(&old_tex->reference, NULL,
-                                        (debug_reference_descriptor)debug_describe_resource));
+                                        (debug_reference_descriptor)
+                                        debug_describe_resource));
    }
    *ptr = tex;
 }
@@ -156,12 +161,14 @@
  * the same context (if they exist), and that this must be the current context.
  */
 static inline void
-pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view)
+pipe_sampler_view_reference(struct pipe_sampler_view **ptr,
+                            struct pipe_sampler_view *view)
 {
    struct pipe_sampler_view *old_view = *ptr;
 
    if (pipe_reference_described(&(*ptr)->reference, &view->reference,
-                                (debug_reference_descriptor)debug_describe_sampler_view))
+                                (debug_reference_descriptor)
+                                debug_describe_sampler_view))
       old_view->context->sampler_view_destroy(old_view->context, old_view);
    *ptr = view;
 }
@@ -264,10 +271,10 @@
  * \param usage  a PIPE_USAGE_x value
  */
 static inline struct pipe_resource *
-pipe_buffer_create( struct pipe_screen *screen,
-		    unsigned bind,
-		    enum pipe_resource_usage usage,
-		    unsigned size )
+pipe_buffer_create(struct pipe_screen *screen,
+                   unsigned bind,
+                   enum pipe_resource_usage usage,
+                   unsigned size)
 {
    struct pipe_resource buffer;
    memset(&buffer, 0, sizeof buffer);
@@ -307,18 +314,18 @@
 
 /**
  * Map a range of a resource.
- * \param offset  start of region, in bytes 
- * \param length  size of region, in bytes 
+ * \param offset  start of region, in bytes
+ * \param length  size of region, in bytes
  * \param access  bitmask of PIPE_TRANSFER_x flags
  * \param transfer  returns a transfer object
  */
 static inline void *
 pipe_buffer_map_range(struct pipe_context *pipe,
-		      struct pipe_resource *buffer,
-		      unsigned offset,
-		      unsigned length,
-		      unsigned access,
-		      struct pipe_transfer **transfer)
+                      struct pipe_resource *buffer,
+                      unsigned offset,
+                      unsigned length,
+                      unsigned access,
+                      struct pipe_transfer **transfer)
 {
    struct pipe_box box;
    void *map;
@@ -349,7 +356,8 @@
                 unsigned access,
                 struct pipe_transfer **transfer)
 {
-   return pipe_buffer_map_range(pipe, buffer, 0, buffer->width0, access, transfer);
+   return pipe_buffer_map_range(pipe, buffer, 0, buffer->width0,
+                                access, transfer);
 }
 
 
@@ -443,10 +451,10 @@
    ubyte *map;
 
    map = (ubyte *) pipe_buffer_map_range(pipe,
-					 buf,
-					 offset, size,
-					 PIPE_TRANSFER_READ,
-					 &src_transfer);
+                                         buf,
+                                         offset, size,
+                                         PIPE_TRANSFER_READ,
+                                         &src_transfer);
    if (!map)
       return;
 
@@ -501,10 +509,10 @@
 }
 
 static inline void
-pipe_transfer_unmap( struct pipe_context *context,
-                     struct pipe_transfer *transfer )
+pipe_transfer_unmap(struct pipe_context *context,
+                    struct pipe_transfer *transfer)
 {
-   context->transfer_unmap( context, transfer );
+   context->transfer_unmap(context, transfer);
 }
 
 static inline void
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 46d0297..79869a1 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -421,6 +421,23 @@
 #endif
 }
 
+static inline uint64_t
+util_logbase2_64(uint64_t n)
+{
+#if defined(HAVE___BUILTIN_CLZLL)
+   return ((sizeof(uint64_t) * 8 - 1) - __builtin_clzll(n | 1));
+#else
+   uint64_t pos = 0ull;
+   if (n >= 1ull<<32) { n >>= 32; pos += 32; }
+   if (n >= 1ull<<16) { n >>= 16; pos += 16; }
+   if (n >= 1ull<< 8) { n >>=  8; pos +=  8; }
+   if (n >= 1ull<< 4) { n >>=  4; pos +=  4; }
+   if (n >= 1ull<< 2) { n >>=  2; pos +=  2; }
+   if (n >= 1ull<< 1) {           pos +=  1; }
+   return pos;
+#endif
+}
+
 /**
  * Returns the ceiling of log n base 2, and 0 when n == 0. Equivalently,
  * returns the smallest x such that n <= 2**x.
@@ -434,6 +451,15 @@
    return 1 + util_logbase2(n - 1);
 }
 
+static inline uint64_t
+util_logbase2_ceil64(uint64_t n)
+{
+   if (n <= 1)
+      return 0;
+
+   return 1ull + util_logbase2_64(n - 1);
+}
+
 /**
  * Returns the smallest power of two >= x
  */
@@ -465,6 +491,35 @@
 #endif
 }
 
+static inline uint64_t
+util_next_power_of_two64(uint64_t x)
+{
+#if defined(HAVE___BUILTIN_CLZLL)
+   if (x <= 1)
+       return 1;
+
+   return (1ull << ((sizeof(uint64_t) * 8) - __builtin_clzll(x - 1)));
+#else
+   uint64_t val = x;
+
+   if (x <= 1)
+      return 1;
+
+   if (util_is_power_of_two_or_zero64(x))
+      return x;
+
+   val--;
+   val = (val >> 1)  | val;
+   val = (val >> 2)  | val;
+   val = (val >> 4)  | val;
+   val = (val >> 8)  | val;
+   val = (val >> 16) | val;
+   val = (val >> 32) | val;
+   val++;
+   return val;
+#endif
+}
+
 
 /**
  * Return number of bits set in n.
diff --git a/src/gallium/auxiliary/util/u_tests.c b/src/gallium/auxiliary/util/u_tests.c
index f251434..7360eb9 100644
--- a/src/gallium/auxiliary/util/u_tests.c
+++ b/src/gallium/auxiliary/util/u_tests.c
@@ -55,6 +55,7 @@
    templ.depth0 = 1;
    templ.array_size = 1;
    templ.nr_samples = num_samples;
+   templ.nr_storage_samples = num_samples;
    templ.format = format;
    templ.usage = PIPE_USAGE_DEFAULT;
    templ.bind = PIPE_BIND_SAMPLER_VIEW |
@@ -649,7 +650,7 @@
       /* Vertex shader. */
       void *vs = util_set_passthrough_vertex_shader(cso, ctx, false);
 
-      for (int i = 0; i < num_samples / 2; i++) {
+      for (unsigned i = 0; i < num_samples / 2; i++) {
          float value;
 
          /* 2 consecutive samples should have the same color to test MSAA
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 1c647a3..fc7eb13 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -2205,7 +2205,8 @@
       bind = PIPE_BIND_RENDER_TARGET;
 
    if (!screen->is_format_supported(screen, format, res->target,
-                                    res->nr_samples, bind))
+                                    res->nr_samples, res->nr_storage_samples,
+                                    bind))
       return false;
 
    struct tc_generate_mipmap *p =
@@ -2566,7 +2567,7 @@
     * from the queue before being executed, so keep one tc_batch slot for that
     * execution. Also, keep one unused slot for an unflushed batch.
     */
-   if (!util_queue_init(&tc->queue, "gallium_drv", TC_MAX_BATCHES - 2, 1, 0))
+   if (!util_queue_init(&tc->queue, "gdrv", TC_MAX_BATCHES - 2, 1, 0))
       goto fail;
 
    for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 53c5a7e..245a8af 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -408,14 +408,6 @@
    return (struct threaded_transfer*)transfer;
 }
 
-static inline struct pipe_context *
-threaded_context_unwrap_unsync(struct pipe_context *pipe)
-{
-   if (!pipe || !pipe->priv)
-      return pipe;
-   return (struct pipe_context*)pipe->priv;
-}
-
 static inline void
 tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
                                    struct tc_unflushed_batch_token *src)
diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c
index b91bb41..0239b87 100644
--- a/src/gallium/auxiliary/util/u_tile.c
+++ b/src/gallium/auxiliary/util/u_tile.c
@@ -35,6 +35,7 @@
 #include "util/u_inlines.h"
 
 #include "util/u_format.h"
+#include "util/u_format_bptc.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_surface.h"
diff --git a/src/gallium/auxiliary/util/u_transfer_helper.c b/src/gallium/auxiliary/util/u_transfer_helper.c
index 3b085fd..df67f82 100644
--- a/src/gallium/auxiliary/util/u_transfer_helper.c
+++ b/src/gallium/auxiliary/util/u_transfer_helper.c
@@ -207,13 +207,18 @@
       pctx->blit(pctx, &blit);
    }
 
-   void *ss_map = pctx->transfer_map(pctx, trans->ss, 0, usage, box,
+   struct pipe_box map_box = *box;
+   map_box.x = 0;
+   map_box.y = 0;
+
+   void *ss_map = pctx->transfer_map(pctx, trans->ss, 0, usage, &map_box,
          &trans->trans);
    if (!ss_map) {
       free(trans);
       return NULL;
    }
 
+   ptrans->stride = trans->trans->stride;
    *pptrans = ptrans;
    return ss_map;
 }
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index 8a680d6..f721613 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -154,8 +154,7 @@
    uint32_t enabled_vb_mask;
 
    /* Saved vertex buffer. */
-   unsigned aux_vertex_buffer_slot;
-   struct pipe_vertex_buffer aux_vertex_buffer_saved;
+   struct pipe_vertex_buffer vertex_buffer0_saved;
 
    /* Vertex buffers for the driver.
     * There are usually no user buffers. */
@@ -270,7 +269,7 @@
    for (i = 0; i < ARRAY_SIZE(vbuf_format_fallbacks); i++) {
       enum pipe_format format = vbuf_format_fallbacks[i].from;
 
-      if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0,
+      if (!screen->is_format_supported(screen, format, PIPE_BUFFER, 0, 0,
                                        PIPE_BIND_VERTEX_BUFFER)) {
          caps->format_translation[format] = vbuf_format_fallbacks[i].to;
          fallback = TRUE;
@@ -300,13 +299,11 @@
 }
 
 struct u_vbuf *
-u_vbuf_create(struct pipe_context *pipe,
-              struct u_vbuf_caps *caps, unsigned aux_vertex_buffer_index)
+u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps)
 {
    struct u_vbuf *mgr = CALLOC_STRUCT(u_vbuf);
 
    mgr->caps = *caps;
-   mgr->aux_vertex_buffer_slot = aux_vertex_buffer_index;
    mgr->pipe = pipe;
    mgr->cso_cache = cso_cache_create();
    mgr->translate_cache = translate_cache_create();
@@ -381,7 +378,7 @@
    for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
       pipe_vertex_buffer_unreference(&mgr->real_vertex_buffer[i]);
 
-   pipe_vertex_buffer_unreference(&mgr->aux_vertex_buffer_saved);
+   pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
 
    translate_cache_destroy(mgr->translate_cache);
    cso_cache_delete(mgr->cso_cache);
@@ -1025,13 +1022,84 @@
             mgr->nonzero_stride_vb_mask)) != 0;
 }
 
-static void u_vbuf_get_minmax_index(struct pipe_context *pipe,
-                                    const struct pipe_draw_info *info,
-                                    int *out_min_index, int *out_max_index)
+static void
+u_vbuf_get_minmax_index_mapped(const struct pipe_draw_info *info,
+                               const void *indices, unsigned *out_min_index,
+                               unsigned *out_max_index)
+{
+   unsigned max = 0;
+   unsigned min = ~0u;
+
+   switch (info->index_size) {
+   case 4: {
+      const unsigned *ui_indices = (const unsigned*)indices;
+      if (info->primitive_restart) {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (ui_indices[i] != info->restart_index) {
+               if (ui_indices[i] > max) max = ui_indices[i];
+               if (ui_indices[i] < min) min = ui_indices[i];
+            }
+         }
+      }
+      else {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (ui_indices[i] > max) max = ui_indices[i];
+            if (ui_indices[i] < min) min = ui_indices[i];
+         }
+      }
+      break;
+   }
+   case 2: {
+      const unsigned short *us_indices = (const unsigned short*)indices;
+      if (info->primitive_restart) {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (us_indices[i] != info->restart_index) {
+               if (us_indices[i] > max) max = us_indices[i];
+               if (us_indices[i] < min) min = us_indices[i];
+            }
+         }
+      }
+      else {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (us_indices[i] > max) max = us_indices[i];
+            if (us_indices[i] < min) min = us_indices[i];
+         }
+      }
+      break;
+   }
+   case 1: {
+      const unsigned char *ub_indices = (const unsigned char*)indices;
+      if (info->primitive_restart) {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (ub_indices[i] != info->restart_index) {
+               if (ub_indices[i] > max) max = ub_indices[i];
+               if (ub_indices[i] < min) min = ub_indices[i];
+            }
+         }
+      }
+      else {
+         for (unsigned i = 0; i < info->count; i++) {
+            if (ub_indices[i] > max) max = ub_indices[i];
+            if (ub_indices[i] < min) min = ub_indices[i];
+         }
+      }
+      break;
+   }
+   default:
+      assert(0);
+   }
+
+   *out_min_index = min;
+   *out_max_index = max;
+}
+
+static void
+u_vbuf_get_minmax_index(struct pipe_context *pipe,
+                        const struct pipe_draw_info *info,
+                        unsigned *out_min_index, unsigned *out_max_index)
 {
    struct pipe_transfer *transfer = NULL;
    const void *indices;
-   unsigned i;
 
    if (info->has_user_indices) {
       indices = (uint8_t*)info->index.user +
@@ -1043,78 +1111,7 @@
                                       PIPE_TRANSFER_READ, &transfer);
    }
 
-   switch (info->index_size) {
-   case 4: {
-      const unsigned *ui_indices = (const unsigned*)indices;
-      unsigned max_ui = 0;
-      unsigned min_ui = ~0U;
-      if (info->primitive_restart) {
-         for (i = 0; i < info->count; i++) {
-            if (ui_indices[i] != info->restart_index) {
-               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
-               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
-            }
-         }
-      }
-      else {
-         for (i = 0; i < info->count; i++) {
-            if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
-            if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
-         }
-      }
-      *out_min_index = min_ui;
-      *out_max_index = max_ui;
-      break;
-   }
-   case 2: {
-      const unsigned short *us_indices = (const unsigned short*)indices;
-      unsigned max_us = 0;
-      unsigned min_us = ~0U;
-      if (info->primitive_restart) {
-         for (i = 0; i < info->count; i++) {
-            if (us_indices[i] != info->restart_index) {
-               if (us_indices[i] > max_us) max_us = us_indices[i];
-               if (us_indices[i] < min_us) min_us = us_indices[i];
-            }
-         }
-      }
-      else {
-         for (i = 0; i < info->count; i++) {
-            if (us_indices[i] > max_us) max_us = us_indices[i];
-            if (us_indices[i] < min_us) min_us = us_indices[i];
-         }
-      }
-      *out_min_index = min_us;
-      *out_max_index = max_us;
-      break;
-   }
-   case 1: {
-      const unsigned char *ub_indices = (const unsigned char*)indices;
-      unsigned max_ub = 0;
-      unsigned min_ub = ~0U;
-      if (info->primitive_restart) {
-         for (i = 0; i < info->count; i++) {
-            if (ub_indices[i] != info->restart_index) {
-               if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
-               if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
-            }
-         }
-      }
-      else {
-         for (i = 0; i < info->count; i++) {
-            if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
-            if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
-         }
-      }
-      *out_min_index = min_ub;
-      *out_max_index = max_ub;
-      break;
-   }
-   default:
-      assert(0);
-      *out_min_index = 0;
-      *out_max_index = 0;
-   }
+   u_vbuf_get_minmax_index_mapped(info, indices, out_min_index, out_max_index);
 
    if (transfer) {
       pipe_buffer_unmap(pipe, transfer);
@@ -1134,10 +1131,36 @@
    mgr->dirty_real_vb_mask = 0;
 }
 
+static void
+u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info,
+                               unsigned *indirect_data, unsigned stride,
+                               unsigned draw_count)
+{
+   assert(info->index_size);
+   info->indirect = NULL;
+
+   for (unsigned i = 0; i < draw_count; i++) {
+      unsigned offset = i * stride / 4;
+
+      info->count = indirect_data[offset + 0];
+      info->instance_count = indirect_data[offset + 1];
+
+      if (!info->count || !info->instance_count)
+         continue;
+
+      info->start = indirect_data[offset + 2];
+      info->index_bias = indirect_data[offset + 3];
+      info->start_instance = indirect_data[offset + 4];
+
+      u_vbuf_draw_vbo(mgr, info);
+   }
+}
+
 void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
 {
    struct pipe_context *pipe = mgr->pipe;
-   int start_vertex, min_index;
+   int start_vertex;
+   unsigned min_index;
    unsigned num_vertices;
    boolean unroll_indices = FALSE;
    const uint32_t used_vb_mask = mgr->ve->used_vb_mask;
@@ -1162,36 +1185,170 @@
 
    new_info = *info;
 
-   /* Fallback. We need to know all the parameters. */
+   /* Handle indirect (multi)draws. */
    if (new_info.indirect) {
-      struct pipe_transfer *transfer = NULL;
-      int *data;
+      const struct pipe_draw_indirect_info *indirect = new_info.indirect;
+      unsigned draw_count = 0;
 
-      if (new_info.index_size) {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 20,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.index_bias = data[3];
-         new_info.start_instance = data[4];
-      }
-      else {
-         data = pipe_buffer_map_range(pipe, new_info.indirect->buffer,
-                                      new_info.indirect->offset, 16,
-                                      PIPE_TRANSFER_READ, &transfer);
-         new_info.start_instance = data[3];
+      /* Get the number of draws. */
+      if (indirect->indirect_draw_count) {
+         pipe_buffer_read(pipe, indirect->indirect_draw_count,
+                          indirect->indirect_draw_count_offset,
+                          4, &draw_count);
+      } else {
+         draw_count = indirect->draw_count;
       }
 
-      new_info.count = data[0];
-      new_info.instance_count = data[1];
-      new_info.start = data[2];
-      pipe_buffer_unmap(pipe, transfer);
-      new_info.indirect = NULL;
+      if (!draw_count)
+         return;
+
+      unsigned data_size = (draw_count - 1) * indirect->stride +
+                           (new_info.index_size ? 20 : 16);
+      unsigned *data = malloc(data_size);
+      if (!data)
+         return; /* report an error? */
+
+      /* Read the used buffer range only once, because the read can be
+       * uncached.
+       */
+      pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size,
+                       data);
+
+      if (info->index_size) {
+         /* Indexed multidraw. */
+         unsigned index_bias0 = data[3];
+         bool index_bias_same = true;
+
+         /* If we invoke the translate path, we have to split the multidraw. */
+         if (incompatible_vb_mask ||
+             mgr->ve->incompatible_elem_mask) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* See if index_bias is the same for all draws. */
+         for (unsigned i = 1; i < draw_count; i++) {
+            if (data[i * indirect->stride / 4 + 3] != index_bias0) {
+               index_bias_same = false;
+               break;
+            }
+         }
+
+         /* Split the multidraw if index_bias is different. */
+         if (!index_bias_same) {
+            u_vbuf_split_indexed_multidraw(mgr, &new_info, data,
+                                           indirect->stride, draw_count);
+            free(data);
+            return;
+         }
+
+         /* If we don't need to use the translate path and index_bias is
+          * the same, we can process the multidraw with the time complexity
+          * equal to 1 draw call (except for the index range computation).
+          * We only need to compute the index range covering all draw calls
+          * of the multidraw.
+          *
+          * The driver will not look at these values because indirect != NULL.
+          * These values determine the user buffer bounds to upload.
+          */
+         new_info.index_bias = index_bias0;
+         new_info.min_index = ~0u;
+         new_info.max_index = 0;
+         new_info.start_instance = ~0u;
+         unsigned end_instance = 0;
+
+         struct pipe_transfer *transfer = NULL;
+         const uint8_t *indices;
+
+         if (info->has_user_indices) {
+            indices = (uint8_t*)info->index.user;
+         } else {
+            indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource,
+                                                PIPE_TRANSFER_READ, &transfer);
+         }
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 4];
+            unsigned instance_count = data[offset + 1];
+
+            if (!count || !instance_count)
+               continue;
+
+            /* Update the ranges of instances. */
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+
+            /* Update the index range. */
+            unsigned min, max;
+            new_info.count = count; /* only used by get_minmax_index */
+            u_vbuf_get_minmax_index_mapped(&new_info,
+                                           indices +
+                                           new_info.index_size * start,
+                                           &min, &max);
+
+            new_info.min_index = MIN2(new_info.min_index, min);
+            new_info.max_index = MAX2(new_info.max_index, max);
+         }
+         free(data);
+
+         if (transfer)
+            pipe_buffer_unmap(pipe, transfer);
+
+         /* Set the final instance count. */
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start_instance == ~0u || !new_info.instance_count)
+            return;
+      } else {
+         /* Non-indexed multidraw.
+          *
+          * Keep the draw call indirect and compute minimums & maximums,
+          * which will determine the user buffer bounds to upload, but
+          * the driver will not look at these values because indirect != NULL.
+          *
+          * This efficiently processes the multidraw with the time complexity
+          * equal to 1 draw call.
+          */
+         new_info.start = ~0u;
+         new_info.start_instance = ~0u;
+         unsigned end_vertex = 0;
+         unsigned end_instance = 0;
+
+         for (unsigned i = 0; i < draw_count; i++) {
+            unsigned offset = i * indirect->stride / 4;
+            unsigned start = data[offset + 2];
+            unsigned count = data[offset + 0];
+            unsigned start_instance = data[offset + 3];
+            unsigned instance_count = data[offset + 1];
+
+            new_info.start = MIN2(new_info.start, start);
+            new_info.start_instance = MIN2(new_info.start_instance,
+                                           start_instance);
+
+            end_vertex = MAX2(end_vertex, start + count);
+            end_instance = MAX2(end_instance, start_instance + instance_count);
+         }
+         free(data);
+
+         /* Set the final counts. */
+         new_info.count = end_vertex - new_info.start;
+         new_info.instance_count = end_instance - new_info.start_instance;
+
+         if (new_info.start == ~0u || !new_info.count || !new_info.instance_count)
+            return;
+      }
    }
 
    if (new_info.index_size) {
       /* See if anything needs to be done for per-vertex attribs. */
       if (u_vbuf_need_minmax_index(mgr)) {
-         int max_index;
+         unsigned max_index;
 
          if (new_info.max_index != ~0u) {
             min_index = new_info.min_index;
@@ -1210,7 +1367,8 @@
           * We would have to break this drawing operation into several ones. */
          /* Use some heuristic to see if unrolling indices improves
           * performance. */
-         if (!new_info.primitive_restart &&
+         if (!info->indirect &&
+             !new_info.primitive_restart &&
              num_vertices > new_info.count*2 &&
              num_vertices - new_info.count > 32 &&
              !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
@@ -1313,15 +1471,14 @@
    mgr->ve_saved = NULL;
 }
 
-void u_vbuf_save_aux_vertex_buffer_slot(struct u_vbuf *mgr)
+void u_vbuf_save_vertex_buffer0(struct u_vbuf *mgr)
 {
-   pipe_vertex_buffer_reference(&mgr->aux_vertex_buffer_saved,
-                           &mgr->vertex_buffer[mgr->aux_vertex_buffer_slot]);
+   pipe_vertex_buffer_reference(&mgr->vertex_buffer0_saved,
+                                &mgr->vertex_buffer[0]);
 }
 
-void u_vbuf_restore_aux_vertex_buffer_slot(struct u_vbuf *mgr)
+void u_vbuf_restore_vertex_buffer0(struct u_vbuf *mgr)
 {
-   u_vbuf_set_vertex_buffers(mgr, mgr->aux_vertex_buffer_slot, 1,
-                             &mgr->aux_vertex_buffer_saved);
-   pipe_vertex_buffer_unreference(&mgr->aux_vertex_buffer_saved);
+   u_vbuf_set_vertex_buffers(mgr, 0, 1, &mgr->vertex_buffer0_saved);
+   pipe_vertex_buffer_unreference(&mgr->vertex_buffer0_saved);
 }
diff --git a/src/gallium/auxiliary/util/u_vbuf.h b/src/gallium/auxiliary/util/u_vbuf.h
index d070452..a613983 100644
--- a/src/gallium/auxiliary/util/u_vbuf.h
+++ b/src/gallium/auxiliary/util/u_vbuf.h
@@ -61,8 +61,7 @@
                         unsigned flags);
 
 struct u_vbuf *
-u_vbuf_create(struct pipe_context *pipe,
-              struct u_vbuf_caps *caps, unsigned aux_vertex_buffer_index);
+u_vbuf_create(struct pipe_context *pipe, struct u_vbuf_caps *caps);
 
 void u_vbuf_destroy(struct u_vbuf *mgr);
 
@@ -77,7 +76,7 @@
 /* Save/restore functionality. */
 void u_vbuf_save_vertex_elements(struct u_vbuf *mgr);
 void u_vbuf_restore_vertex_elements(struct u_vbuf *mgr);
-void u_vbuf_save_aux_vertex_buffer_slot(struct u_vbuf *mgr);
-void u_vbuf_restore_aux_vertex_buffer_slot(struct u_vbuf *mgr);
+void u_vbuf_save_vertex_buffer0(struct u_vbuf *mgr);
+void u_vbuf_restore_vertex_buffer0(struct u_vbuf *mgr);
 
 #endif
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index 8a2dae3..9589b1e 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -908,20 +908,20 @@
 
    for (i = 0; i < num_configs; ++i) {
       if (!screen->is_format_supported(screen, configs[i].zscan_source_format, PIPE_TEXTURE_2D,
-                                       1, PIPE_BIND_SAMPLER_VIEW))
+                                       1, 1, PIPE_BIND_SAMPLER_VIEW))
          continue;
 
       if (configs[i].idct_source_format != PIPE_FORMAT_NONE) {
          if (!screen->is_format_supported(screen, configs[i].idct_source_format, PIPE_TEXTURE_2D,
-                                          1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
+                                          1, 1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
             continue;
 
          if (!screen->is_format_supported(screen, configs[i].mc_source_format, PIPE_TEXTURE_3D,
-                                          1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
+                                          1, 1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
             continue;
       } else {
          if (!screen->is_format_supported(screen, configs[i].mc_source_format, PIPE_TEXTURE_2D,
-                                          1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
+                                          1, 1, PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET))
             continue;
       }
       return &configs[i];
diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c
index 3b97ac8..5b54ee1 100644
--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -192,11 +192,11 @@
          continue;
 
       /* we at least need to sample from it */
-      if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW))
+      if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW))
          return false;
 
       format = vl_video_buffer_surface_format(format);
-      if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, PIPE_BIND_RENDER_TARGET))
+      if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_RENDER_TARGET))
          return false;
    }
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c
index 79ebf75..bb1ff50 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c
@@ -231,7 +231,7 @@
    }
 
    memset(&dri2_handle, 0, sizeof(dri2_handle));
-   dri2_handle.type = DRM_API_HANDLE_TYPE_SHARED;
+   dri2_handle.type = WINSYS_HANDLE_TYPE_SHARED;
    dri2_handle.handle = back_left->name;
    dri2_handle.stride = back_left->pitch;
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri3.c b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
index 8251087..df2c9c0 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri3.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
@@ -271,7 +271,7 @@
       pixmap_buffer_texture = buffer->texture;
    }
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type= DRM_API_HANDLE_TYPE_FD;
+   whandle.type= WINSYS_HANDLE_TYPE_FD;
    usage = PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ;
    scrn->base.pscreen->resource_get_handle(scrn->base.pscreen, NULL,
                                            pixmap_buffer_texture, &whandle,
@@ -492,7 +492,7 @@
       goto free_reply;
 
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
    whandle.handle = (unsigned)fds[0];
    whandle.stride = bp_reply->stride;
    memset(&templ, 0, sizeof(templ));
@@ -712,7 +712,6 @@
    if (scrn->front_buffer) {
       dri3_free_front_buffer(scrn, scrn->front_buffer);
       scrn->front_buffer = NULL;
-      return;
    }
 
    for (i = 0; i < BACK_BUFFER_NUM; ++i) {
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index e8e80dc..20d0df7 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -68,6 +68,9 @@
   that this takes effect even if multisampling is not explicitly enabled if
   the frambuffer surface(s) are multisampled.  Also, this mask is AND-ed
   with the optional fragment shader sample mask output (when emitted).
+* ``set_sample_locations`` sets the sample locations used for rasterization.
+  ``get_sample_position`` still returns the default locations. When NULL,
+  the default locations are used.
 * ``set_min_samples`` sets the minimum number of samples that must be run.
 * ``set_clip_state``
 * ``set_polygon_stipple``
@@ -270,6 +273,17 @@
 multi-byte element value starting at offset bytes from resource start, going
 for size bytes. It is guaranteed that size % clear_value_size == 0.
 
+Evaluating Depth Buffers
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+``evaluate_depth_buffer`` is a hint to decompress the current depth buffer
+assuming the current sample locations to avoid problems that could arise when
+using programmable sample locations.
+
+If a depth buffer is rendered with different sample location state than
+what is current at the time of reading the depth buffer, the values may differ
+because depth buffer compression can depend on the sample locations.
+
 
 Uploading
 ^^^^^^^^^
diff --git a/src/gallium/docs/source/cso/rasterizer.rst b/src/gallium/docs/source/cso/rasterizer.rst
index 616e451..4dabcc0 100644
--- a/src/gallium/docs/source/cso/rasterizer.rst
+++ b/src/gallium/docs/source/cso/rasterizer.rst
@@ -340,3 +340,26 @@
     If any clip distance output is written, those half-spaces for which no
     clip distance is written count as disabled; i.e. user clip planes and
     shader clip distances cannot be mixed, and clip distances take precedence.
+
+conservative_raster_mode
+    The conservative rasterization mode.  For PIPE_CONSERVATIVE_RASTER_OFF,
+    conservative rasterization is disabled.  For PIPE_CONSERVATIVE_RASTER_POST_SNAP
+    or PIPE_CONSERVATIVE_RASTER_PRE_SNAP, conservative rasterization is enabled.
+    When conservative rasterization is enabled, the polygon smooth, line smooth,
+    point smooth and line stipple settings are ignored.
+    With the post-snap mode, unlike the pre-snap mode, fragments are never
+    generated for degenerate primitives.  Degenerate primitives, when rasterized,
+    are considered back-facing and the vertex attributes and depth are that of
+    the provoking vertex.
+    If the post-snap mode is used with an unsupported primitive, the pre-snap
+    mode is used, if supported.  Behavior is similar for the pre-snap mode.
+    If the pre-snap mode is used, fragments are generated with respect to the primitive
+    before vertex snapping.
+
+conservative_raster_dilate
+    The amount of dilation during conservative rasterization.
+
+subpixel_precision_x
+    A bias added to the horizontal subpixel precision during conservative rasterization.
+subpixel_precision_y
+    A bias added to the vertical subpixel precision during conservative rasterization.
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 3837360..e85246c 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -93,6 +93,12 @@
   shader.
 * ``PIPE_CAP_GLSL_FEATURE_LEVEL``: Whether the driver supports features
   equivalent to a specific GLSL version. E.g. for GLSL 1.3, report 130.
+* ``PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY``: Whether the driver supports
+  features equivalent to a specific GLSL version including all legacy OpenGL
+  features only present in the OpenGL compatibility profile.
+  The only legacy features that Gallium drivers must implement are
+  the legacy shader inputs and outputs (colors, texcoords, fog, clipvertex,
+  edgeflag).
 * ``PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION``: Whether quads adhere to
   the flatshade_first setting in ``pipe_rasterizer_state``.
 * ``PIPE_CAP_USER_VERTEX_BUFFERS``: Whether the driver supports user vertex
@@ -405,6 +411,17 @@
 * ``PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES``: Limit on combined shader
   output resources (images + buffers + fragment outputs). If 0 the state
   tracker works it out.
+* ``PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS``: This determines limitations
+  on the number of samples that framebuffer attachments can have.
+  Possible values:
+    0: color.nr_samples == zs.nr_samples == color.nr_storage_samples
+       (standard MSAA quality)
+    1: color.nr_samples >= zs.nr_samples == color.nr_storage_samples
+       (enhanced MSAA quality)
+    2: color.nr_samples >= zs.nr_samples >= color.nr_storage_samples
+       (full flexibility in tuning MSAA quality and performance)
+  All color attachments must have the same number of samples and the same
+  number of storage samples.
 * ``PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET``:
   Whether pipe_vertex_buffer::buffer_offset is treated as signed. The u_vbuf
   module needs this for optimal performance in workstation applications.
@@ -420,6 +437,21 @@
   by the driver, and the driver can throw assertion failures.
 * ``PIPE_CAP_PACKED_UNIFORMS``: True if the driver supports packed uniforms
   as opposed to padding to vec4s.
+* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES``: Whether the
+  PIPE_CONSERVATIVE_RASTER_POST_SNAP mode is supported for triangles.
+* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES``: Whether the
+  PIPE_CONSERVATIVE_RASTER_POST_SNAP mode is supported for points and lines.
+* ``PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES``: Whether the
+  PIPE_CONSERVATIVE_RASTER_PRE_SNAP mode is supported for triangles.
+* ``PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES``: Whether the
+  PIPE_CONSERVATIVE_RASTER_PRE_SNAP mode is supported for points and lines.
+* ``PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE``: Whether PIPE_CAP_POST_DEPTH_COVERAGE
+  works with conservative rasterization.
+* ``PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS``: The maximum
+  subpixel precision bias in bits during conservative rasterization.
+* ``PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS``: True if the driver supports
+  programmable sample locations through ``get_sample_pixel_grid`` and
+  ``set_sample_locations``.
 
 
 .. _pipe_capf:
@@ -437,6 +469,12 @@
   applied to anisotropically filtered textures.
 * ``PIPE_CAPF_MAX_TEXTURE_LOD_BIAS``: The maximum :term:`LOD` bias that may be applied
   to filtered textures.
+* ``PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE``: The minimum conservative rasterization
+  dilation.
+* ``PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE``: The maximum conservative rasterization
+  dilation.
+* ``PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY``: The conservative rasterization
+  dilation granularity for values relative to the minimum dilation.
 
 
 .. _pipe_shader_cap:
@@ -522,6 +560,7 @@
   how many HW counters are available for this stage. (0 uses SSBO atomics).
 * ``PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS``: If atomic counters are
   separate, how many atomic counter buffers are available for this stage.
+* ``PIPE_SHADER_CAP_SCALAR_ISA``: Whether the ISA is a scalar one.
 
 .. _pipe_compute_cap:
 
@@ -707,6 +746,9 @@
 **sample_count** the number of samples. 0 and 1 mean no multisampling,
 the maximum allowed legal value is 32.
 
+**storage_sample_count** the number of storage samples. This must be <=
+sample_count. See the documentation of ``pipe_resource::nr_storage_samples``.
+
 **bindings** is a bitmask of :ref:`PIPE_BIND` flags.
 
 Returns TRUE if all usages can be satisfied.
@@ -750,8 +792,27 @@
 
 **last_level** the last mip map level present.
 
-**nr_samples** the nr of msaa samples. 0 (or 1) specifies a resource
-which isn't multisampled.
+**nr_samples**: Number of samples determining quality, driving the rasterizer,
+shading, and framebuffer. It is the number of samples seen by the whole
+graphics pipeline. 0 and 1 specify a resource which isn't multisampled.
+
+**nr_storage_samples**: Only color buffers can set this lower than nr_samples.
+Multiple samples within a pixel can have the same color. ``nr_storage_samples``
+determines how many slots for different colors there are per pixel.
+If there are not enough slots to store all sample colors, some samples will
+have an undefined color (called "undefined samples").
+
+The resolve blit behavior is driver-specific, but can be one of these two:
+1. Only defined samples will be averaged. Undefined samples will be ignored.
+2. Undefined samples will be approximated by looking at surrounding defined
+   samples (even in different pixels).
+
+Blits and MSAA texturing: If the sample being fetched is undefined, one of
+the defined samples is returned instead.
+
+Sample shading (``set_min_samples``) will operate at a sample frequency that
+is at most ``nr_storage_samples``. Greater ``min_samples`` values will be
+replaced by ``nr_storage_samples``.
 
 **usage** one of the :ref:`PIPE_USAGE` flags.
 
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 9e95658..7d4ebb6 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2592,6 +2592,31 @@
   barrier in between.
 
 
+.. _bindlessopcodes:
+
+Bindless Opcodes
+^^^^^^^^^^^^^^^^
+
+These opcodes are for working with bindless sampler or image handles and
+require PIPE_CAP_BINDLESS_TEXTURE.
+
+.. opcode:: IMG2HND - Get a bindless handle for an image
+
+  Syntax: ``IMG2HND dst, image``
+
+  Example: ``IMG2HND TEMP[0], IMAGE[0]``
+
+  Sets 'dst' to a bindless handle for 'image'.
+
+.. opcode:: SAMP2HND - Get a bindless handle for a sampler
+
+  Syntax: ``SAMP2HND dst, sampler``
+
+  Example: ``SAMP2HND TEMP[0], SAMP[0]``
+
+  Sets 'dst' to a bindless handle for 'sampler'.
+
+
 .. _threadsyncopcodes:
 
 Inter-thread synchronization opcodes
diff --git a/src/gallium/drivers/etnaviv/etnaviv_fence.c b/src/gallium/drivers/etnaviv/etnaviv_fence.c
index 22a964a..cf3e677 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_fence.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_fence.c
@@ -36,7 +36,6 @@
 
 struct pipe_fence_handle {
    struct pipe_reference reference;
-   struct etna_context *ctx;
    struct etna_screen *screen;
    int fence_fd;
    uint32_t timestamp;
@@ -111,7 +110,6 @@
 
    pipe_reference_init(&fence->reference, 1);
 
-   fence->ctx = ctx;
    fence->screen = ctx->screen;
    fence->timestamp = etna_cmd_stream_timestamp(ctx->stream);
    fence->fence_fd = fence_fd;
diff --git a/src/gallium/drivers/etnaviv/etnaviv_resource.c b/src/gallium/drivers/etnaviv/etnaviv_resource.c
index c600eff..7fd374a 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_resource.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_resource.c
@@ -250,7 +250,7 @@
       if (!scanout)
          return NULL;
 
-      assert(handle.type == DRM_API_HANDLE_TYPE_FD);
+      assert(handle.type == WINSYS_HANDLE_TYPE_FD);
       handle.modifier = modifier;
       rsc = etna_resource(pscreen->resource_from_handle(pscreen, templat,
                                                         &handle,
@@ -600,16 +600,16 @@
    handle->stride = rsc->levels[0].stride;
    handle->modifier = layout_to_modifier(rsc->layout);
 
-   if (handle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (handle->type == WINSYS_HANDLE_TYPE_SHARED) {
       return etna_bo_get_name(rsc->bo, &handle->handle) == 0;
-   } else if (handle->type == DRM_API_HANDLE_TYPE_KMS) {
+   } else if (handle->type == WINSYS_HANDLE_TYPE_KMS) {
       if (renderonly_get_handle(scanout, handle)) {
          return TRUE;
       } else {
          handle->handle = etna_bo_handle(rsc->bo);
          return TRUE;
       }
-   } else if (handle->type == DRM_API_HANDLE_TYPE_FD) {
+   } else if (handle->type == WINSYS_HANDLE_TYPE_FD) {
       handle->handle = etna_bo_dmabuf(rsc->bo);
       return TRUE;
    } else {
diff --git a/src/gallium/drivers/etnaviv/etnaviv_rs.c b/src/gallium/drivers/etnaviv/etnaviv_rs.c
index b8a3b12..fc4f65d 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_rs.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_rs.c
@@ -26,8 +26,6 @@
 
 #include "etnaviv_rs.h"
 
-#include "hw/common.xml.h"
-
 #include "etnaviv_clear_blit.h"
 #include "etnaviv_context.h"
 #include "etnaviv_emit.h"
diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c
index b0f8b4b..35707e6 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_screen.c
+++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c
@@ -154,6 +154,7 @@
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 4; /* XXX could easily be supported */
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
       return 120;
 
    case PIPE_CAP_NPOT_TEXTURES:
@@ -269,11 +270,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
 
    /* Stream output. */
@@ -374,6 +383,10 @@
       return 16.0f;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return util_last_bit(screen->specs.max_texture_size);
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
    }
 
    debug_printf("unknown paramf %d", param);
@@ -462,6 +475,7 @@
    case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+   case PIPE_SHADER_CAP_SCALAR_ISA:
       return 0;
    }
 
@@ -520,7 +534,9 @@
 etna_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
-                                unsigned sample_count, unsigned usage)
+                                unsigned sample_count,
+                                unsigned storage_sample_count,
+                                unsigned usage)
 {
    struct etna_screen *screen = etna_screen(pscreen);
    unsigned allowed = 0;
@@ -533,6 +549,9 @@
        target != PIPE_TEXTURE_RECT)
       return FALSE;
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    if (usage & PIPE_BIND_RENDER_TARGET) {
       /* if render target, must be RS-supported format */
       if (translate_rs_format(format) != ETNA_NO_MATCH) {
@@ -845,9 +864,9 @@
    struct etna_screen *screen = etna_screen(pscreen);
    struct etna_bo *bo;
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
       bo = etna_bo_from_name(screen->dev, whandle->handle);
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       bo = etna_bo_from_dmabuf(screen->dev, whandle->handle);
    } else {
       DBG("Attempt to import unsupported handle type %d", whandle->type);
diff --git a/src/gallium/drivers/etnaviv/etnaviv_translate.h b/src/gallium/drivers/etnaviv/etnaviv_translate.h
index 7c85f81..88ce107 100644
--- a/src/gallium/drivers/etnaviv/etnaviv_translate.h
+++ b/src/gallium/drivers/etnaviv/etnaviv_translate.h
@@ -30,7 +30,6 @@
 
 #include "etnaviv_debug.h"
 #include "etnaviv_format.h"
-#include "etnaviv_tiling.h"
 #include "etnaviv_util.h"
 #include "hw/cmdstream.xml.h"
 #include "hw/common_3d.xml.h"
@@ -40,8 +39,6 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 
-#include <stdio.h>
-
 /* Returned when there is no match of pipe value to etna value */
 #define ETNA_NO_MATCH (~0)
 
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index e0f0586..328cbdf 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -16,6 +16,7 @@
 	freedreno_fence.h \
 	freedreno_gmem.c \
 	freedreno_gmem.h \
+	freedreno_perfcntr.h \
 	freedreno_program.c \
 	freedreno_program.h \
 	freedreno_query.c \
@@ -144,6 +145,7 @@
 	a5xx/fd5_gmem.h \
 	a5xx/fd5_image.c \
 	a5xx/fd5_image.h \
+	a5xx/fd5_perfcntr.c \
 	a5xx/fd5_program.c \
 	a5xx/fd5_program.h \
 	a5xx/fd5_query.c \
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 279a652..aeaf105 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,17 +8,19 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
-Copyright (C) 2013-2017 by the following authors:
+Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index 8df1793..6f0535f 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -101,12 +101,14 @@
 	OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
 	OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
 
-	OUT_WFI (ring);
+	if (!is_a20x(ctx->screen)) {
+		OUT_WFI (ring);
 
-	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-	OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-	OUT_RING(ring, info->max_index);        /* VGT_MAX_VTX_INDX */
-	OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+		OUT_RING(ring, info->max_index);        /* VGT_MAX_VTX_INDX */
+		OUT_RING(ring, info->min_index);        /* VGT_MIN_VTX_INDX */
+	}
 
 	fd_draw_emit(ctx->batch, ring, ctx->primtypes[info->mode],
 				 IGNORE_VISIBILITY, info, index_offset);
@@ -133,7 +135,7 @@
 	uint32_t reg, colr = 0;
 
 	if ((buffers & PIPE_CLEAR_COLOR) && fb->nr_cbufs)
-		colr  = pack_rgba(fb->cbufs[0]->format, color->f);
+		colr = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f);
 
 	/* emit generic state now: */
 	fd2_emit_state(ctx, ctx->dirty &
@@ -157,9 +159,18 @@
 	OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
 	OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
 
-	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
-	OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
-	OUT_RING(ring, colr);
+	if (is_a20x(ctx->screen)) {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+		OUT_RING(ring, 0x00000480);
+		OUT_RING(ring, color->ui[0]);
+		OUT_RING(ring, color->ui[1]);
+		OUT_RING(ring, color->ui[2]);
+		OUT_RING(ring, color->ui[3]);
+	} else {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+		OUT_RING(ring, CP_REG(REG_A2XX_CLEAR_COLOR));
+		OUT_RING(ring, colr);
+	}
 
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_A220_RB_LRZ_VSC_CONTROL));
@@ -264,10 +275,12 @@
 		OUT_RING(ring, 0x0);
 	}
 
-	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-	OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-	OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
-	OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+	if (!is_a20x(ctx->screen)) {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+		OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
+		OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+	}
 
 	fd_draw(ctx->batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index a787b71..dcf7ed1 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -58,10 +58,6 @@
 	uint32_t start_base = base;
 	unsigned i;
 
-	// XXX TODO only emit dirty consts.. but we need to keep track if
-	// they are clobbered by a clear, gmem2mem, or mem2gmem..
-	constbuf->dirty_mask = enabled_mask;
-
 	/* emit user constants: */
 	while (enabled_mask) {
 		unsigned index = ffs(enabled_mask) - 1;
@@ -79,26 +75,22 @@
 		if (shader && ((base - start_base) >= (shader->first_immediate * 4)))
 			break;
 
-		if (constbuf->dirty_mask & (1 << index)) {
-			const uint32_t *dwords;
+		const uint32_t *dwords;
 
-			if (cb->user_buffer) {
-				dwords = cb->user_buffer;
-			} else {
-				struct fd_resource *rsc = fd_resource(cb->buffer);
-				dwords = fd_bo_map(rsc->bo);
-			}
-
-			dwords = (uint32_t *)(((uint8_t *)dwords) + cb->buffer_offset);
-
-			OUT_PKT3(ring, CP_SET_CONSTANT, size + 1);
-			OUT_RING(ring, base);
-			for (i = 0; i < size; i++)
-				OUT_RING(ring, *(dwords++));
-
-			constbuf->dirty_mask &= ~(1 << index);
+		if (cb->user_buffer) {
+			dwords = cb->user_buffer;
+		} else {
+			struct fd_resource *rsc = fd_resource(cb->buffer);
+			dwords = fd_bo_map(rsc->bo);
 		}
 
+		dwords = (uint32_t *)(((uint8_t *)dwords) + cb->buffer_offset);
+
+		OUT_PKT3(ring, CP_SET_CONSTANT, size + 1);
+		OUT_RING(ring, base);
+		for (i = 0; i < size; i++)
+			OUT_RING(ring, *(dwords++));
+
 		base += size;
 		enabled_mask &= ~(1 << index);
 	}
@@ -303,7 +295,7 @@
 	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_ZSA)) {
 		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 		OUT_RING(ring, CP_REG(REG_A2XX_RB_COLORCONTROL));
-		OUT_RING(ring, zsa->rb_colorcontrol | blend->rb_colorcontrol);
+		OUT_RING(ring, blend ? zsa->rb_colorcontrol | blend->rb_colorcontrol : 0);
 	}
 
 	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_FRAMEBUFFER)) {
@@ -313,13 +305,13 @@
 
 		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 		OUT_RING(ring, CP_REG(REG_A2XX_RB_BLEND_CONTROL));
-		OUT_RING(ring, blend->rb_blendcontrol_alpha |
+		OUT_RING(ring, blend ? blend->rb_blendcontrol_alpha |
 			COND(has_alpha, blend->rb_blendcontrol_rgb) |
-			COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb));
+			COND(!has_alpha, blend->rb_blendcontrol_no_alpha_rgb) : 0);
 
 		OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 		OUT_RING(ring, CP_REG(REG_A2XX_RB_COLOR_MASK));
-		OUT_RING(ring, blend->rb_colormask);
+		OUT_RING(ring, blend ? blend->rb_colormask : 0xf);
 	}
 
 	if (dirty & FD_DIRTY_BLEND_COLOR) {
@@ -340,6 +332,16 @@
 void
 fd2_emit_restore(struct fd_context *ctx, struct fd_ringbuffer *ring)
 {
+	if (is_a20x(ctx->screen)) {
+		OUT_PKT0(ring, REG_A2XX_RB_BC_CONTROL, 1);
+		OUT_RING(ring,
+			A2XX_RB_BC_CONTROL_ACCUM_TIMEOUT_SELECT(3) |
+			A2XX_RB_BC_CONTROL_DISABLE_LZ_NULL_ZCMD_DROP |
+			A2XX_RB_BC_CONTROL_ENABLE_CRC_UPDATE |
+			A2XX_RB_BC_CONTROL_ACCUM_DATA_FIFO_LIMIT(8) |
+			A2XX_RB_BC_CONTROL_MEM_EXPORT_TIMEOUT_SELECT(3));
+	}
+
 	OUT_PKT0(ring, REG_A2XX_TP0_CHICKEN, 1);
 	OUT_RING(ring, 0x00000002);
 
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
index 46a7d18..6238299 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -89,12 +89,14 @@
 			A2XX_RB_COPY_DEST_INFO_WRITE_BLUE |
 			A2XX_RB_COPY_DEST_INFO_WRITE_ALPHA);
 
-	OUT_WFI (ring);
+	if (!is_a20x(batch->ctx->screen)) {
+		OUT_WFI (ring);
 
-	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-	OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-	OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
-	OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+		OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
+		OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+	}
 
 	fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
@@ -214,10 +216,12 @@
 	OUT_RING(ring, 0x00000000);
 	OUT_RING(ring, 0x00000200);
 
-	OUT_PKT3(ring, CP_SET_CONSTANT, 3);
-	OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
-	OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
-	OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+	if (!is_a20x(batch->ctx->screen)) {
+		OUT_PKT3(ring, CP_SET_CONSTANT, 3);
+		OUT_RING(ring, CP_REG(REG_A2XX_VGT_MAX_VTX_INDX));
+		OUT_RING(ring, 3);                 /* VGT_MAX_VTX_INDX */
+		OUT_RING(ring, 0);                 /* VGT_MIN_VTX_INDX */
+	}
 
 	fd_draw(batch, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY,
 			DI_SRC_SEL_AUTO_INDEX, 3, 0, INDEX_SIZE_IGN, 0, 0, NULL);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
index 9a77457..834a7c7 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -54,6 +54,8 @@
 static void
 delete_shader(struct fd2_shader_stateobj *so)
 {
+	if (!so)
+		return;
 	ir2_shader_destroy(so->ir);
 	free(so->tokens);
 	free(so->bin);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_screen.c b/src/gallium/drivers/freedreno/a2xx/fd2_screen.c
index c2a60c6..2f701dd 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_screen.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_screen.c
@@ -38,18 +38,21 @@
 		enum pipe_format format,
 		enum pipe_texture_target target,
 		unsigned sample_count,
+		unsigned storage_sample_count,
 		unsigned usage)
 {
 	unsigned retval = 0;
 
 	if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-			(sample_count > 1) || /* TODO add MSAA */
-			!util_format_is_supported(format, usage)) {
+			(sample_count > 1)) { /* TODO add MSAA */
 		DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x",
 				util_format_name(format), target, sample_count, usage);
 		return FALSE;
 	}
 
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
+
 	/* TODO figure out how to render to other formats.. */
 	if ((usage & PIPE_BIND_RENDER_TARGET) &&
 			((format != PIPE_FORMAT_B5G6R5_UNORM) &&
diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
index 0d6e138..ac972ed 100644
--- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
+++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
@@ -366,10 +366,8 @@
 	uint8_t             pred_select              : 1;
 	/* dword2: */
 	uint8_t             stride                   : 8;
-	/* possibly offset and reserved4 are swapped on a200? */
-	uint8_t             offset                   : 8;
-	uint8_t             reserved4                : 8;
-	uint8_t             reserved5                : 7;
+	uint32_t            offset                   : 22;
+	uint8_t             reserved4                : 1;
 	uint8_t             pred_condition           : 1;
 } instr_fetch_vtx_t;
 
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 3b11f10..64ea69b 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,17 +8,19 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
-Copyright (C) 2013-2017 by the following authors:
+Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 761f25b..1ad6955 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -149,12 +149,17 @@
 	fixup_shader_state(ctx, &emit.key);
 
 	unsigned dirty = ctx->dirty;
+	const struct ir3_shader_variant *vp = fd3_emit_get_vp(&emit);
+	const struct ir3_shader_variant *fp = fd3_emit_get_fp(&emit);
 
 	/* do regular pass first, since that is more likely to fail compiling: */
 
-	if (!(fd3_emit_get_vp(&emit) && fd3_emit_get_fp(&emit)))
+	if (!vp || !fp)
 		return false;
 
+	ctx->stats.vs_regs += ir3_shader_halfregs(vp);
+	ctx->stats.fs_regs += ir3_shader_halfregs(fp);
+
 	emit.key.binning_pass = false;
 	emit.dirty = dirty;
 	draw_impl(ctx, ctx->batch->draw, &emit, index_offset);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 3419ba8..bab3d3d 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -409,8 +409,17 @@
 					(instance_regid != regid(63, 0)) ||
 					(vtxcnt_regid != regid(63, 0));
 			bool isint = util_format_is_pure_integer(pfmt);
+			uint32_t off = vb->buffer_offset + elem->src_offset;
 			uint32_t fs = util_format_get_blocksize(pfmt);
 
+#ifdef DEBUG
+			/* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10
+			 * should mesa/st be protecting us from this?
+			 */
+			if (off > fd_bo_size(rsc->bo))
+				continue;
+#endif
+
 			debug_assert(fmt != ~0);
 
 			OUT_PKT0(ring, REG_A3XX_VFD_FETCH(j), 2);
@@ -420,7 +429,7 @@
 					A3XX_VFD_FETCH_INSTR_0_INDEXCODE(j) |
 					COND(elem->instance_divisor, A3XX_VFD_FETCH_INSTR_0_INSTANCED) |
 					A3XX_VFD_FETCH_INSTR_0_STEPRATE(MAX2(1, elem->instance_divisor)));
-			OUT_RELOC(ring, rsc->bo, vb->buffer_offset + elem->src_offset, 0, 0);
+			OUT_RELOC(ring, rsc->bo, off, 0, 0);
 
 			OUT_PKT0(ring, REG_A3XX_VFD_DECODE_INSTR(j), 1);
 			OUT_RING(ring, A3XX_VFD_DECODE_INSTR_CONSTFILL |
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
index 5e574da..72e807e 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h
@@ -62,8 +62,8 @@
 fd3_emit_get_vp(struct fd3_emit *emit)
 {
 	if (!emit->vp) {
-		struct fd3_shader_stateobj *so = emit->prog->vp;
-		emit->vp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+		struct ir3_shader *shader = emit->prog->vp;
+		emit->vp = ir3_shader_variant(shader, emit->key, emit->debug);
 	}
 	return emit->vp;
 }
@@ -77,8 +77,8 @@
 			static const struct ir3_shader_variant binning_fp = {};
 			emit->fp = &binning_fp;
 		} else {
-			struct fd3_shader_stateobj *so = emit->prog->fp;
-			emit->fp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+			struct ir3_shader *shader = emit->prog->fp;
+			emit->fp = ir3_shader_variant(shader, emit->key, emit->debug);
 		}
 	}
 	return emit->fp;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index f43d5c4..64eeb10 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -40,22 +40,13 @@
 #include "fd3_texture.h"
 #include "fd3_format.h"
 
-static void
-delete_shader_stateobj(struct fd3_shader_stateobj *so)
-{
-	ir3_shader_destroy(so->shader);
-	free(so);
-}
-
-static struct fd3_shader_stateobj *
+static struct ir3_shader *
 create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
 		enum shader_t type)
 {
 	struct fd_context *ctx = fd_context(pctx);
 	struct ir3_compiler *compiler = ctx->screen->compiler;
-	struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj);
-	so->shader = ir3_shader_create(compiler, cso, type, &ctx->debug);
-	return so;
+	return ir3_shader_create(compiler, cso, type, &ctx->debug);
 }
 
 static void *
@@ -68,8 +59,8 @@
 static void
 fd3_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd3_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 static void *
@@ -82,15 +73,15 @@
 static void
 fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd3_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 bool
-fd3_needs_manual_clipping(const struct fd3_shader_stateobj *so,
+fd3_needs_manual_clipping(const struct ir3_shader *shader,
 						  const struct pipe_rasterizer_state *rast)
 {
-	uint64_t outputs = ir3_shader_outputs(so->shader);
+	uint64_t outputs = ir3_shader_outputs(shader);
 
 	return (!rast->depth_clip ||
 			util_bitcount(rast->clip_plane_enable) > 6 ||
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
index b95df4c..04ebf12 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
@@ -33,10 +33,6 @@
 #include "freedreno_context.h"
 #include "ir3_shader.h"
 
-struct fd3_shader_stateobj {
-	struct ir3_shader *shader;
-};
-
 struct fd3_emit;
 
 void fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
@@ -44,7 +40,7 @@
 
 void fd3_prog_init(struct pipe_context *pctx);
 
-bool fd3_needs_manual_clipping(const struct fd3_shader_stateobj *,
+bool fd3_needs_manual_clipping(const struct ir3_shader *,
 							   const struct pipe_rasterizer_state *);
 
 #endif /* FD3_PROGRAM_H_ */
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 998ec7a..a1c4668 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -39,18 +39,21 @@
 		enum pipe_format format,
 		enum pipe_texture_target target,
 		unsigned sample_count,
+		unsigned storage_sample_count,
 		unsigned usage)
 {
 	unsigned retval = 0;
 
 	if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-			(sample_count > 1) || /* TODO add MSAA */
-			!util_format_is_supported(format, usage)) {
+			(sample_count > 1)) { /* TODO add MSAA */
 		DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x",
 				util_format_name(format), target, sample_count, usage);
 		return FALSE;
 	}
 
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
+
 	if ((usage & PIPE_BIND_VERTEX_BUFFER) &&
 			(fd3_pipe2vtx(format) != (enum a3xx_vtx_fmt)~0)) {
 		retval |= PIPE_BIND_VERTEX_BUFFER;
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 927cf18..bd66481 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,17 +8,19 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
-Copyright (C) 2013-2017 by the following authors:
+Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index 840e917..1c04a82 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -135,12 +135,17 @@
 	fixup_shader_state(ctx, &emit.key);
 
 	enum fd_dirty_3d_state dirty = ctx->dirty;
+	const struct ir3_shader_variant *vp = fd4_emit_get_vp(&emit);
+	const struct ir3_shader_variant *fp = fd4_emit_get_fp(&emit);
 
 	/* do regular pass first, since that is more likely to fail compiling: */
 
-	if (!(fd4_emit_get_vp(&emit) && fd4_emit_get_fp(&emit)))
+	if (!vp || !fp)
 		return false;
 
+	ctx->stats.vs_regs += ir3_shader_halfregs(vp);
+	ctx->stats.fs_regs += ir3_shader_halfregs(fp);
+
 	emit.key.binning_pass = false;
 	emit.dirty = dirty;
 
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 42268ce..8470fa9 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -418,6 +418,13 @@
 			uint32_t size = fd_bo_size(rsc->bo) - off;
 			debug_assert(fmt != ~0);
 
+#ifdef DEBUG
+			/* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10
+			 */
+			if (off > fd_bo_size(rsc->bo))
+				continue;
+#endif
+
 			OUT_PKT0(ring, REG_A4XX_VFD_FETCH(j), 4);
 			OUT_RING(ring, A4XX_VFD_FETCH_INSTR_0_FETCHSIZE(fs - 1) |
 					A4XX_VFD_FETCH_INSTR_0_BUFSTRIDE(vb->stride) |
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
index a724cae..73bf199 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h
@@ -71,8 +71,8 @@
 fd4_emit_get_vp(struct fd4_emit *emit)
 {
 	if (!emit->vp) {
-		struct fd4_shader_stateobj *so = emit->prog->vp;
-		emit->vp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+		struct ir3_shader *shader = emit->prog->vp;
+		emit->vp = ir3_shader_variant(shader, emit->key, emit->debug);
 	}
 	return emit->vp;
 }
@@ -86,8 +86,8 @@
 			static const struct ir3_shader_variant binning_fp = {};
 			emit->fp = &binning_fp;
 		} else {
-			struct fd4_shader_stateobj *so = emit->prog->fp;
-			emit->fp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+			struct ir3_shader *shader = emit->prog->fp;
+			emit->fp = ir3_shader_variant(shader, emit->key, emit->debug);
 		}
 	}
 	return emit->fp;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
index 05b0c4f..7c399d9 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c
@@ -39,22 +39,13 @@
 #include "fd4_texture.h"
 #include "fd4_format.h"
 
-static void
-delete_shader_stateobj(struct fd4_shader_stateobj *so)
-{
-	ir3_shader_destroy(so->shader);
-	free(so);
-}
-
-static struct fd4_shader_stateobj *
+static struct ir3_shader *
 create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
 		enum shader_t type)
 {
 	struct fd_context *ctx = fd_context(pctx);
 	struct ir3_compiler *compiler = ctx->screen->compiler;
-	struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj);
-	so->shader = ir3_shader_create(compiler, cso, type, &ctx->debug);
-	return so;
+	return ir3_shader_create(compiler, cso, type, &ctx->debug);
 }
 
 static void *
@@ -67,8 +58,8 @@
 static void
 fd4_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd4_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 static void *
@@ -81,8 +72,8 @@
 static void
 fd4_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd4_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.h b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
index 8dfccaf..5d8eb55 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_program.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.h
@@ -33,10 +33,6 @@
 #include "freedreno_context.h"
 #include "ir3_shader.h"
 
-struct fd4_shader_stateobj {
-	struct ir3_shader *shader;
-};
-
 struct fd4_emit;
 
 void fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit,
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index 1b81f8d..bfec76c 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -39,18 +39,21 @@
 		enum pipe_format format,
 		enum pipe_texture_target target,
 		unsigned sample_count,
+		unsigned storage_sample_count,
 		unsigned usage)
 {
 	unsigned retval = 0;
 
 	if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-			(sample_count > 1) || /* TODO add MSAA */
-			!util_format_is_supported(format, usage)) {
+			(sample_count > 1)) { /* TODO add MSAA */
 		DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x",
 				util_format_name(format), target, sample_count, usage);
 		return FALSE;
 	}
 
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
+
 	if ((usage & PIPE_BIND_VERTEX_BUFFER) &&
 			(fd4_pipe2vtx(format) != (enum a4xx_vtx_fmt)~0)) {
 		retval |= PIPE_BIND_VERTEX_BUFFER;
diff --git a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
index 7356e5f..0e4826d 100644
--- a/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
+++ b/src/gallium/drivers/freedreno/a5xx/a5xx.xml.h
@@ -8,15 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
 Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -2381,11 +2383,11 @@
 
 #define REG_A5XX_VBIF_PERF_CNT_CLR0				0x000030c8
 
-#define REG_A5XX_VBIF_PERF_CNT_CLR1				0x000030c8
+#define REG_A5XX_VBIF_PERF_CNT_CLR1				0x000030c9
 
-#define REG_A5XX_VBIF_PERF_CNT_CLR2				0x000030c8
+#define REG_A5XX_VBIF_PERF_CNT_CLR2				0x000030ca
 
-#define REG_A5XX_VBIF_PERF_CNT_CLR3				0x000030c8
+#define REG_A5XX_VBIF_PERF_CNT_CLR3				0x000030cb
 
 #define REG_A5XX_VBIF_PERF_CNT_SEL0				0x000030d0
 
@@ -3000,7 +3002,9 @@
 #define A5XX_RB_RENDER_CONTROL0_WCOORD				0x00000200
 
 #define REG_A5XX_RB_RENDER_CONTROL1				0x0000e145
+#define A5XX_RB_RENDER_CONTROL1_SAMPLEMASK			0x00000001
 #define A5XX_RB_RENDER_CONTROL1_FACENESS			0x00000002
+#define A5XX_RB_RENDER_CONTROL1_SAMPLEID			0x00000004
 
 #define REG_A5XX_RB_FS_OUTPUT_CNTL				0x0000e146
 #define A5XX_RB_FS_OUTPUT_CNTL_MRT__MASK			0x0000000f
@@ -3296,6 +3300,7 @@
 	return ((val) << A5XX_RB_BLEND_CNTL_ENABLE_BLEND__SHIFT) & A5XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
 }
 #define A5XX_RB_BLEND_CNTL_INDEPENDENT_BLEND			0x00000100
+#define A5XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE			0x00000400
 #define A5XX_RB_BLEND_CNTL_SAMPLE_MASK__MASK			0xffff0000
 #define A5XX_RB_BLEND_CNTL_SAMPLE_MASK__SHIFT			16
 static inline uint32_t A5XX_RB_BLEND_CNTL_SAMPLE_MASK(uint32_t val)
@@ -3557,6 +3562,7 @@
 
 #define REG_A5XX_RB_CLEAR_CNTL					0x0000e21c
 #define A5XX_RB_CLEAR_CNTL_FAST_CLEAR				0x00000002
+#define A5XX_RB_CLEAR_CNTL_MSAA_RESOLVE				0x00000004
 #define A5XX_RB_CLEAR_CNTL_MASK__MASK				0x000000f0
 #define A5XX_RB_CLEAR_CNTL_MASK__SHIFT				4
 static inline uint32_t A5XX_RB_CLEAR_CNTL_MASK(uint32_t val)
@@ -4147,6 +4153,7 @@
 #define REG_A5XX_SP_BLEND_CNTL					0x0000e5c9
 #define A5XX_SP_BLEND_CNTL_ENABLED				0x00000001
 #define A5XX_SP_BLEND_CNTL_UNK8					0x00000100
+#define A5XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE			0x00000400
 
 #define REG_A5XX_SP_FS_OUTPUT_CNTL				0x0000e5ca
 #define A5XX_SP_FS_OUTPUT_CNTL_MRT__MASK			0x0000000f
@@ -4228,7 +4235,33 @@
 
 #define REG_A5XX_SP_CS_OBJ_START_HI				0x0000e5f4
 
-#define REG_A5XX_UNKNOWN_E600					0x0000e600
+#define REG_A5XX_SP_HS_CTRL_REG0				0x0000e600
+#define A5XX_SP_HS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_HS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_HS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_HS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_HS_CTRL_REG0_THREADSIZE__MASK;
+}
+#define A5XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
+#define A5XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
+static inline uint32_t A5XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A5XX_SP_HS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
+#define A5XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
+static inline uint32_t A5XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A5XX_SP_HS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_HS_CTRL_REG0_VARYING				0x00010000
+#define A5XX_SP_HS_CTRL_REG0_PIXLODENABLE			0x00100000
+#define A5XX_SP_HS_CTRL_REG0_BRANCHSTACK__MASK			0xfe000000
+#define A5XX_SP_HS_CTRL_REG0_BRANCHSTACK__SHIFT			25
+static inline uint32_t A5XX_SP_HS_CTRL_REG0_BRANCHSTACK(uint32_t val)
+{
+	return ((val) << A5XX_SP_HS_CTRL_REG0_BRANCHSTACK__SHIFT) & A5XX_SP_HS_CTRL_REG0_BRANCHSTACK__MASK;
+}
 
 #define REG_A5XX_UNKNOWN_E602					0x0000e602
 
@@ -4236,13 +4269,67 @@
 
 #define REG_A5XX_SP_HS_OBJ_START_HI				0x0000e604
 
+#define REG_A5XX_SP_DS_CTRL_REG0				0x0000e610
+#define A5XX_SP_DS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_DS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_DS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_DS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_DS_CTRL_REG0_THREADSIZE__MASK;
+}
+#define A5XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
+#define A5XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
+static inline uint32_t A5XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A5XX_SP_DS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
+#define A5XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
+static inline uint32_t A5XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A5XX_SP_DS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_DS_CTRL_REG0_VARYING				0x00010000
+#define A5XX_SP_DS_CTRL_REG0_PIXLODENABLE			0x00100000
+#define A5XX_SP_DS_CTRL_REG0_BRANCHSTACK__MASK			0xfe000000
+#define A5XX_SP_DS_CTRL_REG0_BRANCHSTACK__SHIFT			25
+static inline uint32_t A5XX_SP_DS_CTRL_REG0_BRANCHSTACK(uint32_t val)
+{
+	return ((val) << A5XX_SP_DS_CTRL_REG0_BRANCHSTACK__SHIFT) & A5XX_SP_DS_CTRL_REG0_BRANCHSTACK__MASK;
+}
+
 #define REG_A5XX_UNKNOWN_E62B					0x0000e62b
 
 #define REG_A5XX_SP_DS_OBJ_START_LO				0x0000e62c
 
 #define REG_A5XX_SP_DS_OBJ_START_HI				0x0000e62d
 
-#define REG_A5XX_UNKNOWN_E640					0x0000e640
+#define REG_A5XX_SP_GS_CTRL_REG0				0x0000e640
+#define A5XX_SP_GS_CTRL_REG0_THREADSIZE__MASK			0x00000008
+#define A5XX_SP_GS_CTRL_REG0_THREADSIZE__SHIFT			3
+static inline uint32_t A5XX_SP_GS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val)
+{
+	return ((val) << A5XX_SP_GS_CTRL_REG0_THREADSIZE__SHIFT) & A5XX_SP_GS_CTRL_REG0_THREADSIZE__MASK;
+}
+#define A5XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT__MASK		0x000003f0
+#define A5XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT		4
+static inline uint32_t A5XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A5XX_SP_GS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT__MASK		0x0000fc00
+#define A5XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT		10
+static inline uint32_t A5XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
+{
+	return ((val) << A5XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A5XX_SP_GS_CTRL_REG0_FULLREGFOOTPRINT__MASK;
+}
+#define A5XX_SP_GS_CTRL_REG0_VARYING				0x00010000
+#define A5XX_SP_GS_CTRL_REG0_PIXLODENABLE			0x00100000
+#define A5XX_SP_GS_CTRL_REG0_BRANCHSTACK__MASK			0xfe000000
+#define A5XX_SP_GS_CTRL_REG0_BRANCHSTACK__SHIFT			25
+static inline uint32_t A5XX_SP_GS_CTRL_REG0_BRANCHSTACK(uint32_t val)
+{
+	return ((val) << A5XX_SP_GS_CTRL_REG0_BRANCHSTACK__SHIFT) & A5XX_SP_GS_CTRL_REG0_BRANCHSTACK__MASK;
+}
 
 #define REG_A5XX_UNKNOWN_E65B					0x0000e65b
 
@@ -4362,6 +4449,18 @@
 {
 	return ((val) << A5XX_HLSQ_CONTROL_2_REG_FACEREGID__SHIFT) & A5XX_HLSQ_CONTROL_2_REG_FACEREGID__MASK;
 }
+#define A5XX_HLSQ_CONTROL_2_REG_SAMPLEID__MASK			0x0000ff00
+#define A5XX_HLSQ_CONTROL_2_REG_SAMPLEID__SHIFT			8
+static inline uint32_t A5XX_HLSQ_CONTROL_2_REG_SAMPLEID(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CONTROL_2_REG_SAMPLEID__SHIFT) & A5XX_HLSQ_CONTROL_2_REG_SAMPLEID__MASK;
+}
+#define A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK__MASK		0x00ff0000
+#define A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK__SHIFT		16
+static inline uint32_t A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK__SHIFT) & A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK__MASK;
+}
 
 #define REG_A5XX_HLSQ_CONTROL_3_REG				0x0000e787
 #define A5XX_HLSQ_CONTROL_3_REG_FRAGCOORDXYREGID__MASK		0x000000ff
@@ -4564,34 +4663,52 @@
 }
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_1				0x0000e7b1
-#define A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__MASK			0xffffffff
-#define A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__SHIFT			0
-static inline uint32_t A5XX_HLSQ_CS_NDRANGE_1_SIZE_X(uint32_t val)
+#define A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__SHIFT) & A5XX_HLSQ_CS_NDRANGE_1_SIZE_X__MASK;
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X__SHIFT) & A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X__MASK;
 }
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_2				0x0000e7b2
+#define A5XX_HLSQ_CS_NDRANGE_2_GLOBALOFF_X__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_2_GLOBALOFF_X__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_2_GLOBALOFF_X(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_2_GLOBALOFF_X__SHIFT) & A5XX_HLSQ_CS_NDRANGE_2_GLOBALOFF_X__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_3				0x0000e7b3
-#define A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__MASK			0xffffffff
-#define A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__SHIFT			0
-static inline uint32_t A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y(uint32_t val)
+#define A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__SHIFT) & A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y__MASK;
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y__SHIFT) & A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y__MASK;
 }
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_4				0x0000e7b4
+#define A5XX_HLSQ_CS_NDRANGE_4_GLOBALOFF_Y__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_4_GLOBALOFF_Y__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_4_GLOBALOFF_Y(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_4_GLOBALOFF_Y__SHIFT) & A5XX_HLSQ_CS_NDRANGE_4_GLOBALOFF_Y__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_5				0x0000e7b5
-#define A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__MASK			0xffffffff
-#define A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__SHIFT			0
-static inline uint32_t A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z(uint32_t val)
+#define A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(uint32_t val)
 {
-	return ((val) << A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__SHIFT) & A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z__MASK;
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z__SHIFT) & A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z__MASK;
 }
 
 #define REG_A5XX_HLSQ_CS_NDRANGE_6				0x0000e7b6
+#define A5XX_HLSQ_CS_NDRANGE_6_GLOBALOFF_Z__MASK		0xffffffff
+#define A5XX_HLSQ_CS_NDRANGE_6_GLOBALOFF_Z__SHIFT		0
+static inline uint32_t A5XX_HLSQ_CS_NDRANGE_6_GLOBALOFF_Z(uint32_t val)
+{
+	return ((val) << A5XX_HLSQ_CS_NDRANGE_6_GLOBALOFF_Z__SHIFT) & A5XX_HLSQ_CS_NDRANGE_6_GLOBALOFF_Z__MASK;
+}
 
 #define REG_A5XX_HLSQ_CS_CNTL_0					0x0000e7b7
 #define A5XX_HLSQ_CS_CNTL_0_WGIDCONSTID__MASK			0x000000ff
@@ -4923,6 +5040,12 @@
 {
 	return ((val) << A5XX_TEX_CONST_0_MIPLVLS__SHIFT) & A5XX_TEX_CONST_0_MIPLVLS__MASK;
 }
+#define A5XX_TEX_CONST_0_SAMPLES__MASK				0x00300000
+#define A5XX_TEX_CONST_0_SAMPLES__SHIFT				20
+static inline uint32_t A5XX_TEX_CONST_0_SAMPLES(enum a3xx_msaa_samples val)
+{
+	return ((val) << A5XX_TEX_CONST_0_SAMPLES__SHIFT) & A5XX_TEX_CONST_0_SAMPLES__MASK;
+}
 #define A5XX_TEX_CONST_0_FMT__MASK				0x3fc00000
 #define A5XX_TEX_CONST_0_FMT__SHIFT				22
 static inline uint32_t A5XX_TEX_CONST_0_FMT(enum a5xx_tex_fmt val)
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
index 98b6d44..fee6ba3 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_blend.c
@@ -140,8 +140,10 @@
 	}
 
 	so->rb_blend_cntl = A5XX_RB_BLEND_CNTL_ENABLE_BLEND(mrt_blend) |
+		COND(cso->alpha_to_coverage, A5XX_RB_BLEND_CNTL_ALPHA_TO_COVERAGE) |
 		COND(cso->independent_blend_enable, A5XX_RB_BLEND_CNTL_INDEPENDENT_BLEND);
 	so->sp_blend_cntl = A5XX_SP_BLEND_CNTL_UNK8 |
+		COND(cso->alpha_to_coverage, A5XX_SP_BLEND_CNTL_ALPHA_TO_COVERAGE) |
 		COND(mrt_blend, A5XX_SP_BLEND_CNTL_ENABLED);
 
 	return so;
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
index 9d3039c..8e2c228 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_compute.c
@@ -70,13 +70,28 @@
 
 /* maybe move to fd5_program? */
 static void
-cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
+cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v,
+		const struct pipe_grid_info *info)
 {
+	const unsigned *local_size = info->block;
 	const struct ir3_info *i = &v->info;
 	enum a3xx_threadsize thrsz;
+	unsigned instrlen = v->instrlen;
 
-	/* note: blob uses local_size_x/y/z threshold to choose threadsize: */
-	thrsz = FOUR_QUADS;
+	/* if shader is more than 32*16 instructions, don't preload it.  Similar
+	 * to the combined restriction of 64*16 for VS+FS
+	 */
+	if (instrlen > 32)
+		instrlen = 0;
+
+	/* maybe the limit should be 1024.. basically if we can't have full
+	 * occupancy, use TWO_QUAD mode to reduce divergence penalty.
+	 */
+	if ((local_size[0] * local_size[1] * local_size[2]) < 512) {
+		thrsz = TWO_QUADS;
+	} else {
+		thrsz = FOUR_QUADS;
+	}
 
 	OUT_PKT4(ring, REG_A5XX_SP_SP_CNTL, 1);
 	OUT_RING(ring, 0x00000000);        /* SP_SP_CNTL */
@@ -99,7 +114,7 @@
 		A5XX_HLSQ_CS_CONFIG_ENABLED);
 
 	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CNTL, 1);
-	OUT_RING(ring, A5XX_HLSQ_CS_CNTL_INSTRLEN(v->instrlen) |
+	OUT_RING(ring, A5XX_HLSQ_CS_CNTL_INSTRLEN(instrlen) |
 		COND(v->has_ssbo, A5XX_HLSQ_CS_CNTL_SSBO_ENABLE));
 
 	OUT_PKT4(ring, REG_A5XX_SP_CS_CONFIG, 1);
@@ -110,7 +125,7 @@
 	unsigned constlen = align(v->constlen, 4) / 4;
 	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_CONSTLEN, 2);
 	OUT_RING(ring, constlen);          /* HLSQ_CS_CONSTLEN */
-	OUT_RING(ring, v->instrlen);       /* HLSQ_CS_INSTRLEN */
+	OUT_RING(ring, instrlen);          /* HLSQ_CS_INSTRLEN */
 
 	OUT_PKT4(ring, REG_A5XX_SP_CS_OBJ_START_LO, 2);
 	OUT_RELOC(ring, v->bo, 0, 0, 0);   /* SP_CS_OBJ_START_LO/HI */
@@ -129,7 +144,8 @@
 		A5XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
 	OUT_RING(ring, 0x1);               /* HLSQ_CS_CNTL_1 */
 
-	fd5_emit_shader(ring, v);
+	if (instrlen > 0)
+		fd5_emit_shader(ring, v);
 }
 
 static void
@@ -144,7 +160,7 @@
 	OUT_RING(ring, 0x0);
 
 	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_19);
+	OUT_RING(ring, PC_CCU_INVALIDATE_COLOR);
 
 	OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1);
 	OUT_RING(ring, 0x00000003);   /* PC_POWER_CNTL */
@@ -173,9 +189,11 @@
 	emit_setup(ctx);
 
 	v = ir3_shader_variant(so->shader, key, &ctx->debug);
+	if (!v)
+		return;
 
 	if (ctx->dirty_shader[PIPE_SHADER_COMPUTE] & FD_DIRTY_SHADER_PROG)
-		cs_program_emit(ring, v);
+		cs_program_emit(ring, v, info);
 
 	fd5_emit_cs_state(ctx, ring, v);
 	ir3_emit_cs_consts(v, ring, ctx, info);
@@ -206,12 +224,12 @@
 		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEX(local_size[0] - 1) |
 		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEY(local_size[1] - 1) |
 		A5XX_HLSQ_CS_NDRANGE_0_LOCALSIZEZ(local_size[2] - 1));
-	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_1_SIZE_X(local_size[0] * num_groups[0]));
-	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_2 */
-	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_3_SIZE_Y(local_size[1] * num_groups[1]));
-	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_4 */
-	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_5_SIZE_Z(local_size[2] * num_groups[2]));
-	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_6 */
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_1_GLOBALSIZE_X(local_size[0] * num_groups[0]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_2_GLOBALOFF_X */
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_3_GLOBALSIZE_Y(local_size[1] * num_groups[1]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_4_GLOBALOFF_Y */
+	OUT_RING(ring, A5XX_HLSQ_CS_NDRANGE_5_GLOBALSIZE_Z(local_size[2] * num_groups[2]));
+	OUT_RING(ring, 0);            /* HLSQ_CS_NDRANGE_6_GLOBALOFF_Z */
 
 	OUT_PKT4(ring, REG_A5XX_HLSQ_CS_KERNEL_GROUP_X, 3);
 	OUT_RING(ring, 1);            /* HLSQ_CS_KERNEL_GROUP_X */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_context.c b/src/gallium/drivers/freedreno/a5xx/fd5_context.c
index 426a8e0..c43a8ad 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_context.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_context.c
@@ -101,6 +101,8 @@
 	if (!pctx)
 		return NULL;
 
+	util_blitter_set_texture_multisample(fd5_ctx->base.blitter, true);
+
 	fd5_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000,
 			DRM_FREEDRENO_GEM_TYPE_KMEM);
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
index 8c3be5e..aa06b3c 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.c
@@ -106,8 +106,6 @@
 			.vclamp_color = ctx->rasterizer->clamp_vertex_color,
 			.fclamp_color = ctx->rasterizer->clamp_fragment_color,
 			.rasterflat = ctx->rasterizer->flatshade,
-			.half_precision = ctx->in_blit &&
-					fd_half_precision(&ctx->batch->framebuffer),
 			.ucp_enables = ctx->rasterizer->clip_plane_enable,
 			.has_per_samp = (fd5_ctx->fsaturate || fd5_ctx->vsaturate ||
 					fd5_ctx->fastc_srgb || fd5_ctx->vastc_srgb),
@@ -119,6 +117,8 @@
 			.fsaturate_r = fd5_ctx->fsaturate_r,
 			.vastc_srgb = fd5_ctx->vastc_srgb,
 			.fastc_srgb = fd5_ctx->fastc_srgb,
+			.vsamples = ctx->tex[PIPE_SHADER_VERTEX].samples,
+			.fsamples = ctx->tex[PIPE_SHADER_FRAGMENT].samples,
 		},
 		.rasterflat = ctx->rasterizer->flatshade,
 		.sprite_coord_enable = ctx->rasterizer->sprite_coord_enable,
@@ -136,6 +136,9 @@
 	if (!vp || !fp)
 		return false;
 
+	ctx->stats.vs_regs += ir3_shader_halfregs(vp);
+	ctx->stats.fs_regs += ir3_shader_halfregs(fp);
+
 	/* figure out whether we need to disable LRZ write for binning
 	 * pass using draw pass's fp:
 	 */
@@ -209,7 +212,8 @@
 	OUT_RING(ring, 0x20fffff);
 
 	OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1);
-	OUT_RING(ring, A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0));
+	OUT_RING(ring, A5XX_GRAS_SU_CNTL_LINEHALFWIDTH(0.0) |
+			COND(zsbuf->base.nr_samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE));
 
 	OUT_PKT4(ring, REG_A5XX_GRAS_CNTL, 1);
 	OUT_RING(ring, 0x00000000);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
index 3edfc39..c0d50b2 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_draw.h
@@ -32,6 +32,7 @@
 #include "freedreno_draw.h"
 
 #include "fd5_context.h"
+#include "fd5_screen.h"
 
 /* some bits in common w/ a4xx: */
 #include "a4xx/fd4_draw.h"
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
index b2809d6..bca8aeb 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.c
@@ -42,6 +42,7 @@
 #include "fd5_program.h"
 #include "fd5_rasterizer.h"
 #include "fd5_texture.h"
+#include "fd5_screen.h"
 #include "fd5_format.h"
 #include "fd5_zsa.h"
 
@@ -135,6 +136,7 @@
 	uint32_t fp32[4];
 	uint16_t ui16[4];
 	int16_t  si16[4];
+
 	uint16_t fp16[4];
 	uint16_t rgb565;
 	uint16_t rgb5a1;
@@ -144,7 +146,9 @@
 	int8_t   si8[4];
 	uint32_t rgb10a2;
 	uint32_t z24; /* also s8? */
-	uint8_t  __pad1[32];
+
+	uint16_t srgb[4];      /* appears to duplicate fp16[], but clamped, used for srgb */
+	uint8_t  __pad1[24];
 };
 
 #define FD5_BORDER_COLOR_SIZE        0x60
@@ -178,8 +182,9 @@
 		if ((i >= tex->num_textures) || !tex->textures[i])
 			continue;
 
+		enum pipe_format format = tex->textures[i]->format;
 		const struct util_format_description *desc =
-				util_format_description(tex->textures[i]->format);
+				util_format_description(format);
 
 		e->rgb565 = 0;
 		e->rgb5a1 = 0;
@@ -189,6 +194,24 @@
 
 		for (j = 0; j < 4; j++) {
 			int c = desc->swizzle[j];
+			int cd = c;
+
+			/*
+			 * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
+			 * stencil border color value in bc->ui[0] but according
+			 * to desc->swizzle and desc->channel, the .x component
+			 * is NONE and the stencil value is in the y component.
+			 * Meanwhile the hardware wants this in the .x component.
+			 */
+			if ((format == PIPE_FORMAT_X24S8_UINT) ||
+					(format == PIPE_FORMAT_X32_S8X24_UINT)) {
+				if (j == 0) {
+					c = 1;
+					cd = 0;
+				} else {
+					continue;
+				}
+			}
 
 			if (c >= 4)
 				continue;
@@ -222,8 +245,8 @@
 					clamped = 0;
 					break;
 				}
-				e->fp32[c] = bc->ui[j];
-				e->fp16[c] = clamped;
+				e->fp32[cd] = bc->ui[j];
+				e->fp16[cd] = clamped;
 			} else {
 				float f = bc->f[j];
 				float f_u = CLAMP(f, 0, 1);
@@ -231,6 +254,7 @@
 
 				e->fp32[c] = fui(f);
 				e->fp16[c] = util_float_to_half(f);
+				e->srgb[c] = util_float_to_half(f_u);
 				e->ui16[c] = f_u * 0xffff;
 				e->si16[c] = f_s * 0x7fff;
 				e->ui8[c]  = f_u * 0xff;
@@ -455,6 +479,13 @@
 			uint32_t size = fd_bo_size(rsc->bo) - off;
 			debug_assert(fmt != ~0);
 
+#ifdef DEBUG
+			/* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10
+			 */
+			if (off > fd_bo_size(rsc->bo))
+				continue;
+#endif
+
 			OUT_PKT4(ring, REG_A5XX_VFD_FETCH(j), 4);
 			OUT_RELOC(ring, rsc->bo, off, 0, 0);
 			OUT_RING(ring, size);           /* VFD_FETCH[j].SIZE */
@@ -610,7 +641,8 @@
 				fd5_rasterizer_stateobj(ctx->rasterizer);
 
 		OUT_PKT4(ring, REG_A5XX_GRAS_SU_CNTL, 1);
-		OUT_RING(ring, rasterizer->gras_su_cntl);
+		OUT_RING(ring, rasterizer->gras_su_cntl |
+				COND(pfb->samples > 1, A5XX_GRAS_SU_CNTL_MSAA_ENABLE));
 
 		OUT_PKT4(ring, REG_A5XX_GRAS_SU_POINT_MINMAX, 2);
 		OUT_RING(ring, rasterizer->gras_su_point_minmax);
@@ -667,43 +699,41 @@
 				A5XX_SP_FS_OUTPUT_CNTL_SAMPLEMASK_REGID(regid(63, 0)));
 	}
 
-	if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */
-		ir3_emit_vs_consts(vp, ring, ctx, emit->info);
-		if (!emit->key.binning_pass)
-			ir3_emit_fs_consts(fp, ring, ctx);
+	ir3_emit_vs_consts(vp, ring, ctx, emit->info);
+	if (!emit->key.binning_pass)
+		ir3_emit_fs_consts(fp, ring, ctx);
 
-		struct pipe_stream_output_info *info = &vp->shader->stream_output;
-		if (info->num_outputs) {
-			struct fd_streamout_stateobj *so = &ctx->streamout;
+	struct pipe_stream_output_info *info = &vp->shader->stream_output;
+	if (info->num_outputs) {
+		struct fd_streamout_stateobj *so = &ctx->streamout;
 
-			for (unsigned i = 0; i < so->num_targets; i++) {
-				struct pipe_stream_output_target *target = so->targets[i];
+		for (unsigned i = 0; i < so->num_targets; i++) {
+			struct pipe_stream_output_target *target = so->targets[i];
 
-				if (!target)
-					continue;
+			if (!target)
+				continue;
 
-				unsigned offset = (so->offsets[i] * info->stride[i] * 4) +
-						target->buffer_offset;
+			unsigned offset = (so->offsets[i] * info->stride[i] * 4) +
+					target->buffer_offset;
 
-				OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(i), 3);
-				/* VPC_SO[i].BUFFER_BASE_LO: */
-				OUT_RELOCW(ring, fd_resource(target->buffer)->bo, 0, 0, 0);
-				OUT_RING(ring, target->buffer_size + offset);
+			OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_BASE_LO(i), 3);
+			/* VPC_SO[i].BUFFER_BASE_LO: */
+			OUT_RELOCW(ring, fd_resource(target->buffer)->bo, 0, 0, 0);
+			OUT_RING(ring, target->buffer_size + offset);
 
-				OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(i), 3);
-				OUT_RING(ring, offset);
-				/* VPC_SO[i].FLUSH_BASE_LO/HI: */
-				// TODO just give hw a dummy addr for now.. we should
-				// be using this an then CP_MEM_TO_REG to set the
-				// VPC_SO[i].BUFFER_OFFSET for the next draw..
-				OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0x100, 0, 0);
+			OUT_PKT4(ring, REG_A5XX_VPC_SO_BUFFER_OFFSET(i), 3);
+			OUT_RING(ring, offset);
+			/* VPC_SO[i].FLUSH_BASE_LO/HI: */
+			// TODO just give hw a dummy addr for now.. we should
+			// be using this and then CP_MEM_TO_REG to set the
+			// VPC_SO[i].BUFFER_OFFSET for the next draw..
+			OUT_RELOCW(ring, fd5_context(ctx)->blit_mem, 0x100, 0, 0);
 
-				emit->streamout_mask |= (1 << i);
-			}
+			emit->streamout_mask |= (1 << i);
 		}
 	}
 
-	if ((dirty & FD_DIRTY_BLEND)) {
+	if (dirty & FD_DIRTY_BLEND) {
 		struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend);
 		uint32_t i;
 
@@ -733,14 +763,18 @@
 			OUT_RING(ring, blend_control);
 		}
 
-		OUT_PKT4(ring, REG_A5XX_RB_BLEND_CNTL, 1);
-		OUT_RING(ring, blend->rb_blend_cntl |
-				A5XX_RB_BLEND_CNTL_SAMPLE_MASK(0xffff));
-
 		OUT_PKT4(ring, REG_A5XX_SP_BLEND_CNTL, 1);
 		OUT_RING(ring, blend->sp_blend_cntl);
 	}
 
+	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK)) {
+		struct fd5_blend_stateobj *blend = fd5_blend_stateobj(ctx->blend);
+
+		OUT_PKT4(ring, REG_A5XX_RB_BLEND_CNTL, 1);
+		OUT_RING(ring, blend->rb_blend_cntl |
+				A5XX_RB_BLEND_CNTL_SAMPLE_MASK(ctx->sample_mask));
+	}
+
 	if (dirty & FD_DIRTY_BLEND_COLOR) {
 		struct pipe_blend_color *bcolor = &ctx->blend_color;
 
@@ -1018,10 +1052,10 @@
 	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E5DB, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E600, 1);
+	OUT_PKT4(ring, REG_A5XX_SP_HS_CTRL_REG0, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT4(ring, REG_A5XX_UNKNOWN_E640, 1);
+	OUT_PKT4(ring, REG_A5XX_SP_GS_CTRL_REG0, 1);
 	OUT_RING(ring, 0x00000000);
 
 	OUT_PKT4(ring, REG_A5XX_TPL1_VS_TEX_COUNT, 4);
@@ -1071,7 +1105,15 @@
 static void
 fd5_emit_ib(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
 {
+	/* for debug after a lock up, write a unique counter value
+	 * to scratch6 for each IB, to make it easier to match up
+	 * register dumps to cmdstream.  The combination of IB and
+	 * DRAW (scratch7) is enough to "triangulate" the particular
+	 * draw that caused lockup.
+	 */
+	emit_marker5(ring, 6);
 	__OUT_IB5(ring, target);
+	emit_marker5(ring, 6);
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
index 2d8a0fd..bed52d4 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_emit.h
@@ -33,6 +33,7 @@
 #include "fd5_context.h"
 #include "fd5_format.h"
 #include "fd5_program.h"
+#include "fd5_screen.h"
 #include "ir3_shader.h"
 
 struct fd_ringbuffer;
@@ -75,8 +76,8 @@
 fd5_emit_get_vp(struct fd5_emit *emit)
 {
 	if (!emit->vp) {
-		struct fd5_shader_stateobj *so = emit->prog->vp;
-		emit->vp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+		struct ir3_shader *shader = emit->prog->vp;
+		emit->vp = ir3_shader_variant(shader, emit->key, emit->debug);
 	}
 	return emit->vp;
 }
@@ -90,8 +91,8 @@
 			static const struct ir3_shader_variant binning_fp = {};
 			emit->fp = &binning_fp;
 		} else {
-			struct fd5_shader_stateobj *so = emit->prog->fp;
-			emit->fp = ir3_shader_variant(so->shader, emit->key, emit->debug);
+			struct ir3_shader *shader = emit->prog->fp;
+			emit->fp = ir3_shader_variant(shader, emit->key, emit->debug);
 		}
 	}
 	return emit->fp;
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
index ae73bc1..c367ecd 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_gmem.c
@@ -85,7 +85,7 @@
 					psurf->u.tex.first_layer);
 
 			if (gmem) {
-				stride = gmem->bin_w * rsc->cpp;
+				stride = gmem->bin_w * gmem->cbuf_cpp[i];
 				size = stride * gmem->bin_h;
 				base = gmem->cbuf_base[i];
 			} else {
@@ -580,21 +580,23 @@
 	emit_zs(ring, pfb->zsbuf, gmem);
 	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, gmem);
 
-	// TODO MSAA
+	enum a3xx_msaa_samples samples = fd_msaa_samples(pfb->samples);
+
 	OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2);
-	OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-	OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-			A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE);
+	OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(samples));
+	OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(samples) |
+			COND(samples == MSAA_ONE, A5XX_TPL1_TP_DEST_MSAA_CNTL_MSAA_DISABLE));
 
 	OUT_PKT4(ring, REG_A5XX_RB_RAS_MSAA_CNTL, 2);
-	OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-	OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-			A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE);
+	OUT_RING(ring, A5XX_RB_RAS_MSAA_CNTL_SAMPLES(samples));
+	OUT_RING(ring, A5XX_RB_DEST_MSAA_CNTL_SAMPLES(samples) |
+			COND(samples == MSAA_ONE, A5XX_RB_DEST_MSAA_CNTL_MSAA_DISABLE));
+
 
 	OUT_PKT4(ring, REG_A5XX_GRAS_SC_RAS_MSAA_CNTL, 2);
-	OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
-	OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
-			A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE);
+	OUT_RING(ring, A5XX_GRAS_SC_RAS_MSAA_CNTL_SAMPLES(samples));
+	OUT_RING(ring, A5XX_GRAS_SC_DEST_MSAA_CNTL_SAMPLES(samples) |
+			COND(samples == MSAA_ONE, A5XX_GRAS_SC_DEST_MSAA_CNTL_MSAA_DISABLE));
 }
 
 
@@ -640,6 +642,11 @@
 	OUT_PKT4(ring, REG_A5XX_RB_BLIT_CNTL, 1);
 	OUT_RING(ring, A5XX_RB_BLIT_CNTL_BUF(buf));
 
+//	bool msaa_resolve = pfb->samples > 1;
+	bool msaa_resolve = false;
+	OUT_PKT4(ring, REG_A5XX_RB_CLEAR_CNTL, 1);
+	OUT_RING(ring, COND(msaa_resolve, A5XX_RB_CLEAR_CNTL_MSAA_RESOLVE));
+
 	fd5_emit_blit(batch->ctx, ring);
 }
 
@@ -700,7 +707,7 @@
 	OUT_RING(ring, 0x0);
 
 	OUT_PKT7(ring, CP_EVENT_WRITE, 1);
-	OUT_RING(ring, UNK_19);
+	OUT_RING(ring, PC_CCU_INVALIDATE_COLOR);
 
 	OUT_PKT4(ring, REG_A5XX_PC_POWER_CNTL, 1);
 	OUT_RING(ring, 0x00000003);   /* PC_POWER_CNTL */
@@ -742,7 +749,6 @@
 	emit_zs(ring, pfb->zsbuf, NULL);
 	emit_mrt(ring, pfb->nr_cbufs, pfb->cbufs, NULL);
 
-	// TODO MSAA
 	OUT_PKT4(ring, REG_A5XX_TPL1_TP_RAS_MSAA_CNTL, 2);
 	OUT_RING(ring, A5XX_TPL1_TP_RAS_MSAA_CNTL_SAMPLES(MSAA_ONE));
 	OUT_RING(ring, A5XX_TPL1_TP_DEST_MSAA_CNTL_SAMPLES(MSAA_ONE) |
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_image.c b/src/gallium/drivers/freedreno/a5xx/fd5_image.c
index a945e7e..028497f 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_image.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_image.c
@@ -81,17 +81,43 @@
 		lvl = 0;
 		img->offset = pimg->u.buf.offset;
 		img->pitch  = pimg->u.buf.size;
-		img->array_pitch = 0;
 	} else {
 		lvl = pimg->u.tex.level;
-		img->offset = rsc->slices[lvl].offset;
+		img->offset = fd_resource_offset(rsc, lvl, pimg->u.tex.first_layer);
 		img->pitch  = rsc->slices[lvl].pitch * rsc->cpp;
-		img->array_pitch = rsc->layer_size;
 	}
 
 	img->width     = u_minify(prsc->width0, lvl);
 	img->height    = u_minify(prsc->height0, lvl);
-	img->depth     = u_minify(prsc->depth0, lvl);
+
+	unsigned layers = pimg->u.tex.last_layer - pimg->u.tex.first_layer + 1;
+
+	switch (prsc->target) {
+	case PIPE_TEXTURE_RECT:
+	case PIPE_TEXTURE_1D:
+	case PIPE_TEXTURE_2D:
+		img->array_pitch = rsc->layer_size;
+		img->depth = 1;
+		break;
+	case PIPE_TEXTURE_1D_ARRAY:
+	case PIPE_TEXTURE_2D_ARRAY:
+		img->array_pitch = rsc->layer_size;
+		img->depth = layers;
+		break;
+	case PIPE_TEXTURE_CUBE:
+	case PIPE_TEXTURE_CUBE_ARRAY:
+		img->array_pitch = rsc->layer_size;
+		img->depth = layers;
+		break;
+	case PIPE_TEXTURE_3D:
+		img->array_pitch = rsc->slices[lvl].size0;
+		img->depth = u_minify(prsc->depth0, lvl);
+		break;
+	default:
+		img->array_pitch = 0;
+		img->depth = 0;
+		break;
+	}
 }
 
 static void emit_image_tex(struct fd_ringbuffer *ring, unsigned slot,
@@ -134,19 +160,6 @@
 static void emit_image_ssbo(struct fd_ringbuffer *ring, unsigned slot,
 		struct fd5_image *img, enum pipe_shader_type shader)
 {
-	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 4);
-	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) |
-		CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
-		CP_LOAD_STATE4_0_STATE_BLOCK(imgsb[shader]) |
-		CP_LOAD_STATE4_0_NUM_UNIT(1));
-	OUT_RING(ring, CP_LOAD_STATE4_1_STATE_TYPE(0) |
-		CP_LOAD_STATE4_1_EXT_SRC_ADDR(0));
-	OUT_RING(ring, CP_LOAD_STATE4_2_EXT_SRC_ADDR_HI(0));
-	OUT_RING(ring, A5XX_SSBO_0_0_BASE_LO(0));
-	OUT_RING(ring, A5XX_SSBO_0_1_PITCH(img->pitch));
-	OUT_RING(ring, A5XX_SSBO_0_2_ARRAY_PITCH(img->array_pitch));
-	OUT_RING(ring, A5XX_SSBO_0_3_CPP(img->cpp));
-
 	OUT_PKT7(ring, CP_LOAD_STATE4, 3 + 2);
 	OUT_RING(ring, CP_LOAD_STATE4_0_DST_OFF(slot) |
 		CP_LOAD_STATE4_0_STATE_SRC(SS4_DIRECT) |
@@ -206,11 +219,10 @@
 		enum pipe_shader_type shader)
 {
 	struct fd_shaderimg_stateobj *so = &ctx->shaderimg[shader];
+	unsigned enabled_mask = so->enabled_mask;
 
-	so->dirty_mask &= so->enabled_mask;
-
-	while (so->dirty_mask) {
-		unsigned index = u_bit_scan(&so->dirty_mask);
+	while (enabled_mask) {
+		unsigned index = u_bit_scan(&enabled_mask);
 		unsigned slot = get_image_slot(index);
 		struct fd5_image img;
 
@@ -220,4 +232,3 @@
 		emit_image_ssbo(ring, slot, &img, shader);
 	}
 }
-
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c b/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c
new file mode 100644
index 0000000..cf5571d
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_perfcntr.c
@@ -0,0 +1,766 @@
+/*
+ * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FD5_PERFCNTR_H_
+#define FD5_PERFCNTR_H_
+
+#include "freedreno_perfcntr.h"
+#include "fd5_format.h"
+
+#define REG(_x) REG_A5XX_ ## _x
+
+#define COUNTER(_sel, _lo, _hi) {  \
+	.select_reg = REG(_sel),       \
+	.counter_reg_lo = REG(_lo),    \
+	.counter_reg_hi = REG(_hi),    \
+}
+
+#define COUNTER2(_sel, _lo, _hi, _en, _clr) { \
+	.select_reg     = REG(_sel),  \
+	.counter_reg_lo = REG(_lo),   \
+	.counter_reg_hi = REG(_hi),   \
+	.enable         = REG(_en),   \
+	.clear          = REG(_clr),  \
+}
+
+#define COUNTABLE(_selector, _query_type, _result_type) {            \
+	.name        = #_selector,                                       \
+	.selector    = _selector,                                        \
+	.query_type  = PIPE_DRIVER_QUERY_TYPE_ ## _query_type,           \
+	.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type,   \
+}
+
+#define GROUP(_name, _counters, _countables) {   \
+	.name           = _name,                     \
+	.num_counters   = ARRAY_SIZE(_counters),     \
+	.counters       = _counters,                 \
+	.num_countables = ARRAY_SIZE(_countables),   \
+	.countables     = _countables,               \
+}
+
+static const struct fd_perfcntr_counter cp_counters[] = {
+//RESERVED: for kernel
+//	COUNTER(CP_PERFCTR_CP_SEL_0, RBBM_PERFCTR_CP_0_LO, RBBM_PERFCTR_CP_0_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_1, RBBM_PERFCTR_CP_1_LO, RBBM_PERFCTR_CP_1_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_2, RBBM_PERFCTR_CP_2_LO, RBBM_PERFCTR_CP_2_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_3, RBBM_PERFCTR_CP_3_LO, RBBM_PERFCTR_CP_3_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_4, RBBM_PERFCTR_CP_4_LO, RBBM_PERFCTR_CP_4_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_5, RBBM_PERFCTR_CP_5_LO, RBBM_PERFCTR_CP_5_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_6, RBBM_PERFCTR_CP_6_LO, RBBM_PERFCTR_CP_6_HI),
+	COUNTER(CP_PERFCTR_CP_SEL_7, RBBM_PERFCTR_CP_7_LO, RBBM_PERFCTR_CP_7_HI),
+};
+
+static const struct fd_perfcntr_countable cp_countables[] = {
+	COUNTABLE(PERF_CP_ALWAYS_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_BUSY_GFX_CORE_IDLE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_IDLE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_BUSY_WORKING, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_STALL_CYCLES_ANY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_STARVE_CYCLES_ANY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_ICACHE_MISS, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_ICACHE_HIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PFP_MATCH_PM4_PKT_PROFILE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_BUSY_WORKING, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_IDLE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_STARVE_CYCLES_ANY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_IDLE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_FIFO_EMPTY_PFP_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_FIFO_FULL_ME_NON_WORKING, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_STALL_CYCLES_ANY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_ICACHE_MISS, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ME_ICACHE_HIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_NUM_PREEMPTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PREEMPTION_REACTION_DELAY, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PREEMPTION_SWITCH_OUT_TIME, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PREEMPTION_SWITCH_IN_TIME, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_DEAD_DRAWS_IN_BIN_RENDER, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_PREDICATED_DRAWS_KILLED, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_MODE_SWITCH, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_ZPASS_DONE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_CONTEXT_DONE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_CACHE_FLUSH, UINT64, AVERAGE),
+	COUNTABLE(PERF_CP_LONG_PREEMPTIONS, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter ccu_counters[] = {
+	COUNTER(RB_PERFCTR_CCU_SEL_0, RBBM_PERFCTR_CCU_0_LO, RBBM_PERFCTR_CCU_0_HI),
+	COUNTER(RB_PERFCTR_CCU_SEL_1, RBBM_PERFCTR_CCU_1_LO, RBBM_PERFCTR_CCU_1_HI),
+	COUNTER(RB_PERFCTR_CCU_SEL_2, RBBM_PERFCTR_CCU_2_LO, RBBM_PERFCTR_CCU_2_HI),
+	COUNTER(RB_PERFCTR_CCU_SEL_3, RBBM_PERFCTR_CCU_3_LO, RBBM_PERFCTR_CCU_3_HI),
+};
+
+static const struct fd_perfcntr_countable ccu_countables[] = {
+	COUNTABLE(PERF_CCU_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_STALL_CYCLES_RB_DEPTH_RETURN, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_STALL_CYCLES_RB_COLOR_RETURN, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_STARVE_CYCLES_FLAG_RETURN, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_BLOCKS, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_BLOCKS, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_BLOCK_HIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_BLOCK_HIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_PARTIAL_BLOCK_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_GMEM_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_GMEM_WRITE, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_READ_FLAG0_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_READ_FLAG1_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_READ_FLAG2_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_READ_FLAG3_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_DEPTH_READ_FLAG4_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_READ_FLAG0_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_READ_FLAG1_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_READ_FLAG2_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_READ_FLAG3_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_COLOR_READ_FLAG4_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_2D_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_2D_RD_REQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_2D_WR_REQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_2D_REORDER_STARVE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_CCU_2D_PIXELS, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter tse_counters[] = {
+	COUNTER(GRAS_PERFCTR_TSE_SEL_0, RBBM_PERFCTR_TSE_0_LO, RBBM_PERFCTR_TSE_0_HI),
+	COUNTER(GRAS_PERFCTR_TSE_SEL_1, RBBM_PERFCTR_TSE_1_LO, RBBM_PERFCTR_TSE_1_HI),
+	COUNTER(GRAS_PERFCTR_TSE_SEL_2, RBBM_PERFCTR_TSE_2_LO, RBBM_PERFCTR_TSE_2_HI),
+	COUNTER(GRAS_PERFCTR_TSE_SEL_3, RBBM_PERFCTR_TSE_3_LO, RBBM_PERFCTR_TSE_3_HI),
+};
+
+static const struct fd_perfcntr_countable tse_countables[] = {
+	COUNTABLE(PERF_TSE_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_CLIPPING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_STALL_CYCLES_RAS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_BARYPLANE, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_STALL_CYCLES_LRZ_ZPLANE, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_STARVE_CYCLES_PC, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_INPUT_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_INPUT_NULL_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_TRIVAL_REJ_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_CLIPPED_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_ZERO_AREA_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_FACENESS_CULLED_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_ZERO_PIXEL_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_OUTPUT_NULL_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_OUTPUT_VISIBLE_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_CINVOCATION, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_CPRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_2D_INPUT_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_TSE_2D_ALIVE_CLCLES, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter ras_counters[] = {
+	COUNTER(GRAS_PERFCTR_RAS_SEL_0, RBBM_PERFCTR_RAS_0_LO, RBBM_PERFCTR_RAS_0_HI),
+	COUNTER(GRAS_PERFCTR_RAS_SEL_1, RBBM_PERFCTR_RAS_1_LO, RBBM_PERFCTR_RAS_1_HI),
+	COUNTER(GRAS_PERFCTR_RAS_SEL_2, RBBM_PERFCTR_RAS_2_LO, RBBM_PERFCTR_RAS_2_HI),
+	COUNTER(GRAS_PERFCTR_RAS_SEL_3, RBBM_PERFCTR_RAS_3_LO, RBBM_PERFCTR_RAS_3_HI),
+};
+
+static const struct fd_perfcntr_countable ras_countables[] = {
+	COUNTABLE(PERF_RAS_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_SUPERTILE_ACTIVE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_STALL_CYCLES_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_STARVE_CYCLES_TSE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_SUPER_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_8X4_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_MASKGEN_ACTIVE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_FULLY_COVERED_SUPER_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_FULLY_COVERED_8X4_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RAS_PRIM_KILLED_INVISILBE, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter lrz_counters[] = {
+	COUNTER(GRAS_PERFCTR_LRZ_SEL_0, RBBM_PERFCTR_LRZ_0_LO, RBBM_PERFCTR_LRZ_0_HI),
+	COUNTER(GRAS_PERFCTR_LRZ_SEL_1, RBBM_PERFCTR_LRZ_1_LO, RBBM_PERFCTR_LRZ_1_HI),
+	COUNTER(GRAS_PERFCTR_LRZ_SEL_2, RBBM_PERFCTR_LRZ_2_LO, RBBM_PERFCTR_LRZ_2_HI),
+	COUNTER(GRAS_PERFCTR_LRZ_SEL_3, RBBM_PERFCTR_LRZ_3_LO, RBBM_PERFCTR_LRZ_3_HI),
+};
+
+static const struct fd_perfcntr_countable lrz_countables[] = {
+	COUNTABLE(PERF_LRZ_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STARVE_CYCLES_RAS, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STALL_CYCLES_RB, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STALL_CYCLES_VSC, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STALL_CYCLES_VPC, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STALL_CYCLES_FLAG_PREFETCH, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_LRZ_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_LRZ_WRITE, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_READ_LATENCY, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_MERGE_CACHE_UPDATING, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_MASKGEN, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_PRIM_KILLED_BY_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_VISIBLE_PRIM_AFTER_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_FULL_8X8_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_PARTIAL_8X8_TILES, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_TILE_KILLED, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_TOTAL_PIXEL, UINT64, AVERAGE),
+	COUNTABLE(PERF_LRZ_VISIBLE_PIXEL_AFTER_LRZ, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter hlsq_counters[] = {
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_0, RBBM_PERFCTR_HLSQ_0_LO, RBBM_PERFCTR_HLSQ_0_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_1, RBBM_PERFCTR_HLSQ_1_LO, RBBM_PERFCTR_HLSQ_1_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_2, RBBM_PERFCTR_HLSQ_2_LO, RBBM_PERFCTR_HLSQ_2_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_3, RBBM_PERFCTR_HLSQ_3_LO, RBBM_PERFCTR_HLSQ_3_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_4, RBBM_PERFCTR_HLSQ_4_LO, RBBM_PERFCTR_HLSQ_4_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_5, RBBM_PERFCTR_HLSQ_5_LO, RBBM_PERFCTR_HLSQ_5_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_6, RBBM_PERFCTR_HLSQ_6_LO, RBBM_PERFCTR_HLSQ_6_HI),
+	COUNTER(HLSQ_PERFCTR_HLSQ_SEL_7, RBBM_PERFCTR_HLSQ_7_LO, RBBM_PERFCTR_HLSQ_7_HI),
+};
+
+static const struct fd_perfcntr_countable hlsq_countables[] = {
+	COUNTABLE(PERF_HLSQ_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_STATE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_STALL_CYCLES_SP_FS_STAGE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_UCHE_LATENCY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_UCHE_LATENCY_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_FS_STAGE_32_WAVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_FS_STAGE_64_WAVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_QUADS, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_SP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_FS_STAGE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_TP_STATE_COPY_TRANS_VS_STAGE, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_CS_INVOCATIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_HLSQ_COMPUTE_DRAWCALLS, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter pc_counters[] = {
+	COUNTER(PC_PERFCTR_PC_SEL_0, RBBM_PERFCTR_PC_0_LO, RBBM_PERFCTR_PC_0_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_1, RBBM_PERFCTR_PC_1_LO, RBBM_PERFCTR_PC_1_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_2, RBBM_PERFCTR_PC_2_LO, RBBM_PERFCTR_PC_2_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_3, RBBM_PERFCTR_PC_3_LO, RBBM_PERFCTR_PC_3_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_4, RBBM_PERFCTR_PC_4_LO, RBBM_PERFCTR_PC_4_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_5, RBBM_PERFCTR_PC_5_LO, RBBM_PERFCTR_PC_5_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_6, RBBM_PERFCTR_PC_6_LO, RBBM_PERFCTR_PC_6_HI),
+	COUNTER(PC_PERFCTR_PC_SEL_7, RBBM_PERFCTR_PC_7_LO, RBBM_PERFCTR_PC_7_HI),
+};
+
+static const struct fd_perfcntr_countable pc_countables[] = {
+	COUNTABLE(PERF_PC_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_VFD, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_TSE, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_VPC, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_TESS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_TSE_ONLY, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STALL_CYCLES_VPC_ONLY, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_PASS1_TF_STALL_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_INDEX, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_TESS_FACTOR, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_VIZ_STREAM, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STARVE_CYCLES_FOR_POSITION, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_STARVE_CYCLES_DI, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_VIS_STREAMS_LOADED, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_INSTANCES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_VPC_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_DEAD_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_LIVE_PRIM, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_VERTEX_HITS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_IA_VERTICES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_IA_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_GS_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_HS_INVOCATIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_DS_INVOCATIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_VS_INVOCATIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_GS_INVOCATIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_DS_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_VPC_POS_DATA_TRANSACTION, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_3D_DRAWCALLS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_2D_DRAWCALLS, UINT64, AVERAGE),
+	COUNTABLE(PERF_PC_NON_DRAWCALL_GLOBAL_EVENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TESS_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TESS_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TESS_STALL_CYCLES_PC, UINT64, AVERAGE),
+	COUNTABLE(PERF_TESS_STARVE_CYCLES_PC, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter rb_counters[] = {
+	COUNTER(RB_PERFCTR_RB_SEL_0, RBBM_PERFCTR_RB_0_LO, RBBM_PERFCTR_RB_0_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_1, RBBM_PERFCTR_RB_1_LO, RBBM_PERFCTR_RB_1_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_2, RBBM_PERFCTR_RB_2_LO, RBBM_PERFCTR_RB_2_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_3, RBBM_PERFCTR_RB_3_LO, RBBM_PERFCTR_RB_3_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_4, RBBM_PERFCTR_RB_4_LO, RBBM_PERFCTR_RB_4_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_5, RBBM_PERFCTR_RB_5_LO, RBBM_PERFCTR_RB_5_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_6, RBBM_PERFCTR_RB_6_LO, RBBM_PERFCTR_RB_6_HI),
+	COUNTER(RB_PERFCTR_RB_SEL_7, RBBM_PERFCTR_RB_7_LO, RBBM_PERFCTR_RB_7_HI),
+};
+
+static const struct fd_perfcntr_countable rb_countables[] = {
+	COUNTABLE(PERF_RB_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STALL_CYCLES_CCU, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STALL_CYCLES_HLSQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STALL_CYCLES_FIFO0_FULL, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STALL_CYCLES_FIFO1_FULL, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STALL_CYCLES_FIFO2_FULL, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STARVE_CYCLES_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STARVE_CYCLES_LRZ_TILE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STARVE_CYCLES_CCU, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STARVE_CYCLES_Z_PLANE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_STARVE_CYCLES_BARY_PLANE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_Z_WORKLOAD, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_HLSQ_ACTIVE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_Z_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_Z_WRITE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_C_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_C_WRITE, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_TOTAL_PASS, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_Z_PASS, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_Z_FAIL, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_S_FAIL, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_BLENDED_FXP_COMPONENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_BLENDED_FP16_COMPONENTS, UINT64, AVERAGE),
+	COUNTABLE(RB_RESERVED, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_ALIVE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_STALL_CYCLES_A2D, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SRC, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_STARVE_CYCLES_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_STARVE_CYCLES_DST, UINT64, AVERAGE),
+	COUNTABLE(PERF_RB_2D_VALID_PIXELS, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter rbbm_counters[] = {
+/* counter 0 is RESERVED for the kernel, so it is left out of this table: */
+//	COUNTER(RBBM_PERFCTR_RBBM_SEL_0, RBBM_PERFCTR_RBBM_0_LO, RBBM_PERFCTR_RBBM_0_HI),
+	COUNTER(RBBM_PERFCTR_RBBM_SEL_1, RBBM_PERFCTR_RBBM_1_LO, RBBM_PERFCTR_RBBM_1_HI),
+	COUNTER(RBBM_PERFCTR_RBBM_SEL_2, RBBM_PERFCTR_RBBM_2_LO, RBBM_PERFCTR_RBBM_2_HI),
+	COUNTER(RBBM_PERFCTR_RBBM_SEL_3, RBBM_PERFCTR_RBBM_3_LO, RBBM_PERFCTR_RBBM_3_HI),
+};
+
+static const struct fd_perfcntr_countable rbbm_countables[] = {
+	COUNTABLE(PERF_RBBM_ALWAYS_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_ALWAYS_ON, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_TSE_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_RAS_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_PC_DCALL_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_PC_VSD_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_STATUS_MASKED, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_COM_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_DCOM_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_VBIF_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_VSC_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_TESS_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_UCHE_BUSY, UINT64, AVERAGE),
+	COUNTABLE(PERF_RBBM_HLSQ_BUSY, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter sp_counters[] = {
+/* counter 0 is RESERVED for the kernel, so it is left out of this table: */
+//	COUNTER(SP_PERFCTR_SP_SEL_0,  RBBM_PERFCTR_SP_0_LO,  RBBM_PERFCTR_SP_0_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_1,  RBBM_PERFCTR_SP_1_LO,  RBBM_PERFCTR_SP_1_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_2,  RBBM_PERFCTR_SP_2_LO,  RBBM_PERFCTR_SP_2_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_3,  RBBM_PERFCTR_SP_3_LO,  RBBM_PERFCTR_SP_3_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_4,  RBBM_PERFCTR_SP_4_LO,  RBBM_PERFCTR_SP_4_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_5,  RBBM_PERFCTR_SP_5_LO,  RBBM_PERFCTR_SP_5_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_6,  RBBM_PERFCTR_SP_6_LO,  RBBM_PERFCTR_SP_6_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_7,  RBBM_PERFCTR_SP_7_LO,  RBBM_PERFCTR_SP_7_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_8,  RBBM_PERFCTR_SP_8_LO,  RBBM_PERFCTR_SP_8_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_9,  RBBM_PERFCTR_SP_9_LO,  RBBM_PERFCTR_SP_9_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_10, RBBM_PERFCTR_SP_10_LO, RBBM_PERFCTR_SP_10_HI),
+	COUNTER(SP_PERFCTR_SP_SEL_11, RBBM_PERFCTR_SP_11_LO, RBBM_PERFCTR_SP_11_HI),
+};
+
+static const struct fd_perfcntr_countable sp_countables[] = {
+	COUNTABLE(PERF_SP_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ALU_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_EFU_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_STALL_CYCLES_VPC, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_STALL_CYCLES_TP, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_STALL_CYCLES_RB, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_SCHEDULER_NON_WORKING, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_CONTEXTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_CONTEXT_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_WAVE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_WAVE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_WAVE_SAMPLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_DURATION_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_DURATION_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_CTRL_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_LOAD_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_EMIT_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_NOP_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_WAIT_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_FETCH_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_IDLE_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_END_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_LONG_SYNC_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_SHORT_SYNC_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_WAVE_JOIN_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_LOAD_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_STORE_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_ATOMICS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GM_LOAD_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GM_STORE_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GM_ATOMICS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_TEX_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_CFLOW_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_EFU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_FULL_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_HALF_ALU_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_STAGE_BARY_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_VS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_FS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ADDR_LOCK_COUNT, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_UCHE_READ_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_UCHE_WRITE_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_EXPORT_VPC_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_EXPORT_RB_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_PIXELS_KILLED, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ICL1_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ICL1_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ICL0_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_ICL0_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_HS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_DS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_CS_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GPR_READ, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_GPR_WRITE, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_CH0_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_CH1_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_SP_LM_BANK_CONFLICTS, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter tp_counters[] = {
+	COUNTER(TPL1_PERFCTR_TP_SEL_0, RBBM_PERFCTR_TP_0_LO, RBBM_PERFCTR_TP_0_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_1, RBBM_PERFCTR_TP_1_LO, RBBM_PERFCTR_TP_1_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_2, RBBM_PERFCTR_TP_2_LO, RBBM_PERFCTR_TP_2_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_3, RBBM_PERFCTR_TP_3_LO, RBBM_PERFCTR_TP_3_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_4, RBBM_PERFCTR_TP_4_LO, RBBM_PERFCTR_TP_4_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_5, RBBM_PERFCTR_TP_5_LO, RBBM_PERFCTR_TP_5_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_6, RBBM_PERFCTR_TP_6_LO, RBBM_PERFCTR_TP_6_HI),
+	COUNTER(TPL1_PERFCTR_TP_SEL_7, RBBM_PERFCTR_TP_7_LO, RBBM_PERFCTR_TP_7_HI),
+};
+
+static const struct fd_perfcntr_countable tp_countables[] = {
+	COUNTABLE(PERF_TP_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_LATENCY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_LATENCY_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_SAMPLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FLAG_CACHE_REQUEST_LATENCY, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_L1_CACHELINE_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_L1_CACHELINE_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_SP_TP_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_TP_SP_TRANS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_RECEIVED, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_OFFSET, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_SHADOW, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_ARRAY, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_GRADIENT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_1D, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_2D, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_BUFFER, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_3D, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_QUADS_CUBE, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_STATE_CACHE_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_STATE_CACHE_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_DIVERGENT_QUADS_RECEIVED, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_BINDLESS_STATE_CACHE_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_PRT_NON_RESIDENT_EVENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS_POINT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS_MIP, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS_ANISO, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_OUTPUT_PIXELS_ZERO_LOD, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FLAG_CACHE_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_FLAG_CACHE_MISSES, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_L1_5_L2_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_POINT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_2D_OUTPUT_PIXELS_BILINEAR, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_16BIT, UINT64, AVERAGE),
+	COUNTABLE(PERF_TP_2D_FILTER_WORKLOAD_32BIT, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter uche_counters[] = {
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_0, RBBM_PERFCTR_UCHE_0_LO, RBBM_PERFCTR_UCHE_0_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_1, RBBM_PERFCTR_UCHE_1_LO, RBBM_PERFCTR_UCHE_1_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_2, RBBM_PERFCTR_UCHE_2_LO, RBBM_PERFCTR_UCHE_2_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_3, RBBM_PERFCTR_UCHE_3_LO, RBBM_PERFCTR_UCHE_3_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_4, RBBM_PERFCTR_UCHE_4_LO, RBBM_PERFCTR_UCHE_4_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_5, RBBM_PERFCTR_UCHE_5_LO, RBBM_PERFCTR_UCHE_5_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_6, RBBM_PERFCTR_UCHE_6_LO, RBBM_PERFCTR_UCHE_6_HI),
+	COUNTER(UCHE_PERFCTR_UCHE_SEL_7, RBBM_PERFCTR_UCHE_7_LO, RBBM_PERFCTR_UCHE_7_HI),
+};
+
+static const struct fd_perfcntr_countable uche_countables[] = {
+	COUNTABLE(PERF_UCHE_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_STALL_CYCLES_VBIF, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_LATENCY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_LATENCY_SAMPLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_TP, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_VFD, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_HLSQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_READ_REQUESTS_TP, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_READ_REQUESTS_VFD, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_READ_REQUESTS_HLSQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_READ_REQUESTS_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_READ_REQUESTS_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_WRITE_REQUESTS_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_WRITE_REQUESTS_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VPC, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_WRITE_REQUESTS_VSC, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_EVICTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ0, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ1, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ2, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ3, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ4, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ5, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ6, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_BANK_REQ7, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH0, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_VBIF_READ_BEATS_CH1, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_GMEM_READ_BEATS, UINT64, AVERAGE),
+	COUNTABLE(PERF_UCHE_FLAG_COUNT, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vfd_counters[] = {
+	COUNTER(VFD_PERFCTR_VFD_SEL_0, RBBM_PERFCTR_VFD_0_LO, RBBM_PERFCTR_VFD_0_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_1, RBBM_PERFCTR_VFD_1_LO, RBBM_PERFCTR_VFD_1_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_2, RBBM_PERFCTR_VFD_2_LO, RBBM_PERFCTR_VFD_2_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_3, RBBM_PERFCTR_VFD_3_LO, RBBM_PERFCTR_VFD_3_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_4, RBBM_PERFCTR_VFD_4_LO, RBBM_PERFCTR_VFD_4_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_5, RBBM_PERFCTR_VFD_5_LO, RBBM_PERFCTR_VFD_5_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_6, RBBM_PERFCTR_VFD_6_LO, RBBM_PERFCTR_VFD_6_HI),
+	COUNTER(VFD_PERFCTR_VFD_SEL_7, RBBM_PERFCTR_VFD_7_LO, RBBM_PERFCTR_VFD_7_HI),
+};
+
+static const struct fd_perfcntr_countable vfd_countables[] = {
+	COUNTABLE(PERF_VFD_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_VPC_ALLOC, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_VB, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_MISS_Q, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_SP_INFO, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_SP_ATTR, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_VB, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STALL_CYCLES_VFDP_Q, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_DECODER_PACKER_STALL, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_STARVE_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_RBUFFER_FULL, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_ATTR_INFO_FIFO_FULL, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_DECODED_ATTRIBUTE_BYTES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_NUM_ATTRIBUTES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_INSTRUCTIONS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_UPPER_SHADER_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_LOWER_SHADER_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_MODE_0_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_MODE_1_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_MODE_2_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_MODE_3_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_MODE_4_FIBERS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_TOTAL_VERTICES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_NUM_ATTR_MISS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFD_1_BURST_REQ, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_INDEX, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFDP_STALL_CYCLES_VFD_PROG, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFDP_STARVE_CYCLES_PC, UINT64, AVERAGE),
+	COUNTABLE(PERF_VFDP_VS_STAGE_32_WAVES, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vpc_counters[] = {
+	COUNTER(VPC_PERFCTR_VPC_SEL_0, RBBM_PERFCTR_VPC_0_LO, RBBM_PERFCTR_VPC_0_HI),
+	COUNTER(VPC_PERFCTR_VPC_SEL_1, RBBM_PERFCTR_VPC_1_LO, RBBM_PERFCTR_VPC_1_HI),
+	COUNTER(VPC_PERFCTR_VPC_SEL_2, RBBM_PERFCTR_VPC_2_LO, RBBM_PERFCTR_VPC_2_HI),
+	COUNTER(VPC_PERFCTR_VPC_SEL_3, RBBM_PERFCTR_VPC_3_LO, RBBM_PERFCTR_VPC_3_HI),
+};
+
+static const struct fd_perfcntr_countable vpc_countables[] = {
+	COUNTABLE(PERF_VPC_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STALL_CYCLES_VFD_WACK, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STALL_CYCLES_HLSQ_PRIM_ALLOC, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STALL_CYCLES_PC, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STALL_CYCLES_SP_LM, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_POS_EXPORT_STALL_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STARVE_CYCLES_SP, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STARVE_CYCLES_LRZ, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_PC_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_SP_COMPONENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_SP_LM_PRIMITIVES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_SP_LM_COMPONENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_SP_LM_DWORDS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_STREAMOUT_COMPONENTS, UINT64, AVERAGE),
+	COUNTABLE(PERF_VPC_GRANT_PHASES, UINT64, AVERAGE),
+};
+
+static const struct fd_perfcntr_counter vsc_counters[] = {
+	COUNTER(VSC_PERFCTR_VSC_SEL_0, RBBM_PERFCTR_VSC_0_LO, RBBM_PERFCTR_VSC_0_HI),
+	COUNTER(VSC_PERFCTR_VSC_SEL_1, RBBM_PERFCTR_VSC_1_LO, RBBM_PERFCTR_VSC_1_HI),
+};
+
+static const struct fd_perfcntr_countable vsc_countables[] = {
+	COUNTABLE(PERF_VSC_BUSY_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VSC_WORKING_CYCLES, UINT64, AVERAGE),
+	COUNTABLE(PERF_VSC_STALL_CYCLES_UCHE, UINT64, AVERAGE),
+	COUNTABLE(PERF_VSC_EOT_NUM, UINT64, AVERAGE),
+};
+
+/* VBIF counters are probably not too useful for userspace, and they make
+ * frameretrace take many more passes to collect all the metrics, so
+ * for now let's hide them.
+ */
+#if 0
+/* VBIF counters break the pattern a bit, with enable and clear regs: */
+static const struct fd_perfcntr_counter vbif_counters[] = {
+	COUNTER2(VBIF_PERF_CNT_SEL0, VBIF_PERF_CNT_LOW0, VBIF_PERF_CNT_HIGH0, VBIF_PERF_CNT_EN0, VBIF_PERF_CNT_CLR0),
+	COUNTER2(VBIF_PERF_CNT_SEL1, VBIF_PERF_CNT_LOW1, VBIF_PERF_CNT_HIGH1, VBIF_PERF_CNT_EN1, VBIF_PERF_CNT_CLR1),
+	COUNTER2(VBIF_PERF_CNT_SEL2, VBIF_PERF_CNT_LOW2, VBIF_PERF_CNT_HIGH2, VBIF_PERF_CNT_EN2, VBIF_PERF_CNT_CLR2),
+	COUNTER2(VBIF_PERF_CNT_SEL3, VBIF_PERF_CNT_LOW3, VBIF_PERF_CNT_HIGH3, VBIF_PERF_CNT_EN3, VBIF_PERF_CNT_CLR3),
+};
+
+static const struct fd_perfcntr_countable vbif_countables[] = {
+	COUNTABLE(AXI_READ_REQUESTS_ID_0, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_1, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_2, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_3, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_4, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_5, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_6, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_7, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_8, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_9, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_10, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_11, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_12, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_13, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_14, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_ID_15, UINT64, AVERAGE),
+	COUNTABLE(AXI0_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI1_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI2_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI3_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_0, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_1, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_2, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_3, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_4, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_5, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_6, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_7, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_8, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_9, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_10, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_11, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_12, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_13, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_14, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_ID_15, UINT64, AVERAGE),
+	COUNTABLE(AXI0_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI1_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI2_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI3_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_REQUESTS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_TOTAL_REQUESTS, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_0, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_1, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_2, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_3, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_4, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_5, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_6, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_7, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_8, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_9, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_10, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_11, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_12, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_13, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_14, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_ID_15, UINT64, AVERAGE),
+	COUNTABLE(AXI0_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI1_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI2_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI3_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_READ_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_0, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_1, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_2, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_3, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_4, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_5, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_6, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_7, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_8, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_9, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_10, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_11, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_12, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_13, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_14, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_ID_15, UINT64, AVERAGE),
+	COUNTABLE(AXI0_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI1_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI2_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI3_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_WRITE_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+	COUNTABLE(AXI_DATA_BEATS_TOTAL, UINT64, AVERAGE),
+};
+#endif
+
+const struct fd_perfcntr_group a5xx_perfcntr_groups[] = {  /* one GROUP(name, counters table, countables table) per block */
+	GROUP("CP", cp_counters, cp_countables),
+	GROUP("CCU", ccu_counters, ccu_countables),
+	GROUP("TSE", tse_counters, tse_countables),
+	GROUP("RAS", ras_counters, ras_countables),
+	GROUP("LRZ", lrz_counters, lrz_countables),
+	GROUP("HLSQ", hlsq_counters, hlsq_countables),
+	GROUP("PC", pc_counters, pc_countables),
+	GROUP("RB", rb_counters, rb_countables),
+	GROUP("RBBM", rbbm_counters, rbbm_countables),
+	GROUP("SP", sp_counters, sp_countables),
+	GROUP("TP", tp_counters, tp_countables),
+	GROUP("UCHE", uche_counters, uche_countables),
+	GROUP("VFD", vfd_counters, vfd_countables),
+	GROUP("VPC", vpc_counters, vpc_countables),
+	GROUP("VSC", vsc_counters, vsc_countables),
+//	GROUP("VBIF", vbif_counters, vbif_countables),  -- disabled: the vbif tables are compiled out with #if 0
+};
+
+const unsigned a5xx_num_perfcntr_groups = ARRAY_SIZE(a5xx_perfcntr_groups);
+
+#endif /* FD5_PERFCNTR_H_ */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.c b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
index 81fe7d4..286411e 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.c
@@ -38,22 +38,13 @@
 #include "fd5_texture.h"
 #include "fd5_format.h"
 
-static void
-delete_shader_stateobj(struct fd5_shader_stateobj *so)
-{
-	ir3_shader_destroy(so->shader);
-	free(so);
-}
-
-static struct fd5_shader_stateobj *
+static struct ir3_shader *
 create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
 		enum shader_t type)
 {
 	struct fd_context *ctx = fd_context(pctx);
 	struct ir3_compiler *compiler = ctx->screen->compiler;
-	struct fd5_shader_stateobj *so = CALLOC_STRUCT(fd5_shader_stateobj);
-	so->shader = ir3_shader_create(compiler, cso, type, &ctx->debug);
-	return so;
+	return ir3_shader_create(compiler, cso, type, &ctx->debug);
 }
 
 static void *
@@ -66,8 +57,8 @@
 static void
 fd5_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd5_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 static void *
@@ -80,8 +71,8 @@
 static void
 fd5_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-	struct fd5_shader_stateobj *so = hwcso;
-	delete_shader_stateobj(so);
+	struct ir3_shader *so = hwcso;
+	ir3_shader_destroy(so);
 }
 
 void
@@ -329,7 +320,7 @@
 {
 	struct stage s[MAX_STAGES];
 	uint32_t pos_regid, psize_regid, color_regid[8];
-	uint32_t face_regid, coord_regid, zwcoord_regid;
+	uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid, samp_mask_regid;
 	uint32_t vcoord_regid, vertex_regid, instance_regid;
 	enum a3xx_threadsize fssz;
 	uint8_t psize_loc = ~0;
@@ -359,6 +350,8 @@
 		color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
 	}
 
+	samp_id_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID);
+	samp_mask_regid = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN);
 	/* TODO get these dynamically: */
 	face_regid = s[FS].v->frag_face ? regid(0,0) : regid(63,0);
 	coord_regid = s[FS].v->frag_coord ? regid(0,0) : regid(63,0);
@@ -557,7 +550,9 @@
 			0x00000880);               /* XXX HLSQ_CONTROL_0 */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_1_REG_PRIMALLOCTHRESHOLD(63));
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
-			0xfcfcfc00);               /* XXX */
+			A5XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
+			A5XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(samp_mask_regid) |
+			0xfc000000);               /* XXX */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_3_REG_FRAGCOORDXYREGID(vcoord_regid) |
 			0xfcfcfc00);               /* XXX */
 	OUT_RING(ring, A5XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
@@ -600,7 +595,12 @@
 					A5XX_RB_RENDER_CONTROL0_WCOORD |
 					A5XX_RB_RENDER_CONTROL0_UNK3) |
 			COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL0_UNK3));
-	OUT_RING(ring, COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS));
+	OUT_RING(ring,
+			COND(samp_mask_regid != regid(63, 0),
+				A5XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
+			COND(s[FS].v->frag_face, A5XX_RB_RENDER_CONTROL1_FACENESS) |
+			COND(samp_id_regid != regid(63, 0),
+				A5XX_RB_RENDER_CONTROL1_SAMPLEID));
 
 	OUT_PKT4(ring, REG_A5XX_SP_FS_OUTPUT_REG(0), 8);
 	for (i = 0; i < 8; i++) {
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_program.h b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
index 585263e..72cbf9a 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_program.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_program.h
@@ -31,10 +31,6 @@
 #include "freedreno_context.h"
 #include "ir3_shader.h"
 
-struct fd5_shader_stateobj {
-	struct ir3_shader *shader;
-};
-
 struct fd5_emit;
 
 void fd5_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_query.c b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
index 87417f1..b438c7a 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_query.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_query.c
@@ -39,11 +39,17 @@
 	uint64_t stop;
 };
 
-#define query_sample(aq, field)                 \
+/* expands to "bo, offset, 0, 0" for a single field of element idx of an array of fd5_query_sample: */
+#define query_sample_idx(aq, idx, field)        \
 	fd_resource((aq)->prsc)->bo,                \
+	((idx) * sizeof(struct fd5_query_sample)) + \
 	offsetof(struct fd5_query_sample, field),   \
 	0, 0
 
+/* offset of a single field of fd5_query_sample: */
+#define query_sample(aq, field)                 \
+	query_sample_idx(aq, 0, field)
+
 /*
  * Occlusion Query:
  *
@@ -111,7 +117,7 @@
 }
 
 static void
-occlusion_counter_result(struct fd_context *ctx, void *buf,
+occlusion_counter_result(struct fd_acc_query *aq, void *buf,
 		union pipe_query_result *result)
 {
 	struct fd5_query_sample *sp = buf;
@@ -119,7 +125,7 @@
 }
 
 static void
-occlusion_predicate_result(struct fd_context *ctx, void *buf,
+occlusion_predicate_result(struct fd_acc_query *aq, void *buf,
 		union pipe_query_result *result)
 {
 	struct fd5_query_sample *sp = buf;
@@ -196,7 +202,7 @@
 }
 
 static uint64_t
-ticks_to_ns(struct fd_context *ctx, uint32_t ts)
+ticks_to_ns(uint32_t ts)
 {
 	/* This is based on the 19.2MHz always-on rbbm timer.
 	 *
@@ -206,19 +212,19 @@
 }
 
 static void
-time_elapsed_accumulate_result(struct fd_context *ctx, void *buf,
+time_elapsed_accumulate_result(struct fd_acc_query *aq, void *buf,
 		union pipe_query_result *result)
 {
 	struct fd5_query_sample *sp = buf;
-	result->u64 = ticks_to_ns(ctx, sp->result);
+	result->u64 = ticks_to_ns(sp->result);
 }
 
 static void
-timestamp_accumulate_result(struct fd_context *ctx, void *buf,
+timestamp_accumulate_result(struct fd_acc_query *aq, void *buf,
 		union pipe_query_result *result)
 {
 	struct fd5_query_sample *sp = buf;
-	result->u64 = ticks_to_ns(ctx, sp->result);
+	result->u64 = ticks_to_ns(sp->result);
 }
 
 static const struct fd_acc_sample_provider time_elapsed = {
@@ -246,6 +252,201 @@
 		.result = timestamp_accumulate_result,
 };
 
+/*
+ * Performance Counter (batch) queries:
+ *
+ * Only one of these is active at a time, per the design of the gallium
+ * batch_query API.  One perfcntr query tracks N query_types,
+ * each of which has a 'fd_batch_query_entry' that maps it back to
+ * the associated group and counter.
+ */
+
+struct fd_batch_query_entry {
+	uint8_t gid;        /* group-id */
+	uint8_t cid;        /* countable-id within the group */
+};
+
+struct fd_batch_query_data {
+	struct fd_screen *screen;
+	unsigned num_query_entries;
+	struct fd_batch_query_entry query_entries[];
+};
+
+static void
+perfcntr_resume(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_batch_query_data *data = aq->query_data;
+	struct fd_screen *screen = data->screen;
+	struct fd_ringbuffer *ring = batch->draw;
+
+	unsigned counters_per_group[screen->num_perfcntr_groups];
+	memset(counters_per_group, 0, sizeof(counters_per_group));
+
+	fd_wfi(batch, ring);
+
+	/* configure performance counters for the requested queries: */
+	for (unsigned i = 0; i < data->num_query_entries; i++) {
+		struct fd_batch_query_entry *entry = &data->query_entries[i];
+		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+		unsigned counter_idx = counters_per_group[entry->gid]++;
+
+		debug_assert(counter_idx < g->num_counters);
+
+		OUT_PKT4(ring, g->counters[counter_idx].select_reg, 1);
+		OUT_RING(ring, g->countables[entry->cid].selector);
+	}
+
+	memset(counters_per_group, 0, sizeof(counters_per_group));
+
+	/* and snapshot the start values */
+	for (unsigned i = 0; i < data->num_query_entries; i++) {
+		struct fd_batch_query_entry *entry = &data->query_entries[i];
+		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+		unsigned counter_idx = counters_per_group[entry->gid]++;
+		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
+
+		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+			CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+		OUT_RELOCW(ring, query_sample_idx(aq, i, start));
+	}
+}
+
+static void
+perfcntr_pause(struct fd_acc_query *aq, struct fd_batch *batch)
+{
+	struct fd_batch_query_data *data = aq->query_data;
+	struct fd_screen *screen = data->screen;
+	struct fd_ringbuffer *ring = batch->draw;
+
+	unsigned counters_per_group[screen->num_perfcntr_groups];
+	memset(counters_per_group, 0, sizeof(counters_per_group));
+
+	fd_wfi(batch, ring);
+
+	/* TODO do we need to bother to turn anything off? */
+
+	/* snapshot the end values: */
+	for (unsigned i = 0; i < data->num_query_entries; i++) {
+		struct fd_batch_query_entry *entry = &data->query_entries[i];
+		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[entry->gid];
+		unsigned counter_idx = counters_per_group[entry->gid]++;
+		const struct fd_perfcntr_counter *counter = &g->counters[counter_idx];
+
+		OUT_PKT7(ring, CP_REG_TO_MEM, 3);
+		OUT_RING(ring, CP_REG_TO_MEM_0_64B |
+			CP_REG_TO_MEM_0_REG(counter->counter_reg_lo));
+		OUT_RELOCW(ring, query_sample_idx(aq, i, stop));
+	}
+
+	/* and compute the result: */
+	for (unsigned i = 0; i < data->num_query_entries; i++) {
+		/* result += stop - start: */
+		OUT_PKT7(ring, CP_MEM_TO_MEM, 9);
+		OUT_RING(ring, CP_MEM_TO_MEM_0_DOUBLE |
+				CP_MEM_TO_MEM_0_NEG_C);
+		OUT_RELOCW(ring, query_sample_idx(aq, i, result));     /* dst */
+		OUT_RELOC(ring, query_sample_idx(aq, i, result));      /* srcA */
+		OUT_RELOC(ring, query_sample_idx(aq, i, stop));        /* srcB */
+		OUT_RELOC(ring, query_sample_idx(aq, i, start));       /* srcC */
+	}
+}
+
+static void
+perfcntr_accumulate_result(struct fd_acc_query *aq, void *buf,
+		union pipe_query_result *result)
+{
+	struct fd_batch_query_data *data = aq->query_data;
+	struct fd5_query_sample *sp = buf;
+
+	for (unsigned i = 0; i < data->num_query_entries; i++) {
+		result->batch[i].u64 = sp[i].result;
+	}
+}
+
+static const struct fd_acc_sample_provider perfcntr = {
+		.query_type = FD_QUERY_FIRST_PERFCNTR,
+		.active = FD_STAGE_DRAW | FD_STAGE_CLEAR,
+		.resume = perfcntr_resume,
+		.pause = perfcntr_pause,
+		.result = perfcntr_accumulate_result,
+};
+
+static struct pipe_query *
+fd5_create_batch_query(struct pipe_context *pctx,
+		unsigned num_queries, unsigned *query_types)
+{
+	struct fd_context *ctx = fd_context(pctx);
+	struct fd_screen *screen = ctx->screen;
+	struct fd_query *q;
+	struct fd_acc_query *aq;
+	struct fd_batch_query_data *data;
+
+	data = CALLOC_VARIANT_LENGTH_STRUCT(fd_batch_query_data,
+			num_queries * sizeof(data->query_entries[0]));
+
+	data->screen = screen;
+	data->num_query_entries = num_queries;
+
+	/* validate the requested query_types and ensure we don't try
+	 * to request more query_types of a given group than we have
+	 * counters:
+	 */
+	unsigned counters_per_group[screen->num_perfcntr_groups];
+	memset(counters_per_group, 0, sizeof(counters_per_group));
+
+	for (unsigned i = 0; i < num_queries; i++) {
+		unsigned idx = query_types[i] - FD_QUERY_FIRST_PERFCNTR;
+
+		/* verify valid query_type, ie. is it actually a perfcntr? */
+		if ((query_types[i] < FD_QUERY_FIRST_PERFCNTR) ||
+				(idx >= screen->num_perfcntr_queries)) {
+			debug_printf("invalid batch query query_type: %u\n", query_types[i]);
+			goto error;
+		}
+
+		struct fd_batch_query_entry *entry = &data->query_entries[i];
+		struct pipe_driver_query_info *pq = &screen->perfcntr_queries[idx];
+
+		entry->gid = pq->group_id;
+
+		/* the perfcntr_queries[] table flattens all the countables
+		 * for each group in series, ie:
+		 *
+		 *   (G0,C0), .., (G0,Cn), (G1,C0), .., (G1,Cm), ...
+		 *
+		 * So to find the countable index, step back through the table
+		 * and count the preceding entries with the same group-id.
+		 */
+		while (pq > screen->perfcntr_queries) {
+			pq--;
+			if (pq->group_id == entry->gid)
+				entry->cid++;
+		}
+
+		if (counters_per_group[entry->gid] >=
+				screen->perfcntr_groups[entry->gid].num_counters) {
+			debug_printf("too many counters for group %u\n", entry->gid);
+			goto error;
+		}
+
+		counters_per_group[entry->gid]++;
+	}
+
+	q = fd_acc_create_query2(ctx, 0, &perfcntr);
+	aq = fd_acc_query(q);
+
+	/* sample buffer size is based on # of queries: */
+	aq->size = num_queries * sizeof(struct fd5_query_sample);
+	aq->query_data = data;
+
+	return (struct pipe_query *)q;
+
+error:
+	free(data);
+	return NULL;
+}
+
 void
 fd5_query_context_init(struct pipe_context *pctx)
 {
@@ -254,6 +455,8 @@
 	ctx->create_query = fd_acc_create_query;
 	ctx->query_set_stage = fd_acc_query_set_stage;
 
+	pctx->create_batch_query = fd5_create_batch_query;
+
 	fd_acc_query_register_provider(pctx, &occlusion_counter);
 	fd_acc_query_register_provider(pctx, &occlusion_predicate);
 	fd_acc_query_register_provider(pctx, &occlusion_predicate_conservative);
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
index 7d7e76e..7d8d2b3 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.c
@@ -35,33 +35,50 @@
 
 #include "ir3_compiler.h"
 
+static bool
+valid_sample_count(unsigned sample_count)
+{
+	switch (sample_count) {
+	case 0:
+	case 1:
+	case 2:
+	case 4:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static boolean
 fd5_screen_is_format_supported(struct pipe_screen *pscreen,
 		enum pipe_format format,
 		enum pipe_texture_target target,
 		unsigned sample_count,
+		unsigned storage_sample_count,
 		unsigned usage)
 {
 	unsigned retval = 0;
 
 	if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-			(sample_count > 1) || /* TODO add MSAA */
-			!util_format_is_supported(format, usage)) {
+			!valid_sample_count(sample_count)) {
 		DBG("not supported: format=%s, target=%d, sample_count=%d, usage=%x",
 				util_format_name(format), target, sample_count, usage);
 		return FALSE;
 	}
 
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
+
 	if ((usage & PIPE_BIND_VERTEX_BUFFER) &&
 			(fd5_pipe2vtx(format) != (enum a5xx_vtx_fmt)~0)) {
 		retval |= PIPE_BIND_VERTEX_BUFFER;
 	}
 
-	if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
+	if ((usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) &&
 			(target == PIPE_BUFFER ||
 			 util_format_get_blocksize(format) != 12) &&
 			(fd5_pipe2tex(format) != (enum a5xx_tex_fmt)~0)) {
-		retval |= PIPE_BIND_SAMPLER_VIEW;
+		retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
 	}
 
 	if ((usage & (PIPE_BIND_RENDER_TARGET |
@@ -103,6 +120,9 @@
 	return retval == usage;
 }
 
+extern const struct fd_perfcntr_group a5xx_perfcntr_groups[];
+extern const unsigned a5xx_num_perfcntr_groups;
+
 void
 fd5_screen_init(struct pipe_screen *pscreen)
 {
@@ -115,4 +135,9 @@
 	screen->setup_slices = fd5_setup_slices;
 	if (fd_mesa_debug & FD_DBG_TTILE)
 		screen->tile_mode = fd5_tile_mode;
+
+	if (fd_mesa_debug & FD_DBG_PERFC) {
+		screen->perfcntr_groups = a5xx_perfcntr_groups;
+		screen->num_perfcntr_groups = a5xx_num_perfcntr_groups;
+	}
 }
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_screen.h b/src/gallium/drivers/freedreno/a5xx/fd5_screen.h
index ba0c7f1..0a65b3b 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_screen.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_screen.h
@@ -29,6 +29,19 @@
 
 #include "pipe/p_screen.h"
 
+#include "freedreno_util.h"
+
+#include "a5xx.xml.h"
+
 void fd5_screen_init(struct pipe_screen *pscreen);
 
+static inline void
+emit_marker5(struct fd_ringbuffer *ring, int scratch_idx)
+{
+	extern unsigned marker_cnt;
+	unsigned reg = REG_A5XX_CP_SCRATCH_REG(scratch_idx);
+	OUT_PKT4(ring, reg, 1);
+	OUT_RING(ring, ++marker_cnt);
+}
+
 #endif /* FD5_SCREEN_H_ */
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
index 9795189..e8e29d0 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_texture.c
@@ -217,6 +217,7 @@
 
 	so->texconst0 =
 		A5XX_TEX_CONST_0_FMT(fd5_pipe2tex(format)) |
+		A5XX_TEX_CONST_0_SAMPLES(fd_msaa_samples(prsc->nr_samples)) |
 		fd5_tex_swiz(format, cso->swizzle_r, cso->swizzle_g,
 				cso->swizzle_b, cso->swizzle_a);
 
diff --git a/src/gallium/drivers/freedreno/a5xx/fd5_texture.h b/src/gallium/drivers/freedreno/a5xx/fd5_texture.h
index 11a3b2e..5cb201d 100644
--- a/src/gallium/drivers/freedreno/a5xx/fd5_texture.h
+++ b/src/gallium/drivers/freedreno/a5xx/fd5_texture.h
@@ -62,9 +62,6 @@
 	return (struct fd5_pipe_sampler_view *)pview;
 }
 
-unsigned fd5_get_const_idx(struct fd_context *ctx,
-		struct fd_texture_stateobj *tex, unsigned samp_id);
-
 void fd5_texture_init(struct pipe_context *pctx);
 
 
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 76500a9..fc0dcb0 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,17 +8,19 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
-Copyright (C) 2013-2017 by the following authors:
+Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
 - Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
 
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 004748f..08f8ff2 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,15 +8,17 @@
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    431 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  37162 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13612 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  34499 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2017-05-17 13:21:27)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2017-12-19 18:19:46)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 146261 bytes, from 2018-01-03 15:58:51)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2017-05-17 13:21:27)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    501 bytes, from 2018-01-31 18:26:32)
+- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1572 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  36805 bytes, from 2018-05-20 19:03:35)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  13634 bytes, from 2018-06-10 17:35:36)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  41584 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  83840 bytes, from 2018-01-10 16:21:40)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          ( 112086 bytes, from 2018-01-08 14:56:24)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml          ( 147158 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx.xml          (  88437 bytes, from 2018-06-18 14:25:44)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a6xx_gmu.xml      (  10431 bytes, from 2018-06-10 17:37:04)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2018-01-08 14:56:24)
 
 Copyright (C) 2013-2018 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -71,7 +73,8 @@
 	FLUSH_SO_1 = 18,
 	FLUSH_SO_2 = 19,
 	FLUSH_SO_3 = 20,
-	UNK_19 = 25,
+	PC_CCU_INVALIDATE_DEPTH = 24,
+	PC_CCU_INVALIDATE_COLOR = 25,
 	UNK_1C = 28,
 	UNK_1D = 29,
 	BLIT = 30,
@@ -203,6 +206,8 @@
 	CP_EXEC_CS = 51,
 	CP_PERFCOUNTER_ACTION = 80,
 	CP_SMMU_TABLE_UPDATE = 83,
+	CP_SET_MARKER = 101,
+	CP_SET_PSEUDO_REG = 86,
 	CP_CONTEXT_REG_BUNCH = 92,
 	CP_YIELD_ENABLE = 28,
 	CP_SKIP_IB2_ENABLE_GLOBAL = 29,
@@ -216,7 +221,7 @@
 	CP_COMPUTE_CHECKPOINT = 110,
 	CP_MEM_TO_MEM = 115,
 	CP_BLIT = 44,
-	CP_UNK_39 = 57,
+	CP_REG_TEST = 57,
 	IN_IB_PREFETCH_END = 23,
 	IN_SUBBLK_PREFETCH = 31,
 	IN_INSTR_PREFETCH = 32,
@@ -353,7 +358,7 @@
 }
 
 #define REG_CP_LOAD_STATE4_0					0x00000000
-#define CP_LOAD_STATE4_0_DST_OFF__MASK				0x0000ffff
+#define CP_LOAD_STATE4_0_DST_OFF__MASK				0x00003fff
 #define CP_LOAD_STATE4_0_DST_OFF__SHIFT				0
 static inline uint32_t CP_LOAD_STATE4_0_DST_OFF(uint32_t val)
 {
@@ -1105,14 +1110,14 @@
 #define REG_CP_COMPUTE_CHECKPOINT_2				0x00000002
 
 #define REG_CP_COMPUTE_CHECKPOINT_3				0x00000003
+#define CP_COMPUTE_CHECKPOINT_3_ADDR_1_LEN__MASK		0xffffffff
+#define CP_COMPUTE_CHECKPOINT_3_ADDR_1_LEN__SHIFT		0
+static inline uint32_t CP_COMPUTE_CHECKPOINT_3_ADDR_1_LEN(uint32_t val)
+{
+	return ((val) << CP_COMPUTE_CHECKPOINT_3_ADDR_1_LEN__SHIFT) & CP_COMPUTE_CHECKPOINT_3_ADDR_1_LEN__MASK;
+}
 
 #define REG_CP_COMPUTE_CHECKPOINT_4				0x00000004
-#define CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__MASK		0xffffffff
-#define CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__SHIFT		0
-static inline uint32_t CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN(uint32_t val)
-{
-	return ((val) << CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__SHIFT) & CP_COMPUTE_CHECKPOINT_4_ADDR_1_LEN__MASK;
-}
 
 #define REG_CP_COMPUTE_CHECKPOINT_5				0x00000005
 #define CP_COMPUTE_CHECKPOINT_5_ADDR_1_LO__MASK			0xffffffff
@@ -1130,6 +1135,8 @@
 	return ((val) << CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__SHIFT) & CP_COMPUTE_CHECKPOINT_6_ADDR_1_HI__MASK;
 }
 
+#define REG_CP_COMPUTE_CHECKPOINT_7				0x00000007
+
 #define REG_CP_PERFCOUNTER_ACTION_0				0x00000000
 
 #define REG_CP_PERFCOUNTER_ACTION_1				0x00000001
diff --git a/src/gallium/drivers/freedreno/disasm.h b/src/gallium/drivers/freedreno/disasm.h
index 579dd50..fd8053b 100644
--- a/src/gallium/drivers/freedreno/disasm.h
+++ b/src/gallium/drivers/freedreno/disasm.h
@@ -24,6 +24,19 @@
 #ifndef DISASM_H_
 #define DISASM_H_
 
+#include <stdio.h>
+#include <stdbool.h>
+
+#include "util/u_debug.h"
+
+enum fd_shader_debug {
+	FD_DBG_SHADER_VS = 0x01,
+	FD_DBG_SHADER_FS = 0x02,
+	FD_DBG_SHADER_CS = 0x04,
+};
+
+extern enum fd_shader_debug fd_shader_debug;
+
 enum shader_t {
 	SHADER_VERTEX,
 	SHADER_TCS,
@@ -34,6 +47,38 @@
 	SHADER_MAX,
 };
 
+static inline bool
+shader_debug_enabled(enum shader_t type)
+{
+	switch (type) {
+	case SHADER_VERTEX:      return !!(fd_shader_debug & FD_DBG_SHADER_VS);
+	case SHADER_FRAGMENT:    return !!(fd_shader_debug & FD_DBG_SHADER_FS);
+	case SHADER_COMPUTE:     return !!(fd_shader_debug & FD_DBG_SHADER_CS);
+	default:
+		debug_assert(0);
+		return false;
+	}
+}
+
+static inline const char *
+shader_stage_name(enum shader_t type)
+{
+	/* NOTE these names are chosen to match the INTEL_DEBUG output
+	 * which frameretrace parses.  Hurray accidental ABI!
+	 */
+	switch (type) {
+	case SHADER_VERTEX:      return "vertex";
+	case SHADER_TCS:         return "tessellation control";
+	case SHADER_TES:         return "tessellation evaluation";
+	case SHADER_GEOM:        return "geometry";
+	case SHADER_FRAGMENT:    return "fragment";
+	case SHADER_COMPUTE:     return "compute";
+	default:
+		debug_assert(0);
+		return NULL;
+	}
+}
+
 /* bitmask of debug flags */
 enum debug_t {
 	PRINT_RAW      = 0x1,    /* dump raw hexdump */
@@ -41,7 +86,7 @@
 };
 
 int disasm_a2xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type);
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type);
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out);
 void disasm_set_debug(enum debug_t debug);
 
 #endif /* DISASM_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.c b/src/gallium/drivers/freedreno/freedreno_batch.c
index 9918c25..84de03b 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch.c
@@ -72,6 +72,7 @@
 	batch->cleared = batch->partial_cleared = 0;
 	batch->restore = batch->resolve = 0;
 	batch->needs_flush = false;
+	batch->flushed = false;
 	batch->gmem_reason = 0;
 	batch->num_draws = 0;
 	batch->stage = FD_STAGE_NULL;
@@ -117,6 +118,8 @@
 static void
 batch_fini(struct fd_batch *batch)
 {
+	DBG("%p", batch);
+
 	pipe_resource_reference(&batch->query_buf, NULL);
 
 	if (batch->in_fence_fd != -1)
@@ -220,24 +223,26 @@
 void
 __fd_batch_destroy(struct fd_batch *batch)
 {
+	struct fd_context *ctx = batch->ctx;
+
 	DBG("%p", batch);
 
-	util_copy_framebuffer_state(&batch->framebuffer, NULL);
+	fd_context_assert_locked(batch->ctx);
 
-	mtx_lock(&batch->ctx->screen->lock);
 	fd_bc_invalidate_batch(batch, true);
-	mtx_unlock(&batch->ctx->screen->lock);
 
-	batch_fini(batch);
-
-	batch_reset_resources(batch);
+	batch_reset_resources_locked(batch);
 	debug_assert(batch->resources->entries == 0);
 	_mesa_set_destroy(batch->resources, NULL);
 
 	batch_flush_reset_dependencies(batch, false);
 	debug_assert(batch->dependents_mask == 0);
 
+	fd_context_unlock(ctx);
+	util_copy_framebuffer_state(&batch->framebuffer, NULL);
+	batch_fini(batch);
 	free(batch);
+	fd_context_lock(ctx);
 }
 
 void
@@ -259,6 +264,8 @@
 {
 	struct fd_batch *batch = job;
 
+	DBG("%p", batch);
+
 	fd_gmem_render_tiles(batch);
 	batch_reset_resources(batch);
 }
@@ -275,13 +282,8 @@
 {
 	DBG("%p: needs_flush=%d", batch, batch->needs_flush);
 
-	if (!batch->needs_flush) {
-		if (force) {
-			fd_gmem_render_noop(batch);
-			goto out;
-		}
+	if (batch->flushed)
 		return;
-	}
 
 	batch->needs_flush = false;
 
@@ -293,6 +295,8 @@
 	fd_context_all_dirty(batch->ctx);
 	batch_flush_reset_dependencies(batch, true);
 
+	batch->flushed = true;
+
 	if (batch->ctx->screen->reorder) {
 		struct fd_batch *tmp = NULL;
 		fd_batch_reference(&tmp, batch);
@@ -310,14 +314,9 @@
 
 	debug_assert(batch->reference.count > 0);
 
-out:
-	if (batch == batch->ctx->batch) {
-		batch_reset(batch);
-	} else {
-		mtx_lock(&batch->ctx->screen->lock);
-		fd_bc_invalidate_batch(batch, false);
-		mtx_unlock(&batch->ctx->screen->lock);
-	}
+	mtx_lock(&batch->ctx->screen->lock);
+	fd_bc_invalidate_batch(batch, false);
+	mtx_unlock(&batch->ctx->screen->lock);
 }
 
 /* NOTE: could drop the last ref to batch
@@ -331,16 +330,36 @@
 void
 fd_batch_flush(struct fd_batch *batch, bool sync, bool force)
 {
+	struct fd_batch *tmp = NULL;
+	bool newbatch = false;
+
 	/* NOTE: we need to hold an extra ref across the body of flush,
 	 * since the last ref to this batch could be dropped when cleaning
 	 * up used_resources
 	 */
-	struct fd_batch *tmp = NULL;
-
 	fd_batch_reference(&tmp, batch);
+
+	if (batch == batch->ctx->batch) {
+		batch->ctx->batch = NULL;
+		newbatch = true;
+	}
+
 	batch_flush(tmp, force);
+
+	if (newbatch) {
+		struct fd_context *ctx = batch->ctx;
+		struct fd_batch *new_batch =
+			fd_batch_from_fb(&ctx->screen->batch_cache, ctx, &batch->framebuffer);
+
+		util_copy_framebuffer_state(&new_batch->framebuffer, &batch->framebuffer);
+
+		fd_batch_reference(&batch, NULL);
+		ctx->batch = new_batch;
+	}
+
 	if (sync)
 		fd_batch_sync(tmp);
+
 	fd_batch_reference(&tmp, NULL);
 }
 
@@ -367,20 +386,27 @@
 	if (batch->dependents_mask & (1 << dep->idx))
 		return;
 
-	/* if the new depedency already depends on us, we need to flush
-	 * to avoid a loop in the dependency graph.
-	 */
-	if (batch_depends_on(dep, batch)) {
-		DBG("%p: flush forced on %p!", batch, dep);
-		mtx_unlock(&batch->ctx->screen->lock);
-		fd_batch_flush(dep, false, false);
-		mtx_lock(&batch->ctx->screen->lock);
-	} else {
-		struct fd_batch *other = NULL;
-		fd_batch_reference_locked(&other, dep);
-		batch->dependents_mask |= (1 << dep->idx);
-		DBG("%p: added dependency on %p", batch, dep);
-	}
+	/* a loop should not be possible */
+	debug_assert(!batch_depends_on(dep, batch));
+
+	struct fd_batch *other = NULL;
+	fd_batch_reference_locked(&other, dep);
+	batch->dependents_mask |= (1 << dep->idx);
+	DBG("%p: added dependency on %p", batch, dep);
+}
+
+static void
+flush_write_batch(struct fd_resource *rsc)
+{
+	struct fd_batch *b = NULL;
+	fd_batch_reference(&b, rsc->write_batch);
+
+	mtx_unlock(&b->ctx->screen->lock);
+	fd_batch_flush(b, true, false);
+	mtx_lock(&b->ctx->screen->lock);
+
+	fd_bc_invalidate_batch(b, false);
+	fd_batch_reference_locked(&b, NULL);
 }
 
 void
@@ -402,21 +428,12 @@
 
 	if (write) {
 		/* if we are pending read or write by any other batch: */
-		if (rsc->batch_mask != (1 << batch->idx)) {
+		if (rsc->batch_mask & ~(1 << batch->idx)) {
 			struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
 			struct fd_batch *dep;
 
-			if (rsc->write_batch && rsc->write_batch != batch) {
-				struct fd_batch *b = NULL;
-				fd_batch_reference(&b, rsc->write_batch);
-
-				mtx_unlock(&batch->ctx->screen->lock);
-				fd_batch_flush(b, true, false);
-				mtx_lock(&batch->ctx->screen->lock);
-
-				fd_bc_invalidate_batch(b, false);
-				fd_batch_reference_locked(&b, NULL);
-			}
+			if (rsc->write_batch && rsc->write_batch != batch)
+				flush_write_batch(rsc);
 
 			foreach_batch(dep, cache, rsc->batch_mask) {
 				struct fd_batch *b = NULL;
@@ -434,10 +451,12 @@
 		}
 		fd_batch_reference_locked(&rsc->write_batch, batch);
 	} else {
-		if (rsc->write_batch) {
-			fd_batch_add_dep(batch, rsc->write_batch);
-			fd_bc_invalidate_batch(rsc->write_batch, false);
-		}
+		/* If reading a resource pending a write, go ahead and flush the
+		 * writer.  This avoids situations where we end up having to
+		 * flush the current batch in _resource_used()
+		 */
+		if (rsc->write_batch && rsc->write_batch != batch)
+			flush_write_batch(rsc);
 	}
 
 	if (rsc->batch_mask & (1 << batch->idx))
@@ -452,6 +471,8 @@
 void
 fd_batch_check_size(struct fd_batch *batch)
 {
+	debug_assert(!batch->flushed);
+
 	if (fd_device_version(batch->ctx->screen->dev) >= FD_VERSION_UNLIMITED_CMDS)
 		return;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_batch.h b/src/gallium/drivers/freedreno/freedreno_batch.h
index 56665b7..4b0539d 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch.h
+++ b/src/gallium/drivers/freedreno/freedreno_batch.h
@@ -96,6 +96,7 @@
 	/* is this a non-draw batch (ie compute/blit which has no pfb state)? */
 	bool nondraw : 1;
 	bool needs_flush : 1;
+	bool flushed : 1;
 	bool blit : 1;
 	bool back_blit : 1;      /* only blit so far is resource shadowing back-blit */
 
@@ -115,7 +116,6 @@
 		FD_GMEM_DEPTH_ENABLED        = 0x02,
 		FD_GMEM_STENCIL_ENABLED      = 0x04,
 
-		FD_GMEM_MSAA_ENABLED         = 0x08,
 		FD_GMEM_BLEND_ENABLED        = 0x10,
 		FD_GMEM_LOGICOP_ENABLED      = 0x20,
 	} gmem_reason;
@@ -228,16 +228,6 @@
  * __fd_batch_destroy() needs to unref resources)
  */
 
-static inline void
-fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
-{
-	struct fd_batch *old_batch = *ptr;
-	if (pipe_reference_described(&(*ptr)->reference, &batch->reference,
-			(debug_reference_descriptor)__fd_batch_describe))
-		__fd_batch_destroy(old_batch);
-	*ptr = batch;
-}
-
 /* fwd-decl prototypes to untangle header dependency :-/ */
 static inline void fd_context_assert_locked(struct fd_context *ctx);
 static inline void fd_context_lock(struct fd_context *ctx);
@@ -248,21 +238,32 @@
 {
 	struct fd_batch *old_batch = *ptr;
 
+	/* only need lock if a reference is dropped: */
 	if (old_batch)
 		fd_context_assert_locked(old_batch->ctx);
-	else if (batch)
-		fd_context_assert_locked(batch->ctx);
 
 	if (pipe_reference_described(&(*ptr)->reference, &batch->reference,
-			(debug_reference_descriptor)__fd_batch_describe)) {
-		struct fd_context *ctx = old_batch->ctx;
-		fd_context_unlock(ctx);
+			(debug_reference_descriptor)__fd_batch_describe))
 		__fd_batch_destroy(old_batch);
-		fd_context_lock(ctx);
-	}
+
 	*ptr = batch;
 }
 
+static inline void
+fd_batch_reference(struct fd_batch **ptr, struct fd_batch *batch)
+{
+	struct fd_batch *old_batch = *ptr;
+	struct fd_context *ctx = old_batch ? old_batch->ctx : NULL;
+
+	if (ctx)
+		fd_context_lock(ctx);
+
+	fd_batch_reference_locked(ptr, batch);
+
+	if (ctx)
+		fd_context_unlock(ctx);
+}
+
 #include "freedreno_context.h"
 
 static inline void
diff --git a/src/gallium/drivers/freedreno/freedreno_batch_cache.c b/src/gallium/drivers/freedreno/freedreno_batch_cache.c
index b3a6041..1bf656c 100644
--- a/src/gallium/drivers/freedreno/freedreno_batch_cache.c
+++ b/src/gallium/drivers/freedreno/freedreno_batch_cache.c
@@ -124,33 +124,52 @@
 	_mesa_hash_table_destroy(cache->ht, NULL);
 }
 
+static void
+bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx, bool deferred)
+{
+	/* fd_batch_flush() (and fd_batch_add_dep() which calls it indirectly)
+	 * can cause batches to be unref'd and freed under our feet, so grab
+	 * a reference to all the batches we need up-front.
+	 */
+	struct fd_batch *batches[ARRAY_SIZE(cache->batches)] = {0};
+	struct fd_batch *batch;
+	unsigned n = 0;
+
+	fd_context_lock(ctx);
+
+	foreach_batch(batch, cache, cache->batch_mask) {
+		if (batch->ctx == ctx) {
+			fd_batch_reference_locked(&batches[n++], batch);
+		}
+	}
+
+	if (deferred) {
+		struct fd_batch *current_batch = ctx->batch;
+
+		for (unsigned i = 0; i < n; i++) {
+			if (batches[i] != current_batch) {
+				fd_batch_add_dep(current_batch, batches[i]);
+			}
+		}
+
+		fd_context_unlock(ctx);
+	} else {
+		fd_context_unlock(ctx);
+
+		for (unsigned i = 0; i < n; i++) {
+			fd_batch_flush(batches[i], false, false);
+		}
+	}
+
+	for (unsigned i = 0; i < n; i++) {
+		fd_batch_reference(&batches[i], NULL);
+	}
+}
+
 void
 fd_bc_flush(struct fd_batch_cache *cache, struct fd_context *ctx)
 {
-	struct hash_entry *entry;
-	struct fd_batch *last_batch = NULL;
-
-	mtx_lock(&ctx->screen->lock);
-
-	hash_table_foreach(cache->ht, entry) {
-		struct fd_batch *batch = NULL;
-		/* hold a reference since we can drop screen->lock: */
-		fd_batch_reference_locked(&batch, (struct fd_batch *)entry->data);
-		if (batch->ctx == ctx) {
-			mtx_unlock(&ctx->screen->lock);
-			fd_batch_reference(&last_batch, batch);
-			fd_batch_flush(batch, false, false);
-			mtx_lock(&ctx->screen->lock);
-		}
-		fd_batch_reference_locked(&batch, NULL);
-	}
-
-	mtx_unlock(&ctx->screen->lock);
-
-	if (last_batch) {
-		fd_batch_sync(last_batch);
-		fd_batch_reference(&last_batch, NULL);
-	}
+	bc_flush(cache, ctx, false);
 }
 
 /* deferred flush doesn't actually flush, but it marks every other
@@ -161,20 +180,7 @@
 void
 fd_bc_flush_deferred(struct fd_batch_cache *cache, struct fd_context *ctx)
 {
-	struct fd_batch *current_batch = ctx->batch;
-	struct hash_entry *entry;
-
-	mtx_lock(&ctx->screen->lock);
-
-	hash_table_foreach(cache->ht, entry) {
-		struct fd_batch *batch = entry->data;
-		if (batch == current_batch)
-			continue;
-		if (batch->ctx == ctx)
-			fd_batch_add_dep(current_batch, batch);
-	}
-
-	mtx_unlock(&ctx->screen->lock);
+	bc_flush(cache, ctx, true);
 }
 
 void
@@ -187,14 +193,24 @@
 
 	foreach_batch(batch, cache, cache->batch_mask) {
 		if (batch->ctx == ctx)
-			fd_batch_reference_locked(&batch, NULL);
+			fd_bc_invalidate_batch(batch, true);
 	}
 
 	mtx_unlock(&ctx->screen->lock);
 }
 
+/**
+ * Note that when batch is flushed, it needs to remain in the cache so
+ * that fd_bc_invalidate_resource() can work.. otherwise we can have
+ * the case where a rsc is destroyed while a batch still has a dangling
+ * reference to it.
+ *
+ * Note that the cmdstream (or, after the SUBMIT ioctl, the kernel)
+ * would have a reference to the underlying bo, so it is ok for the
+ * rsc to be destroyed before the batch.
+ */
 void
-fd_bc_invalidate_batch(struct fd_batch *batch, bool destroy)
+fd_bc_invalidate_batch(struct fd_batch *batch, bool remove)
 {
 	if (!batch)
 		return;
@@ -202,9 +218,9 @@
 	struct fd_batch_cache *cache = &batch->ctx->screen->batch_cache;
 	struct key *key = (struct key *)batch->key;
 
-	pipe_mutex_assert_locked(batch->ctx->screen->lock);
+	fd_context_assert_locked(batch->ctx);
 
-	if (destroy) {
+	if (remove) {
 		cache->batches[batch->idx] = NULL;
 		cache->batch_mask &= ~(1 << batch->idx);
 	}
@@ -398,7 +414,7 @@
 	key->width = pfb->width;
 	key->height = pfb->height;
 	key->layers = pfb->layers;
-	key->samples = pfb->samples;
+	key->samples = util_framebuffer_get_num_samples(pfb);
 	key->ctx = ctx;
 
 	if (pfb->zsbuf)
diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c
index 9c6a21c..e1324e8 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.c
+++ b/src/gallium/drivers/freedreno/freedreno_context.c
@@ -101,6 +101,8 @@
 	if (!ctx->batch)
 		return;
 
+	ctx->batch->needs_flush = true;
+
 	ring = ctx->batch->draw;
 
 	/* max packet size is 0x3fff dwords: */
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index af564bd..a93561e 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -52,6 +52,8 @@
 	struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
 	unsigned num_samplers;
 	unsigned valid_samplers;
+	/* number of samples per sampler, 2 bits per sampler: */
+	uint32_t samples;
 };
 
 struct fd_program_stateobj {
@@ -69,26 +71,22 @@
 struct fd_constbuf_stateobj {
 	struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
 	uint32_t enabled_mask;
-	uint32_t dirty_mask;
 };
 
 struct fd_shaderbuf_stateobj {
 	struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS];
 	uint32_t enabled_mask;
-	uint32_t dirty_mask;
 };
 
 struct fd_shaderimg_stateobj {
 	struct pipe_image_view si[PIPE_MAX_SHADER_IMAGES];
 	uint32_t enabled_mask;
-	uint32_t dirty_mask;
 };
 
 struct fd_vertexbuf_stateobj {
 	struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
 	unsigned count;
 	uint32_t enabled_mask;
-	uint32_t dirty_mask;
 };
 
 struct fd_vertex_stateobj {
@@ -223,6 +221,7 @@
 		uint64_t draw_calls;
 		uint64_t batch_total, batch_sysmem, batch_gmem, batch_nondraw, batch_restore;
 		uint64_t staging_uploads, shadow_uploads;
+		uint64_t vs_regs, fs_regs;
 	} stats;
 
 	/* Current batch.. the rule here is that you can deref ctx->batch
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index eb36a93..3bcda34 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -170,9 +170,6 @@
 
 		buffers |= PIPE_CLEAR_COLOR0 << i;
 
-		if (surf->nr_samples > 1)
-			batch->gmem_reason |= FD_GMEM_MSAA_ENABLED;
-
 		if (fd_blend_enabled(ctx, i))
 			batch->gmem_reason |= FD_GMEM_BLEND_ENABLED;
 	}
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.h b/src/gallium/drivers/freedreno/freedreno_draw.h
index b293f73..4a922d9 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.h
+++ b/src/gallium/drivers/freedreno/freedreno_draw.h
@@ -74,18 +74,25 @@
 		OUT_RING(ring, 0);
 	}
 
-	OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 5 : 3);
-	OUT_RING(ring, 0x00000000);        /* viz query info. */
-	if (vismode == USE_VISIBILITY) {
-		/* leave vis mode blank for now, it will be patched up when
-		 * we know if we are binning or not
-		 */
-		OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0, instances),
-				&batch->draw_patches);
+	if (is_a20x(batch->ctx->screen)) {
+		OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 4 : 2);
+		OUT_RING(ring, 0x00000000);
+		OUT_RING(ring, DRAW_A20X(primtype, src_sel, idx_type, vismode, count));
 	} else {
-		OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode, instances));
+		OUT_PKT3(ring, CP_DRAW_INDX, idx_buffer ? 5 : 3);
+		OUT_RING(ring, 0x00000000);        /* viz query info. */
+		if (vismode == USE_VISIBILITY) {
+			/* leave vis mode blank for now, it will be patched up when
+			 * we know if we are binning or not
+			 */
+			OUT_RINGP(ring, DRAW(primtype, src_sel, idx_type, 0, instances),
+					&batch->draw_patches);
+		} else {
+			OUT_RING(ring, DRAW(primtype, src_sel, idx_type, vismode, instances));
+		}
+		OUT_RING(ring, count);             /* NumIndices */
 	}
-	OUT_RING(ring, count);             /* NumIndices */
+
 	if (idx_buffer) {
 		OUT_RELOC(ring, fd_resource(idx_buffer)->bo, idx_offset, 0, 0);
 		OUT_RING (ring, idx_size);
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 1925f72..c4e2022 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -41,7 +41,7 @@
 	 * fence_fd become valid and the week reference is dropped.
 	 */
 	struct fd_batch *batch;
-	struct fd_context *ctx;
+	struct fd_pipe *pipe;
 	struct fd_screen *screen;
 	int fence_fd;
 	uint32_t timestamp;
@@ -68,6 +68,7 @@
 {
 	if (fence->fence_fd != -1)
 		close(fence->fence_fd);
+	fd_pipe_del(fence->pipe);
 	FREE(fence);
 }
 
@@ -93,7 +94,7 @@
 		return ret == 0;
 	}
 
-	if (fd_pipe_wait_timeout(fence->ctx->pipe, fence->timestamp, timeout))
+	if (fd_pipe_wait_timeout(fence->pipe, fence->timestamp, timeout))
 		return false;
 
 	return true;
@@ -111,7 +112,7 @@
 	pipe_reference_init(&fence->reference, 1);
 
 	fence->batch = batch;
-	fence->ctx = ctx;
+	fence->pipe = fd_pipe_ref(ctx->pipe);
 	fence->screen = ctx->screen;
 	fence->timestamp = timestamp;
 	fence->fence_fd = fence_fd;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 37a2f33..95b4c16 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -135,6 +135,8 @@
 			cbuf_cpp[i] = util_format_get_blocksize(pfb->cbufs[i]->format);
 		else
 			cbuf_cpp[i] = 4;
+		/* if MSAA, color buffers are super-sampled in GMEM: */
+		cbuf_cpp[i] *= pfb->samples;
 	}
 
 	if (!memcmp(gmem->zsbuf_cpp, zsbuf_cpp, sizeof(zsbuf_cpp)) &&
@@ -393,9 +395,11 @@
 
 	if (ctx->emit_sysmem_prep && !batch->nondraw) {
 		if (batch->cleared || batch->gmem_reason ||
-				((batch->num_draws > 5) && !batch->blit)) {
-			DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u",
-				batch->cleared, batch->gmem_reason, batch->num_draws);
+				((batch->num_draws > 5) && !batch->blit) ||
+				(pfb->samples > 1)) {
+			DBG("GMEM: cleared=%x, gmem_reason=%x, num_draws=%u, samples=%u",
+				batch->cleared, batch->gmem_reason, batch->num_draws,
+				pfb->samples);
 		} else if (!(fd_mesa_debug & FD_DBG_NOBYPASS)) {
 			sysmem = true;
 		}
@@ -438,21 +442,6 @@
 	flush_ring(batch);
 }
 
-/* special case for when we need to create a fence but have no rendering
- * to flush.. just emit a no-op string-marker packet.
- */
-void
-fd_gmem_render_noop(struct fd_batch *batch)
-{
-	struct fd_context *ctx = batch->ctx;
-	struct pipe_context *pctx = &ctx->base;
-
-	pctx->emit_string_marker(pctx, "noop", 4);
-	/* emit IB to drawcmds (which contain the string marker): */
-	ctx->emit_ib(batch->gmem, batch->draw);
-	flush_ring(batch);
-}
-
 /* tile needs restore if it isn't completely contained within the
  * cleared scissor:
  */
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.h b/src/gallium/drivers/freedreno/freedreno_gmem.h
index 07e13f5..144e950 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.h
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.h
@@ -63,7 +63,6 @@
 struct fd_batch;
 
 void fd_gmem_render_tiles(struct fd_batch *batch);
-void fd_gmem_render_noop(struct fd_batch *batch);
 
 bool fd_gmem_needs_restore(struct fd_batch *batch, struct fd_tile *tile,
 		uint32_t buffers);
diff --git a/src/gallium/drivers/freedreno/freedreno_perfcntr.h b/src/gallium/drivers/freedreno/freedreno_perfcntr.h
new file mode 100644
index 0000000..7975930
--- /dev/null
+++ b/src/gallium/drivers/freedreno/freedreno_perfcntr.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2018 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#ifndef FREEDRENO_PERFCNTR_H_
+#define FREEDRENO_PERFCNTR_H_
+
+#include "pipe/p_defines.h"
+
+/*
+ * Mapping very closely to the AMD_performance_monitor extension, adreno has
+ * groups of performance counters where each group has N counters, which can
+ * select from M different countables (things that can be counted), where
+ * generally M > N.
+ */
+
+/* Describes a single counter: */
+struct fd_perfcntr_counter {
+	/* offset of the select register to choose what to count: */
+	unsigned select_reg;
+	/* offset of the lo/hi 32b to read current counter value: */
+	unsigned counter_reg_lo;
+	unsigned counter_reg_hi;
+	/* Optional, most counters don't have enable/clear registers: */
+	unsigned enable;
+	unsigned clear;
+};
+
+/* Describes a single countable: */
+struct fd_perfcntr_countable {
+	const char *name;
+	/* selector register enum value to select this countable: */
+	unsigned selector;
+
+	/* description of the countable: */
+	enum pipe_driver_query_type query_type;
+	enum pipe_driver_query_result_type result_type;
+};
+
+/* Describes an entire counter group: */
+struct fd_perfcntr_group {
+	const char *name;
+	unsigned num_counters;
+	const struct fd_perfcntr_counter *counters;
+	unsigned num_countables;
+	const struct fd_perfcntr_countable *countables;
+};
+
+
+#endif /* FREEDRENO_PERFCNTR_H_ */
diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c
index 2a809a3..2b100a3 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.c
+++ b/src/gallium/drivers/freedreno/freedreno_query.c
@@ -118,29 +118,73 @@
 	ctx->cond_mode = mode;
 }
 
+#define _Q(_name, _query_type, _type, _result_type) {                \
+	.name        = _name,                                            \
+	.query_type  = _query_type,                                      \
+	.type        = PIPE_DRIVER_QUERY_TYPE_ ## _type,                 \
+	.result_type = PIPE_DRIVER_QUERY_RESULT_TYPE_ ## _result_type,   \
+	.group_id    = ~(unsigned)0,                                     \
+}
+
+#define FQ(_name, _query_type, _type, _result_type) \
+	_Q(_name, FD_QUERY_ ## _query_type, _type, _result_type)
+
+#define PQ(_name, _query_type, _type, _result_type) \
+	_Q(_name, PIPE_QUERY_ ## _query_type, _type, _result_type)
+
+static const struct pipe_driver_query_info sw_query_list[] = {
+	FQ("draw-calls", DRAW_CALLS, UINT64, AVERAGE),
+	FQ("batches", BATCH_TOTAL, UINT64, AVERAGE),
+	FQ("batches-sysmem", BATCH_SYSMEM, UINT64, AVERAGE),
+	FQ("batches-gmem", BATCH_GMEM, UINT64, AVERAGE),
+	FQ("batches-nondraw", BATCH_NONDRAW, UINT64, AVERAGE),
+	FQ("restores", BATCH_RESTORE, UINT64, AVERAGE),
+	PQ("prims-emitted", PRIMITIVES_EMITTED, UINT64, AVERAGE),
+	FQ("staging", STAGING_UPLOADS, UINT64, AVERAGE),
+	FQ("shadow", SHADOW_UPLOADS, UINT64, AVERAGE),
+	FQ("vsregs", VS_REGS, FLOAT, AVERAGE),
+	FQ("fsregs", FS_REGS, FLOAT, AVERAGE),
+};
+
 static int
 fd_get_driver_query_info(struct pipe_screen *pscreen,
 		unsigned index, struct pipe_driver_query_info *info)
 {
-	struct pipe_driver_query_info list[] = {
-			{"draw-calls", FD_QUERY_DRAW_CALLS, {0}},
-			{"batches", FD_QUERY_BATCH_TOTAL, {0}},
-			{"batches-sysmem", FD_QUERY_BATCH_SYSMEM, {0}},
-			{"batches-gmem", FD_QUERY_BATCH_GMEM, {0}},
-			{"batches-nondraw", FD_QUERY_BATCH_NONDRAW, {0}},
-			{"restores", FD_QUERY_BATCH_RESTORE, {0}},
-			{"prims-emitted", PIPE_QUERY_PRIMITIVES_EMITTED, {0}},
-			{"staging", FD_QUERY_STAGING_UPLOADS, {0}},
-			{"shadow", FD_QUERY_SHADOW_UPLOADS, {0}},
-	};
+	struct fd_screen *screen = fd_screen(pscreen);
 
 	if (!info)
-		return ARRAY_SIZE(list);
+		return ARRAY_SIZE(sw_query_list) + screen->num_perfcntr_queries;
 
-	if (index >= ARRAY_SIZE(list))
+	if (index >= ARRAY_SIZE(sw_query_list)) {
+		index -= ARRAY_SIZE(sw_query_list);
+		if (index >= screen->num_perfcntr_queries)
+			return 0;
+		*info = screen->perfcntr_queries[index];
+		return 1;
+	}
+
+	*info = sw_query_list[index];
+	return 1;
+}
+
+static int
+fd_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index,
+		struct pipe_driver_query_group_info *info)
+{
+	struct fd_screen *screen = fd_screen(pscreen);
+
+	if (!info)
+		return screen->num_perfcntr_groups;
+
+	if (index >= screen->num_perfcntr_groups)
 		return 0;
 
-	*info = list[index];
+	const struct fd_perfcntr_group *g = &screen->perfcntr_groups[index];
+
+	info->name = g->name;
+	info->max_active_queries = g->num_counters;
+	info->num_queries = g->num_countables;
+
 	return 1;
 }
 
@@ -149,10 +193,45 @@
 {
 }
 
+static void
+setup_perfcntr_query_info(struct fd_screen *screen)
+{
+	unsigned num_queries = 0;
+
+	for (unsigned i = 0; i < screen->num_perfcntr_groups; i++)
+		num_queries += screen->perfcntr_groups[i].num_countables;
+
+	screen->perfcntr_queries =
+		calloc(num_queries, sizeof(screen->perfcntr_queries[0]));
+	screen->num_perfcntr_queries = num_queries;
+
+	unsigned idx = 0;
+	for (unsigned i = 0; i < screen->num_perfcntr_groups; i++) {
+		const struct fd_perfcntr_group *g = &screen->perfcntr_groups[i];
+		for (unsigned j = 0; j < g->num_countables; j++) {
+			struct pipe_driver_query_info *info =
+				&screen->perfcntr_queries[idx];
+			const struct fd_perfcntr_countable *c =
+				&g->countables[j];
+
+			info->name = c->name;
+			info->query_type = FD_QUERY_FIRST_PERFCNTR + idx;
+			info->type = c->query_type;
+			info->result_type = c->result_type;
+			info->group_id = i;
+			info->flags = PIPE_DRIVER_QUERY_FLAG_BATCH;
+
+			idx++;
+		}
+	}
+}
+
 void
 fd_query_screen_init(struct pipe_screen *pscreen)
 {
 	pscreen->get_driver_query_info = fd_get_driver_query_info;
+	pscreen->get_driver_query_group_info = fd_get_driver_query_group_info;
+	setup_perfcntr_query_info(fd_screen(pscreen));
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_query.h b/src/gallium/drivers/freedreno/freedreno_query.h
index 296c356..22e54ad 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.h
+++ b/src/gallium/drivers/freedreno/freedreno_query.h
@@ -64,6 +64,12 @@
 #define FD_QUERY_BATCH_RESTORE   (PIPE_QUERY_DRIVER_SPECIFIC + 5)  /* batches requiring GMEM restore */
 #define FD_QUERY_STAGING_UPLOADS (PIPE_QUERY_DRIVER_SPECIFIC + 6)  /* texture/buffer uploads using staging blit */
 #define FD_QUERY_SHADOW_UPLOADS  (PIPE_QUERY_DRIVER_SPECIFIC + 7)  /* texture/buffer uploads that shadowed rsc */
+#define FD_QUERY_VS_REGS         (PIPE_QUERY_DRIVER_SPECIFIC + 8)  /* avg # of VS registers (scaled up by 100x) */
+#define FD_QUERY_FS_REGS         (PIPE_QUERY_DRIVER_SPECIFIC + 9)  /* avg # of FS registers (scaled up by 100x) */
+/* insert any new non-perfcntr queries here, the first perfcntr index
+ * needs to come last!
+ */
+#define FD_QUERY_FIRST_PERFCNTR  (PIPE_QUERY_DRIVER_SPECIFIC + 10)
 
 void fd_query_screen_init(struct pipe_screen *pscreen);
 void fd_query_context_init(struct pipe_context *pctx);
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.c b/src/gallium/drivers/freedreno/freedreno_query_acc.c
index 2cb1a4d..a7420d6 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_acc.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.c
@@ -49,6 +49,7 @@
 	pipe_resource_reference(&aq->prsc, NULL);
 	list_del(&aq->node);
 
+	free(aq->query_data);
 	free(aq);
 }
 
@@ -69,7 +70,7 @@
 	fd_bo_cpu_prep(rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_WRITE);
 
 	map = fd_bo_map(rsc->bo);
-	memset(map, 0, aq->provider->size);
+	memset(map, 0, aq->size);
 	fd_bo_cpu_fini(rsc->bo);
 }
 
@@ -157,7 +158,7 @@
 	fd_bo_cpu_prep(rsc->bo, ctx->pipe, DRM_FREEDRENO_PREP_READ);
 
 	void *ptr = fd_bo_map(rsc->bo);
-	p->result(ctx, ptr, result);
+	p->result(aq, ptr, result);
 	fd_bo_cpu_fini(rsc->bo);
 
 	return true;
@@ -171,14 +172,11 @@
 };
 
 struct fd_query *
-fd_acc_create_query(struct fd_context *ctx, unsigned query_type)
+fd_acc_create_query2(struct fd_context *ctx, unsigned query_type,
+		const struct fd_acc_sample_provider *provider)
 {
 	struct fd_acc_query *aq;
 	struct fd_query *q;
-	int idx = pidx(query_type);
-
-	if ((idx < 0) || !ctx->acc_sample_providers[idx])
-		return NULL;
 
 	aq = CALLOC_STRUCT(fd_acc_query);
 	if (!aq)
@@ -186,7 +184,8 @@
 
 	DBG("%p: query_type=%u", aq, query_type);
 
-	aq->provider = ctx->acc_sample_providers[idx];
+	aq->provider = provider;
+	aq->size = provider->size;
 
 	list_inithead(&aq->node);
 
@@ -197,6 +196,18 @@
 	return q;
 }
 
+struct fd_query *
+fd_acc_create_query(struct fd_context *ctx, unsigned query_type)
+{
+	int idx = pidx(query_type);
+
+	if ((idx < 0) || !ctx->acc_sample_providers[idx])
+		return NULL;
+
+	return fd_acc_create_query2(ctx, query_type,
+			ctx->acc_sample_providers[idx]);
+}
+
 void
 fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage)
 {
diff --git a/src/gallium/drivers/freedreno/freedreno_query_acc.h b/src/gallium/drivers/freedreno/freedreno_query_acc.h
index f8dfabc..3bbffe4 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_acc.h
+++ b/src/gallium/drivers/freedreno/freedreno_query_acc.h
@@ -67,7 +67,7 @@
 	void (*resume)(struct fd_acc_query *aq, struct fd_batch *batch);
 	void (*pause)(struct fd_acc_query *aq, struct fd_batch *batch);
 
-	void (*result)(struct fd_context *ctx, void *buf,
+	void (*result)(struct fd_acc_query *aq, void *buf,
 			union pipe_query_result *result);
 };
 
@@ -77,11 +77,18 @@
 	const struct fd_acc_sample_provider *provider;
 
 	struct pipe_resource *prsc;
-	unsigned offset;
+
+	/* usually the same as provider->size but for batch queries we
+	 * need to calculate the size dynamically when the query is
+	 * allocated:
+	 */
+	unsigned size;
 
 	struct list_head node;   /* list-node in ctx->active_acc_queries */
 
 	int no_wait_cnt;         /* see fd_acc_get_query_result() */
+
+	void *query_data;        /* query specific data */
 };
 
 static inline struct fd_acc_query *
@@ -91,6 +98,8 @@
 }
 
 struct fd_query * fd_acc_create_query(struct fd_context *ctx, unsigned query_type);
+struct fd_query * fd_acc_create_query2(struct fd_context *ctx, unsigned query_type,
+		const struct fd_acc_sample_provider *provider);
 void fd_acc_query_set_stage(struct fd_batch *batch, enum fd_render_stage stage);
 void fd_acc_query_register_provider(struct pipe_context *pctx,
 		const struct fd_acc_sample_provider *provider);
diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c
index 080b2b1..13ab0e8 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_sw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c
@@ -73,12 +73,16 @@
 		return ctx->stats.staging_uploads;
 	case FD_QUERY_SHADOW_UPLOADS:
 		return ctx->stats.shadow_uploads;
+	case FD_QUERY_VS_REGS:
+		return ctx->stats.vs_regs;
+	case FD_QUERY_FS_REGS:
+		return ctx->stats.fs_regs;
 	}
 	return 0;
 }
 
 static bool
-is_rate_query(struct fd_query *q)
+is_time_rate_query(struct fd_query *q)
 {
 	switch (q->type) {
 	case FD_QUERY_BATCH_TOTAL:
@@ -94,14 +98,29 @@
 	}
 }
 
+static bool
+is_draw_rate_query(struct fd_query *q)
+{
+	switch (q->type) {
+	case FD_QUERY_VS_REGS:
+	case FD_QUERY_FS_REGS:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static boolean
 fd_sw_begin_query(struct fd_context *ctx, struct fd_query *q)
 {
 	struct fd_sw_query *sq = fd_sw_query(q);
 	sq->begin_value = read_counter(ctx, q->type);
-	if (is_rate_query(q))
+	if (is_time_rate_query(q)) {
 		sq->begin_time = os_time_get();
-   return true;
+	} else if (is_draw_rate_query(q)) {
+		sq->begin_time = ctx->stats.draw_calls;
+	}
+	return true;
 }
 
 static void
@@ -109,8 +128,11 @@
 {
 	struct fd_sw_query *sq = fd_sw_query(q);
 	sq->end_value = read_counter(ctx, q->type);
-	if (is_rate_query(q))
+	if (is_time_rate_query(q)) {
 		sq->end_time = os_time_get();
+	} else if (is_draw_rate_query(q)) {
+		sq->end_time = ctx->stats.draw_calls;
+	}
 }
 
 static boolean
@@ -121,10 +143,14 @@
 
 	result->u64 = sq->end_value - sq->begin_value;
 
-	if (is_rate_query(q)) {
+	if (is_time_rate_query(q)) {
 		double fps = (result->u64 * 1000000) /
 				(double)(sq->end_time - sq->begin_time);
 		result->u64 = (uint64_t)fps;
+	} else if (is_draw_rate_query(q)) {
+		double avg = ((double)result->u64) /
+				(double)(sq->end_time - sq->begin_time);
+		result->f = avg;
 	}
 
 	return true;
@@ -154,6 +180,8 @@
 	case FD_QUERY_BATCH_RESTORE:
 	case FD_QUERY_STAGING_UPLOADS:
 	case FD_QUERY_SHADOW_UPLOADS:
+	case FD_QUERY_VS_REGS:
+	case FD_QUERY_FS_REGS:
 		break;
 	default:
 		return NULL;
diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c
index 7e6de8c..3fbf500 100644
--- a/src/gallium/drivers/freedreno/freedreno_resource.c
+++ b/src/gallium/drivers/freedreno/freedreno_resource.c
@@ -148,6 +148,7 @@
 	 */
 	if (!pctx->screen->is_format_supported(pctx->screen,
 			prsc->format, prsc->target, prsc->nr_samples,
+			prsc->nr_storage_samples,
 			PIPE_BIND_RENDER_TARGET))
 		fallback = true;
 
@@ -837,6 +838,8 @@
 
 	rsc->internal_format = format;
 	rsc->cpp = util_format_get_blocksize(format);
+	prsc->nr_samples = MAX2(1, prsc->nr_samples);
+	rsc->cpp *= prsc->nr_samples;
 
 	assert(rsc->cpp);
 
@@ -919,8 +922,9 @@
 	if (!rsc->bo)
 		goto fail;
 
+	prsc->nr_samples = MAX2(1, prsc->nr_samples);
 	rsc->internal_format = tmpl->format;
-	rsc->cpp = util_format_get_blocksize(tmpl->format);
+	rsc->cpp = prsc->nr_samples * util_format_get_blocksize(tmpl->format);
 	slice->pitch = handle->stride / rsc->cpp;
 	slice->offset = handle->offset;
 	slice->size0 = handle->stride * prsc->height0;
@@ -1030,14 +1034,6 @@
 	struct pipe_blit_info info = *blit_info;
 	bool discard = false;
 
-	if (info.src.resource->nr_samples > 1 &&
-			info.dst.resource->nr_samples <= 1 &&
-			!util_format_is_depth_or_stencil(info.src.resource->format) &&
-			!util_format_is_pure_integer(info.src.resource->format)) {
-		DBG("color resolve unimplemented");
-		return;
-	}
-
 	if (info.render_condition_enable && !fd_render_condition_check(pctx))
 		return;
 
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index f338d75..4d54446 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -84,6 +84,7 @@
 		{"noblit",    FD_DBG_NOBLIT, "Disable blitter (fallback to generic blit path)"},
 		{"hiprio",    FD_DBG_HIPRIO, "Force high-priority context"},
 		{"ttile",     FD_DBG_TTILE,  "Enable texture tiling (a5xx)"},
+		{"perfcntrs", FD_DBG_PERFC,  "Expose performance counters"},
 		DEBUG_NAMED_VALUE_END
 };
 
@@ -93,6 +94,17 @@
 bool fd_binning_enabled = true;
 static bool glsl120 = false;
 
+static const struct debug_named_value shader_debug_options[] = {
+		{"vs", FD_DBG_SHADER_VS, "Print shader disasm for vertex shaders"},
+		{"fs", FD_DBG_SHADER_FS, "Print shader disasm for fragment shaders"},
+		{"cs", FD_DBG_SHADER_CS, "Print shader disasm for compute shaders"},
+		DEBUG_NAMED_VALUE_END
+};
+
+DEBUG_GET_ONCE_FLAGS_OPTION(fd_shader_debug, "FD_SHADER_DEBUG", shader_debug_options, 0)
+
+enum fd_shader_debug fd_shader_debug = 0;
+
 static const char *
 fd_screen_get_name(struct pipe_screen *pscreen)
 {
@@ -151,6 +163,7 @@
 
 	ralloc_free(screen->compiler);
 
+	free(screen->perfcntr_queries);
 	free(screen);
 }
 
@@ -197,7 +210,6 @@
 	case PIPE_CAP_SHADER_STENCIL_EXPORT:
 	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
-	case PIPE_CAP_TEXTURE_MULTISAMPLE:
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_PCI_GROUP:
@@ -216,11 +228,16 @@
 	case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
 	case PIPE_CAP_CONDITIONAL_RENDER:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
-	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 	case PIPE_CAP_CLIP_HALFZ:
 		return is_a3xx(screen) || is_a4xx(screen) || is_a5xx(screen);
 
+	case PIPE_CAP_FAKE_SW_MSAA:
+		return !fd_screen_get_param(pscreen, PIPE_CAP_TEXTURE_MULTISAMPLE);
+
+	case PIPE_CAP_TEXTURE_MULTISAMPLE:
+		return is_a5xx(screen);
+
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
 		return is_a3xx(screen) || is_a4xx(screen);
 
@@ -261,6 +278,7 @@
 		return 64;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
 		if (glsl120)
 			return 120;
 		return is_ir3(screen) ? 140 : 120;
@@ -336,10 +354,18 @@
 	case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
 	case PIPE_CAP_TILE_RASTER_ORDER:
 	case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
 	case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
 	case PIPE_CAP_FENCE_SIGNAL:
 	case PIPE_CAP_CONSTBUF0_FLAGS:
 	case PIPE_CAP_PACKED_UNIFORMS:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+	case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
 		return 0;
 
 	case PIPE_CAP_CONTEXT_PRIORITY_MASK:
@@ -479,6 +505,10 @@
 		return 16.0f;
 	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 		return 15.0f;
+	case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+		return 0.0f;
 	}
 	debug_printf("unknown paramf %d\n", param);
 	return 0;
@@ -550,6 +580,10 @@
 	case PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
+	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
+	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
 		return 0;
 	case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
 		return 1;
@@ -577,11 +611,8 @@
 		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
-	case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
-	case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
-	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
-	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
-		return 0;
+	case PIPE_SHADER_CAP_SCALAR_ISA:
+		return is_ir3(screen) ? 1 : 0;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
 	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		if (is_a5xx(screen)) {
@@ -719,12 +750,12 @@
 {
 	whandle->stride = stride;
 
-	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+	if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
 		return fd_bo_get_name(bo, &whandle->handle) == 0;
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+	} else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
 		whandle->handle = fd_bo_handle(bo);
 		return TRUE;
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+	} else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
 		whandle->handle = fd_bo_dmabuf(bo);
 		return TRUE;
 	} else {
@@ -739,11 +770,11 @@
 	struct fd_screen *screen = fd_screen(pscreen);
 	struct fd_bo *bo;
 
-	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+	if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
 		bo = fd_bo_from_name(screen->dev, whandle->handle);
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+	} else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
 		bo = fd_bo_from_handle(screen->dev, whandle->handle, 0);
-	} else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+	} else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
 		bo = fd_bo_from_dmabuf(screen->dev, whandle->handle);
 	} else {
 		DBG("Attempt to import unsupported handle type %d", whandle->type);
@@ -766,6 +797,7 @@
 	uint64_t val;
 
 	fd_mesa_debug = debug_get_option_fd_mesa_debug();
+	fd_shader_debug = debug_get_option_fd_shader_debug();
 
 	if (fd_mesa_debug & FD_DBG_NOBIN)
 		fd_binning_enabled = false;
@@ -858,6 +890,7 @@
 	 * send a patch ;-)
 	 */
 	switch (screen->gpu_id) {
+	case 205:
 	case 220:
 		fd2_screen_init(pscreen);
 		break;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 6be739a..251a26f 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -38,6 +38,7 @@
 #include "os/os_thread.h"
 
 #include "freedreno_batch_cache.h"
+#include "freedreno_perfcntr.h"
 #include "freedreno_util.h"
 
 struct fd_bo;
@@ -71,6 +72,13 @@
 	uint32_t priority_mask;
 	bool has_timestamp;
 
+	unsigned num_perfcntr_groups;
+	const struct fd_perfcntr_group *perfcntr_groups;
+
+	/* generated at startup from the perfcntr groups: */
+	unsigned num_perfcntr_queries;
+	struct pipe_driver_query_info *perfcntr_queries;
+
 	void *compiler;          /* currently unused for a2xx */
 
 	struct fd_device *dev;
@@ -106,6 +114,12 @@
 
 struct pipe_screen * fd_screen_create(struct fd_device *dev);
 
+static inline boolean
+is_a20x(struct fd_screen *screen)
+{
+	return (screen->gpu_id >= 200) && (screen->gpu_id < 210);
+}
+
 /* is a3xx patch revision 0? */
 /* TODO a306.0 probably doesn't need this.. be more clever?? */
 static inline boolean
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 05717da..7d5ca25 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -103,12 +103,10 @@
 	 */
 	if (unlikely(!cb)) {
 		so->enabled_mask &= ~(1 << index);
-		so->dirty_mask &= ~(1 << index);
 		return;
 	}
 
 	so->enabled_mask |= 1 << index;
-	so->dirty_mask |= 1 << index;
 	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_CONST;
 	ctx->dirty |= FD_DIRTY_CONST;
 }
@@ -157,7 +155,6 @@
 		so->enabled_mask &= ~mask;
 	}
 
-	so->dirty_mask |= mask;
 	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_SSBO;
 }
 
@@ -204,7 +201,6 @@
 		so->enabled_mask &= ~mask;
 	}
 
-	so->dirty_mask |= mask;
 	ctx->dirty_shader[shader] |= FD_DIRTY_SHADER_IMAGE;
 }
 
@@ -215,6 +211,15 @@
 	struct fd_context *ctx = fd_context(pctx);
 	struct pipe_framebuffer_state *cso;
 
+	cso = &ctx->batch->framebuffer;
+
+	if (util_framebuffer_state_equal(cso, framebuffer))
+		return;
+
+	util_copy_framebuffer_state(cso, framebuffer);
+
+	cso->samples = util_framebuffer_get_num_samples(cso);
+
 	if (ctx->screen->reorder) {
 		struct fd_batch *batch, *old_batch = NULL;
 
@@ -243,12 +248,9 @@
 		DBG("%d: cbufs[0]=%p, zsbuf=%p", ctx->batch->needs_flush,
 				framebuffer->cbufs[0], framebuffer->zsbuf);
 		fd_batch_flush(ctx->batch, false, false);
+		util_copy_framebuffer_state(&ctx->batch->framebuffer, cso);
 	}
 
-	cso = &ctx->batch->framebuffer;
-
-	util_copy_framebuffer_state(cso, framebuffer);
-
 	ctx->dirty |= FD_DIRTY_FRAMEBUFFER;
 
 	ctx->disabled_scissor.minx = 0;
diff --git a/src/gallium/drivers/freedreno/freedreno_texture.c b/src/gallium/drivers/freedreno/freedreno_texture.c
index 1487f74..5ba851f 100644
--- a/src/gallium/drivers/freedreno/freedreno_texture.c
+++ b/src/gallium/drivers/freedreno/freedreno_texture.c
@@ -70,6 +70,7 @@
 		unsigned start, unsigned nr, struct pipe_sampler_view **views)
 {
 	unsigned i;
+	unsigned samplers = 0;
 
 	for (i = 0; i < nr; i++) {
 		struct pipe_sampler_view *view = views ? views[i] : NULL;
@@ -82,6 +83,13 @@
 	}
 
 	tex->num_textures = util_last_bit(tex->valid_textures);
+
+	for (i = 0; i < tex->num_textures; i++) {
+		uint nr_samples = tex->textures[i]->texture->nr_samples;
+		samplers |= (nr_samples >> 1) << (i * 2);
+	}
+
+	tex->samples = samplers;
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index bfeec4c..10151aa 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -84,6 +84,7 @@
 #define FD_DBG_NOBLIT  0x80000
 #define FD_DBG_HIPRIO 0x100000
 #define FD_DBG_TTILE  0x200000
+#define FD_DBG_PERFC  0x400000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
@@ -112,6 +113,19 @@
 			(instances         << 24);
 }
 
+static inline uint32_t DRAW_A20X(enum pc_di_primtype prim_type,
+		enum pc_di_src_sel source_select, enum pc_di_index_size index_size,
+		enum pc_di_vis_cull_mode vis_cull_mode,
+		uint16_t count)
+{	/* Pack the draw parameters into a single 32-bit word and return it. */
+	return (prim_type         << 0) |
+			(source_select     << 6) |
+			((index_size & 1)  << 11) |	/* bit 0 of index_size lands at bit 11 */
+			((index_size >> 1) << 13) |	/* remaining index_size bits start at bit 13 */
+			(vis_cull_mode     << 9) |
+			((uint32_t)count   << 16);	/* widen first: uint16_t promotes to int, and int << 16 overflows for count >= 0x8000 */
+}
+
 /* for tracking cmdstream positions that need to be patched: */
 struct fd_cs_patch {
 	uint32_t *cs;
@@ -183,7 +197,6 @@
 #define LOG_DWORDS 0
 
 static inline void emit_marker(struct fd_ringbuffer *ring, int scratch_idx);
-static inline void emit_marker5(struct fd_ringbuffer *ring, int scratch_idx);
 
 static inline void
 OUT_RING(struct fd_ringbuffer *ring, uint32_t data)
@@ -343,6 +356,9 @@
 static inline void
 __OUT_IB(struct fd_ringbuffer *ring, bool prefetch, struct fd_ringbuffer *target)
 {
+	if (target->cur == target->start)
+		return;
+
 	unsigned count = fd_ringbuffer_cmd_count(target);
 
 	debug_assert(__gpu_id(ring) < 500);
@@ -370,15 +386,10 @@
 static inline void
 __OUT_IB5(struct fd_ringbuffer *ring, struct fd_ringbuffer *target)
 {
-	unsigned count = fd_ringbuffer_cmd_count(target);
+	if (target->cur == target->start)
+		return;
 
-	/* for debug after a lock up, write a unique counter value
-	 * to scratch6 for each IB, to make it easier to match up
-	 * register dumps to cmdstream.  The combination of IB and
-	 * DRAW (scratch7) is enough to "triangulate" the particular
-	 * draw that caused lockup.
-	 */
-	emit_marker5(ring, 6);
+	unsigned count = fd_ringbuffer_cmd_count(target);
 
 	for (unsigned i = 0; i < count; i++) {
 		uint32_t dwords;
@@ -387,8 +398,6 @@
 		assert(dwords > 0);
 		OUT_RING(ring, dwords);
 	}
-
-	emit_marker5(ring, 6);
 }
 
 /* CP_SCRATCH_REG4 is used to hold base address for query results: */
@@ -409,16 +418,6 @@
 	OUT_RING(ring, ++marker_cnt);
 }
 
-static inline void
-emit_marker5(struct fd_ringbuffer *ring, int scratch_idx)
-{
-	extern unsigned marker_cnt;
-//XXX	unsigned reg = REG_A5XX_CP_SCRATCH_REG(scratch_idx);
-	unsigned reg = 0x00000b78 + scratch_idx;
-	OUT_PKT4(ring, reg, 1);
-	OUT_RING(ring, ++marker_cnt);
-}
-
 /* helper to get numeric value from environment variable..  mostly
  * just leaving this here because it is helpful to brute-force figure
  * out unknown formats, etc, which blob driver does not support:
@@ -452,6 +451,22 @@
 #define BIT(bit) (1u << bit)
 
 /*
+ * a3xx+ helpers:
+ */
+
+static inline enum a3xx_msaa_samples
+fd_msaa_samples(unsigned samples)
+{	/* Map a sample count of 1, 2 or 4 to the matching a3xx_msaa_samples value. */
+	switch (samples) {
+	default:
+		debug_assert(0);	/* any other count: no break here, so if the assert returns we fall through to case 1 */
+	case 1: return MSAA_ONE;
+	case 2: return MSAA_TWO;
+	case 4: return MSAA_FOUR;
+	}
+}
+
+/*
  * a4xx+ helpers:
  */
 
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index 5499e19..247175f 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdint.h>
+#include <stdbool.h>
 #include <string.h>
 #include <assert.h>
 
@@ -68,8 +69,16 @@
 		[TYPE_S8]  = "s8",
 };
 
-static void print_reg(reg_t reg, bool full, bool r, bool c, bool im,
-		bool neg, bool abs, bool addr_rel)
+struct disasm_ctx {	/* state for one disasm_a3xx() run, handed to the print_*() helpers */
+	FILE *out;	/* stream that all disassembly text is fprintf()'d to */
+	int level;	/* index into levels[] used for the verbose per-line prefix */
+
+	/* current instruction repeat flag, refreshed by print_instr() from instr_repeat(): */
+	unsigned repeat;
+};
+
+static void print_reg(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
 {
 	const char type = c ? 'c' : 'r';
 
@@ -77,49 +86,46 @@
 	// by libllvm-a3xx for easy diffing..
 
 	if (abs && neg)
-		printf("(absneg)");
+		fprintf(ctx->out, "(absneg)");
 	else if (neg)
-		printf("(neg)");
+		fprintf(ctx->out, "(neg)");
 	else if (abs)
-		printf("(abs)");
+		fprintf(ctx->out, "(abs)");
 
 	if (r)
-		printf("(r)");
+		fprintf(ctx->out, "(r)");
 
 	if (im) {
-		printf("%d", reg.iim_val);
+		fprintf(ctx->out, "%d", reg.iim_val);
 	} else if (addr_rel) {
 		/* I would just use %+d but trying to make it diff'able with
 		 * libllvm-a3xx...
 		 */
 		if (reg.iim_val < 0)
-			printf("%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
+			fprintf(ctx->out, "%s%c<a0.x - %d>", full ? "" : "h", type, -reg.iim_val);
 		else if (reg.iim_val > 0)
-			printf("%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
+			fprintf(ctx->out, "%s%c<a0.x + %d>", full ? "" : "h", type, reg.iim_val);
 		else
-			printf("%s%c<a0.x>", full ? "" : "h", type);
+			fprintf(ctx->out, "%s%c<a0.x>", full ? "" : "h", type);
 	} else if ((reg.num == REG_A0) && !c) {
-		printf("a0.%c", component[reg.comp]);
+		fprintf(ctx->out, "a0.%c", component[reg.comp]);
 	} else if ((reg.num == REG_P0) && !c) {
-		printf("p0.%c", component[reg.comp]);
+		fprintf(ctx->out, "p0.%c", component[reg.comp]);
 	} else {
-		printf("%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
+		fprintf(ctx->out, "%s%c%d.%c", full ? "" : "h", type, reg.num & 0x3f, component[reg.comp]);
 	}
 }
 
 
-/* current instruction repeat flag: */
-static unsigned repeat;
-
-static void print_reg_dst(reg_t reg, bool full, bool addr_rel)
+static void print_reg_dst(struct disasm_ctx *ctx, reg_t reg, bool full, bool addr_rel)
 {
-	print_reg(reg, full, false, false, false, false, false, addr_rel);
+	print_reg(ctx, reg, full, false, false, false, false, false, addr_rel);
 }
 
-static void print_reg_src(reg_t reg, bool full, bool r, bool c, bool im,
-		bool neg, bool abs, bool addr_rel)
+static void print_reg_src(struct disasm_ctx *ctx, reg_t reg, bool full, bool r,
+		bool c, bool im, bool neg, bool abs, bool addr_rel)
 {
-	print_reg(reg, full, r, c, im, neg, abs, addr_rel);
+	print_reg(ctx, reg, full, r, c, im, neg, abs, addr_rel);
 }
 
 /* TODO switch to using reginfo struct everywhere, since more readable
@@ -137,100 +143,100 @@
 	bool addr_rel;
 };
 
-static void print_src(struct reginfo *info)
+static void print_src(struct disasm_ctx *ctx, struct reginfo *info)
 {
-	print_reg_src(info->reg, info->full, info->r, info->c, info->im,
+	print_reg_src(ctx, info->reg, info->full, info->r, info->c, info->im,
 			info->neg, info->abs, info->addr_rel);
 }
 
-//static void print_dst(struct reginfo *info)
+//static void print_dst(struct disasm_ctx *ctx, struct reginfo *info)
 //{
-//	print_reg_dst(info->reg, info->full, info->addr_rel);
+//	print_reg_dst(ctx, info->reg, info->full, info->addr_rel);
 //}
 
-static void print_instr_cat0(instr_t *instr)
+static void print_instr_cat0(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat0_t *cat0 = &instr->cat0;
 
 	switch (cat0->opc) {
 	case OPC_KILL:
-		printf(" %sp0.%c", cat0->inv ? "!" : "",
+		fprintf(ctx->out, " %sp0.%c", cat0->inv ? "!" : "",
 				component[cat0->comp]);
 		break;
 	case OPC_BR:
-		printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
+		fprintf(ctx->out, " %sp0.%c, #%d", cat0->inv ? "!" : "",
 				component[cat0->comp], cat0->a3xx.immed);
 		break;
 	case OPC_JUMP:
 	case OPC_CALL:
-		printf(" #%d", cat0->a3xx.immed);
+		fprintf(ctx->out, " #%d", cat0->a3xx.immed);
 		break;
 	}
 
 	if ((debug & PRINT_VERBOSE) && (cat0->dummy2|cat0->dummy3|cat0->dummy4))
-		printf("\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
+		fprintf(ctx->out, "\t{0: %x,%x,%x}", cat0->dummy2, cat0->dummy3, cat0->dummy4);
 }
 
-static void print_instr_cat1(instr_t *instr)
+static void print_instr_cat1(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat1_t *cat1 = &instr->cat1;
 
 	if (cat1->ul)
-		printf("(ul)");
+		fprintf(ctx->out, "(ul)");
 
 	if (cat1->src_type == cat1->dst_type) {
 		if ((cat1->src_type == TYPE_S16) && (((reg_t)cat1->dst).num == REG_A0)) {
 			/* special case (nmemonic?): */
-			printf("mova");
+			fprintf(ctx->out, "mova");
 		} else {
-			printf("mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+			fprintf(ctx->out, "mov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
 		}
 	} else {
-		printf("cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
+		fprintf(ctx->out, "cov.%s%s", type[cat1->src_type], type[cat1->dst_type]);
 	}
 
-	printf(" ");
+	fprintf(ctx->out, " ");
 
 	if (cat1->even)
-		printf("(even)");
+		fprintf(ctx->out, "(even)");
 
 	if (cat1->pos_inf)
-		printf("(pos_infinity)");
+		fprintf(ctx->out, "(pos_infinity)");
 
-	print_reg_dst((reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
+	print_reg_dst(ctx, (reg_t)(cat1->dst), type_size(cat1->dst_type) == 32,
 			cat1->dst_rel);
 
-	printf(", ");
+	fprintf(ctx->out, ", ");
 
 	/* ugg, have to special case this.. vs print_reg().. */
 	if (cat1->src_im) {
 		if (type_float(cat1->src_type))
-			printf("(%f)", cat1->fim_val);
+			fprintf(ctx->out, "(%f)", cat1->fim_val);
 		else if (type_uint(cat1->src_type))
-			printf("0x%08x", cat1->uim_val);
+			fprintf(ctx->out, "0x%08x", cat1->uim_val);
 		else
-			printf("%d", cat1->iim_val);
+			fprintf(ctx->out, "%d", cat1->iim_val);
 	} else if (cat1->src_rel && !cat1->src_c) {
 		/* I would just use %+d but trying to make it diff'able with
 		 * libllvm-a3xx...
 		 */
 		char type = cat1->src_rel_c ? 'c' : 'r';
 		if (cat1->off < 0)
-			printf("%c<a0.x - %d>", type, -cat1->off);
+			fprintf(ctx->out, "%c<a0.x - %d>", type, -cat1->off);
 		else if (cat1->off > 0)
-			printf("%c<a0.x + %d>", type, cat1->off);
+			fprintf(ctx->out, "%c<a0.x + %d>", type, cat1->off);
 		else
-			printf("%c<a0.x>", type);
+			fprintf(ctx->out, "%c<a0.x>", type);
 	} else {
-		print_reg_src((reg_t)(cat1->src), type_size(cat1->src_type) == 32,
+		print_reg_src(ctx, (reg_t)(cat1->src), type_size(cat1->src_type) == 32,
 				cat1->src_r, cat1->src_c, cat1->src_im, false, false, false);
 	}
 
 	if ((debug & PRINT_VERBOSE) && (cat1->must_be_0))
-		printf("\t{1: %x}", cat1->must_be_0);
+		fprintf(ctx->out, "\t{1: %x}", cat1->must_be_0);
 }
 
-static void print_instr_cat2(instr_t *instr)
+static void print_instr_cat2(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat2_t *cat2 = &instr->cat2;
 	static const char *cond[] = {
@@ -250,26 +256,26 @@
 	case OPC_CMPV_F:
 	case OPC_CMPV_U:
 	case OPC_CMPV_S:
-		printf(".%s", cond[cat2->cond]);
+		fprintf(ctx->out, ".%s", cond[cat2->cond]);
 		break;
 	}
 
-	printf(" ");
+	fprintf(ctx->out, " ");
 	if (cat2->ei)
-		printf("(ei)");
-	print_reg_dst((reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
-	printf(", ");
+		fprintf(ctx->out, "(ei)");
+	print_reg_dst(ctx, (reg_t)(cat2->dst), cat2->full ^ cat2->dst_half, false);
+	fprintf(ctx->out, ", ");
 
 	if (cat2->c1.src1_c) {
-		print_reg_src((reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
+		print_reg_src(ctx, (reg_t)(cat2->c1.src1), cat2->full, cat2->src1_r,
 				cat2->c1.src1_c, cat2->src1_im, cat2->src1_neg,
 				cat2->src1_abs, false);
 	} else if (cat2->rel1.src1_rel) {
-		print_reg_src((reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
+		print_reg_src(ctx, (reg_t)(cat2->rel1.src1), cat2->full, cat2->src1_r,
 				cat2->rel1.src1_c, cat2->src1_im, cat2->src1_neg,
 				cat2->src1_abs, cat2->rel1.src1_rel);
 	} else {
-		print_reg_src((reg_t)(cat2->src1), cat2->full, cat2->src1_r,
+		print_reg_src(ctx, (reg_t)(cat2->src1), cat2->full, cat2->src1_r,
 				false, cat2->src1_im, cat2->src1_neg,
 				cat2->src1_abs, false);
 	}
@@ -292,17 +298,17 @@
 		/* these only have one src reg */
 		break;
 	default:
-		printf(", ");
+		fprintf(ctx->out, ", ");
 		if (cat2->c2.src2_c) {
-			print_reg_src((reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
+			print_reg_src(ctx, (reg_t)(cat2->c2.src2), cat2->full, cat2->src2_r,
 					cat2->c2.src2_c, cat2->src2_im, cat2->src2_neg,
 					cat2->src2_abs, false);
 		} else if (cat2->rel2.src2_rel) {
-			print_reg_src((reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
+			print_reg_src(ctx, (reg_t)(cat2->rel2.src2), cat2->full, cat2->src2_r,
 					cat2->rel2.src2_c, cat2->src2_im, cat2->src2_neg,
 					cat2->src2_abs, cat2->rel2.src2_rel);
 		} else {
-			print_reg_src((reg_t)(cat2->src2), cat2->full, cat2->src2_r,
+			print_reg_src(ctx, (reg_t)(cat2->src2), cat2->full, cat2->src2_r,
 					false, cat2->src2_im, cat2->src2_neg,
 					cat2->src2_abs, false);
 		}
@@ -310,74 +316,74 @@
 	}
 }
 
-static void print_instr_cat3(instr_t *instr)
+static void print_instr_cat3(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat3_t *cat3 = &instr->cat3;
 	bool full = instr_cat3_full(cat3);
 
-	printf(" ");
-	print_reg_dst((reg_t)(cat3->dst), full ^ cat3->dst_half, false);
-	printf(", ");
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat3->dst), full ^ cat3->dst_half, false);
+	fprintf(ctx->out, ", ");
 	if (cat3->c1.src1_c) {
-		print_reg_src((reg_t)(cat3->c1.src1), full,
+		print_reg_src(ctx, (reg_t)(cat3->c1.src1), full,
 				cat3->src1_r, cat3->c1.src1_c, false, cat3->src1_neg,
 				false, false);
 	} else if (cat3->rel1.src1_rel) {
-		print_reg_src((reg_t)(cat3->rel1.src1), full,
+		print_reg_src(ctx, (reg_t)(cat3->rel1.src1), full,
 				cat3->src1_r, cat3->rel1.src1_c, false, cat3->src1_neg,
 				false, cat3->rel1.src1_rel);
 	} else {
-		print_reg_src((reg_t)(cat3->src1), full,
+		print_reg_src(ctx, (reg_t)(cat3->src1), full,
 				cat3->src1_r, false, false, cat3->src1_neg,
 				false, false);
 	}
-	printf(", ");
-	print_reg_src((reg_t)cat3->src2, full,
+	fprintf(ctx->out, ", ");
+	print_reg_src(ctx, (reg_t)cat3->src2, full,
 			cat3->src2_r, cat3->src2_c, false, cat3->src2_neg,
 			false, false);
-	printf(", ");
+	fprintf(ctx->out, ", ");
 	if (cat3->c2.src3_c) {
-		print_reg_src((reg_t)(cat3->c2.src3), full,
+		print_reg_src(ctx, (reg_t)(cat3->c2.src3), full,
 				cat3->src3_r, cat3->c2.src3_c, false, cat3->src3_neg,
 				false, false);
 	} else if (cat3->rel2.src3_rel) {
-		print_reg_src((reg_t)(cat3->rel2.src3), full,
+		print_reg_src(ctx, (reg_t)(cat3->rel2.src3), full,
 				cat3->src3_r, cat3->rel2.src3_c, false, cat3->src3_neg,
 				false, cat3->rel2.src3_rel);
 	} else {
-		print_reg_src((reg_t)(cat3->src3), full,
+		print_reg_src(ctx, (reg_t)(cat3->src3), full,
 				cat3->src3_r, false, false, cat3->src3_neg,
 				false, false);
 	}
 }
 
-static void print_instr_cat4(instr_t *instr)
+static void print_instr_cat4(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat4_t *cat4 = &instr->cat4;
 
-	printf(" ");
-	print_reg_dst((reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
-	printf(", ");
+	fprintf(ctx->out, " ");
+	print_reg_dst(ctx, (reg_t)(cat4->dst), cat4->full ^ cat4->dst_half, false);
+	fprintf(ctx->out, ", ");
 
 	if (cat4->c.src_c) {
-		print_reg_src((reg_t)(cat4->c.src), cat4->full,
+		print_reg_src(ctx, (reg_t)(cat4->c.src), cat4->full,
 				cat4->src_r, cat4->c.src_c, cat4->src_im,
 				cat4->src_neg, cat4->src_abs, false);
 	} else if (cat4->rel.src_rel) {
-		print_reg_src((reg_t)(cat4->rel.src), cat4->full,
+		print_reg_src(ctx, (reg_t)(cat4->rel.src), cat4->full,
 				cat4->src_r, cat4->rel.src_c, cat4->src_im,
 				cat4->src_neg, cat4->src_abs, cat4->rel.src_rel);
 	} else {
-		print_reg_src((reg_t)(cat4->src), cat4->full,
+		print_reg_src(ctx, (reg_t)(cat4->src), cat4->full,
 				cat4->src_r, false, cat4->src_im,
 				cat4->src_neg, cat4->src_abs, false);
 	}
 
 	if ((debug & PRINT_VERBOSE) && (cat4->dummy1|cat4->dummy2))
-		printf("\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
+		fprintf(ctx->out, "\t{4: %x,%x}", cat4->dummy1, cat4->dummy2);
 }
 
-static void print_instr_cat5(instr_t *instr)
+static void print_instr_cat5(struct disasm_ctx *ctx, instr_t *instr)
 {
 	static const struct {
 		bool src1, src2, samp, tex;
@@ -414,69 +420,69 @@
 	instr_cat5_t *cat5 = &instr->cat5;
 	int i;
 
-	if (cat5->is_3d)   printf(".3d");
-	if (cat5->is_a)    printf(".a");
-	if (cat5->is_o)    printf(".o");
-	if (cat5->is_p)    printf(".p");
-	if (cat5->is_s)    printf(".s");
-	if (cat5->is_s2en) printf(".s2en");
+	if (cat5->is_3d)   fprintf(ctx->out, ".3d");
+	if (cat5->is_a)    fprintf(ctx->out, ".a");
+	if (cat5->is_o)    fprintf(ctx->out, ".o");
+	if (cat5->is_p)    fprintf(ctx->out, ".p");
+	if (cat5->is_s)    fprintf(ctx->out, ".s");
+	if (cat5->is_s2en) fprintf(ctx->out, ".s2en");
 
-	printf(" ");
+	fprintf(ctx->out, " ");
 
 	switch (_OPC(5, cat5->opc)) {
 	case OPC_DSXPP_1:
 	case OPC_DSYPP_1:
 		break;
 	default:
-		printf("(%s)", type[cat5->type]);
+		fprintf(ctx->out, "(%s)", type[cat5->type]);
 		break;
 	}
 
-	printf("(");
+	fprintf(ctx->out, "(");
 	for (i = 0; i < 4; i++)
 		if (cat5->wrmask & (1 << i))
-			printf("%c", "xyzw"[i]);
-	printf(")");
+			fprintf(ctx->out, "%c", "xyzw"[i]);
+	fprintf(ctx->out, ")");
 
-	print_reg_dst((reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
+	print_reg_dst(ctx, (reg_t)(cat5->dst), type_size(cat5->type) == 32, false);
 
 	if (info[cat5->opc].src1) {
-		printf(", ");
-		print_reg_src((reg_t)(cat5->src1), cat5->full, false, false, false,
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->src1), cat5->full, false, false, false,
 				false, false, false);
 	}
 
 	if (cat5->is_s2en) {
-		printf(", ");
-		print_reg_src((reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src2), cat5->full, false, false, false,
 				false, false, false);
-		printf(", ");
-		print_reg_src((reg_t)(cat5->s2en.src3), false, false, false, false,
+		fprintf(ctx->out, ", ");
+		print_reg_src(ctx, (reg_t)(cat5->s2en.src3), false, false, false, false,
 				false, false, false);
 	} else {
 		if (cat5->is_o || info[cat5->opc].src2) {
-			printf(", ");
-			print_reg_src((reg_t)(cat5->norm.src2), cat5->full,
+			fprintf(ctx->out, ", ");
+			print_reg_src(ctx, (reg_t)(cat5->norm.src2), cat5->full,
 					false, false, false, false, false, false);
 		}
 		if (info[cat5->opc].samp)
-			printf(", s#%d", cat5->norm.samp);
+			fprintf(ctx->out, ", s#%d", cat5->norm.samp);
 		if (info[cat5->opc].tex)
-			printf(", t#%d", cat5->norm.tex);
+			fprintf(ctx->out, ", t#%d", cat5->norm.tex);
 	}
 
 	if (debug & PRINT_VERBOSE) {
 		if (cat5->is_s2en) {
 			if ((debug & PRINT_VERBOSE) && (cat5->s2en.dummy1|cat5->s2en.dummy2|cat5->dummy2))
-				printf("\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
+				fprintf(ctx->out, "\t{5: %x,%x,%x}", cat5->s2en.dummy1, cat5->s2en.dummy2, cat5->dummy2);
 		} else {
 			if ((debug & PRINT_VERBOSE) && (cat5->norm.dummy1|cat5->dummy2))
-				printf("\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
+				fprintf(ctx->out, "\t{5: %x,%x}", cat5->norm.dummy1, cat5->dummy2);
 		}
 	}
 }
 
-static void print_instr_cat6(instr_t *instr)
+static void print_instr_cat6(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat6_t *cat6 = &instr->cat6;
 	char sd = 0, ss = 0;  /* dst/src address space */
@@ -522,20 +528,20 @@
 	case OPC_PREFETCH:
 		break;
 	case OPC_RESINFO:
-		printf(".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
 		break;
 	case OPC_LDGB:
-		printf(".%s", cat6->ldgb.typed ? "typed" : "untyped");
-		printf(".%dd", cat6->ldgb.d + 1);
-		printf(".%s", type[cat6->type]);
-		printf(".%d", cat6->ldgb.type_size + 1);
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
 		break;
 	case OPC_STGB:
 	case OPC_STIB:
-		printf(".%s", cat6->stgb.typed ? "typed" : "untyped");
-		printf(".%dd", cat6->stgb.d + 1);
-		printf(".%s", type[cat6->type]);
-		printf(".%d", cat6->stgb.type_size + 1);
+		fprintf(ctx->out, ".%s", cat6->stgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->stgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->stgb.type_size + 1);
 		break;
 	case OPC_ATOMIC_ADD:
 	case OPC_ATOMIC_SUB:
@@ -549,18 +555,18 @@
 	case OPC_ATOMIC_OR:
 	case OPC_ATOMIC_XOR:
 		ss = cat6->g ? 'g' : 'l';
-		printf(".%s", cat6->ldgb.typed ? "typed" : "untyped");
-		printf(".%dd", cat6->ldgb.d + 1);
-		printf(".%s", type[cat6->type]);
-		printf(".%d", cat6->ldgb.type_size + 1);
-		printf(".%c", ss);
+		fprintf(ctx->out, ".%s", cat6->ldgb.typed ? "typed" : "untyped");
+		fprintf(ctx->out, ".%dd", cat6->ldgb.d + 1);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%d", cat6->ldgb.type_size + 1);
+		fprintf(ctx->out, ".%c", ss);
 		break;
 	default:
 		dst.im = cat6->g && !cat6->dst_off;
-		printf(".%s", type[cat6->type]);
+		fprintf(ctx->out, ".%s", type[cat6->type]);
 		break;
 	}
-	printf(" ");
+	fprintf(ctx->out, " ");
 
 	switch (_OPC(6, cat6->opc)) {
 	case OPC_STG:
@@ -619,15 +625,15 @@
 		src3.im  = cat6->stgb.src3_im;
 		src3.full = true;
 
-		printf("g[%u], ", cat6->stgb.dst_ssbo);
-		print_src(&src1);
-		printf(", ");
-		print_src(&src2);
-		printf(", ");
-		print_src(&src3);
+		fprintf(ctx->out, "g[%u], ", cat6->stgb.dst_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src3);
 
 		if (debug & PRINT_VERBOSE)
-			printf(" (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
+			fprintf(ctx->out, " (pad0=%x, pad3=%x)", cat6->stgb.pad0, cat6->stgb.pad3);
 
 		return;
 	}
@@ -640,8 +646,8 @@
 		src2.im  = cat6->ldgb.src2_im;
 		dst.reg  = (reg_t)(cat6->ldgb.dst);
 
-		print_src(&dst);
-		printf(", ");
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
 		if (ss == 'g') {
 			struct reginfo src3;
 			memset(&src3, 0, sizeof(src3));
@@ -657,25 +663,25 @@
 			 * uvec2(offset * 4, 0).  Not sure the point of that.
 			 */
 
-			printf("g[%u], ", cat6->ldgb.src_ssbo);
-			print_src(&src1);  /* value */
-			printf(", ");
-			print_src(&src2);  /* offset/coords */
-			printf(", ");
-			print_src(&src3);  /* 64b byte offset.. */
+			fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+			print_src(ctx, &src1);  /* value */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src2);  /* offset/coords */
+			fprintf(ctx->out, ", ");
+			print_src(ctx, &src3);  /* 64b byte offset.. */
 
 			if (debug & PRINT_VERBOSE) {
-				printf(" (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
+				fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0,
 						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
 			}
 		} else { /* ss == 'l' */
-			printf("l[");
-			print_src(&src1);  /* simple byte offset */
-			printf("], ");
-			print_src(&src2);  /* value */
+			fprintf(ctx->out, "l[");
+			print_src(ctx, &src1);  /* simple byte offset */
+			fprintf(ctx->out, "], ");
+			print_src(ctx, &src2);  /* value */
 
 			if (debug & PRINT_VERBOSE) {
-				printf(" (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
+				fprintf(ctx->out, " (src3=%x, pad0=%x, pad3=%x, mustbe0=%x)",
 						cat6->ldgb.src3, cat6->ldgb.pad0,
 						cat6->ldgb.pad3, cat6->ldgb.mustbe0);
 			}
@@ -685,9 +691,9 @@
 	} else if (_OPC(6, cat6->opc) == OPC_RESINFO) {
 		dst.reg  = (reg_t)(cat6->ldgb.dst);
 
-		print_src(&dst);
-		printf(", ");
-		printf("g[%u]", cat6->ldgb.src_ssbo);
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u]", cat6->ldgb.src_ssbo);
 
 		return;
 	} else if (_OPC(6, cat6->opc) == OPC_LDGB) {
@@ -698,15 +704,15 @@
 		src2.im  = cat6->ldgb.src2_im;
 		dst.reg  = (reg_t)(cat6->ldgb.dst);
 
-		print_src(&dst);
-		printf(", ");
-		printf("g[%u], ", cat6->ldgb.src_ssbo);
-		print_src(&src1);
-		printf(", ");
-		print_src(&src2);
+		print_src(ctx, &dst);
+		fprintf(ctx->out, ", ");
+		fprintf(ctx->out, "g[%u], ", cat6->ldgb.src_ssbo);
+		print_src(ctx, &src1);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
 
 		if (debug & PRINT_VERBOSE)
-			printf(" (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
+			fprintf(ctx->out, " (pad0=%x, pad3=%x, mustbe0=%x)", cat6->ldgb.pad0, cat6->ldgb.pad3, cat6->ldgb.mustbe0);
 
 		return;
 	}
@@ -732,56 +738,56 @@
 
 	if (!nodst) {
 		if (sd)
-			printf("%c[", sd);
+			fprintf(ctx->out, "%c[", sd);
 		/* note: dst might actually be a src (ie. address to store to) */
-		print_src(&dst);
+		print_src(ctx, &dst);
 		if (dstoff)
-			printf("%+d", dstoff);
+			fprintf(ctx->out, "%+d", dstoff);
 		if (sd)
-			printf("]");
-		printf(", ");
+			fprintf(ctx->out, "]");
+		fprintf(ctx->out, ", ");
 	}
 
 	if (ss)
-		printf("%c[", ss);
+		fprintf(ctx->out, "%c[", ss);
 
 	/* can have a larger than normal immed, so hack: */
 	if (src1.im) {
-		printf("%u", src1.reg.dummy13);
+		fprintf(ctx->out, "%u", src1.reg.dummy13);
 	} else {
-		print_src(&src1);
+		print_src(ctx, &src1);
 	}
 
 	if (src1off)
-		printf("%+d", src1off);
+		fprintf(ctx->out, "%+d", src1off);
 	if (ss)
-		printf("]");
+		fprintf(ctx->out, "]");
 
 	switch (_OPC(6, cat6->opc)) {
 	case OPC_RESINFO:
 	case OPC_RESFMT:
 		break;
 	default:
-		printf(", ");
-		print_src(&src2);
+		fprintf(ctx->out, ", ");
+		print_src(ctx, &src2);
 		break;
 	}
 }
 
-static void print_instr_cat7(instr_t *instr)
+static void print_instr_cat7(struct disasm_ctx *ctx, instr_t *instr)
 {
 	instr_cat7_t *cat7 = &instr->cat7;
 
 	if (cat7->g)
-		printf(".g");
+		fprintf(ctx->out, ".g");
 	if (cat7->l)
-		printf(".l");
+		fprintf(ctx->out, ".l");
 
 	if (_OPC(7, cat7->opc) == OPC_FENCE) {
 		if (cat7->r)
-			printf(".r");
+			fprintf(ctx->out, ".r");
 		if (cat7->w)
-			printf(".w");
+			fprintf(ctx->out, ".w");
 	}
 }
 
@@ -792,7 +798,7 @@
 	uint16_t cat;
 	uint16_t opc;
 	const char *name;
-	void (*print)(instr_t *instr);
+	void (*print)(struct disasm_ctx *ctx, instr_t *instr);
 } opcs[1 << (3+NOPC_BITS)] = {
 #define OPC(cat, opc, name) [(opc)] = { (cat), (opc), #name, print_instr_cat##cat }
 	/* category 0: */
@@ -966,55 +972,62 @@
 	return opcs[instr->opc].name;
 }
 
-static void print_instr(uint32_t *dwords, int level, int n)
+static bool print_instr(struct disasm_ctx *ctx, uint32_t *dwords, int n)
 {
 	instr_t *instr = (instr_t *)dwords;
 	uint32_t opc = instr_opc(instr);
 	const char *name;
 
 	if (debug & PRINT_VERBOSE)
-		printf("%s%04d[%08xx_%08xx] ", levels[level], n, dwords[1], dwords[0]);
+		fprintf(ctx->out, "%s%04d[%08xx_%08xx] ", levels[ctx->level], n, dwords[1], dwords[0]);
 
 	/* NOTE: order flags are printed is a bit fugly.. but for now I
 	 * try to match the order in llvm-a3xx disassembler for easy
 	 * diff'ing..
 	 */
 
-	repeat = instr_repeat(instr);
+	ctx->repeat = instr_repeat(instr);
 
 	if (instr->sync)
-		printf("(sy)");
+		fprintf(ctx->out, "(sy)");
 	if (instr->ss && ((instr->opc_cat <= 4) || (instr->opc_cat == 7)))
-		printf("(ss)");
+		fprintf(ctx->out, "(ss)");
 	if (instr->jmp_tgt)
-		printf("(jp)");
+		fprintf(ctx->out, "(jp)");
 	if (instr_sat(instr))
-		printf("(sat)");
-	if (repeat)
-		printf("(rpt%d)", repeat);
+		fprintf(ctx->out, "(sat)");
+	if (ctx->repeat)
+		fprintf(ctx->out, "(rpt%d)", ctx->repeat);
 	if (instr->ul && ((2 <= instr->opc_cat) && (instr->opc_cat <= 4)))
-		printf("(ul)");
+		fprintf(ctx->out, "(ul)");
 
 	name = GETINFO(instr)->name;
 
 	if (name) {
-		printf("%s", name);
-		GETINFO(instr)->print(instr);
+		fprintf(ctx->out, "%s", name);
+		GETINFO(instr)->print(ctx, instr);
 	} else {
-		printf("unknown(%d,%d)", instr->opc_cat, opc);
+		fprintf(ctx->out, "unknown(%d,%d)", instr->opc_cat, opc);
 	}
 
-	printf("\n");
+	fprintf(ctx->out, "\n");
+
+	return (instr->opc_cat == 0) && (opc == OPC_END);
 }
 
-int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, enum shader_t type)
+int disasm_a3xx(uint32_t *dwords, int sizedwords, int level, FILE *out)
 {
+	struct disasm_ctx ctx;
 	int i;
 
 	assert((sizedwords % 2) == 0);
 
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.out = out;
+	ctx.level = level;
+
 	for (i = 0; i < sizedwords; i += 2)
-		print_instr(&dwords[i], level, i/2);
+		print_instr(&ctx, &dwords[i], i/2);
 
 	return 0;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index 904f88c..8ec64b6 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -27,6 +27,7 @@
 #define PACKED __attribute__((__packed__))
 
 #include <stdint.h>
+#include <stdbool.h>
 #include <assert.h>
 
 /* size of largest OPC field of all the instruction categories: */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index ff1d9d1..23d5006 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -46,6 +46,7 @@
 
 #include "compiler/glsl/standalone.h"
 #include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/glsl/gl_nir.h"
 #include "compiler/nir_types.h"
 #include "compiler/spirv/nir_spirv.h"
 
@@ -55,7 +56,7 @@
 	const char *type = ir3_shader_stage(so->shader);
 	bin = ir3_shader_assemble(so, so->shader->compiler->gpu_id);
 	debug_printf("; %s: %s\n", type, str);
-	ir3_shader_disasm(so, bin);
+	ir3_shader_disasm(so, bin, stdout);
 	free(bin);
 }
 
@@ -125,7 +126,6 @@
 
 	NIR_PASS_V(nir, nir_split_var_copies);
 	NIR_PASS_V(nir, nir_lower_var_copies);
-	NIR_PASS_V(nir, nir_lower_io_types);
 
 	switch (stage) {
 	case MESA_SHADER_VERTEX:
@@ -162,7 +162,7 @@
 
 	NIR_PASS_V(nir, nir_lower_system_values);
 	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size, 0);
-	NIR_PASS_V(nir, nir_lower_samplers, prog);
+	NIR_PASS_V(nir, gl_nir_lower_samplers, prog);
 
 	return nir;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 858f5ce..6eb1e03 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -71,6 +71,9 @@
 	/* For vertex shaders, keep track of the system values sources */
 	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
 
+	/* For fragment shaders: */
+	struct ir3_instruction *samp_id, *samp_mask_in;
+
 	/* Compute shader inputs: */
 	struct ir3_instruction *local_invocation_id, *work_group_id;
 
@@ -115,6 +118,9 @@
 	 */
 	bool unminify_coords;
 
+	/* on a3xx do txf_ms w/ isaml and scaled coords: */
+	bool txf_ms_with_isaml;
+
 	/* on a4xx, for array textures we need to add 0.5 to the array
 	 * index coordinate:
 	 */
@@ -123,6 +129,8 @@
 	/* on a4xx, bitmask of samplers which need astc+srgb workaround: */
 	unsigned astc_srgb;
 
+	unsigned samples;             /* bitmask of x,y sample shifts */
+
 	unsigned max_texture_index;
 
 	/* set if we encounter something we can't handle yet, so we
@@ -152,23 +160,31 @@
 		ctx->flat_bypass = true;
 		ctx->levels_add_one = false;
 		ctx->unminify_coords = false;
+		ctx->txf_ms_with_isaml = false;
 		ctx->array_index_add_half = true;
 
-		if (so->type == SHADER_VERTEX)
+		if (so->type == SHADER_VERTEX) {
 			ctx->astc_srgb = so->key.vastc_srgb;
-		else if (so->type == SHADER_FRAGMENT)
+		} else if (so->type == SHADER_FRAGMENT) {
 			ctx->astc_srgb = so->key.fastc_srgb;
+		}
 
 	} else {
 		/* no special handling for "flat" */
 		ctx->flat_bypass = false;
 		ctx->levels_add_one = true;
 		ctx->unminify_coords = true;
+		ctx->txf_ms_with_isaml = true;
 		ctx->array_index_add_half = false;
+
+		if (so->type == SHADER_VERTEX) {
+			ctx->samples = so->key.vsamples;
+		} else if (so->type == SHADER_FRAGMENT) {
+			ctx->samples = so->key.fsamples;
+		}
 	}
 
 	ctx->compiler = compiler;
-	ctx->ir = so->ir;
 	ctx->so = so;
 	ctx->def_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
@@ -204,6 +220,12 @@
 		nir_print_shader(ctx->s, stdout);
 	}
 
+	if (shader_debug_enabled(so->type)) {
+		fprintf(stderr, "NIR (final form) for %s shader:\n",
+			shader_stage_name(so->type));
+		nir_print_shader(ctx->s, stderr);
+	}
+
 	ir3_nir_scan_driver_consts(ctx->s, &so->const_layout);
 
 	so->num_uniforms = ctx->s->num_uniforms;
@@ -492,7 +514,10 @@
 
 	if (bit_size < 32) {
 		for (unsigned i = 0; i < ctx->last_dst_n; i++) {
-			ctx->last_dst[i]->regs[0]->flags |= IR3_REG_HALF;
+			struct ir3_instruction *dst = ctx->last_dst[i];
+			dst->regs[0]->flags |= IR3_REG_HALF;
+			if (ctx->last_dst[i]->opc == OPC_META_FO)
+				dst->regs[1]->instr->regs[0]->flags |= IR3_REG_HALF;
 		}
 	}
 
@@ -913,11 +938,138 @@
  * alu/sfu instructions:
  */
 
+/* Emit a cov (type conversion) instruction for a NIR conversion op.
+ * The source type is chosen from the op family (float / signed /
+ * unsigned) together with src_bitsize; the destination type is fully
+ * determined by the op.  Unhandled ops or sizes hit compile_error().
+ */
+static struct ir3_instruction *
+create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
+		unsigned src_bitsize, nir_op op)
+{
+	/* defaults keep both types defined should compile_error() return: */
+	type_t src_type = TYPE_U32, dst_type = TYPE_U32;
+
+	/* source type: */
+	switch (op) {
+	/* float sources: */
+	case nir_op_f2f32:
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+	case nir_op_f2i32:
+	case nir_op_f2i16:
+	case nir_op_f2i8:
+	case nir_op_f2u32:
+	case nir_op_f2u16:
+	case nir_op_f2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_F32;
+			break;
+		case 16:
+			src_type = TYPE_F16;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+	/* signed integer sources: */
+	case nir_op_i2f32:
+	case nir_op_i2f16:
+	case nir_op_i2i32:
+	case nir_op_i2i16:
+	case nir_op_i2i8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_S32;
+			break;
+		case 16:
+			src_type = TYPE_S16;
+			break;
+		case 8:
+			src_type = TYPE_S8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+	/* unsigned integer sources: */
+	case nir_op_u2f32:
+	case nir_op_u2f16:
+	case nir_op_u2u32:
+	case nir_op_u2u16:
+	case nir_op_u2u8:
+		switch (src_bitsize) {
+		case 32:
+			src_type = TYPE_U32;
+			break;
+		case 16:
+			src_type = TYPE_U16;
+			break;
+		case 8:
+			src_type = TYPE_U8;
+			break;
+		default:
+			compile_error(ctx, "invalid src bit size: %u", src_bitsize);
+		}
+		break;
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	/* destination type: */
+	switch (op) {
+	case nir_op_f2f32:
+	case nir_op_i2f32:
+	case nir_op_u2f32:
+		dst_type = TYPE_F32;
+		break;
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
+		/* TODO how to handle rounding mode? */
+	case nir_op_i2f16:
+	case nir_op_u2f16:
+		dst_type = TYPE_F16;
+		break;
+	case nir_op_f2i32:
+	case nir_op_i2i32:
+		dst_type = TYPE_S32;
+		break;
+	case nir_op_f2i16:
+	case nir_op_i2i16:
+		dst_type = TYPE_S16;
+		break;
+	case nir_op_f2i8:
+	case nir_op_i2i8:
+		dst_type = TYPE_S8;
+		break;
+	case nir_op_f2u32:
+	case nir_op_u2u32:
+		dst_type = TYPE_U32;
+		break;
+	case nir_op_f2u16:
+	case nir_op_u2u16:
+		dst_type = TYPE_U16;
+		break;
+	case nir_op_f2u8:
+	case nir_op_u2u8:
+		dst_type = TYPE_U8;
+		break;
+	default:
+		compile_error(ctx, "invalid conversion op: %u", op);
+	}
+
+	return ir3_COV(ctx->block, src, src_type, dst_type);
+}
+
 static void
 emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
 {
 	const nir_op_info *info = &nir_op_infos[alu->op];
 	struct ir3_instruction **dst, *src[info->num_inputs];
+	unsigned bs[info->num_inputs];     /* bit size */
 	struct ir3_block *b = ctx->block;
 	unsigned dst_sz, wrmask;
 
@@ -984,22 +1136,33 @@
 		compile_assert(ctx, !asrc->negate);
 
 		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
+		bs[i] = nir_src_bit_size(asrc->src);
 
 		compile_assert(ctx, src[i]);
 	}
 
 	switch (alu->op) {
+	case nir_op_f2f32:
+	case nir_op_f2f16_rtne:
+	case nir_op_f2f16_rtz:
+	case nir_op_f2f16:
 	case nir_op_f2i32:
-		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
-		break;
+	case nir_op_f2i16:
+	case nir_op_f2i8:
 	case nir_op_f2u32:
-		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
-		break;
+	case nir_op_f2u16:
+	case nir_op_f2u8:
 	case nir_op_i2f32:
-		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
-		break;
+	case nir_op_i2f16:
+	case nir_op_i2i32:
+	case nir_op_i2i16:
+	case nir_op_i2i8:
 	case nir_op_u2f32:
-		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
+	case nir_op_u2f16:
+	case nir_op_u2u32:
+	case nir_op_u2u16:
+	case nir_op_u2u8:
+		dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
 		break;
 	case nir_op_f2b:
 		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
@@ -1231,10 +1394,18 @@
 		dst[0] = ir3_n2b(b, dst[0]);
 		break;
 
-	case nir_op_bcsel:
-		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
+	case nir_op_bcsel: {
+		struct ir3_instruction *cond = ir3_b2n(b, src[0]);
+		compile_assert(ctx, bs[1] == bs[2]);
+		/* the boolean condition is 32b even if src[1] and src[2] are
+		 * half-precision, but sel.b16 wants all three src's to be the
+		 * same type.
+		 */
+		if (bs[1] < 32)
+			cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
+		dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
 		break;
-
+	}
 	case nir_op_bit_count:
 		dst[0] = ir3_CBITS_B(b, src[0], 0);
 		break;
@@ -1690,33 +1861,74 @@
  * logic if we supported images in anything other than FS..
  */
 static unsigned
-get_image_slot(struct ir3_context *ctx, const nir_variable *var)
+get_image_slot(struct ir3_context *ctx, nir_deref_instr *deref)
 {
+	unsigned int loc = 0;          /* slot offset accumulated over the deref chain */
+	unsigned inner_size = 1;       /* slots per element at the array level being visited */
+
+	while (deref->deref_type != nir_deref_type_var) {
+		assert(deref->deref_type == nir_deref_type_array);
+		nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+		assert(const_index);   /* array indices are expected to be constant here */
+
+		/* Step up to the parent deref, whose type is the array being indexed: */
+		deref = nir_deref_instr_parent(deref);
+
+		assert(glsl_type_is_array(deref->type));
+		const unsigned array_len = glsl_get_length(deref->type);
+		loc += MIN2(const_index->u32[0], array_len - 1) * inner_size;
+
+		/* An element of the next-outer array spans this many slots: */
+		inner_size *= array_len;
+	}
+
+	loc += deref->var->data.driver_location;
+
 	/* TODO figure out real limit per generation, and don't hardcode: */
 	const unsigned max_samplers = 16;
-	return max_samplers - var->data.driver_location - 1;
+	return max_samplers - loc - 1;
 }
 
+/* Returns # of coords (incl. array idx) and, via flagsp, IR3_INSTR_3D/A flags.
+ * See tex_info() for the equiv texture logic.. would be nice to unify these..
+ */
 static unsigned
-get_image_coords(const nir_variable *var)
+get_image_coords(const nir_variable *var, unsigned *flagsp)
 {
-	switch (glsl_get_sampler_dim(glsl_without_array(var->type))) {
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned coords, flags = 0;   /* flags: IR3_INSTR_* bits reported via flagsp */
+
+	switch (glsl_get_sampler_dim(type)) {
 	case GLSL_SAMPLER_DIM_1D:
 	case GLSL_SAMPLER_DIM_BUF:
-		return 1;
+		coords = 1;
 		break;
 	case GLSL_SAMPLER_DIM_2D:
 	case GLSL_SAMPLER_DIM_RECT:
 	case GLSL_SAMPLER_DIM_EXTERNAL:
 	case GLSL_SAMPLER_DIM_MS:
-		return 2;
+		coords = 2;
+		break;
 	case GLSL_SAMPLER_DIM_3D:
 	case GLSL_SAMPLER_DIM_CUBE:
-		return 3;
+		flags |= IR3_INSTR_3D;
+		coords = 3;
+		break;
 	default:
 		unreachable("bad sampler dim");
 		return 0;
 	}
+
+	if (glsl_sampler_type_is_array(type)) {
+		/* note: unlike tex_info(), adjust # of coords to include array idx: */
+		coords++;
+		flags |= IR3_INSTR_A;
+	}
+
+	if (flagsp)   /* flags are optional; callers may pass NULL */
+		*flagsp = flags;
+
+	return coords;
 }
 
 static type_t
@@ -1741,7 +1953,7 @@
 {
 	struct ir3_block *b = ctx->block;
 	struct ir3_instruction *offset;
-	unsigned ncoords = get_image_coords(var);
+	unsigned ncoords = get_image_coords(var, NULL);
 
 	/* to calculate the byte offset (yes, uggg) we need (up to) three
 	 * const values to know the bytes per pixel, and y and z stride:
@@ -1779,22 +1991,33 @@
 	}, 2);
 }
 
-/* src[] = { coord, sample_index }. const_index[] = {} */
+/* src[] = { deref, coord, sample_index }. const_index[] = {} */
 static void
 emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
 		struct ir3_instruction **dst)
 {
 	struct ir3_block *b = ctx->block;
-	const nir_variable *var = intr->variables[0]->var;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
 	struct ir3_instruction *sam;
-	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
-	unsigned ncoords = get_image_coords(var);
-	unsigned tex_idx = get_image_slot(ctx, var);
+	struct ir3_instruction * const *src0 = get_src(ctx, &intr->src[1]);
+	struct ir3_instruction *coords[4];
+	unsigned flags, ncoords = get_image_coords(var, &flags);
+	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
 	type_t type = get_image_type(var);
-	unsigned flags = 0;
 
-	if (ncoords == 3)
-		flags |= IR3_INSTR_3D;
+	/* hmm, this seems a bit odd, but it is what blob does and (at least
+	 * a5xx) just faults on bogus addresses otherwise:
+	 */
+	if (flags & IR3_INSTR_3D) {
+		flags &= ~IR3_INSTR_3D;
+		flags |= IR3_INSTR_A;
+	}
+
+	for (unsigned i = 0; i < ncoords; i++)
+		coords[i] = src0[i];
+
+	if (ncoords == 1)
+		coords[ncoords++] = create_immed(b, 0);
 
 	sam = ir3_SAM(b, OPC_ISAM, type, TGSI_WRITEMASK_XYZW, flags,
 			tex_idx, tex_idx, create_collect(ctx, coords, ncoords), NULL);
@@ -1805,17 +2028,17 @@
 	split_dest(b, dst, sam, 0, 4);
 }
 
-/* src[] = { coord, sample_index, value }. const_index[] = {} */
+/* src[] = { deref, coord, sample_index, value }. const_index[] = {} */
 static void
 emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
 	struct ir3_block *b = ctx->block;
-	const nir_variable *var = intr->variables[0]->var;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
 	struct ir3_instruction *stib, *offset;
-	struct ir3_instruction * const *value = get_src(ctx, &intr->src[2]);
-	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
-	unsigned ncoords = get_image_coords(var);
-	unsigned tex_idx = get_image_slot(ctx, var);
+	struct ir3_instruction * const *value = get_src(ctx, &intr->src[3]);
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+	unsigned ncoords = get_image_coords(var, NULL);
+	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
 
 	/* src0 is value
 	 * src1 is coords
@@ -1848,69 +2071,88 @@
 		struct ir3_instruction **dst)
 {
 	struct ir3_block *b = ctx->block;
-	const nir_variable *var = intr->variables[0]->var;
-	unsigned ncoords = get_image_coords(var);
-	unsigned tex_idx = get_image_slot(ctx, var);
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+	unsigned tex_idx = get_image_slot(ctx, nir_src_as_deref(intr->src[0]));
 	struct ir3_instruction *sam, *lod;
-	unsigned flags = 0;
-
-	if (ncoords == 3)
-		flags = IR3_INSTR_3D;
+	unsigned flags, ncoords = get_image_coords(var, &flags);
 
 	lod = create_immed(b, 0);
 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
 			tex_idx, tex_idx, lod, NULL);
 
-	split_dest(b, dst, sam, 0, ncoords);
+	/* Array size actually ends up in .w rather than .z. This doesn't
+	 * matter for miplevel 0, but for higher mips the value in z is
+	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
+	 * returned, which means that we have to add 1 to it for arrays for
+	 * a3xx.
+	 *
+	 * Note use a temporary dst and then copy, since the size of the dst
+	 * array that is passed in is based on nir's understanding of the
+	 * result size, not the hardware's
+	 */
+	struct ir3_instruction *tmp[4];
+
+	split_dest(b, tmp, sam, 0, 4);
+
+	for (unsigned i = 0; i < ncoords; i++)
+		dst[i] = tmp[i];
+
+	if (flags & IR3_INSTR_A) {
+		if (ctx->levels_add_one) {
+			dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
+		} else {
+			dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
+		}
+	}
 }
 
-/* src[] = { coord, sample_index, value, compare }. const_index[] = {} */
+/* src[] = { deref, coord, sample_index, value, compare }. const_index[] = {} */
 static struct ir3_instruction *
 emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
 {
 	struct ir3_block *b = ctx->block;
-	const nir_variable *var = intr->variables[0]->var;
+	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
 	struct ir3_instruction *atomic, *image, *src0, *src1, *src2;
-	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[0]);
-	unsigned ncoords = get_image_coords(var);
+	struct ir3_instruction * const *coords = get_src(ctx, &intr->src[1]);
+	unsigned ncoords = get_image_coords(var, NULL);
 
-	image = create_immed(b, get_image_slot(ctx, var));
+	image = create_immed(b, get_image_slot(ctx, nir_src_as_deref(intr->src[0])));
 
 	/* src0 is value (or uvec2(value, compare))
 	 * src1 is coords
 	 * src2 is 64b byte offset
 	 */
-	src0 = get_src(ctx, &intr->src[2])[0];
+	src0 = get_src(ctx, &intr->src[3])[0];
 	src1 = create_collect(ctx, coords, ncoords);
 	src2 = get_image_offset(ctx, var, coords, false);
 
 	switch (intr->intrinsic) {
-	case nir_intrinsic_image_var_atomic_add:
+	case nir_intrinsic_image_deref_atomic_add:
 		atomic = ir3_ATOMIC_ADD_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_min:
+	case nir_intrinsic_image_deref_atomic_min:
 		atomic = ir3_ATOMIC_MIN_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_max:
+	case nir_intrinsic_image_deref_atomic_max:
 		atomic = ir3_ATOMIC_MAX_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_and:
+	case nir_intrinsic_image_deref_atomic_and:
 		atomic = ir3_ATOMIC_AND_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_or:
+	case nir_intrinsic_image_deref_atomic_or:
 		atomic = ir3_ATOMIC_OR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_xor:
 		atomic = ir3_ATOMIC_XOR_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_exchange:
 		atomic = ir3_ATOMIC_XCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
-	case nir_intrinsic_image_var_atomic_comp_swap:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
 		/* for cmpxchg, src0 is [ui]vec2(data, compare): */
 		src0 = create_collect(ctx, (struct ir3_instruction*[]){
 			src0,
-			get_src(ctx, &intr->src[3])[0],
+			get_src(ctx, &intr->src[4])[0],
 		}, 2);
 		atomic = ir3_ATOMIC_CMPXCHG_G(b, image, 0, src0, 0, src1, 0, src2, 0);
 		break;
@@ -2139,23 +2381,23 @@
 	case nir_intrinsic_shared_atomic_comp_swap:
 		dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
 		break;
-	case nir_intrinsic_image_var_load:
+	case nir_intrinsic_image_deref_load:
 		emit_intrinsic_load_image(ctx, intr, dst);
 		break;
-	case nir_intrinsic_image_var_store:
+	case nir_intrinsic_image_deref_store:
 		emit_intrinsic_store_image(ctx, intr);
 		break;
-	case nir_intrinsic_image_var_size:
+	case nir_intrinsic_image_deref_size:
 		emit_intrinsic_image_size(ctx, intr, dst);
 		break;
-	case nir_intrinsic_image_var_atomic_add:
-	case nir_intrinsic_image_var_atomic_min:
-	case nir_intrinsic_image_var_atomic_max:
-	case nir_intrinsic_image_var_atomic_and:
-	case nir_intrinsic_image_var_atomic_or:
-	case nir_intrinsic_image_var_atomic_xor:
-	case nir_intrinsic_image_var_atomic_exchange:
-	case nir_intrinsic_image_var_atomic_comp_swap:
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
 		dst[0] = emit_intrinsic_atomic_image(ctx, intr);
 		break;
 	case nir_intrinsic_barrier:
@@ -2208,6 +2450,24 @@
 		}
 		dst[0] = ctx->instance_id;
 		break;
+	case nir_intrinsic_load_sample_id:
+	case nir_intrinsic_load_sample_id_no_per_sample:
+		if (!ctx->samp_id) {
+			ctx->samp_id = create_input(b, 0);
+			ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
+			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
+					ctx->samp_id);
+		}
+		dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
+		break;
+	case nir_intrinsic_load_sample_mask_in:
+		if (!ctx->samp_mask_in) {
+			ctx->samp_mask_in = create_input(b, 0);
+			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
+					ctx->samp_mask_in);
+		}
+		dst[0] = ctx->samp_mask_in;
+		break;
 	case nir_intrinsic_load_user_clip_plane:
 		idx = nir_intrinsic_ucp_id(intr);
 		for (int i = 0; i < intr->num_components; i++) {
@@ -2366,7 +2626,7 @@
 	struct ir3_block *b = ctx->block;
 	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
 	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
-	struct ir3_instruction *lod, *compare, *proj;
+	struct ir3_instruction *lod, *compare, *proj, *sample_index;
 	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
 	unsigned i, coords, flags;
 	unsigned nsrc0 = 0, nsrc1 = 0;
@@ -2374,7 +2634,7 @@
 	opc_t opc = 0;
 
 	coord = off = ddx = ddy = NULL;
-	lod = proj = compare = NULL;
+	lod = proj = compare = sample_index = NULL;
 
 	/* TODO: might just be one component for gathers? */
 	dst = get_dst(ctx, &tex->dest, 4);
@@ -2409,6 +2669,9 @@
 		case nir_tex_src_ddy:
 			ddy = get_src(ctx, &tex->src[i].src);
 			break;
+		case nir_tex_src_ms_index:
+			sample_index = get_src(ctx, &tex->src[i].src)[0];
+			break;
 		default:
 			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
 					tex->src[i].src_type);
@@ -2417,7 +2680,7 @@
 	}
 
 	switch (tex->op) {
-	case nir_texop_tex:      opc = OPC_SAM;      break;
+	case nir_texop_tex:      opc = has_lod ? OPC_SAML : OPC_SAM; break;
 	case nir_texop_txb:      opc = OPC_SAMB;     break;
 	case nir_texop_txl:      opc = OPC_SAML;     break;
 	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
@@ -2435,7 +2698,7 @@
 		case 3:              opc = OPC_GATHER4A; break;
 		}
 		break;
-	case nir_texop_txf_ms:
+	case nir_texop_txf_ms:   opc = OPC_ISAMM;    break;
 	case nir_texop_txs:
 	case nir_texop_query_levels:
 	case nir_texop_texture_samples:
@@ -2464,6 +2727,27 @@
 
 	nsrc0 = i;
 
+	/* NOTE a3xx (and possibly a4xx?) might be different, using isaml
+	 * with scaled x coord according to requested sample:
+	 */
+	if (tex->op == nir_texop_txf_ms) {
+		if (ctx->txf_ms_with_isaml) {
+			/* the samples are laid out in x dimension as
+			 *     0 1 2 3
+			 * x_ms = (x << ms) + sample_index;
+			 */
+			struct ir3_instruction *ms;
+			ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);
+
+			src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
+			src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);
+
+			opc = OPC_ISAML;
+		} else {
+			src0[nsrc0++] = sample_index;
+		}
+	}
+
 	/* scale up integer coords for TXF based on the LOD */
 	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
 		assert(has_lod);
@@ -2690,6 +2974,9 @@
 	case nir_instr_type_alu:
 		emit_alu(ctx, nir_instr_as_alu(instr));
 		break;
+	case nir_instr_type_deref:
+		/* ignored, handled as part of the intrinsic they are src to */
+		break;
 	case nir_instr_type_intrinsic:
 		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
 		break;
@@ -3150,6 +3437,7 @@
 }
 
 static const unsigned max_sysvals[SHADER_MAX] = {
+	[SHADER_FRAGMENT] = 8, // NOTE(review): presumably for sample id / sample mask in; confirm limit
 	[SHADER_VERTEX]  = 16,
 	[SHADER_COMPUTE] = 16, // TODO how many do we actually need?
 };
@@ -3490,7 +3778,7 @@
 			so->varying_in++;
 			so->inputs[i].compmask = (1 << maxcomp) - 1;
 			inloc += maxcomp;
-		} else if (!so->inputs[i].sysval){
+		} else if (!so->inputs[i].sysval) {
 			so->inputs[i].compmask = compmask;
 		}
 		so->inputs[i].regid = regid;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index e3a3a9d..0ee8ea2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -543,6 +543,10 @@
 		if (reg->flags & IR3_REG_ARRAY)
 			continue;
 
+		/* Don't CP absneg into meta instructions, that won't end well: */
+		if (is_meta(instr) && (src->opc != OPC_MOV))
+			continue;
+
 		reg_cp(ctx, instr, reg, n);
 	}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index fecb89f..c859034 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -263,6 +263,10 @@
 			struct ir3_instruction *instr = block->keeps[i];
 			instr_find_neighbors(instr);
 		}
+
+		/* We also need to account for if-condition: */
+		if (block->condition)
+			instr_find_neighbors(block->condition);
 	}
 }
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index cd1f9c5..db1d74f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -35,6 +35,7 @@
 
 #include "nir/tgsi_to_nir.h"
 
+
 static const nir_shader_compiler_options options = {
 		.lower_fpow = true,
 		.lower_scmp = true,
@@ -51,6 +52,7 @@
 		.lower_extract_byte = true,
 		.lower_extract_word = true,
 		.lower_all_io_to_temps = true,
+		.lower_helper_invocation = true,
 };
 
 struct nir_shader *
@@ -204,6 +206,8 @@
 
 	OPT_V(s, nir_remove_dead_variables, nir_var_local);
 
+	OPT_V(s, nir_move_load_const);
+
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		debug_printf("----------------------\n");
 		nir_print_shader(s, stdout);
@@ -242,12 +246,20 @@
 						layout->ssbo_size.count;
 					layout->ssbo_size.count += 1; /* one const per */
 					break;
-				case nir_intrinsic_image_var_store:
-					idx = intr->variables[0]->var->data.driver_location;
+				case nir_intrinsic_image_deref_atomic_add:
+				case nir_intrinsic_image_deref_atomic_min:
+				case nir_intrinsic_image_deref_atomic_max:
+				case nir_intrinsic_image_deref_atomic_and:
+				case nir_intrinsic_image_deref_atomic_or:
+				case nir_intrinsic_image_deref_atomic_xor:
+				case nir_intrinsic_image_deref_atomic_exchange:
+				case nir_intrinsic_image_deref_atomic_comp_swap:
+				case nir_intrinsic_image_deref_store:
+					idx = nir_intrinsic_get_var(intr, 0)->data.driver_location;
 					if (layout->image_dims.mask & (1 << idx))
 						break;
 					layout->image_dims.mask |= (1 << idx);
-					layout->ssbo_size.off[idx] =
+					layout->image_dims.off[idx] =
 						layout->image_dims.count;
 					layout->image_dims.count += 3; /* three const per */
 					break;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
index de304bf..37a3dcb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_tg4_to_tex.c
@@ -40,7 +40,7 @@
 		if (instr->type != nir_instr_type_tex)
 			continue;
 
-        nir_tex_instr *tg4 = (nir_tex_instr *)instr;
+		nir_tex_instr *tg4 = (nir_tex_instr *)instr;
 
 		if (tg4->op != nir_texop_tg4)
 			continue;
@@ -62,9 +62,7 @@
 			tex->is_shadow = tg4->is_shadow;
 			tex->is_new_style_shadow = tg4->is_new_style_shadow;
 			tex->texture_index = tg4->texture_index;
-			tex->texture = nir_deref_var_clone(tg4->texture, tex);
 			tex->sampler_index = tg4->sampler_index;
-			tex->sampler = nir_deref_var_clone(tg4->sampler, tex);
 			tex->dest_type = tg4->dest_type;
 
 			for (int j = 0; j < tg4->num_srcs; j++) {
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py b/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
index a0ab9d0..3968aea 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_trig.py
@@ -20,6 +20,8 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
+from __future__ import print_function
+
 import argparse
 import sys
 
@@ -40,9 +42,9 @@
 def run():
     import nir_algebraic  # pylint: disable=import-error
 
-    print '#include "ir3_nir.h"'
-    print nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
-                                      trig_workarounds).render()
+    print('#include "ir3_nir.h"')  # the generated C source is written to stdout
+    print(nir_algebraic.AlgebraicPass("ir3_nir_apply_trig_workarounds",
+                                      trig_workarounds).render())
 
 
 if __name__ == '__main__':
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 8ed7f56..b0663d5 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -157,9 +157,17 @@
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		struct ir3_shader_key key = v->key;
-		DBG("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
+		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}", v->type,
 			key.binning_pass, key.color_two_side, key.half_precision);
-		ir3_shader_disasm(v, bin);
+		ir3_shader_disasm(v, bin, stdout);
+	}
+
+	if (shader_debug_enabled(v->shader->type)) {
+		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
+			shader_stage_name(v->shader->type), v->shader->nir->info.name);
+		if (v->shader->type == SHADER_FRAGMENT)
+			fprintf(stderr, "SIMD0\n");
+		ir3_shader_disasm(v, bin, stderr);
 	}
 
 	free(bin);
@@ -248,6 +256,7 @@
 			key.vsaturate_t = 0;
 			key.vsaturate_r = 0;
 			key.vastc_srgb = 0;
+			key.vsamples = 0;
 		}
 		break;
 	case SHADER_VERTEX:
@@ -259,6 +268,7 @@
 			key.fsaturate_t = 0;
 			key.fsaturate_r = 0;
 			key.fastc_srgb = 0;
+			key.fsamples = 0;
 		}
 		break;
 	default:
@@ -373,29 +383,29 @@
 	/* do first pass optimization, ignoring the key: */
 	shader->nir = ir3_optimize_nir(shader, nir, NULL);
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump nir%d: type=%d", shader->id, shader->type);
+		printf("dump nir%d: type=%d\n", shader->id, shader->type);
 		nir_print_shader(shader->nir, stdout);
 	}
 
 	return shader;
 }
 
-static void dump_reg(const char *name, uint32_t r)
+static void dump_reg(FILE *out, const char *name, uint32_t r)	/* no output for regid(63,0) */
 {
 	if (r != regid(63,0))
-		debug_printf("; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
+		fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);	/* r>>2 = register, r&3 = component */
 }
 
-static void dump_output(struct ir3_shader_variant *so,
+static void dump_output(FILE *out, struct ir3_shader_variant *so,
 		unsigned slot, const char *name)
 {
 	uint32_t regid;
 	regid = ir3_find_output_regid(so, slot);
-	dump_reg(name, regid);
+	dump_reg(out, name, regid);	/* prints nothing when the lookup yields regid(63,0) */
 }
 
 void
-ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
+ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
 {
 	struct ir3 *ir = so->ir;
 	struct ir3_register *reg;
@@ -405,19 +415,19 @@
 
 	for (i = 0; i < ir->ninputs; i++) {
 		if (!ir->inputs[i]) {
-			debug_printf("; in%d unused\n", i);
+			fprintf(out, "; in%d unused\n", i);
 			continue;
 		}
 		reg = ir->inputs[i]->regs[0];
 		regid = reg->num;
-		debug_printf("@in(%sr%d.%c)\tin%d\n",
+		fprintf(out, "@in(%sr%d.%c)\tin%d\n",
 				(reg->flags & IR3_REG_HALF) ? "h" : "",
 				(regid >> 2), "xyzw"[regid & 0x3], i);
 	}
 
 	for (i = 0; i < ir->noutputs; i++) {
 		if (!ir->outputs[i]) {
-			debug_printf("; out%d unused\n", i);
+			fprintf(out, "; out%d unused\n", i);
 			continue;
 		}
 		/* kill shows up as a virtual output.. skip it! */
@@ -425,63 +435,63 @@
 			continue;
 		reg = ir->outputs[i]->regs[0];
 		regid = reg->num;
-		debug_printf("@out(%sr%d.%c)\tout%d\n",
+		fprintf(out, "@out(%sr%d.%c)\tout%d\n",
 				(reg->flags & IR3_REG_HALF) ? "h" : "",
 				(regid >> 2), "xyzw"[regid & 0x3], i);
 	}
 
 	for (i = 0; i < so->immediates_count; i++) {
-		debug_printf("@const(c%d.x)\t", so->constbase.immediate + i);
-		debug_printf("0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
+		fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
+		fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
 				so->immediates[i].val[0],
 				so->immediates[i].val[1],
 				so->immediates[i].val[2],
 				so->immediates[i].val[3]);
 	}
 
-	disasm_a3xx(bin, so->info.sizedwords, 0, so->type);
+	disasm_a3xx(bin, so->info.sizedwords, 0, out);
 
 	switch (so->type) {
 	case SHADER_VERTEX:
-		debug_printf("; %s: outputs:", type);
+		fprintf(out, "; %s: outputs:", type);
 		for (i = 0; i < so->outputs_count; i++) {
 			uint8_t regid = so->outputs[i].regid;
-			debug_printf(" r%d.%c (%s)",
+			fprintf(out, " r%d.%c (%s)",
 					(regid >> 2), "xyzw"[regid & 0x3],
 					gl_varying_slot_name(so->outputs[i].slot));
 		}
-		debug_printf("\n");
-		debug_printf("; %s: inputs:", type);
+		fprintf(out, "\n");
+		fprintf(out, "; %s: inputs:", type);
 		for (i = 0; i < so->inputs_count; i++) {
 			uint8_t regid = so->inputs[i].regid;
-			debug_printf(" r%d.%c (cm=%x,il=%u,b=%u)",
+			fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
 					(regid >> 2), "xyzw"[regid & 0x3],
 					so->inputs[i].compmask,
 					so->inputs[i].inloc,
 					so->inputs[i].bary);
 		}
-		debug_printf("\n");
+		fprintf(out, "\n");
 		break;
 	case SHADER_FRAGMENT:
-		debug_printf("; %s: outputs:", type);
+		fprintf(out, "; %s: outputs:", type);
 		for (i = 0; i < so->outputs_count; i++) {
 			uint8_t regid = so->outputs[i].regid;
-			debug_printf(" r%d.%c (%s)",
+			fprintf(out, " r%d.%c (%s)",
 					(regid >> 2), "xyzw"[regid & 0x3],
 					gl_frag_result_name(so->outputs[i].slot));
 		}
-		debug_printf("\n");
-		debug_printf("; %s: inputs:", type);
+		fprintf(out, "\n");
+		fprintf(out, "; %s: inputs:", type);
 		for (i = 0; i < so->inputs_count; i++) {
 			uint8_t regid = so->inputs[i].regid;
-			debug_printf(" r%d.%c (%s,cm=%x,il=%u,b=%u)",
+			fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
 					(regid >> 2), "xyzw"[regid & 0x3],
 					gl_varying_slot_name(so->inputs[i].slot),
 					so->inputs[i].compmask,
 					so->inputs[i].inloc,
 					so->inputs[i].bary);
 		}
-		debug_printf("\n");
+		fprintf(out, "\n");
 		break;
 	default:
 		/* TODO */
@@ -489,53 +499,53 @@
 	}
 
 	/* print generic shader info: */
-	debug_printf("; %s prog %d/%d: %u instructions, %d half, %d full\n",
+	fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
 			type, so->shader->id, so->id,
 			so->info.instrs_count,
 			so->info.max_half_reg + 1,
 			so->info.max_reg + 1);
 
-	debug_printf("; %d const, %u constlen\n",
+	fprintf(out, "; %d const, %u constlen\n",
 			so->info.max_const + 1,
 			so->constlen);
 
-	debug_printf("; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
+	fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);
 
 	/* print shader type specific info: */
 	switch (so->type) {
 	case SHADER_VERTEX:
-		dump_output(so, VARYING_SLOT_POS, "pos");
-		dump_output(so, VARYING_SLOT_PSIZ, "psize");
+		dump_output(out, so, VARYING_SLOT_POS, "pos");
+		dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
 		break;
 	case SHADER_FRAGMENT:
-		dump_reg("pos (bary)", so->pos_regid);
-		dump_output(so, FRAG_RESULT_DEPTH, "posz");
+		dump_reg(out, "pos (bary)", so->pos_regid);
+		dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
 		if (so->color0_mrt) {
-			dump_output(so, FRAG_RESULT_COLOR, "color");
+			dump_output(out, so, FRAG_RESULT_COLOR, "color");
 		} else {
-			dump_output(so, FRAG_RESULT_DATA0, "data0");
-			dump_output(so, FRAG_RESULT_DATA1, "data1");
-			dump_output(so, FRAG_RESULT_DATA2, "data2");
-			dump_output(so, FRAG_RESULT_DATA3, "data3");
-			dump_output(so, FRAG_RESULT_DATA4, "data4");
-			dump_output(so, FRAG_RESULT_DATA5, "data5");
-			dump_output(so, FRAG_RESULT_DATA6, "data6");
-			dump_output(so, FRAG_RESULT_DATA7, "data7");
+			dump_output(out, so, FRAG_RESULT_DATA0, "data0");
+			dump_output(out, so, FRAG_RESULT_DATA1, "data1");
+			dump_output(out, so, FRAG_RESULT_DATA2, "data2");
+			dump_output(out, so, FRAG_RESULT_DATA3, "data3");
+			dump_output(out, so, FRAG_RESULT_DATA4, "data4");
+			dump_output(out, so, FRAG_RESULT_DATA5, "data5");
+			dump_output(out, so, FRAG_RESULT_DATA6, "data6");
+			dump_output(out, so, FRAG_RESULT_DATA7, "data7");
 		}
 		/* these two are hard-coded since we don't know how to
 		 * program them to anything but all 0's...
 		 */
 		if (so->frag_coord)
-			debug_printf("; fragcoord: r0.x\n");
+			fprintf(out, "; fragcoord: r0.x\n");
 		if (so->frag_face)
-			debug_printf("; fragface: hr0.x\n");
+			fprintf(out, "; fragface: hr0.x\n");
 		break;
 	default:
 		/* TODO */
 		break;
 	}
 
-	debug_printf("\n");
+	fprintf(out, "\n");
 }
 
 uint64_t
@@ -559,10 +569,8 @@
 		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
 {
 	const unsigned index = 0;     /* user consts are index 0 */
-	/* TODO save/restore dirty_mask for binning pass instead: */
-	uint32_t dirty_mask = constbuf->enabled_mask;
 
-	if (dirty_mask & (1 << index)) {
+	if (constbuf->enabled_mask & (1 << index)) {
 		struct pipe_constant_buffer *cb = &constbuf->cb[index];
 		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */
 
@@ -587,7 +595,6 @@
 			ctx->emit_const(ring, v->type, 0,
 					cb->buffer_offset, size,
 					cb->user_buffer, cb->buffer);
-			constbuf->dirty_mask &= ~(1 << index);
 		}
 	}
 }
@@ -660,11 +667,21 @@
 			img = &si->si[index];
 			rsc = fd_resource(img->resource);
 
-			dims[off + 0] = rsc->cpp;
+			dims[off + 0] = util_format_get_blocksize(img->format);
 			if (img->resource->target != PIPE_BUFFER) {
 				unsigned lvl = img->u.tex.level;
+				/* note for 2d/cube/etc images, even if re-interpreted
+				 * as a different color format, the pixel size should
+				 * be the same, so use original dimensions for y and z
+				 * stride:
+				 */
 				dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
-				dims[off + 2] = rsc->slices[lvl].size0;
+				/* see corresponding logic in fd_resource_offset(): */
+				if (rsc->layer_first) {
+					dims[off + 2] = rsc->layer_size;
+				} else {
+					dims[off + 2] = rsc->slices[lvl].size0;
+				}
 			}
 		}
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index 272368c..cff0206 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -143,6 +143,9 @@
 	 */
 	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
 
+	/* bitmask of ms shifts */
+	uint32_t vsamples, fsamples;
+
 	/* bitmask of samplers which need astc srgb workaround: */
 	uint16_t vastc_srgb, fastc_srgb;
 };
@@ -164,6 +167,7 @@
 		if ((last_key->fsaturate_s != key->fsaturate_s) ||
 				(last_key->fsaturate_t != key->fsaturate_t) ||
 				(last_key->fsaturate_r != key->fsaturate_r) ||
+				(last_key->fsamples != key->fsamples) ||
 				(last_key->fastc_srgb != key->fastc_srgb))
 			return true;
 	}
@@ -194,6 +198,7 @@
 		if ((last_key->vsaturate_s != key->vsaturate_s) ||
 				(last_key->vsaturate_t != key->vsaturate_t) ||
 				(last_key->vsaturate_r != key->vsaturate_r) ||
+				(last_key->vsamples != key->vsamples) ||
 				(last_key->vastc_srgb != key->vastc_srgb))
 			return true;
 	}
@@ -376,7 +381,7 @@
 void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key, struct pipe_debug_callback *debug);
-void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
+void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out);
 uint64_t ir3_shader_outputs(const struct ir3_shader *so);
 
 struct fd_ringbuffer;
@@ -522,4 +527,13 @@
 	return regid(63, 0);
 }
 
+/* calculate register footprint in terms of half-regs (ie. one full
+ * reg counts as two half-regs).
+ */
+static inline uint32_t
+ir3_shader_halfregs(const struct ir3_shader_variant *v)
+{
+	return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1);
+}
+
 #endif /* IR3_SHADER_H_ */
diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build
index ba1a21c..5cc1975 100644
--- a/src/gallium/drivers/freedreno/meson.build
+++ b/src/gallium/drivers/freedreno/meson.build
@@ -48,6 +48,7 @@
   'freedreno_fence.h',
   'freedreno_gmem.c',
   'freedreno_gmem.h',
+  'freedreno_perfcntr.h',
   'freedreno_program.c',
   'freedreno_program.h',
   'freedreno_query.c',
@@ -168,6 +169,7 @@
   'a5xx/fd5_gmem.h',
   'a5xx/fd5_image.c',
   'a5xx/fd5_image.h',
+  'a5xx/fd5_perfcntr.c',
   'a5xx/fd5_program.c',
   'a5xx/fd5_program.h',
   'a5xx/fd5_query.c',
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 59d2ec6..dda7c5b 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -283,6 +283,12 @@
    case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
    case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
    case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -322,11 +328,13 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
 
    case PIPE_CAP_MAX_VIEWPORTS:
@@ -336,6 +344,7 @@
       return 64;
 
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
       return 120;
 
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -435,6 +444,13 @@
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 16.0;
 
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+      /* fall-through */
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+      /* fall-through */
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
+
    default:
       debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
       return 0;
@@ -446,6 +462,7 @@
                          enum pipe_format format,
                          enum pipe_texture_target target,
                          unsigned sample_count,
+                         unsigned storage_sample_count,
                          unsigned tex_usage)
 {
    static const enum pipe_format tex_supported[] = {
@@ -498,12 +515,12 @@
    const enum pipe_format *list;
    uint i;
 
-   if (!util_format_is_supported(format, tex_usage))
-      return FALSE;
-
    if (sample_count > 1)
       return FALSE;
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    if(tex_usage & PIPE_BIND_DEPTH_STENCIL)
       list = depth_supported;
    else if (tex_usage & PIPE_BIND_RENDER_TARGET)
diff --git a/src/gallium/drivers/i915/i915_screen.h b/src/gallium/drivers/i915/i915_screen.h
index 3be941a..c58055a 100644
--- a/src/gallium/drivers/i915/i915_screen.h
+++ b/src/gallium/drivers/i915/i915_screen.h
@@ -70,6 +70,7 @@
                          enum pipe_format format,
                          enum pipe_texture_target target,
                          unsigned sample_count,
+                         unsigned storage_sample_count,
                          unsigned tex_usage);
 
 #endif /* I915_SCREEN_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 54d98fd..7a2f253 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -136,7 +136,6 @@
    struct blitter_context *blitter;
 
    unsigned tex_timestamp;
-   boolean no_rast;
 
    /** List of all fragment shader variants */
    struct lp_fs_variant_list_item fs_variants_list;
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index a2762f3..e2309f4 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -212,6 +212,7 @@
       elem_types[LP_JIT_THREAD_DATA_CACHE] =
             LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
       elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
+      elem_types[LP_JIT_THREAD_DATA_INVOCATIONS] = LLVMInt64TypeInContext(lc);
       elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
             LLVMInt32TypeInContext(lc);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 9db26f2..312d1a1 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -192,6 +192,7 @@
 {
    struct lp_build_format_cache *cache;
    uint64_t vis_counter;
+   uint64_t ps_invocations;
 
    /*
     * Non-interpolated rasterizer state passed through to the fragment shader.
@@ -205,6 +206,7 @@
 enum {
    LP_JIT_THREAD_DATA_CACHE = 0,
    LP_JIT_THREAD_DATA_COUNTER,
+   LP_JIT_THREAD_DATA_INVOCATIONS,
    LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
    LP_JIT_THREAD_DATA_COUNT
 };
@@ -216,6 +218,9 @@
 #define lp_jit_thread_data_counter(_gallivm, _ptr) \
    lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter")
 
+#define lp_jit_thread_data_invocations(_gallivm, _ptr) \
+   lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_INVOCATIONS, "invocs")
+
 #define lp_jit_thread_data_raster_state_viewport_index(_gallivm, _ptr) \
    lp_build_struct_get(_gallivm, _ptr, \
                        LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, \
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index 939944a..9d4f9f8 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -107,7 +107,7 @@
                     task->scene->fb.height - y * TILE_SIZE : TILE_SIZE;
 
    task->thread_data.vis_counter = 0;
-   task->ps_invocations = 0;
+   task->thread_data.ps_invocations = 0;
 
    for (i = 0; i < task->scene->fb.nr_cbufs; i++) {
       if (task->scene->fb.cbufs[i]) {
@@ -446,10 +446,6 @@
     * allocated 4x4 blocks hence need to filter them out here.
     */
    if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
-      /* not very accurate would need a popcount on the mask */
-      /* always count this not worth bothering? */
-      task->ps_invocations += 1 * variant->ps_inv_multiplier;
-
       /* Propagate non-interpolated raster state. */
       task->thread_data.raster_state.viewport_index = inputs->viewport_index;
 
@@ -491,7 +487,7 @@
       pq->start[task->thread_index] = task->thread_data.vis_counter;
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
-      pq->start[task->thread_index] = task->ps_invocations;
+      pq->start[task->thread_index] = task->thread_data.ps_invocations;
       break;
    default:
       assert(0);
@@ -524,7 +520,7 @@
       break;
    case PIPE_QUERY_PIPELINE_STATISTICS:
       pq->end[task->thread_index] +=
-         task->ps_invocations - pq->start[task->thread_index];
+         task->thread_data.ps_invocations - pq->start[task->thread_index];
       pq->start[task->thread_index] = 0;
       break;
    default:
@@ -679,7 +675,7 @@
 #endif
 #endif
 
-   if (!task->rast->no_rast && !scene->discard) {
+   if (!task->rast->no_rast) {
       /* loop over scene bins, rasterize each */
       {
          struct cmd_bin *bin;
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index fe078d5..59d3a2d 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -99,8 +99,6 @@
 
    /** Non-interpolated passthru state and occlude counter for visible pixels */
    struct lp_jit_thread_data thread_data;
-   uint64_t ps_invocations;
-   uint8_t ps_inv_multiplier;
 
    pipe_semaphore work_ready;
    pipe_semaphore work_done;
@@ -259,10 +257,6 @@
     * allocated 4x4 blocks hence need to filter them out here.
     */
    if ((x % TILE_SIZE) < task->width && (y % TILE_SIZE) < task->height) {
-      /* not very accurate would need a popcount on the mask */
-      /* always count this not worth bothering? */
-      task->ps_invocations += 1 * variant->ps_inv_multiplier;
-
       /* Propagate non-interpolated raster state. */
       task->thread_data.raster_state.viewport_index = inputs->viewport_index;
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index dfad9fa..ef0136c 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -507,15 +507,14 @@
 }
 
 
-void lp_scene_begin_binning( struct lp_scene *scene,
-                             struct pipe_framebuffer_state *fb, boolean discard )
+void lp_scene_begin_binning(struct lp_scene *scene,
+                            struct pipe_framebuffer_state *fb)
 {
    int i;
    unsigned max_layer = ~0;
 
    assert(lp_scene_is_empty(scene));
 
-   scene->discard = discard;
    util_copy_framebuffer_state(&scene->fb, fb);
 
    scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index da29057..b4ed881 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -166,7 +166,6 @@
    unsigned resource_reference_size;
 
    boolean alloc_failed;
-   boolean discard;
    /**
     * Number of active tiles in each dimension.
     * This basically the framebuffer size divided by tile size
@@ -389,12 +388,11 @@
 /* Begin/end binning of a scene
  */
 void
-lp_scene_begin_binning( struct lp_scene *scene,
-                        struct pipe_framebuffer_state *fb,
-                        boolean discard );
+lp_scene_begin_binning(struct lp_scene *scene,
+                       struct pipe_framebuffer_state *fb);
 
 void
-lp_scene_end_binning( struct lp_scene *scene );
+lp_scene_end_binning(struct lp_scene *scene);
 
 
 /* Begin/end rasterization of a scene
@@ -403,7 +401,7 @@
 lp_scene_begin_rasterization(struct lp_scene *scene);
 
 void
-lp_scene_end_rasterization(struct lp_scene *scene );
+lp_scene_end_rasterization(struct lp_scene *scene);
 
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 3f5d032..261bca9 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -205,6 +205,8 @@
       return 1;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return 330;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return 140;
    case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
       return 0;
    case PIPE_CAP_COMPUTE:
@@ -357,11 +359,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
    }
    /* should only get here on unhandled cases */
@@ -422,6 +432,12 @@
       return 16.0; /* not actually signficant at this time */
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 16.0; /* arbitrary */
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+      return 0.0;
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+      return 0.0;
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0;
    }
    /* should only get here on unhandled cases */
    debug_printf("Unexpected PIPE_CAP %d query\n", param);
@@ -439,6 +455,7 @@
                               enum pipe_format format,
                               enum pipe_texture_target target,
                               unsigned sample_count,
+                              unsigned storage_sample_count,
                               unsigned bind)
 {
    struct llvmpipe_screen *screen = llvmpipe_screen(_screen);
@@ -462,6 +479,9 @@
    if (sample_count > 1)
       return FALSE;
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    if (bind & PIPE_BIND_RENDER_TARGET) {
       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
          /* this is a lie actually other formats COULD exist where we would fail */
@@ -519,8 +539,7 @@
       }
    }
 
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC ||
-       format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
       /* Software decoding is not hooked up. */
       return FALSE;
    }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index c157323..b087369 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -82,7 +82,7 @@
       lp_fence_wait(setup->scene->fence);
    }
 
-   lp_scene_begin_binning(setup->scene, &setup->fb, setup->rasterizer_discard);
+   lp_scene_begin_binning(setup->scene, &setup->fb);
 
 }
 
@@ -724,25 +724,27 @@
 
 
 void 
-lp_setup_set_flatshade_first( struct lp_setup_context *setup,
-                              boolean flatshade_first )
+lp_setup_set_flatshade_first(struct lp_setup_context *setup,
+                             boolean flatshade_first)
 {
    setup->flatshade_first = flatshade_first;
 }
 
 void
-lp_setup_set_rasterizer_discard( struct lp_setup_context *setup,
-                                 boolean rasterizer_discard )
+lp_setup_set_rasterizer_discard(struct lp_setup_context *setup,
+                                boolean rasterizer_discard)
 {
    if (setup->rasterizer_discard != rasterizer_discard) {
       setup->rasterizer_discard = rasterizer_discard;
-      set_scene_state( setup, SETUP_FLUSHED, __FUNCTION__ );
+      setup->line = first_line;
+      setup->point = first_point;
+      setup->triangle = first_triangle;
    }
 }
 
 void 
-lp_setup_set_vertex_info( struct lp_setup_context *setup,
-                          struct vertex_info *vertex_info )
+lp_setup_set_vertex_info(struct lp_setup_context *setup,
+                         struct vertex_info *vertex_info)
 {
    /* XXX: just silently holding onto the pointer:
     */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index d0bac5e..c1d8237 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -616,8 +616,7 @@
 
    LP_COUNT(nr_tris);
 
-   if (lp_context->active_statistics_queries &&
-       !llvmpipe_rasterization_disabled(lp_context)) {
+   if (lp_context->active_statistics_queries) {
       lp_context->pipeline_statistics.c_primitives++;
    }
 
@@ -759,24 +758,33 @@
 }
 
 
-static void lp_setup_line( struct lp_setup_context *setup,
-                           const float (*v0)[4],
-                           const float (*v1)[4] )
+static void lp_setup_line_discard(struct lp_setup_context *setup,
+                                  const float (*v0)[4],
+                                  const float (*v1)[4])
 {
-   if (!try_setup_line( setup, v0, v1 ))
-   {
+}
+
+static void lp_setup_line(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4])
+{
+   if (!try_setup_line(setup, v0, v1)) {
       if (!lp_setup_flush_and_restart(setup))
          return;
 
-      if (!try_setup_line( setup, v0, v1 ))
+      if (!try_setup_line(setup, v0, v1))
          return;
    }
 }
 
 
-void lp_setup_choose_line( struct lp_setup_context *setup ) 
+void lp_setup_choose_line(struct lp_setup_context *setup)
 { 
-   setup->line = lp_setup_line;
+   if (setup->rasterizer_discard) {
+      setup->line = lp_setup_line_discard;
+   } else {
+      setup->line = lp_setup_line;
+   }
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 8cb6b83..2192789 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -458,8 +458,7 @@
 
    LP_COUNT(nr_tris);
 
-   if (lp_context->active_statistics_queries &&
-       !llvmpipe_rasterization_disabled(lp_context)) {
+   if (lp_context->active_statistics_queries) {
       lp_context->pipeline_statistics.c_primitives++;
    }
 
@@ -518,24 +517,33 @@
 
 
 static void 
+lp_setup_point_discard(struct lp_setup_context *setup,
+                       const float (*v0)[4])
+{
+}
+
+static void
 lp_setup_point(struct lp_setup_context *setup,
                const float (*v0)[4])
 {
-   if (!try_setup_point( setup, v0 ))
-   {
+   if (!try_setup_point(setup, v0)) {
       if (!lp_setup_flush_and_restart(setup))
          return;
 
-      if (!try_setup_point( setup, v0 ))
+      if (!try_setup_point(setup, v0))
          return;
    }
 }
 
 
 void 
-lp_setup_choose_point( struct lp_setup_context *setup )
+lp_setup_choose_point(struct lp_setup_context *setup)
 {
-   setup->point = lp_setup_point;
+   if (setup->rasterizer_discard) {
+      setup->point = lp_setup_point_discard;
+   } else {
+      setup->point = lp_setup_point;
+   }
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 39755d6..cec6198 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -1127,6 +1127,11 @@
                         const float (*v2)[4])
 {
    PIPE_ALIGN_VAR(16) struct fixed_position position;
+   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
+
+   if (lp_context->active_statistics_queries) {
+      lp_context->pipeline_statistics.c_primitives++;
+   }
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -1148,6 +1153,11 @@
                          const float (*v2)[4])
 {
    PIPE_ALIGN_VAR(16) struct fixed_position position;
+   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
+
+   if (lp_context->active_statistics_queries) {
+      lp_context->pipeline_statistics.c_primitives++;
+   }
 
    calc_fixed_position(setup, &position, v0, v1, v2);
 
@@ -1166,8 +1176,7 @@
    PIPE_ALIGN_VAR(16) struct fixed_position position;
    struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
 
-   if (lp_context->active_statistics_queries &&
-       !llvmpipe_rasterization_disabled(lp_context)) {
+   if (lp_context->active_statistics_queries) {
       lp_context->pipeline_statistics.c_primitives++;
    }
 
@@ -1196,17 +1205,21 @@
 }
 
 
-static void triangle_nop( struct lp_setup_context *setup,
-			  const float (*v0)[4],
-			  const float (*v1)[4],
-			  const float (*v2)[4] )
+static void triangle_noop(struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
 {
 }
 
 
 void 
-lp_setup_choose_triangle( struct lp_setup_context *setup )
+lp_setup_choose_triangle(struct lp_setup_context *setup)
 {
+   if (setup->rasterizer_discard) {
+      setup->triangle = triangle_noop;
+      return;
+   }
    switch (setup->cullmode) {
    case PIPE_FACE_NONE:
       setup->triangle = triangle_both;
@@ -1218,7 +1231,7 @@
       setup->triangle = setup->ccw_is_frontface ? triangle_cw : triangle_ccw;
       break;
    default:
-      setup->triangle = triangle_nop;
+      setup->triangle = triangle_noop;
       break;
    }
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 28a48d4..6675b20 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -571,7 +571,7 @@
       stats->gs_invocations;
    llvmpipe->pipeline_statistics.gs_primitives +=
       stats->gs_primitives;
-   if (!llvmpipe_rasterization_disabled(llvmpipe)) {
+   if (!setup->rasterizer_discard) {
       llvmpipe->pipeline_statistics.c_invocations +=
          stats->c_invocations;
    } else {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 3e75d44..4bcca90 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -207,13 +207,27 @@
                           LP_NEW_SAMPLER |
                           LP_NEW_SAMPLER_VIEW |
                           LP_NEW_OCCLUSION_QUERY))
-      llvmpipe_update_fs( llvmpipe );
+      llvmpipe_update_fs(llvmpipe);
 
-   if (llvmpipe->dirty & (LP_NEW_RASTERIZER)) {
+   if (llvmpipe->dirty & (LP_NEW_FS |
+                          LP_NEW_FRAMEBUFFER |
+                          LP_NEW_RASTERIZER |
+                          LP_NEW_DEPTH_STENCIL_ALPHA)) {
+
+      /*
+       * Rasterization is disabled if there is no pixel shader and
+       * both depth and stencil testing are disabled:
+       * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
+       * FIXME: set rasterizer_discard in state tracker instead.
+       */
+      boolean null_fs = !llvmpipe->fs ||
+                        llvmpipe->fs->info.base.num_instructions <= 1;
       boolean discard =
          (llvmpipe->sample_mask & 1) == 0 ||
-         (llvmpipe->rasterizer ? llvmpipe->rasterizer->rasterizer_discard : FALSE);
-
+         (llvmpipe->rasterizer ? llvmpipe->rasterizer->rasterizer_discard : FALSE) ||
+         (null_fs &&
+          !llvmpipe->depth_stencil->depth.enabled &&
+          !llvmpipe->depth_stencil->stencil[0].enabled);
       lp_setup_set_rasterizer_discard(llvmpipe->setup, discard);
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 74b8d4d..b7e16f9 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -2554,6 +2554,25 @@
    assert(builder);
    LLVMPositionBuilderAtEnd(builder, block);
 
+   /*
+    * Must not count ps invocations if there's a null shader.
+    * (It would be ok to count with null shader if there's d/s tests,
+    * but only if there's d/s buffers too, which is different
+    * to implicit rasterization disable which must not depend
+    * on the d/s buffers.)
+    * Could use popcount on mask, but pixel accuracy is not required.
+    * Could disable if there's no stats query, but maybe not worth it.
+    */
+   if (shader->info.base.num_instructions > 1) {
+      LLVMValueRef invocs, val;
+      invocs = lp_jit_thread_data_invocations(gallivm, thread_data_ptr);
+      val = LLVMBuildLoad(builder, invocs, "");
+      val = LLVMBuildAdd(builder, val,
+                         LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), 1, 0),
+                         "invoc_count");
+      LLVMBuildStore(builder, val, invocs);
+   }
+
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->state);
 
@@ -2843,13 +2862,6 @@
          !shader->info.base.writes_samplemask
       ? TRUE : FALSE;
 
-   if ((shader->info.base.num_tokens <= 1) &&
-       !key->depth.enabled && !key->stencil[0].enabled) {
-      variant->ps_inv_multiplier = 0;
-   } else {
-      variant->ps_inv_multiplier = 1;
-   }
-
    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
       lp_debug_fs_variant(variant);
    }
@@ -3470,17 +3482,4 @@
    llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
 }
 
-/*
- * Rasterization is disabled if there is no pixel shader and
- * both depth and stencil testing are disabled:
- * http://msdn.microsoft.com/en-us/library/windows/desktop/bb205125
- */
-boolean
-llvmpipe_rasterization_disabled(struct llvmpipe_context *lp)
-{
-   boolean null_fs = !lp->fs || lp->fs->info.base.num_tokens <= 1;
 
-   return (null_fs &&
-           !lp->depth_stencil->depth.enabled &&
-           !lp->depth_stencil->stencil[0].enabled);
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 2ddd851..28eccde 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -98,7 +98,6 @@
    struct lp_fragment_shader_variant_key key;
 
    boolean opaque;
-   uint8_t ps_inv_multiplier;
 
    struct gallivm_state *gallivm;
 
@@ -150,8 +149,4 @@
 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                                struct lp_fragment_shader_variant *variant);
 
-boolean
-llvmpipe_rasterization_disabled(struct llvmpipe_context *lp);
-
-
 #endif /* LP_STATE_FS_H_ */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 6e58a03..a4f313a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -211,6 +211,14 @@
    assert(src_type.length * num_srcs == dst_type.length * num_dsts);
 
    eps = MAX2(lp_const_eps(src_type), lp_const_eps(dst_type));
+   if (dst_type.norm && dst_type.sign && src_type.sign && !src_type.floating) {
+      /*
+       * This is quite inaccurate due to shift being used.
+       * I don't think it's possible to hit such conversions with
+       * llvmpipe though.
+       */
+      eps *= 2;
+   }
 
    context = LLVMContextCreate();
    gallivm = gallivm_create("test_module", context);
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index e9a6e01..a8aa33d 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -388,8 +388,7 @@
       }
 
       /* missing fetch funcs */
-      if (format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC ||
-          format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
+      if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
          continue;
       }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 518ca27..5ec0dd3 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -147,6 +147,7 @@
       if(type.sign) {
          long long lvalue = (long long)value;
          lvalue = MIN2(lvalue, ((long long)1 << (type.width - 1)) - 1);
+         lvalue = MAX2(lvalue, -((long long)1 << (type.width - 1)));
          switch(type.width) {
          case 8:
             *((int8_t *)dst + index) = (int8_t)lvalue;
@@ -200,16 +201,24 @@
       }
       else {
          unsigned long long mask;
-	 if (type.fixed)
+         if (type.fixed)
             mask = ((unsigned long long)1 << (type.width / 2)) - 1;
          else if (type.sign)
             mask = ((unsigned long long)1 << (type.width - 1)) - 1;
          else
             mask = ((unsigned long long)1 << type.width) - 1;
          value += (double)(mask & rand());
+         if (!type.fixed && !type.sign && type.width == 32) {
+            /*
+             * rand only returns half the possible range
+             * XXX 64bit values...
+             */
+            if(rand() & 1)
+               value += (double)0x80000000;
+         }
       }
    }
-   if(!type.sign)
+   if(type.sign)
       if(rand() & 1)
          value = -value;
    write_elem(type, dst, index, value);
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
index 90741b6..7ee5f8f 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm
@@ -27,11 +27,11 @@
    imul u32 u32 $r3 $r1 $r2
    imad u32 u32 hi $r2 $r2 $r3 $r2
    imul u32 u32 $r3 $r1 $r2
-   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1)
+   sched (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 wt 0x1) (st 0x6 wr 0x0 rd 0x1 wt 0x1)
    imad u32 u32 hi $r2 $r2 $r3 $r2
    imul u32 u32 $r3 $r1 $r2
    imad u32 u32 hi $r2 $r2 $r3 $r2
-   sched (st 0x6) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
+   sched (st 0x6 wt 0x2) (st 0x6 wr 0x0 rd 0x1 wt 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x2)
    mov $r3 $r0 0xf
    imul u32 u32 hi $r0 $r0 $r2
    i2i u32 u32 $r2 neg $r1
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
index 8708a94..65c93f7 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
+++ b/src/gallium/drivers/nouveau/codegen/lib/gm107.asm.h
@@ -16,11 +16,11 @@
 	0x5c38000000270103,
 	0x5a40010000370202,
 	0x5c38000000270103,
-	0x003c1801e0c00f06,
+	0x00241801e0c00f06,
 	0x5a40010000370202,
 	0x5c38000000270103,
 	0x5a40010000370202,
-	0x00443c0120c007e6,
+	0x00443c0120c017e6,
 	0x5c98078000070003,
 	0x5c38008000270000,
 	0x5ce0200000170a02,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
index c987da9..49425b9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -423,7 +423,10 @@
 bool
 ImmediateValue::isPow2() const
 {
-   return util_is_power_of_two_or_zero(reg.data.u32);
+   if (reg.type == TYPE_U64 || reg.type == TYPE_S64)
+      return util_is_power_of_two_or_zero64(reg.data.u64);
+   else
+      return util_is_power_of_two_or_zero(reg.data.u32);
 }
 
 void
@@ -440,6 +443,12 @@
    case TYPE_U32:
       reg.data.u32 = util_logbase2(reg.data.u32);
       break;
+   case TYPE_S64:
+      assert(!this->isNegative());
+      // fall through
+   case TYPE_U64:
+      reg.data.u64 = util_logbase2_64(reg.data.u64);
+      break;
    case TYPE_F32:
       reg.data.f32 = log2f(reg.data.f32);
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f4f3c70..0b220cc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -453,6 +453,7 @@
    SV_TESS_INNER,
    SV_TESS_COORD,
    SV_TID,
+   SV_COMBINED_TID,
    SV_CTAID,
    SV_NTID,
    SV_GRIDID,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 3d0782f..7c835ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -73,6 +73,7 @@
 #define NVISA_GK104_CHIPSET    0xe0
 #define NVISA_GK20A_CHIPSET    0xea
 #define NVISA_GM107_CHIPSET    0x110
+#define NVISA_GM200_CHIPSET    0x120
 
 struct nv50_ir_prog_info
 {
@@ -145,6 +146,7 @@
          bool persampleInvocation;
          bool usesSampleMaskIn;
          bool readsFramebuffer;
+         bool readsSampleLocations;
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 370427d..2118c31 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -207,7 +207,11 @@
 {
    const ImmediateValue *imm = ref.get()->asImm();
 
-   return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
+   if (ty == TYPE_F32)
+      return imm && imm->reg.data.u32 & 0xfff;
+   else
+      return imm && (imm->reg.data.s32 > 0x7ffff ||
+                     imm->reg.data.s32 < -0x80000);
 }
 
 void
@@ -342,7 +346,7 @@
       code[1] |= ((u64 & 0x7fe0000000000000ULL) >> 53);
       code[1] |= ((u64 & 0x8000000000000000ULL) >> 36);
    } else {
-      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      assert((u32 & 0xfff80000) == 0 || (u32 & 0xfff80000) == 0xfff80000);
       code[0] |= (u32 & 0x001ff) << 23;
       code[1] |= (u32 & 0x7fe00) >> 9;
       code[1] |= (u32 & 0x80000) << 8;
@@ -633,7 +637,7 @@
    assert(!i->src(0).mod.neg() && !i->src(1).mod.neg());
    assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
 
-   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+   if (isLIMM(i->src(1), TYPE_S32)) {
       emitForm_L(i, 0x280, 2, Modifier(0));
 
       if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
@@ -2293,6 +2297,7 @@
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
    case SV_THREAD_KILL:   return 0x13;
+   case SV_COMBINED_TID:  return 0x20;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index fafece8..1d31f18 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -124,6 +124,7 @@
 
    void emitMOV();
    void emitS2R();
+   void emitCS2R();
    void emitF2F();
    void emitF2I();
    void emitI2F();
@@ -267,6 +268,7 @@
    case SV_INVOCATION_ID  : id = 0x11; break;
    case SV_THREAD_KILL    : id = 0x13; break;
    case SV_INVOCATION_INFO: id = 0x1d; break;
+   case SV_COMBINED_TID   : id = 0x20; break;
    case SV_TID            : id = 0x21 + val->reg.data.sv.index; break;
    case SV_CTAID          : id = 0x25 + val->reg.data.sv.index; break;
    case SV_LANEMASK_EQ    : id = 0x38; break;
@@ -321,14 +323,10 @@
 {
    if (ref.getFile() == FILE_IMMEDIATE) {
       const ImmediateValue *imm = ref.get()->asImm();
-      if (isFloatType(insn->sType)) {
-         if ((imm->reg.data.u32 & 0x00000fff) != 0x00000000)
-            return true;
-      } else {
-         if ((imm->reg.data.u32 & 0xfff00000) != 0x00000000 &&
-             (imm->reg.data.u32 & 0xfff00000) != 0xfff00000)
-            return true;
-      }
+      if (isFloatType(insn->sType))
+         return imm->reg.data.u32 & 0xfff;
+      else
+         return imm->reg.data.s32 > 0x7ffff || imm->reg.data.s32 < -0x80000;
    }
    return false;
 }
@@ -346,8 +344,9 @@
       } else if (insn->sType == TYPE_F64) {
          assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL));
          val = imm->reg.data.u64 >> 44;
+      } else {
+         assert(!(val & 0xfff80000) || (val & 0xfff80000) == 0xfff80000);
       }
-      assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000);
       emitField( 56,   1, (val & 0x80000) >> 19);
       emitField(pos, len, (val & 0x7ffff));
    } else {
@@ -752,6 +751,14 @@
 }
 
 void
+CodeEmitterGM107::emitCS2R()
+{
+   emitInsn(0x50c80000);
+   emitSYS (0x14, insn->src(0));
+   emitGPR (0x00, insn->def(0));
+}
+
+void
 CodeEmitterGM107::emitF2F()
 {
    RoundMode rnd = insn->rnd;
@@ -1658,7 +1665,7 @@
       break;
    }
 
-   if (insn->src(1).getFile() != FILE_IMMEDIATE) {
+   if (!longIMMD(insn->src(1))) {
       switch (insn->src(1).getFile()) {
       case FILE_GPR:
          emitInsn(0x5c400000);
@@ -1731,7 +1738,7 @@
 void
 CodeEmitterGM107::emitIADD()
 {
-   if (insn->src(1).getFile() != FILE_IMMEDIATE) {
+   if (!longIMMD(insn->src(1))) {
       switch (insn->src(1).getFile()) {
       case FILE_GPR:
          emitInsn(0x5c100000);
@@ -1773,7 +1780,7 @@
 void
 CodeEmitterGM107::emitIMUL()
 {
-   if (insn->src(1).getFile() != FILE_IMMEDIATE) {
+   if (!longIMMD(insn->src(1))) {
       switch (insn->src(1).getFile()) {
       case FILE_GPR:
          emitInsn(0x5c380000);
@@ -3194,7 +3201,10 @@
       emitMOV();
       break;
    case OP_RDSV:
-      emitS2R();
+      if (targGM107->isCS2RSV(insn->getSrc(0)->reg.data.sv.sv))
+         emitCS2R();
+      else
+         emitS2R();
       break;
    case OP_ABS:
    case OP_NEG:
@@ -3621,6 +3631,7 @@
 
    bool insertBarriers(BasicBlock *);
 
+   bool doesInsnWriteTo(const Instruction *insn, const Value *val) const;
    Instruction *findFirstUse(const Instruction *) const;
    Instruction *findFirstDef(const Instruction *) const;
 
@@ -3951,8 +3962,48 @@
    return false;
 }
 
-// Find the next instruction inside the same basic block which uses the output
-// of the given instruction in order to avoid RaW hazards.
+// Helper function for findFirstUse() and findFirstDef()
+bool
+SchedDataCalculatorGM107::doesInsnWriteTo(const Instruction *insn,
+                                          const Value *val) const
+{
+   if (val->reg.file != FILE_GPR &&
+       val->reg.file != FILE_PREDICATE &&
+       val->reg.file != FILE_FLAGS)
+      return false;
+
+   for (int d = 0; insn->defExists(d); ++d) {
+      const Value* def = insn->getDef(d);
+      int minGPR = def->reg.data.id;
+      int maxGPR = minGPR + def->reg.size / 4 - 1;
+
+      if (def->reg.file != val->reg.file)
+         continue;
+
+      if (def->reg.file == FILE_GPR) {
+         if (val->reg.data.id + val->reg.size / 4 - 1 < minGPR ||
+             val->reg.data.id > maxGPR)
+            continue;
+         return true;
+      } else
+      if (def->reg.file == FILE_PREDICATE) {
+         if (val->reg.data.id != minGPR)
+            continue;
+         return true;
+      } else
+      if (def->reg.file == FILE_FLAGS) {
+         if (val->reg.data.id != minGPR)
+            continue;
+         return true;
+      }
+   }
+
+   return false;
+}
+
+// Find the next instruction inside the same basic block which uses (reads from
+// or writes to) the output of the given instruction in order to avoid RaW and
+// WaW hazards.
 Instruction *
 SchedDataCalculatorGM107::findFirstUse(const Instruction *bari) const
 {
@@ -3964,34 +4015,13 @@
    for (insn = bari->next; insn != NULL; insn = next) {
       next = insn->next;
 
-      for (int s = 0; insn->srcExists(s); ++s) {
-         const Value *src = insn->src(s).rep();
-         for (int d = 0; bari->defExists(d); ++d) {
-            const ValueDef &def = bari->def(d);
-            int minGPR = def.rep()->reg.data.id;
-            int maxGPR = minGPR + def.rep()->reg.size / 4 - 1;
+      for (int s = 0; insn->srcExists(s); ++s)
+         if (doesInsnWriteTo(bari, insn->getSrc(s)))
+            return insn;
 
-            if (def.getFile() == FILE_GPR) {
-               if (insn->src(s).getFile() != FILE_GPR ||
-                   src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
-                   src->reg.data.id > maxGPR)
-                  continue;
-               return insn;
-            } else
-            if (def.getFile() == FILE_PREDICATE) {
-               if (insn->src(s).getFile() != FILE_PREDICATE ||
-                   src->reg.data.id != minGPR)
-                  continue;
-               return insn;
-            }
-            if (def.getFile() == FILE_FLAGS) {
-               if (insn->src(s).getFile() != FILE_FLAGS ||
-                   src->reg.data.id != minGPR)
-                  continue;
-               return insn;
-            }
-         }
-      }
+      for (int d = 0; insn->defExists(d); ++d)
+         if (doesInsnWriteTo(bari, insn->getDef(d)))
+            return insn;
    }
    return NULL;
 }
@@ -4002,34 +4032,16 @@
 SchedDataCalculatorGM107::findFirstDef(const Instruction *bari) const
 {
    Instruction *insn, *next;
-   int minGPR, maxGPR;
+
+   if (!bari->srcExists(0))
+      return NULL;
 
    for (insn = bari->next; insn != NULL; insn = next) {
       next = insn->next;
 
-      for (int d = 0; insn->defExists(d); ++d) {
-         const Value *def = insn->def(d).rep();
-         if (insn->def(d).getFile() != FILE_GPR &&
-             insn->def(d).getFile() != FILE_FLAGS)
-            continue;
-
-         minGPR = def->reg.data.id;
-         maxGPR = minGPR + def->reg.size / 4 - 1;
-
-         for (int s = 0; bari->srcExists(s); ++s) {
-            const Value *src = bari->src(s).rep();
-            if (bari->src(s).getFile() == FILE_FLAGS &&
-                insn->def(d).getFile() == FILE_FLAGS &&
-                src->reg.data.id == minGPR)
-               return insn;
-            if (bari->src(s).getFile() != FILE_GPR ||
-                insn->def(d).getFile() != FILE_GPR ||
-                src->reg.data.id + src->reg.size / 4 - 1 < minGPR ||
-                src->reg.data.id > maxGPR)
-               continue;
+      for (int s = 0; bari->srcExists(s); ++s)
+         if (doesInsnWriteTo(insn, bari->getSrc(s)))
             return insn;
-         }
-      }
    }
    return NULL;
 }
@@ -4087,7 +4099,8 @@
       if (need_wr_bar) {
          // When the instruction requires to emit a write dependency barrier
          // (all which write something at a variable latency), find the next
-         // instruction which reads the outputs.
+         // instruction which reads the outputs (or writes to them, potentially
+         // completing before this insn).
          usei = findFirstUse(insn);
 
          // Allocate and emit a new barrier.
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index be7ac18..b6e35dd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -213,7 +213,11 @@
 {
    const ImmediateValue *imm = ref.get()->asImm();
 
-   return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
+   if (ty == TYPE_F32)
+      return imm && imm->reg.data.u32 & 0xfff;
+   else
+      return imm && (imm->reg.data.s32 > 0x7ffff ||
+                     imm->reg.data.s32 < -0x80000);
 }
 
 void
@@ -352,7 +356,7 @@
    } else
    if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) {
       // integer immediate
-      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      assert((u32 & 0xfff80000) == 0 || (u32 & 0xfff80000) == 0xfff80000);
       assert(!(code[1] & 0xc000));
       u32 &= 0xfffff;
       code[0] |= (u32 & 0x3f) << 26;
@@ -641,7 +645,7 @@
 CodeEmitterNVC0::emitUMUL(const Instruction *i)
 {
    if (i->encSize == 8) {
-      if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (isLIMM(i->src(1), TYPE_U32)) {
          emitForm_A(i, HEX64(10000000, 00000002));
       } else {
          emitForm_A(i, HEX64(50000000, 00000003));
@@ -1986,6 +1990,7 @@
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
    case SV_THREAD_KILL:   return 0x13;
+   case SV_COMBINED_TID:  return 0x20;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
@@ -2069,7 +2074,7 @@
             assert(!(imm & 0x000fffff));
             code[0] = 0x00000318 | imm;
          } else {
-            assert(imm < 0x800 || ((int32_t)imm >= -0x800));
+            assert(imm < 0x800 && ((int32_t)imm >= -0x800));
             code[0] = 0x00000118 | (imm << 20);
          }
       } else {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 3c5bad0..2f9bcc1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1520,6 +1520,10 @@
          info->out[src.getIndex(0)].oread = 1;
       }
    }
+   if (src.getFile() == TGSI_FILE_SYSTEM_VALUE) {
+      if (info->sv[src.getIndex(0)].sn == TGSI_SEMANTIC_SAMPLEPOS)
+         info->prop.fp.readsSampleLocations = true;
+   }
    if (src.getFile() != TGSI_FILE_INPUT)
       return;
 
@@ -1560,9 +1564,17 @@
    if (insn.getOpcode() == TGSI_OPCODE_FBFETCH)
       info->prop.fp.readsFramebuffer = true;
 
+   if (insn.getOpcode() == TGSI_OPCODE_INTERP_SAMPLE)
+      info->prop.fp.readsSampleLocations = true;
+
    if (insn.dstCount()) {
       Instruction::DstRegister dst = insn.getDst(0);
 
+      if (insn.getOpcode() == TGSI_OPCODE_STORE &&
+          dst.getFile() != TGSI_FILE_MEMORY) {
+         info->io.globalAccess |= 0x2;
+      }
+
       if (dst.getFile() == TGSI_FILE_OUTPUT) {
          if (dst.isIndirect(0))
             for (unsigned i = 0; i < info->numOutputs; ++i)
@@ -1580,10 +1592,6 @@
          if (isEdgeFlagPassthrough(insn))
             info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
       } else
-      if (dst.getFile() != TGSI_FILE_MEMORY &&
-          insn.getOpcode() == TGSI_OPCODE_STORE) {
-         info->io.globalAccess |= 0x2;
-      } else
       if (dst.getFile() == TGSI_FILE_TEMPORARY) {
          if (dst.isIndirect(0))
             indirectTempArrays.insert(dst.getArrayId());
@@ -3605,6 +3613,9 @@
                                   info->out[info->io.viewportId].slot[0] * 4);
          mkStore(OP_EXPORT, TYPE_U32, vpSym, NULL, viewport);
       }
+      /* handle user clip planes for each emitted vertex */
+      if (info->io.genUserClip > 0)
+         handleUserClipPlanes();
       /* fallthrough */
    case TGSI_OPCODE_ENDPRIM:
    {
@@ -3779,7 +3790,9 @@
       setPosition(epilogue, true);
       if (prog->getType() == Program::TYPE_FRAGMENT)
          exportOutputs();
-      if (info->io.genUserClip > 0)
+      if ((prog->getType() == Program::TYPE_VERTEX ||
+           prog->getType() == Program::TYPE_TESSELLATION_EVAL
+          ) && info->io.genUserClip > 0)
          handleUserClipPlanes();
       mkOp(OP_EXIT, TYPE_NONE, NULL)->terminator = 1;
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 209f5c6..49a5f3b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -315,6 +315,19 @@
       samples->tex.query = TXQ_TYPE;
    }
 
+   if (suq->tex.target.isMS()) {
+      bld.setPosition(suq, true);
+
+      if (mask & 0x1)
+         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(0), suq->getDef(0),
+                   loadMsAdjInfo32(suq->tex.target, 0, slot, ind, suq->tex.bindless));
+      if (mask & 0x2) {
+         int d = util_bitcount(mask & 0x1);
+         bld.mkOp2(OP_SHR, TYPE_U32, suq->getDef(d), suq->getDef(d),
+                   loadMsAdjInfo32(suq->tex.target, 1, slot, ind, suq->tex.bindless));
+      }
+   }
+
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 36ab837..1f0fd46 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -1201,6 +1201,9 @@
          bld.mkMov(def, bld.mkImm(0));
       }
       break;
+   case SV_COMBINED_TID:
+      bld.mkMov(def, tid);
+      break;
    case SV_SAMPLE_POS: {
       Value *off = new_LValue(func, FILE_ADDRESS);
       bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index d967f1a..3feb1fc 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1712,35 +1712,6 @@
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 }
 
-/* On nvc0, surface info is obtained via the surface binding points passed
- * to the SULD/SUST instructions.
- * On nve4, surface info is stored in c[] and is used by various special
- * instructions, e.g. for clamping coordinates or generating an address.
- * They couldn't just have added an equivalent to TIC now, couldn't they ?
- */
-#define NVC0_SU_INFO_ADDR   0x00
-#define NVC0_SU_INFO_FMT    0x04
-#define NVC0_SU_INFO_DIM_X  0x08
-#define NVC0_SU_INFO_PITCH  0x0c
-#define NVC0_SU_INFO_DIM_Y  0x10
-#define NVC0_SU_INFO_ARRAY  0x14
-#define NVC0_SU_INFO_DIM_Z  0x18
-#define NVC0_SU_INFO_UNK1C  0x1c
-#define NVC0_SU_INFO_WIDTH  0x20
-#define NVC0_SU_INFO_HEIGHT 0x24
-#define NVC0_SU_INFO_DEPTH  0x28
-#define NVC0_SU_INFO_TARGET 0x2c
-#define NVC0_SU_INFO_BSIZE  0x30
-#define NVC0_SU_INFO_RAW_X  0x34
-#define NVC0_SU_INFO_MS_X   0x38
-#define NVC0_SU_INFO_MS_Y   0x3c
-
-#define NVC0_SU_INFO__STRIDE 0x40
-
-#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
-#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
-#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
-
 inline Value *
 NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless)
 {
@@ -1761,6 +1732,45 @@
                         prog->driver->io.suInfoBase);
 }
 
+Value *
+NVC0LoweringPass::loadMsAdjInfo32(TexInstruction::Target target, uint32_t index, int slot, Value *ind, bool bindless)
+{
+   if (!bindless || targ->getChipset() < NVISA_GM107_CHIPSET)
+      return loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(index), bindless);
+
+   assert(bindless);
+
+   Value *samples = bld.getSSA();
+   // this shouldn't be lowered because it's being inserted before the current instruction
+   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
+   tex->tex.target = target;
+   tex->tex.query = TXQ_TYPE;
+   tex->tex.mask = 0x4;
+   tex->tex.r = 0xff;
+   tex->tex.s = 0x1f;
+   tex->tex.rIndirectSrc = 0;
+   tex->setDef(0, samples);
+   tex->setSrc(0, ind);
+   tex->setSrc(1, bld.loadImm(NULL, 0));
+   bld.insert(tex);
+
+   // doesn't work with sample counts other than 1/2/4/8 but they aren't supported
+   switch (index) {
+   case 0: {
+      Value *tmp = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), samples, bld.mkImm(2));
+      return bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(2));
+   }
+   case 1: {
+      Value *tmp = bld.mkCmp(OP_SET, CC_GT, TYPE_U32, bld.getSSA(), TYPE_U32, samples, bld.mkImm(2))->getDef(0);
+      return bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), tmp, bld.mkImm(1));
+   }
+   default: {
+      assert(false);
+      return NULL;
+   }
+   }
+}
+
 static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
 {
    switch (su->tex.target.getEnum()) {
@@ -1846,8 +1856,8 @@
    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
    Value *ind = tex->getIndirectR();
 
-   Value *ms_x = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(0), tex->tex.bindless);
-   Value *ms_y = loadSuInfo32(ind, slot, NVC0_SU_INFO_MS(1), tex->tex.bindless);
+   Value *ms_x = loadMsAdjInfo32(tex->tex.target, 0, slot, ind, tex->tex.bindless);
+   Value *ms_y = loadMsAdjInfo32(tex->tex.target, 1, slot, ind, tex->tex.bindless);
 
    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
@@ -2374,6 +2384,8 @@
 
    bld.setPosition(su, false);
 
+   adjustCoordinatesMS(su);
+
    // add texture handle
    switch (su->op) {
    case OP_SUSTP:
@@ -2630,6 +2642,18 @@
          // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
          i->op = OP_MOV;
          i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
+      } else
+      if (sv == SV_TID) {
+         // Help CSE combine TID fetches
+         Value *tid = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(),
+                                 bld.mkSysVal(SV_COMBINED_TID, 0));
+         i->op = OP_EXTBF;
+         i->setSrc(0, tid);
+         switch (sym->reg.data.sv.index) {
+         case 0: i->setSrc(1, bld.mkImm(0x1000)); break;
+         case 1: i->setSrc(1, bld.mkImm(0x0a10)); break;
+         case 2: i->setSrc(1, bld.mkImm(0x061a)); break;
+         }
       }
       if (sv == SV_VERTEX_COUNT) {
          bld.setPosition(i, true);
@@ -2689,17 +2713,33 @@
       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
       break;
    case SV_SAMPLE_POS: {
-      Value *off = new_LValue(func, FILE_GPR);
-      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
+      Value *sampleID = bld.getScratch();
+      ld = bld.mkOp1(OP_PIXLD, TYPE_U32, sampleID, bld.mkImm(0));
       ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
-      bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
-      bld.mkLoad(TYPE_F32,
-                 i->getDef(0),
-                 bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
-                       TYPE_U32, prog->driver->io.sampleInfoBase +
-                       4 * sym->reg.data.sv.index),
-                 off);
+      Value *offset = calculateSampleOffset(sampleID);
+
+      assert(prog->driver->prop.fp.readsSampleLocations);
+
+      if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
+         bld.mkLoad(TYPE_F32,
+                    i->getDef(0),
+                    bld.mkSymbol(
+                          FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                          TYPE_U32, prog->driver->io.sampleInfoBase),
+                    offset);
+         bld.mkOp2(OP_EXTBF, TYPE_U32, i->getDef(0), i->getDef(0),
+                   bld.mkImm(0x040c + sym->reg.data.sv.index * 16));
+         bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_U32, i->getDef(0));
+         bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(0), i->getDef(0), bld.mkImm(1.0f / 16.0f));
+      } else {
+         bld.mkLoad(TYPE_F32,
+                    i->getDef(0),
+                    bld.mkSymbol(
+                          FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                          TYPE_U32, prog->driver->io.sampleInfoBase +
+                          4 * sym->reg.data.sv.index),
+                    offset);
+      }
       break;
    }
    case SV_SAMPLE_MASK: {
@@ -2859,6 +2899,69 @@
    return true;
 }
 
+Value *
+NVC0LoweringPass::calculateSampleOffset(Value *sampleID)
+{
+   Value *offset = bld.getScratch();
+   if (targ->getChipset() >= NVISA_GM200_CHIPSET) {
+      // Sample location offsets (in bytes) are calculated like so:
+      // offset = (SV_POSITION.y % 4 * 2) + (SV_POSITION.x % 2)
+      // offset = offset * 32 + sampleID % 8 * 4;
+      // which is equivalent to:
+      // offset = ((SV_POSITION.y & 0x3) << 6) + ((SV_POSITION.x & 0x1) << 5);
+      // offset += (sampleID & 0x7) << 2
+
+      // The second operand (src1) of the INSBF instructions is encoded like so:
+      // 0xssll where ss is the size and ll is the offset.
+      // so: dest = src2 | (src0 & ((1 << ss) - 1)) << ll
+
+      // Add sample ID (offset = (sampleID & 0x7) << 2)
+      bld.mkOp3(OP_INSBF, TYPE_U32, offset, sampleID, bld.mkImm(0x0302), bld.mkImm(0x0));
+
+      Symbol *xSym = bld.mkSysVal(SV_POSITION, 0);
+      Symbol *ySym = bld.mkSysVal(SV_POSITION, 1);
+      Value *coord = bld.getScratch();
+
+      // Add X coordinate (offset |= (SV_POSITION.x & 0x1) << 5)
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
+                   targ->getSVAddress(FILE_SHADER_INPUT, xSym), NULL);
+      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
+         ->rnd = ROUND_ZI;
+      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0105), offset);
+
+      // Add Y coordinate (offset |= (SV_POSITION.y & 0x3) << 6)
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, coord,
+                   targ->getSVAddress(FILE_SHADER_INPUT, ySym), NULL);
+      bld.mkCvt(OP_CVT, TYPE_U32, coord, TYPE_F32, coord)
+         ->rnd = ROUND_ZI;
+      bld.mkOp3(OP_INSBF, TYPE_U32, offset, coord, bld.mkImm(0x0206), offset);
+   } else {
+      bld.mkOp2(OP_SHL, TYPE_U32, offset, sampleID, bld.mkImm(3));
+   }
+   return offset;
+}
+
+// Handle programmable sample locations for GM20x+
+void
+NVC0LoweringPass::handlePIXLD(Instruction *i)
+{
+   if (i->subOp != NV50_IR_SUBOP_PIXLD_OFFSET)
+      return;
+   if (targ->getChipset() < NVISA_GM200_CHIPSET)
+      return;
+
+   assert(prog->driver->prop.fp.readsSampleLocations);
+
+   bld.mkLoad(TYPE_F32,
+              i->getDef(0),
+              bld.mkSymbol(
+                    FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                    TYPE_U32, prog->driver->io.sampleInfoBase),
+              calculateSampleOffset(i->getSrc(0)));
+
+   bld.getBB()->remove(i);
+}
+
 // Generate a binary predicate if an instruction is predicated by
 // e.g. an f32 value.
 void
@@ -2958,6 +3061,9 @@
    case OP_BUFQ:
       handleBUFQ(i);
       break;
+   case OP_PIXLD:
+      handlePIXLD(i);
+      break;
    default:
       break;
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 7f78cd3..4136b1ec 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -23,6 +23,35 @@
 #include "codegen/nv50_ir.h"
 #include "codegen/nv50_ir_build_util.h"
 
+/* On nvc0, surface info is obtained via the surface binding points passed
+ * to the SULD/SUST instructions.
+ * On nve4, surface info is stored in c[] and is used by various special
+ * instructions, e.g. for clamping coordinates or generating an address.
+ * They couldn't just have added an equivalent to TIC now, couldn't they ?
+ */
+#define NVC0_SU_INFO_ADDR   0x00
+#define NVC0_SU_INFO_FMT    0x04
+#define NVC0_SU_INFO_DIM_X  0x08
+#define NVC0_SU_INFO_PITCH  0x0c
+#define NVC0_SU_INFO_DIM_Y  0x10
+#define NVC0_SU_INFO_ARRAY  0x14
+#define NVC0_SU_INFO_DIM_Z  0x18
+#define NVC0_SU_INFO_UNK1C  0x1c
+#define NVC0_SU_INFO_WIDTH  0x20
+#define NVC0_SU_INFO_HEIGHT 0x24
+#define NVC0_SU_INFO_DEPTH  0x28
+#define NVC0_SU_INFO_TARGET 0x2c
+#define NVC0_SU_INFO_BSIZE  0x30
+#define NVC0_SU_INFO_RAW_X  0x34
+#define NVC0_SU_INFO_MS_X   0x38
+#define NVC0_SU_INFO_MS_Y   0x3c
+
+#define NVC0_SU_INFO__STRIDE 0x40
+
+#define NVC0_SU_INFO_DIM(i)  (0x08 + (i) * 8)
+#define NVC0_SU_INFO_SIZE(i) (0x20 + (i) * 4)
+#define NVC0_SU_INFO_MS(i)   (0x38 + (i) * 4)
+
 namespace nv50_ir {
 
 class NVC0LegalizeSSA : public Pass
@@ -116,8 +145,10 @@
    void handleSharedATOMNVE4(Instruction *);
    void handleLDST(Instruction *);
    bool handleBUFQ(Instruction *);
+   void handlePIXLD(Instruction *);
 
    void checkPredicate(Instruction *);
+   Value *loadMsAdjInfo32(TexInstruction::Target targ, uint32_t index, int slot, Value *ind, bool bindless);
 
    virtual bool visit(Instruction *);
 
@@ -143,6 +174,7 @@
    void processSurfaceCoordsNVC0(TexInstruction *);
    void convertSurfaceFormat(TexInstruction *);
    void insertOOBSurfaceOpResult(TexInstruction *);
+   Value *calculateSampleOffset(Value *sampleID);
 
 protected:
    Value *loadTexHandle(Value *ptr, unsigned int slot);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 6a77f8a..7bb12cd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -283,6 +283,8 @@
 {
 private:
    virtual bool visit(BasicBlock *);
+
+   BuildUtil bld;
 };
 
 bool
@@ -294,6 +296,8 @@
    for (Instruction *i = bb->getEntry(); i; i = next) {
       next = i->next;
 
+      bld.setPosition(i, false);
+
       for (int s = 0; i->srcExists(s); ++s) {
          Instruction *insn;
          ImmediateValue imm;
@@ -325,6 +329,14 @@
             i->setIndirect(s, 0, NULL);
             i->setSrc(s, cloneShallow(func, i->getSrc(s)));
             i->src(s).get()->reg.data.offset += imm.reg.data.u32;
+         } else if (insn->op == OP_SHLADD) {
+            if (!insn->src(2).getImmediate(imm) ||
+                !targ->insnCanLoadOffset(i, s, imm.reg.data.s32))
+               continue;
+            i->setIndirect(s, 0, bld.mkOp2v(
+               OP_SHL, TYPE_U32, bld.getSSA(), insn->getSrc(0), insn->getSrc(1)));
+            i->setSrc(s, cloneShallow(func, i->getSrc(s)));
+            i->src(s).get()->reg.data.offset += imm.reg.data.u32;
          }
       }
    }
@@ -1654,6 +1666,7 @@
 // SLCT(a, b, const) -> cc(const) ? a : b
 // RCP(RCP(a)) -> a
 // MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
 class AlgebraicOpt : public Pass
 {
 private:
@@ -1671,6 +1684,7 @@
    void handleCVT_EXTBF(Instruction *);
    void handleSUCLAMP(Instruction *);
    void handleNEG(Instruction *);
+   void handleEXTBF_RDSV(Instruction *);
 
    BuildUtil bld;
 };
@@ -2175,6 +2189,41 @@
    }
 }
 
+// EXTBF(RDSV(COMBINED_TID)) -> RDSV(TID)
+// Rewrites i in place for the three recognised bitfield immediates; does
+// nothing when src0 has no unique defining insn or is not such an RDSV.
+void
+AlgebraicOpt::handleEXTBF_RDSV(Instruction *i)
+{
+   Instruction *rdsv = i->getSrc(0)->getUniqueInsn();
+   if (!rdsv || rdsv->op != OP_RDSV ||
+       rdsv->getSrc(0)->asSym()->reg.data.sv.sv != SV_COMBINED_TID)
+      return;
+   // Avoid creating more RDSV instructions
+   if (rdsv->getDef(0)->refCount() > 1)
+      return;
+
+   ImmediateValue imm;
+   if (!i->src(1).getImmediate(imm))
+      return;
+
+   int index;
+   if (imm.isInteger(0x1000))
+      index = 0;
+   else if (imm.isInteger(0x0a10))
+      index = 1;
+   else if (imm.isInteger(0x061a))
+      index = 2;
+   else
+      return;
+
+   bld.setPosition(i, false);
+
+   i->op = OP_RDSV;
+   i->setSrc(0, bld.mkSysVal(SV_TID, index));
+   i->setSrc(1, NULL);
+}
+
 bool
 AlgebraicOpt::visit(BasicBlock *bb)
 {
@@ -2215,6 +2264,9 @@
       case OP_NEG:
          handleNEG(i);
          break;
+      case OP_EXTBF:
+         handleEXTBF_RDSV(i);
+         break;
       default:
          break;
       }
@@ -3432,6 +3484,11 @@
    } else
    if (this->asFlow()) {
       return false;
+   } else
+   if (this->op == OP_PHI && this->bb != that->bb) {
+      /* TODO: we could probably be a bit smarter here by following the
+       * control flow, but honestly, it is quite painful to check */
+      return false;
    } else {
       if (this->ipa != that->ipa ||
           this->lanes != that->lanes ||
@@ -3528,6 +3585,7 @@
             break;
       }
       if (!phi->srcExists(s)) {
+         assert(ik->op != OP_PHI);
          Instruction *entry = bb->getEntry();
          ik->bb->remove(ik);
          if (!entry || entry->op != OP_JOIN)
@@ -3798,10 +3856,10 @@
    RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
    RUN_PASS(1, ConstantFolding, foldAll);
    RUN_PASS(0, Split64BitOpPreRA, run);
+   RUN_PASS(2, LateAlgebraicOpt, run);
    RUN_PASS(1, LoadPropagation, run);
    RUN_PASS(1, IndirectPropagation, run);
    RUN_PASS(2, MemoryOpt, run);
-   RUN_PASS(2, LateAlgebraicOpt, run);
    RUN_PASS(2, LocalCSE, run);
    RUN_PASS(0, DeadCodeElim, buryAll);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index ab39f9f..ee3506f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -217,7 +217,7 @@
 
 static const char *pixldOpStr[] =
 {
-   "count", "covmask", "offset", "cent_offset", "sampleid"
+   "count", "covmask", "covered", "offset", "cent_offset", "sampleid"
 };
 
 static const char *rcprsqOpStr[] =
@@ -306,6 +306,7 @@
    "TESS_INNER",
    "TESS_COORD",
    "TID",
+   "COMBINED_TID",
    "CTAID",
    "NTID",
    "GRIDID",
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 3a0e56e..b660fec 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -257,6 +257,7 @@
    private:
       virtual bool visit(BasicBlock *);
 
+      void insertConstraintMove(Instruction *, int s);
       bool insertConstraintMoves();
 
       void condenseDefs(Instruction *);
@@ -1466,17 +1467,36 @@
          nodes[i].init(regs, lval);
          RIG.insert(&nodes[i]);
 
-         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL &&
-             prog->getTarget()->getChipset() < 0xc0) {
+         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL) {
             Instruction *insn = lval->getInsn();
-            if (insn->op == OP_MAD || insn->op == OP_FMA || insn->op == OP_SAD)
-               // Short encoding only possible if they're all GPRs, no need to
-               // affect them otherwise.
-               if (insn->flagsDef < 0 &&
-                   insn->src(0).getFile() == FILE_GPR &&
-                   insn->src(1).getFile() == FILE_GPR &&
-                   insn->src(2).getFile() == FILE_GPR)
-                  nodes[i].addRegPreference(getNode(insn->getSrc(2)->asLValue()));
+            if (insn->op != OP_MAD && insn->op != OP_FMA && insn->op != OP_SAD)
+               continue;
+            // For both of the cases below, we only want to add the preference
+            // if all arguments are in registers.
+            if (insn->src(0).getFile() != FILE_GPR ||
+                insn->src(1).getFile() != FILE_GPR ||
+                insn->src(2).getFile() != FILE_GPR)
+               continue;
+            if (prog->getTarget()->getChipset() < 0xc0) {
+               // Outputting a flag is not supported with short encodings nor
+               // with immediate arguments.
+               // See handleMADforNV50.
+               if (insn->flagsDef >= 0)
+                  continue;
+            } else {
+               // We can only fold immediate arguments if dst == src2. This
+               // only matters if one of the first two arguments is an
+               // immediate. This form is also only supported for floats.
+               // See handleMADforNVC0.
+               ImmediateValue imm;
+               if (insn->dType != TYPE_F32)
+                  continue;
+               if (!insn->src(0).getImmediate(imm) &&
+                   !insn->src(1).getImmediate(imm))
+                  continue;
+            }
+
+            nodes[i].addRegPreference(getNode(insn->getSrc(2)->asLValue()));
          }
       }
    }
@@ -2216,6 +2236,8 @@
    for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
       if (!tex->srcExists(c))
          tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
+      else
+         insertConstraintMove(tex, c);
       if (!tex->defExists(c))
          tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
    }
@@ -2288,6 +2310,53 @@
    return true;
 }
 
+void
+RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
+{
+   const uint8_t size = cst->src(s).getSize();
+   // Give source s of cst its own copy unless its def can be reused as is.
+   assert(cst->getSrc(s)->defs.size() == 1); // still SSA
+   // imm: def is a MOV of an immediate; load: def is a non-indirect c[] LOAD.
+   Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+   bool imm = defi->op == OP_MOV &&
+      defi->src(0).getFile() == FILE_IMMEDIATE;
+   bool load = defi->op == OP_LOAD &&
+      defi->src(0).getFile() == FILE_MEMORY_CONST &&
+      !defi->src(0).isIndirect(0);
+   // catch some cases where we don't really need MOVs
+   if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) {
+      if (imm || load) {
+         // Move the defi right before the cst. No point in expanding
+         // the range.
+         defi->bb->remove(defi);
+         cst->bb->insertBefore(cst, defi);
+      }
+      return;
+   }
+   // Otherwise build a new value with the same register file and size.
+   LValue *lval = new_LValue(func, cst->src(s).getFile());
+   lval->reg.size = size;
+
+   Instruction *mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+   mov->setDef(0, lval);
+   mov->setSrc(0, cst->getSrc(s));
+   // Redo the constant load / immediate move instead of copying its result.
+   if (load) {
+      mov->op = OP_LOAD;
+      mov->setSrc(0, defi->getSrc(0));
+   } else if (imm) {
+      mov->setSrc(0, defi->getSrc(0));
+   }
+   // The new instruction takes over the predicate of defi, if it has one.
+   if (defi->getPredicate())
+      mov->setPredicate(defi->cc, defi->getPredicate());
+   // Make cst read the copy and place the copy directly in front of cst.
+   cst->setSrc(s, mov->getDef(0));
+   cst->bb->insertBefore(cst, mov);
+
+   cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
+}
+
 // Insert extra moves so that, if multiple register constraints on a value are
 // in conflict, these conflicts can be resolved.
 bool
@@ -2328,46 +2397,8 @@
                cst->bb->insertBefore(cst, mov);
                continue;
             }
-            assert(cst->getSrc(s)->defs.size() == 1); // still SSA
 
-            Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
-            bool imm = defi->op == OP_MOV &&
-               defi->src(0).getFile() == FILE_IMMEDIATE;
-            bool load = defi->op == OP_LOAD &&
-               defi->src(0).getFile() == FILE_MEMORY_CONST &&
-               !defi->src(0).isIndirect(0);
-            // catch some cases where don't really need MOVs
-            if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) {
-               if (imm || load) {
-                  // Move the defi right before the cst. No point in expanding
-                  // the range.
-                  defi->bb->remove(defi);
-                  cst->bb->insertBefore(cst, defi);
-               }
-               continue;
-            }
-
-            LValue *lval = new_LValue(func, cst->src(s).getFile());
-            lval->reg.size = size;
-
-            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
-            mov->setDef(0, lval);
-            mov->setSrc(0, cst->getSrc(s));
-
-            if (load) {
-               mov->op = OP_LOAD;
-               mov->setSrc(0, defi->getSrc(0));
-            } else if (imm) {
-               mov->setSrc(0, defi->getSrc(0));
-            }
-
-            cst->setSrc(s, mov->getDef(0));
-            cst->bb->insertBefore(cst, mov);
-
-            cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
-
-            if (cst->op == OP_UNION)
-               mov->setPredicate(defi->cc, defi->getPredicate());
+            insertConstraintMove(cst, s);
          }
       }
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 04cbd40..adbfcc3 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -153,9 +153,10 @@
       case OP_AFETCH:
       case OP_PFETCH:
       case OP_PIXLD:
-      case OP_RDSV:
       case OP_SHFL:
          return true;
+      case OP_RDSV:
+         return !isCS2RSV(insn->getSrc(0)->reg.data.sv.sv);
       default:
          break;
       }
@@ -232,6 +233,8 @@
       if (insn->dType != TYPE_F64)
          return 6;
       break;
+   case OP_RDSV:
+      return isCS2RSV(insn->getSrc(0)->reg.data.sv.sv) ? 6 : 15;
    case OP_ABS:
    case OP_CEIL:
    case OP_CVT:
@@ -322,6 +325,12 @@
 }
 
 bool
+TargetGM107::isCS2RSV(SVSemantic sv) const
+{
+   return sv == SV_CLOCK; // currently the only sysval accepted here
+}
+
+bool
 TargetGM107::runLegalizePass(Program *prog, CGStage stage) const
 {
    if (stage == CG_STAGE_PRE_SSA) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
index dd4aa6a..10f06d2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.h
@@ -23,6 +23,8 @@
    virtual bool canDualIssue(const Instruction *, const Instruction *) const;
    virtual int getLatency(const Instruction *) const;
    virtual int getReadLatency(const Instruction *) const;
+
+   virtual bool isCS2RSV(SVSemantic) const;
 };
 
 } // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 83b4102..1ad3467 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -111,16 +111,15 @@
 {
    unsigned int i, j;
 
-   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   static const operation commutativeList[] =
    {
-      // ADD, MUL, MAD, FMA, AND, OR, XOR, MAX, MIN, SET_AND, SET_OR, SET_XOR,
-      // SET, SELP, SLCT
-      0x0ce0ca00, 0x0000007e, 0x00000000, 0x00000000
+      OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN,
+      OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT
    };
-   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   static const operation shortFormList[] =
    {
-      // MOV, ADD, SUB, MUL, MAD, SAD, RCP, L/PINTERP, TEX, TXF
-      0x00014e40, 0x00000080, 0x00001260, 0x00000000
+      OP_MOV, OP_ADD, OP_SUB, OP_MUL, OP_MAD, OP_SAD, OP_RCP, OP_LINTERP,
+      OP_PINTERP, OP_TEX, OP_TXF
    };
    static const operation noDestList[] =
    {
@@ -157,18 +156,22 @@
 
       opInfo[i].hasDest = 1;
       opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
-      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].commutative = false; /* set below */
       opInfo[i].pseudo = (i < OP_MOV);
       opInfo[i].predicate = !opInfo[i].pseudo;
       opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
-      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+      opInfo[i].minEncSize = 8; /* set below */
    }
-   for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
+   for (i = 0; i < ARRAY_SIZE(commutativeList); ++i)
+      opInfo[commutativeList[i]].commutative = true;
+   for (i = 0; i < ARRAY_SIZE(shortFormList); ++i)
+      opInfo[shortFormList[i]].minEncSize = 4;
+   for (i = 0; i < ARRAY_SIZE(noDestList); ++i)
       opInfo[noDestList[i]].hasDest = 0;
-   for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
+   for (i = 0; i < ARRAY_SIZE(noPredList); ++i)
       opInfo[noPredList[i]].predicate = 0;
 
-   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+   for (i = 0; i < ARRAY_SIZE(_initProps); ++i) {
       const struct opProperties *prop = &_initProps[i];
 
       for (int s = 0; s < 3; ++s) {
@@ -254,6 +257,7 @@
    case SV_NTID:
       return 0x2 + 2 * sym->reg.data.sv.index;
    case SV_TID:
+   case SV_COMBINED_TID:
       return 0;
    case SV_SAMPLE_POS:
       return 0; /* sample position is handled differently */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 8938d19..7e05923 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -191,17 +191,15 @@
 {
    unsigned int i, j;
 
-   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   static const operation commutative[] =
    {
-      // ADD, MUL, MAD, FMA, AND, OR, XOR, MAX, MIN, SET_AND, SET_OR, SET_XOR,
-      // SET, SELP, SLCT
-      0x0ce0ca00, 0x0000007e, 0x00000000, 0x00000000
+      OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN,
+      OP_SET_AND, OP_SET_OR, OP_SET_XOR, OP_SET, OP_SELP, OP_SLCT
    };
 
-   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   static const operation shortForm[] =
    {
-      // ADD, MUL, MAD, FMA, AND, OR, XOR, MAX, MIN
-      0x0ce0ca00, 0x00000000, 0x00000000, 0x00000000
+      OP_ADD, OP_MUL, OP_MAD, OP_FMA, OP_AND, OP_OR, OP_XOR, OP_MAX, OP_MIN
    };
 
    static const operation noDest[] =
@@ -240,15 +238,19 @@
 
       opInfo[i].hasDest = 1;
       opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
-      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].commutative = false; /* set below */
       opInfo[i].pseudo = (i < OP_MOV);
       opInfo[i].predicate = !opInfo[i].pseudo;
       opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
-      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+      opInfo[i].minEncSize = 8; /* set below */
    }
-   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
+   for (i = 0; i < ARRAY_SIZE(commutative); ++i)
+      opInfo[commutative[i]].commutative = true;
+   for (i = 0; i < ARRAY_SIZE(shortForm); ++i)
+      opInfo[shortForm[i]].minEncSize = 4;
+   for (i = 0; i < ARRAY_SIZE(noDest); ++i)
       opInfo[noDest[i]].hasDest = 0;
-   for (i = 0; i < sizeof(noPred) / sizeof(noPred[0]); ++i)
+   for (i = 0; i < ARRAY_SIZE(noPred); ++i)
       opInfo[noPred[i]].predicate = 0;
 
    initProps(_initProps, ARRAY_SIZE(_initProps));
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index c144b39..d7898ed 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -102,14 +102,14 @@
       return NULL;
    }
 
-   if (whandle->type != DRM_API_HANDLE_TYPE_SHARED &&
-       whandle->type != DRM_API_HANDLE_TYPE_FD) {
+   if (whandle->type != WINSYS_HANDLE_TYPE_SHARED &&
+       whandle->type != WINSYS_HANDLE_TYPE_FD) {
       debug_printf("%s: attempt to import unsupported handle type %d\n",
                    __FUNCTION__, whandle->type);
       return NULL;
    }
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED)
       ret = nouveau_bo_name_ref(dev, whandle->handle, &bo);
    else
       ret = nouveau_bo_prime_handle_ref(dev, whandle->handle, &bo);
@@ -133,12 +133,12 @@
 {
    whandle->stride = stride;
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
       return nouveau_bo_name_get(bo, &whandle->handle) == 0;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
       whandle->handle = bo->handle;
       return true;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       return nouveau_bo_set_prime(bo, (int *)&whandle->handle) == 0;
    } else {
       return false;
@@ -148,20 +148,21 @@
 static void
 nouveau_disk_cache_create(struct nouveau_screen *screen)
 {
-   uint32_t mesa_timestamp;
-   char *timestamp_str;
-   int res;
+   struct mesa_sha1 ctx;
+   unsigned char sha1[20];
+   char cache_id[20 * 2 + 1]; /* presumably 40 hex chars + NUL; confirm */
 
-   if (disk_cache_get_function_timestamp(nouveau_disk_cache_create,
-                                         &mesa_timestamp)) {
-      res = asprintf(&timestamp_str, "%u", mesa_timestamp);
-      if (res != -1) {
-         screen->disk_shader_cache =
-            disk_cache_create(nouveau_screen_get_name(&screen->base),
-                              timestamp_str, 0);
-         free(timestamp_str);
-      }
-   }
+   _mesa_sha1_init(&ctx);
+   if (!disk_cache_get_function_identifier(nouveau_disk_cache_create,
+                                           &ctx))
+      return; /* no identifier: disk_shader_cache is left untouched */
+   /* Finish the hash and format it as the hex string passed as cache id. */
+   _mesa_sha1_final(&ctx, sha1);
+   disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
+   /* Create the cache from the screen name and that id. */
+   screen->disk_shader_cache =
+      disk_cache_create(nouveau_screen_get_name(&screen->base),
+                        cache_id, 0);
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 1d1fbaa..556bd9b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -64,6 +64,7 @@
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
       return 120;
    case PIPE_CAP_ENDIANNESS:
       return PIPE_ENDIAN_LITTLE;
@@ -224,11 +225,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -270,6 +279,10 @@
       return (eng3d->oclass >= NV40_3D_CLASS) ? 16.0 : 8.0;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 15.0;
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0;
    default:
       debug_printf("unknown paramf %d\n", param);
       return 0;
@@ -332,6 +345,7 @@
       case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+      case PIPE_SHADER_CAP_SCALAR_ISA:
          return 0;
       default:
          debug_printf("unknown vertex shader param %d\n", param);
@@ -382,6 +396,9 @@
       case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
       case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
+      case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
+      case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+      case PIPE_SHADER_CAP_SCALAR_ISA:
          return 0;
       default:
          debug_printf("unknown fragment shader param %d\n", param);
@@ -398,6 +415,7 @@
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
                                 unsigned sample_count,
+                                unsigned storage_sample_count,
                                 unsigned bindings)
 {
    if (sample_count > nv30_screen(pscreen)->max_sample_count)
@@ -406,9 +424,8 @@
    if (!(0x00000017 & (1 << sample_count)))
       return false;
 
-   if (!util_format_is_supported(format, bindings)) {
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
       return false;
-   }
 
    /* shared is always supported */
    bindings &= ~PIPE_BIND_SHARED;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_blit.h b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
index 10fe527..01667bb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_blit.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_blit.h
@@ -17,16 +17,17 @@
 void
 nv50_resource_resolve(struct pipe_context *, const struct pipe_resolve_info *);
 
-#define NV50_BLIT_MODE_PASS  0 /* pass through TEX $t0/$s0 output */
-#define NV50_BLIT_MODE_Z24S8 1 /* encode ZS values for RGBA unorm8 */
-#define NV50_BLIT_MODE_S8Z24 2
-#define NV50_BLIT_MODE_X24S8 3
-#define NV50_BLIT_MODE_S8X24 4
-#define NV50_BLIT_MODE_Z24X8 5
-#define NV50_BLIT_MODE_X8Z24 6
-#define NV50_BLIT_MODE_ZS    7 /* put $t0/$s0 into R, $t1/$s1 into G */
-#define NV50_BLIT_MODE_XS    8 /* put $t1/$s1 into G */
-#define NV50_BLIT_MODES      9
+#define NV50_BLIT_MODE_PASS       0 /* pass through TEX $t0/$s0 output */
+#define NV50_BLIT_MODE_Z24S8      1 /* encode ZS values for RGBA unorm8 */
+#define NV50_BLIT_MODE_S8Z24      2
+#define NV50_BLIT_MODE_X24S8      3
+#define NV50_BLIT_MODE_S8X24      4
+#define NV50_BLIT_MODE_Z24X8      5
+#define NV50_BLIT_MODE_X8Z24      6
+#define NV50_BLIT_MODE_ZS         7 /* put $t0/$s0 into R, $t1/$s1 into G */
+#define NV50_BLIT_MODE_XS         8 /* put $t1/$s1 into G */
+#define NV50_BLIT_MODE_INT_CLAMP  9 /* unsigned to signed integer conversion */
+#define NV50_BLIT_MODES          10
 
 /* CUBE and RECT textures are reinterpreted as 2D(_ARRAY) */
 #define NV50_BLIT_TEXTURE_BUFFER    0
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index a55adfa..0a693d7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -318,8 +318,8 @@
    F3(A, R16G16B16X16_FLOAT, RGBX16_FLOAT, R, G, B, xx, FLOAT, R16_G16_B16_A16, TB),
    F3(A, R16G16B16X16_UNORM, RGBA16_UNORM, R, G, B, xx, UNORM, R16_G16_B16_A16, T),
    F3(A, R16G16B16X16_SNORM, RGBA16_SNORM, R, G, B, xx, SNORM, R16_G16_B16_A16, T),
-   I3(A, R16G16B16X16_SINT, RGBA16_SINT, R, G, B, xx, SINT, R16_G16_B16_A16, T),
-   I3(A, R16G16B16X16_UINT, RGBA16_UINT, R, G, B, xx, UINT, R16_G16_B16_A16, T),
+   I3(A, R16G16B16X16_SINT, RGBA16_SINT, R, G, B, xx, SINT, R16_G16_B16_A16, TR),
+   I3(A, R16G16B16X16_UINT, RGBA16_UINT, R, G, B, xx, UINT, R16_G16_B16_A16, TR),
 
    F2(A, R16G16_FLOAT, RG16_FLOAT, R, G, xx, xx, FLOAT, R16_G16, IB),
    F2(A, R16G16_UNORM, RG16_UNORM, R, G, xx, xx, UNORM, R16_G16, IC),
@@ -337,8 +337,8 @@
    C4(A, R8G8B8A8_SINT, RGBA8_SINT, R, G, B, A, SINT, A8B8G8R8, IR),
    C4(A, R8G8B8A8_UINT, RGBA8_UINT, R, G, B, A, UINT, A8B8G8R8, IR),
    F3(A, R8G8B8X8_SNORM, RGBA8_SNORM, R, G, B, xx, SNORM, A8B8G8R8, T),
-   I3(A, R8G8B8X8_SINT, RGBA8_SINT, R, G, B, xx, SINT, A8B8G8R8, T),
-   I3(A, R8G8B8X8_UINT, RGBA8_UINT, R, G, B, xx, UINT, A8B8G8R8, T),
+   I3(A, R8G8B8X8_SINT, RGBA8_SINT, R, G, B, xx, SINT, A8B8G8R8, TR),
+   I3(A, R8G8B8X8_UINT, RGBA8_UINT, R, G, B, xx, UINT, A8B8G8R8, TR),
 
    F2(A, R8G8_UNORM, RG8_UNORM, R, G, xx, xx, UNORM, G8R8, IB),
    F2(A, R8G8_SNORM, RG8_SNORM, R, G, xx, xx, SNORM, G8R8, IC),
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index 5d03925..c64b045 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -66,6 +66,7 @@
 #define NV50_TEXVIEW_SCALED_COORDS     (1 << 0)
 #define NV50_TEXVIEW_FILTER_MSAA8      (1 << 1)
 #define NV50_TEXVIEW_ACCESS_RESOLVE    (1 << 2)
+#define NV50_TEXVIEW_IMAGE_GM107       (1 << 3)
 
 
 /* Internal functions:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 6fd2982..cea6818 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -46,6 +46,7 @@
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
                                 unsigned sample_count,
+                                unsigned storage_sample_count,
                                 unsigned bindings)
 {
    if (sample_count > 8)
@@ -55,7 +56,7 @@
    if (sample_count == 8 && util_format_get_blocksizebits(format) >= 128)
       return false;
 
-   if (!util_format_is_supported(format, bindings))
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
       return false;
 
    switch (format) {
@@ -109,6 +110,8 @@
       return 128 * 1024 * 1024;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return 330;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return 140;
    case PIPE_CAP_MAX_RENDER_TARGETS:
       return 8;
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -276,11 +279,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -378,6 +389,8 @@
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_SCALAR_ISA:
+      return 1;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
       return 0;
@@ -398,6 +411,10 @@
       return 16.0f;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 4.0f;
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
    }
 
    NOUVEAU_ERR("unknown PIPE_CAPF %d\n", param);
@@ -982,7 +999,7 @@
    nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
 
    screen->TPs = util_bitcount(value & 0xffff);
-   screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
+   screen->MPsInTP = util_bitcount(value & 0x0f000000);
 
    screen->mp_count = screen->TPs * screen->MPsInTP;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 7f32900..2cbbdc0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -185,6 +185,7 @@
                fb->cbufs[0]->format,
                fb->cbufs[0]->texture->target,
                fb->cbufs[0]->texture->nr_samples,
+               fb->cbufs[0]->texture->nr_storage_samples,
                PIPE_BIND_BLENDABLE);
       /* If we already have alphatest code, we have to keep updating
        * it. However we only have to have different code if the current RT0 is
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 037e14a..de840eb 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -892,6 +892,10 @@
    bool tex_s = false;
    bool cvt_un8 = false;
 
+   bool int_clamp = mode == NV50_BLIT_MODE_INT_CLAMP;
+   if (int_clamp)
+      mode = NV50_BLIT_MODE_PASS;
+
    if (mode != NV50_BLIT_MODE_PASS &&
        mode != NV50_BLIT_MODE_Z24X8 &&
        mode != NV50_BLIT_MODE_X8Z24)
@@ -936,6 +940,10 @@
                target, tc, ureg_DECL_sampler(ureg, 0));
    }
 
+   /* handle unsigned to signed integer conversions: clamp to 0x7fffffff */
+   if (int_clamp)
+      ureg_UMIN(ureg, data, ureg_src(data), ureg_imm1u(ureg, 0x7fffffff));
+
    if (cvt_un8) {
       struct ureg_src mask;
       struct ureg_src scale;
@@ -1058,6 +1066,9 @@
          return NV50_BLIT_MODE_XS;
       }
    default:
+      if (util_format_is_pure_uint(info->src.format) &&
+          util_format_is_pure_sint(info->dst.format))
+         return NV50_BLIT_MODE_INT_CLAMP;
       return NV50_BLIT_MODE_PASS;
    }
 }
@@ -1658,6 +1669,13 @@
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    bool eng3d = FALSE;
 
+   if (info->src.box.width == 0 || info->src.box.height == 0 ||
+       info->dst.box.width == 0 || info->dst.box.height == 0) {
+      pipe_debug_message(&nv50->base.debug, ERROR,
+                         "Blit with zero-size src or dst box");
+      return;
+   }
+
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
index 7c5ec8f..38c2e86 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme
@@ -550,3 +550,33 @@
 qbw_done:
    exit send (extrinsrt 0x0 $r4 0x0 0x10 0x10)
    maddrsend 0x44
+
+/* NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE:
+ *
+ * This sets basically all the conservative rasterization state. It sets
+ * CONSERVATIVE_RASTER to one while doing so.
+ *
+ * arg = biasx | biasy<<4 | (dilation*4)<<8 | mode<<10
+ */
+.section #mme9097_conservative_raster_state
+   /* Mode and dilation */
+   maddr 0x1d00 /* SCRATCH[0] */
+   send 0x0 /* unknown */
+   send (extrinsrt 0x0 $r1 8 3 23) /* value */
+   mov $r2 0x7
+   send (extrinsrt 0x0 $r2 0 3 23) /* write mask */
+   maddr 0x18c4 /* FIRMWARE[4] */
+   mov $r2 0x831
+   send (extrinsrt 0x0 $r2 0 12 11) /* sends 0x418800 */
+   /* Subpixel precision */
+   mov $r2 (extrinsrt 0x0 $r1 0 4 0)
+   mov $r2 (extrinsrt $r2 $r1 4 4 8)
+   maddr 0x8287 /* SUBPIXEL_PRECISION[0] (incrementing by 8 methods) */
+   mov $r3 16 /* loop counter */
+crs_loop:
+   mov $r3 (add $r3 -1)
+   branz $r3 #crs_loop
+   send $r2
+   /* Enable */
+   exit maddr 0x1452 /* CONSERVATIVE_RASTER */
+   send 0x1
diff --git a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
index 9618da6..49c0891 100644
--- a/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
+++ b/src/gallium/drivers/nouveau/nvc0/mme/com9097.mme.h
@@ -373,3 +373,24 @@
 	0x840100c2,
 	0x00110071,
 };
+
+uint32_t mme9097_conservative_raster_state[] = {
+	0x07400021,
+	0x00000041,
+	0xb8d04042,
+/* 0x000c: crs_loop */
+	0x0001c211,
+	0xb8c08042,
+	0x06310021,
+	0x020c4211,
+	0x5b008042,
+	0x01004212,
+	0x41085212,
+	0x20a1c021,
+	0x00040311,
+	0xffffdb11,
+	0xffffd817,
+	0x00001041,
+	0x051480a1,
+	0x00004041,
+};
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
index d7245fb..c5456e4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h
@@ -447,6 +447,10 @@
 #define NVC0_3D_VIEWPORT_TRANSLATE_Z__ESIZE			0x00000020
 #define NVC0_3D_VIEWPORT_TRANSLATE_Z__LEN			0x00000010
 
+#define NVC0_3D_SUBPIXEL_PRECISION(i0)			       (0x00000a1c + 0x20*(i0))
+#define NVC0_3D_SUBPIXEL_PRECISION__ESIZE			0x00000020
+#define NVC0_3D_SUBPIXEL_PRECISION__LEN				0x00000010
+
 #define NVC0_3D_VIEWPORT_HORIZ(i0)			       (0x00000c00 + 0x10*(i0))
 #define NVC0_3D_VIEWPORT_HORIZ__ESIZE				0x00000010
 #define NVC0_3D_VIEWPORT_HORIZ__LEN				0x00000010
@@ -780,6 +784,7 @@
 #define NVC0_3D_UNK1140					0x00001140
 
 #define NVC0_3D_UNK1144					0x00001144
+#define NVC0_3D_CONSERVATIVE_RASTER			0x00001148
 
 #define NVC0_3D_VTX_ATTR_DEFINE				0x0000114c
 #define NVC0_3D_VTX_ATTR_DEFINE_ATTR__MASK			0x000000ff
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index 11635c9..4963493 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -181,7 +181,7 @@
    /* Invalidate all 3D constbufs because they are aliased with COMPUTE. */
    for (s = 0; s < 5; s++) {
       nvc0->constbuf_dirty[s] |= nvc0->constbuf_valid[s];
-      nvc0->state.uniform_buffer_bound[s] = 0;
+      nvc0->state.uniform_buffer_bound[s] = false;
    }
    nvc0->dirty_3d |= NVC0_NEW_3D_CONSTBUF;
 }
@@ -203,19 +203,18 @@
          assert(i == 0); /* we really only want OpenGL uniforms here */
          assert(nvc0->constbuf[s][0].u.data);
 
-         if (nvc0->state.uniform_buffer_bound[s] < size) {
-            nvc0->state.uniform_buffer_bound[s] = align(size, 0x100);
+         if (!nvc0->state.uniform_buffer_bound[s]) {
+            nvc0->state.uniform_buffer_bound[s] = true;
 
             BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
-            PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]);
+            PUSH_DATA (push, NVC0_MAX_CONSTBUF_SIZE);
             PUSH_DATAh(push, bo->offset + base);
             PUSH_DATA (push, bo->offset + base);
             BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
             PUSH_DATA (push, (0 << 8) | 1);
          }
          nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
-                         base, nvc0->state.uniform_buffer_bound[s],
-                         0, (size + 3) / 4,
+                         base, NVC0_MAX_CONSTBUF_SIZE, 0, (size + 3) / 4,
                          nvc0->constbuf[s][0].u.data);
       } else {
          struct nv04_resource *res =
@@ -236,7 +235,7 @@
             PUSH_DATA (push, (i << 8) | 0);
          }
          if (i == 0)
-            nvc0->state.uniform_buffer_bound[s] = 0;
+            nvc0->state.uniform_buffer_bound[s] = false;
       }
    }
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 0729c88..77237a3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -62,6 +62,8 @@
 #define NVC0_NEW_3D_DRIVERCONST  (1 << 27)
 #define NVC0_NEW_3D_WINDOW_RECTS (1 << 28)
 
+#define NVC0_NEW_3D_SAMPLE_LOCATIONS (1 << 29)
+
 #define NVC0_NEW_CP_PROGRAM   (1 << 0)
 #define NVC0_NEW_CP_SURFACES  (1 << 1)
 #define NVC0_NEW_CP_TEXTURES  (1 << 2)
@@ -134,20 +136,21 @@
 #define NVC0_CB_AUX_UBO_SIZE        ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
 /* 8 sets of 32-bits integer pairs sample offsets */
 #define NVC0_CB_AUX_SAMPLE_INFO     0x1a0 /* FP */
-#define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
+/* 256 bytes, though only 64 bytes used before GM200 */
+#define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 2 * 4 * 4)
 /* draw parameters (index bais, base instance, drawid) */
 #define NVC0_CB_AUX_DRAW_INFO       0x1a0 /* VP */
 /* 32 user buffers, at 4 32-bits integers each */
-#define NVC0_CB_AUX_BUF_INFO(i)     0x220 + (i) * 4 * 4
+#define NVC0_CB_AUX_BUF_INFO(i)     0x2a0 + (i) * 4 * 4
 #define NVC0_CB_AUX_BUF_SIZE        (NVC0_MAX_BUFFERS * 4 * 4)
 /* 8 surfaces, at 16 32-bits integers each */
-#define NVC0_CB_AUX_SU_INFO(i)      0x420 + (i) * 16 * 4
+#define NVC0_CB_AUX_SU_INFO(i)      0x4a0 + (i) * 16 * 4
 #define NVC0_CB_AUX_SU_SIZE         (NVC0_MAX_IMAGES * 16 * 4)
 /* 1 64-bits address and 1 32-bits sequence */
-#define NVC0_CB_AUX_MP_INFO         0x620
+#define NVC0_CB_AUX_MP_INFO         0x6a0
 #define NVC0_CB_AUX_MP_SIZE         3 * 4
 /* 512 64-byte blocks for bindless image handles */
-#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x630 + (i) * 16 * 4
+#define NVC0_CB_AUX_BINDLESS_INFO(i) 0x6b0 + (i) * 16 * 4
 #define NVC0_CB_AUX_BINDLESS_SIZE   (NVE4_IMG_MAX_HANDLES * 16 * 4)
 /* 4 32-bits floats for the vertex runout, put at the end */
 #define NVC0_CB_AUX_RUNOUT_INFO     NVC0_CB_USR_SIZE + (NVC0_CB_AUX_SIZE * 6)
@@ -229,6 +232,8 @@
    struct list_head img_head;
 
    struct pipe_framebuffer_state framebuffer;
+   bool sample_locations_enabled;
+   uint8_t sample_locations[2 * 4 * 8];
    struct pipe_blend_color blend_colour;
    struct pipe_stencil_ref stencil_ref;
    struct pipe_poly_stipple stipple;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
index eeacc71..7aa0633 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_macros.h
@@ -35,6 +35,8 @@
 
 #define NVC0_3D_MACRO_QUERY_BUFFER_WRITE			0x00003858
 
-#define NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT       0x00003860
+#define NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT			0x00003860
+
+#define NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE			0x00003868
 
 #endif /* __NVC0_MACROS_H__ */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 9520d98..57d9875 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -481,6 +481,9 @@
          }
       }
    }
+   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
+   if (info->prop.fp.readsSampleLocations && info->target >= NVISA_GM200_CHIPSET)
+      fp->hdr[5] |= 0x30000000;
 
    for (i = 0; i < info->numOutputs; ++i) {
       if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index df5723d..726160d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -429,9 +429,9 @@
     * mov b32 $r6 $pm6
     * mov b32 $r7 $pm7
     * set $p0 0x1 eq u32 $r8 0x0
-    * mov b32 $r10 c7[0x620]
+    * mov b32 $r10 c7[0x6a0]
     * ext u32 $r8 $r12 0x414
-    * mov b32 $r11 c7[0x624]
+    * mov b32 $r11 c7[0x6a4]
     * sched 0x04 0x2e 0x04 0x20 0x20 0x28 0x04
     * ext u32 $r9 $r12 0x208
     * (not $p0) exit
@@ -449,7 +449,7 @@
     * add b32 $r12 $c $r12 $r9
     * st b128 wt g[$r10d] $r0q
     * sched 0x4 0x2c 0x20 0x04 0x2e 0x00 0x00
-    * mov b32 $r0 c7[0x628]
+    * mov b32 $r0 c7[0x6a8]
     * add b32 $r13 $r13 0x0 $c
     * $p1 st b128 wt g[$r12d+0x40] $r4q
     * st b32 wt g[$r12d+0x50] $r0
@@ -467,9 +467,9 @@
    0x2c00000028019c04ULL,
    0x2c0000002c01dc04ULL,
    0x190e0000fc81dc03ULL,
-   0x28005c1880029de4ULL,
+   0x28005c1a80029de4ULL,
    0x7000c01050c21c03ULL,
-   0x28005c189002dde4ULL,
+   0x28005c1a9002dde4ULL,
    0x204282020042e047ULL,
    0x7000c00820c25c03ULL,
    0x80000000000021e7ULL,
@@ -487,7 +487,7 @@
    0x4801000024c31c03ULL,
    0x9400000000a01fc5ULL,
    0x200002e04202c047ULL,
-   0x28005c18a0001de4ULL,
+   0x28005c1aa0001de4ULL,
    0x0800000000d35c42ULL,
    0x9400000100c107c5ULL,
    0x9400000140c01f85ULL,
@@ -510,9 +510,9 @@
    0x86400000051c001aULL,
    0x86400000059c001eULL,
    0xdb201c007f9c201eULL,
-   0x64c03ce0c41c002aULL,
+   0x64c03ce0d41c002aULL,
    0xc00000020a1c3021ULL,
-   0x64c03ce0c49c002eULL,
+   0x64c03ce0d49c002eULL,
    0x0810a0808010b810ULL,
    0xc0000001041c3025ULL,
    0x180000000020003cULL,
@@ -530,7 +530,7 @@
    0xe0840000049c3032ULL,
    0xfe800000001c2800ULL,
    0x080000b81080b010ULL,
-   0x64c03ce0c51c0002ULL,
+   0x64c03ce0d51c0002ULL,
    0xe08040007f9c3436ULL,
    0xfe80000020043010ULL,
    0xfc800000281c3000ULL,
@@ -554,10 +554,10 @@
    0x001f8401fc2007a1ULL, /* sched (st 0x1 wr 0x5) (st 0x1 wt 0x1) (st 0x1)         */
    0xf0c8000000b70007ULL, /* mov $r7 $pm7                                           */
    0x5b6403800087ff07ULL, /* isetp eq u32 and $p0 0x1 0x0 $r8 0x1                   */
-   0x4c98079c1887000aULL, /* mov $r10 c7[0x620] 0xf                                 */
+   0x4c98079c1a87000aULL, /* mov $r10 c7[0x6a0] 0xf                                 */
    0x001fa400fc2017e1ULL, /* sched (st 0x1 wt 0x2) (st 0x1) (st 0x9)                */
    0x3800000091470c08ULL, /* bfe u32 $r8 $r12 0x914                                 */
-   0x4c98079c1897000bULL, /* mov $r11 c7[0x624] 0xf                                 */
+   0x4c98079c1a97000bULL, /* mov $r11 c7[0x6a4] 0xf                                 */
    0x3800000020870c09ULL, /* bfe u32 $r9 $r12 0x208                                 */
    0x001c1800fc2007edULL, /* sched (st 0xd) (st 0x1) (st 0x6 wr 0x0)                */
    0xe30000000008000fULL, /* not $p0 exit                                           */
@@ -578,7 +578,7 @@
    0x003f983c1c4007e1ULL, /* sched (st 0x1) (st 0x2 rd 0x0 wt 0x3c) (st 0x6 wt 0x1) */
    0x5c1008000ff70d0dULL, /* iadd x $r13 $r13 0x0                                   */
    0xbfd0000000070a00ULL, /* st e wt b128 g[$r10] $r0 0x1                           */
-   0x4c98079c18a70000ULL, /* mov $r0 c7[0x628] 0xf                                  */
+   0x4c98079c1aa70000ULL, /* mov $r0 c7[0x6a8] 0xf                                  */
    0x001fbc00fc2007e6ULL, /* sched (st 0x1) (st 0x1) (st 0xf)                       */
    0xbfd0000004010c04ULL, /* $p1 st e wt b128 g[$r12+0x40] $r4 0x1                  */
    0xbf90000005070c00ULL, /* st e wt b32 g[$r12+0x50] $r0 0x1                       */
@@ -1760,14 +1760,14 @@
     * mov b32 $r6 $pm6
     * mov b32 $r7 $pm7
     * set $p0 0x1 eq u32 $r8 0x0
-    * mov b32 $r10 c15[0x620]
-    * mov b32 $r11 c15[0x624]
+    * mov b32 $r10 c15[0x6a0]
+    * mov b32 $r11 c15[0x6a4]
     * ext u32 $r8 $r9 0x414
     * (not $p0) exit
     * mul $r8 u32 $r8 u32 48
     * add b32 $r10 $c $r10 $r8
     * add b32 $r11 $r11 0x0 $c
-    * mov b32 $r8 c15[0x628]
+    * mov b32 $r8 c15[0x6a8]
     * st b128 wt g[$r10d+0x00] $r0q
     * st b128 wt g[$r10d+0x10] $r4q
     * st b32 wt g[$r10d+0x20] $r8
@@ -1783,14 +1783,14 @@
    0x2c00000028019c04ULL,
    0x2c0000002c01dc04ULL,
    0x190e0000fc81dc03ULL,
-   0x28007c1880029de4ULL,
-   0x28007c189002dde4ULL,
+   0x28007c1a80029de4ULL,
+   0x28007c1a9002dde4ULL,
    0x7000c01050921c03ULL,
    0x80000000000021e7ULL,
    0x10000000c0821c02ULL,
    0x4801000020a29c03ULL,
    0x0800000000b2dc42ULL,
-   0x28007c18a0021de4ULL,
+   0x28007c1aa0021de4ULL,
    0x9400000000a01fc5ULL,
    0x9400000040a11fc5ULL,
    0x9400000080a21f85ULL,
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index eb50149..daa4edb 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -42,6 +42,7 @@
                                 enum pipe_format format,
                                 enum pipe_texture_target target,
                                 unsigned sample_count,
+                                unsigned storage_sample_count,
                                 unsigned bindings)
 {
    const struct util_format_description *desc = util_format_description(format);
@@ -51,15 +52,15 @@
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
       return false;
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    /* Short-circuit the rest of the logic -- this is used by the state tracker
     * to determine valid MS levels in a no-attachments scenario.
     */
    if (format == PIPE_FORMAT_NONE && bindings & PIPE_BIND_RENDER_TARGET)
       return true;
 
-   if (!util_format_is_supported(format, bindings))
-      return false;
-
    if ((bindings & PIPE_BIND_SAMPLER_VIEW) && (target != PIPE_BUFFER))
       if (util_format_get_blocksizebits(format) == 3 * 32)
          return false;
@@ -88,13 +89,6 @@
                  PIPE_BIND_SHARED);
 
    if (bindings & PIPE_BIND_SHADER_IMAGE) {
-      if (sample_count > 0 &&
-          nouveau_screen(pscreen)->class_3d >= GM107_3D_CLASS) {
-         /* MS images are currently unsupported on Maxwell because they have to
-          * be handled explicitly. */
-         return false;
-      }
-
       if (format == PIPE_FORMAT_B8G8R8A8_UNORM &&
           nouveau_screen(pscreen)->class_3d < NVE4_3D_CLASS) {
          /* This should work on Fermi, but for currently unknown reasons it
@@ -134,6 +128,8 @@
       return 128 * 1024 * 1024;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return 430;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return 140;
    case PIPE_CAP_MAX_RENDER_TARGETS:
       return 8;
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
@@ -172,6 +168,8 @@
       return 30;
    case PIPE_CAP_MAX_WINDOW_RECTANGLES:
       return NVC0_MAX_WINDOW_RECTANGLES;
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+      return class_3d >= GM200_3D_CLASS ? 8 : 0;
 
    /* supported caps */
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
@@ -264,7 +262,13 @@
    case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
    case PIPE_CAP_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return class_3d >= GM200_3D_CLASS;
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+      return class_3d >= GP100_3D_CLASS;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
    case PIPE_CAP_TGSI_BALLOT:
    case PIPE_CAP_BINDLESS_TEXTURE:
@@ -304,11 +308,13 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -381,7 +387,7 @@
    case PIPE_SHADER_CAP_MAX_OUTPUTS:
       return 32;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
-      return 65536;
+      return NVC0_MAX_CONSTBUF_SIZE;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
       return NVC0_MAX_PIPE_CONSTBUFS;
    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
@@ -415,6 +421,8 @@
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_SCALAR_ISA:
+      return 1;
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return NVC0_MAX_BUFFERS;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
@@ -438,6 +446,8 @@
 static float
 nvc0_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 {
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
    switch (param) {
    case PIPE_CAPF_MAX_LINE_WIDTH:
    case PIPE_CAPF_MAX_LINE_WIDTH_AA:
@@ -450,6 +460,12 @@
       return 16.0f;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 15.0f;
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+      return 0.0f;
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+      return class_3d >= GM200_3D_CLASS ? 0.75f : 0.0f;
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return class_3d >= GM200_3D_CLASS ? 0.25f : 0.0f;
    }
 
    NOUVEAU_ERR("unknown PIPE_CAPF %d\n", param);
@@ -527,6 +543,36 @@
 }
 
 static void
+nvc0_screen_get_sample_pixel_grid(struct pipe_screen *pscreen,
+                                  unsigned sample_count,
+                                  unsigned *width, unsigned *height)
+{
+   switch (sample_count) {
+   case 0:
+   case 1:
+      /* this could be 4x4, but the GL state tracker makes it difficult to
+       * create a 1x MSAA texture and smaller grids save CB space */
+      *width = 2;
+      *height = 4;
+      break;
+   case 2:
+      *width = 2;
+      *height = 4;
+      break;
+   case 4:
+      *width = 2;
+      *height = 2;
+      break;
+   case 8:
+      *width = 1;
+      *height = 2;
+      break;
+   default:
+      assert(0);
+   }
+}
+
+static void
 nvc0_screen_destroy(struct pipe_screen *pscreen)
 {
    struct nvc0_screen *screen = nvc0_screen(pscreen);
@@ -788,6 +834,40 @@
    return 0;
 }
 
+void
+nvc0_screen_bind_cb_3d(struct nvc0_screen *screen, bool *can_serialize,
+                       int stage, int index, int size, uint64_t addr)
+{
+   assert(stage != 5);
+
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+
+   if (screen->base.class_3d >= GM107_3D_CLASS) {
+      struct nvc0_cb_binding *binding = &screen->cb_bindings[stage][index];
+
+      // TODO: Better figure out the conditions in which this is needed
+      bool serialize = binding->addr == addr && binding->size != size;
+      if (can_serialize)
+         serialize = serialize && *can_serialize;
+      if (serialize) {
+         IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
+         if (can_serialize)
+            *can_serialize = false;
+      }
+
+      binding->addr = addr;
+      binding->size = size;
+   }
+
+   if (size >= 0) {
+      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+      PUSH_DATA (push, size);
+      PUSH_DATAh(push, addr);
+      PUSH_DATA (push, addr);
+   }
+   IMMED_NVC0(push, NVC0_3D(CB_BIND(stage)), (index << 4) | (size >= 0));
+}
+
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
@@ -852,6 +932,7 @@
    pscreen->get_param = nvc0_screen_get_param;
    pscreen->get_shader_param = nvc0_screen_get_shader_param;
    pscreen->get_paramf = nvc0_screen_get_paramf;
+   pscreen->get_sample_pixel_grid = nvc0_screen_get_sample_pixel_grid;
    pscreen->get_driver_query_info = nvc0_screen_get_driver_query_info;
    pscreen->get_driver_query_group_info = nvc0_screen_get_driver_query_group_info;
 
@@ -1197,6 +1278,7 @@
    MK_MACRO(NVC0_3D_MACRO_DRAW_ARRAYS_INDIRECT_COUNT, mme9097_draw_arrays_indirect_count);
    MK_MACRO(NVC0_3D_MACRO_DRAW_ELEMENTS_INDIRECT_COUNT, mme9097_draw_elts_indirect_count);
    MK_MACRO(NVC0_3D_MACRO_QUERY_BUFFER_WRITE, mme9097_query_buffer_write);
+   MK_MACRO(NVC0_3D_MACRO_CONSERVATIVE_RASTER_STATE, mme9097_conservative_raster_state);
    MK_MACRO(NVC0_CP_MACRO_LAUNCH_GRID_INDIRECT, mme90c0_launch_grid_indirect);
 
    BEGIN_NVC0(push, NVC0_3D(RASTERIZE_ENABLE), 1);
@@ -1229,14 +1311,14 @@
 
    /* XXX: Compute and 3D are somehow aliased on Fermi. */
    for (i = 0; i < 5; ++i) {
+      unsigned j = 0;
+      for (j = 0; j < 16; j++)
+         screen->cb_bindings[i][j].size = -1;
+
       /* TIC and TSC entries for each unit (nve4+ only) */
       /* auxiliary constants (6 user clip planes, base instance id) */
-      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
-      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
-      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
-      BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
-      PUSH_DATA (push, (15 << 4) | 1);
+      nvc0_screen_bind_cb_3d(screen, NULL, i, 15, NVC0_CB_AUX_SIZE,
+                             screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
       if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
          unsigned j;
          BEGIN_1IC0(push, NVC0_3D(CB_POS), 9);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index efd62a8..d8223ba 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -16,7 +16,9 @@
 #define NVE4_IMG_MAX_HANDLES 512
 
 /* doesn't count driver-reserved slot */
-#define NVC0_MAX_PIPE_CONSTBUFS         15
+#define NVC0_MAX_PIPE_CONSTBUFS 15
+#define NVC0_MAX_CONST_BUFFERS  16
+#define NVC0_MAX_CONSTBUF_SIZE  65536
 
 #define NVC0_MAX_SURFACE_SLOTS 16
 
@@ -53,12 +55,17 @@
    uint8_t tls_required; /* bitmask of shader types using l[] */
    uint8_t clip_enable;
    uint32_t clip_mode;
-   uint32_t uniform_buffer_bound[6];
+   bool uniform_buffer_bound[6];
    struct nvc0_transform_feedback_state *tfb;
    bool seamless_cube_map;
    bool post_depth_coverage;
 };
 
+struct nvc0_cb_binding {
+   uint64_t addr;
+   int size;
+};
+
 struct nvc0_screen {
    struct nouveau_screen base;
 
@@ -114,6 +121,9 @@
       bool mp_counters_enabled;
    } pm;
 
+   /* only maintained on Maxwell+ */
+   struct nvc0_cb_binding cb_bindings[5][NVC0_MAX_CONST_BUFFERS];
+
    struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
    struct nouveau_object *eng2d;
    struct nouveau_object *m2mf;
@@ -146,6 +156,9 @@
 
 int nvc0_screen_resize_text_area(struct nvc0_screen *, uint64_t);
 
+// 3D Only
+void nvc0_screen_bind_cb_3d(struct nvc0_screen *, bool *, int, int, int, uint64_t);
+
 static inline void
 nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 3e204f0..d9ee625 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -327,6 +327,20 @@
 
     SB_IMMED_3D(so, PIXEL_CENTER_INTEGER, !cso->half_pixel_center);
 
+    if (class_3d >= GM200_3D_CLASS) {
+        if (cso->conservative_raster_mode != PIPE_CONSERVATIVE_RASTER_OFF) {
+            bool post_snap = cso->conservative_raster_mode ==
+                PIPE_CONSERVATIVE_RASTER_POST_SNAP;
+            uint32_t state = cso->subpixel_precision_x;
+            state |= cso->subpixel_precision_y << 4;
+            state |= (uint32_t)(cso->conservative_raster_dilate * 4) << 8;
+            state |= (post_snap || class_3d < GP100_3D_CLASS) ? 1 << 10 : 0;
+            SB_IMMED_3D(so, MACRO_CONSERVATIVE_RASTER_STATE, state);
+        } else {
+            SB_IMMED_3D(so, CONSERVATIVE_RASTER, 0);
+        }
+    }
+
     assert(so->size <= ARRAY_SIZE(so->state));
     return (void *)so;
 }
@@ -840,7 +854,21 @@
 
     util_copy_framebuffer_state(&nvc0->framebuffer, fb);
 
-    nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER;
+    nvc0->dirty_3d |= NVC0_NEW_3D_FRAMEBUFFER | NVC0_NEW_3D_SAMPLE_LOCATIONS;
+}
+
+/* Latch user sample locations (truncated to our storage); 0/NULL disables. */
+static void
+nvc0_set_sample_locations(struct pipe_context *pipe,
+                          size_t size, const uint8_t *locations)
+{
+    struct nvc0_context *nvc0 = nvc0_context(pipe);
+    nvc0->sample_locations_enabled = size && locations;
+    if (size > sizeof(nvc0->sample_locations))
+       size = sizeof(nvc0->sample_locations);
+    if (nvc0->sample_locations_enabled) /* memcpy() from NULL is UB */
+       memcpy(nvc0->sample_locations, locations, size);
+    nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLE_LOCATIONS;
 }
 
 static void
@@ -1393,6 +1421,7 @@
    pipe->set_min_samples = nvc0_set_min_samples;
    pipe->set_constant_buffer = nvc0_set_constant_buffer;
    pipe->set_framebuffer_state = nvc0_set_framebuffer_state;
+   pipe->set_sample_locations = nvc0_set_sample_locations;
    pipe->set_polygon_stipple = nvc0_set_polygon_stipple;
    pipe->set_scissor_states = nvc0_set_scissor_states;
    pipe->set_viewport_states = nvc0_set_viewport_states;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 8e2192d..4f004a4 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -71,13 +71,132 @@
    PUSH_DATA (push, 0);      // base layer
 }
 
+static uint32_t
+gm200_encode_cb_sample_location(uint8_t x, uint8_t y)
+{
+   static const uint8_t lut[] = {
+      0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
+      0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
+   uint32_t result = 0;
+   /* S0.12 representation for TGSI_OPCODE_INTERP_SAMPLE */
+   result |= lut[x] << 8 | lut[y] << 24;
+   /* fill in gaps with data in a representation for SV_SAMPLE_POS */
+   result |= x << 12 | y << 28;
+   return result;
+}
+
+/* GM200+: upload sample positions to the aux CB and to method 0x11e0. */
+static void
+gm200_validate_sample_locations(struct nvc0_context *nvc0, unsigned ms)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   unsigned grid_width, grid_height, hw_grid_width;
+   uint8_t sample_locations[16][2];
+   unsigned cb[64] = {}; /* zeroed: only ms of 8 slots per pixel are written */
+   unsigned i, pixel, pixel_y, pixel_x, sample;
+   uint32_t packed_locations[4] = {};
+
+   screen->base.base.get_sample_pixel_grid(
+      &screen->base.base, ms, &grid_width, &grid_height);
+
+   /* get_sample_pixel_grid() exposes 2x4 for 1x msaa */
+   hw_grid_width = (ms == 1) ? 4 : grid_width;
+
+   if (nvc0->sample_locations_enabled) {
+      uint8_t locations[2 * 4 * 8];
+      memcpy(locations, nvc0->sample_locations, sizeof(locations));
+      util_sample_locations_flip_y(
+         &screen->base.base, nvc0->framebuffer.height, ms, locations);
+
+      for (pixel = 0; pixel < hw_grid_width*grid_height; pixel++) {
+         for (sample = 0; sample < ms; sample++) {
+            unsigned pixel_x = pixel % hw_grid_width;
+            unsigned pixel_y = pixel / hw_grid_width;
+            unsigned wi = pixel * ms + sample;
+            unsigned ri = (pixel_y * grid_width + pixel_x % grid_width);
+            ri = ri * ms + sample;
+            sample_locations[wi][0] = locations[ri] & 0xf;
+            sample_locations[wi][1] = 16 - (locations[ri] >> 4);
+         }
+      }
+   } else {
+      const uint8_t (*ptr)[2] = nvc0_get_sample_locations(ms);
+      for (i = 0; i < 16; i++) {
+         sample_locations[i][0] = ptr[i % ms][0];
+         sample_locations[i][1] = ptr[i % ms][1];
+      }
+   }
+
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+   BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 64);
+   PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO);
+   for (pixel_y = 0; pixel_y < 4; pixel_y++) {
+      for (pixel_x = 0; pixel_x < 2; pixel_x++) {
+         for (sample = 0; sample < ms; sample++) {
+            unsigned write_index = (pixel_y * 2 + pixel_x) * 8 + sample;
+            unsigned read_index = pixel_y % grid_height * hw_grid_width;
+            uint8_t x, y;
+            read_index += pixel_x % grid_width;
+            read_index = read_index * ms + sample;
+            x = sample_locations[read_index][0];
+            y = sample_locations[read_index][1];
+            cb[write_index] = gm200_encode_cb_sample_location(x, y);
+         }
+      }
+   }
+   PUSH_DATAp(push, cb, 64);
+
+   for (i = 0; i < 16; i++) {
+      packed_locations[i / 4] |= sample_locations[i][0] << ((i % 4) * 8);
+      packed_locations[i / 4] |= sample_locations[i][1] << ((i % 4) * 8 + 4);
+   }
+
+   BEGIN_NVC0(push, SUBC_3D(0x11e0), 4);
+   PUSH_DATAp(push, packed_locations, 4);
+}
+
+static void
+nvc0_validate_sample_locations(struct nvc0_context *nvc0, unsigned ms)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
+   unsigned i;
+
+   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
+   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+   BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
+   PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO);
+   for (i = 0; i < ms; i++) {
+      float xy[2];
+      nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy);
+      PUSH_DATAf(push, xy[0]);
+      PUSH_DATAf(push, xy[1]);
+   }
+}
+
+static void
+validate_sample_locations(struct nvc0_context *nvc0)
+{
+   unsigned ms = util_framebuffer_get_num_samples(&nvc0->framebuffer);
+
+   if (nvc0->screen->base.class_3d >= GM200_3D_CLASS)
+      gm200_validate_sample_locations(nvc0, ms);
+   else
+      nvc0_validate_sample_locations(nvc0, ms);
+}
+
 static void
 nvc0_validate_fb(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
-   struct nvc0_screen *screen = nvc0->screen;
-   unsigned i, ms;
+   unsigned i;
    unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
    unsigned nr_cbufs = fb->nr_cbufs;
    bool serialize = false;
@@ -197,33 +316,6 @@
    PUSH_DATA (push, (076543210 << 4) | nr_cbufs);
    IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode);
 
-   ms = 1 << ms_mode;
-   BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-   PUSH_DATA (push, NVC0_CB_AUX_SIZE);
-   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
-   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
-   BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
-   PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO);
-   for (i = 0; i < ms; i++) {
-      float xy[2];
-      nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy);
-      PUSH_DATAf(push, xy[0]);
-      PUSH_DATAf(push, xy[1]);
-   }
-
-   if (screen->base.class_3d >= GM200_3D_CLASS) {
-      const uint8_t (*ptr)[2] = nvc0_get_sample_locations(ms);
-      uint32_t val[4] = {};
-
-      for (i = 0; i < 16; i++) {
-         val[i / 4] |= ptr[i % ms][0] << (((i % 4) * 8) + 0);
-         val[i / 4] |= ptr[i % ms][1] << (((i % 4) * 8) + 4);
-      }
-
-      BEGIN_NVC0(push, SUBC_3D(0x11e0), 4);
-      PUSH_DATAp(push, val, 4);
-   }
-
    if (serialize)
       IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
 
@@ -473,9 +565,10 @@
 static void
 nvc0_constbufs_validate(struct nvc0_context *nvc0)
 {
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    unsigned s;
 
+   bool can_serialize = true;
+
    for (s = 0; s < 5; ++s) {
       while (nvc0->constbuf_dirty[s]) {
          int i = ffs(nvc0->constbuf_dirty[s]) - 1;
@@ -488,41 +581,34 @@
             assert(i == 0); /* we really only want OpenGL uniforms here */
             assert(nvc0->constbuf[s][0].u.data);
 
-            if (nvc0->state.uniform_buffer_bound[s] < size) {
-               nvc0->state.uniform_buffer_bound[s] = align(size, 0x100);
+            if (!nvc0->state.uniform_buffer_bound[s]) {
+               nvc0->state.uniform_buffer_bound[s] = true;
 
-               BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-               PUSH_DATA (push, nvc0->state.uniform_buffer_bound[s]);
-               PUSH_DATAh(push, bo->offset + base);
-               PUSH_DATA (push, bo->offset + base);
-               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
-               PUSH_DATA (push, (0 << 4) | 1);
+               nvc0_screen_bind_cb_3d(nvc0->screen, &can_serialize, s, i,
+                                      NVC0_MAX_CONSTBUF_SIZE, bo->offset + base);
             }
             nvc0_cb_bo_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
-                         base, nvc0->state.uniform_buffer_bound[s],
+                         base, NVC0_MAX_CONSTBUF_SIZE,
                          0, (size + 3) / 4,
                          nvc0->constbuf[s][0].u.data);
          } else {
             struct nv04_resource *res =
                nv04_resource(nvc0->constbuf[s][i].u.buf);
             if (res) {
-               BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-               PUSH_DATA (push, nvc0->constbuf[s][i].size);
-               PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
-               PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
-               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
-               PUSH_DATA (push, (i << 4) | 1);
+               nvc0_screen_bind_cb_3d(nvc0->screen, &can_serialize, s, i,
+                                      nvc0->constbuf[s][i].size,
+                                      res->address + nvc0->constbuf[s][i].offset);
 
                BCTX_REFN(nvc0->bufctx_3d, 3D_CB(s, i), res, RD);
 
                nvc0->cb_dirty = 1; /* Force cache flush for UBO. */
                res->cb_bindings[s] |= 1 << i;
-            } else {
-               BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
-               PUSH_DATA (push, (i << 4) | 0);
+
+               if (i == 0)
+                  nvc0->state.uniform_buffer_bound[s] = false;
+            } else if (i != 0) {
+               nvc0_screen_bind_cb_3d(nvc0->screen, &can_serialize, s, i, -1, 0);
             }
-            if (i == 0)
-               nvc0->state.uniform_buffer_bound[s] = 0;
          }
       }
    }
@@ -531,7 +617,7 @@
       /* Invalidate all COMPUTE constbufs because they are aliased with 3D. */
       nvc0->dirty_cp |= NVC0_NEW_CP_CONSTBUF;
       nvc0->constbuf_dirty[5] |= nvc0->constbuf_valid[5];
-      nvc0->state.uniform_buffer_bound[5] = 0;
+      nvc0->state.uniform_buffer_bound[5] = false;
    }
 }
 
@@ -618,18 +704,12 @@
 static void
 nvc0_validate_driverconst(struct nvc0_context *nvc0)
 {
-   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_screen *screen = nvc0->screen;
    int i;
 
-   for (i = 0; i < 5; ++i) {
-      BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-      PUSH_DATA (push, NVC0_CB_AUX_SIZE);
-      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
-      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
-      BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
-      PUSH_DATA (push, (15 << 4) | 1);
-   }
+   for (i = 0; i < 5; ++i)
+      nvc0_screen_bind_cb_3d(screen, NULL, i, 15, NVC0_CB_AUX_SIZE,
+                             screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
 
    nvc0->dirty_cp |= NVC0_NEW_CP_DRIVERCONST;
 }
@@ -879,6 +959,8 @@
                                    NVC0_NEW_3D_TEVLPROG |
                                    NVC0_NEW_3D_GMTYPROG },
     { nvc0_validate_driverconst,   NVC0_NEW_3D_DRIVERCONST },
+    { validate_sample_locations,   NVC0_NEW_3D_SAMPLE_LOCATIONS |
+                                   NVC0_NEW_3D_FRAMEBUFFER},
 };
 
 bool
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
index 3006ed6..e200328 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_stateobj.h
@@ -23,7 +23,7 @@
 struct nvc0_rasterizer_stateobj {
    struct pipe_rasterizer_state pipe;
    int size;
-   uint32_t state[43];
+   uint32_t state[44];
 };
 
 struct nvc0_zsa_stateobj {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 0f86c11..03881c6 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -754,6 +754,16 @@
    }
 }
 
+static void
+gm200_evaluate_depth_buffer(struct pipe_context *pipe)
+{
+   struct nvc0_context *nvc0 = nvc0_context(pipe);
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+
+   nvc0_state_validate_3d(nvc0, NVC0_NEW_3D_FRAMEBUFFER);
+   IMMED_NVC0(push, SUBC_3D(0x11fc), 1);
+}
+
 
 /* =============================== BLIT CODE ===================================
  */
@@ -1563,6 +1573,13 @@
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    bool eng3d = false;
 
+   if (info->src.box.width == 0 || info->src.box.height == 0 ||
+       info->dst.box.width == 0 || info->dst.box.height == 0) {
+      pipe_debug_message(&nvc0->base.debug, ERROR,
+                         "Blit with zero-size src or dst box");
+      return;
+   }
+
    if (util_format_is_depth_or_stencil(info->dst.resource->format)) {
       if (!(info->mask & PIPE_MASK_ZS))
          return;
@@ -1601,6 +1618,10 @@
             if (util_format_is_alpha(info->src.format))
                eng3d = info->src.format != PIPE_FORMAT_A8_UNORM;
             else
+            if (util_format_is_srgb(info->dst.format) &&
+                util_format_get_nr_components(info->src.format) == 1)
+               eng3d = true;
+            else
                eng3d = !nv50_2d_format_supported(info->src.format);
          }
       } else
@@ -1720,4 +1741,6 @@
    pipe->clear_depth_stencil = nvc0_clear_depth_stencil;
    pipe->clear_texture = nv50_clear_texture;
    pipe->clear_buffer = nvc0_clear_buffer;
+   if (nvc0->screen->base.class_3d >= GM200_3D_CLASS)
+      pipe->evaluate_depth_buffer = gm200_evaluate_depth_buffer;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 0471fff..f40600e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -208,7 +208,7 @@
              GM107_TIC2_3_LOD_ANISO_QUALITY_HIGH |
              GM107_TIC2_3_LOD_ISO_QUALITY_HIGH;
 
-   if (flags & NV50_TEXVIEW_ACCESS_RESOLVE) {
+   if (flags & (NV50_TEXVIEW_ACCESS_RESOLVE | NV50_TEXVIEW_IMAGE_GM107)) {
       width = mt->base.base.width0 << mt->ms_x;
       height = mt->base.base.height0 << mt->ms_y;
    } else {
@@ -268,7 +268,7 @@
       templ.u.tex.first_level = templ.u.tex.last_level = view->u.tex.level;
    }
 
-   flags = NV50_TEXVIEW_SCALED_COORDS;
+   flags = NV50_TEXVIEW_SCALED_COORDS | NV50_TEXVIEW_IMAGE_GM107;
 
    return nvc0_create_texture_view(pipe, &res->base, &templ, flags, target);
 }
@@ -755,7 +755,7 @@
          dirty &= ~(1 << i);
 
          BEGIN_NVC0(push, NVC0_3D(CB_POS), 2);
-         PUSH_DATA (push, (8 + i) * 4);
+         PUSH_DATA (push, NVC0_CB_AUX_TEX_INFO(i));
          PUSH_DATA (push, nvc0->tex_handles[s][i]);
       } while (dirty);
 
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index bc49775..01fccfb 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -574,10 +574,10 @@
     /* Handle non-renderable plain formats. */
     if (layout == UTIL_FORMAT_LAYOUT_PLAIN &&
         (!screen->is_format_supported(screen, src_templ.format, src->target,
-                                      src->nr_samples,
+                                      src->nr_samples, src->nr_storage_samples,
                                       PIPE_BIND_SAMPLER_VIEW) ||
          !screen->is_format_supported(screen, dst_templ.format, dst->target,
-                                      dst->nr_samples,
+                                      dst->nr_samples, dst->nr_storage_samples,
                                       PIPE_BIND_RENDER_TARGET))) {
         switch (util_format_get_blocksize(dst_templ.format)) {
             case 1:
@@ -644,9 +644,11 @@
     /* Fallback for textures. */
     if (!screen->is_format_supported(screen, dst_templ.format,
                                      dst->target, dst->nr_samples,
+                                     dst->nr_storage_samples,
                                      PIPE_BIND_RENDER_TARGET) ||
 	!screen->is_format_supported(screen, src_templ.format,
                                      src->target, src->nr_samples,
+                                     src->nr_storage_samples,
                                      PIPE_BIND_SAMPLER_VIEW)) {
         assert(0 && "this shouldn't happen, update r300_is_blit_supported");
         util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 2202515..667d3fd 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -449,7 +449,7 @@
     /* The submission context. */
     struct radeon_winsys_ctx *ctx;
     /* The command stream. */
-    struct radeon_winsys_cs *cs;
+    struct radeon_cmdbuf *cs;
     /* Screen. */
     struct r300_screen *screen;
 
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 727b9e2..560b775 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -39,7 +39,7 @@
  */
 
 #define CS_LOCALS(context) \
-    struct radeon_winsys_cs *cs_copy = (context)->cs; \
+    struct radeon_cmdbuf *cs_copy = (context)->cs; \
     struct radeon_winsys *cs_winsys = (context)->rws; \
     int cs_count = 0; (void) cs_count; (void) cs_winsys;
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 8ea2e87..046d68c 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -120,6 +120,7 @@
             return 16;
 
         case PIPE_CAP_GLSL_FEATURE_LEVEL:
+        case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
             return 120;
 
         /* r300 cannot do swizzling of compressed textures. Supported otherwise. */
@@ -246,11 +247,19 @@
         case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
         case PIPE_CAP_TILE_RASTER_ORDER:
         case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+        case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
         case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
         case PIPE_CAP_CONTEXT_PRIORITY_MASK:
         case PIPE_CAP_FENCE_SIGNAL:
         case PIPE_CAP_CONSTBUF0_FLAGS:
         case PIPE_CAP_PACKED_UNIFORMS:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+        case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
             return 0;
 
         /* SWTCL-only features. */
@@ -475,6 +484,10 @@
             return 16.0f;
         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
             return 16.0f;
+        case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+            return 0.0f;
         default:
             debug_printf("r300: Warning: Unknown CAP %d in get_paramf.\n",
                          param);
@@ -578,6 +591,7 @@
                                         enum pipe_format format,
                                         enum pipe_texture_target target,
                                         unsigned sample_count,
+                                        unsigned storage_sample_count,
                                         unsigned usage)
 {
     uint32_t retval = 0;
@@ -603,8 +617,8 @@
                             format == PIPE_FORMAT_R16G16B16X16_FLOAT;
     const struct util_format_description *desc;
 
-    if (!util_format_is_supported(format, usage))
-       return FALSE;
+    if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+        return false;
 
     /* Check multisampling support. */
     switch (sample_count) {
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index bf45fbf..1138de9 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -10,7 +10,7 @@
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(RADEON_CFLAGS) \
 	$(LIBELF_CFLAGS) \
-	-I$(top_srcdir)/src/amd/common
+	-I$(top_srcdir)/src/amd/common -Wstrict-overflow=0
 
 AM_CXXFLAGS = \
 	$(GALLIUM_DRIVER_CXXFLAGS) \
diff --git a/src/gallium/drivers/r600/cayman_msaa.c b/src/gallium/drivers/r600/cayman_msaa.c
index f97924a..9f40164 100644
--- a/src/gallium/drivers/r600/cayman_msaa.c
+++ b/src/gallium/drivers/r600/cayman_msaa.c
@@ -141,7 +141,7 @@
 		cayman_get_sample_position(ctx, 16, i, rctx->sample_locations_16x[i]);
 }
 
-static void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples)
+static void cayman_emit_msaa_sample_locs(struct radeon_cmdbuf *cs, int nr_samples)
 {
 	switch (nr_samples) {
 	default:
@@ -202,7 +202,7 @@
 	}
 }
 
-void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples,
+void cayman_emit_msaa_state(struct radeon_cmdbuf *cs, int nr_samples,
 			    int ps_iter_samples, int overrast_samples)
 {
 	int setup_samples = nr_samples > 1 ? nr_samples :
@@ -225,7 +225,7 @@
 
 	if (setup_samples > 1) {
 		/* indexed by log2(nr_samples) */
-		unsigned max_dist[] = {
+		const unsigned max_dist[] = {
 			0,
 			eg_max_dist_2x,
 			eg_max_dist_4x,
diff --git a/src/gallium/drivers/r600/compute_memory_pool.c b/src/gallium/drivers/r600/compute_memory_pool.c
index 4b0e004..4c87b32 100644
--- a/src/gallium/drivers/r600/compute_memory_pool.c
+++ b/src/gallium/drivers/r600/compute_memory_pool.c
@@ -43,6 +43,29 @@
 #include <inttypes.h>
 
 #define ITEM_ALIGNMENT 1024
+
+/* A few forward declarations of static functions */
+static void compute_memory_shadow(struct compute_memory_pool* pool,
+	struct pipe_context *pipe, int device_to_host);
+
+static void compute_memory_defrag(struct compute_memory_pool *pool,
+	struct pipe_resource *src, struct pipe_resource *dst,
+	struct pipe_context *pipe);
+
+static int compute_memory_promote_item(struct compute_memory_pool *pool,
+	struct compute_memory_item *item, struct pipe_context *pipe,
+	int64_t allocated);
+
+static void compute_memory_move_item(struct compute_memory_pool *pool,
+	struct pipe_resource *src, struct pipe_resource *dst,
+	struct compute_memory_item *item, uint64_t new_start_in_dw,
+	struct pipe_context *pipe);
+
+static void compute_memory_transfer(struct compute_memory_pool* pool,
+	struct pipe_context * pipe, int device_to_host,
+	struct compute_memory_item* chunk, void* data,
+	int offset_in_chunk, int size);
+
 /**
  * Creates a new pool.
  */
@@ -91,7 +114,7 @@
 {
 	COMPUTE_DBG(pool->screen, "* compute_memory_pool_delete()\n");
 	free(pool->shadow);
-	pipe_resource_reference(&pool->bo, NULL);
+	r600_resource_reference(&pool->bo, NULL);
 	/* In theory, all of the items were freed in compute_memory_free.
 	 * Just delete the list heads
 	 */
@@ -102,88 +125,11 @@
 }
 
 /**
- * Searches for an empty space in the pool, return with the pointer to the
- * allocatable space in the pool.
- * \param size_in_dw	The size of the space we are looking for.
- * \return -1 on failure
- */
-int64_t compute_memory_prealloc_chunk(
-	struct compute_memory_pool* pool,
-	int64_t size_in_dw)
-{
-	struct compute_memory_item *item;
-
-	int last_end = 0;
-
-	assert(size_in_dw <= pool->size_in_dw);
-
-	COMPUTE_DBG(pool->screen, "* compute_memory_prealloc_chunk() size_in_dw = %"PRIi64"\n",
-		size_in_dw);
-
-	LIST_FOR_EACH_ENTRY(item, pool->item_list, link) {
-		if (last_end + size_in_dw <= item->start_in_dw) {
-			return last_end;
-		}
-
-		last_end = item->start_in_dw + align(item->size_in_dw, ITEM_ALIGNMENT);
-	}
-
-	if (pool->size_in_dw - last_end < size_in_dw) {
-		return -1;
-	}
-
-	return last_end;
-}
-
-/**
- *  Search for the chunk where we can link our new chunk after it.
- *  \param start_in_dw	The position of the item we want to add to the pool.
- *  \return The item that is just before the passed position
- */
-struct list_head *compute_memory_postalloc_chunk(
-	struct compute_memory_pool* pool,
-	int64_t start_in_dw)
-{
-	struct compute_memory_item *item;
-	struct compute_memory_item *next;
-	struct list_head *next_link;
-
-	COMPUTE_DBG(pool->screen, "* compute_memory_postalloc_chunck() start_in_dw = %"PRIi64"\n",
-		start_in_dw);
-
-	/* Check if we can insert it in the front of the list */
-	item = LIST_ENTRY(struct compute_memory_item, pool->item_list->next, link);
-	if (LIST_IS_EMPTY(pool->item_list) || item->start_in_dw > start_in_dw) {
-		return pool->item_list;
-	}
-
-	LIST_FOR_EACH_ENTRY(item, pool->item_list, link) {
-		next_link = item->link.next;
-
-		if (next_link != pool->item_list) {
-			next = container_of(next_link, item, link);
-			if (item->start_in_dw < start_in_dw
-				&& next->start_in_dw > start_in_dw) {
-				return &item->link;
-			}
-		}
-		else {
-			/* end of chain */
-			assert(item->start_in_dw < start_in_dw);
-			return &item->link;
-		}
-	}
-
-	assert(0 && "unreachable");
-	return NULL;
-}
-
-/**
  * Reallocates and defragments the pool, conserves data.
  * \returns -1 if it fails, 0 otherwise
  * \see compute_memory_finalize_pending
  */
-int compute_memory_grow_defrag_pool(struct compute_memory_pool *pool,
+static int compute_memory_grow_defrag_pool(struct compute_memory_pool *pool,
 	struct pipe_context *pipe, int new_size_in_dw)
 {
 	new_size_in_dw = align(new_size_in_dw, ITEM_ALIGNMENT);
@@ -211,7 +157,7 @@
 			compute_memory_defrag(pool, src, dst, pipe);
 
 			/* Release the old buffer */
-			pipe_resource_reference(&pool->bo, NULL);
+			r600_resource_reference(&pool->bo, NULL);
 			pool->bo = temp;
 			pool->size_in_dw = new_size_in_dw;
 		}
@@ -226,7 +172,7 @@
 
 			pool->size_in_dw = new_size_in_dw;
 			/* Release the old buffer */
-			pipe_resource_reference(&pool->bo, NULL);
+			r600_resource_reference(&pool->bo, NULL);
 			pool->bo = r600_compute_buffer_alloc_vram(pool->screen, pool->size_in_dw * 4);
 			compute_memory_shadow(pool, pipe, 0);
 
@@ -245,7 +191,7 @@
  * \param device_to_host 1 for device->host, 0 for host->device
  * \see compute_memory_grow_defrag_pool
  */
-void compute_memory_shadow(struct compute_memory_pool* pool,
+static void compute_memory_shadow(struct compute_memory_pool* pool,
 	struct pipe_context * pipe, int device_to_host)
 {
 	struct compute_memory_item chunk;
@@ -339,7 +285,7 @@
  * \param dst	The destination resource
  * \see compute_memory_grow_defrag_pool and compute_memory_finalize_pending
  */
-void compute_memory_defrag(struct compute_memory_pool *pool,
+static void compute_memory_defrag(struct compute_memory_pool *pool,
 	struct pipe_resource *src, struct pipe_resource *dst,
 	struct pipe_context *pipe)
 {
@@ -369,7 +315,7 @@
  * \return -1 if it fails, 0 otherwise
  * \see compute_memory_finalize_pending
  */
-int compute_memory_promote_item(struct compute_memory_pool *pool,
+static int compute_memory_promote_item(struct compute_memory_pool *pool,
 		struct compute_memory_item *item, struct pipe_context *pipe,
 		int64_t start_in_dw)
 {
@@ -474,7 +420,7 @@
  * \param new_start_in_dw	The new position of the item in \a item_list
  * \see compute_memory_defrag
  */
-void compute_memory_move_item(struct compute_memory_pool *pool,
+static void compute_memory_move_item(struct compute_memory_pool *pool,
 	struct pipe_resource *src, struct pipe_resource *dst,
 	struct compute_memory_item *item, uint64_t new_start_in_dw,
 	struct pipe_context *pipe)
@@ -646,7 +592,7 @@
  * \param device_to_host 1 for device->host, 0 for host->device.
  * \see compute_memory_shadow
  */
-void compute_memory_transfer(
+static void compute_memory_transfer(
 	struct compute_memory_pool* pool,
 	struct pipe_context * pipe,
 	int device_to_host,
@@ -686,18 +632,3 @@
 		pipe->transfer_unmap(pipe, xfer);
 	}
 }
-
-/**
- * Transfer data between chunk<->data, it is for VRAM<->GART transfers
- */
-void compute_memory_transfer_direct(
-	struct compute_memory_pool* pool,
-	int chunk_to_data,
-	struct compute_memory_item* chunk,
-	struct r600_resource* data,
-	int offset_in_chunk,
-	int offset_in_data,
-	int size)
-{
-	///TODO: DMA
-}
diff --git a/src/gallium/drivers/r600/compute_memory_pool.h b/src/gallium/drivers/r600/compute_memory_pool.h
index 161ddd5..2064e56 100644
--- a/src/gallium/drivers/r600/compute_memory_pool.h
+++ b/src/gallium/drivers/r600/compute_memory_pool.h
@@ -86,50 +86,15 @@
 
 void compute_memory_pool_delete(struct compute_memory_pool* pool);
 
-int64_t compute_memory_prealloc_chunk(struct compute_memory_pool* pool,
-	int64_t size_in_dw);
-
-struct list_head *compute_memory_postalloc_chunk(struct compute_memory_pool* pool,
-	int64_t start_in_dw);
-
-int compute_memory_grow_defrag_pool(struct compute_memory_pool* pool,
-	struct pipe_context *pipe, int new_size_in_dw);
-
-void compute_memory_shadow(struct compute_memory_pool* pool,
-	struct pipe_context *pipe, int device_to_host);
-
 int compute_memory_finalize_pending(struct compute_memory_pool* pool,
 	struct pipe_context * pipe);
 
-void compute_memory_defrag(struct compute_memory_pool *pool,
-	struct pipe_resource *src, struct pipe_resource *dst,
-	struct pipe_context *pipe);
-
-int compute_memory_promote_item(struct compute_memory_pool *pool,
-	struct compute_memory_item *item, struct pipe_context *pipe,
-	int64_t allocated);
-
 void compute_memory_demote_item(struct compute_memory_pool *pool,
 	struct compute_memory_item *item, struct pipe_context *pipe);
 
-void compute_memory_move_item(struct compute_memory_pool *pool,
-	struct pipe_resource *src, struct pipe_resource *dst,
-	struct compute_memory_item *item, uint64_t new_start_in_dw,
-	struct pipe_context *pipe);
-
 void compute_memory_free(struct compute_memory_pool* pool, int64_t id);
 
 struct compute_memory_item* compute_memory_alloc(struct compute_memory_pool* pool,
 	int64_t size_in_dw);
 
-void compute_memory_transfer(struct compute_memory_pool* pool,
-	struct pipe_context * pipe, int device_to_host,
-	struct compute_memory_item* chunk, void* data,
-	int offset_in_chunk, int size);
-
-void compute_memory_transfer_direct(struct compute_memory_pool* pool,
-	int chunk_to_data, struct compute_memory_item* chunk,
-	struct r600_resource* data, int offset_in_chunk,
-	int offset_in_data, int size);
-
 #endif
diff --git a/src/gallium/drivers/r600/eg_debug.c b/src/gallium/drivers/r600/eg_debug.c
index ceb7c16..56195df 100644
--- a/src/gallium/drivers/r600/eg_debug.c
+++ b/src/gallium/drivers/r600/eg_debug.c
@@ -78,7 +78,7 @@
 static void eg_dump_reg(FILE *file, unsigned offset, uint32_t value,
 			uint32_t field_mask)
 {
-	int r, f;
+	unsigned r, f;
 
 	for (r = 0; r < ARRAY_SIZE(egd_reg_table); r++) {
 		const struct eg_reg *reg = &egd_reg_table[r];
@@ -134,7 +134,7 @@
 				    unsigned reg_offset)
 {
 	unsigned reg = (ib[1] << 2) + reg_offset;
-	int i;
+	unsigned i;
 
 	for (i = 0; i < count; i++)
 		eg_dump_reg(f, reg + i*4, ib[2+i], ~0);
@@ -149,7 +149,7 @@
 	unsigned op = PKT3_IT_OPCODE_G(ib[0]);
 	const char *predicate = PKT3_PREDICATE(ib[0]) ? "(predicate)" : "";
 	const char *compute_mode = (ib[0] & 0x2) ? "(C)" : "";
-	int i;
+	unsigned i;
 
 	/* Print the name first. */
 	for (i = 0; i < ARRAY_SIZE(packet3_table); i++)
diff --git a/src/gallium/drivers/r600/egd_tables.py b/src/gallium/drivers/r600/egd_tables.py
index d7b78c7..8a60a62 100644
--- a/src/gallium/drivers/r600/egd_tables.py
+++ b/src/gallium/drivers/r600/egd_tables.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 CopyRight = '''
 /*
@@ -60,7 +61,7 @@
         """
         fragments = [
             '"%s\\0" /* %s */' % (
-                te[0].encode('string_escape'),
+                te[0].encode('unicode_escape').decode(),
                 ', '.join(str(idx) for idx in te[2])
             )
             for te in self.table
@@ -217,10 +218,10 @@
     strings = StringTable()
     strings_offsets = IntTable("int")
 
-    print '/* This file is autogenerated by egd_tables.py from evergreend.h. Do not edit directly. */'
-    print
-    print CopyRight.strip()
-    print '''
+    print('/* This file is autogenerated by egd_tables.py from evergreend.h. Do not edit directly. */')
+    print()
+    print(CopyRight.strip())
+    print('''
 #ifndef EG_TABLES_H
 #define EG_TABLES_H
 
@@ -242,20 +243,20 @@
         unsigned name_offset;
         unsigned op;
 };
-'''
+''')
 
-    print 'static const struct eg_packet3 packet3_table[] = {'
+    print('static const struct eg_packet3 packet3_table[] = {')
     for pkt in packets:
-        print '\t{%s, %s},' % (strings.add(pkt[5:]), pkt)
-    print '};'
-    print
+        print('\t{%s, %s},' % (strings.add(pkt[5:]), pkt))
+    print('};')
+    print()
 
-    print 'static const struct eg_field egd_fields_table[] = {'
+    print('static const struct eg_field egd_fields_table[] = {')
 
     fields_idx = 0
     for reg in regs:
         if len(reg.fields) and reg.own_fields:
-            print '\t/* %s */' % (fields_idx)
+            print('\t/* %s */' % (fields_idx))
 
             reg.fields_idx = fields_idx
 
@@ -266,34 +267,34 @@
                         while value[1] >= len(values_offsets):
                             values_offsets.append(-1)
                         values_offsets[value[1]] = strings.add(strip_prefix(value[0]))
-                    print '\t{%s, %s(~0u), %s, %s},' % (
+                    print('\t{%s, %s(~0u), %s, %s},' % (
                         strings.add(field.name), field.s_name,
-                        len(values_offsets), strings_offsets.add(values_offsets))
+                        len(values_offsets), strings_offsets.add(values_offsets)))
                 else:
-                    print '\t{%s, %s(~0u)},' % (strings.add(field.name), field.s_name)
+                    print('\t{%s, %s(~0u)},' % (strings.add(field.name), field.s_name))
                 fields_idx += 1
 
-    print '};'
-    print
+    print('};')
+    print()
 
-    print 'static const struct eg_reg egd_reg_table[] = {'
+    print('static const struct eg_reg egd_reg_table[] = {')
     for reg in regs:
         if len(reg.fields):
-            print '\t{%s, %s, %s, %s},' % (strings.add(reg.name), reg.r_name,
-                len(reg.fields), reg.fields_idx if reg.own_fields else reg.fields_owner.fields_idx)
+            print('\t{%s, %s, %s, %s},' % (strings.add(reg.name), reg.r_name,
+                len(reg.fields), reg.fields_idx if reg.own_fields else reg.fields_owner.fields_idx))
         else:
-            print '\t{%s, %s},' % (strings.add(reg.name), reg.r_name)
-    print '};'
-    print
+            print('\t{%s, %s},' % (strings.add(reg.name), reg.r_name))
+    print('};')
+    print()
 
     strings.emit(sys.stdout, "egd_strings")
 
-    print
+    print()
 
     strings_offsets.emit(sys.stdout, "egd_strings_offsets")
 
-    print
-    print '#endif'
+    print()
+    print('#endif')
 
 
 def main():
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index f873913..a77f582 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -575,7 +575,7 @@
 				    uint32_t indirect_grid[3])
 {
 	int i;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
 	unsigned num_waves;
@@ -654,7 +654,7 @@
 
 static void compute_setup_cbs(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned i;
 
 	/* Emit colorbuffers. */
@@ -696,7 +696,7 @@
 static void compute_emit_cs(struct r600_context *rctx,
 			    const struct pipe_grid_info *info)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	bool compute_dirty = false;
 	struct r600_pipe_shader *current;
 	struct r600_shader_atomic combined_atomics[8];
@@ -858,7 +858,7 @@
 	struct r600_cs_shader_state *state =
 					(struct r600_cs_shader_state*)atom;
 	struct r600_pipe_compute *shader = state->shader;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint64_t va;
 	struct r600_resource *code_bo;
 	unsigned ngpr, nstack;
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 6a533a4..5e0e27b 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -35,7 +35,7 @@
 			       uint64_t src_offset,
 			       uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
+	struct radeon_cmdbuf *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -64,10 +64,8 @@
 	for (i = 0; i < ncopy; i++) {
 		csize = size < EG_DMA_COPY_MAX_SIZE ? size : EG_DMA_COPY_MAX_SIZE;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
-				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_SDMA_BUFFER);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, 0);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, 0);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize));
 		radeon_emit(cs, dst_offset & 0xffffffff);
 		radeon_emit(cs, src_offset & 0xffffffff);
@@ -87,7 +85,7 @@
 				   unsigned size, uint32_t clear_value,
 				   enum r600_coherency coher)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 7e0140d..cc41e11 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -243,6 +243,7 @@
 				      enum pipe_format format,
 				      enum pipe_texture_target target,
 				      unsigned sample_count,
+				      unsigned storage_sample_count,
 				      unsigned usage)
 {
 	struct r600_screen *rscreen = (struct r600_screen*)screen;
@@ -253,8 +254,8 @@
 		return FALSE;
 	}
 
-	if (!util_format_is_supported(format, usage))
-		return FALSE;
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
 
 	if (sample_count > 1) {
 		if (!rscreen->has_msaa)
@@ -575,11 +576,19 @@
 	unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
 						       : state->max_anisotropy;
 	unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
+	float max_lod = state->max_lod;
 
 	if (!ss) {
 		return NULL;
 	}
 
+	/* If the min_mip_filter is NONE, then the texture has no mipmapping and
+	 * MIP_FILTER will also be set to NONE. However, if more than one LOD is
+	 * configured, then the texture lookup seems to fail for some specific texture
+	 * formats. Forcing the number of LODs to one in this case fixes it. */
+	if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
+		max_lod = state->min_lod;
+
 	ss->border_color_use = sampler_state_needs_border_color(state);
 
 	/* R_03C000_SQ_TEX_SAMPLER_WORD0_0 */
@@ -596,7 +605,7 @@
 	/* R_03C004_SQ_TEX_SAMPLER_WORD1_0 */
 	ss->tex_sampler_words[1] =
 		S_03C004_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
-		S_03C004_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8));
+		S_03C004_MAX_LOD(S_FIXED(CLAMP(max_lod, 0, 15), 8));
 	/* R_03C008_SQ_TEX_SAMPLER_WORD2_0 */
 	ss->tex_sampler_words[2] =
 		S_03C008_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
@@ -965,7 +974,7 @@
 
 static void evergreen_emit_config_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_config_state *a = (struct r600_config_state*)atom;
 
 	radeon_set_config_reg_seq(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, 3);
@@ -992,7 +1001,7 @@
 
 static void evergreen_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP0_X, 6*4);
@@ -1594,7 +1603,7 @@
 }
 
 /* 8xMSAA */
-static uint32_t sample_locs_8x[] = {
+static const uint32_t sample_locs_8x[] = {
 	FILL_SREG(-1,  1,  1,  5,  3, -5,  5,  3),
 	FILL_SREG(-7, -1, -3, -7,  7, -3, -5,  7),
 	FILL_SREG(-1,  1,  1,  5,  3, -5,  5,  3),
@@ -1648,7 +1657,7 @@
 static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, int ps_iter_samples)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	switch (nr_samples) {
@@ -1697,7 +1706,7 @@
 {
 	struct r600_image_state *state = (struct r600_image_state *)atom;
 	struct pipe_framebuffer_state *fb_state = &rctx->framebuffer.state;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_texture *rtex;
 	struct r600_resource *resource;
 	int i;
@@ -1824,7 +1833,7 @@
 
 static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	unsigned i, tl, br;
@@ -1859,7 +1868,7 @@
 		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
 			cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
-				RADEON_PRIO_CMASK);
+				RADEON_PRIO_SEPARATE_META);
 		} else {
 			cmask_reloc = reloc;
 		}
@@ -1963,7 +1972,7 @@
 
 static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -2021,7 +2030,7 @@
 
 static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 	unsigned fb_colormask = a->bound_cbufs_target_mask;
 	unsigned ps_colormask = a->ps_color_export_mask;
@@ -2036,7 +2045,7 @@
 
 static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -2048,7 +2057,7 @@
 		radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
 		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, &rtex->resource,
-						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
+						  RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 		radeon_emit(cs, reloc_idx);
 	} else {
@@ -2059,7 +2068,7 @@
 
 static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_count_control = 0;
@@ -2114,7 +2123,7 @@
 					  unsigned resource_offset,
 					  unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -2173,7 +2182,7 @@
 					    unsigned reg_alu_const_cache,
 					    unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -2325,7 +2334,7 @@
 					 struct r600_samplerview_state *state,
 					 unsigned resource_id_base, unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -2397,14 +2406,47 @@
 	                             EG_FETCH_CONSTANTS_OFFSET_CS + R600_MAX_CONST_BUFFERS, RADEON_CP_PACKET3_COMPUTE_MODE);
 }
 
+/* Convert the border color 'in' for a view of 'format' into the float values stored in 'out'. */
+static void evergreen_convert_border_color(union pipe_color_union *in,
+                                           union pipe_color_union *out,
+                                           enum pipe_format format)
+{
+	if (util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format)) {
+		const struct util_format_description *d = util_format_description(format);
+		/* Divide each integer channel by its largest value; 1ull keeps a 32-bit shift defined where unsigned long is 32 bits. */
+		for (int i = 0; i < d->nr_channels; ++i) {
+			int cs = d->channel[i].size;
+			if (d->channel[i].type == UTIL_FORMAT_TYPE_SIGNED)
+				out->f[i] = (double)(in->i[i]) / ((1ull << (cs - 1)) - 1 );
+			else if (d->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)
+				out->f[i] = (double)(in->ui[i]) / ((1ull << cs) - 1 );
+			else
+				out->f[i] = 0;
+		}
+
+	} else {
+		switch (format) {
+		case PIPE_FORMAT_X24S8_UINT:
+		case PIPE_FORMAT_X32_S8X24_UINT:
+			out->f[0] = (double)(in->ui[0]) / 255.0; /* stencil value divided by 255 */
+			out->f[1] = out->f[2] = out->f[3] = 0.0f;
+			break;
+		default: /* every other format: copy the four float components unchanged */
+			memcpy(out->f, in->f, 4 * sizeof(float));
+		}
+	}
+}
+
 static void evergreen_emit_sampler_states(struct r600_context *rctx,
 				struct r600_textures_info *texinfo,
 				unsigned resource_id_base,
 				unsigned border_index_reg,
 				unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
+	union pipe_color_union border_color = {{0,0,0,1}};
+	union pipe_color_union *border_color_ptr = &border_color;
 
 	while (dirty_mask) {
 		struct r600_pipe_sampler_state *rstate;
@@ -2413,6 +2455,16 @@
 		rstate = texinfo->states.states[i];
 		assert(rstate);
 
+		if (rstate->border_color_use) {
+			struct r600_pipe_sampler_view	*rview = texinfo->views.views[i];
+			if (rview) {
+				evergreen_convert_border_color(&rstate->border_color,
+				                               &border_color, rview->base.format);
+			} else {
+				border_color_ptr = &rstate->border_color;
+			}
+		}
+
 		radeon_emit(cs, PKT3(PKT3_SET_SAMPLER, 3, 0) | pkt_flags);
 		radeon_emit(cs, (resource_id_base + i) * 3);
 		radeon_emit_array(cs, rstate->tex_sampler_words, 3);
@@ -2420,7 +2472,7 @@
 		if (rstate->border_color_use) {
 			radeon_set_config_reg_seq(cs, border_index_reg, 5);
 			radeon_emit(cs, i);
-			radeon_emit_array(cs, rstate->border_color.ui, 4);
+			radeon_emit_array(cs, border_color_ptr->ui, 4);
 		}
 	}
 	texinfo->states.dirty_mask = 0;
@@ -2482,7 +2534,7 @@
 static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a)
 {
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint16_t mask = s->sample_mask;
 
 	radeon_set_context_reg_seq(cs, CM_R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -2492,7 +2544,7 @@
 
 static void evergreen_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
@@ -2509,7 +2561,7 @@
 
 static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v = 0, v2 = 0, primid = 0, tf_param = 0;
@@ -2613,7 +2665,7 @@
 
 static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -3716,7 +3768,7 @@
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
+	struct radeon_cmdbuf *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -3802,9 +3854,9 @@
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
 		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource,
-				      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
+				      RADEON_USAGE_READ, 0);
 		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource,
-				      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
+				      RADEON_USAGE_WRITE, 0);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size));
 		radeon_emit(cs, base >> 8);
 		radeon_emit(cs, (detile << 31) | (array_mode << 27) |
@@ -3965,7 +4017,7 @@
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct r600_atomic_buffer_state *astate;
-	int i, idx;
+	unsigned i, idx;
 
 	astate = &rctx->atomic_buffer_state;
 
@@ -3999,7 +4051,7 @@
 	struct r600_tex_color_info color;
 	struct eg_buf_res_params buf_params;
 	struct r600_resource *resource;
-	int i, idx;
+	unsigned i, idx;
 	unsigned old_mask;
 
 	if (shader != PIPE_SHADER_FRAGMENT &&
@@ -4093,7 +4145,7 @@
 					const struct pipe_image_view *images)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-	int i;
+	unsigned i;
 	struct r600_image_view *rview;
 	struct pipe_resource *image;
 	struct r600_resource *resource;
@@ -4555,14 +4607,14 @@
 }
 
 void evergreen_set_ls_hs_config(struct r600_context *rctx,
-				struct radeon_winsys_cs *cs,
+				struct radeon_cmdbuf *cs,
 				uint32_t ls_hs_config)
 {
 	radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
 }
 
 void evergreen_set_lds_alloc(struct r600_context *rctx,
-			     struct radeon_winsys_cs *cs,
+			     struct radeon_cmdbuf *cs,
 			     uint32_t lds_alloc)
 {
 	radeon_set_context_reg(cs, R_0288E8_SQ_LDS_ALLOC, lds_alloc);
@@ -4691,7 +4743,7 @@
 
 void eg_trace_emit(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned reloc;
 
 	if (rctx->b.chip_class < EVERGREEN)
@@ -4721,7 +4773,7 @@
 					  struct r600_resource *resource,
 					  uint32_t pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 						   resource,
 						   RADEON_USAGE_READ,
@@ -4744,7 +4796,7 @@
 					   struct r600_resource *resource,
 					   uint32_t pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t event = EVENT_TYPE_PS_DONE;
 	uint32_t base_reg_0 = R_02872C_GDS_APPEND_COUNT_0;
 	uint32_t reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
@@ -4771,7 +4823,7 @@
 					struct r600_resource *resource,
 					uint32_t pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t event = EVENT_TYPE_PS_DONE;
 	uint32_t reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 						   resource,
@@ -4797,7 +4849,7 @@
 				      struct r600_resource *resource,
 				      uint32_t pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 						   resource,
 						   RADEON_USAGE_READ,
@@ -4892,7 +4944,7 @@
 				       struct r600_shader_atomic *combined_atomics,
 				       uint8_t *atomic_used_mask_p)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_atomic_buffer_state *astate = &rctx->atomic_buffer_state;
 	uint32_t pkt_flags = 0;
 	uint32_t event = EVENT_TYPE_PS_DONE;
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 427e785..7029be2 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -2476,6 +2476,11 @@
 		return;
 	}
 
+	if (pformat == PIPE_FORMAT_A1B5G5R5_UNORM) {
+		*format = FMT_5_5_5_1;
+		return;
+	}
+
 	desc = util_format_description(pformat);
 	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) {
 		goto out_unknown;
@@ -2533,6 +2538,16 @@
 		/* Signed ints */
 	case UTIL_FORMAT_TYPE_SIGNED:
 		switch (desc->channel[i].size) {
+		case 4:
+			switch (desc->nr_channels) {
+			case 2:
+				*format = FMT_4_4;
+				break;
+			case 4:
+				*format = FMT_4_4_4_4;
+				break;
+			}
+			break;
 		case 8:
 			switch (desc->nr_channels) {
 			case 1:
diff --git a/src/gallium/drivers/r600/r600_cs.h b/src/gallium/drivers/r600/r600_cs.h
index 632c7f5..424adba 100644
--- a/src/gallium/drivers/r600/r600_cs.h
+++ b/src/gallium/drivers/r600/r600_cs.h
@@ -42,7 +42,7 @@
  */
 static inline bool
 radeon_cs_memory_below_limit(struct r600_common_screen *screen,
-			     struct radeon_winsys_cs *cs,
+			     struct radeon_cmdbuf *cs,
 			     uint64_t vram, uint64_t gtt)
 {
 	vram += cs->used_vram;
@@ -118,7 +118,7 @@
 				   enum radeon_bo_usage usage,
 				   enum radeon_bo_priority priority)
 {
-	struct radeon_winsys_cs *cs = ring->cs;
+	struct radeon_cmdbuf *cs = ring->cs;
 	bool has_vm = ((struct r600_common_screen*)rctx->b.screen)->info.r600_has_virtual_memory;
 	unsigned reloc = radeon_add_to_buffer_list(rctx, ring, rbo, usage, priority);
 
@@ -128,7 +128,7 @@
 	}
 }
 
-static inline void radeon_set_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg < R600_CONTEXT_REG_OFFSET);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -136,13 +136,13 @@
 	radeon_emit(cs, (reg - R600_CONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_config_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CONTEXT_REG_OFFSET);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -150,13 +150,13 @@
 	radeon_emit(cs, (reg - R600_CONTEXT_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
@@ -167,7 +167,7 @@
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -175,13 +175,13 @@
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_sh_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -189,13 +189,13 @@
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_uconfig_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 2127f2b..a2f5f63 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -91,7 +91,7 @@
 
 void r600_flush_emit(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned cp_coher_cntl = 0;
 	unsigned wait_until = 0;
 
@@ -260,7 +260,7 @@
 			    struct pipe_fence_handle **fence)
 {
 	struct r600_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->b.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
 	if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size))
@@ -436,7 +436,7 @@
 
 void r600_emit_pfp_sync_me(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 
 	if (rctx->b.chip_class >= EVERGREEN &&
 	    rctx->b.screen->info.drm_minor >= 46) {
@@ -502,7 +502,7 @@
 			     struct pipe_resource *src, uint64_t src_offset,
 			     unsigned size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
@@ -584,7 +584,7 @@
 			  uint64_t src_offset,
 			  uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
+	struct radeon_cmdbuf *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -602,10 +602,8 @@
 	for (i = 0; i < ncopy; i++) {
 		csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
-				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_SDMA_BUFFER);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, 0);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, 0);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
 		radeon_emit(cs, dst_offset & 0xfffffffc);
 		radeon_emit(cs, src_offset & 0xfffffffc);
diff --git a/src/gallium/drivers/r600/r600_isa.c b/src/gallium/drivers/r600/r600_isa.c
index 611b370..57b0e04 100644
--- a/src/gallium/drivers/r600/r600_isa.c
+++ b/src/gallium/drivers/r600/r600_isa.c
@@ -558,7 +558,7 @@
 
 	for (i = 0; i < ARRAY_SIZE(r600_alu_op_table); ++i) {
 		const struct alu_op_info *op = &r600_alu_op_table[i];
-		unsigned opc;
+		int opc;
 		if (op->flags & AF_LDS || op->slots[isa->hw_class] == 0)
 			continue;
 		opc = op->opcode[isa->hw_class >> 1];
@@ -571,7 +571,7 @@
 
 	for (i = 0; i < ARRAY_SIZE(fetch_op_table); ++i) {
 		const struct fetch_op_info *op = &fetch_op_table[i];
-		unsigned opc = op->opcode[isa->hw_class];
+		int opc = op->opcode[isa->hw_class];
 		if ((op->flags & FF_GDS) || ((opc & 0xFF) != opc))
 			continue; /* ignore GDS ops and INST_MOD versions for now */
 		isa->fetch_map[opc] = i + 1;
@@ -579,7 +579,7 @@
 
 	for (i = 0; i < ARRAY_SIZE(cf_op_table); ++i) {
 		const struct cf_op_info *op = &cf_op_table[i];
-		unsigned opc = op->opcode[isa->hw_class];
+		int opc = op->opcode[isa->hw_class];
 		if (opc == -1)
 			continue;
 		/* using offset for CF_ALU_xxx opcodes because they overlap with other
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index b27ab75..c7cefba 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -104,6 +104,12 @@
 	}
 	util_unreference_framebuffer_state(&rctx->framebuffer.state);
 
+	if (rctx->gs_rings.gsvs_ring.buffer)
+		pipe_resource_reference(&rctx->gs_rings.gsvs_ring.buffer, NULL);
+
+	if (rctx->gs_rings.esgs_ring.buffer)
+		pipe_resource_reference(&rctx->gs_rings.esgs_ring.buffer, NULL);
+
 	for (sh = 0; sh < PIPE_SHADER_TYPES; ++sh)
 		for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; ++i)
 			rctx->b.b.set_constant_buffer(context, sh, i, NULL);
@@ -343,6 +349,9 @@
 		   return 330;
 		return 140;
 
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+		return 140;
+
 	/* Supported except the original R600. */
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
 	case PIPE_CAP_INDEP_BLEND_FUNC:
@@ -419,6 +428,14 @@
 	case PIPE_CAP_FENCE_SIGNAL:
 	case PIPE_CAP_CONSTBUF0_FLAGS:
 	case PIPE_CAP_PACKED_UNIFORMS:
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+	case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
 		return 0;
 
 	case PIPE_CAP_DOUBLES:
@@ -460,7 +477,8 @@
 		return family >= CHIP_CEDAR ? 4 : 1;
 
 	case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
-		return 2047;
+		/* Should be 2047, but 2048 is a requirement for GL 4.4 */
+		return 2048;
 
 	/* Texturing. */
 	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
@@ -643,6 +661,8 @@
 			return EG_MAX_ATOMIC_BUFFERS;
 		}
 		return 0;
+	case PIPE_SHADER_CAP_SCALAR_ISA:
+		return 0;
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		/* due to a bug in the shader compiler, some loops hang
 		 * if they are not unrolled, see:
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 7751c7c..239005c 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -577,7 +577,7 @@
 	bool                            gs_tri_strip_adj_fix;
 	boolean				dual_src_blend;
 	unsigned			zwritemask;
-	int					ps_iter_samples;
+	unsigned			ps_iter_samples;
 
 	/* The list of all texture buffer objects in this context.
 	 * This list is walked when a buffer is invalidated/reallocated and
@@ -614,7 +614,7 @@
 	uint32_t append_fence_id;
 };
 
-static inline void r600_emit_command_buffer(struct radeon_winsys_cs *cs,
+static inline void r600_emit_command_buffer(struct radeon_cmdbuf *cs,
 					    struct r600_command_buffer *cb)
 {
 	assert(cs->current.cdw + cb->num_dw <= cs->current.max_dw);
@@ -704,6 +704,7 @@
 				      enum pipe_format format,
 				      enum pipe_texture_target target,
 				      unsigned sample_count,
+				      unsigned storage_sample_count,
 				      unsigned usage);
 void evergreen_init_color_surface(struct r600_context *rctx,
 				  struct r600_surface *surf);
@@ -760,6 +761,7 @@
 				 enum pipe_format format,
 				 enum pipe_texture_target target,
 				 unsigned sample_count,
+				 unsigned storage_sample_count,
 				 unsigned usage);
 void r600_update_db_shader_control(struct r600_context * rctx);
 void r600_setup_scratch_buffers(struct r600_context *rctx);
@@ -802,10 +804,10 @@
 				    const struct pipe_draw_info *info,
 				    unsigned num_patches);
 void evergreen_set_ls_hs_config(struct r600_context *rctx,
-				struct radeon_winsys_cs *cs,
+				struct radeon_cmdbuf *cs,
 				uint32_t ls_hs_config);
 void evergreen_set_lds_alloc(struct r600_context *rctx,
-			     struct radeon_winsys_cs *cs,
+			     struct radeon_cmdbuf *cs,
 			     uint32_t lds_alloc);
 
 /* r600_state_common.c */
@@ -978,14 +980,14 @@
 void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw);
 void r600_release_command_buffer(struct r600_command_buffer *cb);
 
-static inline void radeon_compute_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_compute_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	radeon_set_context_reg_seq(cs, reg, num);
 	/* Set the compute bit on the packet header */
 	cs->current.buf[cs->current.cdw - 2] |= RADEON_CP_PACKET3_COMPUTE_MODE;
 }
 
-static inline void radeon_set_ctl_const_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_ctl_const_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= R600_CTL_CONST_OFFSET);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -993,13 +995,13 @@
 	radeon_emit(cs, (reg - R600_CTL_CONST_OFFSET) >> 2);
 }
 
-static inline void radeon_compute_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_compute_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_compute_set_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_flag(struct radeon_winsys_cs *cs, unsigned reg, unsigned value, unsigned flag)
+static inline void radeon_set_context_reg_flag(struct radeon_cmdbuf *cs, unsigned reg, unsigned value, unsigned flag)
 {
 	if (flag & RADEON_CP_PACKET3_COMPUTE_MODE) {
 		radeon_compute_set_context_reg(cs, reg, value);
@@ -1008,7 +1010,7 @@
 	}
 }
 
-static inline void radeon_set_ctl_const(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_ctl_const(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_ctl_const_seq(cs, reg, 1);
 	radeon_emit(cs, value);
diff --git a/src/gallium/drivers/r600/r600_pipe_common.c b/src/gallium/drivers/r600/r600_pipe_common.c
index 2a05e44..e7c6456 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.c
+++ b/src/gallium/drivers/r600/r600_pipe_common.c
@@ -105,7 +105,7 @@
 			      struct r600_resource *buf, uint64_t va,
 			      uint32_t new_fence, unsigned query_type)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 	unsigned op = EVENT_TYPE(event) |
 		      EVENT_INDEX(5) |
 		      event_flags;
@@ -137,7 +137,7 @@
 			 struct r600_resource *buf,
 			 uint64_t va, uint32_t ref, uint32_t mask)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 
 	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
@@ -242,7 +242,7 @@
 
 static void r600_dma_emit_wait_idle(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->dma.cs;
+	struct radeon_cmdbuf *cs = rctx->dma.cs;
 
 	if (rctx->chip_class >= EVERGREEN)
 		radeon_emit(cs, 0xf0000000); /* NOP */
@@ -314,12 +314,10 @@
 	if (ctx->screen->info.r600_has_virtual_memory) {
 		if (dst)
 			radeon_add_to_buffer_list(ctx, &ctx->dma, dst,
-						  RADEON_USAGE_WRITE,
-						  RADEON_PRIO_SDMA_BUFFER);
+						  RADEON_USAGE_WRITE, 0);
 		if (src)
 			radeon_add_to_buffer_list(ctx, &ctx->dma, src,
-						  RADEON_USAGE_READ,
-						  RADEON_PRIO_SDMA_BUFFER);
+						  RADEON_USAGE_READ, 0);
 	}
 
 	/* this function is called before all DMA calls, so increment this. */
@@ -468,7 +466,7 @@
 				struct pipe_fence_handle **fence)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct radeon_winsys_cs *cs = rctx->dma.cs;
+	struct radeon_cmdbuf *cs = rctx->dma.cs;
 	struct radeon_saved_cs saved;
 	bool check_vm =
 		(rctx->screen->debug_flags & DBG_CHECK_VM) &&
@@ -502,7 +500,7 @@
  * Store a linearized copy of all chunks of \p cs together with the buffer
  * list in \p saved.
  */
-void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+void radeon_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		    struct radeon_saved_cs *saved, bool get_buffer_list)
 {
 	uint32_t *buf;
@@ -856,27 +854,28 @@
 	if (rscreen->debug_flags & DBG_ALL_SHADERS)
 		return;
 
-	uint32_t mesa_timestamp;
-	if (disk_cache_get_function_timestamp(r600_disk_cache_create,
-					      &mesa_timestamp)) {
-		char *timestamp_str;
-		int res = -1;
+	struct mesa_sha1 ctx;
+	unsigned char sha1[20];
+	char cache_id[20 * 2 + 1];
 
-		res = asprintf(&timestamp_str, "%u",mesa_timestamp);
-		if (res != -1) {
-			/* These flags affect shader compilation. */
-			uint64_t shader_debug_flags =
-				rscreen->debug_flags &
-				(DBG_FS_CORRECT_DERIVS_AFTER_KILL |
-				 DBG_UNSAFE_MATH);
+	_mesa_sha1_init(&ctx);
+	if (!disk_cache_get_function_identifier(r600_disk_cache_create,
+						&ctx))
+		return;
 
-			rscreen->disk_shader_cache =
-				disk_cache_create(r600_get_family_name(rscreen),
-						  timestamp_str,
-						  shader_debug_flags);
-			free(timestamp_str);
-		}
-	}
+	_mesa_sha1_final(&ctx, sha1);
+	disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
+
+	/* These flags affect shader compilation. */
+	uint64_t shader_debug_flags =
+		rscreen->debug_flags &
+		(DBG_FS_CORRECT_DERIVS_AFTER_KILL |
+		 DBG_UNSAFE_MATH);
+
+	rscreen->disk_shader_cache =
+		disk_cache_create(r600_get_family_name(rscreen),
+				  cache_id,
+				  shader_debug_flags);
 }
 
 static struct disk_cache *r600_get_disk_shader_cache(struct pipe_screen *pscreen)
@@ -910,6 +909,10 @@
 		return 16.0f;
 	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 		return 16.0f;
+    case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+    case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+    case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+        return 0.0f;
 	}
 	return 0.0f;
 }
diff --git a/src/gallium/drivers/r600/r600_pipe_common.h b/src/gallium/drivers/r600/r600_pipe_common.h
index ee8eb54..c4e60e9 100644
--- a/src/gallium/drivers/r600/r600_pipe_common.h
+++ b/src/gallium/drivers/r600/r600_pipe_common.h
@@ -488,7 +488,7 @@
 };
 
 struct r600_ring {
-	struct radeon_winsys_cs		*cs;
+	struct radeon_cmdbuf		*cs;
 	void (*flush)(void *ctx, unsigned flags,
 		      struct pipe_fence_handle **fence);
 };
@@ -708,7 +708,7 @@
 const char *r600_get_llvm_processor_name(enum radeon_family family);
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw,
 			 struct r600_resource *dst, struct r600_resource *src);
-void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+void radeon_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		    struct radeon_saved_cs *saved, bool get_buffer_list);
 void radeon_clear_saved_cs(struct radeon_saved_cs *saved);
 bool r600_check_device_reset(struct r600_common_context *rctx);
@@ -799,7 +799,7 @@
 void cayman_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
 				unsigned sample_index, float *out_value);
 void cayman_init_msaa(struct pipe_context *ctx);
-void cayman_emit_msaa_state(struct radeon_winsys_cs *cs, int nr_samples,
+void cayman_emit_msaa_state(struct radeon_cmdbuf *cs, int nr_samples,
 			    int ps_iter_samples, int overrast_samples);
 
 
diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c
index 729c6f2..ccabab9 100644
--- a/src/gallium/drivers/r600/r600_query.c
+++ b/src/gallium/drivers/r600/r600_query.c
@@ -714,7 +714,7 @@
 	}
 }
 
-static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
 				  unsigned stream)
 {
 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -728,7 +728,7 @@
 					struct r600_resource *buffer,
 					uint64_t va)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 
 	switch (query->b.type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -808,7 +808,7 @@
 				       struct r600_resource *buffer,
 				       uint64_t va)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 	uint64_t fence_va = 0;
 
 	switch (query->b.type) {
@@ -900,7 +900,7 @@
 			       struct r600_resource *buf, uint64_t va,
 			       uint32_t op)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 
 	radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
 	radeon_emit(cs, va);
@@ -1833,7 +1833,7 @@
 {
 	struct r600_common_context *ctx =
 		(struct r600_common_context*)rscreen->aux_context;
-	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct radeon_cmdbuf *cs = ctx->gfx.cs;
 	struct r600_resource *buffer;
 	uint32_t *results;
 	unsigned i, mask = 0;
@@ -1945,7 +1945,7 @@
 #define XG(group_, name_, query_type_, type_, result_type_) \
 	XFULL(name_, query_type_, type_, result_type_, R600_QUERY_GROUP_##group_)
 
-static struct pipe_driver_query_info r600_driver_query_list[] = {
+static const struct pipe_driver_query_info r600_driver_query_list[] = {
 	X("num-compilations",		NUM_COMPILATIONS,	UINT64, CUMULATIVE),
 	X("num-shaders-created",	NUM_SHADERS_CREATED,	UINT64, CUMULATIVE),
 	X("num-shader-cache-hits",	NUM_SHADER_CACHE_HITS,	UINT64, CUMULATIVE),
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index db4f9a1..2229dc8 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1247,7 +1247,7 @@
 	tgsi_parse_free(&parse);
 
 	if (ctx->info.reads_samplemask &&
-	    (ctx->info.uses_linear_sample || ctx->info.uses_linear_sample)) {
+	    (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
 		inputs[1].enabled = true;
 	}
 
@@ -4331,8 +4331,32 @@
 
 		if (spilled) {
 			struct r600_bytecode_output cf;
-			int reg = r600_get_temp(ctx);
+			int reg = 0;
 			int r;
+			bool add_pending_output = true;
+
+			memset(&cf, 0, sizeof(struct r600_bytecode_output));
+			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
+				&cf.array_base, &cf.array_size);
+
+			/* If no component has spilled, reserve a register and add the spill code.
+			 * ctx->bc->n_pending_outputs is cleared after each instruction group. */
+			if (ctx->bc->n_pending_outputs == 0) {
+				reg = r600_get_temp(ctx);
+			} else {
+				/* If we are already spilling and the output address is the same as
+				* before then just reuse the same slot */
+				struct r600_bytecode_output *tmpl = &ctx->bc->pending_outputs[ctx->bc->n_pending_outputs-1];
+				if ((cf.array_base + idx == tmpl->array_base) ||
+				    (cf.array_base == tmpl->array_base &&
+				     tmpl->index_gpr == ctx->bc->ar_reg &&
+				     tgsi_dst->Register.Indirect)) {
+					reg = ctx->bc->pending_outputs[0].gpr;
+					add_pending_output = false;
+				} else {
+					reg = r600_get_temp(ctx);
+				}
+			}
 
 			r600_dst->sel = reg;
 			r600_dst->chan = swizzle;
@@ -4341,29 +4365,26 @@
 				r600_dst->clamp = 1;
 			}
 
-			// needs to be added after op using tgsi_dst
-			memset(&cf, 0, sizeof(struct r600_bytecode_output));
-			cf.op = CF_OP_MEM_SCRATCH;
-			cf.elem_size = 3;
-			cf.gpr = reg;
-			cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
-			cf.mark = 1;
-			cf.comp_mask = inst->Dst[0].Register.WriteMask;
-			cf.swizzle_x = 0;
-			cf.swizzle_y = 1;
-			cf.swizzle_z = 2;
-			cf.swizzle_w = 3;
-			cf.burst_count = 1;
+			/* Add new outputs as pending */
+			if (add_pending_output) {
+				cf.op = CF_OP_MEM_SCRATCH;
+				cf.elem_size = 3;
+				cf.gpr = reg;
+				cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
+				cf.mark = 1;
+				cf.comp_mask = inst->Dst[0].Register.WriteMask;
+				cf.swizzle_x = 0;
+				cf.swizzle_y = 1;
+				cf.swizzle_z = 2;
+				cf.swizzle_w = 3;
+				cf.burst_count = 1;
 
-			get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
-				&cf.array_base, &cf.array_size);
-
-			if (tgsi_dst->Register.Indirect) {
-				if (ctx->bc->chip_class < R700)
-					cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
-				else
-					cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
-				cf.index_gpr = ctx->bc->ar_reg;
+				if (tgsi_dst->Register.Indirect) {
+					if (ctx->bc->chip_class < R700)
+						cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
+					else
+						cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
+					cf.index_gpr = ctx->bc->ar_reg;
 			}
 			else {
 				cf.array_base += idx;
@@ -4376,7 +4397,7 @@
 
 			if (ctx->bc->chip_class >= R700)
 				r600_bytecode_need_wait_ack(ctx->bc, true);
-
+			}
 			return;
 		}
 		else {
@@ -5719,10 +5740,19 @@
 	struct r600_bytecode_alu alu;
 	int i, r, j;
 	unsigned write_mask = inst->Dst[0].Register.WriteMask;
+	int lasti = tgsi_last_instruction(write_mask);
 	int tmp0 = ctx->temp_reg;
 	int tmp1 = r600_get_temp(ctx);
 	int tmp2 = r600_get_temp(ctx);
 	int tmp3 = r600_get_temp(ctx);
+	int tmp4 = 0;
+
+	/* Use additional temp if dst register and src register are the same */
+	if (inst->Src[0].Register.Index == inst->Dst[0].Register.Index ||
+	    inst->Src[1].Register.Index == inst->Dst[0].Register.Index) {
+		tmp4 = r600_get_temp(ctx);
+	}
+
 	/* Unsigned path:
 	 *
 	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
@@ -6337,7 +6367,13 @@
 			alu.dst.chan = 2;
 			alu.dst.write = 1;
 		} else {
-			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			if (tmp4 > 0) {
+				alu.dst.sel = tmp4;
+				alu.dst.chan = i;
+				alu.dst.write = 1;
+			} else {
+				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			}
 		}
 
 		alu.src[0].sel = tmp1;
@@ -6379,7 +6415,13 @@
 				alu.op = ALU_OP3_CNDGE_INT;
 				alu.is_op3 = 1;
 
-				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+				if (tmp4 > 0) {
+					alu.dst.sel = tmp4;
+					alu.dst.chan = i;
+					alu.dst.write = 1;
+				} else {
+					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+				}
 
 				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
 				alu.src[1].sel = tmp0;
@@ -6415,7 +6457,13 @@
 				alu.op = ALU_OP3_CNDGE_INT;
 				alu.is_op3 = 1;
 
-				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+				if (tmp4 > 0) {
+					alu.dst.sel = tmp4;
+					alu.dst.chan = i;
+					alu.dst.write = 1;
+				} else {
+					tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+				}
 
 				alu.src[0].sel = tmp2;
 				alu.src[0].chan = 2;
@@ -6430,6 +6478,25 @@
 			}
 		}
 	}
+
+	if (tmp4 > 0) {
+		for (i = 0; i <= lasti; ++i) {
+			if (!(write_mask & (1<<i)))
+				continue;
+
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_MOV;
+			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+			alu.src[0].sel = tmp4;
+			alu.src[0].chan = i;
+
+			if (i == lasti)
+				alu.last = 1;
+			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
+				return r;
+		}
+	}
+
 	return 0;
 }
 
@@ -6625,11 +6692,15 @@
 static int tgsi_ssg(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
+	unsigned write_mask = inst->Dst[0].Register.WriteMask;
+	int last_inst = tgsi_last_instruction(write_mask);
 	struct r600_bytecode_alu alu;
 	int i, r;
 
 	/* tmp = (src > 0 ? 1 : src) */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i <= last_inst; i++) {
+		if (!(write_mask & (1 << i)))
+			continue;
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP3_CNDGT;
 		alu.is_op3 = 1;
@@ -6641,7 +6712,7 @@
 		alu.src[1].sel = V_SQ_ALU_SRC_1;
 		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
 
-		if (i == 3)
+		if (i == last_inst)
 			alu.last = 1;
 		r = r600_bytecode_add_alu(ctx->bc, &alu);
 		if (r)
@@ -6649,7 +6720,9 @@
 	}
 
 	/* dst = (-tmp > 0 ? -1 : tmp) */
-	for (i = 0; i < 4; i++) {
+	for (i = 0; i <= last_inst; i++) {
+		if (!(write_mask & (1 << i)))
+			continue;
 		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 		alu.op = ALU_OP3_CNDGT;
 		alu.is_op3 = 1;
@@ -6665,7 +6738,7 @@
 		alu.src[2].sel = ctx->temp_reg;
 		alu.src[2].chan = i;
 
-		if (i == 3)
+		if (i == last_inst)
 			alu.last = 1;
 		r = r600_bytecode_add_alu(ctx->bc, &alu);
 		if (r)
@@ -7372,9 +7445,10 @@
 {
 	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
 	struct r600_bytecode_tex tex;
+	struct r600_bytecode_tex grad_offs[3];
 	struct r600_bytecode_alu alu;
 	unsigned src_gpr;
-	int r, i, j;
+	int r, i, j, n_grad_offs = 0;
 	int opcode;
 	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
 				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
@@ -7396,6 +7470,7 @@
 	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
 	boolean has_txq_cube_array_z = false;
 	unsigned sampler_index_mode;
+	int array_index_offset_channel = -1;
 
 	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
 	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
@@ -7637,11 +7712,43 @@
 				if (r)
 					return r;
 
+				/* Evaluate the array index according to floor(idx + 0.5). This
+				 * needs to be done before merging the face select value, because
+				 * otherwise the fractional part of the array index will interfere
+				 * with the face select value */
+				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
+				alu.op = ALU_OP1_RNDNE;
+				alu.dst.sel = ctx->temp_reg;
+				alu.dst.chan = 3;
+				alu.dst.write = 1;
+				alu.last = 1;
+				r = r600_bytecode_add_alu(ctx->bc, &alu);
+				if (r)
+					return r;
+
+				/* Because the array slice index and the cube face index are merged
+				 * into one value we have to make sure the array slice index is >= 0,
+				 * otherwise the face selection will fail */
+				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+				alu.op = ALU_OP2_MAX;
+				alu.src[0].sel = ctx->temp_reg;
+				alu.src[0].chan = 3;
+				alu.src[1].sel = V_SQ_ALU_SRC_0;
+				alu.dst.sel = ctx->temp_reg;
+				alu.dst.chan = 3;
+				alu.dst.write = 1;
+				alu.last = 1;
+				r = r600_bytecode_add_alu(ctx->bc, &alu);
+				if (r)
+					return r;
+
 				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 				alu.op = ALU_OP3_MULADD;
 				alu.is_op3 = 1;
-				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
+				alu.src[0].sel = ctx->temp_reg;
+				alu.src[0].chan = 3;
 				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
 				alu.src[1].chan = 0;
 				alu.src[1].value = u_bitcast_f2u(8.0f);
@@ -7747,31 +7854,29 @@
 		}
 		for (i = 1; i < 3; i++) {
 			/* set gradients h/v */
-			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
-			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
+			struct r600_bytecode_tex *t = &grad_offs[n_grad_offs++];
+			memset(t, 0, sizeof(struct r600_bytecode_tex));
+			t->op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
 				FETCH_OP_SET_GRADIENTS_V;
-			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
-			tex.sampler_index_mode = sampler_index_mode;
-			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
-			tex.resource_index_mode = sampler_index_mode;
+			t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
+			t->sampler_index_mode = sampler_index_mode;
+			t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
+			t->resource_index_mode = sampler_index_mode;
 
-			tex.src_gpr = (i == 1) ? temp_h : temp_v;
-			tex.src_sel_x = 0;
-			tex.src_sel_y = 1;
-			tex.src_sel_z = 2;
-			tex.src_sel_w = 3;
+			t->src_gpr = (i == 1) ? temp_h : temp_v;
+			t->src_sel_x = 0;
+			t->src_sel_y = 1;
+			t->src_sel_z = 2;
+			t->src_sel_w = 3;
 
-			tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
-			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
+			t->dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
+			t->dst_sel_x = t->dst_sel_y = t->dst_sel_z = t->dst_sel_w = 7;
 			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
-				tex.coord_type_x = 1;
-				tex.coord_type_y = 1;
-				tex.coord_type_z = 1;
-				tex.coord_type_w = 1;
+				t->coord_type_x = 1;
+				t->coord_type_y = 1;
+				t->coord_type_z = 1;
+				t->coord_type_w = 1;
 			}
-			r = r600_bytecode_add_tex(ctx->bc, &tex);
-			if (r)
-				return r;
 		}
 	}
 
@@ -8176,32 +8281,37 @@
 	if (opcode == FETCH_OP_GATHER4 &&
 		inst->TexOffsets[0].File != TGSI_FILE_NULL &&
 		inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
+		struct r600_bytecode_tex *t;
 		opcode = FETCH_OP_GATHER4_O;
 
 		/* GATHER4_O/GATHER4_C_O use offset values loaded by
 		   SET_TEXTURE_OFFSETS instruction. The immediate offset values
 		   encoded in the instruction are ignored. */
-		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
-		tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
-		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
-		tex.sampler_index_mode = sampler_index_mode;
-		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
-		tex.resource_index_mode = sampler_index_mode;
+		t = &grad_offs[n_grad_offs++];
+		memset(t, 0, sizeof(struct r600_bytecode_tex));
+		t->op = FETCH_OP_SET_TEXTURE_OFFSETS;
+		t->sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
+		t->sampler_index_mode = sampler_index_mode;
+		t->resource_id = t->sampler_id + R600_MAX_CONST_BUFFERS;
+		t->resource_index_mode = sampler_index_mode;
 
-		tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
-		tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
-		tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
-		tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
-		tex.src_sel_w = 4;
+		t->src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
+		t->src_sel_x = inst->TexOffsets[0].SwizzleX;
+		t->src_sel_y = inst->TexOffsets[0].SwizzleY;
+		if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
+			 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)
+			/* make sure array index selector is 0, this is just a safety
+			 * precaution because TGSI seems to emit something strange here */
+			t->src_sel_z = 4;
+		else
+			t->src_sel_z = inst->TexOffsets[0].SwizzleZ;
 
-		tex.dst_sel_x = 7;
-		tex.dst_sel_y = 7;
-		tex.dst_sel_z = 7;
-		tex.dst_sel_w = 7;
+		t->src_sel_w = 4;
 
-		r = r600_bytecode_add_tex(ctx->bc, &tex);
-		if (r)
-			return r;
+		t->dst_sel_x = 7;
+		t->dst_sel_y = 7;
+		t->dst_sel_z = 7;
+		t->dst_sel_w = 7;
 	}
 
 	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
@@ -8351,18 +8461,42 @@
 		    opcode == FETCH_OP_SAMPLE_C_LB) {
 			/* the array index is read from Y */
 			tex.coord_type_y = 0;
+			array_index_offset_channel = tex.src_sel_y;
 		} else {
 			/* the array index is read from Z */
 			tex.coord_type_z = 0;
 			tex.src_sel_z = tex.src_sel_y;
+			array_index_offset_channel = tex.src_sel_z;
 		}
 	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
-		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
-		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-		    (ctx->bc->chip_class >= EVERGREEN)))
-		/* the array index is read from Z */
+		    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) {
 		tex.coord_type_z = 0;
+		array_index_offset_channel = tex.src_sel_z;
+	} else if  ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
+		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
+		    (ctx->bc->chip_class >= EVERGREEN))
+		/* the array index is read from Z, coordinate will be corrected elsewhere  */
+		tex.coord_type_z = 0;
+
+	/* We have array access to 1D or 2D ARRAY, the coordinates are not int ->
+	 * evaluate the array index  */
+	if (array_index_offset_channel >= 0 &&
+		 opcode != FETCH_OP_LD &&
+		 opcode != FETCH_OP_GET_TEXTURE_RESINFO) {
+		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+		alu.src[0].sel =  tex.src_gpr;
+		alu.src[0].chan =  array_index_offset_channel;
+		alu.src[0].rel = tex.src_rel;
+		alu.op = ALU_OP1_RNDNE;
+		alu.dst.sel = tex.src_gpr;
+		alu.dst.chan = array_index_offset_channel;
+		alu.dst.rel = tex.src_rel;
+		alu.dst.write = 1;
+		alu.last = 1;
+		r = r600_bytecode_add_alu(ctx->bc, &alu);
+		if (r)
+			return r;
+	}
 
 	/* mask unused source components */
 	if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
@@ -8384,6 +8518,13 @@
 		}
 	}
 
+	/* Emit set gradient and offset instructions. */
+	for (i = 0; i < n_grad_offs; ++i) {
+		r = r600_bytecode_add_tex(ctx->bc, &grad_offs[i]);
+		if (r)
+			return r;
+	}
+
 	r = r600_bytecode_add_tex(ctx->bc, &tex);
 	if (r)
 		return r;
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index a37a701..c26a38d 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -162,6 +162,7 @@
 				 enum pipe_format format,
 				 enum pipe_texture_target target,
 				 unsigned sample_count,
+				 unsigned storage_sample_count,
 				 unsigned usage)
 {
 	struct r600_screen *rscreen = (struct r600_screen*)screen;
@@ -172,8 +173,8 @@
 		return FALSE;
 	}
 
-	if (!util_format_is_supported(format, usage))
-		return FALSE;
+	if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+		return false;
 
 	if (sample_count > 1) {
 		if (!rscreen->has_msaa)
@@ -245,7 +246,7 @@
 
 static void r600_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -791,7 +792,7 @@
 
 static void r600_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_028E20_PA_CL_UCP0_X, 6*4);
@@ -1225,22 +1226,22 @@
 	rctx->framebuffer.do_update_surf_dirtiness = true;
 }
 
-static uint32_t sample_locs_2x[] = {
+static const uint32_t sample_locs_2x[] = {
 	FILL_SREG(-4, 4, 4, -4, -4, 4, 4, -4),
 	FILL_SREG(-4, 4, 4, -4, -4, 4, 4, -4),
 };
-static unsigned max_dist_2x = 4;
+static const unsigned max_dist_2x = 4;
 
-static uint32_t sample_locs_4x[] = {
+static const uint32_t sample_locs_4x[] = {
 	FILL_SREG(-2, -2, 2, 2, -6, 6, 6, -6),
 	FILL_SREG(-2, -2, 2, 2, -6, 6, 6, -6),
 };
-static unsigned max_dist_4x = 6;
-static uint32_t sample_locs_8x[] = {
+static const unsigned max_dist_4x = 6;
+static const uint32_t sample_locs_8x[] = {
 	FILL_SREG(-1,  1,  1,  5,  3, -5,  5,  3),
 	FILL_SREG(-7, -1, -3, -7,  7, -3, -5,  7),
 };
-static unsigned max_dist_8x = 7;
+static const unsigned max_dist_8x = 7;
 
 static void r600_get_sample_position(struct pipe_context *ctx,
 				     unsigned sample_count,
@@ -1283,7 +1284,7 @@
 
 static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	if (rctx->b.family == CHIP_R600) {
@@ -1350,7 +1351,7 @@
 
 static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	struct r600_surface **cb = (struct r600_surface**)&state->cbufs[0];
@@ -1516,7 +1517,7 @@
 
 static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 
 	if (G_028808_SPECIAL_OP(a->cb_color_control) == V_028808_SPECIAL_RESOLVE_BOX) {
@@ -1546,7 +1547,7 @@
 
 static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -1557,7 +1558,7 @@
 		radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
 		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, &rtex->resource,
-						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
+						  RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 		radeon_emit(cs, reloc_idx);
 	} else {
@@ -1567,7 +1568,7 @@
 
 static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_render_override =
@@ -1652,7 +1653,7 @@
 
 static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_config_state *a = (struct r600_config_state*)atom;
 
 	radeon_set_config_reg(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, a->sq_gpr_resource_mgmt_1);
@@ -1661,7 +1662,7 @@
 
 static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = rctx->vertex_buffer_state.dirty_mask;
 
 	while (dirty_mask) {
@@ -1701,7 +1702,7 @@
 				       unsigned reg_alu_constbuf_size,
 				       unsigned reg_alu_const_cache)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1775,7 +1776,7 @@
 				    struct r600_samplerview_state *state,
 				    unsigned resource_id_base)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1822,7 +1823,7 @@
 				unsigned resource_id_base,
 				unsigned border_color_reg)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
 
 	while (dirty_mask) {
@@ -1883,7 +1884,7 @@
 
 static void r600_emit_seamless_cube_map(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	unsigned tmp;
 
 	tmp = S_009508_DISABLE_CUBE_ANISO(1) |
@@ -1907,7 +1908,7 @@
 
 static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
@@ -1923,7 +1924,7 @@
 
 static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v2 = 0, primid = 0;
@@ -1958,7 +1959,7 @@
 
 static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -2855,7 +2856,7 @@
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
+	struct radeon_cmdbuf *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -2925,10 +2926,8 @@
 		cheight = cheight > copy_height ? copy_height : cheight;
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ,
-				      RADEON_PRIO_SDMA_TEXTURE);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_SDMA_TEXTURE);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, 0);
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, 0);
 		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 1, 0, size));
 		radeon_emit(cs, base >> 8);
 		radeon_emit(cs, (detile << 31) | (array_mode << 27) |
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 0f5e90d..e6c1b0b 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -77,7 +77,7 @@
 
 void r600_emit_alphatest_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_alphatest_state *a = (struct r600_alphatest_state*)atom;
 	unsigned alpha_ref = a->sx_alpha_ref;
 
@@ -241,7 +241,7 @@
 
 void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct pipe_blend_color *state = &rctx->blend_color.state;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
@@ -253,7 +253,7 @@
 
 void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_vgt_state *a = (struct r600_vgt_state *)atom;
 
 	radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, a->vgt_multi_prim_ib_reset_en);
@@ -287,7 +287,7 @@
 
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_stencil_ref_state *a = (struct r600_stencil_ref_state*)atom;
 
 	radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
@@ -570,7 +570,10 @@
 	/* Set vertex buffers. */
 	if (input) {
 		for (i = 0; i < count; i++) {
-			if (memcmp(&input[i], &vb[i], sizeof(struct pipe_vertex_buffer))) {
+			if ((input[i].buffer.resource != vb[i].buffer.resource) ||
+			    (vb[i].stride != input[i].stride) ||
+			    (vb[i].buffer_offset != input[i].buffer_offset) ||
+			    (vb[i].is_user_buffer != input[i].is_user_buffer)) {
 				if (input[i].buffer.resource) {
 					vb[i].stride = input[i].stride;
 					vb[i].buffer_offset = input[i].buffer_offset;
@@ -1307,7 +1310,7 @@
 }
 
 static void *r600_alloc_buf_consts(struct r600_context *rctx, int shader_type,
-				   int array_size, uint32_t *base_offset)
+				   unsigned array_size, uint32_t *base_offset)
 {
 	struct r600_shader_driver_constants_info *info = &rctx->driver_consts[shader_type];
 	if (array_size + R600_UCP_SIZE > info->alloc_size) {
@@ -1430,14 +1433,13 @@
 /* set sample xy locations as array of fragment shader constants */
 void r600_set_sample_locations_constant_buffer(struct r600_context *rctx)
 {
-	int i;
 	struct pipe_context *ctx = &rctx->b.b;
 
 	assert(rctx->framebuffer.nr_samples < R600_UCP_SIZE);
 	assert(rctx->framebuffer.nr_samples <= ARRAY_SIZE(rctx->sample_positions)/4);
 
 	memset(rctx->sample_positions, 0, 4 * 4 * 16);
-	for (i = 0; i < rctx->framebuffer.nr_samples; i++) {
+	for (unsigned i = 0; i < rctx->framebuffer.nr_samples; i++) {
 		ctx->get_sample_position(ctx, rctx->framebuffer.nr_samples, i, &rctx->sample_positions[4*i]);
 		/* Also fill in center-zeroed positions used for interpolateAtSample */
 		rctx->sample_positions[4*i + 2] = rctx->sample_positions[4*i + 0] - 0.5f;
@@ -1625,7 +1627,7 @@
 	if (scratch->dirty ||
 		unlikely(shader->scratch_space_needed != scratch->item_size ||
 		size > scratch->size)) {
-		struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+		struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 
 		scratch->dirty = false;
 
@@ -1969,7 +1971,7 @@
 
 void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_clip_misc_state *state = &rctx->clip_misc_state;
 
 	radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
@@ -1989,7 +1991,7 @@
 /* rast_prim is the primitive type after GS. */
 static inline void r600_emit_rasterizer_prim_state(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	enum pipe_prim_type rast_prim = rctx->current_rast_prim;
 
 	/* Skip this if not rendering lines. */
@@ -2016,7 +2018,7 @@
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_resource *indexbuf = info->has_user_indices ? NULL : info->index.resource;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
 	bool has_user_indices = info->has_user_indices;
 	uint64_t mask;
@@ -2536,7 +2538,7 @@
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->b.gfx.cs;
 	struct r600_pipe_shader *shader = ((struct r600_shader_state*)a)->shader;
 
 	if (!shader)
diff --git a/src/gallium/drivers/r600/r600_streamout.c b/src/gallium/drivers/r600/r600_streamout.c
index 7833406..de3e767 100644
--- a/src/gallium/drivers/r600/r600_streamout.c
+++ b/src/gallium/drivers/r600/r600_streamout.c
@@ -154,7 +154,7 @@
 
 static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	unsigned reg_strmout_cntl;
 
 	/* The register is at different places on different ASICs. */
@@ -180,7 +180,7 @@
 
 static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	uint16_t *stride_in_dw = rctx->streamout.stride_in_dw;
 	unsigned i, update_flags = 0;
@@ -253,7 +253,7 @@
 
 void r600_emit_streamout_end(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
diff --git a/src/gallium/drivers/r600/r600_test_dma.c b/src/gallium/drivers/r600/r600_test_dma.c
index 9e1ff9e..af86ad3 100644
--- a/src/gallium/drivers/r600/r600_test_dma.c
+++ b/src/gallium/drivers/r600/r600_test_dma.c
@@ -57,7 +57,7 @@
 {
 	struct pipe_transfer *t;
 	uint8_t *map;
-	int x,y,z;
+	unsigned x,y,z;
 
 	map = pipe_transfer_map_3d(ctx, tex, 0, PIPE_TRANSFER_WRITE,
 				   0, 0, 0, tex->width0, tex->height0,
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index 0c3489d..d08c6e5 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -245,8 +245,8 @@
 	if (!(ptex->flags & R600_RESOURCE_FLAG_FORCE_TILING))
 		flags |= RADEON_SURF_OPTIMIZE_FOR_SPACE;
 
-	r = rscreen->ws->surface_init(rscreen->ws, ptex, flags, bpe,
-				      array_mode, surface);
+	r = rscreen->ws->surface_init(rscreen->ws, ptex,
+				      flags, bpe, array_mode, surface);
 	if (r) {
 		return r;
 	}
@@ -616,8 +616,8 @@
 		bpe *= 2;
 	}
 
-	if (rscreen->ws->surface_init(rscreen->ws, &templ, flags, bpe,
-				      RADEON_SURF_MODE_2D, &fmask)) {
+	if (rscreen->ws->surface_init(rscreen->ws, &templ,
+				      flags, bpe, RADEON_SURF_MODE_2D, &fmask)) {
 		R600_ERR("Got error in surface_init while allocating FMASK.\n");
 		return;
 	}
@@ -1636,7 +1636,7 @@
 			desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
 
 		if (screen->is_format_supported(screen, tex->format,
-						tex->target, 0,
+						tex->target, 0, 0,
 						PIPE_BIND_RENDER_TARGET)) {
 			pipe->clear_render_target(pipe, sf, &color,
 						  box->x, box->y,
@@ -1794,6 +1794,16 @@
 		    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
 			continue;
 
+		/* Use a slow clear for small surfaces where the cost of
+		 * the eliminate pass can be higher than the benefit of fast
+		 * clear. AMDGPU-pro does this, but the numbers may differ.
+		 *
+		 * This helps on both dGPUs and APUs, even small ones.
+		 */
+		if (tex->resource.b.b.nr_samples <= 1 &&
+		    tex->resource.b.b.width0 * tex->resource.b.b.height0 <= 300 * 300)
+			continue;
+
 		{
 			/* 128-bit formats are unusupported */
 			if (tex->surface.bpe > 8) {
diff --git a/src/gallium/drivers/r600/r600_viewport.c b/src/gallium/drivers/r600/r600_viewport.c
index 0797f93..7a5bf8f 100644
--- a/src/gallium/drivers/r600/r600_viewport.c
+++ b/src/gallium/drivers/r600/r600_viewport.c
@@ -154,7 +154,7 @@
 }
 
 static void r600_emit_one_scissor(struct r600_common_context *rctx,
-				  struct radeon_winsys_cs *cs,
+				  struct radeon_cmdbuf *cs,
 				  struct r600_signed_scissor *vp_scissor,
 				  struct pipe_scissor_state *scissor)
 {
@@ -185,7 +185,7 @@
 static void r600_emit_guardband(struct r600_common_context *rctx,
 				struct r600_signed_scissor *vp_as_scissor)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct pipe_viewport_state vp;
 	float left, top, right, bottom, max_range, guardband_x, guardband_y;
 
@@ -235,7 +235,7 @@
 
 static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct pipe_scissor_state *states = rctx->scissors.states;
 	unsigned mask = rctx->scissors.dirty_mask;
 	bool scissor_enabled = rctx->scissor_enabled;
@@ -306,7 +306,7 @@
 static void r600_emit_one_viewport(struct r600_common_context *rctx,
 				   struct pipe_viewport_state *state)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 
 	radeon_emit(cs, fui(state->scale[0]));
 	radeon_emit(cs, fui(state->translate[0]));
@@ -318,7 +318,7 @@
 
 static void r600_emit_viewports(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct pipe_viewport_state *states = rctx->viewports.states;
 	unsigned mask = rctx->viewports.dirty_mask;
 
@@ -348,7 +348,7 @@
 
 static void r600_emit_depth_ranges(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct radeon_cmdbuf *cs = rctx->gfx.cs;
 	struct pipe_viewport_state *states = rctx->viewports.states;
 	unsigned mask = rctx->viewports.depth_range_dirty_mask;
 	float zmin, zmax;
diff --git a/src/gallium/drivers/r600/radeon_uvd.c b/src/gallium/drivers/r600/radeon_uvd.c
index 17ff3d5..ac4f40e 100644
--- a/src/gallium/drivers/r600/radeon_uvd.c
+++ b/src/gallium/drivers/r600/radeon_uvd.c
@@ -73,7 +73,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys*		ws;
-	struct radeon_winsys_cs*	cs;
+	struct radeon_cmdbuf*	cs;
 
 	unsigned			cur_buffer;
 
@@ -120,8 +120,7 @@
 	int reloc_idx;
 
 	reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-					   domain,
-					  RADEON_PRIO_UVD);
+					   domain, 0);
 	if (!dec->use_legacy) {
 		uint64_t addr;
 		addr = dec->ws->buffer_get_virtual_address(buf);
diff --git a/src/gallium/drivers/r600/radeon_vce.c b/src/gallium/drivers/r600/radeon_vce.c
index 533bc18..e49e5aa 100644
--- a/src/gallium/drivers/r600/radeon_vce.c
+++ b/src/gallium/drivers/r600/radeon_vce.c
@@ -518,7 +518,7 @@
 	int reloc_idx;
 
 	reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-					   domain, RADEON_PRIO_VCE);
+					   domain, 0);
 	if (enc->use_vm) {
 		uint64_t addr;
 		addr = enc->ws->buffer_get_virtual_address(buf);
diff --git a/src/gallium/drivers/r600/radeon_vce.h b/src/gallium/drivers/r600/radeon_vce.h
index f79e65c..71f0287 100644
--- a/src/gallium/drivers/r600/radeon_vce.h
+++ b/src/gallium/drivers/r600/radeon_vce.h
@@ -387,7 +387,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys*		ws;
-	struct radeon_winsys_cs*	cs;
+	struct radeon_cmdbuf*	cs;
 
 	rvce_get_buffer			get_buffer;
 
diff --git a/src/gallium/drivers/r600/radeon_video.c b/src/gallium/drivers/r600/radeon_video.c
index c7acc3d..02fcf77 100644
--- a/src/gallium/drivers/r600/radeon_video.c
+++ b/src/gallium/drivers/r600/radeon_video.c
@@ -85,7 +85,7 @@
 }
 
 /* reallocate a buffer, preserving its content */
-bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
+bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,
 			struct rvid_buffer *new_buf, unsigned new_size)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen *)screen;
diff --git a/src/gallium/drivers/r600/radeon_video.h b/src/gallium/drivers/r600/radeon_video.h
index 3347c4e..8befc2f 100644
--- a/src/gallium/drivers/r600/radeon_video.h
+++ b/src/gallium/drivers/r600/radeon_video.h
@@ -58,7 +58,7 @@
 void rvid_destroy_buffer(struct rvid_buffer *buffer);
 
 /* reallocate a buffer, preserving its content */
-bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
+bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,
 			struct rvid_buffer *new_buf, unsigned new_size);
 
 /* clear the buffer with zeros */
diff --git a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
index 5681fdc..b7d87ea 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_builder.cpp
@@ -567,7 +567,7 @@
 	const fetch_op_info *fop = bc.op_ptr;
 	unsigned gds_op = (ctx.fetch_opcode(bc.op) >> 8) & 0x3f;
 	unsigned mem_op = 4;
-	assert(fop->flags && FF_GDS);
+	assert(fop->flags & FF_GDS);
 
 	if (bc.op == FETCH_OP_TF_WRITE) {
 		mem_op = 5;
diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index a7b8282..4b909f8 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -617,7 +617,7 @@
 	int r;
 	unsigned i = cf->bc.addr << 1, cnt = cf->bc.count + 1;
 
-	if (cf->bc.op_ptr->flags && FF_GDS)
+	if (cf->bc.op_ptr->flags & FF_GDS)
 		cf->subtype = NST_GDS_CLAUSE;
 	else
 		cf->subtype = NST_TEX_CLAUSE;
diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp
index ad79845..05674ff 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -332,7 +332,7 @@
 }
 
 void expr_handler::apply_alu_dst_mod(const bc_alu &bc, literal &v) {
-	float omod_coeff[] = {2.0f, 4.0, 0.5f};
+	const float omod_coeff[] = {2.0f, 4.0, 0.5f};
 
 	if (bc.omod)
 		v = v.f * omod_coeff[bc.omod - 1];
diff --git a/src/gallium/drivers/r600/sb/sb_ra_init.cpp b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
index 985e179..c557b86 100644
--- a/src/gallium/drivers/r600/sb/sb_ra_init.cpp
+++ b/src/gallium/drivers/r600/sb/sb_ra_init.cpp
@@ -545,10 +545,13 @@
 			continue;
 
 		value *t = sh.create_temp_value();
+		alu_node* n = sh.create_copy_mov(t, v);
+		if (loop)
+			n->flags |= NF_DONT_MOVE;
 		if (loop && id == 0)
-			loc->insert_before(sh.create_copy_mov(t, v));
+			loc->insert_before(n);
 		else
-			loc->push_back(sh.create_copy_mov(t, v));
+			loc->push_back(n);
 		v = t;
 
 		sh.coal.add_edge(v, d, coalescer::phi_cost);
@@ -566,9 +569,10 @@
 
 		value *t = sh.create_temp_value();
 		node *cp = sh.create_copy_mov(v, t);
-		if (loop)
+		if (loop) {
+			cp->flags |= NF_DONT_MOVE;
 			static_cast<container_node*>(loc)->push_front(cp);
-		else
+		} else
 			loc->insert_after(cp);
 		v = t;
 	}
diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index ffc6601..fe887c8 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -1154,14 +1154,21 @@
 
 	assert(!ready.empty() || !ready_copies.empty());
 
-	bool improving = true;
+	/* This number is rather arbitrary, important is that the scheduler has
+	 * more than one try to create an instruction group
+	 */
+	int improving = 10;
 	int last_pending = pending.count();
-	while (improving) {
+	while (improving > 0) {
 		prev_regmap = regmap;
 		if (!prepare_alu_group()) {
 
 			int new_pending = pending.count();
-			improving = (new_pending < last_pending) || (last_pending == 0);
+			if ((new_pending < last_pending) || (last_pending == 0))
+				improving = 10;
+			else
+				--improving;
+
 			last_pending = new_pending;
 
 			if (alu.current_idx[0] || alu.current_idx[1]) {
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
deleted file mode 100644
index 66e9a0b..0000000
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- * Copyright 2013 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * This file is going to be removed.
- */
-
-#ifndef R600_PIPE_COMMON_H
-#define R600_PIPE_COMMON_H
-
-#include <stdio.h>
-
-#include "amd/common/ac_binary.h"
-
-#include "radeon/radeon_winsys.h"
-
-#include "util/disk_cache.h"
-#include "util/u_blitter.h"
-#include "util/list.h"
-#include "util/u_range.h"
-#include "util/slab.h"
-#include "util/u_suballoc.h"
-#include "util/u_transfer.h"
-#include "util/u_threaded_context.h"
-
-struct u_log_context;
-struct si_screen;
-struct si_context;
-struct si_perfcounters;
-struct tgsi_shader_info;
-struct si_qbo_state;
-
-/* Only 32-bit buffer allocations are supported, gallium doesn't support more
- * at the moment.
- */
-struct r600_resource {
-	struct threaded_resource	b;
-
-	/* Winsys objects. */
-	struct pb_buffer		*buf;
-	uint64_t			gpu_address;
-	/* Memory usage if the buffer placement is optimal. */
-	uint64_t			vram_usage;
-	uint64_t			gart_usage;
-
-	/* Resource properties. */
-	uint64_t			bo_size;
-	unsigned			bo_alignment;
-	enum radeon_bo_domain		domains;
-	enum radeon_bo_flag		flags;
-	unsigned			bind_history;
-	int				max_forced_staging_uploads;
-
-	/* The buffer range which is initialized (with a write transfer,
-	 * streamout, DMA, or as a random access target). The rest of
-	 * the buffer is considered invalid and can be mapped unsynchronized.
-	 *
-	 * This allows unsychronized mapping of a buffer range which hasn't
-	 * been used yet. It's for applications which forget to use
-	 * the unsynchronized map flag and expect the driver to figure it out.
-         */
-	struct util_range		valid_buffer_range;
-
-	/* For buffers only. This indicates that a write operation has been
-	 * performed by TC L2, but the cache hasn't been flushed.
-	 * Any hw block which doesn't use or bypasses TC L2 should check this
-	 * flag and flush the cache before using the buffer.
-	 *
-	 * For example, TC L2 must be flushed if a buffer which has been
-	 * modified by a shader store instruction is about to be used as
-	 * an index buffer. The reason is that VGT DMA index fetching doesn't
-	 * use TC L2.
-	 */
-	bool				TC_L2_dirty;
-
-	/* Whether the resource has been exported via resource_get_handle. */
-	unsigned			external_usage; /* PIPE_HANDLE_USAGE_* */
-
-	/* Whether this resource is referenced by bindless handles. */
-	bool				texture_handle_allocated;
-	bool				image_handle_allocated;
-};
-
-struct r600_transfer {
-	struct threaded_transfer	b;
-	struct r600_resource		*staging;
-	unsigned			offset;
-};
-
-struct r600_fmask_info {
-	uint64_t offset;
-	uint64_t size;
-	unsigned alignment;
-	unsigned pitch_in_pixels;
-	unsigned bank_height;
-	unsigned slice_tile_max;
-	unsigned tile_mode_index;
-	unsigned tile_swizzle;
-};
-
-struct r600_cmask_info {
-	uint64_t offset;
-	uint64_t size;
-	unsigned alignment;
-	unsigned slice_tile_max;
-	uint64_t base_address_reg;
-};
-
-struct r600_texture {
-	struct r600_resource		resource;
-
-	struct radeon_surf		surface;
-	uint64_t			size;
-	struct r600_texture		*flushed_depth_texture;
-
-	/* Colorbuffer compression and fast clear. */
-	struct r600_fmask_info		fmask;
-	struct r600_cmask_info		cmask;
-	struct r600_resource		*cmask_buffer;
-	uint64_t			dcc_offset; /* 0 = disabled */
-	unsigned			cb_color_info; /* fast clear enable bit */
-	unsigned			color_clear_value[2];
-	unsigned			last_msaa_resolve_target_micro_mode;
-	unsigned			num_level0_transfers;
-
-	/* Depth buffer compression and fast clear. */
-	uint64_t			htile_offset;
-	float				depth_clear_value;
-	uint16_t			dirty_level_mask; /* each bit says if that mipmap is compressed */
-	uint16_t			stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
-	enum pipe_format		db_render_format:16;
-	uint8_t				stencil_clear_value;
-	bool				tc_compatible_htile:1;
-	bool				depth_cleared:1; /* if it was cleared at least once */
-	bool				stencil_cleared:1; /* if it was cleared at least once */
-	bool				upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */
-	bool				is_depth:1;
-	bool				db_compatible:1;
-	bool				can_sample_z:1;
-	bool				can_sample_s:1;
-
-	/* We need to track DCC dirtiness, because st/dri usually calls
-	 * flush_resource twice per frame (not a bug) and we don't wanna
-	 * decompress DCC twice. Also, the dirty tracking must be done even
-	 * if DCC isn't used, because it's required by the DCC usage analysis
-	 * for a possible future enablement.
-	 */
-	bool				separate_dcc_dirty:1;
-	/* Statistics gathering for the DCC enablement heuristic. */
-	bool				dcc_gather_statistics:1;
-	/* Counter that should be non-zero if the texture is bound to a
-	 * framebuffer.
-	 */
-	unsigned                        framebuffers_bound;
-	/* Whether the texture is a displayable back buffer and needs DCC
-	 * decompression, which is expensive. Therefore, it's enabled only
-	 * if statistics suggest that it will pay off and it's allocated
-	 * separately. It can't be bound as a sampler by apps. Limited to
-	 * target == 2D and last_level == 0. If enabled, dcc_offset contains
-	 * the absolute GPUVM address, not the relative one.
-	 */
-	struct r600_resource		*dcc_separate_buffer;
-	/* When DCC is temporarily disabled, the separate buffer is here. */
-	struct r600_resource		*last_dcc_separate_buffer;
-	/* Estimate of how much this color buffer is written to in units of
-	 * full-screen draws: ps_invocations / (width * height)
-	 * Shader kills, late Z, and blending with trivial discards make it
-	 * inaccurate (we need to count CB updates, not PS invocations).
-	 */
-	unsigned			ps_draw_ratio;
-	/* The number of clears since the last DCC usage analysis. */
-	unsigned			num_slow_clears;
-};
-
-struct r600_surface {
-	struct pipe_surface		base;
-
-	/* These can vary with block-compressed textures. */
-	uint16_t width0;
-	uint16_t height0;
-
-	bool color_initialized:1;
-	bool depth_initialized:1;
-
-	/* Misc. color flags. */
-	bool color_is_int8:1;
-	bool color_is_int10:1;
-	bool dcc_incompatible:1;
-
-	/* Color registers. */
-	unsigned cb_color_info;
-	unsigned cb_color_view;
-	unsigned cb_color_attrib;
-	unsigned cb_color_attrib2;	/* GFX9 and later */
-	unsigned cb_dcc_control;	/* VI and later */
-	unsigned spi_shader_col_format:8;	/* no blending, no alpha-to-coverage. */
-	unsigned spi_shader_col_format_alpha:8;	/* alpha-to-coverage */
-	unsigned spi_shader_col_format_blend:8;	/* blending without alpha. */
-	unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */
-
-	/* DB registers. */
-	uint64_t db_depth_base;		/* DB_Z_READ/WRITE_BASE */
-	uint64_t db_stencil_base;
-	uint64_t db_htile_data_base;
-	unsigned db_depth_info;
-	unsigned db_z_info;
-	unsigned db_z_info2;		/* GFX9+ */
-	unsigned db_depth_view;
-	unsigned db_depth_size;
-	unsigned db_depth_slice;
-	unsigned db_stencil_info;
-	unsigned db_stencil_info2;	/* GFX9+ */
-	unsigned db_htile_surface;
-};
-
-struct si_mmio_counter {
-	unsigned busy;
-	unsigned idle;
-};
-
-union si_mmio_counters {
-	struct {
-		/* For global GPU load including SDMA. */
-		struct si_mmio_counter gpu;
-
-		/* GRBM_STATUS */
-		struct si_mmio_counter spi;
-		struct si_mmio_counter gui;
-		struct si_mmio_counter ta;
-		struct si_mmio_counter gds;
-		struct si_mmio_counter vgt;
-		struct si_mmio_counter ia;
-		struct si_mmio_counter sx;
-		struct si_mmio_counter wd;
-		struct si_mmio_counter bci;
-		struct si_mmio_counter sc;
-		struct si_mmio_counter pa;
-		struct si_mmio_counter db;
-		struct si_mmio_counter cp;
-		struct si_mmio_counter cb;
-
-		/* SRBM_STATUS2 */
-		struct si_mmio_counter sdma;
-
-		/* CP_STAT */
-		struct si_mmio_counter pfp;
-		struct si_mmio_counter meq;
-		struct si_mmio_counter me;
-		struct si_mmio_counter surf_sync;
-		struct si_mmio_counter cp_dma;
-		struct si_mmio_counter scratch_ram;
-	} named;
-	unsigned array[0];
-};
-
-struct r600_memory_object {
-	struct pipe_memory_object	b;
-	struct pb_buffer		*buf;
-	uint32_t			stride;
-	uint32_t			offset;
-};
-
-/* This encapsulates a state or an operation which can emitted into the GPU
- * command stream. */
-struct r600_atom {
-	void (*emit)(struct si_context *ctx, struct r600_atom *state);
-	unsigned short		id;
-};
-
-/* Saved CS data for debugging features. */
-struct radeon_saved_cs {
-	uint32_t			*ib;
-	unsigned			num_dw;
-
-	struct radeon_bo_list_item	*bo_list;
-	unsigned			bo_count;
-};
-
-/* r600_perfcounters.c */
-void si_perfcounters_destroy(struct si_screen *sscreen);
-
-
-/* Inline helpers. */
-
-static inline struct r600_resource *r600_resource(struct pipe_resource *r)
-{
-	return (struct r600_resource*)r;
-}
-
-static inline void
-r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
-{
-	pipe_resource_reference((struct pipe_resource **)ptr,
-				(struct pipe_resource *)res);
-}
-
-static inline void
-r600_texture_reference(struct r600_texture **ptr, struct r600_texture *res)
-{
-	pipe_resource_reference((struct pipe_resource **)ptr, &res->resource.b.b);
-}
-
-static inline bool
-vi_dcc_enabled(struct r600_texture *tex, unsigned level)
-{
-	return tex->dcc_offset && level < tex->surface.num_dcc_levels;
-}
-
-#endif
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 92336ed..a7ef425 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -67,7 +67,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys*		ws;
-	struct radeon_winsys_cs*	cs;
+	struct radeon_cmdbuf*	cs;
 
 	unsigned			cur_buffer;
 
@@ -116,8 +116,7 @@
 	int reloc_idx;
 
 	reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-					   domain,
-					  RADEON_PRIO_UVD);
+					   domain, 0);
 	if (!dec->use_legacy) {
 		uint64_t addr;
 		addr = dec->ws->buffer_get_virtual_address(buf);
@@ -1004,25 +1003,35 @@
 	size++;
 
 	for (i = 0; i < 2; ++i) {
+		int num = 0, j;
+
 		if (pic->huffman_table.load_huffman_table[i] == 0)
 			continue;
 
 		buf[size++] = 0x00 | i;
 		memcpy((buf + size), &pic->huffman_table.table[i].num_dc_codes, 16);
 		size += 16;
-		memcpy((buf + size), &pic->huffman_table.table[i].dc_values, 12);
-		size += 12;
+		for (j = 0; j < 16; ++j)
+			num += pic->huffman_table.table[i].num_dc_codes[j];
+		assert(num <= 12);
+		memcpy((buf + size), &pic->huffman_table.table[i].dc_values, num);
+		size += num;
 	}
 
 	for (i = 0; i < 2; ++i) {
+		int num = 0, j;
+
 		if (pic->huffman_table.load_huffman_table[i] == 0)
 			continue;
 
 		buf[size++] = 0x10 | i;
 		memcpy((buf + size), &pic->huffman_table.table[i].num_ac_codes, 16);
 		size += 16;
-		memcpy((buf + size), &pic->huffman_table.table[i].ac_values, 162);
-		size += 162;
+		for (j = 0; j < 16; ++j)
+			num += pic->huffman_table.table[i].num_ac_codes[j];
+		assert(num <= 162);
+		memcpy((buf + size), &pic->huffman_table.table[i].ac_values, num);
+		size += num;
 	}
 
 	bs = (uint16_t*)&buf[len_pos];
diff --git a/src/gallium/drivers/radeon/radeon_uvd_enc.h b/src/gallium/drivers/radeon/radeon_uvd_enc.h
index 20c340d..63176d2 100644
--- a/src/gallium/drivers/radeon/radeon_uvd_enc.h
+++ b/src/gallium/drivers/radeon/radeon_uvd_enc.h
@@ -433,7 +433,7 @@
 
    struct pipe_screen *screen;
    struct radeon_winsys *ws;
-   struct radeon_winsys_cs *cs;
+   struct radeon_cmdbuf *cs;
 
    radeon_uvd_enc_get_buffer get_buffer;
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd_enc_1_1.c b/src/gallium/drivers/radeon/radeon_uvd_enc_1_1.c
index 42a9fa9..ddb2197 100644
--- a/src/gallium/drivers/radeon/radeon_uvd_enc_1_1.c
+++ b/src/gallium/drivers/radeon/radeon_uvd_enc_1_1.c
@@ -55,7 +55,7 @@
                           enum radeon_bo_domain domain, signed offset)
 {
    enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-                          domain, RADEON_PRIO_VCE);
+                          domain, 0);
    uint64_t addr;
    addr = enc->ws->buffer_get_virtual_address(buf);
    addr = addr + offset;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 22168b5..8972253 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -506,7 +506,7 @@
 		break;
 
 	default:
-		if ((sscreen->info.vce_fw_version & (0xff << 24)) == FW_53) {
+		if ((sscreen->info.vce_fw_version & (0xff << 24)) >= FW_53) {
 			si_vce_52_init(enc);
 			si_get_pic_param = si_vce_52_get_param;
 		} else
@@ -542,7 +542,7 @@
 	case FW_52_8_3:
 		return true;
 	default:
-		if ((sscreen->info.vce_fw_version & (0xff << 24)) == FW_53)
+		if ((sscreen->info.vce_fw_version & (0xff << 24)) >= FW_53)
 			return true;
 		else
 			return false;
@@ -559,7 +559,7 @@
 	int reloc_idx;
 
 	reloc_idx = enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-					   domain, RADEON_PRIO_VCE);
+					   domain, 0);
 	if (enc->use_vm) {
 		uint64_t addr;
 		addr = enc->ws->buffer_get_virtual_address(buf);
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 7f30877..cf625e6 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -381,7 +381,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys*		ws;
-	struct radeon_winsys_cs*	cs;
+	struct radeon_cmdbuf*	cs;
 
 	rvce_get_buffer			get_buffer;
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 04e9d7f..66b54da 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -34,7 +34,7 @@
 
 #include "vl/vl_video_buffer.h"
 
-#include "r600_pipe_common.h"
+#include "si_pipe.h"
 #include "radeon_video.h"
 #include "radeon_vce.h"
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index b4f9771..f4cbc9b 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -34,7 +34,7 @@
 
 #include "vl/vl_video_buffer.h"
 
-#include "r600_pipe_common.h"
+#include "si_pipe.h"
 #include "radeon_video.h"
 #include "radeon_vce.h"
 
diff --git a/src/gallium/drivers/radeon/radeon_vcn_dec.c b/src/gallium/drivers/radeon/radeon_vcn_dec.c
index 046b371..c2e2204 100644
--- a/src/gallium/drivers/radeon/radeon_vcn_dec.c
+++ b/src/gallium/drivers/radeon/radeon_vcn_dec.c
@@ -66,7 +66,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys		*ws;
-	struct radeon_winsys_cs		*cs;
+	struct radeon_cmdbuf		*cs;
 
 	void				*msg;
 	uint32_t			*fb;
@@ -759,15 +759,15 @@
 
 	result.vop_time_increment_resolution = pic->vop_time_increment_resolution;
 
-	result.short_video_header |= pic->short_video_header << 0;
-	result.interlaced |= pic->interlaced << 2;
-        result.load_intra_quant_mat |= 1 << 3;
-	result.load_nonintra_quant_mat |= 1 << 4;
-	result.quarter_sample |= pic->quarter_sample << 5;
-	result.complexity_estimation_disable |= 1 << 6;
-	result.resync_marker_disable |= pic->resync_marker_disable << 7;
-	result.newpred_enable |= 0 << 10; //
-	result.reduced_resolution_vop_enable |= 0 << 11;
+	result.short_video_header = pic->short_video_header;
+	result.interlaced = pic->interlaced;
+	result.load_intra_quant_mat = 1;
+	result.load_nonintra_quant_mat = 1;
+	result.quarter_sample = pic->quarter_sample;
+	result.complexity_estimation_disable = 1;
+	result.resync_marker_disable = pic->resync_marker_disable;
+	result.newpred_enable = 0;
+	result.reduced_resolution_vop_enable = 0;
 
 	result.quant_type = pic->quant_type;
 
@@ -808,10 +808,10 @@
 					struct pipe_video_buffer *target,
 					struct pipe_picture_desc *picture)
 {
-	struct r600_texture *luma = (struct r600_texture *)
-				((struct vl_video_buffer *)target)->resources[0];
-	struct r600_texture *chroma = (struct r600_texture *)
-				((struct vl_video_buffer *)target)->resources[1];
+	struct si_texture *luma = (struct si_texture *)
+				  ((struct vl_video_buffer *)target)->resources[0];
+	struct si_texture *chroma = (struct si_texture *)
+				    ((struct vl_video_buffer *)target)->resources[1];
 	rvcn_dec_message_header_t *header;
 	rvcn_dec_message_index_t *index;
 	rvcn_dec_message_decode_t *decode;
@@ -853,8 +853,8 @@
 	decode->bsd_size = align(dec->bs_size, 128);
 	decode->dpb_size = dec->dpb.res->buf->size;
 	decode->dt_size =
-		((struct r600_resource *)((struct vl_video_buffer *)target)->resources[0])->buf->size +
-		((struct r600_resource *)((struct vl_video_buffer *)target)->resources[1])->buf->size;
+		r600_resource(((struct vl_video_buffer *)target)->resources[0])->buf->size +
+		r600_resource(((struct vl_video_buffer *)target)->resources[1])->buf->size;
 
 	decode->sct_size = 0;
 	decode->sc_coeff_size = 0;
@@ -986,7 +986,7 @@
 	if (dec->ctx.res)
 		decode->hw_ctxt_size = dec->ctx.res->buf->size;
 
-	return luma->resource.buf;
+	return luma->buffer.buf;
 }
 
 static void rvcn_dec_message_destroy(struct radeon_decoder *dec)
@@ -1033,7 +1033,7 @@
 	uint64_t addr;
 
 	dec->ws->cs_add_buffer(dec->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-			   domain, RADEON_PRIO_UVD);
+			   domain, 0);
 	addr = dec->ws->buffer_get_virtual_address(buf);
 	addr = addr + off;
 
diff --git a/src/gallium/drivers/radeon/radeon_vcn_enc.h b/src/gallium/drivers/radeon/radeon_vcn_enc.h
index 9f0c909..04685c6 100644
--- a/src/gallium/drivers/radeon/radeon_vcn_enc.h
+++ b/src/gallium/drivers/radeon/radeon_vcn_enc.h
@@ -455,7 +455,7 @@
 
 	struct pipe_screen		*screen;
 	struct radeon_winsys*		ws;
-	struct radeon_winsys_cs*	cs;
+	struct radeon_cmdbuf*	cs;
 
 	radeon_enc_get_buffer			get_buffer;
 
diff --git a/src/gallium/drivers/radeon/radeon_vcn_enc_1_2.c b/src/gallium/drivers/radeon/radeon_vcn_enc_1_2.c
index 07493d8..6632451 100644
--- a/src/gallium/drivers/radeon/radeon_vcn_enc_1_2.c
+++ b/src/gallium/drivers/radeon/radeon_vcn_enc_1_2.c
@@ -34,7 +34,7 @@
 
 #include "vl/vl_video_buffer.h"
 
-#include "r600_pipe_common.h"
+#include "si_pipe.h"
 #include "radeon_video.h"
 #include "radeon_vcn_enc.h"
 
@@ -56,7 +56,7 @@
 								  signed offset)
 {
 	enc->ws->cs_add_buffer(enc->cs, buf, usage | RADEON_USAGE_SYNCHRONIZED,
-									   domain, RADEON_PRIO_VCE);
+                               domain, 0);
 	uint64_t addr;
 	addr = enc->ws->buffer_get_virtual_address(buf);
 	addr = addr + offset;
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index a2947df..749f30c 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -63,9 +63,8 @@
 	 * able to move buffers around individually, so request a
 	 * non-sub-allocated buffer.
 	 */
-	buffer->res = (struct r600_resource *)
-		pipe_buffer_create(screen, PIPE_BIND_SHARED,
-				   usage, size);
+	buffer->res = r600_resource(pipe_buffer_create(screen, PIPE_BIND_SHARED,
+						       usage, size));
 
 	return buffer->res != NULL;
 }
@@ -77,7 +76,7 @@
 }
 
 /* reallocate a buffer, preserving its content */
-bool si_vid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
+bool si_vid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,
 			  struct rvid_buffer *new_buf, unsigned new_size)
 {
 	struct si_screen *sscreen = (struct si_screen *)screen;
diff --git a/src/gallium/drivers/radeon/radeon_video.h b/src/gallium/drivers/radeon/radeon_video.h
index eee550c..71904b3 100644
--- a/src/gallium/drivers/radeon/radeon_video.h
+++ b/src/gallium/drivers/radeon/radeon_video.h
@@ -54,7 +54,7 @@
 void si_vid_destroy_buffer(struct rvid_buffer *buffer);
 
 /* reallocate a buffer, preserving its content */
-bool si_vid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
+bool si_vid_resize_buffer(struct pipe_screen *screen, struct radeon_cmdbuf *cs,
 			  struct rvid_buffer *new_buf, unsigned new_size);
 
 /* clear the buffer with zeros */
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index fae4fb7..99a793f 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -115,71 +115,65 @@
     RADEON_CS_THREAD_TIME,
 };
 
-/* Each group of four has the same priority. */
 enum radeon_bo_priority {
+    /* Each group of two has the same priority. */
     RADEON_PRIO_FENCE = 0,
     RADEON_PRIO_TRACE,
-    RADEON_PRIO_SO_FILLED_SIZE,
+
+    RADEON_PRIO_SO_FILLED_SIZE = 2,
     RADEON_PRIO_QUERY,
 
     RADEON_PRIO_IB1 = 4, /* main IB submitted to the kernel */
     RADEON_PRIO_IB2, /* IB executed with INDIRECT_BUFFER */
-    RADEON_PRIO_DRAW_INDIRECT,
+
+    RADEON_PRIO_DRAW_INDIRECT = 6,
     RADEON_PRIO_INDEX_BUFFER,
 
-    RADEON_PRIO_VCE = 8,
-    RADEON_PRIO_UVD,
-    RADEON_PRIO_SDMA_BUFFER,
-    RADEON_PRIO_SDMA_TEXTURE,
-
-    RADEON_PRIO_CP_DMA = 12,
-
-    RADEON_PRIO_CONST_BUFFER = 16,
-    RADEON_PRIO_DESCRIPTORS,
+    RADEON_PRIO_CP_DMA = 8,
     RADEON_PRIO_BORDER_COLORS,
 
-    RADEON_PRIO_SAMPLER_BUFFER = 20,
+    RADEON_PRIO_CONST_BUFFER = 10,
+    RADEON_PRIO_DESCRIPTORS,
+
+    RADEON_PRIO_SAMPLER_BUFFER = 12,
     RADEON_PRIO_VERTEX_BUFFER,
 
-    RADEON_PRIO_SHADER_RW_BUFFER = 24,
+    RADEON_PRIO_SHADER_RW_BUFFER = 14,
     RADEON_PRIO_COMPUTE_GLOBAL,
 
-    RADEON_PRIO_SAMPLER_TEXTURE = 28,
+    RADEON_PRIO_SAMPLER_TEXTURE = 16,
     RADEON_PRIO_SHADER_RW_IMAGE,
 
-    RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 32,
+    RADEON_PRIO_SAMPLER_TEXTURE_MSAA = 18,
+    RADEON_PRIO_COLOR_BUFFER,
 
-    RADEON_PRIO_COLOR_BUFFER = 36,
+    RADEON_PRIO_DEPTH_BUFFER = 20,
 
-    RADEON_PRIO_DEPTH_BUFFER = 40,
+    RADEON_PRIO_COLOR_BUFFER_MSAA = 22,
 
-    RADEON_PRIO_COLOR_BUFFER_MSAA = 44,
+    RADEON_PRIO_DEPTH_BUFFER_MSAA = 24,
 
-    RADEON_PRIO_DEPTH_BUFFER_MSAA = 48,
-
-    RADEON_PRIO_CMASK = 52,
-    RADEON_PRIO_DCC,
-    RADEON_PRIO_HTILE,
+    RADEON_PRIO_SEPARATE_META = 26,
     RADEON_PRIO_SHADER_BINARY, /* the hw can't hide instruction cache misses */
 
-    RADEON_PRIO_SHADER_RINGS = 56,
+    RADEON_PRIO_SHADER_RINGS = 28,
 
-    RADEON_PRIO_SCRATCH_BUFFER = 60,
-    /* 63 is the maximum value */
+    RADEON_PRIO_SCRATCH_BUFFER = 30,
+    /* 31 is the maximum value */
 };
 
 struct winsys_handle;
 struct radeon_winsys_ctx;
 
-struct radeon_winsys_cs_chunk {
+struct radeon_cmdbuf_chunk {
     unsigned cdw;  /* Number of used dwords. */
     unsigned max_dw; /* Maximum number of dwords. */
     uint32_t *buf; /* The base pointer of the chunk. */
 };
 
-struct radeon_winsys_cs {
-    struct radeon_winsys_cs_chunk current;
-    struct radeon_winsys_cs_chunk *prev;
+struct radeon_cmdbuf {
+    struct radeon_cmdbuf_chunk current;
+    struct radeon_cmdbuf_chunk *prev;
     unsigned                      num_prev; /* Number of previous chunks. */
     unsigned                      max_prev; /* Space in array pointed to by prev. */
     unsigned                      prev_dw; /* Total number of dwords in previous chunks. */
@@ -230,7 +224,7 @@
 struct radeon_bo_list_item {
     uint64_t bo_size;
     uint64_t vm_address;
-    uint64_t priority_usage; /* mask of (1 << RADEON_PRIO_*) */
+    uint32_t priority_usage; /* mask of (1u << RADEON_PRIO_*) */
 };
 
 struct radeon_winsys {
@@ -297,7 +291,7 @@
      * \return          The pointer at the beginning of the buffer.
      */
     void *(*buffer_map)(struct pb_buffer *buf,
-                        struct radeon_winsys_cs *cs,
+                        struct radeon_cmdbuf *cs,
                         enum pipe_transfer_usage usage);
 
     /**
@@ -460,7 +454,7 @@
      * \param flush     Flush callback function associated with the command stream.
      * \param user      User pointer that will be passed to the flush callback.
      */
-    struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
+    struct radeon_cmdbuf *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                           enum ring_type ring_type,
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
@@ -471,7 +465,7 @@
      *
      * \param cs        A command stream to destroy.
      */
-    void (*cs_destroy)(struct radeon_winsys_cs *cs);
+    void (*cs_destroy)(struct radeon_cmdbuf *cs);
 
     /**
      * Add a buffer. Each buffer used by a CS must be added using this function.
@@ -484,7 +478,7 @@
      *                  placed in the requested domain. 15 is the maximum.
      * \return Buffer index.
      */
-    unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs,
+    unsigned (*cs_add_buffer)(struct radeon_cmdbuf *cs,
                              struct pb_buffer *buf,
                              enum radeon_bo_usage usage,
                              enum radeon_bo_domain domain,
@@ -500,7 +494,7 @@
      * \param buf       Buffer
      * \return          The buffer index, or -1 if the buffer has not been added.
      */
-    int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs,
+    int (*cs_lookup_buffer)(struct radeon_cmdbuf *cs,
                             struct pb_buffer *buf);
 
     /**
@@ -511,7 +505,7 @@
      *
      * \param cs        A command stream to validate.
      */
-    bool (*cs_validate)(struct radeon_winsys_cs *cs);
+    bool (*cs_validate)(struct radeon_cmdbuf *cs);
 
     /**
      * Check whether the given number of dwords is available in the IB.
@@ -520,7 +514,7 @@
      * \param cs        A command stream.
      * \param dw        Number of CS dwords requested by the caller.
      */
-    bool (*cs_check_space)(struct radeon_winsys_cs *cs, unsigned dw);
+    bool (*cs_check_space)(struct radeon_cmdbuf *cs, unsigned dw);
 
     /**
      * Return the buffer list.
@@ -532,7 +526,7 @@
      * \param list  Returned buffer list. Set to NULL to query the count only.
      * \return      The buffer count.
      */
-    unsigned (*cs_get_buffer_list)(struct radeon_winsys_cs *cs,
+    unsigned (*cs_get_buffer_list)(struct radeon_cmdbuf *cs,
                                    struct radeon_bo_list_item *list);
 
     /**
@@ -545,7 +539,7 @@
      * \return Negative POSIX error code or 0 for success.
      *         Asynchronous submissions never return an error.
      */
-    int (*cs_flush)(struct radeon_winsys_cs *cs,
+    int (*cs_flush)(struct radeon_cmdbuf *cs,
                     unsigned flags,
                     struct pipe_fence_handle **fence);
 
@@ -556,7 +550,7 @@
      * The fence must not be used for anything except \ref cs_add_fence_dependency
      * before the flush.
      */
-    struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
+    struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_cmdbuf *cs);
 
     /**
      * Return true if a buffer is referenced by a command stream.
@@ -564,7 +558,7 @@
      * \param cs        A command stream.
      * \param buf       A winsys buffer.
      */
-    bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
+    bool (*cs_is_buffer_referenced)(struct radeon_cmdbuf *cs,
                                     struct pb_buffer *buf,
                                     enum radeon_bo_usage usage);
 
@@ -575,7 +569,7 @@
      * \param fid       Feature ID, one of RADEON_FID_*
      * \param enable    Whether to enable or disable the feature.
      */
-    bool (*cs_request_feature)(struct radeon_winsys_cs *cs,
+    bool (*cs_request_feature)(struct radeon_cmdbuf *cs,
                                enum radeon_feature_id fid,
                                bool enable);
      /**
@@ -583,19 +577,19 @@
       *
       * \param cs        A command stream.
       */
-    void (*cs_sync_flush)(struct radeon_winsys_cs *cs);
+    void (*cs_sync_flush)(struct radeon_cmdbuf *cs);
 
     /**
      * Add a fence dependency to the CS, so that the CS will wait for
      * the fence before execution.
      */
-    void (*cs_add_fence_dependency)(struct radeon_winsys_cs *cs,
+    void (*cs_add_fence_dependency)(struct radeon_cmdbuf *cs,
                                     struct pipe_fence_handle *fence);
 
     /**
      * Signal a syncobj when the CS finishes execution.
      */
-    void (*cs_add_syncobj_signal)(struct radeon_winsys_cs *cs,
+    void (*cs_add_syncobj_signal)(struct radeon_cmdbuf *cs,
 				  struct pipe_fence_handle *fence);
 
     /**
@@ -662,17 +656,17 @@
     const char* (*get_chip_name)(struct radeon_winsys *ws);
 };
 
-static inline bool radeon_emitted(struct radeon_winsys_cs *cs, unsigned num_dw)
+static inline bool radeon_emitted(struct radeon_cmdbuf *cs, unsigned num_dw)
 {
     return cs && (cs->prev_dw + cs->current.cdw > num_dw);
 }
 
-static inline void radeon_emit(struct radeon_winsys_cs *cs, uint32_t value)
+static inline void radeon_emit(struct radeon_cmdbuf *cs, uint32_t value)
 {
     cs->current.buf[cs->current.cdw++] = value;
 }
 
-static inline void radeon_emit_array(struct radeon_winsys_cs *cs,
+static inline void radeon_emit_array(struct radeon_cmdbuf *cs,
 				     const uint32_t *values, unsigned count)
 {
     memcpy(cs->current.buf + cs->current.cdw, values, count * 4);
diff --git a/src/gallium/drivers/radeonsi/.gitignore b/src/gallium/drivers/radeonsi/.gitignore
new file mode 100644
index 0000000..a30f62c
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/.gitignore
@@ -0,0 +1,2 @@
+# Generated source files
+/si_driinfo.h
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index b20a549..f760b5b 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -47,7 +47,6 @@
 	si_texture.c \
 	si_uvd.c \
 	../radeon/r600_perfcounter.c \
-	../radeon/r600_pipe_common.h \
 	../radeon/radeon_uvd.c \
 	../radeon/radeon_uvd.h \
 	../radeon/radeon_vcn_dec.c \
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index 690e7ff..595f8d4 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -33,7 +33,7 @@
 				 uint64_t src_offset,
 				 uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->dma_cs;
+	struct radeon_cmdbuf *cs = ctx->dma_cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = r600_resource(dst);
 	struct r600_resource *rsrc = r600_resource(src);
@@ -73,7 +73,7 @@
 				  uint64_t size,
 				  unsigned clear_value)
 {
-	struct radeon_winsys_cs *cs = sctx->dma_cs;
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = r600_resource(dst);
 
@@ -114,7 +114,7 @@
 }
 
 static unsigned encode_tile_info(struct si_context *sctx,
-				 struct r600_texture *tex, unsigned level,
+				 struct si_texture *tex, unsigned level,
 				 bool set_bpp)
 {
 	struct radeon_info *info = &sctx->screen->info;
@@ -144,59 +144,59 @@
 				  const struct pipe_box *src_box)
 {
 	struct radeon_info *info = &sctx->screen->info;
-	struct r600_texture *rsrc = (struct r600_texture*)src;
-	struct r600_texture *rdst = (struct r600_texture*)dst;
-	unsigned bpp = rdst->surface.bpe;
-	uint64_t dst_address = rdst->resource.gpu_address +
-			       rdst->surface.u.legacy.level[dst_level].offset;
-	uint64_t src_address = rsrc->resource.gpu_address +
-			       rsrc->surface.u.legacy.level[src_level].offset;
-	unsigned dst_mode = rdst->surface.u.legacy.level[dst_level].mode;
-	unsigned src_mode = rsrc->surface.u.legacy.level[src_level].mode;
-	unsigned dst_tile_index = rdst->surface.u.legacy.tiling_index[dst_level];
-	unsigned src_tile_index = rsrc->surface.u.legacy.tiling_index[src_level];
+	struct si_texture *ssrc = (struct si_texture*)src;
+	struct si_texture *sdst = (struct si_texture*)dst;
+	unsigned bpp = sdst->surface.bpe;
+	uint64_t dst_address = sdst->buffer.gpu_address +
+			       sdst->surface.u.legacy.level[dst_level].offset;
+	uint64_t src_address = ssrc->buffer.gpu_address +
+			       ssrc->surface.u.legacy.level[src_level].offset;
+	unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
+	unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
+	unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
+	unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
 	unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
 	unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
 	unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
 	unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
 	unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ?
-					    rdst->surface.tile_swizzle : 0;
+					    sdst->surface.tile_swizzle : 0;
 	unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ?
-					    rsrc->surface.tile_swizzle : 0;
-	unsigned dst_pitch = rdst->surface.u.legacy.level[dst_level].nblk_x;
-	unsigned src_pitch = rsrc->surface.u.legacy.level[src_level].nblk_x;
-	uint64_t dst_slice_pitch = ((uint64_t)rdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
-	uint64_t src_slice_pitch = ((uint64_t)rsrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
-	unsigned dst_width = minify_as_blocks(rdst->resource.b.b.width0,
-					      dst_level, rdst->surface.blk_w);
-	unsigned src_width = minify_as_blocks(rsrc->resource.b.b.width0,
-					      src_level, rsrc->surface.blk_w);
-	unsigned dst_height = minify_as_blocks(rdst->resource.b.b.height0,
-					       dst_level, rdst->surface.blk_h);
-	unsigned src_height = minify_as_blocks(rsrc->resource.b.b.height0,
-					       src_level, rsrc->surface.blk_h);
-	unsigned srcx = src_box->x / rsrc->surface.blk_w;
-	unsigned srcy = src_box->y / rsrc->surface.blk_h;
+					    ssrc->surface.tile_swizzle : 0;
+	unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
+	unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
+	uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
+	uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
+	unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0,
+					      dst_level, sdst->surface.blk_w);
+	unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0,
+					      src_level, ssrc->surface.blk_w);
+	unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0,
+					       dst_level, sdst->surface.blk_h);
+	unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0,
+					       src_level, ssrc->surface.blk_h);
+	unsigned srcx = src_box->x / ssrc->surface.blk_w;
+	unsigned srcy = src_box->y / ssrc->surface.blk_h;
 	unsigned srcz = src_box->z;
-	unsigned copy_width = DIV_ROUND_UP(src_box->width, rsrc->surface.blk_w);
-	unsigned copy_height = DIV_ROUND_UP(src_box->height, rsrc->surface.blk_h);
+	unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
+	unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
 	unsigned copy_depth = src_box->depth;
 
 	assert(src_level <= src->last_level);
 	assert(dst_level <= dst->last_level);
-	assert(rdst->surface.u.legacy.level[dst_level].offset +
+	assert(sdst->surface.u.legacy.level[dst_level].offset +
 	       dst_slice_pitch * bpp * (dstz + src_box->depth) <=
-	       rdst->resource.buf->size);
-	assert(rsrc->surface.u.legacy.level[src_level].offset +
+	       sdst->buffer.buf->size);
+	assert(ssrc->surface.u.legacy.level[src_level].offset +
 	       src_slice_pitch * bpp * (srcz + src_box->depth) <=
-	       rsrc->resource.buf->size);
+	       ssrc->buffer.buf->size);
 
-	if (!si_prepare_for_dma_blit(sctx, rdst, dst_level, dstx, dsty,
-				     dstz, rsrc, src_level, src_box))
+	if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
+				     dstz, ssrc, src_level, src_box))
 		return false;
 
-	dstx /= rdst->surface.blk_w;
-	dsty /= rdst->surface.blk_h;
+	dstx /= sdst->surface.blk_w;
+	dsty /= sdst->surface.blk_h;
 
 	if (srcx >= (1 << 14) ||
 	    srcy >= (1 << 14) ||
@@ -230,9 +230,9 @@
 	      sctx->family != CHIP_KAVERI) ||
 	     (srcx + copy_width != (1 << 14) &&
 	      srcy + copy_height != (1 << 14)))) {
-		struct radeon_winsys_cs *cs = sctx->dma_cs;
+		struct radeon_cmdbuf *cs = sctx->dma_cs;
 
-		si_need_dma_space(sctx, 13, &rdst->resource, &rsrc->resource);
+		si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
 
 		radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
 						CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
@@ -259,25 +259,25 @@
 
 	/* Tiled <-> linear sub-window copy. */
 	if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
-		struct r600_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? rsrc : rdst;
-		struct r600_texture *linear = tiled == rsrc ? rdst : rsrc;
-		unsigned tiled_level =	tiled	== rsrc ? src_level : dst_level;
-		unsigned linear_level =	linear	== rsrc ? src_level : dst_level;
-		unsigned tiled_x =	tiled	== rsrc ? srcx : dstx;
-		unsigned linear_x =	linear  == rsrc ? srcx : dstx;
-		unsigned tiled_y =	tiled	== rsrc ? srcy : dsty;
-		unsigned linear_y =	linear  == rsrc ? srcy : dsty;
-		unsigned tiled_z =	tiled	== rsrc ? srcz : dstz;
-		unsigned linear_z =	linear  == rsrc ? srcz : dstz;
-		unsigned tiled_width =	tiled	== rsrc ? src_width : dst_width;
-		unsigned linear_width =	linear	== rsrc ? src_width : dst_width;
-		unsigned tiled_pitch =	tiled	== rsrc ? src_pitch : dst_pitch;
-		unsigned linear_pitch =	linear	== rsrc ? src_pitch : dst_pitch;
-		unsigned tiled_slice_pitch  = tiled  == rsrc ? src_slice_pitch : dst_slice_pitch;
-		unsigned linear_slice_pitch = linear == rsrc ? src_slice_pitch : dst_slice_pitch;
-		uint64_t tiled_address =  tiled  == rsrc ? src_address : dst_address;
-		uint64_t linear_address = linear == rsrc ? src_address : dst_address;
-		unsigned tiled_micro_mode = tiled == rsrc ? src_micro_mode : dst_micro_mode;
+		struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
+		struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
+		unsigned tiled_level =	tiled	== ssrc ? src_level : dst_level;
+		unsigned linear_level =	linear	== ssrc ? src_level : dst_level;
+		unsigned tiled_x =	tiled	== ssrc ? srcx : dstx;
+		unsigned linear_x =	linear  == ssrc ? srcx : dstx;
+		unsigned tiled_y =	tiled	== ssrc ? srcy : dsty;
+		unsigned linear_y =	linear  == ssrc ? srcy : dsty;
+		unsigned tiled_z =	tiled	== ssrc ? srcz : dstz;
+		unsigned linear_z =	linear  == ssrc ? srcz : dstz;
+		unsigned tiled_width =	tiled	== ssrc ? src_width : dst_width;
+		unsigned linear_width =	linear	== ssrc ? src_width : dst_width;
+		unsigned tiled_pitch =	tiled	== ssrc ? src_pitch : dst_pitch;
+		unsigned linear_pitch =	linear	== ssrc ? src_pitch : dst_pitch;
+		unsigned tiled_slice_pitch  = tiled  == ssrc ? src_slice_pitch : dst_slice_pitch;
+		unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
+		uint64_t tiled_address =  tiled  == ssrc ? src_address : dst_address;
+		uint64_t linear_address = linear == ssrc ? src_address : dst_address;
+		unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
 
 		assert(tiled_pitch % 8 == 0);
 		assert(tiled_slice_pitch % 64 == 0);
@@ -392,10 +392,10 @@
 		    copy_width_aligned <= (1 << 14) &&
 		    copy_height <= (1 << 14) &&
 		    copy_depth <= (1 << 11)) {
-			struct radeon_winsys_cs *cs = sctx->dma_cs;
-			uint32_t direction = linear == rdst ? 1u << 31 : 0;
+			struct radeon_cmdbuf *cs = sctx->dma_cs;
+			uint32_t direction = linear == sdst ? 1u << 31 : 0;
 
-			si_need_dma_space(sctx, 14, &rdst->resource, &rsrc->resource);
+			si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
 
 			radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
 							CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
@@ -428,8 +428,8 @@
 	    /* check if these fit into the bitfields */
 	    src_address % 256 == 0 &&
 	    dst_address % 256 == 0 &&
-	    rsrc->surface.u.legacy.tile_split <= 4096 &&
-	    rdst->surface.u.legacy.tile_split <= 4096 &&
+	    ssrc->surface.u.legacy.tile_split <= 4096 &&
+	    sdst->surface.u.legacy.tile_split <= 4096 &&
 	    dstx % 8 == 0 &&
 	    dsty % 8 == 0 &&
 	    srcx % 8 == 0 &&
@@ -487,9 +487,9 @@
 		     (srcx + copy_width_aligned != (1 << 14) &&
 		      srcy + copy_height_aligned != (1 << 14) &&
 		      dstx + copy_width != (1 << 14)))) {
-			struct radeon_winsys_cs *cs = sctx->dma_cs;
+			struct radeon_cmdbuf *cs = sctx->dma_cs;
 
-			si_need_dma_space(sctx, 15, &rdst->resource, &rsrc->resource);
+			si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
 
 			radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
 							CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
@@ -498,13 +498,13 @@
 			radeon_emit(cs, srcx | (srcy << 16));
 			radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
 			radeon_emit(cs, src_slice_tile_max);
-			radeon_emit(cs, encode_tile_info(sctx, rsrc, src_level, true));
+			radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
 			radeon_emit(cs, dst_address);
 			radeon_emit(cs, dst_address >> 32);
 			radeon_emit(cs, dstx | (dsty << 16));
 			radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
 			radeon_emit(cs, dst_slice_tile_max);
-			radeon_emit(cs, encode_tile_info(sctx, rdst, dst_level, false));
+			radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
 			if (sctx->chip_class == CIK) {
 				radeon_emit(cs, copy_width_aligned |
 						(copy_height_aligned << 16));
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index f817d591..9049839 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -63,7 +63,6 @@
   'si_texture.c',
   'si_uvd.c',
   '../radeon/r600_perfcounter.c',
-  '../radeon/r600_pipe_common.h',
   '../radeon/radeon_uvd.c',
   '../radeon/radeon_uvd.h',
   '../radeon/radeon_vcn_enc_1_2.c',
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 7581d5b..0fd69f3 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -57,7 +57,7 @@
 		util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
 		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
 		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask);
 		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
 	}
 
@@ -75,17 +75,27 @@
 
 	if (op & SI_DISABLE_RENDER_COND)
 		sctx->render_cond_force_off = true;
+
+	if (sctx->screen->dpbb_allowed) {
+		sctx->dpbb_force_off = true;
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+	}
 }
 
 void si_blitter_end(struct si_context *sctx)
 {
+	if (sctx->screen->dpbb_allowed) {
+		sctx->dpbb_force_off = false;
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
+	}
+
 	sctx->render_cond_force_off = false;
 
 	/* Restore shader pointers because the VS blit shader changed all
 	 * non-global VS user SGPRs. */
 	sctx->shader_pointers_dirty |= SI_DESCS_SHADER_MASK(VERTEX);
 	sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
-	si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
@@ -95,8 +105,8 @@
 
 static unsigned
 si_blit_dbcb_copy(struct si_context *sctx,
-		  struct r600_texture *src,
-		  struct r600_texture *dst,
+		  struct si_texture *src,
+		  struct si_texture *dst,
 		  unsigned planes, unsigned level_mask,
 		  unsigned first_layer, unsigned last_layer,
 		  unsigned first_sample, unsigned last_sample)
@@ -109,7 +119,7 @@
 		sctx->dbcb_depth_copy_enabled = true;
 	if (planes & PIPE_MASK_S)
 		sctx->dbcb_stencil_copy_enabled = true;
-	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 	assert(sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled);
 
@@ -120,7 +130,7 @@
 
 		/* The smaller the mipmap level, the less layers there are
 		 * as far as 3D textures are concerned. */
-		max_layer = util_max_layer(&src->resource.b.b, level);
+		max_layer = util_max_layer(&src->buffer.b.b, level);
 		checked_last_layer = MIN2(last_layer, max_layer);
 
 		surf_tmpl.u.tex.level = level;
@@ -128,19 +138,19 @@
 		for (layer = first_layer; layer <= checked_last_layer; layer++) {
 			struct pipe_surface *zsurf, *cbsurf;
 
-			surf_tmpl.format = src->resource.b.b.format;
+			surf_tmpl.format = src->buffer.b.b.format;
 			surf_tmpl.u.tex.first_layer = layer;
 			surf_tmpl.u.tex.last_layer = layer;
 
-			zsurf = sctx->b.create_surface(&sctx->b, &src->resource.b.b, &surf_tmpl);
+			zsurf = sctx->b.create_surface(&sctx->b, &src->buffer.b.b, &surf_tmpl);
 
-			surf_tmpl.format = dst->resource.b.b.format;
-			cbsurf = sctx->b.create_surface(&sctx->b, &dst->resource.b.b, &surf_tmpl);
+			surf_tmpl.format = dst->buffer.b.b.format;
+			cbsurf = sctx->b.create_surface(&sctx->b, &dst->buffer.b.b, &surf_tmpl);
 
 			for (sample = first_sample; sample <= last_sample; sample++) {
 				if (sample != sctx->dbcb_copy_sample) {
 					sctx->dbcb_copy_sample = sample;
-					si_mark_atom_dirty(sctx, &sctx->db_render_state);
+					si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 				}
 
 				si_blitter_begin(sctx, SI_DECOMPRESS);
@@ -154,21 +164,21 @@
 		}
 
 		if (first_layer == 0 && last_layer >= max_layer &&
-		    first_sample == 0 && last_sample >= u_max_sample(&src->resource.b.b))
+		    first_sample == 0 && last_sample >= u_max_sample(&src->buffer.b.b))
 			fully_copied_levels |= 1u << level;
 	}
 
 	sctx->decompression_enabled = false;
 	sctx->dbcb_depth_copy_enabled = false;
 	sctx->dbcb_stencil_copy_enabled = false;
-	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 	return fully_copied_levels;
 }
 
 void si_blit_decompress_depth(struct pipe_context *ctx,
-			      struct r600_texture *texture,
-			      struct r600_texture *staging,
+			      struct si_texture *texture,
+			      struct si_texture *staging,
 			      unsigned first_level, unsigned last_level,
 			      unsigned first_layer, unsigned last_layer,
 			      unsigned first_sample, unsigned last_sample)
@@ -178,7 +188,7 @@
 
 	assert(staging != NULL && "use si_blit_decompress_zs_in_place instead");
 
-	desc = util_format_description(staging->resource.b.b.format);
+	desc = util_format_description(staging->buffer.b.b.format);
 
 	if (util_format_has_depth(desc))
 		planes |= PIPE_MASK_Z;
@@ -195,7 +205,7 @@
  */
 static void
 si_blit_decompress_zs_planes_in_place(struct si_context *sctx,
-				      struct r600_texture *texture,
+				      struct si_texture *texture,
 				      unsigned planes, unsigned level_mask,
 				      unsigned first_layer, unsigned last_layer)
 {
@@ -210,9 +220,9 @@
 		sctx->db_flush_stencil_inplace = true;
 	if (planes & PIPE_MASK_Z)
 		sctx->db_flush_depth_inplace = true;
-	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
-	surf_tmpl.format = texture->resource.b.b.format;
+	surf_tmpl.format = texture->buffer.b.b.format;
 
 	sctx->decompression_enabled = true;
 
@@ -223,14 +233,14 @@
 
 		/* The smaller the mipmap level, the less layers there are
 		 * as far as 3D textures are concerned. */
-		max_layer = util_max_layer(&texture->resource.b.b, level);
+		max_layer = util_max_layer(&texture->buffer.b.b, level);
 		checked_last_layer = MIN2(last_layer, max_layer);
 
 		for (layer = first_layer; layer <= checked_last_layer; layer++) {
 			surf_tmpl.u.tex.first_layer = layer;
 			surf_tmpl.u.tex.last_layer = layer;
 
-			zsurf = sctx->b.create_surface(&sctx->b, &texture->resource.b.b, &surf_tmpl);
+			zsurf = sctx->b.create_surface(&sctx->b, &texture->buffer.b.b, &surf_tmpl);
 
 			si_blitter_begin(sctx, SI_DECOMPRESS);
 			util_blitter_custom_depth_stencil(sctx->blitter, zsurf, NULL, ~0,
@@ -256,7 +266,7 @@
 	sctx->decompression_enabled = false;
 	sctx->db_flush_depth_inplace = false;
 	sctx->db_flush_stencil_inplace = false;
-	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 }
 
 /* Helper function of si_flush_depth_texture: decompress the given levels
@@ -264,7 +274,7 @@
  */
 static void
 si_blit_decompress_zs_in_place(struct si_context *sctx,
-			       struct r600_texture *texture,
+			       struct si_texture *texture,
 			       unsigned levels_z, unsigned levels_s,
 			       unsigned first_layer, unsigned last_layer)
 {
@@ -298,7 +308,7 @@
 
 static void
 si_decompress_depth(struct si_context *sctx,
-		    struct r600_texture *tex,
+		    struct si_texture *tex,
 		    unsigned required_planes,
 		    unsigned first_level, unsigned last_level,
 		    unsigned first_layer, unsigned last_layer)
@@ -341,14 +351,14 @@
 	 */
 	if (copy_planes &&
 	    (tex->flushed_depth_texture ||
-	     si_init_flushed_depth_texture(&sctx->b, &tex->resource.b.b, NULL))) {
-		struct r600_texture *dst = tex->flushed_depth_texture;
+	     si_init_flushed_depth_texture(&sctx->b, &tex->buffer.b.b, NULL))) {
+		struct si_texture *dst = tex->flushed_depth_texture;
 		unsigned fully_copied_levels;
 		unsigned levels = 0;
 
 		assert(tex->flushed_depth_texture);
 
-		if (util_format_is_depth_and_stencil(dst->resource.b.b.format))
+		if (util_format_is_depth_and_stencil(dst->buffer.b.b.format))
 			copy_planes = PIPE_MASK_Z | PIPE_MASK_S;
 
 		if (copy_planes & PIPE_MASK_Z) {
@@ -363,7 +373,7 @@
 		fully_copied_levels = si_blit_dbcb_copy(
 			sctx, tex, dst, copy_planes, levels,
 			first_layer, last_layer,
-			0, u_max_sample(&tex->resource.b.b));
+			0, u_max_sample(&tex->buffer.b.b));
 
 		if (copy_planes & PIPE_MASK_Z)
 			tex->dirty_level_mask &= ~fully_copied_levels;
@@ -398,15 +408,15 @@
 		/* Only in-place decompression needs to flush DB caches, or
 		 * when we don't decompress but TC-compatible planes are dirty.
 		 */
-		si_make_DB_shader_coherent(sctx, tex->resource.b.b.nr_samples,
+		si_make_DB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
 					   inplace_planes & PIPE_MASK_S,
 					   tc_compat_htile);
 	}
 	/* set_framebuffer_state takes care of coherency for single-sample.
 	 * The DB->CB copy uses CB for the final writes.
 	 */
-	if (copy_planes && tex->resource.b.b.nr_samples > 1)
-		si_make_CB_shader_coherent(sctx, tex->resource.b.b.nr_samples,
+	if (copy_planes && tex->buffer.b.b.nr_samples > 1)
+		si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
 					   false);
 }
 
@@ -420,7 +430,7 @@
 	while (mask) {
 		struct pipe_sampler_view *view;
 		struct si_sampler_view *sview;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		i = u_bit_scan(&mask);
 
@@ -428,21 +438,21 @@
 		assert(view);
 		sview = (struct si_sampler_view*)view;
 
-		tex = (struct r600_texture *)view->texture;
+		tex = (struct si_texture *)view->texture;
 		assert(tex->db_compatible);
 
 		si_decompress_depth(sctx, tex,
 				    sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
 				    view->u.tex.first_level, view->u.tex.last_level,
-				    0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level));
+				    0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
 	}
 }
 
 static void si_blit_decompress_color(struct si_context *sctx,
-		struct r600_texture *rtex,
-		unsigned first_level, unsigned last_level,
-		unsigned first_layer, unsigned last_layer,
-		bool need_dcc_decompress)
+				     struct si_texture *tex,
+				     unsigned first_level, unsigned last_level,
+				     unsigned first_layer, unsigned last_layer,
+				     bool need_dcc_decompress)
 {
 	void* custom_blend;
 	unsigned layer, checked_last_layer, max_layer;
@@ -450,7 +460,7 @@
 		u_bit_consecutive(first_level, last_level - first_level + 1);
 
 	if (!need_dcc_decompress)
-		level_mask &= rtex->dirty_level_mask;
+		level_mask &= tex->dirty_level_mask;
 	if (!level_mask)
 		return;
 
@@ -463,14 +473,14 @@
 	if (need_dcc_decompress) {
 		custom_blend = sctx->custom_blend_dcc_decompress;
 
-		assert(rtex->dcc_offset);
+		assert(tex->dcc_offset);
 
 		/* disable levels without DCC */
 		for (int i = first_level; i <= last_level; i++) {
-			if (!vi_dcc_enabled(rtex, i))
+			if (!vi_dcc_enabled(tex, i))
 				level_mask &= ~(1 << i);
 		}
-	} else if (rtex->fmask.size) {
+	} else if (tex->surface.fmask_size) {
 		custom_blend = sctx->custom_blend_fmask_decompress;
 	} else {
 		custom_blend = sctx->custom_blend_eliminate_fastclear;
@@ -483,17 +493,17 @@
 
 		/* The smaller the mipmap level, the less layers there are
 		 * as far as 3D textures are concerned. */
-		max_layer = util_max_layer(&rtex->resource.b.b, level);
+		max_layer = util_max_layer(&tex->buffer.b.b, level);
 		checked_last_layer = MIN2(last_layer, max_layer);
 
 		for (layer = first_layer; layer <= checked_last_layer; layer++) {
 			struct pipe_surface *cbsurf, surf_tmpl;
 
-			surf_tmpl.format = rtex->resource.b.b.format;
+			surf_tmpl.format = tex->buffer.b.b.format;
 			surf_tmpl.u.tex.level = level;
 			surf_tmpl.u.tex.first_layer = layer;
 			surf_tmpl.u.tex.last_layer = layer;
-			cbsurf = sctx->b.create_surface(&sctx->b, &rtex->resource.b.b, &surf_tmpl);
+			cbsurf = sctx->b.create_surface(&sctx->b, &tex->buffer.b.b, &surf_tmpl);
 
 			/* Required before and after FMASK and DCC_DECOMPRESS. */
 			if (custom_blend == sctx->custom_blend_fmask_decompress ||
@@ -514,25 +524,25 @@
 		/* The texture will always be dirty if some layers aren't flushed.
 		 * I don't think this case occurs often though. */
 		if (first_layer == 0 && last_layer >= max_layer) {
-			rtex->dirty_level_mask &= ~(1 << level);
+			tex->dirty_level_mask &= ~(1 << level);
 		}
 	}
 
 	sctx->decompression_enabled = false;
-	si_make_CB_shader_coherent(sctx, rtex->resource.b.b.nr_samples,
-				   vi_dcc_enabled(rtex, first_level));
+	si_make_CB_shader_coherent(sctx, tex->buffer.b.b.nr_samples,
+				   vi_dcc_enabled(tex, first_level));
 }
 
 static void
-si_decompress_color_texture(struct si_context *sctx, struct r600_texture *tex,
+si_decompress_color_texture(struct si_context *sctx, struct si_texture *tex,
 			    unsigned first_level, unsigned last_level)
 {
 	/* CMASK or DCC can be discarded and we can still end up here. */
-	if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
+	if (!tex->cmask_buffer && !tex->surface.fmask_size && !tex->dcc_offset)
 		return;
 
 	si_blit_decompress_color(sctx, tex, first_level, last_level, 0,
-				 util_max_layer(&tex->resource.b.b, first_level),
+				 util_max_layer(&tex->buffer.b.b, first_level),
 				 false);
 }
 
@@ -545,14 +555,14 @@
 
 	while (mask) {
 		struct pipe_sampler_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		i = u_bit_scan(&mask);
 
 		view = textures->views[i];
 		assert(view);
 
-		tex = (struct r600_texture *)view->texture;
+		tex = (struct si_texture *)view->texture;
 
 		si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
 					    view->u.tex.last_level);
@@ -568,14 +578,14 @@
 
 	while (mask) {
 		const struct pipe_image_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		i = u_bit_scan(&mask);
 
 		view = &images->views[i];
 		assert(view->resource->target != PIPE_BUFFER);
 
-		tex = (struct r600_texture *)view->resource;
+		tex = (struct si_texture *)view->resource;
 
 		si_decompress_color_texture(sctx, tex, view->u.tex.level,
 					    view->u.tex.level);
@@ -583,7 +593,7 @@
 }
 
 static void si_check_render_feedback_texture(struct si_context *sctx,
-					     struct r600_texture *tex,
+					     struct si_texture *tex,
 					     unsigned first_level,
 					     unsigned last_level,
 					     unsigned first_layer,
@@ -595,14 +605,14 @@
 		return;
 
 	for (unsigned j = 0; j < sctx->framebuffer.state.nr_cbufs; ++j) {
-		struct r600_surface * surf;
+		struct si_surface * surf;
 
 		if (!sctx->framebuffer.state.cbufs[j])
 			continue;
 
-		surf = (struct r600_surface*)sctx->framebuffer.state.cbufs[j];
+		surf = (struct si_surface*)sctx->framebuffer.state.cbufs[j];
 
-		if (tex == (struct r600_texture *)surf->base.texture &&
+		if (tex == (struct si_texture *)surf->base.texture &&
 		    surf->base.u.tex.level >= first_level &&
 		    surf->base.u.tex.level <= last_level &&
 		    surf->base.u.tex.first_layer <= last_layer &&
@@ -623,7 +633,7 @@
 
 	while (mask) {
 		const struct pipe_sampler_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		unsigned i = u_bit_scan(&mask);
 
@@ -631,7 +641,7 @@
 		if(view->texture->target == PIPE_BUFFER)
 			continue;
 
-		tex = (struct r600_texture *)view->texture;
+		tex = (struct si_texture *)view->texture;
 
 		si_check_render_feedback_texture(sctx, tex,
 						 view->u.tex.first_level,
@@ -648,7 +658,7 @@
 
 	while (mask) {
 		const struct pipe_image_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		unsigned i = u_bit_scan(&mask);
 
@@ -656,7 +666,7 @@
 		if (view->resource->target == PIPE_BUFFER)
 			continue;
 
-		tex = (struct r600_texture *)view->resource;
+		tex = (struct si_texture *)view->resource;
 
 		si_check_render_feedback_texture(sctx, tex,
 						 view->u.tex.level,
@@ -671,13 +681,13 @@
 	util_dynarray_foreach(&sctx->resident_tex_handles,
 			      struct si_texture_handle *, tex_handle) {
 		struct pipe_sampler_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		view = (*tex_handle)->view;
 		if (view->texture->target == PIPE_BUFFER)
 			continue;
 
-		tex = (struct r600_texture *)view->texture;
+		tex = (struct si_texture *)view->texture;
 
 		si_check_render_feedback_texture(sctx, tex,
 						 view->u.tex.first_level,
@@ -692,13 +702,13 @@
 	util_dynarray_foreach(&sctx->resident_img_handles,
 			      struct si_image_handle *, img_handle) {
 		struct pipe_image_view *view;
-		struct r600_texture *tex;
+		struct si_texture *tex;
 
 		view = &(*img_handle)->view;
 		if (view->resource->target == PIPE_BUFFER)
 			continue;
 
-		tex = (struct r600_texture *)view->resource;
+		tex = (struct si_texture *)view->resource;
 
 		si_check_render_feedback_texture(sctx, tex,
 						 view->u.tex.level,
@@ -735,7 +745,7 @@
 	util_dynarray_foreach(&sctx->resident_tex_needs_color_decompress,
 			      struct si_texture_handle *, tex_handle) {
 		struct pipe_sampler_view *view = (*tex_handle)->view;
-		struct r600_texture *tex = (struct r600_texture *)view->texture;
+		struct si_texture *tex = (struct si_texture *)view->texture;
 
 		si_decompress_color_texture(sctx, tex, view->u.tex.first_level,
 					    view->u.tex.last_level);
@@ -745,12 +755,12 @@
 			      struct si_texture_handle *, tex_handle) {
 		struct pipe_sampler_view *view = (*tex_handle)->view;
 		struct si_sampler_view *sview = (struct si_sampler_view *)view;
-		struct r600_texture *tex = (struct r600_texture *)view->texture;
+		struct si_texture *tex = (struct si_texture *)view->texture;
 
 		si_decompress_depth(sctx, tex,
 			sview->is_stencil_sampler ? PIPE_MASK_S : PIPE_MASK_Z,
 			view->u.tex.first_level, view->u.tex.last_level,
-			0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level));
+			0, util_max_layer(&tex->buffer.b.b, view->u.tex.first_level));
 	}
 }
 
@@ -759,7 +769,7 @@
 	util_dynarray_foreach(&sctx->resident_img_needs_color_decompress,
 			      struct si_image_handle *, img_handle) {
 		struct pipe_image_view *view = &(*img_handle)->view;
-		struct r600_texture *tex = (struct r600_texture *)view->resource;
+		struct si_texture *tex = (struct si_texture *)view->resource;
 
 		si_decompress_color_texture(sctx, tex, view->u.tex.level,
 					    view->u.tex.level);
@@ -805,7 +815,7 @@
 		if (sctx->ps_uses_fbfetch) {
 			struct pipe_surface *cb0 = sctx->framebuffer.state.cbufs[0];
 			si_decompress_color_texture(sctx,
-						    (struct r600_texture*)cb0->texture,
+						    (struct si_texture*)cb0->texture,
 						    cb0->u.tex.first_layer,
 						    cb0->u.tex.last_layer);
 		}
@@ -829,12 +839,12 @@
 				      unsigned first_layer, unsigned last_layer)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct r600_texture *rtex = (struct r600_texture*)tex;
+	struct si_texture *stex = (struct si_texture*)tex;
 
-	if (rtex->db_compatible) {
+	if (stex->db_compatible) {
 		planes &= PIPE_MASK_Z | PIPE_MASK_S;
 
-		if (!rtex->surface.has_stencil)
+		if (!stex->surface.has_stencil)
 			planes &= ~PIPE_MASK_S;
 
 		/* If we've rendered into the framebuffer and it's a blitting
@@ -846,10 +856,10 @@
 		    sctx->framebuffer.state.zsbuf->texture == tex)
 			si_update_fb_dirtiness_after_rendering(sctx);
 
-		si_decompress_depth(sctx, rtex, planes,
+		si_decompress_depth(sctx, stex, planes,
 				    level, level,
 				    first_layer, last_layer);
-	} else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_offset) {
+	} else if (stex->surface.fmask_size || stex->cmask_buffer || stex->dcc_offset) {
 		/* If we've rendered into the framebuffer and it's a blitting
 		 * source, make sure the decompression pass is invoked
 		 * by dirtying the framebuffer.
@@ -863,7 +873,7 @@
 			}
 		}
 
-		si_blit_decompress_color(sctx, rtex, level, level,
+		si_blit_decompress_color(sctx, stex, level, level,
 					 first_layer, last_layer, false);
 	}
 }
@@ -887,7 +897,7 @@
 			     const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct si_texture *ssrc = (struct si_texture*)src;
 	struct pipe_surface *dst_view, dst_templ;
 	struct pipe_sampler_view src_templ, *src_view;
 	unsigned dst_width, dst_height, src_width0, src_height0;
@@ -919,7 +929,7 @@
 
 	if (util_format_is_compressed(src->format) ||
 	    util_format_is_compressed(dst->format)) {
-		unsigned blocksize = rsrc->surface.bpe;
+		unsigned blocksize = ssrc->surface.bpe;
 
 		if (blocksize == 8)
 			src_templ.format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
@@ -962,7 +972,7 @@
 			sbox.width = util_format_get_nblocksx(src->format, src_box->width);
 			src_box = &sbox;
 		} else {
-			unsigned blocksize = rsrc->surface.bpe;
+			unsigned blocksize = ssrc->surface.bpe;
 
 			switch (blocksize) {
 			case 1:
@@ -1085,9 +1095,9 @@
 				     const struct pipe_blit_info *info)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_texture *src = (struct r600_texture*)info->src.resource;
-	struct r600_texture *dst = (struct r600_texture*)info->dst.resource;
-	MAYBE_UNUSED struct r600_texture *rtmp;
+	struct si_texture *src = (struct si_texture*)info->src.resource;
+	struct si_texture *dst = (struct si_texture*)info->dst.resource;
+	MAYBE_UNUSED struct si_texture *stmp;
 	unsigned dst_width = u_minify(info->dst.resource->width0, info->dst.level);
 	unsigned dst_height = u_minify(info->dst.resource->height0, info->dst.level);
 	enum pipe_format format = info->src.format;
@@ -1129,7 +1139,7 @@
 	    info->src.box.height == dst_height &&
 	    info->src.box.depth == 1 &&
 	    !dst->surface.is_linear &&
-	    (!dst->cmask.size || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
+	    (!dst->cmask_buffer || !dst->dirty_level_mask)) { /* dst cannot be fast-cleared */
 		/* Check the last constraint. */
 		if (src->surface.micro_tile_mode != dst->surface.micro_tile_mode) {
 			/* The next fast clear will switch to this mode to
@@ -1151,6 +1161,11 @@
 			    info->dst.resource->last_level != 0)
 				goto resolve_to_temp;
 
+			/* This can happen with mipmapping. */
+			if (sctx->chip_class == VI &&
+			    !dst->surface.u.legacy.level[info->dst.level].dcc_fast_clear_size)
+				goto resolve_to_temp;
+
 			vi_dcc_clear_level(sctx, dst, info->dst.level,
 					   0xFFFFFFFF);
 			dst->dirty_level_mask &= ~(1 << info->dst.level);
@@ -1186,10 +1201,10 @@
 	tmp = ctx->screen->resource_create(ctx->screen, &templ);
 	if (!tmp)
 		return false;
-	rtmp = (struct r600_texture*)tmp;
+	stmp = (struct si_texture*)tmp;
 
-	assert(!rtmp->surface.is_linear);
-	assert(src->surface.micro_tile_mode == rtmp->surface.micro_tile_mode);
+	assert(!stmp->surface.is_linear);
+	assert(src->surface.micro_tile_mode == stmp->surface.micro_tile_mode);
 
 	/* resolve */
 	si_do_CB_resolve(sctx, info, tmp, 0, 0, format);
@@ -1212,7 +1227,7 @@
 		    const struct pipe_blit_info *info)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_texture *rdst = (struct r600_texture *)info->dst.resource;
+	struct si_texture *dst = (struct si_texture *)info->dst.resource;
 
 	if (do_hardware_msaa_resolve(ctx, info)) {
 		return;
@@ -1224,7 +1239,7 @@
 	 * resource_copy_region can't do this yet, because dma_copy calls it
 	 * on failure (recursion).
 	 */
-	if (rdst->surface.is_linear &&
+	if (dst->surface.is_linear &&
 	    sctx->dma_copy &&
 	    util_can_blit_via_copy_region(info, false)) {
 		sctx->dma_copy(ctx, info->dst.resource, info->dst.level,
@@ -1267,7 +1282,7 @@
 				  unsigned first_layer, unsigned last_layer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_texture *rtex = (struct r600_texture *)tex;
+	struct si_texture *stex = (struct si_texture *)tex;
 
 	if (!util_blitter_is_copy_supported(sctx->blitter, tex, tex))
 		return false;
@@ -1281,10 +1296,10 @@
 
 	/* Clear dirty_level_mask for the levels that will be overwritten. */
 	assert(base_level < last_level);
-	rtex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
+	stex->dirty_level_mask &= ~u_bit_consecutive(base_level + 1,
 						     last_level - base_level);
 
-	sctx->generate_mipmap_for_depth = rtex->is_depth;
+	sctx->generate_mipmap_for_depth = stex->is_depth;
 
 	si_blitter_begin(sctx, SI_BLIT | SI_DISABLE_RENDER_COND);
 	util_blitter_generate_mipmap(sctx->blitter, tex, format,
@@ -1300,25 +1315,25 @@
 			      struct pipe_resource *res)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_texture *rtex = (struct r600_texture*)res;
+	struct si_texture *tex = (struct si_texture*)res;
 
 	assert(res->target != PIPE_BUFFER);
-	assert(!rtex->dcc_separate_buffer || rtex->dcc_gather_statistics);
+	assert(!tex->dcc_separate_buffer || tex->dcc_gather_statistics);
 
 	/* st/dri calls flush twice per frame (not a bug), this prevents double
 	 * decompression. */
-	if (rtex->dcc_separate_buffer && !rtex->separate_dcc_dirty)
+	if (tex->dcc_separate_buffer && !tex->separate_dcc_dirty)
 		return;
 
-	if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) {
-		si_blit_decompress_color(sctx, rtex, 0, res->last_level,
+	if (!tex->is_depth && (tex->cmask_buffer || tex->dcc_offset)) {
+		si_blit_decompress_color(sctx, tex, 0, res->last_level,
 					 0, util_max_layer(res, 0),
-					 rtex->dcc_separate_buffer != NULL);
+					 tex->dcc_separate_buffer != NULL);
 	}
 
 	/* Always do the analysis even if DCC is disabled at the moment. */
-	if (rtex->dcc_gather_statistics) {
-		bool separate_dcc_dirty = rtex->separate_dcc_dirty;
+	if (tex->dcc_gather_statistics) {
+		bool separate_dcc_dirty = tex->separate_dcc_dirty;
 
 		/* If the color buffer hasn't been unbound and fast clear hasn't
 		 * been used, separate_dcc_dirty is false, but there may have been
@@ -1341,19 +1356,19 @@
 		}
 
 		if (separate_dcc_dirty) {
-			rtex->separate_dcc_dirty = false;
-			vi_separate_dcc_process_and_reset_stats(ctx, rtex);
+			tex->separate_dcc_dirty = false;
+			vi_separate_dcc_process_and_reset_stats(ctx, tex);
 		}
 	}
 }
 
-void si_decompress_dcc(struct si_context *sctx, struct r600_texture *rtex)
+void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex)
 {
-	if (!rtex->dcc_offset)
+	if (!tex->dcc_offset)
 		return;
 
-	si_blit_decompress_color(sctx, rtex, 0, rtex->resource.b.b.last_level,
-				 0, util_max_layer(&rtex->resource.b.b, 0),
+	si_blit_decompress_color(sctx, tex, 0, tex->buffer.b.b.last_level,
+				 0, util_max_layer(&tex->buffer.b.b, 0),
 				 true);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index a22d7f7..a03a944 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -25,6 +25,7 @@
 #include "radeonsi/si_pipe.h"
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
+#include "util/u_transfer.h"
 #include <inttypes.h>
 #include <stdio.h>
 
@@ -103,7 +104,7 @@
 			     struct r600_resource *res,
 			     uint64_t size, unsigned alignment)
 {
-	struct r600_texture *rtex = (struct r600_texture*)res;
+	struct si_texture *tex = (struct si_texture*)res;
 
 	res->bo_size = size;
 	res->bo_alignment = alignment;
@@ -141,8 +142,7 @@
 	}
 
 	if (res->b.b.target == PIPE_BUFFER &&
-	    res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
-			      PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
+	    res->b.b.flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) {
 		/* Use GTT for all persistent mappings with older
 		 * kernels, because they didn't always flush the HDP
 		 * cache before CS execution.
@@ -160,7 +160,7 @@
 	}
 
 	/* Tiled textures are unmappable. Always put them in VRAM. */
-	if ((res->b.b.target != PIPE_BUFFER && !rtex->surface.is_linear) ||
+	if ((res->b.b.target != PIPE_BUFFER && !tex->surface.is_linear) ||
 	    res->b.b.flags & SI_RESOURCE_FLAG_UNMAPPABLE) {
 		res->domains = RADEON_DOMAIN_VRAM;
 		res->flags |= RADEON_FLAG_NO_CPU_ACCESS |
@@ -341,7 +341,7 @@
 				    unsigned offset)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_transfer *transfer;
+	struct si_transfer *transfer;
 
 	if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
 		transfer = slab_alloc(&sctx->pool_transfers_unsync);
@@ -479,9 +479,9 @@
 		struct r600_resource *staging;
 
 		assert(!(usage & TC_TRANSFER_MAP_THREADED_UNSYNC));
-		staging = (struct r600_resource*) pipe_buffer_create(
+		staging = r600_resource(pipe_buffer_create(
 				ctx->screen, 0, PIPE_USAGE_STAGING,
-				box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT));
+				box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT)));
 		if (staging) {
 			/* Copy the VRAM buffer to the staging buffer. */
 			sctx->dma_copy(ctx, &staging->b.b, 0,
@@ -517,17 +517,17 @@
 				      struct pipe_transfer *transfer,
 				      const struct pipe_box *box)
 {
-	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+	struct si_transfer *stransfer = (struct si_transfer*)transfer;
 	struct r600_resource *rbuffer = r600_resource(transfer->resource);
 
-	if (rtransfer->staging) {
+	if (stransfer->staging) {
 		struct pipe_resource *dst, *src;
 		unsigned soffset;
 		struct pipe_box dma_box;
 
 		dst = transfer->resource;
-		src = &rtransfer->staging->b.b;
-		soffset = rtransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT;
+		src = &stransfer->staging->b.b;
+		soffset = stransfer->offset + box->x % SI_MAP_BUFFER_ALIGNMENT;
 
 		u_box_1d(soffset, box->width, &dma_box);
 
@@ -558,14 +558,14 @@
 				     struct pipe_transfer *transfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+	struct si_transfer *stransfer = (struct si_transfer*)transfer;
 
 	if (transfer->usage & PIPE_TRANSFER_WRITE &&
 	    !(transfer->usage & PIPE_TRANSFER_FLUSH_EXPLICIT))
 		si_buffer_do_flush_region(ctx, transfer, &transfer->box);
 
-	r600_resource_reference(&rtransfer->staging, NULL);
-	assert(rtransfer->b.staging == NULL); /* for threaded context only */
+	r600_resource_reference(&stransfer->staging, NULL);
+	assert(stransfer->b.staging == NULL); /* for threaded context only */
 	pipe_resource_reference(&transfer->resource, NULL);
 
 	/* Don't use pool_transfers_unsync. We are always in the driver
@@ -649,11 +649,9 @@
 	return &rbuffer->b.b;
 }
 
-struct pipe_resource *si_aligned_buffer_create(struct pipe_screen *screen,
-					       unsigned flags,
-					       unsigned usage,
-					       unsigned size,
-					       unsigned alignment)
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
+						 unsigned flags, unsigned usage,
+						 unsigned size, unsigned alignment)
 {
 	struct pipe_resource buffer;
 
@@ -670,6 +668,14 @@
 	return si_buffer_create(screen, &buffer, alignment);
 }
 
+struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+					       unsigned flags, unsigned usage,
+					       unsigned size, unsigned alignment)
+{
+	return r600_resource(pipe_aligned_buffer_create(screen, flags, usage,
+							size, alignment));
+}
+
 static struct pipe_resource *
 si_buffer_from_user_memory(struct pipe_screen *screen,
 			   const struct pipe_resource *templ,
diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
index 22f5558..f10929a 100644
--- a/src/gallium/drivers/radeonsi/si_build_pm4.h
+++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
@@ -32,7 +32,7 @@
 #include "si_pipe.h"
 #include "sid.h"
 
-static inline void radeon_set_config_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_config_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg < SI_CONTEXT_REG_OFFSET);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -40,13 +40,13 @@
 	radeon_emit(cs, (reg - SI_CONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_config_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_config_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_config_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_context_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_CONTEXT_REG_OFFSET);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -54,13 +54,13 @@
 	radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_context_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_context_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_context_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_context_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_context_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
@@ -71,7 +71,7 @@
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_sh_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_sh_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= SI_SH_REG_OFFSET && reg < SI_SH_REG_END);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -79,13 +79,13 @@
 	radeon_emit(cs, (reg - SI_SH_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_sh_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_sh_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_sh_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_seq(struct radeon_winsys_cs *cs, unsigned reg, unsigned num)
+static inline void radeon_set_uconfig_reg_seq(struct radeon_cmdbuf *cs, unsigned reg, unsigned num)
 {
 	assert(reg >= CIK_UCONFIG_REG_OFFSET && reg < CIK_UCONFIG_REG_END);
 	assert(cs->current.cdw + 2 + num <= cs->current.max_dw);
@@ -93,13 +93,13 @@
 	radeon_emit(cs, (reg - CIK_UCONFIG_REG_OFFSET) >> 2);
 }
 
-static inline void radeon_set_uconfig_reg(struct radeon_winsys_cs *cs, unsigned reg, unsigned value)
+static inline void radeon_set_uconfig_reg(struct radeon_cmdbuf *cs, unsigned reg, unsigned value)
 {
 	radeon_set_uconfig_reg_seq(cs, reg, 1);
 	radeon_emit(cs, value);
 }
 
-static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs,
+static inline void radeon_set_uconfig_reg_idx(struct radeon_cmdbuf *cs,
 					      unsigned reg, unsigned idx,
 					      unsigned value)
 {
@@ -110,4 +110,130 @@
 	radeon_emit(cs, value);
 }
 
+/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
+static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
+					      enum si_tracked_reg reg, unsigned value)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
+	    sctx->tracked_regs.reg_value[reg] != value ) {
+
+		radeon_set_context_reg(cs, offset, value);
+
+		sctx->tracked_regs.reg_saved |= 1 << reg;
+		sctx->tracked_regs.reg_value[reg] = value;
+	}
+}
+
+/**
+ * Set 2 consecutive registers if any register's value differs from its tracked value.
+ * @param offset        starting register offset
+ * @param value1        is written to first register
+ * @param value2        is written to second register
+ */
+static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) ||
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2 ) {
+
+		radeon_set_context_reg_seq(cs, offset, 2);
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_saved |= 3 << reg;
+	}
+}
+
+/**
+ * Set 3 consecutive registers if any register's value differs from its tracked value.
+ */
+static inline void radeon_opt_set_context_reg3(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2, unsigned value3)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 2))) ||
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2 ||
+	    sctx->tracked_regs.reg_value[reg+2] != value3 ) {
+
+		radeon_set_context_reg_seq(cs, offset, 3);
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+		radeon_emit(cs, value3);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_value[reg+2] = value3;
+		sctx->tracked_regs.reg_saved |= 7 << reg;
+	}
+}
+
+/**
+ * Set 4 consecutive registers if any register's value differs from its tracked value.
+ */
+static inline void radeon_opt_set_context_reg4(struct si_context *sctx, unsigned offset,
+					       enum si_tracked_reg reg, unsigned value1,
+					       unsigned value2, unsigned value3,
+					       unsigned value4)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+	if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 2))) ||
+	    !(sctx->tracked_regs.reg_saved & (1 << (reg + 3))) ||
+	    sctx->tracked_regs.reg_value[reg] != value1 ||
+	    sctx->tracked_regs.reg_value[reg+1] != value2 ||
+	    sctx->tracked_regs.reg_value[reg+2] != value3 ||
+	    sctx->tracked_regs.reg_value[reg+3] != value4 ) {
+
+		radeon_set_context_reg_seq(cs, offset, 4);
+		radeon_emit(cs, value1);
+		radeon_emit(cs, value2);
+		radeon_emit(cs, value3);
+		radeon_emit(cs, value4);
+
+		sctx->tracked_regs.reg_value[reg] = value1;
+		sctx->tracked_regs.reg_value[reg+1] = value2;
+		sctx->tracked_regs.reg_value[reg+2] = value3;
+		sctx->tracked_regs.reg_value[reg+3] = value4;
+		sctx->tracked_regs.reg_saved |= 0xf << reg;
+	}
+}
+
+/**
+ * Set consecutive registers if any register's value differs from its saved value.
+ */
+static inline void radeon_opt_set_context_regn(struct si_context *sctx, unsigned offset,
+					       unsigned *value, unsigned *saved_val,
+					       unsigned num)
+{
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+	int i, j;
+
+	for (i = 0; i < num; i++) {
+		if (saved_val[i] != value[i]) {
+			radeon_set_context_reg_seq(cs, offset, num);
+			for (j = 0; j < num; j++)
+				radeon_emit(cs, value[j]);
+
+			memcpy(saved_val, value, sizeof(uint32_t) * num);
+			break;
+		}
+	}
+}
+
 #endif
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 4e05d9b..4e07de8 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -35,37 +35,27 @@
 };
 
 static void si_alloc_separate_cmask(struct si_screen *sscreen,
-				    struct r600_texture *rtex)
+				    struct si_texture *tex)
 {
-	if (rtex->cmask_buffer)
+	if (tex->cmask_buffer || !tex->surface.cmask_size)
                 return;
 
-	assert(rtex->cmask.size == 0);
-
-	si_texture_get_cmask_info(sscreen, rtex, &rtex->cmask);
-	if (!rtex->cmask.size)
-		return;
-
-	rtex->cmask_buffer = (struct r600_resource *)
+	tex->cmask_buffer =
 		si_aligned_buffer_create(&sscreen->b,
 					 SI_RESOURCE_FLAG_UNMAPPABLE,
 					 PIPE_USAGE_DEFAULT,
-					 rtex->cmask.size,
-					 rtex->cmask.alignment);
-	if (rtex->cmask_buffer == NULL) {
-		rtex->cmask.size = 0;
+					 tex->surface.cmask_size,
+					 tex->surface.cmask_alignment);
+	if (tex->cmask_buffer == NULL)
 		return;
-	}
 
-	/* update colorbuffer state bits */
-	rtex->cmask.base_address_reg = rtex->cmask_buffer->gpu_address >> 8;
-
-	rtex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+	tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
+	tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
 
 	p_atomic_inc(&sscreen->compressed_colortex_counter);
 }
 
-static void si_set_clear_color(struct r600_texture *rtex,
+static bool si_set_clear_color(struct si_texture *tex,
 			       enum pipe_format surface_format,
 			       const union pipe_color_union *color)
 {
@@ -73,7 +63,7 @@
 
 	memset(&uc, 0, sizeof(uc));
 
-	if (rtex->surface.bpe == 16) {
+	if (tex->surface.bpe == 16) {
 		/* DCC fast clear only:
 		 *   CLEAR_WORD0 = R = G = B
 		 *   CLEAR_WORD1 = A
@@ -90,61 +80,77 @@
 		util_pack_color(color->f, surface_format, &uc);
 	}
 
-	memcpy(rtex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+	if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
+		return false;
+
+	memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
+	return true;
 }
 
-static bool vi_get_fast_clear_parameters(enum pipe_format surface_format,
-					 const union pipe_color_union *color,
-					 uint32_t* reset_value,
-					 bool* clear_words_needed)
+/** Linearize and convert luminance/intensity to red. */
+enum pipe_format si_simplify_cb_format(enum pipe_format format)
 {
-	bool values[4] = {};
-	int i;
-	bool main_value = false;
-	bool extra_value = false;
-	int extra_channel;
+	format = util_format_linear(format);
+	format = util_format_luminance_to_red(format);
+	return util_format_intensity_to_red(format);
+}
 
-	/* This is needed to get the correct DCC clear value for luminance formats.
-	 * 1) Get the linear format (because the next step can't handle L8_SRGB).
-	 * 2) Convert luminance to red. (the real hw format for luminance)
+bool vi_alpha_is_on_msb(enum pipe_format format)
+{
+	format = si_simplify_cb_format(format);
+
+	/* Formats with 3 channels can't have alpha. */
+	if (util_format_description(format)->nr_channels == 3)
+		return true; /* same as xxxA; is any value OK here? */
+
+	return si_translate_colorswap(format, false) <= 1;
+}
+
+static bool vi_get_fast_clear_parameters(enum pipe_format base_format,
+					 enum pipe_format surface_format,
+					 const union pipe_color_union *color,
+					 uint32_t* clear_value,
+					 bool *eliminate_needed)
+{
+	/* If we want to clear without needing a fast clear eliminate step, we
+	 * can set color and alpha independently to 0 or 1 (or 0/max for integer
+	 * formats).
 	 */
-	surface_format = util_format_linear(surface_format);
-	surface_format = util_format_luminance_to_red(surface_format);
+	bool values[4] = {}; /* whether to clear to 0 or 1 */
+	bool color_value = false; /* clear color to 0 or 1 */
+	bool alpha_value = false; /* clear alpha to 0 or 1 */
+	int alpha_channel; /* index of the alpha component */
+	bool has_color = false;
+	bool has_alpha = false;
 
-	const struct util_format_description *desc = util_format_description(surface_format);
+	const struct util_format_description *desc =
+		util_format_description(si_simplify_cb_format(surface_format));
 
+	/* 128-bit fast clear with different R,G,B values is unsupported. */
 	if (desc->block.bits == 128 &&
 	    (color->ui[0] != color->ui[1] ||
 	     color->ui[0] != color->ui[2]))
 		return false;
 
-	*clear_words_needed = true;
-	*reset_value = 0x20202020U;
+	*eliminate_needed = true;
+	*clear_value = 0x20202020U; /* use CB clear color registers */
 
-	/* If we want to clear without needing a fast clear eliminate step, we
-	 * can set each channel to 0 or 1 (or 0/max for integer formats). We
-	 * have two sets of flags, one for the last or first channel(extra) and
-	 * one for the other channels(main).
-	 */
+	if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
+		return true; /* need ELIMINATE_FAST_CLEAR */
 
-	if (surface_format == PIPE_FORMAT_R11G11B10_FLOAT ||
-	    surface_format == PIPE_FORMAT_B5G6R5_UNORM ||
-	    surface_format == PIPE_FORMAT_B5G6R5_SRGB ||
-	    util_format_is_alpha(surface_format)) {
-		extra_channel = -1;
-	} else if (desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
-		if (si_translate_colorswap(surface_format, false) <= 1)
-			extra_channel = desc->nr_channels - 1;
-		else
-			extra_channel = 0;
-	} else
-		return true;
+	bool base_alpha_is_on_msb = vi_alpha_is_on_msb(base_format);
+	bool surf_alpha_is_on_msb = vi_alpha_is_on_msb(surface_format);
 
-	for (i = 0; i < 4; ++i) {
-		int index = desc->swizzle[i] - PIPE_SWIZZLE_X;
+	/* Formats with 3 channels can't have alpha. */
+	if (desc->nr_channels == 3)
+		alpha_channel = -1;
+	else if (surf_alpha_is_on_msb)
+		alpha_channel = desc->nr_channels - 1;
+	else
+		alpha_channel = 0;
 
-		if (desc->swizzle[i] < PIPE_SWIZZLE_X ||
-		    desc->swizzle[i] > PIPE_SWIZZLE_W)
+	for (int i = 0; i < 4; ++i) {
+		if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
 			continue;
 
 		if (desc->channel[i].pure_integer &&
@@ -154,7 +160,7 @@
 
 			values[i] = color->i[i] != 0;
 			if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
-				return true;
+				return true; /* need ELIMINATE_FAST_CLEAR */
 		} else if (desc->channel[i].pure_integer &&
 			   desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
 			/* Use the maximum value for clamping the clear color. */
@@ -162,71 +168,90 @@
 
 			values[i] = color->ui[i] != 0U;
 			if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
-				return true;
+				return true; /* need ELIMINATE_FAST_CLEAR */
 		} else {
 			values[i] = color->f[i] != 0.0F;
 			if (color->f[i] != 0.0F && color->f[i] != 1.0F)
-				return true;
+				return true; /* need ELIMINATE_FAST_CLEAR */
 		}
 
-		if (index == extra_channel)
-			extra_value = values[i];
-		else
-			main_value = values[i];
+		if (desc->swizzle[i] == alpha_channel) {
+			alpha_value = values[i];
+			has_alpha = true;
+		} else {
+			color_value = values[i];
+			has_color = true;
+		}
 	}
 
-	for (int i = 0; i < 4; ++i)
-		if (values[i] != main_value &&
-		    desc->swizzle[i] - PIPE_SWIZZLE_X != extra_channel &&
-		    desc->swizzle[i] >= PIPE_SWIZZLE_X &&
-		    desc->swizzle[i] <= PIPE_SWIZZLE_W)
-			return true;
+	/* If alpha isn't present, make it the same as color, and vice versa. */
+	if (!has_alpha)
+		alpha_value = color_value;
+	else if (!has_color)
+		color_value = alpha_value;
 
-	*clear_words_needed = false;
-	if (main_value)
-		*reset_value |= 0x80808080U;
+	if (color_value != alpha_value &&
+	    base_alpha_is_on_msb != surf_alpha_is_on_msb)
+		return true; /* require ELIMINATE_FAST_CLEAR */
 
-	if (extra_value)
-		*reset_value |= 0x40404040U;
+	/* Check if all color values are equal if they are present. */
+	for (int i = 0; i < 4; ++i) {
+		if (desc->swizzle[i] <= PIPE_SWIZZLE_W &&
+		    desc->swizzle[i] != alpha_channel &&
+		    values[i] != color_value)
+			return true; /* require ELIMINATE_FAST_CLEAR */
+	}
+
+	/* This doesn't need ELIMINATE_FAST_CLEAR.
+	 * CB uses both the DCC clear codes and the CB clear color registers,
+	 * so they must match.
+	 */
+	*eliminate_needed = false;
+
+	if (color_value)
+		*clear_value |= 0x80808080U;
+	if (alpha_value)
+		*clear_value |= 0x40404040U;
 	return true;
 }
 
 void vi_dcc_clear_level(struct si_context *sctx,
-			struct r600_texture *rtex,
+			struct si_texture *tex,
 			unsigned level, unsigned clear_value)
 {
 	struct pipe_resource *dcc_buffer;
 	uint64_t dcc_offset, clear_size;
 
-	assert(vi_dcc_enabled(rtex, level));
+	assert(vi_dcc_enabled(tex, level));
 
-	if (rtex->dcc_separate_buffer) {
-		dcc_buffer = &rtex->dcc_separate_buffer->b.b;
+	if (tex->dcc_separate_buffer) {
+		dcc_buffer = &tex->dcc_separate_buffer->b.b;
 		dcc_offset = 0;
 	} else {
-		dcc_buffer = &rtex->resource.b.b;
-		dcc_offset = rtex->dcc_offset;
+		dcc_buffer = &tex->buffer.b.b;
+		dcc_offset = tex->dcc_offset;
 	}
 
 	if (sctx->chip_class >= GFX9) {
 		/* Mipmap level clears aren't implemented. */
-		assert(rtex->resource.b.b.last_level == 0);
-		/* MSAA needs a different clear size. */
-		assert(rtex->resource.b.b.nr_samples <= 1);
-		clear_size = rtex->surface.dcc_size;
+		assert(tex->buffer.b.b.last_level == 0);
+		/* 4x and 8x MSAA need a sophisticated compute shader for
+		 * the clear. See AMDVLK. */
+		assert(tex->buffer.b.b.nr_storage_samples <= 2);
+		clear_size = tex->surface.dcc_size;
 	} else {
-		unsigned num_layers = util_num_layers(&rtex->resource.b.b, level);
+		unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
 
 		/* If this is 0, fast clear isn't possible. (can occur with MSAA) */
-		assert(rtex->surface.u.legacy.level[level].dcc_fast_clear_size);
-		/* Layered MSAA DCC fast clears need to clear dcc_fast_clear_size
-		 * bytes for each layer. This is not currently implemented, and
-		 * therefore MSAA DCC isn't even enabled with multiple layers.
+		assert(tex->surface.u.legacy.level[level].dcc_fast_clear_size);
+		/* Layered 4x and 8x MSAA DCC fast clears need to clear
+		 * dcc_fast_clear_size bytes for each layer. A compute shader
+		 * would be more efficient than separate per-layer clear operations.
 		 */
-		assert(rtex->resource.b.b.nr_samples <= 1 || num_layers == 1);
+		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
 
-		dcc_offset += rtex->surface.u.legacy.level[level].dcc_offset;
-		clear_size = rtex->surface.u.legacy.level[level].dcc_fast_clear_size *
+		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
+		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
 			     num_layers;
 	}
 
@@ -239,20 +264,20 @@
  * src and dst micro tile modes match.
  */
 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
-					   struct r600_texture *rtex)
+					   struct si_texture *tex)
 {
-	if (rtex->resource.b.is_shared ||
-	    rtex->resource.b.b.nr_samples <= 1 ||
-	    rtex->surface.micro_tile_mode == rtex->last_msaa_resolve_target_micro_mode)
+	if (tex->buffer.b.is_shared ||
+	    tex->buffer.b.b.nr_samples <= 1 ||
+	    tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
 		return;
 
 	assert(sscreen->info.chip_class >= GFX9 ||
-	       rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
-	assert(rtex->resource.b.b.last_level == 0);
+	       tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+	assert(tex->buffer.b.b.last_level == 0);
 
 	if (sscreen->info.chip_class >= GFX9) {
 		/* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
-		assert(rtex->surface.u.gfx9.surf.swizzle_mode >= 4);
+		assert(tex->surface.u.gfx9.surf.swizzle_mode >= 4);
 
 		/* If you do swizzle_mode % 4, you'll get:
 		 *   0 = Depth
@@ -262,20 +287,20 @@
 		 *
 		 * Depth-sample order isn't allowed:
 		 */
-		assert(rtex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
+		assert(tex->surface.u.gfx9.surf.swizzle_mode % 4 != 0);
 
-		switch (rtex->last_msaa_resolve_target_micro_mode) {
+		switch (tex->last_msaa_resolve_target_micro_mode) {
 		case RADEON_MICRO_MODE_DISPLAY:
-			rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-			rtex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 2; /* D */
 			break;
 		case RADEON_MICRO_MODE_THIN:
-			rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-			rtex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 1; /* S */
 			break;
 		case RADEON_MICRO_MODE_ROTATED:
-			rtex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
-			rtex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
+			tex->surface.u.gfx9.surf.swizzle_mode &= ~0x3;
+			tex->surface.u.gfx9.surf.swizzle_mode += 3; /* R */
 			break;
 		default: /* depth */
 			assert(!"unexpected micro mode");
@@ -286,48 +311,48 @@
 		 * any definitions for them either. They are all 2D_TILED_THIN1
 		 * modes with different bpp and micro tile mode.
 		 */
-		switch (rtex->last_msaa_resolve_target_micro_mode) {
+		switch (tex->last_msaa_resolve_target_micro_mode) {
 		case RADEON_MICRO_MODE_DISPLAY:
-			rtex->surface.u.legacy.tiling_index[0] = 10;
+			tex->surface.u.legacy.tiling_index[0] = 10;
 			break;
 		case RADEON_MICRO_MODE_THIN:
-			rtex->surface.u.legacy.tiling_index[0] = 14;
+			tex->surface.u.legacy.tiling_index[0] = 14;
 			break;
 		case RADEON_MICRO_MODE_ROTATED:
-			rtex->surface.u.legacy.tiling_index[0] = 28;
+			tex->surface.u.legacy.tiling_index[0] = 28;
 			break;
 		default: /* depth, thick */
 			assert(!"unexpected micro mode");
 			return;
 		}
 	} else { /* SI */
-		switch (rtex->last_msaa_resolve_target_micro_mode) {
+		switch (tex->last_msaa_resolve_target_micro_mode) {
 		case RADEON_MICRO_MODE_DISPLAY:
-			switch (rtex->surface.bpe) {
+			switch (tex->surface.bpe) {
 			case 1:
-                            rtex->surface.u.legacy.tiling_index[0] = 10;
+                            tex->surface.u.legacy.tiling_index[0] = 10;
                             break;
 			case 2:
-                            rtex->surface.u.legacy.tiling_index[0] = 11;
+                            tex->surface.u.legacy.tiling_index[0] = 11;
                             break;
 			default: /* 4, 8 */
-                            rtex->surface.u.legacy.tiling_index[0] = 12;
+                            tex->surface.u.legacy.tiling_index[0] = 12;
                             break;
 			}
 			break;
 		case RADEON_MICRO_MODE_THIN:
-			switch (rtex->surface.bpe) {
+			switch (tex->surface.bpe) {
 			case 1:
-                                rtex->surface.u.legacy.tiling_index[0] = 14;
+                                tex->surface.u.legacy.tiling_index[0] = 14;
                                 break;
 			case 2:
-                                rtex->surface.u.legacy.tiling_index[0] = 15;
+                                tex->surface.u.legacy.tiling_index[0] = 15;
                                 break;
 			case 4:
-                                rtex->surface.u.legacy.tiling_index[0] = 16;
+                                tex->surface.u.legacy.tiling_index[0] = 16;
                                 break;
 			default: /* 8, 16 */
-                                rtex->surface.u.legacy.tiling_index[0] = 17;
+                                tex->surface.u.legacy.tiling_index[0] = 17;
                                 break;
 			}
 			break;
@@ -337,7 +362,7 @@
 		}
 	}
 
-	rtex->surface.micro_tile_mode = rtex->last_msaa_resolve_target_micro_mode;
+	tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
 
 	p_atomic_inc(&sscreen->dirty_tex_counter);
 }
@@ -358,7 +383,7 @@
 		return;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
-		struct r600_texture *tex;
+		struct si_texture *tex;
 		unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
 
 		if (!fb->cbufs[i])
@@ -369,16 +394,23 @@
 			continue;
 
 		unsigned level = fb->cbufs[i]->u.tex.level;
-		tex = (struct r600_texture *)fb->cbufs[i]->texture;
+		if (level > 0)
+			continue;
+
+		tex = (struct si_texture *)fb->cbufs[i]->texture;
+
+		/* TODO: GFX9: Implement DCC fast clear for level 0 of
+		 * mipmapped textures. Mipmapped DCC has to clear a rectangular
+		 * area of DCC for level 0 (because the whole miptree is
+		 * organized in a 2D plane).
+		 */
+		if (sctx->chip_class >= GFX9 &&
+		    tex->buffer.b.b.last_level > 0)
+			continue;
 
 		/* the clear is allowed if all layers are bound */
 		if (fb->cbufs[i]->u.tex.first_layer != 0 ||
-		    fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->resource.b.b, 0)) {
-			continue;
-		}
-
-		/* cannot clear mipmapped textures */
-		if (fb->cbufs[i]->texture->last_level != 0) {
+		    fb->cbufs[i]->u.tex.last_layer != util_max_layer(&tex->buffer.b.b, 0)) {
 			continue;
 		}
 
@@ -391,34 +423,14 @@
 		 * because there is no way to communicate the clear color among
 		 * all clients
 		 */
-		if (tex->resource.b.is_shared &&
-		    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
+		if (tex->buffer.b.is_shared &&
+		    !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
 			continue;
 
-		/* fast color clear with 1D tiling doesn't work on old kernels and CIK */
-		if (sctx->chip_class == CIK &&
+		if (sctx->chip_class <= VI &&
 		    tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
-		    sctx->screen->info.drm_major == 2 &&
-		    sctx->screen->info.drm_minor < 38) {
+		    !sctx->screen->info.htile_cmask_support_1d_tiling)
 			continue;
-		}
-
-		/* Fast clear is the most appropriate place to enable DCC for
-		 * displayable surfaces.
-		 */
-		if (sctx->chip_class >= VI &&
-		    !(sctx->screen->debug_flags & DBG(NO_DCC_FB))) {
-			vi_separate_dcc_try_enable(sctx, tex);
-
-			/* RB+ isn't supported with a CMASK clear only on Stoney,
-			 * so all clears are considered to be hypothetically slow
-			 * clears, which is weighed when determining whether to
-			 * enable separate DCC.
-			 */
-			if (tex->dcc_gather_statistics &&
-			    sctx->family == CHIP_STONEY)
-				tex->num_slow_clears++;
-		}
 
 		bool need_decompress_pass = false;
 
@@ -428,46 +440,62 @@
 		 *
 		 * This helps on both dGPUs and APUs, even small APUs like Mullins.
 		 */
-		bool too_small = tex->resource.b.b.nr_samples <= 1 &&
-				 tex->resource.b.b.width0 *
-				 tex->resource.b.b.height0 <= 512 * 512;
+		bool too_small = tex->buffer.b.b.nr_samples <= 1 &&
+				 tex->buffer.b.b.width0 *
+				 tex->buffer.b.b.height0 <= 512 * 512;
+
+		/* Fast clear is the most appropriate place to enable DCC for
+		 * displayable surfaces.
+		 */
+		if (sctx->family == CHIP_STONEY && !too_small) {
+			vi_separate_dcc_try_enable(sctx, tex);
+
+			/* RB+ isn't supported with a CMASK clear only on Stoney,
+			 * so all clears are considered to be hypothetically slow
+			 * clears, which is weighed when determining whether to
+			 * enable separate DCC.
+			 */
+			if (tex->dcc_gather_statistics) /* only for Stoney */
+				tex->num_slow_clears++;
+		}
 
 		/* Try to clear DCC first, otherwise try CMASK. */
 		if (vi_dcc_enabled(tex, 0)) {
 			uint32_t reset_value;
-			bool clear_words_needed;
+			bool eliminate_needed;
 
 			if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
 				continue;
 
-			/* This can only occur with MSAA. */
+			/* This can happen with mipmapping or MSAA. */
 			if (sctx->chip_class == VI &&
 			    !tex->surface.u.legacy.level[level].dcc_fast_clear_size)
 				continue;
 
-			if (!vi_get_fast_clear_parameters(fb->cbufs[i]->format,
+			if (!vi_get_fast_clear_parameters(tex->buffer.b.b.format,
+							  fb->cbufs[i]->format,
 							  color, &reset_value,
-							  &clear_words_needed))
+							  &eliminate_needed))
 				continue;
 
-			if (clear_words_needed && too_small)
+			if (eliminate_needed && too_small)
 				continue;
 
 			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
-			if (tex->resource.b.b.nr_samples >= 2 && tex->cmask.size) {
+			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
 				/* TODO: This doesn't work with MSAA. */
-				if (clear_words_needed)
+				if (eliminate_needed)
 					continue;
 
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-						tex->cmask.offset, tex->cmask.size,
+						tex->cmask_offset, tex->surface.cmask_size,
 						0xCCCCCCCC, SI_COHERENCY_CB_META);
 				need_decompress_pass = true;
 			}
 
 			vi_dcc_clear_level(sctx, tex, 0, reset_value);
 
-			if (clear_words_needed)
+			if (eliminate_needed)
 				need_decompress_pass = true;
 
 			tex->separate_dcc_dirty = true;
@@ -486,13 +514,12 @@
 
 			/* ensure CMASK is enabled */
 			si_alloc_separate_cmask(sctx->screen, tex);
-			if (tex->cmask.size == 0) {
+			if (!tex->cmask_buffer)
 				continue;
-			}
 
 			/* Do the fast clear. */
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-					tex->cmask.offset, tex->cmask.size, 0,
+					tex->cmask_offset, tex->surface.cmask_size, 0,
 					SI_COHERENCY_CB_META);
 			need_decompress_pass = true;
 		}
@@ -506,10 +533,10 @@
 		/* We can change the micro tile mode before a full clear. */
 		si_set_optimal_micro_tile_mode(sctx->screen, tex);
 
-		si_set_clear_color(tex, fb->cbufs[i]->format, color);
-
-		sctx->framebuffer.dirty_cbufs |= 1 << i;
-		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
+		if (si_set_clear_color(tex, fb->cbufs[i]->format, color)) {
+			sctx->framebuffer.dirty_cbufs |= 1 << i;
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+		}
 		*buffers &= ~clear_bit;
 	}
 }
@@ -521,8 +548,8 @@
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
 	struct pipe_surface *zsbuf = fb->zsbuf;
-	struct r600_texture *zstex =
-		zsbuf ? (struct r600_texture*)zsbuf->texture : NULL;
+	struct si_texture *zstex =
+		zsbuf ? (struct si_texture*)zsbuf->texture : NULL;
 
 	if (buffers & PIPE_CLEAR_COLOR) {
 		si_do_fast_color_clear(sctx, &buffers, color);
@@ -531,14 +558,14 @@
 
 		/* These buffers cannot use fast clear, make sure to disable expansion. */
 		for (unsigned i = 0; i < fb->nr_cbufs; i++) {
-			struct r600_texture *tex;
+			struct si_texture *tex;
 
 			/* If not clearing this buffer, skip. */
 			if (!(buffers & (PIPE_CLEAR_COLOR0 << i)) || !fb->cbufs[i])
 				continue;
 
-			tex = (struct r600_texture *)fb->cbufs[i]->texture;
-			if (tex->fmask.size == 0)
+			tex = (struct si_texture *)fb->cbufs[i]->texture;
+			if (tex->surface.fmask_size == 0)
 				tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
 		}
 	}
@@ -546,7 +573,7 @@
 	if (zstex &&
 	    si_htile_enabled(zstex, zsbuf->u.tex.level) &&
 	    zsbuf->u.tex.first_layer == 0 &&
-	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
+	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
 		/* TC-compatible HTILE only supports depth clears to 0 or 1. */
 		if (buffers & PIPE_CLEAR_DEPTH &&
 		    (!zstex->tc_compatible_htile ||
@@ -557,11 +584,14 @@
 				sctx->db_depth_disable_expclear = true;
 			}
 
-			zstex->depth_clear_value = depth;
-			sctx->framebuffer.dirty_zsbuf = true;
-			si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
+			if (zstex->depth_clear_value != (float)depth) {
+				/* Update DB_DEPTH_CLEAR. */
+				zstex->depth_clear_value = depth;
+				sctx->framebuffer.dirty_zsbuf = true;
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+			}
 			sctx->db_depth_clear = true;
-			si_mark_atom_dirty(sctx, &sctx->db_render_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 		}
 
 		/* TC-compatible HTILE only supports stencil clears to 0. */
@@ -575,11 +605,14 @@
 				sctx->db_stencil_disable_expclear = true;
 			}
 
-			zstex->stencil_clear_value = stencil;
-			sctx->framebuffer.dirty_zsbuf = true;
-			si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_STENCIL_CLEAR */
+			if (zstex->stencil_clear_value != (uint8_t)stencil) {
+				/* Update DB_STENCIL_CLEAR. */
+				zstex->stencil_clear_value = stencil;
+				sctx->framebuffer.dirty_zsbuf = true;
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
+			}
 			sctx->db_stencil_clear = true;
-			si_mark_atom_dirty(sctx, &sctx->db_render_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 		}
 
 		/* TODO: Find out what's wrong here. Fast depth clear leads to
@@ -608,14 +641,14 @@
 		sctx->db_depth_clear = false;
 		sctx->db_depth_disable_expclear = false;
 		zstex->depth_cleared = true;
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 	}
 
 	if (sctx->db_stencil_clear) {
 		sctx->db_stencil_clear = false;
 		sctx->db_stencil_disable_expclear = false;
 		zstex->stencil_cleared = true;
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 	}
 }
 
@@ -660,7 +693,7 @@
 			     const void *data)
 {
 	struct pipe_screen *screen = pipe->screen;
-	struct r600_texture *rtex = (struct r600_texture*)tex;
+	struct si_texture *stex = (struct si_texture*)tex;
 	struct pipe_surface tmpl = {{0}};
 	struct pipe_surface *sf;
 	const struct util_format_description *desc =
@@ -674,7 +707,7 @@
 	if (!sf)
 		return;
 
-	if (rtex->is_depth) {
+	if (stex->is_depth) {
 		unsigned clear;
 		float depth;
 		uint8_t stencil = 0;
@@ -683,7 +716,7 @@
 		clear = PIPE_CLEAR_DEPTH;
 		desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
 
-		if (rtex->surface.has_stencil) {
+		if (stex->surface.has_stencil) {
 			clear |= PIPE_CLEAR_STENCIL;
 			desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
 		}
@@ -703,7 +736,7 @@
 			desc->unpack_rgba_float(color.f, 0, data, 0, 1, 1);
 
 		if (screen->is_format_supported(screen, tex->format,
-						tex->target, 0,
+						tex->target, 0, 0,
 						PIPE_BIND_RENDER_TARGET)) {
 			si_clear_render_target(pipe, sf, &color,
 					       box->x, box->y,
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index e8fe852..2349be9 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -86,17 +86,18 @@
 	struct si_compute *program = (struct si_compute *)job;
 	struct si_shader *shader = &program->shader;
 	struct si_shader_selector sel;
-	LLVMTargetMachineRef tm;
+	struct ac_llvm_compiler *compiler;
 	struct pipe_debug_callback *debug = &program->compiler_ctx_state.debug;
+	struct si_screen *sscreen = program->screen;
 
 	assert(!debug->debug_message || debug->async);
 	assert(thread_index >= 0);
-	assert(thread_index < ARRAY_SIZE(program->screen->tm));
-	tm = program->screen->tm[thread_index];
+	assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+	compiler = &sscreen->compiler[thread_index];
 
 	memset(&sel, 0, sizeof(sel));
 
-	sel.screen = program->screen;
+	sel.screen = sscreen;
 
 	if (program->ir_type == PIPE_SHADER_IR_TGSI) {
 		tgsi_scan_shader(program->ir.tgsi, &sel.info);
@@ -109,9 +110,12 @@
 		si_lower_nir(&sel);
 	}
 
+	/* Store the declared LDS size into tgsi_shader_info for the shader
+	 * cache to include it.
+	 */
+	sel.info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE] = program->local_size;
 
 	sel.type = PIPE_SHADER_COMPUTE;
-	sel.local_size = program->local_size;
 	si_get_active_slot_masks(&sel.info,
 				 &program->active_const_and_shader_buffers,
 				 &program->active_samplers_and_images);
@@ -122,10 +126,36 @@
 	program->uses_block_size = sel.info.uses_block_size;
 	program->uses_bindless_samplers = sel.info.uses_bindless_samplers;
 	program->uses_bindless_images = sel.info.uses_bindless_images;
+	program->variable_group_size =
+		sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
 
-	if (si_shader_create(program->screen, tm, &program->shader, debug)) {
-		program->shader.compilation_failed = true;
+	void *ir_binary = si_get_ir_binary(&sel);
+
+	/* Try to load the shader from the shader cache. */
+	mtx_lock(&sscreen->shader_cache_mutex);
+
+	if (ir_binary &&
+	    si_shader_cache_load_shader(sscreen, ir_binary, shader)) {
+		mtx_unlock(&sscreen->shader_cache_mutex);
+
+		si_shader_dump_stats_for_shader_db(shader, debug);
+		si_shader_dump(sscreen, shader, debug, PIPE_SHADER_COMPUTE,
+			       stderr, true);
+
+		if (si_shader_binary_upload(sscreen, shader))
+			program->shader.compilation_failed = true;
 	} else {
+		mtx_unlock(&sscreen->shader_cache_mutex);
+
+		if (si_shader_create(sscreen, compiler, &program->shader, debug)) {
+			program->shader.compilation_failed = true;
+
+			if (program->ir_type == PIPE_SHADER_IR_TGSI)
+				FREE(program->ir.tgsi);
+			program->shader.selector = NULL;
+			return;
+		}
+
 		bool scratch_enabled = shader->config.scratch_bytes_per_wave > 0;
 		unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS +
 				      (sel.info.uses_grid_size ? 3 : 0) +
@@ -147,8 +177,12 @@
 						sel.info.uses_thread_id[1] ? 1 : 0) |
 			S_00B84C_LDS_SIZE(shader->config.lds_size);
 
-		program->variable_group_size =
-			sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0;
+		if (ir_binary) {
+			mtx_lock(&sscreen->shader_cache_mutex);
+			if (!si_shader_cache_insert_shader(sscreen, ir_binary, shader, true))
+				FREE(ir_binary);
+			mtx_unlock(&sscreen->shader_cache_mutex);
+		}
 	}
 
 	if (program->ir_type == PIPE_SHADER_IR_TGSI)
@@ -188,28 +222,11 @@
 		program->compiler_ctx_state.debug = sctx->debug;
 		program->compiler_ctx_state.is_debug_context = sctx->is_debug;
 		p_atomic_inc(&sscreen->num_shaders_created);
-		util_queue_fence_init(&program->ready);
 
-		struct util_async_debug_callback async_debug;
-		bool wait =
-			(sctx->debug.debug_message && !sctx->debug.async) ||
-			sctx->is_debug ||
-			si_can_dump_shader(sscreen, PIPE_SHADER_COMPUTE);
-
-		if (wait) {
-			u_async_debug_init(&async_debug);
-			program->compiler_ctx_state.debug = async_debug.base;
-		}
-
-		util_queue_add_job(&sscreen->shader_compiler_queue,
-				   program, &program->ready,
-				   si_create_compute_state_async, NULL);
-
-		if (wait) {
-			util_queue_fence_wait(&program->ready);
-			u_async_debug_drain(&async_debug, &sctx->debug);
-			u_async_debug_cleanup(&async_debug);
-		}
+		si_schedule_initial_compile(sctx, PIPE_SHADER_COMPUTE,
+					    &program->ready,
+					    &program->compiler_ctx_state,
+					    program, si_create_compute_state_async);
 	} else {
 		const struct pipe_llvm_program_header *header;
 		const char *code;
@@ -298,7 +315,7 @@
 
 static void si_initialize_compute(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint64_t bc_va;
 
 	radeon_set_sh_reg_seq(cs, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, 2);
@@ -337,9 +354,7 @@
 		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
 		radeon_emit(cs, S_030E04_ADDRESS(bc_va >> 40)); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
 	} else {
-		if (sctx->screen->info.drm_major == 3 ||
-		    (sctx->screen->info.drm_major == 2 &&
-		     sctx->screen->info.drm_minor >= 48)) {
+		if (sctx->screen->info.si_TA_CS_BC_BASE_ADDR_allowed) {
 			radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR,
 					      bc_va >> 8);
 		}
@@ -362,11 +377,11 @@
 	if (scratch_bo_size < scratch_needed) {
 		r600_resource_reference(&sctx->compute_scratch_buffer, NULL);
 
-		sctx->compute_scratch_buffer = (struct r600_resource*)
+		sctx->compute_scratch_buffer =
 			si_aligned_buffer_create(&sctx->screen->b,
-						   SI_RESOURCE_FLAG_UNMAPPABLE,
-						   PIPE_USAGE_DEFAULT,
-						   scratch_needed, 256);
+						 SI_RESOURCE_FLAG_UNMAPPABLE,
+						 PIPE_USAGE_DEFAULT,
+						 scratch_needed, 256);
 
 		if (!sctx->compute_scratch_buffer)
 			return false;
@@ -393,7 +408,7 @@
 				     const amd_kernel_code_t *code_object,
 				     unsigned offset)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_shader_config inline_config = {0};
 	struct si_shader_config *config;
 	uint64_t shader_va;
@@ -497,7 +512,7 @@
 					  const amd_kernel_code_t *code_object,
 					  unsigned user_sgpr)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint64_t scratch_va = sctx->compute_scratch_buffer->gpu_address;
 
 	unsigned max_private_element_size = AMD_HSA_BITS_GET(
@@ -542,7 +557,7 @@
 				      uint64_t kernel_args_va)
 {
 	struct si_compute *program = sctx->cs_shader_state.program;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	static const enum amd_code_property_mask_t workgroup_count_masks [] = {
 		AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X,
@@ -631,7 +646,7 @@
 				    const amd_kernel_code_t *code_object,
 				    const struct pipe_grid_info *info)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_compute *program = sctx->cs_shader_state.program;
 	struct r600_resource *input_buffer = NULL;
 	unsigned kernel_args_size;
@@ -695,7 +710,7 @@
                                 const struct pipe_grid_info *info)
 {
 	struct si_compute *program = sctx->cs_shader_state.program;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 +
 				 4 * SI_NUM_RESOURCE_SGPRS;
 	unsigned block_size_reg = grid_size_reg +
@@ -709,7 +724,7 @@
 			int i;
 
 			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-					 (struct r600_resource *)info->indirect,
+					 r600_resource(info->indirect),
 					 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
 			for (i = 0; i < 3; ++i) {
@@ -742,7 +757,7 @@
                                      const struct pipe_grid_info *info)
 {
 	struct si_screen *sscreen = sctx->screen;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
 	unsigned waves_per_threadgroup =
 		DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 64);
@@ -780,7 +795,7 @@
 		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-		                 (struct r600_resource *)info->indirect,
+		                 r600_resource(info->indirect),
 		                 RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
 		radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) |
@@ -869,10 +884,9 @@
 	si_upload_compute_shader_descriptors(sctx);
 	si_emit_compute_shader_pointers(sctx);
 
-	if (si_is_atom_dirty(sctx, sctx->atoms.s.render_cond)) {
-		sctx->atoms.s.render_cond->emit(sctx,
-		                                sctx->atoms.s.render_cond);
-		si_set_atom_dirty(sctx, sctx->atoms.s.render_cond, false);
+	if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond)) {
+		sctx->atoms.s.render_cond.emit(sctx);
+		si_set_atom_dirty(sctx, &sctx->atoms.s.render_cond, false);
 	}
 
 	if ((program->input_size ||
@@ -884,7 +898,7 @@
 	/* Global buffers */
 	for (i = 0; i < MAX_GLOBAL_BUFFERS; i++) {
 		struct r600_resource *buffer =
-				(struct r600_resource*)program->global_buffers[i];
+			r600_resource(program->global_buffers[i]);
 		if (!buffer) {
 			continue;
 		}
@@ -949,7 +963,6 @@
 	sctx->b.create_compute_state = si_create_compute_state;
 	sctx->b.delete_compute_state = si_delete_compute_state;
 	sctx->b.bind_compute_state = si_bind_compute_state;
-/*	 ctx->context.create_sampler_view = evergreen_compute_create_sampler_view; */
 	sctx->b.set_compute_resources = si_set_compute_resources;
 	sctx->b.set_global_binding = si_set_global_binding;
 	sctx->b.launch_grid = si_launch_grid;
diff --git a/src/gallium/drivers/radeonsi/si_compute.h b/src/gallium/drivers/radeonsi/si_compute.h
index 3a4cdea..d0a8982 100644
--- a/src/gallium/drivers/radeonsi/si_compute.h
+++ b/src/gallium/drivers/radeonsi/si_compute.h
@@ -29,7 +29,7 @@
 
 #include "si_shader.h"
 
-#define MAX_GLOBAL_BUFFERS 22
+#define MAX_GLOBAL_BUFFERS 32
 
 struct si_compute {
 	struct pipe_reference reference;
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index b316637..f98fad4 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -62,7 +62,7 @@
 			   uint64_t src_va, unsigned size, unsigned flags,
 			   enum si_coherency coher)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint32_t header = 0, command = 0;
 
 	assert(size <= cp_dma_max_byte_count(sctx));
@@ -88,9 +88,9 @@
 	/* Src and dst flags. */
 	if (sctx->chip_class >= GFX9 && !(flags & CP_DMA_CLEAR) &&
 	    src_va == dst_va)
-		header |= S_411_DSL_SEL(V_411_NOWHERE); /* prefetch only */
+		header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
 	else if (flags & CP_DMA_USE_L2)
-		header |= S_411_DSL_SEL(V_411_DST_ADDR_TC_L2);
+		header |= S_411_DST_SEL(V_411_DST_ADDR_TC_L2);
 
 	if (flags & CP_DMA_CLEAR)
 		header |= S_411_SRC_SEL(V_411_DATA);
@@ -186,11 +186,11 @@
 	/* This must be done after need_cs_space. */
 	if (!(user_flags & SI_CPDMA_SKIP_BO_LIST_UPDATE)) {
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-					  (struct r600_resource*)dst,
+					  r600_resource(dst),
 					  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
 		if (src)
 			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-						  (struct r600_resource*)src,
+						  r600_resource(src),
 						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
 	}
 
@@ -380,7 +380,7 @@
 	if (!sctx->scratch_buffer ||
 	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
 		r600_resource_reference(&sctx->scratch_buffer, NULL);
-		sctx->scratch_buffer = (struct r600_resource*)
+		sctx->scratch_buffer =
 			si_aligned_buffer_create(&sctx->screen->b,
 						   SI_RESOURCE_FLAG_UNMAPPABLE,
 						   PIPE_USAGE_DEFAULT,
@@ -388,7 +388,7 @@
 		if (!sctx->scratch_buffer)
 			return;
 
-		si_mark_atom_dirty(sctx, &sctx->scratch_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
 	}
 
 	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index b7d40db..ec4bd03 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -43,7 +43,7 @@
  * Store a linearized copy of all chunks of \p cs together with the buffer
  * list in \p saved.
  */
-void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		struct radeon_saved_cs *saved, bool get_buffer_list)
 {
 	uint32_t *buf;
@@ -294,9 +294,8 @@
 
 static void si_dump_debug_registers(struct si_context *sctx, FILE *f)
 {
-	if (sctx->screen->info.drm_major == 2 &&
-	    sctx->screen->info.drm_minor < 42)
-		return; /* no radeon support */
+	if (!sctx->screen->info.has_read_registers_query)
+		return;
 
 	fprintf(f, "Memory-mapped registers:\n");
 	si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS);
@@ -347,7 +346,7 @@
 	free(chunk);
 }
 
-static void si_parse_current_ib(FILE *f, struct radeon_winsys_cs *cs,
+static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs,
 				unsigned begin, unsigned end,
 				int *last_trace_id, unsigned trace_id_count,
 				const char *name, enum chip_class chip_class)
@@ -360,7 +359,7 @@
 		name, begin);
 
 	for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) {
-		struct radeon_winsys_cs_chunk *chunk = &cs->prev[prev_idx];
+		struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx];
 
 		if (begin < chunk->cdw) {
 			ac_parse_ib_chunk(f, chunk->buf + begin,
@@ -497,10 +496,6 @@
 	        ITEM(IB2),
 	        ITEM(DRAW_INDIRECT),
 	        ITEM(INDEX_BUFFER),
-	        ITEM(VCE),
-	        ITEM(UVD),
-	        ITEM(SDMA_BUFFER),
-	        ITEM(SDMA_TEXTURE),
 		ITEM(CP_DMA),
 	        ITEM(CONST_BUFFER),
 	        ITEM(DESCRIPTORS),
@@ -516,9 +511,7 @@
 	        ITEM(DEPTH_BUFFER),
 	        ITEM(COLOR_BUFFER_MSAA),
 	        ITEM(DEPTH_BUFFER_MSAA),
-	        ITEM(CMASK),
-	        ITEM(DCC),
-	        ITEM(HTILE),
+	        ITEM(SEPARATE_META),
 		ITEM(SHADER_BINARY),
 		ITEM(SHADER_RINGS),
 		ITEM(SCRATCH_BUFFER),
@@ -575,8 +568,8 @@
 			size / page_size, va / page_size, (va + size) / page_size);
 
 		/* Print the usage. */
-		for (j = 0; j < 64; j++) {
-			if (!(saved->bo_list[i].priority_usage & (1ull << j)))
+		for (j = 0; j < 32; j++) {
+			if (!(saved->bo_list[i].priority_usage & (1u << j)))
 				continue;
 
 			fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j));
@@ -591,23 +584,23 @@
 static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log)
 {
 	struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
-	struct r600_texture *rtex;
+	struct si_texture *tex;
 	int i;
 
 	for (i = 0; i < state->nr_cbufs; i++) {
 		if (!state->cbufs[i])
 			continue;
 
-		rtex = (struct r600_texture*)state->cbufs[i]->texture;
+		tex = (struct si_texture*)state->cbufs[i]->texture;
 		u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i);
-		si_print_texture_info(sctx->screen, rtex, log);
+		si_print_texture_info(sctx->screen, tex, log);
 		u_log_printf(log, "\n");
 	}
 
 	if (state->zsbuf) {
-		rtex = (struct r600_texture*)state->zsbuf->texture;
+		tex = (struct si_texture*)state->zsbuf->texture;
 		u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n");
-		si_print_texture_info(sctx->screen, rtex, log);
+		si_print_texture_info(sctx->screen, tex, log);
 		u_log_printf(log, "\n");
 	}
 }
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 2a24d85..06e95e8 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -100,8 +100,8 @@
 		      ((uint64_t)G_008F04_BASE_ADDRESS_HI(desc[1]) << 32);
 
 	/* Sign-extend the 48-bit address. */
-	if (va & (1ull << 47))
-		va |= 0xffffull << 48;
+	va <<= 16;
+	va = (int64_t)va >> 16;
 	return va;
 }
 
@@ -162,7 +162,7 @@
 		r600_resource_reference(&desc->buffer, NULL);
 		desc->gpu_list = NULL;
 		desc->gpu_address = si_desc_extract_buffer_address(descriptor);
-		si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 		return true;
 	}
 
@@ -192,7 +192,7 @@
 	assert((desc->buffer->gpu_address >> 32) == sctx->screen->info.address32_hi);
 	assert((desc->gpu_address >> 32) == sctx->screen->info.address32_hi);
 
-	si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 	return true;
 }
 
@@ -248,34 +248,28 @@
 				       bool is_stencil_sampler,
 				       bool check_mem)
 {
-	struct r600_resource *rres;
-	struct r600_texture *rtex;
+	struct si_texture *tex = (struct si_texture*)resource;
 	enum radeon_bo_priority priority;
 
 	if (!resource)
 		return;
 
-	if (resource->target != PIPE_BUFFER) {
-		struct r600_texture *tex = (struct r600_texture*)resource;
+	/* Use the flushed depth texture if direct sampling is unsupported. */
+	if (resource->target != PIPE_BUFFER &&
+	    tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler))
+		tex = tex->flushed_depth_texture;
 
-		if (tex->is_depth && !si_can_sample_zs(tex, is_stencil_sampler))
-			resource = &tex->flushed_depth_texture->resource.b.b;
-	}
-
-	rres = (struct r600_resource*)resource;
-	priority = si_get_sampler_view_priority(rres);
-
-	radeon_add_to_gfx_buffer_list_check_mem(sctx, rres, usage, priority,
+	priority = si_get_sampler_view_priority(&tex->buffer);
+	radeon_add_to_gfx_buffer_list_check_mem(sctx, &tex->buffer, usage, priority,
 						check_mem);
 
 	if (resource->target == PIPE_BUFFER)
 		return;
 
-	/* Now add separate DCC or HTILE. */
-	rtex = (struct r600_texture*)resource;
-	if (rtex->dcc_separate_buffer) {
-		radeon_add_to_gfx_buffer_list_check_mem(sctx, rtex->dcc_separate_buffer,
-							usage, RADEON_PRIO_DCC, check_mem);
+	/* Add separate DCC. */
+	if (tex->dcc_separate_buffer) {
+		radeon_add_to_gfx_buffer_list_check_mem(sctx, tex->dcc_separate_buffer,
+							usage, RADEON_PRIO_SEPARATE_META, check_mem);
 	}
 }
 
@@ -317,7 +311,7 @@
  * \param state			descriptor to update
  */
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
-				    struct r600_texture *tex,
+				    struct si_texture *tex,
 				    const struct legacy_surf_level *base_level_info,
 				    unsigned base_level, unsigned first_level,
 				    unsigned block_width, bool is_stencil,
@@ -330,7 +324,7 @@
 		is_stencil = false;
 	}
 
-	va = tex->resource.gpu_address;
+	va = tex->buffer.gpu_address;
 
 	if (sscreen->info.chip_class >= GFX9) {
 		/* Only stencil_offset needs to be added here. */
@@ -358,7 +352,7 @@
 		state[7] = 0;
 
 		if (vi_dcc_enabled(tex, first_level)) {
-			meta_va = (!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
+			meta_va = (!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
 				  tex->dcc_offset;
 
 			if (sscreen->info.chip_class == VI) {
@@ -368,7 +362,7 @@
 
 			meta_va |= (uint32_t)tex->surface.tile_swizzle << 8;
 		} else if (vi_tc_compat_htile_enabled(tex, first_level)) {
-			meta_va = tex->resource.gpu_address + tex->htile_offset;
+			meta_va = tex->buffer.gpu_address + tex->htile_offset;
 		}
 
 		if (meta_va) {
@@ -418,7 +412,7 @@
 
 static void si_set_sampler_state_desc(struct si_sampler_state *sstate,
 				      struct si_sampler_view *sview,
-				      struct r600_texture *tex,
+				      struct si_texture *tex,
 				      uint32_t *desc)
 {
 	if (sview && sview->is_integer)
@@ -436,29 +430,29 @@
 				     uint32_t *desc)
 {
 	struct pipe_sampler_view *view = &sview->base;
-	struct r600_texture *rtex = (struct r600_texture *)view->texture;
-	bool is_buffer = rtex->resource.b.b.target == PIPE_BUFFER;
+	struct si_texture *tex = (struct si_texture *)view->texture;
+	bool is_buffer = tex->buffer.b.b.target == PIPE_BUFFER;
 
 	if (unlikely(!is_buffer && sview->dcc_incompatible)) {
-		if (vi_dcc_enabled(rtex, view->u.tex.first_level))
-			if (!si_texture_disable_dcc(sctx, rtex))
-				si_decompress_dcc(sctx, rtex);
+		if (vi_dcc_enabled(tex, view->u.tex.first_level))
+			if (!si_texture_disable_dcc(sctx, tex))
+				si_decompress_dcc(sctx, tex);
 
 		sview->dcc_incompatible = false;
 	}
 
-	assert(rtex); /* views with texture == NULL aren't supported */
+	assert(tex); /* views with texture == NULL aren't supported */
 	memcpy(desc, sview->state, 8*4);
 
 	if (is_buffer) {
-		si_set_buf_desc_address(&rtex->resource,
+		si_set_buf_desc_address(&tex->buffer,
 					sview->base.u.buf.offset,
 					desc + 4);
 	} else {
-		bool is_separate_stencil = rtex->db_compatible &&
+		bool is_separate_stencil = tex->db_compatible &&
 					   sview->is_stencil_sampler;
 
-		si_set_mutable_tex_desc_fields(sctx->screen, rtex,
+		si_set_mutable_tex_desc_fields(sctx->screen, tex,
 					       sview->base_level_info,
 					       sview->base_level,
 					       sview->base.u.tex.first_level,
@@ -467,7 +461,7 @@
 					       desc);
 	}
 
-	if (!is_buffer && rtex->fmask.size) {
+	if (!is_buffer && tex->surface.fmask_size) {
 		memcpy(desc + 8, sview->fmask_state, 8*4);
 	} else {
 		/* Disable FMASK and bind sampler state in [12:15]. */
@@ -475,26 +469,26 @@
 
 		if (sstate)
 			si_set_sampler_state_desc(sstate, sview,
-						  is_buffer ? NULL : rtex,
+						  is_buffer ? NULL : tex,
 						  desc + 12);
 	}
 }
 
-static bool color_needs_decompression(struct r600_texture *rtex)
+static bool color_needs_decompression(struct si_texture *tex)
 {
-	return rtex->fmask.size ||
-	       (rtex->dirty_level_mask &&
-		(rtex->cmask.size || rtex->dcc_offset));
+	return tex->surface.fmask_size ||
+	       (tex->dirty_level_mask &&
+		(tex->cmask_buffer || tex->dcc_offset));
 }
 
-static bool depth_needs_decompression(struct r600_texture *rtex)
+static bool depth_needs_decompression(struct si_texture *tex)
 {
 	/* If the depth/stencil texture is TC-compatible, no decompression
 	 * will be done. The decompression function will only flush DB caches
 	 * to make it coherent with shaders. That's necessary because the driver
 	 * doesn't flush DB caches in any other case.
 	 */
-	return rtex->db_compatible;
+	return tex->db_compatible;
 }
 
 static void si_set_sampler_view(struct si_context *sctx,
@@ -512,29 +506,29 @@
 		return;
 
 	if (view) {
-		struct r600_texture *rtex = (struct r600_texture *)view->texture;
+		struct si_texture *tex = (struct si_texture *)view->texture;
 
 		si_set_sampler_view_desc(sctx, rview,
 					 samplers->sampler_states[slot], desc);
 
-		if (rtex->resource.b.b.target == PIPE_BUFFER) {
-			rtex->resource.bind_history |= PIPE_BIND_SAMPLER_VIEW;
+		if (tex->buffer.b.b.target == PIPE_BUFFER) {
+			tex->buffer.bind_history |= PIPE_BIND_SAMPLER_VIEW;
 			samplers->needs_depth_decompress_mask &= ~(1u << slot);
 			samplers->needs_color_decompress_mask &= ~(1u << slot);
 		} else {
-			if (depth_needs_decompression(rtex)) {
+			if (depth_needs_decompression(tex)) {
 				samplers->needs_depth_decompress_mask |= 1u << slot;
 			} else {
 				samplers->needs_depth_decompress_mask &= ~(1u << slot);
 			}
-			if (color_needs_decompression(rtex)) {
+			if (color_needs_decompression(tex)) {
 				samplers->needs_color_decompress_mask |= 1u << slot;
 			} else {
 				samplers->needs_color_decompress_mask &= ~(1u << slot);
 			}
 
-			if (rtex->dcc_offset &&
-			    p_atomic_read(&rtex->framebuffers_bound))
+			if (tex->dcc_offset &&
+			    p_atomic_read(&tex->framebuffers_bound))
 				sctx->need_check_render_feedback = true;
 		}
 
@@ -610,9 +604,9 @@
 		struct pipe_resource *res = samplers->views[i]->texture;
 
 		if (res && res->target != PIPE_BUFFER) {
-			struct r600_texture *rtex = (struct r600_texture *)res;
+			struct si_texture *tex = (struct si_texture *)res;
 
-			if (color_needs_decompression(rtex)) {
+			if (color_needs_decompression(tex)) {
 				samplers->needs_color_decompress_mask |= 1u << i;
 			} else {
 				samplers->needs_color_decompress_mask &= ~(1u << i);
@@ -673,7 +667,7 @@
 static void
 si_mark_image_range_valid(const struct pipe_image_view *view)
 {
-	struct r600_resource *res = (struct r600_resource *)view->resource;
+	struct r600_resource *res = r600_resource(view->resource);
 
 	assert(res && res->b.b.target == PIPE_BUFFER);
 
@@ -690,7 +684,7 @@
 	struct si_screen *screen = ctx->screen;
 	struct r600_resource *res;
 
-	res = (struct r600_resource *)view->resource;
+	res = r600_resource(view->resource);
 
 	if (res->b.b.target == PIPE_BUFFER) {
 		if (view->access & PIPE_IMAGE_ACCESS_WRITE)
@@ -703,7 +697,7 @@
 		si_set_buf_desc_address(res, view->u.buf.offset, desc + 4);
 	} else {
 		static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
-		struct r600_texture *tex = (struct r600_texture *)res;
+		struct si_texture *tex = (struct si_texture *)res;
 		unsigned level = view->u.tex.level;
 		unsigned width, height, depth, hw_level;
 		bool uses_dcc = vi_dcc_enabled(tex, level);
@@ -714,13 +708,13 @@
 		 * so we don't wanna trigger it.
 		 */
 		if (tex->is_depth ||
-		    (!fmask_desc && tex->fmask.size != 0)) {
+		    (!fmask_desc && tex->surface.fmask_size != 0)) {
 			assert(!"Z/S and MSAA image stores are not supported");
 			access &= ~PIPE_IMAGE_ACCESS_WRITE;
 		}
 
 		assert(!tex->is_depth);
-		assert(fmask_desc || tex->fmask.size == 0);
+		assert(fmask_desc || tex->surface.fmask_size == 0);
 
 		if (uses_dcc && !skip_decompress &&
 		    (view->access & PIPE_IMAGE_ACCESS_WRITE ||
@@ -786,7 +780,7 @@
 		return;
 	}
 
-	res = (struct r600_resource *)view->resource;
+	res = r600_resource(view->resource);
 
 	if (&images->views[slot] != view)
 		util_copy_image_view(&images->views[slot], view);
@@ -797,7 +791,7 @@
 		images->needs_color_decompress_mask &= ~(1 << slot);
 		res->bind_history |= PIPE_BIND_SHADER_IMAGE;
 	} else {
-		struct r600_texture *tex = (struct r600_texture *)res;
+		struct si_texture *tex = (struct si_texture *)res;
 		unsigned level = view->u.tex.level;
 
 		if (color_needs_decompression(tex)) {
@@ -858,9 +852,9 @@
 		struct pipe_resource *res = images->views[i].resource;
 
 		if (res && res->target != PIPE_BUFFER) {
-			struct r600_texture *rtex = (struct r600_texture *)res;
+			struct si_texture *tex = (struct si_texture *)res;
 
-			if (color_needs_decompression(rtex)) {
+			if (color_needs_decompression(tex)) {
 				images->needs_color_decompress_mask |= 1 << i;
 			} else {
 				images->needs_color_decompress_mask &= ~(1 << i);
@@ -895,7 +889,7 @@
 	si_update_ps_iter_samples(sctx);
 
 	if (surf) {
-		struct r600_texture *tex = (struct r600_texture*)surf->texture;
+		struct si_texture *tex = (struct si_texture*)surf->texture;
 		struct pipe_image_view view;
 
 		assert(tex);
@@ -906,9 +900,9 @@
 		 */
 		si_texture_disable_dcc(sctx, tex);
 
-		if (tex->resource.b.b.nr_samples <= 1 && tex->cmask_buffer) {
+		if (tex->buffer.b.b.nr_samples <= 1 && tex->cmask_buffer) {
 			/* Disable CMASK. */
-			assert(tex->cmask_buffer != &tex->resource);
+			assert(tex->cmask_buffer != &tex->buffer);
 			si_eliminate_fast_color_clear(sctx, tex);
 			si_texture_discard_cmask(sctx->screen, tex);
 		}
@@ -925,9 +919,9 @@
 		memset(desc, 0, 16 * 4);
 		si_set_shader_image_desc(sctx, &view, true, desc, desc + 8);
 
-		pipe_resource_reference(&buffers->buffers[slot], &tex->resource.b.b);
+		pipe_resource_reference(&buffers->buffers[slot], &tex->buffer.b.b);
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-					  &tex->resource, RADEON_USAGE_READ,
+					  &tex->buffer, RADEON_USAGE_READ,
 					  RADEON_PRIO_SHADER_RW_IMAGE);
 		buffers->enabled_mask |= 1u << slot;
 	} else {
@@ -974,13 +968,13 @@
 		struct si_sampler_view *sview =
 			(struct si_sampler_view *)samplers->views[slot];
 
-		struct r600_texture *tex = NULL;
+		struct si_texture *tex = NULL;
 
 		if (sview && sview->base.texture &&
 		    sview->base.texture->target != PIPE_BUFFER)
-			tex = (struct r600_texture *)sview->base.texture;
+			tex = (struct si_texture *)sview->base.texture;
 
-		if (tex && tex->fmask.size)
+		if (tex && tex->surface.fmask_size)
 			continue;
 
 		si_set_sampler_state_desc(sstates[i], sview, tex,
@@ -1077,7 +1071,7 @@
 			continue;
 
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer.resource,
+				      r600_resource(sctx->vertex_buffer[vb].buffer.resource),
 				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
@@ -1137,7 +1131,7 @@
 		uint32_t *desc = &ptr[i*4];
 
 		vb = &sctx->vertex_buffer[vbo_index];
-		rbuffer = (struct r600_resource*)vb->buffer.resource;
+		rbuffer = r600_resource(vb->buffer.resource);
 		if (!rbuffer) {
 			memset(desc, 0, 16);
 			continue;
@@ -1163,7 +1157,7 @@
 
 		if (first_vb_use_mask & (1 << i)) {
 			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-					      (struct r600_resource*)vb->buffer.resource,
+					      r600_resource(vb->buffer.resource),
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 		}
 	}
@@ -1172,7 +1166,7 @@
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
-	si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 	sctx->vertex_buffers_dirty = false;
 	sctx->vertex_buffer_pointer_dirty = true;
 	sctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;
@@ -1262,7 +1256,7 @@
 
 		buffers->buffers[slot] = buffer;
 		radeon_add_to_gfx_buffer_list_check_mem(sctx,
-							(struct r600_resource*)buffer,
+							r600_resource(buffer),
 							buffers->shader_usage_constbuf,
 							buffers->priority_constbuf, true);
 		buffers->enabled_mask |= 1u << slot;
@@ -1344,7 +1338,7 @@
 			continue;
 		}
 
-		buf = (struct r600_resource *)sbuffer->buffer;
+		buf = r600_resource(sbuffer->buffer);
 		va = buf->gpu_address + sbuffer->buffer_offset;
 
 		desc[0] = va;
@@ -1474,7 +1468,7 @@
 
 		pipe_resource_reference(&buffers->buffers[slot], buffer);
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      (struct r600_resource*)buffer,
+				      r600_resource(buffer),
 				      buffers->shader_usage, buffers->priority);
 		buffers->enabled_mask |= 1u << slot;
 	} else {
@@ -1530,13 +1524,13 @@
 	util_dynarray_foreach(&sctx->resident_tex_handles,
 			      struct si_texture_handle *, tex_handle) {
 		struct pipe_resource *res = (*tex_handle)->view->texture;
-		struct r600_texture *rtex;
+		struct si_texture *tex;
 
 		if (!res || res->target == PIPE_BUFFER)
 			continue;
 
-		rtex = (struct r600_texture *)res;
-		if (!color_needs_decompression(rtex))
+		tex = (struct si_texture *)res;
+		if (!color_needs_decompression(tex))
 			continue;
 
 		util_dynarray_append(&sctx->resident_tex_needs_color_decompress,
@@ -1547,13 +1541,13 @@
 			      struct si_image_handle *, img_handle) {
 		struct pipe_image_view *view = &(*img_handle)->view;
 		struct pipe_resource *res = view->resource;
-		struct r600_texture *rtex;
+		struct si_texture *tex;
 
 		if (!res || res->target == PIPE_BUFFER)
 			continue;
 
-		rtex = (struct r600_texture *)res;
-		if (!color_needs_decompression(rtex))
+		tex = (struct si_texture *)res;
+		if (!color_needs_decompression(tex))
 			continue;
 
 		util_dynarray_append(&sctx->resident_img_needs_color_decompress,
@@ -1599,7 +1593,7 @@
 			sctx->descriptors_dirty |= 1u << descriptors_idx;
 
 			radeon_add_to_gfx_buffer_list_check_mem(sctx,
-								(struct r600_resource *)buf,
+								r600_resource(buf),
 								usage, priority, true);
 		}
 	}
@@ -1809,7 +1803,7 @@
 					  unsigned num_dwords)
 {
 	struct si_descriptors *desc = &sctx->bindless_descriptors;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned desc_slot_offset = desc_slot * 16;
 	uint32_t *data;
 	uint64_t va;
@@ -1985,14 +1979,14 @@
 	if (shader == PIPE_SHADER_VERTEX)
 		sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
 
-	si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 }
 
 static void si_shader_pointers_begin_new_cs(struct si_context *sctx)
 {
 	sctx->shader_pointers_dirty = u_bit_consecutive(0, SI_NUM_DESCS);
 	sctx->vertex_buffer_pointer_dirty = sctx->vb_descriptors_buffer != NULL;
-	si_mark_atom_dirty(sctx, &sctx->shader_pointers.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
 	sctx->graphics_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
 	sctx->compute_bindless_pointer_dirty = sctx->bindless_descriptors.buffer != NULL;
 }
@@ -2008,12 +2002,14 @@
 	if (*base != new_base) {
 		*base = new_base;
 
-		if (new_base) {
+		if (new_base)
 			si_mark_shader_pointers_dirty(sctx, shader);
 
-			if (shader == PIPE_SHADER_VERTEX)
-				sctx->last_vs_state = ~0;
-		}
+		/* Any change in enabled shader stages requires re-emitting
+		 * the VS state SGPR, because it contains the clamp_vertex_color
+		 * state, which can be done in VS, TES, and GS.
+		 */
+		sctx->last_vs_state = ~0;
 	}
 }
 
@@ -2055,7 +2051,7 @@
 	}
 }
 
-static void si_emit_shader_pointer_head(struct radeon_winsys_cs *cs,
+static void si_emit_shader_pointer_head(struct radeon_cmdbuf *cs,
 					unsigned sh_offset,
 					unsigned pointer_count)
 {
@@ -2064,7 +2060,7 @@
 }
 
 static void si_emit_shader_pointer_body(struct si_screen *sscreen,
-					struct radeon_winsys_cs *cs,
+					struct radeon_cmdbuf *cs,
 					uint64_t va)
 {
 	radeon_emit(cs, va);
@@ -2079,7 +2075,7 @@
 				   struct si_descriptors *desc,
 				   unsigned sh_base)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned sh_offset = sh_base + desc->shader_userdata_offset;
 
 	si_emit_shader_pointer_head(cs, sh_offset, 1);
@@ -2093,7 +2089,7 @@
 	if (!sh_base)
 		return;
 
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
 
 	while (mask) {
@@ -2117,7 +2113,7 @@
 	if (!sh_base)
 		return;
 
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned mask = sctx->shader_pointers_dirty & pointer_mask;
 
 	while (mask) {
@@ -2153,8 +2149,7 @@
 			       R_00B530_SPI_SHADER_USER_DATA_LS_0);
 }
 
-void si_emit_graphics_shader_pointers(struct si_context *sctx,
-                                      struct r600_atom *atom)
+void si_emit_graphics_shader_pointers(struct si_context *sctx)
 {
 	uint32_t *sh_base = sctx->shader_pointers.sh_base;
 
@@ -2185,7 +2180,7 @@
 		~u_bit_consecutive(SI_DESCS_RW_BUFFERS, SI_DESCS_FIRST_COMPUTE);
 
 	if (sctx->vertex_buffer_pointer_dirty) {
-		struct radeon_winsys_cs *cs = sctx->gfx_cs;
+		struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 		/* Find the location of the VB descriptor pointer. */
 		/* TODO: In the future, the pointer will be packed in unused
@@ -2428,25 +2423,25 @@
 
 	if (resident) {
 		if (sview->base.texture->target != PIPE_BUFFER) {
-			struct r600_texture *rtex =
-				(struct r600_texture *)sview->base.texture;
+			struct si_texture *tex =
+				(struct si_texture *)sview->base.texture;
 
-			if (depth_needs_decompression(rtex)) {
+			if (depth_needs_decompression(tex)) {
 				util_dynarray_append(
 					&sctx->resident_tex_needs_depth_decompress,
 					struct si_texture_handle *,
 					tex_handle);
 			}
 
-			if (color_needs_decompression(rtex)) {
+			if (color_needs_decompression(tex)) {
 				util_dynarray_append(
 					&sctx->resident_tex_needs_color_decompress,
 					struct si_texture_handle *,
 					tex_handle);
 			}
 
-			if (rtex->dcc_offset &&
-			    p_atomic_read(&rtex->framebuffers_bound))
+			if (tex->dcc_offset &&
+			    p_atomic_read(&tex->framebuffers_bound))
 				sctx->need_check_render_feedback = true;
 
 			si_update_bindless_texture_descriptor(sctx, tex_handle);
@@ -2570,22 +2565,22 @@
 
 	img_handle = (struct si_image_handle *)entry->data;
 	view = &img_handle->view;
-	res = (struct r600_resource *)view->resource;
+	res = r600_resource(view->resource);
 
 	if (resident) {
 		if (res->b.b.target != PIPE_BUFFER) {
-			struct r600_texture *rtex = (struct r600_texture *)res;
+			struct si_texture *tex = (struct si_texture *)res;
 			unsigned level = view->u.tex.level;
 
-			if (color_needs_decompression(rtex)) {
+			if (color_needs_decompression(tex)) {
 				util_dynarray_append(
 					&sctx->resident_img_needs_color_decompress,
 					struct si_image_handle *,
 					img_handle);
 			}
 
-			if (vi_dcc_enabled(rtex, level) &&
-			    p_atomic_read(&rtex->framebuffers_bound))
+			if (vi_dcc_enabled(tex, level) &&
+			    p_atomic_read(&tex->framebuffers_bound))
 				sctx->need_check_render_feedback = true;
 
 			si_update_bindless_image_descriptor(sctx, img_handle);
@@ -2762,8 +2757,7 @@
 	sctx->b.make_image_handle_resident = si_make_image_handle_resident;
 
 	/* Shader user data. */
-	si_init_atom(sctx, &sctx->shader_pointers.atom, &sctx->atoms.s.shader_pointers,
-		     si_emit_graphics_shader_pointers);
+	sctx->atoms.s.shader_pointers.emit = si_emit_graphics_shader_pointers;
 
 	/* Set default and immutable mappings. */
 	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index e3b5bb4..da5bd47 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -35,10 +35,10 @@
 				uint64_t src_offset,
 				uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->dma_cs;
+	struct radeon_cmdbuf *cs = ctx->dma_cs;
 	unsigned i, ncopy, count, max_size, sub_cmd, shift;
-	struct r600_resource *rdst = (struct r600_resource*)dst;
-	struct r600_resource *rsrc = (struct r600_resource*)src;
+	struct r600_resource *rdst = r600_resource(dst);
+	struct r600_resource *rsrc = r600_resource(src);
 
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
@@ -83,7 +83,7 @@
 				uint64_t size,
 				unsigned clear_value)
 {
-	struct radeon_winsys_cs *cs = sctx->dma_cs;
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = r600_resource(dst);
 
@@ -131,17 +131,17 @@
 			     unsigned pitch,
 			     unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = ctx->dma_cs;
-	struct r600_texture *rsrc = (struct r600_texture*)src;
-	struct r600_texture *rdst = (struct r600_texture*)dst;
-	unsigned dst_mode = rdst->surface.u.legacy.level[dst_level].mode;
+	struct radeon_cmdbuf *cs = ctx->dma_cs;
+	struct si_texture *ssrc = (struct si_texture*)src;
+	struct si_texture *sdst = (struct si_texture*)dst;
+	unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
 	bool detile = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED;
-	struct r600_texture *rlinear = detile ? rdst : rsrc;
-	struct r600_texture *rtiled = detile ? rsrc : rdst;
+	struct si_texture *linear = detile ? sdst : ssrc;
+	struct si_texture *tiled = detile ? ssrc : sdst;
 	unsigned linear_lvl = detile ? dst_level : src_level;
 	unsigned tiled_lvl = detile ? src_level : dst_level;
 	struct radeon_info *info = &ctx->screen->info;
-	unsigned index = rtiled->surface.u.legacy.tiling_index[tiled_lvl];
+	unsigned index = tiled->surface.u.legacy.tiling_index[tiled_lvl];
 	unsigned tile_mode = info->si_tile_mode_array[index];
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
 	unsigned ncopy, height, cheight, i;
@@ -150,7 +150,7 @@
 	uint64_t base, addr;
 	unsigned pipe_config;
 
-	assert(dst_mode != rsrc->surface.u.legacy.level[src_level].mode);
+	assert(dst_mode != ssrc->surface.u.legacy.level[src_level].mode);
 
 	sub_cmd = SI_DMA_COPY_TILED;
 	lbpp = util_logbase2(bpp);
@@ -163,35 +163,35 @@
 	tiled_y = detile ? src_y : dst_y;
 	tiled_z = detile ? src_z : dst_z;
 
-	assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));
+	assert(!util_format_is_depth_and_stencil(tiled->buffer.b.b.format));
 
 	array_mode = G_009910_ARRAY_MODE(tile_mode);
-	slice_tile_max = (rtiled->surface.u.legacy.level[tiled_lvl].nblk_x *
-			  rtiled->surface.u.legacy.level[tiled_lvl].nblk_y) / (8*8) - 1;
+	slice_tile_max = (tiled->surface.u.legacy.level[tiled_lvl].nblk_x *
+			  tiled->surface.u.legacy.level[tiled_lvl].nblk_y) / (8*8) - 1;
 	/* linear height must be the same as the slice tile max height, it's ok even
 	 * if the linear destination/source have smaller heigh as the size of the
 	 * dma packet will be using the copy_height which is always smaller or equal
 	 * to the linear height
 	 */
-	height = rtiled->surface.u.legacy.level[tiled_lvl].nblk_y;
-	base = rtiled->surface.u.legacy.level[tiled_lvl].offset;
-	addr = rlinear->surface.u.legacy.level[linear_lvl].offset;
-	addr += (uint64_t)rlinear->surface.u.legacy.level[linear_lvl].slice_size_dw * 4 * linear_z;
+	height = tiled->surface.u.legacy.level[tiled_lvl].nblk_y;
+	base = tiled->surface.u.legacy.level[tiled_lvl].offset;
+	addr = linear->surface.u.legacy.level[linear_lvl].offset;
+	addr += (uint64_t)linear->surface.u.legacy.level[linear_lvl].slice_size_dw * 4 * linear_z;
 	addr += linear_y * pitch + linear_x * bpp;
 	bank_h = G_009910_BANK_HEIGHT(tile_mode);
 	bank_w = G_009910_BANK_WIDTH(tile_mode);
 	mt_aspect = G_009910_MACRO_TILE_ASPECT(tile_mode);
 	/* Non-depth modes don't have TILE_SPLIT set. */
-	tile_split = util_logbase2(rtiled->surface.u.legacy.tile_split >> 6);
+	tile_split = util_logbase2(tiled->surface.u.legacy.tile_split >> 6);
 	nbanks = G_009910_NUM_BANKS(tile_mode);
-	base += rtiled->resource.gpu_address;
-	addr += rlinear->resource.gpu_address;
+	base += tiled->buffer.gpu_address;
+	addr += linear->buffer.gpu_address;
 
 	pipe_config = G_009910_PIPE_CONFIG(tile_mode);
 	mt = G_009910_MICRO_TILE_MODE(tile_mode);
 	size = copy_height * pitch;
 	ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
-	si_need_dma_space(ctx, ncopy * 9, &rdst->resource, &rsrc->resource);
+	si_need_dma_space(ctx, ncopy * 9, &sdst->buffer, &ssrc->buffer);
 
 	for (i = 0; i < ncopy; i++) {
 		cheight = copy_height;
@@ -225,8 +225,8 @@
 			const struct pipe_box *src_box)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	struct r600_texture *rsrc = (struct r600_texture*)src;
-	struct r600_texture *rdst = (struct r600_texture*)dst;
+	struct si_texture *ssrc = (struct si_texture*)src;
+	struct si_texture *sdst = (struct si_texture*)dst;
 	unsigned dst_pitch, src_pitch, bpp, dst_mode, src_mode;
 	unsigned src_w, dst_w;
 	unsigned src_x, src_y;
@@ -259,8 +259,8 @@
 	goto fallback;
 
 	if (src_box->depth > 1 ||
-	    !si_prepare_for_dma_blit(sctx, rdst, dst_level, dstx, dsty,
-				     dstz, rsrc, src_level, src_box))
+	    !si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty,
+				     dstz, ssrc, src_level, src_box))
 		goto fallback;
 
 	src_x = util_format_get_nblocksx(src->format, src_box->x);
@@ -268,21 +268,21 @@
 	src_y = util_format_get_nblocksy(src->format, src_box->y);
 	dst_y = util_format_get_nblocksy(src->format, dst_y);
 
-	bpp = rdst->surface.bpe;
-	dst_pitch = rdst->surface.u.legacy.level[dst_level].nblk_x * rdst->surface.bpe;
-	src_pitch = rsrc->surface.u.legacy.level[src_level].nblk_x * rsrc->surface.bpe;
-	src_w = u_minify(rsrc->resource.b.b.width0, src_level);
-	dst_w = u_minify(rdst->resource.b.b.width0, dst_level);
+	bpp = sdst->surface.bpe;
+	dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x * sdst->surface.bpe;
+	src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x * ssrc->surface.bpe;
+	src_w = u_minify(ssrc->buffer.b.b.width0, src_level);
+	dst_w = u_minify(sdst->buffer.b.b.width0, dst_level);
 
-	dst_mode = rdst->surface.u.legacy.level[dst_level].mode;
-	src_mode = rsrc->surface.u.legacy.level[src_level].mode;
+	dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
+	src_mode = ssrc->surface.u.legacy.level[src_level].mode;
 
 	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w ||
 	    src_box->width != src_w ||
-	    src_box->height != u_minify(rsrc->resource.b.b.height0, src_level) ||
-	    src_box->height != u_minify(rdst->resource.b.b.height0, dst_level) ||
-	    rsrc->surface.u.legacy.level[src_level].nblk_y !=
-	    rdst->surface.u.legacy.level[dst_level].nblk_y) {
+	    src_box->height != u_minify(ssrc->buffer.b.b.height0, src_level) ||
+	    src_box->height != u_minify(sdst->buffer.b.b.height0, dst_level) ||
+	    ssrc->surface.u.legacy.level[src_level].nblk_y !=
+	    sdst->surface.u.legacy.level[dst_level].nblk_y) {
 		/* FIXME si can do partial blit */
 		goto fallback;
 	}
@@ -301,18 +301,18 @@
 		 *   dst_x/y == 0
 		 *   dst_pitch == src_pitch
 		 */
-		src_offset= rsrc->surface.u.legacy.level[src_level].offset;
-		src_offset += (uint64_t)rsrc->surface.u.legacy.level[src_level].slice_size_dw * 4 * src_box->z;
+		src_offset= ssrc->surface.u.legacy.level[src_level].offset;
+		src_offset += (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4 * src_box->z;
 		src_offset += src_y * src_pitch + src_x * bpp;
-		dst_offset = rdst->surface.u.legacy.level[dst_level].offset;
-		dst_offset += (uint64_t)rdst->surface.u.legacy.level[dst_level].slice_size_dw * 4 * dst_z;
+		dst_offset = sdst->surface.u.legacy.level[dst_level].offset;
+		dst_offset += (uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4 * dst_z;
 		dst_offset += dst_y * dst_pitch + dst_x * bpp;
 		si_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset,
-				   (uint64_t)rsrc->surface.u.legacy.level[src_level].slice_size_dw * 4);
+				   (uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4);
 	} else {
 		si_dma_copy_tile(sctx, dst, dst_level, dst_x, dst_y, dst_z,
 				 src, src_level, src_x, src_y, src_box->z,
-				 src_box->height / rsrc->surface.blk_h,
+				 src_box->height / ssrc->surface.blk_h,
 				 dst_pitch, bpp);
 	}
 	return;
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 1eefaeb..3bb7693 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -26,9 +26,9 @@
 
 static void si_dma_emit_wait_idle(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->dma_cs;
+	struct radeon_cmdbuf *cs = sctx->dma_cs;
 
-	/* NOP waits for idle on Evergreen and later. */
+	/* NOP waits for idle. */
 	if (sctx->chip_class >= CIK)
 		radeon_emit(cs, 0x00000000); /* NOP */
 	else
@@ -93,13 +93,11 @@
 
 	if (dst) {
 		radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
-					  RADEON_USAGE_WRITE,
-					  RADEON_PRIO_SDMA_BUFFER);
+					  RADEON_USAGE_WRITE, 0);
 	}
 	if (src) {
 		radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
-					  RADEON_USAGE_READ,
-					  RADEON_PRIO_SDMA_BUFFER);
+					  RADEON_USAGE_READ, 0);
 	}
 
 	/* this function is called before all DMA calls, so increment this. */
@@ -109,7 +107,7 @@
 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
-	struct radeon_winsys_cs *cs = ctx->dma_cs;
+	struct radeon_cmdbuf *cs = ctx->dma_cs;
 	struct radeon_saved_cs saved;
 	bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
 
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index 19fcb96..abb7057 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -70,7 +70,7 @@
 			    struct r600_resource *buf, uint64_t va,
 			    uint32_t new_fence, unsigned query_type)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	unsigned op = EVENT_TYPE(event) |
 		      EVENT_INDEX(5) |
 		      event_flags;
@@ -163,7 +163,7 @@
 void si_gfx_wait_fence(struct si_context *ctx,
 		       uint64_t va, uint32_t ref, uint32_t mask)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
 	radeon_emit(cs, WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_MEM_SPACE(1));
@@ -266,7 +266,7 @@
 	radeon_add_to_buffer_list(ctx, ctx->gfx_cs, fine->buf,
 				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 	if (flags & PIPE_FLUSH_TOP_OF_PIPE) {
-		struct radeon_winsys_cs *cs = ctx->gfx_cs;
+		struct radeon_cmdbuf *cs = ctx->gfx_cs;
 		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 		radeon_emit(cs, S_370_DST_SEL(V_370_MEM_ASYNC) |
 			S_370_WR_CONFIRM(1) |
@@ -291,8 +291,12 @@
 {
 	struct radeon_winsys *rws = ((struct si_screen*)screen)->ws;
 	struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
+	struct si_context *sctx;
 	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
+	ctx = threaded_context_unwrap_sync(ctx);
+	sctx = (struct si_context*)(ctx ? ctx : NULL);
+
 	if (!util_queue_fence_is_signalled(&rfence->ready)) {
 		if (rfence->tc_token) {
 			/* Ensure that si_flush_from_st will be called for
@@ -345,49 +349,43 @@
 	}
 
 	/* Flush the gfx IB if it hasn't been flushed yet. */
-	if (ctx && rfence->gfx_unflushed.ctx) {
-		struct si_context *sctx;
+	if (sctx && rfence->gfx_unflushed.ctx == sctx &&
+	    rfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
+		/* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
+		 * spec says:
+		 *
+		 *    "If the sync object being blocked upon will not be
+		 *     signaled in finite time (for example, by an associated
+		 *     fence command issued previously, but not yet flushed to
+		 *     the graphics pipeline), then ClientWaitSync may hang
+		 *     forever. To help prevent this behavior, if
+		 *     ClientWaitSync is called and all of the following are
+		 *     true:
+		 *
+		 *     * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
+		 *     * sync is unsignaled when ClientWaitSync is called,
+		 *     * and the calls to ClientWaitSync and FenceSync were
+		 *       issued from the same context,
+		 *
+		 *     then the GL will behave as if the equivalent of Flush
+		 *     were inserted immediately after the creation of sync."
+		 *
+		 * This means we need to flush for such fences even when we're
+		 * not going to wait.
+		 */
+		si_flush_gfx_cs(sctx,
+				(timeout ? 0 : PIPE_FLUSH_ASYNC) |
+				 RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
+				NULL);
+		rfence->gfx_unflushed.ctx = NULL;
 
-		sctx = (struct si_context *)threaded_context_unwrap_unsync(ctx);
-		if (rfence->gfx_unflushed.ctx == sctx &&
-		    rfence->gfx_unflushed.ib_index == sctx->num_gfx_cs_flushes) {
-			/* Section 4.1.2 (Signaling) of the OpenGL 4.6 (Core profile)
-			 * spec says:
-			 *
-			 *    "If the sync object being blocked upon will not be
-			 *     signaled in finite time (for example, by an associated
-			 *     fence command issued previously, but not yet flushed to
-			 *     the graphics pipeline), then ClientWaitSync may hang
-			 *     forever. To help prevent this behavior, if
-			 *     ClientWaitSync is called and all of the following are
-			 *     true:
-			 *
-			 *     * the SYNC_FLUSH_COMMANDS_BIT bit is set in flags,
-			 *     * sync is unsignaled when ClientWaitSync is called,
-			 *     * and the calls to ClientWaitSync and FenceSync were
-			 *       issued from the same context,
-			 *
-			 *     then the GL will behave as if the equivalent of Flush
-			 *     were inserted immediately after the creation of sync."
-			 *
-			 * This means we need to flush for such fences even when we're
-			 * not going to wait.
-			 */
-			threaded_context_unwrap_sync(ctx);
-			si_flush_gfx_cs(sctx,
-					(timeout ? 0 : PIPE_FLUSH_ASYNC) |
-					 RADEON_FLUSH_START_NEXT_GFX_IB_NOW,
-					NULL);
-			rfence->gfx_unflushed.ctx = NULL;
+		if (!timeout)
+			return false;
 
-			if (!timeout)
-				return false;
-
-			/* Recompute the timeout after all that. */
-			if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
-				int64_t time = os_time_get_nano();
-				timeout = abs_timeout > time ? abs_timeout - time : 0;
-			}
+		/* Recompute the timeout after all that. */
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
 		}
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 6bfbc4d..6e80479 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -78,21 +78,12 @@
 	case CHIP_VEGAM: return "AMD VEGAM";
 	case CHIP_VEGA10: return "AMD VEGA10";
 	case CHIP_VEGA12: return "AMD VEGA12";
+	case CHIP_VEGA20: return "AMD VEGA20";
 	case CHIP_RAVEN: return "AMD RAVEN";
 	default: return "AMD unknown";
 	}
 }
 
-static bool si_have_tgsi_compute(struct si_screen *sscreen)
-{
-	/* Old kernels disallowed some register writes for SI
-	 * that are used for indirect dispatches. */
-	return (sscreen->info.chip_class >= CIK ||
-		sscreen->info.drm_major == 3 ||
-		(sscreen->info.drm_major == 2 &&
-		 sscreen->info.drm_minor >= 45));
-}
-
 static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
 	struct si_screen *sscreen = (struct si_screen *)pscreen;
@@ -161,6 +152,7 @@
 	case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
 	case PIPE_CAP_INVALIDATE_BUFFER:
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+	case PIPE_CAP_QUERY_BUFFER_OBJECT:
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
@@ -192,27 +184,20 @@
 	case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
 	case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
 	case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+	case PIPE_CAP_TGSI_BALLOT:
 	case PIPE_CAP_TGSI_VOTE:
 	case PIPE_CAP_TGSI_FS_FBFETCH:
 		return 1;
 
-	case PIPE_CAP_TGSI_BALLOT:
-		return HAVE_LLVM >= 0x0500;
-
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 		return !SI_BIG_ENDIAN && sscreen->info.has_userptr;
 
 	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-		return (sscreen->info.drm_major == 2 &&
-			sscreen->info.drm_minor >= 43) ||
-		       sscreen->info.drm_major == 3;
+		return sscreen->info.has_gpu_reset_status_query ||
+		       sscreen->info.has_gpu_reset_counter_query;
 
 	case PIPE_CAP_TEXTURE_MULTISAMPLE:
-		/* 2D tiling on CIK is supported since DRM 2.35.0 */
-		return sscreen->info.chip_class < CIK ||
-		       (sscreen->info.drm_major == 2 &&
-			sscreen->info.drm_minor >= 35) ||
-		       sscreen->info.drm_major == 3;
+		return sscreen->info.has_2d_tiling;
 
         case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
                 return SI_MAP_BUFFER_ALIGNMENT;
@@ -226,8 +211,10 @@
 		return 4;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
-		if (si_have_tgsi_compute(sscreen))
-			return 450;
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+		if (sscreen->info.has_indirect_compute_dispatch)
+			return param == PIPE_CAP_GLSL_FEATURE_LEVEL ?
+				450 : 440;
 		return 420;
 
 	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
@@ -236,24 +223,11 @@
 	case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
 	case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
 	case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
-		/* SI doesn't support unaligned loads.
-		 * CIK needs DRM 2.50.0 on radeon. */
-		return sscreen->info.chip_class == SI ||
-		       (sscreen->info.drm_major == 2 &&
-			sscreen->info.drm_minor < 50);
+		return !sscreen->info.has_unaligned_shader_loads;
 
 	case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
-		/* TODO: GFX9 hangs. */
-		if (sscreen->info.chip_class >= GFX9)
-			return 0;
-		/* Disable on SI due to VM faults in CP DMA. Enable once these
-		 * faults are mitigated in software.
-		 */
-		if (sscreen->info.chip_class >= CIK &&
-		    sscreen->info.drm_major == 3 &&
-		    sscreen->info.drm_minor >= 13)
-			return RADEON_SPARSE_PAGE_SIZE;
-		return 0;
+		return sscreen->info.has_sparse_vm_mappings ?
+				RADEON_SPARSE_PAGE_SIZE : 0;
 
 	case PIPE_CAP_PACKED_UNIFORMS:
 		if (sscreen->debug_flags & DBG(NIR))
@@ -277,6 +251,13 @@
 	case PIPE_CAP_TILE_RASTER_ORDER:
 	case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
 	case PIPE_CAP_CONTEXT_PRIORITY_MASK:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+	case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+	case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+	case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
 		return 0;
 
 	case PIPE_CAP_FENCE_SIGNAL:
@@ -288,9 +269,6 @@
 	case PIPE_CAP_NATIVE_FENCE_FD:
 		return sscreen->info.has_fence_to_handle;
 
-	case PIPE_CAP_QUERY_BUFFER_OBJECT:
-		return si_have_tgsi_compute(sscreen);
-
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
@@ -334,6 +312,8 @@
 	case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
+		return sscreen->info.has_eqaa_surface_allocator ? 2 : 0;
 
 	case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
 	case PIPE_CAP_MIN_TEXEL_OFFSET:
@@ -376,6 +356,10 @@
 		return 16.0f;
 	case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
 		return 16.0f;
+	case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+	case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+		return 0.0f;
 	}
 	return 0.0f;
 }
@@ -399,7 +383,7 @@
 		case PIPE_SHADER_CAP_SUPPORTED_IRS: {
 			int ir = 1 << PIPE_SHADER_IR_NATIVE;
 
-			if (si_have_tgsi_compute(sscreen))
+			if (sscreen->info.has_indirect_compute_dispatch)
 				ir |= 1 << PIPE_SHADER_IR_TGSI;
 
 			return ir;
@@ -511,6 +495,8 @@
 	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
 	case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
 		return 0;
+	case PIPE_SHADER_CAP_SCALAR_ISA:
+		return 1;
 	}
 	return 0;
 }
@@ -939,8 +925,8 @@
 	 *
 	 * Instead, return statistics of this process.
 	 */
-	vram_usage = ws->query_value(ws, RADEON_REQUESTED_VRAM_MEMORY) / 1024;
-	gtt_usage =  ws->query_value(ws, RADEON_REQUESTED_GTT_MEMORY) / 1024;
+	vram_usage = ws->query_value(ws, RADEON_VRAM_USAGE) / 1024;
+	gtt_usage =  ws->query_value(ws, RADEON_GTT_USAGE) / 1024;
 
 	info->avail_device_memory =
 		vram_usage <= info->total_device_memory ?
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 12f5650..c0688d4 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -30,7 +30,7 @@
 /* initialize */
 void si_need_gfx_cs_space(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	/* There is no need to flush the DMA IB here, because
 	 * r600_need_dma_space always flushes the GFX IB if there is
@@ -67,15 +67,14 @@
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
 		     struct pipe_fence_handle **fence)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct radeon_winsys *ws = ctx->ws;
 	unsigned wait_flags = 0;
 
 	if (ctx->gfx_flush_in_progress)
 		return;
 
-	if (ctx->chip_class == VI && ctx->screen->info.drm_minor <= 1) {
-		/* DRM 3.1.0 doesn't flush TC for VI correctly. */
+	if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
 		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			      SI_CONTEXT_CS_PARTIAL_FLUSH |
 			      SI_CONTEXT_INV_GLOBAL_L2;
@@ -148,8 +147,6 @@
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
 
-	/* This must be after cs_flush returns, since the context's API
-	 * thread can concurrently read this value in si_fence_finish. */
 	ctx->num_gfx_cs_flushes++;
 
 	/* Check VM faults if needed. */
@@ -180,9 +177,8 @@
 
 	pipe_reference_init(&ctx->current_saved_cs->reference, 1);
 
-	ctx->current_saved_cs->trace_buf = (struct r600_resource*)
-				 pipe_buffer_create(ctx->b.screen, 0,
-						    PIPE_USAGE_STAGING, 8);
+	ctx->current_saved_cs->trace_buf = r600_resource(
+		pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
 	if (!ctx->current_saved_cs->trace_buf) {
 		free(ctx->current_saved_cs);
 		ctx->current_saved_cs = NULL;
@@ -258,39 +254,40 @@
 	}
 	/* This should always be marked as dirty to set the framebuffer scissor
 	 * at least. */
-	si_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);
 
-	si_mark_atom_dirty(ctx, &ctx->clip_regs);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
 	/* CLEAR_STATE sets zeros. */
 	if (!has_clear_state || ctx->clip_state.any_nonzeros)
-		si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
-	ctx->msaa_sample_locs.nr_samples = 0;
-	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs.atom);
-	si_mark_atom_dirty(ctx, &ctx->msaa_config);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
+	ctx->sample_locs_num_samples = 0;
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
 	/* CLEAR_STATE sets 0xffff. */
-	if (!has_clear_state || ctx->sample_mask.sample_mask != 0xffff)
-		si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
-	si_mark_atom_dirty(ctx, &ctx->cb_render_state);
+	if (!has_clear_state || ctx->sample_mask != 0xffff)
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
 	/* CLEAR_STATE sets zeros. */
 	if (!has_clear_state || ctx->blend_color.any_nonzeros)
-		si_mark_atom_dirty(ctx, &ctx->blend_color.atom);
-	si_mark_atom_dirty(ctx, &ctx->db_render_state);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
 	if (ctx->chip_class >= GFX9)
-		si_mark_atom_dirty(ctx, &ctx->dpbb_state);
-	si_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
-	si_mark_atom_dirty(ctx, &ctx->spi_map);
-	si_mark_atom_dirty(ctx, &ctx->streamout.enable_atom);
-	si_mark_atom_dirty(ctx, &ctx->render_cond_atom);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
 	si_all_descriptors_begin_new_cs(ctx);
 	si_all_resident_buffers_begin_new_cs(ctx);
 
 	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
 	ctx->viewports.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
 	ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
-	si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 
-	si_mark_atom_dirty(ctx, &ctx->scratch_state);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
 	if (ctx->scratch_buffer) {
 		si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
 	}
@@ -312,7 +309,6 @@
 	ctx->last_index_size = -1;
 	ctx->last_primitive_restart_en = -1;
 	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
-	ctx->last_gs_out_prim = -1;
 	ctx->last_prim = -1;
 	ctx->last_multi_vgt_param = -1;
 	ctx->last_rast_prim = -1;
@@ -322,6 +318,41 @@
 	ctx->last_tcs = NULL;
 	ctx->last_tes_sh_base = -1;
 	ctx->last_num_tcs_input_cp = -1;
+	ctx->last_ls_hs_config = -1; /* impossible value */
 
 	ctx->cs_shader_state.initialized = false;
+
+	if (has_clear_state) {
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
+		ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL]	= 0x00001000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL]	= 0x00090000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
+		ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL]	= 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ]	= 0x3f800000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]	= 0x3f800000;
+
+		/* Mark all tracked registers as saved. */
+		ctx->tracked_regs.reg_saved = 0xffffffff;
+	} else {
+		/* Mark all tracked registers as unknown. */
+		ctx->tracked_regs.reg_saved = 0;
+	}
+
+	/* 0xffffffff is an impossible value for register SPI_PS_INPUT_CNTL_n */
+	memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
 }
diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c
index ad62e55..43bf887 100644
--- a/src/gallium/drivers/radeonsi/si_perfcounter.c
+++ b/src/gallium/drivers/radeonsi/si_perfcounter.c
@@ -345,9 +345,9 @@
  * blocks here matters.
  */
 static struct si_pc_block groups_CIK[] = {
-	{ &cik_CB, 226, 4 },
+	{ &cik_CB, 226},
 	{ &cik_CPF, 17 },
-	{ &cik_DB, 257, 4 },
+	{ &cik_DB, 257},
 	{ &cik_GRBM, 34 },
 	{ &cik_GRBMSE, 15 },
 	{ &cik_PA_SU, 153 },
@@ -357,7 +357,7 @@
 	{ &cik_SX, 32 },
 	{ &cik_TA, 111, 11 },
 	{ &cik_TCA, 39, 2 },
-	{ &cik_TCC, 160, 16 },
+	{ &cik_TCC, 160},
 	{ &cik_TD, 55, 11 },
 	{ &cik_TCP, 154, 11 },
 	{ &cik_GDS, 121 },
@@ -372,9 +372,9 @@
 };
 
 static struct si_pc_block groups_VI[] = {
-	{ &cik_CB, 405, 4 },
+	{ &cik_CB, 405},
 	{ &cik_CPF, 19 },
-	{ &cik_DB, 257, 4 },
+	{ &cik_DB, 257},
 	{ &cik_GRBM, 34 },
 	{ &cik_GRBMSE, 15 },
 	{ &cik_PA_SU, 153 },
@@ -384,7 +384,7 @@
 	{ &cik_SX, 34 },
 	{ &cik_TA, 119, 16 },
 	{ &cik_TCA, 35, 2 },
-	{ &cik_TCC, 192, 16 },
+	{ &cik_TCC, 192},
 	{ &cik_TD, 55, 16 },
 	{ &cik_TCP, 180, 16 },
 	{ &cik_GDS, 121 },
@@ -399,9 +399,9 @@
 };
 
 static struct si_pc_block groups_gfx9[] = {
-	{ &cik_CB, 438, 4 },
+	{ &cik_CB, 438},
 	{ &cik_CPF, 32 },
-	{ &cik_DB, 328, 4 },
+	{ &cik_DB, 328},
 	{ &cik_GRBM, 38 },
 	{ &cik_GRBMSE, 16 },
 	{ &cik_PA_SU, 292 },
@@ -411,7 +411,7 @@
 	{ &cik_SX, 208 },
 	{ &cik_TA, 119, 16 },
 	{ &cik_TCA, 35, 2 },
-	{ &cik_TCC, 256, 16 },
+	{ &cik_TCC, 256},
 	{ &cik_TD, 57, 16 },
 	{ &cik_TCP, 85, 16 },
 	{ &cik_GDS, 121 },
@@ -425,7 +425,7 @@
 static void si_pc_emit_instance(struct si_context *sctx,
 				int se, int instance)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned value = S_030800_SH_BROADCAST_WRITES(1);
 
 	if (se >= 0) {
@@ -446,7 +446,7 @@
 static void si_pc_emit_shaders(struct si_context *sctx,
 			       unsigned shaders)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2);
 	radeon_emit(cs, shaders & 0x7f);
@@ -459,7 +459,7 @@
 {
 	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
 	struct si_pc_block_base *regs = sigroup->b;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned idx;
 	unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK;
 	unsigned dw;
@@ -552,7 +552,7 @@
 static void si_pc_emit_start(struct si_context *sctx,
 			     struct r600_resource *buffer, uint64_t va)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer,
 				  RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
@@ -578,7 +578,7 @@
 static void si_pc_emit_stop(struct si_context *sctx,
 			    struct r600_resource *buffer, uint64_t va)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	si_gfx_write_event_eop(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0,
 			       EOP_DATA_SEL_VALUE_32BIT,
@@ -601,7 +601,7 @@
 {
 	struct si_pc_block *sigroup = (struct si_pc_block *)group->data;
 	struct si_pc_block_base *regs = sigroup->b;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned idx;
 	unsigned reg = regs->counter0_lo;
 	unsigned reg_delta = 8;
@@ -704,10 +704,13 @@
 		struct si_pc_block *block = &blocks[i];
 		unsigned instances = block->instances;
 
-		if (!strcmp(block->b->name, "IA")) {
-			if (screen->info.max_se > 2)
-				instances = 2;
-		}
+		if (!strcmp(block->b->name, "CB") ||
+		    !strcmp(block->b->name, "DB"))
+			instances = screen->info.max_se;
+		else if (!strcmp(block->b->name, "TCC"))
+			instances = screen->info.num_tcc_blocks;
+		else if (!strcmp(block->b->name, "IA"))
+			instances = MAX2(1, screen->info.max_se / 2);
 
 		si_perfcounters_add_block(screen, pc,
 					    block->b->name,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f2fdb98..6b36893 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -28,8 +28,10 @@
 #include "si_shader_internal.h"
 #include "sid.h"
 
+#include "ac_llvm_util.h"
 #include "radeon/radeon_uvd.h"
-#include "util/hash_table.h"
+#include "gallivm/lp_bld_misc.h"
+#include "util/disk_cache.h"
 #include "util/u_log.h"
 #include "util/u_memory.h"
 #include "util/u_suballoc.h"
@@ -55,6 +57,7 @@
 	/* Shader compiler options the shader cache should be aware of: */
 	{ "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
 	{ "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
+	{ "gisel", DBG(GISEL), "Enable LLVM global instruction selector." },
 
 	/* Shader compiler options (with no effect on the shader cache): */
 	{ "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
@@ -74,6 +77,7 @@
 	{ "nowc", DBG(NO_WC), "Disable GTT write combining" },
 	{ "check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info." },
 	{ "reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context." },
+	{ "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
 
 	/* 3D engine options: */
 	{ "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." },
@@ -90,7 +94,6 @@
 	{ "nodccclear", DBG(NO_DCC_CLEAR), "Disable DCC fast clear." },
 	{ "nodccfb", DBG(NO_DCC_FB), "Disable separate DCC on the main framebuffer" },
 	{ "nodccmsaa", DBG(NO_DCC_MSAA), "Disable DCC for MSAA" },
-	{ "dccmsaa", DBG(DCC_MSAA), "Enable DCC for MSAA" },
 	{ "nofmask", DBG(NO_FMASK), "Disable MSAA compression" },
 
 	/* Tests: */
@@ -102,6 +105,38 @@
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
 
+static void si_init_compiler(struct si_screen *sscreen,
+			     struct ac_llvm_compiler *compiler)
+{
+	/* Only create the less-optimizing version of the compiler on APUs
+	 * predating Ryzen (Raven). */
+	bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram &&
+				       sscreen->info.chip_class <= VI;
+
+	enum ac_target_machine_options tm_options =
+		(sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
+		(sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) |
+		(sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
+		(sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
+		(!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) |
+		(sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) |
+		(create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0);
+
+	ac_init_llvm_once();
+	ac_init_llvm_compiler(compiler, true, sscreen->info.family, tm_options);
+	compiler->passes = ac_create_llvm_passes(compiler->tm);
+
+	if (compiler->low_opt_tm)
+		compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
+}
+
+static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
+{
+	ac_destroy_llvm_passes(compiler->passes);
+	ac_destroy_llvm_passes(compiler->low_opt_passes);
+	ac_destroy_llvm_compiler(compiler);
+}
+
 /*
  * pipe_context
  */
@@ -170,7 +205,7 @@
 				sctx->b.destroy_query(&sctx->b,
 							sctx->dcc_stats[i].ps_stats[j]);
 
-		r600_texture_reference(&sctx->dcc_stats[i].tex, NULL);
+		si_texture_reference(&sctx->dcc_stats[i].tex, NULL);
 	}
 
 	if (sctx->query_result_shader)
@@ -200,7 +235,7 @@
 	sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
 	r600_resource_reference(&sctx->eop_bug_scratch, NULL);
 
-	LLVMDisposeTargetMachine(sctx->tm);
+	si_destroy_compiler(&sctx->compiler);
 
 	si_saved_cs_reference(&sctx->current_saved_cs, NULL);
 
@@ -215,25 +250,25 @@
 	FREE(sctx);
 }
 
-static enum pipe_reset_status
-si_amdgpu_get_reset_status(struct pipe_context *ctx)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-
-	return sctx->ws->ctx_query_reset_status(sctx->ctx);
-}
-
 static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	unsigned latest = sctx->ws->query_value(sctx->ws,
-						  RADEON_GPU_RESET_COUNTER);
 
-	if (sctx->gpu_reset_counter == latest)
-		return PIPE_NO_RESET;
+	if (sctx->screen->info.has_gpu_reset_status_query)
+		return sctx->ws->ctx_query_reset_status(sctx->ctx);
 
-	sctx->gpu_reset_counter = latest;
-	return PIPE_UNKNOWN_CONTEXT_RESET;
+	if (sctx->screen->info.has_gpu_reset_counter_query) {
+		unsigned latest = sctx->ws->query_value(sctx->ws,
+							RADEON_GPU_RESET_COUNTER);
+
+		if (sctx->gpu_reset_counter == latest)
+			return PIPE_NO_RESET;
+
+		sctx->gpu_reset_counter = latest;
+		return PIPE_UNKNOWN_CONTEXT_RESET;
+	}
+
+	return PIPE_NO_RESET;
 }
 
 static void si_set_device_reset_callback(struct pipe_context *ctx,
@@ -285,18 +320,6 @@
 		u_log_printf(sctx->log, "\nString marker: %*s\n", len, string);
 }
 
-static LLVMTargetMachineRef
-si_create_llvm_target_machine(struct si_screen *sscreen)
-{
-	enum ac_target_machine_options tm_options =
-		(sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) |
-		(sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) |
-		(sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) |
-		(!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0);
-
-	return ac_create_target_machine(sscreen->info.family, tm_options);
-}
-
 static void si_set_debug_callback(struct pipe_context *ctx,
 				  const struct pipe_debug_callback *cb)
 {
@@ -352,13 +375,12 @@
 	sctx->family = sscreen->info.family;
 	sctx->chip_class = sscreen->info.chip_class;
 
-	if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 43) {
-		sctx->b.get_device_reset_status = si_get_reset_status;
+	if (sscreen->info.has_gpu_reset_counter_query) {
 		sctx->gpu_reset_counter =
-				sctx->ws->query_value(sctx->ws,
-							RADEON_GPU_RESET_COUNTER);
+			sctx->ws->query_value(sctx->ws, RADEON_GPU_RESET_COUNTER);
 	}
 
+	sctx->b.get_device_reset_status = si_get_reset_status;
 	sctx->b.set_device_reset_callback = si_set_device_reset_callback;
 
 	si_init_context_texture_functions(sctx);
@@ -367,9 +389,9 @@
 	if (sctx->chip_class == CIK ||
 	    sctx->chip_class == VI ||
 	    sctx->chip_class == GFX9) {
-		sctx->eop_bug_scratch = (struct r600_resource*)
-					  pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
-							     16 * sscreen->info.num_render_backends);
+		sctx->eop_bug_scratch = r600_resource(
+			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   16 * sscreen->info.num_render_backends));
 		if (!sctx->eop_bug_scratch)
 			goto fail;
 	}
@@ -409,9 +431,6 @@
 						       sctx);
 	}
 
-	if (sscreen->info.drm_major == 3)
-		sctx->b.get_device_reset_status = si_amdgpu_get_reset_status;
-
 	si_init_buffer_functions(sctx);
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
@@ -438,10 +457,10 @@
 	if (!sctx->border_color_table)
 		goto fail;
 
-	sctx->border_color_buffer = (struct r600_resource*)
+	sctx->border_color_buffer = r600_resource(
 		pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT,
 				   SI_MAX_BORDER_COLORS *
-				   sizeof(*sctx->border_color_table));
+				   sizeof(*sctx->border_color_table)));
 	if (!sctx->border_color_buffer)
 		goto fail;
 
@@ -472,16 +491,16 @@
 	sctx->blitter->draw_rectangle = si_draw_rectangle;
 	sctx->blitter->skip_viewport_restore = true;
 
-	sctx->sample_mask.sample_mask = 0xffff;
+	sctx->sample_mask = 0xffff;
 
 	if (sctx->chip_class >= GFX9) {
-		sctx->wait_mem_scratch = (struct r600_resource*)
-			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4);
+		sctx->wait_mem_scratch = r600_resource(
+			pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4));
 		if (!sctx->wait_mem_scratch)
 			goto fail;
 
 		/* Initialize the memory. */
-		struct radeon_winsys_cs *cs = sctx->gfx_cs;
+		struct radeon_cmdbuf *cs = sctx->gfx_cs;
 		radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 		radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
 			    S_370_WR_CONFIRM(1) |
@@ -497,8 +516,8 @@
 	 * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
 	if (sctx->chip_class == CIK) {
 		sctx->null_const_buf.buffer =
-			si_aligned_buffer_create(screen,
-						 SI_RESOURCE_FLAG_32BIT,
+			pipe_aligned_buffer_create(screen,
+						   SI_RESOURCE_FLAG_32BIT,
 						   PIPE_USAGE_DEFAULT, 16,
 						   sctx->screen->info.tcc_cache_line_size);
 		if (!sctx->null_const_buf.buffer)
@@ -549,7 +568,7 @@
 	sctx->scratch_waves = MAX2(32 * sscreen->info.num_good_compute_units,
 				   max_threads_per_block / 64);
 
-	sctx->tm = si_create_llvm_target_machine(sscreen);
+	si_init_compiler(sscreen, &sctx->compiler);
 
 	/* Bindless handles. */
 	sctx->tex_handles = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
@@ -624,13 +643,11 @@
 	util_queue_destroy(&sscreen->shader_compiler_queue);
 	util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
 
-	for (i = 0; i < ARRAY_SIZE(sscreen->tm); i++)
-		if (sscreen->tm[i])
-			LLVMDisposeTargetMachine(sscreen->tm[i]);
+	for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
+		si_destroy_compiler(&sscreen->compiler[i]);
 
-	for (i = 0; i < ARRAY_SIZE(sscreen->tm_low_priority); i++)
-		if (sscreen->tm_low_priority[i])
-			LLVMDisposeTargetMachine(sscreen->tm_low_priority[i]);
+	for (i = 0; i < ARRAY_SIZE(sscreen->compiler_lowp); i++)
+		si_destroy_compiler(&sscreen->compiler_lowp[i]);
 
 	/* Free shader parts. */
 	for (i = 0; i < ARRAY_SIZE(parts); i++) {
@@ -659,39 +676,10 @@
 	FREE(sscreen);
 }
 
-static bool si_init_gs_info(struct si_screen *sscreen)
+static void si_init_gs_info(struct si_screen *sscreen)
 {
-	/* gs_table_depth is not used by GFX9 */
-	if (sscreen->info.chip_class >= GFX9)
-		return true;
-
-	switch (sscreen->info.family) {
-	case CHIP_OLAND:
-	case CHIP_HAINAN:
-	case CHIP_KAVERI:
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_ICELAND:
-	case CHIP_CARRIZO:
-	case CHIP_STONEY:
-		sscreen->gs_table_depth = 16;
-		return true;
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-	case CHIP_VERDE:
-	case CHIP_BONAIRE:
-	case CHIP_HAWAII:
-	case CHIP_TONGA:
-	case CHIP_FIJI:
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-		sscreen->gs_table_depth = 32;
-		return true;
-	default:
-		return false;
-	}
+	sscreen->gs_table_depth = ac_get_gs_table_depth(sscreen->info.chip_class,
+							sscreen->info.family);
 }
 
 static void si_handle_env_var_force_family(struct si_screen *sscreen)
@@ -763,48 +751,46 @@
 	if (sscreen->debug_flags & DBG_ALL_SHADERS)
 		return;
 
-	uint32_t mesa_timestamp;
-	if (disk_cache_get_function_timestamp(si_disk_cache_create,
-					      &mesa_timestamp)) {
-		char *timestamp_str;
-		int res = -1;
-		uint32_t llvm_timestamp;
+	struct mesa_sha1 ctx;
+	unsigned char sha1[20];
+	char cache_id[20 * 2 + 1];
 
-		if (disk_cache_get_function_timestamp(LLVMInitializeAMDGPUTargetInfo,
-						      &llvm_timestamp)) {
-			res = asprintf(&timestamp_str, "%u_%u",
-				       mesa_timestamp, llvm_timestamp);
-		}
+	_mesa_sha1_init(&ctx);
 
-		if (res != -1) {
-			/* These flags affect shader compilation. */
-			#define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) | \
-					   DBG(SI_SCHED) | \
-					   DBG(UNSAFE_MATH) | \
-					   DBG(NIR))
-			uint64_t shader_debug_flags = sscreen->debug_flags &
-						      ALL_FLAGS;
+	if (!disk_cache_get_function_identifier(si_disk_cache_create, &ctx) ||
+	    !disk_cache_get_function_identifier(LLVMInitializeAMDGPUTargetInfo,
+						&ctx))
+		return;
 
-			/* Add the high bits of 32-bit addresses, which affects
-			 * how 32-bit addresses are expanded to 64 bits.
-			 */
-			STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
-			shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32;
+	_mesa_sha1_final(&ctx, sha1);
+	disk_cache_format_hex_id(cache_id, sha1, 20 * 2);
 
-			sscreen->disk_shader_cache =
-				disk_cache_create(si_get_family_name(sscreen),
-						  timestamp_str,
-						  shader_debug_flags);
-			free(timestamp_str);
-		}
-	}
+	/* These flags affect shader compilation. */
+	#define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |	\
+			   DBG(SI_SCHED) |			\
+			   DBG(GISEL) |				\
+			   DBG(UNSAFE_MATH) |			\
+			   DBG(NIR))
+	uint64_t shader_debug_flags = sscreen->debug_flags &
+		ALL_FLAGS;
+
+	/* Add the high bits of 32-bit addresses, which affects
+	 * how 32-bit addresses are expanded to 64 bits.
+	 */
+	STATIC_ASSERT(ALL_FLAGS <= UINT_MAX);
+	shader_debug_flags |= (uint64_t)sscreen->info.address32_hi << 32;
+
+	sscreen->disk_shader_cache =
+		disk_cache_create(si_get_family_name(sscreen),
+				  cache_id,
+				  shader_debug_flags);
 }
 
 struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
 					   const struct pipe_screen_config *config)
 {
 	struct si_screen *sscreen = CALLOC_STRUCT(si_screen);
-	unsigned num_threads, num_compiler_threads, num_compiler_threads_lowprio, i;
+	unsigned hw_threads, num_comp_hi_threads, num_comp_lo_threads, i;
 
 	if (!sscreen) {
 		return NULL;
@@ -812,6 +798,7 @@
 
 	sscreen->ws = ws;
 	ws->query_info(ws, &sscreen->info);
+	si_handle_env_var_force_family(sscreen);
 
 	sscreen->debug_flags = debug_get_flags_option("R600_DEBUG",
 							debug_options, 0);
@@ -841,7 +828,7 @@
 		ac_print_gpu_info(&sscreen->info);
 
 	slab_create_parent(&sscreen->pool_transfers,
-			   sizeof(struct r600_transfer), 64);
+			   sizeof(struct si_transfer), 64);
 
 	sscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
 	if (sscreen->force_aniso >= 0) {
@@ -853,25 +840,38 @@
 	(void) mtx_init(&sscreen->aux_context_lock, mtx_plain);
 	(void) mtx_init(&sscreen->gpu_load_mutex, mtx_plain);
 
-	if (!si_init_gs_info(sscreen) ||
-	    !si_init_shader_cache(sscreen)) {
+	si_init_gs_info(sscreen);
+	if (!si_init_shader_cache(sscreen)) {
 		FREE(sscreen);
 		return NULL;
 	}
 
 	si_disk_cache_create(sscreen);
 
-	/* Only enable as many threads as we have target machines, but at most
-	 * the number of CPUs - 1 if there is more than one.
-	 */
-	num_threads = sysconf(_SC_NPROCESSORS_ONLN);
-	num_threads = MAX2(1, num_threads - 1);
-	num_compiler_threads = MIN2(num_threads, ARRAY_SIZE(sscreen->tm));
-	num_compiler_threads_lowprio =
-		MIN2(num_threads, ARRAY_SIZE(sscreen->tm_low_priority));
+	/* Determine the number of shader compiler threads. */
+	hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
 
-	if (!util_queue_init(&sscreen->shader_compiler_queue, "si_shader",
-			     32, num_compiler_threads,
+	if (hw_threads >= 12) {
+		num_comp_hi_threads = hw_threads * 3 / 4;
+		num_comp_lo_threads = hw_threads / 3;
+	} else if (hw_threads >= 6) {
+		num_comp_hi_threads = hw_threads - 2;
+		num_comp_lo_threads = hw_threads / 2;
+	} else if (hw_threads >= 2) {
+		num_comp_hi_threads = hw_threads - 1;
+		num_comp_lo_threads = hw_threads / 2;
+	} else {
+		num_comp_hi_threads = 1;
+		num_comp_lo_threads = 1;
+	}
+
+	num_comp_hi_threads = MIN2(num_comp_hi_threads,
+				   ARRAY_SIZE(sscreen->compiler));
+	num_comp_lo_threads = MIN2(num_comp_lo_threads,
+				   ARRAY_SIZE(sscreen->compiler_lowp));
+
+	if (!util_queue_init(&sscreen->shader_compiler_queue, "sh",
+			     64, num_comp_hi_threads,
 			     UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
 		si_destroy_shader_cache(sscreen);
 		FREE(sscreen);
@@ -879,8 +879,8 @@
 	}
 
 	if (!util_queue_init(&sscreen->shader_compiler_queue_low_priority,
-			     "si_shader_low",
-			     32, num_compiler_threads_lowprio,
+			     "shlo",
+			     64, num_comp_lo_threads,
 			     UTIL_QUEUE_INIT_RESIZE_IF_FULL |
 			     UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
 	       si_destroy_shader_cache(sscreen);
@@ -888,8 +888,6 @@
 	       return NULL;
 	}
 
-	si_handle_env_var_force_family(sscreen);
-
 	if (!debug_get_bool_option("RADEON_DISABLE_PERFCOUNTERS", false))
 		si_init_perfcounters(sscreen);
 
@@ -903,7 +901,8 @@
 	unsigned max_offchip_buffers_per_se;
 
 	/* Only certain chips can use the maximum value. */
-	if (sscreen->info.family == CHIP_VEGA12)
+	if (sscreen->info.family == CHIP_VEGA12 ||
+	    sscreen->info.family == CHIP_VEGA20)
 		max_offchip_buffers_per_se = double_offchip_buffers ? 128 : 64;
 	else
 		max_offchip_buffers_per_se = double_offchip_buffers ? 127 : 63;
@@ -1013,9 +1012,7 @@
 	}
 
 	sscreen->dcc_msaa_allowed =
-		!(sscreen->debug_flags & DBG(NO_DCC_MSAA)) &&
-		(sscreen->debug_flags & DBG(DCC_MSAA) ||
-		 sscreen->info.chip_class == VI);
+		!(sscreen->debug_flags & DBG(NO_DCC_MSAA));
 
 	sscreen->cpdma_prefetch_writes_memory = sscreen->info.chip_class <= VI;
 
@@ -1033,10 +1030,35 @@
 	if (debug_get_bool_option("RADEON_DUMP_SHADERS", false))
 		sscreen->debug_flags |= DBG_ALL_SHADERS;
 
-	for (i = 0; i < num_compiler_threads; i++)
-		sscreen->tm[i] = si_create_llvm_target_machine(sscreen);
-	for (i = 0; i < num_compiler_threads_lowprio; i++)
-		sscreen->tm_low_priority[i] = si_create_llvm_target_machine(sscreen);
+	/* Syntax:
+	 *     EQAA=s,z,c
+	 * Example:
+	 *     EQAA=8,4,2
+	 *
+	 * That means 8 coverage samples, 4 Z/S samples, and 2 color samples.
+	 * Constraints:
+	 *     s >= z >= c (ignoring this only wastes memory)
+	 *     s = [2..16]
+	 *     z = [2..8]
+	 *     c = [2..8]
+	 *
+	 * Only MSAA color and depth buffers are overridden.
+	 */
+	if (sscreen->info.has_eqaa_surface_allocator) {
+		const char *eqaa = debug_get_option("EQAA", NULL);
+		unsigned s,z,f;
+
+		if (eqaa && sscanf(eqaa, "%u,%u,%u", &s, &z, &f) == 3 && s && z && f) {
+			sscreen->eqaa_force_coverage_samples = s;
+			sscreen->eqaa_force_z_samples = z;
+			sscreen->eqaa_force_color_samples = f;
+		}
+	}
+
+	for (i = 0; i < num_comp_hi_threads; i++)
+		si_init_compiler(sscreen, &sscreen->compiler[i]);
+	for (i = 0; i < num_comp_lo_threads; i++)
+		si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
 
 	/* Create the auxiliary context. This must be done last. */
 	sscreen->aux_context = si_create_context(&sscreen->b, 0);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index e75663d..9ab79bc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -26,9 +26,11 @@
 #define SI_PIPE_H
 
 #include "si_shader.h"
+#include "si_state.h"
 
 #include "util/u_dynarray.h"
 #include "util/u_idalloc.h"
+#include "util/u_threaded_context.h"
 
 #ifdef PIPE_ARCH_BIG_ENDIAN
 #define SI_BIG_ENDIAN 1
@@ -119,6 +121,7 @@
 	DBG_FS_CORRECT_DERIVS_AFTER_KILL,
 	DBG_UNSAFE_MATH,
 	DBG_SI_SCHED,
+	DBG_GISEL,
 
 	/* Shader compiler options (with no effect on the shader cache): */
 	DBG_CHECK_IR,
@@ -138,6 +141,7 @@
 	DBG_NO_WC,
 	DBG_CHECK_VM,
 	DBG_RESERVE_VMID,
+	DBG_ZERO_VRAM,
 
 	/* 3D engine options: */
 	DBG_SWITCH_ON_EOP,
@@ -154,7 +158,6 @@
 	DBG_NO_DCC_CLEAR,
 	DBG_NO_DCC_FB,
 	DBG_NO_DCC_MSAA,
-	DBG_DCC_MSAA,
 	DBG_NO_FMASK,
 
 	/* Tests: */
@@ -171,6 +174,226 @@
 struct hash_table;
 struct u_suballocator;
 
+/* Only 32-bit buffer allocations are supported, gallium doesn't support more
+ * at the moment.
+ */
+struct r600_resource {
+	struct threaded_resource	b;
+
+	/* Winsys objects. */
+	struct pb_buffer		*buf;
+	uint64_t			gpu_address;
+	/* Memory usage if the buffer placement is optimal. */
+	uint64_t			vram_usage;
+	uint64_t			gart_usage;
+
+	/* Resource properties. */
+	uint64_t			bo_size;
+	unsigned			bo_alignment;
+	enum radeon_bo_domain		domains;
+	enum radeon_bo_flag		flags;
+	unsigned			bind_history;
+	int				max_forced_staging_uploads;
+
+	/* The buffer range which is initialized (with a write transfer,
+	 * streamout, DMA, or as a random access target). The rest of
+	 * the buffer is considered invalid and can be mapped unsynchronized.
+	 *
+	 * This allows unsynchronized mapping of a buffer range which hasn't
+	 * been used yet. It's for applications which forget to use
+	 * the unsynchronized map flag and expect the driver to figure it out.
+         */
+	struct util_range		valid_buffer_range;
+
+	/* For buffers only. This indicates that a write operation has been
+	 * performed by TC L2, but the cache hasn't been flushed.
+	 * Any hw block which doesn't use or bypasses TC L2 should check this
+	 * flag and flush the cache before using the buffer.
+	 *
+	 * For example, TC L2 must be flushed if a buffer which has been
+	 * modified by a shader store instruction is about to be used as
+	 * an index buffer. The reason is that VGT DMA index fetching doesn't
+	 * use TC L2.
+	 */
+	bool				TC_L2_dirty;
+
+	/* Whether this resource is referenced by bindless handles. */
+	bool				texture_handle_allocated;
+	bool				image_handle_allocated;
+
+	/* Whether the resource has been exported via resource_get_handle. */
+	unsigned			external_usage; /* PIPE_HANDLE_USAGE_* */
+};
+
+struct si_transfer {
+	struct threaded_transfer	b;
+	struct r600_resource		*staging;
+	unsigned			offset;
+};
+
+struct si_texture {
+	struct r600_resource		buffer;
+
+	struct radeon_surf		surface;
+	uint64_t			size;
+	struct si_texture		*flushed_depth_texture;
+
+	/* Colorbuffer compression and fast clear. */
+	uint64_t			fmask_offset;
+	uint64_t			cmask_offset;
+	uint64_t			cmask_base_address_reg;
+	struct r600_resource		*cmask_buffer;
+	uint64_t			dcc_offset; /* 0 = disabled */
+	unsigned			cb_color_info; /* fast clear enable bit */
+	unsigned			color_clear_value[2];
+	unsigned			last_msaa_resolve_target_micro_mode;
+	unsigned			num_level0_transfers;
+
+	/* Depth buffer compression and fast clear. */
+	uint64_t			htile_offset;
+	float				depth_clear_value;
+	uint16_t			dirty_level_mask; /* each bit says if that mipmap is compressed */
+	uint16_t			stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
+	enum pipe_format		db_render_format:16;
+	uint8_t				stencil_clear_value;
+	bool				tc_compatible_htile:1;
+	bool				depth_cleared:1; /* if it was cleared at least once */
+	bool				stencil_cleared:1; /* if it was cleared at least once */
+	bool				upgraded_depth:1; /* upgraded from unorm to Z32_FLOAT */
+	bool				is_depth:1;
+	bool				db_compatible:1;
+	bool				can_sample_z:1;
+	bool				can_sample_s:1;
+
+	/* We need to track DCC dirtiness, because st/dri usually calls
+	 * flush_resource twice per frame (not a bug) and we don't wanna
+	 * decompress DCC twice. Also, the dirty tracking must be done even
+	 * if DCC isn't used, because it's required by the DCC usage analysis
+	 * for a possible future enablement.
+	 */
+	bool				separate_dcc_dirty:1;
+	/* Statistics gathering for the DCC enablement heuristic. */
+	bool				dcc_gather_statistics:1;
+	/* Counter that should be non-zero if the texture is bound to a
+	 * framebuffer.
+	 */
+	unsigned                        framebuffers_bound;
+	/* Whether the texture is a displayable back buffer and needs DCC
+	 * decompression, which is expensive. Therefore, it's enabled only
+	 * if statistics suggest that it will pay off and it's allocated
+	 * separately. It can't be bound as a sampler by apps. Limited to
+	 * target == 2D and last_level == 0. If enabled, dcc_offset contains
+	 * the absolute GPUVM address, not the relative one.
+	 */
+	struct r600_resource		*dcc_separate_buffer;
+	/* When DCC is temporarily disabled, the separate buffer is here. */
+	struct r600_resource		*last_dcc_separate_buffer;
+	/* Estimate of how much this color buffer is written to in units of
+	 * full-screen draws: ps_invocations / (width * height)
+	 * Shader kills, late Z, and blending with trivial discards make it
+	 * inaccurate (we need to count CB updates, not PS invocations).
+	 */
+	unsigned			ps_draw_ratio;
+	/* The number of clears since the last DCC usage analysis. */
+	unsigned			num_slow_clears;
+};
+
+struct si_surface {
+	struct pipe_surface		base;
+
+	/* These can vary with block-compressed textures. */
+	uint16_t width0;
+	uint16_t height0;
+
+	bool color_initialized:1;
+	bool depth_initialized:1;
+
+	/* Misc. color flags. */
+	bool color_is_int8:1;
+	bool color_is_int10:1;
+	bool dcc_incompatible:1;
+
+	/* Color registers. */
+	unsigned cb_color_info;
+	unsigned cb_color_view;
+	unsigned cb_color_attrib;
+	unsigned cb_color_attrib2;	/* GFX9 and later */
+	unsigned cb_dcc_control;	/* VI and later */
+	unsigned spi_shader_col_format:8;	/* no blending, no alpha-to-coverage. */
+	unsigned spi_shader_col_format_alpha:8;	/* alpha-to-coverage */
+	unsigned spi_shader_col_format_blend:8;	/* blending without alpha. */
+	unsigned spi_shader_col_format_blend_alpha:8; /* blending with alpha. */
+
+	/* DB registers. */
+	uint64_t db_depth_base;		/* DB_Z_READ/WRITE_BASE */
+	uint64_t db_stencil_base;
+	uint64_t db_htile_data_base;
+	unsigned db_depth_info;
+	unsigned db_z_info;
+	unsigned db_z_info2;		/* GFX9+ */
+	unsigned db_depth_view;
+	unsigned db_depth_size;
+	unsigned db_depth_slice;
+	unsigned db_stencil_info;
+	unsigned db_stencil_info2;	/* GFX9+ */
+	unsigned db_htile_surface;
+};
+
+struct si_mmio_counter {
+	unsigned busy;
+	unsigned idle;
+};
+
+union si_mmio_counters {
+	struct {
+		/* For global GPU load including SDMA. */
+		struct si_mmio_counter gpu;
+
+		/* GRBM_STATUS */
+		struct si_mmio_counter spi;
+		struct si_mmio_counter gui;
+		struct si_mmio_counter ta;
+		struct si_mmio_counter gds;
+		struct si_mmio_counter vgt;
+		struct si_mmio_counter ia;
+		struct si_mmio_counter sx;
+		struct si_mmio_counter wd;
+		struct si_mmio_counter bci;
+		struct si_mmio_counter sc;
+		struct si_mmio_counter pa;
+		struct si_mmio_counter db;
+		struct si_mmio_counter cp;
+		struct si_mmio_counter cb;
+
+		/* SRBM_STATUS2 */
+		struct si_mmio_counter sdma;
+
+		/* CP_STAT */
+		struct si_mmio_counter pfp;
+		struct si_mmio_counter meq;
+		struct si_mmio_counter me;
+		struct si_mmio_counter surf_sync;
+		struct si_mmio_counter cp_dma;
+		struct si_mmio_counter scratch_ram;
+	} named;
+	unsigned array[0];
+};
+
+struct si_memory_object {
+	struct pipe_memory_object	b;
+	struct pb_buffer		*buf;
+	uint32_t			stride;
+};
+
+/* Saved CS data for debugging features. */
+struct radeon_saved_cs {
+	uint32_t			*ib;
+	unsigned			num_dw;
+
+	struct radeon_bo_list_item	*bo_list;
+	unsigned			bo_count;
+};
+
 struct si_screen {
 	struct pipe_screen		b;
 	struct radeon_winsys		*ws;
@@ -185,6 +408,9 @@
 	unsigned			tess_offchip_ring_size;
 	unsigned			tess_factor_ring_size;
 	unsigned			vgt_hs_offchip_param;
+	unsigned			eqaa_force_coverage_samples;
+	unsigned			eqaa_force_z_samples;
+	unsigned			eqaa_force_color_samples;
 	bool				has_clear_state;
 	bool				has_distributed_tess;
 	bool				has_draw_indirect_multi;
@@ -288,17 +514,16 @@
 	struct util_queue		shader_compiler_queue;
 	/* Use at most 3 normal compiler threads on quadcore and better.
 	 * Hyperthreaded CPUs report the number of threads, but we want
-	 * the number of cores. */
-	LLVMTargetMachineRef		tm[3]; /* used by the queue only */
+	 * the number of cores. We only need this many threads for shader-db. */
+	struct ac_llvm_compiler		compiler[24]; /* used by the queue only */
 
 	struct util_queue		shader_compiler_queue_low_priority;
 	/* Use at most 2 low priority threads on quadcore and better.
 	 * We want to minimize the impact on multithreaded Mesa. */
-	LLVMTargetMachineRef		tm_low_priority[2]; /* at most 2 threads */
+	struct ac_llvm_compiler		compiler_lowp[10];
 };
 
 struct si_blend_color {
-	struct r600_atom		atom;
 	struct pipe_blend_color		state;
 	bool				any_nonzeros;
 };
@@ -353,7 +578,6 @@
 };
 
 struct si_framebuffer {
-	struct r600_atom		atom;
 	struct pipe_framebuffer_state	state;
 	unsigned			colorbuf_enabled_4bit;
 	unsigned			spi_shader_col_format;
@@ -362,6 +586,7 @@
 	unsigned			spi_shader_col_format_blend_alpha;
 	ubyte				nr_samples:5; /* at most 16xAA */
 	ubyte				log_samples:3; /* at most 4 = 16xAA */
+	ubyte				nr_color_samples; /* at most 8xAA */
 	ubyte				compressed_cb_mask;
 	ubyte				uncompressed_cb_mask;
 	ubyte				color_is_int8;
@@ -381,13 +606,11 @@
 };
 
 struct si_scissors {
-	struct r600_atom		atom;
 	unsigned			dirty_mask;
 	struct pipe_scissor_state	states[SI_MAX_VIEWPORTS];
 };
 
 struct si_viewports {
-	struct r600_atom		atom;
 	unsigned			dirty_mask;
 	unsigned			depth_range_dirty_mask;
 	struct pipe_viewport_state	states[SI_MAX_VIEWPORTS];
@@ -395,21 +618,10 @@
 };
 
 struct si_clip_state {
-	struct r600_atom		atom;
 	struct pipe_clip_state		state;
 	bool				any_nonzeros;
 };
 
-struct si_sample_locs {
-	struct r600_atom	atom;
-	unsigned		nr_samples;
-};
-
-struct si_sample_mask {
-	struct r600_atom	atom;
-	uint16_t		sample_mask;
-};
-
 struct si_streamout_target {
 	struct pipe_stream_output_target b;
 
@@ -422,7 +634,6 @@
 };
 
 struct si_streamout {
-	struct r600_atom		begin_atom;
 	bool				begin_emitted;
 
 	unsigned			enabled_mask;
@@ -441,7 +652,6 @@
 	unsigned			hw_enabled_mask;
 
 	/* The state of VGT_STRMOUT_(CONFIG|EN). */
-	struct r600_atom		enable_atom;
 	bool				streamout_enabled;
 	bool				prims_gen_query_enabled;
 	int				num_prims_gen_queries;
@@ -526,8 +736,8 @@
 
 	struct radeon_winsys		*ws;
 	struct radeon_winsys_ctx	*ctx;
-	struct radeon_winsys_cs		*gfx_cs;
-	struct radeon_winsys_cs		*dma_cs;
+	struct radeon_cmdbuf		*gfx_cs;
+	struct radeon_cmdbuf		*dma_cs;
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct pipe_fence_handle	*last_sdma_fence;
 	struct r600_resource		*eop_bug_scratch;
@@ -552,7 +762,7 @@
 	void				*vs_blit_texcoord;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
-	LLVMTargetMachineRef		tm; /* only non-threaded compilation */
+	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	struct r600_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
@@ -583,19 +793,13 @@
 
 	/* Atom declarations. */
 	struct si_framebuffer		framebuffer;
-	struct si_sample_locs		msaa_sample_locs;
-	struct r600_atom		db_render_state;
-	struct r600_atom		dpbb_state;
-	struct r600_atom		msaa_config;
-	struct si_sample_mask		sample_mask;
-	struct r600_atom		cb_render_state;
+	unsigned			sample_locs_num_samples;
+	uint16_t			sample_mask;
 	unsigned			last_cb_target_mask;
 	struct si_blend_color		blend_color;
-	struct r600_atom		clip_regs;
 	struct si_clip_state		clip_state;
 	struct si_shader_data		shader_pointers;
 	struct si_stencil_ref		stencil_ref;
-	struct r600_atom		spi_map;
 	struct si_scissors		scissors;
 	struct si_streamout		streamout;
 	struct si_viewports		viewports;
@@ -681,7 +885,6 @@
 	int			last_sh_base_reg;
 	int			last_primitive_restart_en;
 	int			last_restart_index;
-	int			last_gs_out_prim;
 	int			last_prim;
 	int			last_multi_vgt_param;
 	int			last_rast_prim;
@@ -691,7 +894,6 @@
 	enum pipe_prim_type	current_rast_prim; /* primitive type after TES, GS */
 
 	/* Scratch buffer */
-	struct r600_atom	scratch_state;
 	struct r600_resource	*scratch_buffer;
 	unsigned		scratch_waves;
 	unsigned		spi_tmpring_size;
@@ -706,6 +908,7 @@
 	int			last_tes_sh_base;
 	bool			last_tess_uses_primid;
 	unsigned		last_num_patches;
+	int			last_ls_hs_config;
 
 	/* Debug state. */
 	bool			is_debug;
@@ -716,7 +919,7 @@
 	/* Other state */
 	bool need_check_render_feedback;
 	bool			decompression_enabled;
-
+	bool			dpbb_force_off;
 	bool			vs_writes_viewport_index;
 	bool			vs_disables_clipping_viewport;
 
@@ -787,14 +990,13 @@
 	unsigned			num_cs_dw_queries_suspend;
 
 	/* Render condition. */
-	struct r600_atom		render_cond_atom;
 	struct pipe_query		*render_cond;
 	unsigned			render_cond_mode;
 	bool				render_cond_invert;
 	bool				render_cond_force_off; /* for u_blitter */
 
 	/* Statistics gathering for the DCC enablement heuristic. It can't be
-	 * in r600_texture because r600_texture can be shared by multiple
+	 * in si_texture because si_texture can be shared by multiple
 	 * contexts. This is for back buffers only. We shouldn't get too many
 	 * of those.
 	 *
@@ -803,7 +1005,7 @@
 	 * enabled by DCC stat gathering.
 	 */
 	struct {
-		struct r600_texture		*tex;
+		struct si_texture		*tex;
 		/* Query queue: 0 = usually active, 1 = waiting, 2 = readback. */
 		struct pipe_query		*ps_stats[3];
 		/* If all slots are used and another slot is needed,
@@ -823,6 +1025,8 @@
 
 	void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst,
 				 uint64_t offset, uint64_t size, unsigned value);
+
+	struct si_tracked_regs			tracked_regs;
 };
 
 /* cik_sdma.c */
@@ -848,10 +1052,10 @@
 			     struct pipe_resource *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
-void si_decompress_dcc(struct si_context *sctx, struct r600_texture *rtex);
+void si_decompress_dcc(struct si_context *sctx, struct si_texture *tex);
 void si_blit_decompress_depth(struct pipe_context *ctx,
-			      struct r600_texture *texture,
-			      struct r600_texture *staging,
+			      struct si_texture *texture,
+			      struct si_texture *staging,
 			      unsigned first_level, unsigned last_level,
 			      unsigned first_layer, unsigned last_layer,
 			      unsigned first_sample, unsigned last_sample);
@@ -868,11 +1072,12 @@
 			     uint64_t size, unsigned alignment);
 bool si_alloc_resource(struct si_screen *sscreen,
 		       struct r600_resource *res);
-struct pipe_resource *si_aligned_buffer_create(struct pipe_screen *screen,
-					       unsigned flags,
-					       unsigned usage,
-					       unsigned size,
-					       unsigned alignment);
+struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen,
+						 unsigned flags, unsigned usage,
+						 unsigned size, unsigned alignment);
+struct r600_resource *si_aligned_buffer_create(struct pipe_screen *screen,
+					       unsigned flags, unsigned usage,
+					       unsigned size, unsigned alignment);
 void si_replace_buffer_storage(struct pipe_context *ctx,
 			       struct pipe_resource *dst,
 			       struct pipe_resource *src);
@@ -880,8 +1085,10 @@
 void si_init_buffer_functions(struct si_context *sctx);
 
 /* si_clear.c */
+enum pipe_format si_simplify_cb_format(enum pipe_format format);
+bool vi_alpha_is_on_msb(enum pipe_format format);
 void vi_dcc_clear_level(struct si_context *sctx,
-			struct r600_texture *rtex,
+			struct si_texture *tex,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
@@ -917,7 +1124,7 @@
 void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
-void si_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs,
+void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		struct radeon_saved_cs *saved, bool get_buffer_list);
 void si_clear_saved_cs(struct radeon_saved_cs *saved);
 void si_destroy_saved_cs(struct si_saved_cs *scs);
@@ -974,6 +1181,9 @@
 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
 
+/* r600_perfcounters.c */
+void si_perfcounters_destroy(struct si_screen *sscreen);
+
 /* si_perfcounters.c */
 void si_init_perfcounters(struct si_screen *screen);
 
@@ -1000,30 +1210,23 @@
 void si_update_vs_viewport_state(struct si_context *ctx);
 void si_init_viewport_functions(struct si_context *ctx);
 
-/* r600_texture.c */
+/* si_texture.c */
 bool si_prepare_for_dma_blit(struct si_context *sctx,
-			     struct r600_texture *rdst,
+			     struct si_texture *dst,
 			     unsigned dst_level, unsigned dstx,
 			     unsigned dsty, unsigned dstz,
-			     struct r600_texture *rsrc,
+			     struct si_texture *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box);
-void si_texture_get_fmask_info(struct si_screen *sscreen,
-			       struct r600_texture *rtex,
-			       unsigned nr_samples,
-			       struct r600_fmask_info *out);
-void si_texture_get_cmask_info(struct si_screen *sscreen,
-			       struct r600_texture *rtex,
-			       struct r600_cmask_info *out);
 void si_eliminate_fast_color_clear(struct si_context *sctx,
-				   struct r600_texture *rtex);
+				   struct si_texture *tex);
 void si_texture_discard_cmask(struct si_screen *sscreen,
-			      struct r600_texture *rtex);
+			      struct si_texture *tex);
 bool si_init_flushed_depth_texture(struct pipe_context *ctx,
 				   struct pipe_resource *texture,
-				   struct r600_texture **staging);
+				   struct si_texture **staging);
 void si_print_texture_info(struct si_screen *sscreen,
-			   struct r600_texture *rtex, struct u_log_context *log);
+			   struct si_texture *tex, struct u_log_context *log);
 struct pipe_resource *si_texture_create(struct pipe_screen *screen,
 					const struct pipe_resource *templ);
 bool vi_dcc_formats_compatible(enum pipe_format format1,
@@ -1042,15 +1245,15 @@
 					      unsigned width, unsigned height);
 unsigned si_translate_colorswap(enum pipe_format format, bool do_endian_swap);
 void vi_separate_dcc_try_enable(struct si_context *sctx,
-				struct r600_texture *tex);
+				struct si_texture *tex);
 void vi_separate_dcc_start_query(struct si_context *sctx,
-				 struct r600_texture *tex);
+				 struct si_texture *tex);
 void vi_separate_dcc_stop_query(struct si_context *sctx,
-				struct r600_texture *tex);
+				struct si_texture *tex);
 void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
-					     struct r600_texture *tex);
+					     struct si_texture *tex);
 bool si_texture_disable_dcc(struct si_context *sctx,
-			    struct r600_texture *rtex);
+			    struct si_texture *tex);
 void si_init_screen_texture_functions(struct si_screen *sscreen);
 void si_init_context_texture_functions(struct si_context *sctx);
 
@@ -1059,15 +1262,46 @@
  * common helpers
  */
 
+static inline struct r600_resource *r600_resource(struct pipe_resource *r)
+{
+	return (struct r600_resource*)r;
+}
+
+static inline void
+r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
+{
+	pipe_resource_reference((struct pipe_resource **)ptr,
+				(struct pipe_resource *)res);
+}
+
+static inline void
+si_texture_reference(struct si_texture **ptr, struct si_texture *res)
+{
+	pipe_resource_reference((struct pipe_resource **)ptr, &res->buffer.b.b);
+}
+
+static inline bool
+vi_dcc_enabled(struct si_texture *tex, unsigned level)
+{
+	return tex->dcc_offset && level < tex->surface.num_dcc_levels;
+}
+
+static inline unsigned
+si_tile_mode_index(struct si_texture *tex, unsigned level, bool stencil)
+{
+	if (stencil)
+		return tex->surface.u.legacy.stencil_tiling_index[level];
+	else
+		return tex->surface.u.legacy.tiling_index[level];
+}
+
 static inline void
 si_context_add_resource_size(struct si_context *sctx, struct pipe_resource *r)
 {
-	struct r600_resource *res = (struct r600_resource *)r;
-
-	if (res) {
+	if (r) {
 		/* Add memory usage for need_gfx_cs_space */
-		sctx->vram += res->vram_usage;
-		sctx->gtt += res->gart_usage;
+		sctx->vram += r600_resource(r)->vram_usage;
+		sctx->gtt += r600_resource(r)->gart_usage;
 	}
 }
 
@@ -1077,11 +1311,16 @@
 	sctx->last_base_vertex = SI_BASE_VERTEX_UNKNOWN;
 }
 
-static inline void
-si_set_atom_dirty(struct si_context *sctx,
-		  struct r600_atom *atom, bool dirty)
+static inline unsigned
+si_get_atom_bit(struct si_context *sctx, struct si_atom *atom)
 {
-	unsigned bit = 1 << atom->id;
+	return 1 << (atom - sctx->atoms.array);
+}
+
+static inline void
+si_set_atom_dirty(struct si_context *sctx, struct si_atom *atom, bool dirty)
+{
+	unsigned bit = si_get_atom_bit(sctx, atom);
 
 	if (dirty)
 		sctx->dirty_atoms |= bit;
@@ -1090,17 +1329,13 @@
 }
 
 static inline bool
-si_is_atom_dirty(struct si_context *sctx,
-		  struct r600_atom *atom)
+si_is_atom_dirty(struct si_context *sctx, struct si_atom *atom)
 {
-	unsigned bit = 1 << atom->id;
-
-	return sctx->dirty_atoms & bit;
+	return (sctx->dirty_atoms & si_get_atom_bit(sctx, atom)) != 0;
 }
 
 static inline void
-si_mark_atom_dirty(struct si_context *sctx,
-		   struct r600_atom *atom)
+si_mark_atom_dirty(struct si_context *sctx, struct si_atom *atom)
 {
 	si_set_atom_dirty(sctx, atom, true);
 }
@@ -1137,13 +1372,6 @@
 	return sscreen->debug_flags & (1 << processor);
 }
 
-static inline bool si_extra_shader_checks(struct si_screen *sscreen,
-					  unsigned processor)
-{
-	return (sscreen->debug_flags & DBG(CHECK_IR)) ||
-	       si_can_dump_shader(sscreen, processor);
-}
-
 static inline bool si_get_strmout_en(struct si_context *sctx)
 {
 	return sctx->streamout.streamout_enabled ||
@@ -1219,20 +1447,20 @@
 }
 
 static inline bool
-si_can_sample_zs(struct r600_texture *tex, bool stencil_sampler)
+si_can_sample_zs(struct si_texture *tex, bool stencil_sampler)
 {
 	return (stencil_sampler && tex->can_sample_s) ||
 	       (!stencil_sampler && tex->can_sample_z);
 }
 
 static inline bool
-si_htile_enabled(struct r600_texture *tex, unsigned level)
+si_htile_enabled(struct si_texture *tex, unsigned level)
 {
 	return tex->htile_offset && level == 0;
 }
 
 static inline bool
-vi_tc_compat_htile_enabled(struct r600_texture *tex, unsigned level)
+vi_tc_compat_htile_enabled(struct si_texture *tex, unsigned level)
 {
 	assert(!tex->tc_compatible_htile || tex->htile_offset);
 	return tex->tc_compatible_htile && level == 0;
@@ -1241,9 +1469,9 @@
 static inline unsigned si_get_ps_iter_samples(struct si_context *sctx)
 {
 	if (sctx->ps_uses_fbfetch)
-		return sctx->framebuffer.nr_samples;
+		return sctx->framebuffer.nr_color_samples;
 
-	return sctx->ps_iter_samples;
+	return MIN2(sctx->ps_iter_samples, sctx->framebuffer.nr_color_samples);
 }
 
 static inline unsigned si_get_total_colormask(struct si_context *sctx)
@@ -1266,6 +1494,23 @@
 	return colormask;
 }
 
+#define UTIL_ALL_PRIM_LINE_MODES ((1 << PIPE_PRIM_LINES) | \
+				  (1 << PIPE_PRIM_LINE_LOOP) | \
+				  (1 << PIPE_PRIM_LINE_STRIP) | \
+				  (1 << PIPE_PRIM_LINES_ADJACENCY) | \
+				  (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY))
+
+static inline bool util_prim_is_lines(unsigned prim)
+{
+	return ((1 << prim) & UTIL_ALL_PRIM_LINE_MODES) != 0;
+}
+
+static inline bool util_prim_is_points_or_lines(unsigned prim)
+{
+	return ((1 << prim) & (UTIL_ALL_PRIM_LINE_MODES |
+			       (1 << PIPE_PRIM_POINTS))) != 0;
+}
+
 /**
  * Return true if there is enough memory in VRAM and GTT for the buffers
  * added so far.
@@ -1275,7 +1520,7 @@
  */
 static inline bool
 radeon_cs_memory_below_limit(struct si_screen *screen,
-			     struct radeon_winsys_cs *cs,
+			     struct radeon_cmdbuf *cs,
 			     uint64_t vram, uint64_t gtt)
 {
 	vram += cs->used_vram;
@@ -1300,7 +1545,7 @@
  * rebuilt.
  */
 static inline void radeon_add_to_buffer_list(struct si_context *sctx,
-					     struct radeon_winsys_cs *cs,
+					     struct radeon_cmdbuf *cs,
 					     struct r600_resource *rbo,
 					     enum radeon_bo_usage usage,
 					     enum radeon_bo_priority priority)
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index d68a383..446edea 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -123,7 +123,7 @@
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	for (int i = 0; i < state->nbo; ++i) {
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs, state->bo[i],
@@ -167,7 +167,7 @@
 
 	r600_resource_reference(&state->indirect_buffer, NULL);
 	/* TODO: this hangs with 1024 or higher alignment on GFX9. */
-	state->indirect_buffer = (struct r600_resource*)
+	state->indirect_buffer =
 		si_aligned_buffer_create(screen, 0,
 					 PIPE_USAGE_DEFAULT, aligned_ndw * 4,
 					 256);
diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c
index 3de47c0..e29ce71 100644
--- a/src/gallium/drivers/radeonsi/si_query.c
+++ b/src/gallium/drivers/radeonsi/si_query.c
@@ -29,6 +29,7 @@
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 #include "util/os_time.h"
+#include "util/u_suballoc.h"
 #include "tgsi/tgsi_text.h"
 #include "amd/common/sid.h"
 
@@ -529,9 +530,9 @@
 	 * being written by the gpu, hence staging is probably a good
 	 * usage pattern.
 	 */
-	struct r600_resource *buf = (struct r600_resource*)
+	struct r600_resource *buf = r600_resource(
 		pipe_buffer_create(&sscreen->b, 0,
-				   PIPE_USAGE_STAGING, buf_size);
+				   PIPE_USAGE_STAGING, buf_size));
 	if (!buf)
 		return NULL;
 
@@ -730,7 +731,7 @@
 	}
 }
 
-static void emit_sample_streamout(struct radeon_winsys_cs *cs, uint64_t va,
+static void emit_sample_streamout(struct radeon_cmdbuf *cs, uint64_t va,
 				  unsigned stream)
 {
 	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
@@ -744,7 +745,7 @@
 					struct r600_resource *buffer,
 					uint64_t va)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	switch (query->b.type) {
 	case PIPE_QUERY_OCCLUSION_COUNTER:
@@ -828,7 +829,7 @@
 				       struct r600_resource *buffer,
 				       uint64_t va)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint64_t fence_va = 0;
 
 	switch (query->b.type) {
@@ -919,7 +920,7 @@
 			       struct r600_resource *buf, uint64_t va,
 			       uint32_t op)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	if (ctx->chip_class >= GFX9) {
 		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
@@ -935,8 +936,7 @@
 				  RADEON_PRIO_QUERY);
 }
 
-static void si_emit_query_predication(struct si_context *ctx,
-				      struct r600_atom *atom)
+static void si_emit_query_predication(struct si_context *ctx)
 {
 	struct si_query_hw *query = (struct si_query_hw *)ctx->render_cond;
 	struct si_query_buffer *qbuf;
@@ -1182,7 +1182,6 @@
 		break;
 	case PIPE_QUERY_PIPELINE_STATISTICS:
 	{
-		/* Offsets apply to EG+ */
 		static const unsigned offsets[] = {56, 48, 24, 32, 40, 16, 8, 0, 64, 72, 80};
 		params->start_offset = offsets[index];
 		params->end_offset = 88 + offsets[index];
@@ -1742,7 +1741,7 @@
 			ssbo[2].buffer_offset = offset;
 			ssbo[2].buffer_size = 8;
 
-			((struct r600_resource *)resource)->TC_L2_dirty = true;
+			r600_resource(resource)->TC_L2_dirty = true;
 		}
 
 		sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, ssbo);
@@ -1775,7 +1774,7 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_query_hw *rquery = (struct si_query_hw *)query;
-	struct r600_atom *atom = &sctx->render_cond_atom;
+	struct si_atom *atom = &sctx->atoms.s.render_cond;
 
 	if (query) {
 		bool needs_workaround = false;
@@ -1942,7 +1941,11 @@
 	X("GPU-db-busy",		GPU_DB_BUSY,		UINT64, AVERAGE),
 	X("GPU-cp-busy",		GPU_CP_BUSY,		UINT64, AVERAGE),
 	X("GPU-cb-busy",		GPU_CB_BUSY,		UINT64, AVERAGE),
+
+	/* SRBM_STATUS2 */
 	X("GPU-sdma-busy",		GPU_SDMA_BUSY,		UINT64, AVERAGE),
+
+	/* CP_STAT */
 	X("GPU-pfp-busy",		GPU_PFP_BUSY,		UINT64, AVERAGE),
 	X("GPU-meq-busy",		GPU_MEQ_BUSY,		UINT64, AVERAGE),
 	X("GPU-me-busy",		GPU_ME_BUSY,		UINT64, AVERAGE),
@@ -1957,16 +1960,23 @@
 
 static unsigned si_get_num_queries(struct si_screen *sscreen)
 {
-	if (sscreen->info.drm_major == 2 && sscreen->info.drm_minor >= 42)
-		return ARRAY_SIZE(si_driver_query_list);
-	else if (sscreen->info.drm_major == 3) {
+	/* amdgpu */
+	if (sscreen->info.drm_major == 3) {
 		if (sscreen->info.chip_class >= VI)
 			return ARRAY_SIZE(si_driver_query_list);
 		else
 			return ARRAY_SIZE(si_driver_query_list) - 7;
 	}
-	else
-		return ARRAY_SIZE(si_driver_query_list) - 25;
+
+	/* radeon */
+	if (sscreen->info.has_read_registers_query) {
+		if (sscreen->info.chip_class == CIK)
+			return ARRAY_SIZE(si_driver_query_list) - 6;
+		else
+			return ARRAY_SIZE(si_driver_query_list) - 7;
+	}
+
+	return ARRAY_SIZE(si_driver_query_list) - 21;
 }
 
 static int si_get_driver_query_info(struct pipe_screen *screen,
@@ -2052,7 +2062,7 @@
 	sctx->b.end_query = si_end_query;
 	sctx->b.get_query_result = si_get_query_result;
 	sctx->b.get_query_result_resource = si_get_query_result_resource;
-	sctx->render_cond_atom.emit = si_emit_query_predication;
+	sctx->atoms.s.render_cond.emit = si_emit_query_predication;
 
 	if (((struct si_screen*)sctx->b.screen)->info.num_render_backends > 0)
 	    sctx->b.render_condition = si_render_condition;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4eff4f5..0b25592 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -22,23 +22,15 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_gather.h"
-#include "gallivm/lp_bld_intr.h"
-#include "gallivm/lp_bld_logic.h"
-#include "gallivm/lp_bld_arit.h"
-#include "gallivm/lp_bld_flow.h"
-#include "gallivm/lp_bld_misc.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
 #include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
 
-#include "ac_binary.h"
-#include "ac_llvm_util.h"
 #include "ac_exp_param.h"
 #include "ac_shader_util.h"
+#include "ac_llvm_util.h"
 #include "si_shader_internal.h"
 #include "si_pipe.h"
 #include "sid.h"
@@ -77,7 +69,7 @@
 
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
-			       LLVMTargetMachineRef tm);
+			       struct ac_llvm_compiler *compiler);
 
 static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action,
 				 struct lp_build_tgsi_context *bld_base,
@@ -191,7 +183,8 @@
  * less than 64, so that a 64-bit bitmask of used inputs or outputs can be
  * calculated.
  */
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
+				       unsigned is_varying)
 {
 	switch (semantic_name) {
 	case TGSI_SEMANTIC_POSITION:
@@ -220,15 +213,24 @@
 		return SI_MAX_IO_GENERIC + 6;
 	case TGSI_SEMANTIC_PRIMID:
 		return SI_MAX_IO_GENERIC + 7;
-	case TGSI_SEMANTIC_COLOR: /* these alias */
-	case TGSI_SEMANTIC_BCOLOR:
+	case TGSI_SEMANTIC_COLOR:
 		assert(index < 2);
 		return SI_MAX_IO_GENERIC + 8 + index;
+	case TGSI_SEMANTIC_BCOLOR:
+		assert(index < 2);
+		/* If it's a varying, COLOR and BCOLOR alias. */
+		if (is_varying)
+			return SI_MAX_IO_GENERIC + 8 + index;
+		else
+			return SI_MAX_IO_GENERIC + 10 + index;
 	case TGSI_SEMANTIC_TEXCOORD:
 		assert(index < 8);
-		assert(SI_MAX_IO_GENERIC + 10 + index < 64);
-		return SI_MAX_IO_GENERIC + 10 + index;
+		STATIC_ASSERT(SI_MAX_IO_GENERIC + 12 + 8 <= 63);
+		return SI_MAX_IO_GENERIC + 12 + index;
+	case TGSI_SEMANTIC_CLIPVERTEX:
+		return 63;
 	default:
+		fprintf(stderr, "invalid semantic name = %u\n", semantic_name);
 		assert(!"invalid semantic name");
 		return 0;
 	}
@@ -343,21 +345,21 @@
 static LLVMValueRef
 get_tcs_out_patch0_offset(struct si_shader_context *ctx)
 {
-	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
-				si_unpack_param(ctx,
-					     ctx->param_tcs_out_lds_offsets,
-					     0, 16),
-				4);
+	return LLVMBuildMul(ctx->ac.builder,
+			    si_unpack_param(ctx,
+					    ctx->param_tcs_out_lds_offsets,
+					    0, 16),
+			    LLVMConstInt(ctx->i32, 4, 0), "");
 }
 
 static LLVMValueRef
 get_tcs_out_patch0_patch_data_offset(struct si_shader_context *ctx)
 {
-	return lp_build_mul_imm(&ctx->bld_base.uint_bld,
-				si_unpack_param(ctx,
-					     ctx->param_tcs_out_lds_offsets,
-					     16, 16),
-				4);
+	return LLVMBuildMul(ctx->ac.builder,
+			    si_unpack_param(ctx,
+					    ctx->param_tcs_out_lds_offsets,
+					    16, 16),
+			    LLVMConstInt(ctx->i32, 4, 0), "");
 }
 
 static LLVMValueRef
@@ -415,14 +417,14 @@
 
 	switch (ctx->type) {
 	case PIPE_SHADER_VERTEX:
-		stride = util_last_bit64(ctx->shader->selector->outputs_written);
-		return LLVMConstInt(ctx->i32, stride * 4, 0);
+		stride = ctx->shader->selector->lshs_vertex_stride / 4;
+		return LLVMConstInt(ctx->i32, stride, 0);
 
 	case PIPE_SHADER_TESS_CTRL:
 		if (ctx->screen->info.chip_class >= GFX9 &&
 		    ctx->shader->is_monolithic) {
-			stride = util_last_bit64(ctx->shader->key.part.tcs.ls->outputs_written);
-			return LLVMConstInt(ctx->i32, stride * 4, 0);
+			stride = ctx->shader->key.part.tcs.ls->lshs_vertex_stride / 4;
+			return LLVMConstInt(ctx->i32, stride, 0);
 		}
 		return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
 
@@ -860,7 +862,7 @@
 		si_shader_io_get_unique_index_patch(name[input_index],
 						    index[input_index]) :
 		si_shader_io_get_unique_index(name[input_index],
-					      index[input_index]);
+					      index[input_index], false);
 
 	/* Add the base address of the element. */
 	return LLVMBuildAdd(ctx->ac.builder, base_addr,
@@ -1015,7 +1017,7 @@
 
 	param_index_base = is_patch ?
 		si_shader_io_get_unique_index_patch(name[param_base], index[param_base]) :
-		si_shader_io_get_unique_index(name[param_base], index[param_base]);
+		si_shader_io_get_unique_index(name[param_base], index[param_base], false);
 
 	if (param_index) {
 		param_index = LLVMBuildAdd(ctx->ac.builder, param_index,
@@ -1138,7 +1140,7 @@
 		for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; chan++)
 			values[chan] = lds_load(bld_base, type, chan, dw_addr);
 
-		return lp_build_gather_values(&ctx->gallivm, values,
+		return ac_build_gather_values(&ctx->ac, values,
 					      TGSI_NUM_CHANNELS);
 	}
 
@@ -1151,8 +1153,8 @@
 		return si_llvm_emit_fetch_64bit(bld_base, type, lo, hi);
 	}
 
-	dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
-			    LLVMConstInt(ctx->i32, swizzle, 0));
+	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
+			       LLVMConstInt(ctx->i32, swizzle, 0), "");
 
 	value = ac_lds_load(&ctx->ac, dw_addr);
 
@@ -1170,8 +1172,8 @@
 		      unsigned dw_offset_imm, LLVMValueRef dw_addr,
 		      LLVMValueRef value)
 {
-	dw_addr = lp_build_add(&ctx->bld_base.uint_bld, dw_addr,
-			    LLVMConstInt(ctx->i32, dw_offset_imm, 0));
+	dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
+			       LLVMConstInt(ctx->i32, dw_offset_imm, 0), "");
 
 	ac_lds_store(&ctx->ac, dw_addr, value);
 }
@@ -1482,7 +1484,7 @@
 	}
 
 	if (reg->Register.WriteMask == 0xF && !is_tess_factor) {
-		LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
+		LLVMValueRef value = ac_build_gather_values(&ctx->ac,
 		                                            values, 4);
 		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, buf_addr,
 					    base, 0, 1, 0, true, false);
@@ -1598,7 +1600,7 @@
 	}
 
 	if (writemask == 0xF && !is_tess_factor) {
-		LLVMValueRef value = lp_build_gather_values(&ctx->gallivm,
+		LLVMValueRef value = ac_build_gather_values(&ctx->ac,
 		                                            values, 4);
 		ac_build_buffer_store_dword(&ctx->ac, buffer, value, 4, addr,
 					    base, 0, 1, 0, true, false);
@@ -1614,7 +1616,6 @@
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 	struct si_shader *shader = ctx->shader;
-	struct lp_build_context *uint =	&ctx->bld_base.uint_bld;
 	LLVMValueRef vtx_offset, soffset;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[input_index];
@@ -1622,7 +1623,7 @@
 	unsigned param;
 	LLVMValueRef value;
 
-	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index, false);
 
 	/* GFX9 has the ESGS ring in LDS. */
 	if (ctx->screen->info.chip_class >= GFX9) {
@@ -1659,14 +1660,15 @@
 			values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param,
 							     type, chan);
 		}
-		return lp_build_gather_values(&ctx->gallivm, values,
+		return ac_build_gather_values(&ctx->ac, values,
 					      TGSI_NUM_CHANNELS);
 	}
 
 	/* Get the vertex offset parameter on GFX6. */
 	LLVMValueRef gs_vtx_offset = ctx->gs_vtx_offset[vtx_offset_param];
 
-	vtx_offset = lp_build_mul_imm(uint, gs_vtx_offset, 4);
+	vtx_offset = LLVMBuildMul(ctx->ac.builder, gs_vtx_offset,
+				  LLVMConstInt(ctx->i32, 4, 0), "");
 
 	soffset = LLVMConstInt(ctx->i32, (param * 4 + swizzle) * 256, 0);
 
@@ -1881,7 +1883,6 @@
 	unsigned input_index,
 	LLVMValueRef out[4])
 {
-	struct lp_build_context *base = &ctx->bld_base.base;
 	struct si_shader *shader = ctx->shader;
 	struct tgsi_shader_info *info = &shader->selector->info;
 	LLVMValueRef main_fn = ctx->main_fn;
@@ -1898,11 +1899,12 @@
 		unsigned mask = colors_read >> (semantic_index * 4);
 		unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
 				  (semantic_index ? util_bitcount(colors_read & 0xf) : 0);
+		LLVMValueRef undef = LLVMGetUndef(ctx->f32);
 
-		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
-		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
+		out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : undef;
+		out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : undef;
+		out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : undef;
+		out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : undef;
 		return;
 	}
 
@@ -1973,7 +1975,7 @@
 		for (i = 0; i < 3; ++i)
 			values[i] = LLVMConstInt(ctx->i32, sizes[i], 0);
 
-		result = lp_build_gather_values(&ctx->gallivm, values, 3);
+		result = ac_build_gather_values(&ctx->ac, values, 3);
 	} else {
 		result = LLVMGetParam(ctx->main_fn, ctx->param_block_size);
 	}
@@ -1995,13 +1997,12 @@
 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi, LLVMValueRef sample_id)
 {
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-	struct lp_build_context *uint_bld = &ctx->bld_base.uint_bld;
 	LLVMValueRef desc = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers);
 	LLVMValueRef buf_index = LLVMConstInt(ctx->i32, SI_PS_CONST_SAMPLE_POSITIONS, 0);
 	LLVMValueRef resource = ac_build_load_to_sgpr(&ctx->ac, desc, buf_index);
 
 	/* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
-	LLVMValueRef offset0 = lp_build_mul_imm(uint_bld, sample_id, 8);
+	LLVMValueRef offset0 = LLVMBuildMul(ctx->ac.builder, sample_id, LLVMConstInt(ctx->i32, 8, 0), "");
 	LLVMValueRef offset1 = LLVMBuildAdd(ctx->ac.builder, offset0, LLVMConstInt(ctx->i32, 4, 0), "");
 
 	LLVMValueRef pos[4] = {
@@ -2011,7 +2012,7 @@
 		LLVMConstReal(ctx->f32, 0)
 	};
 
-	return lp_build_gather_values(&ctx->gallivm, pos, 4);
+	return ac_build_gather_values(&ctx->ac, pos, 4);
 }
 
 static LLVMValueRef load_sample_mask_in(struct ac_shader_abi *abi)
@@ -2023,8 +2024,6 @@
 static LLVMValueRef si_load_tess_coord(struct ac_shader_abi *abi)
 {
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
-	struct lp_build_context *bld = &ctx->bld_base.base;
-
 	LLVMValueRef coord[4] = {
 		LLVMGetParam(ctx->main_fn, ctx->param_tes_u),
 		LLVMGetParam(ctx->main_fn, ctx->param_tes_v),
@@ -2034,11 +2033,12 @@
 
 	/* For triangles, the vector should be (u, v, 1-u-v). */
 	if (ctx->shader->selector->info.properties[TGSI_PROPERTY_TES_PRIM_MODE] ==
-	    PIPE_PRIM_TRIANGLES)
-		coord[2] = lp_build_sub(bld, ctx->ac.f32_1,
-					lp_build_add(bld, coord[0], coord[1]));
-
-	return lp_build_gather_values(&ctx->gallivm, coord, 4);
+	    PIPE_PRIM_TRIANGLES) {
+		coord[2] = LLVMBuildFSub(ctx->ac.builder, ctx->ac.f32_1,
+					 LLVMBuildFAdd(ctx->ac.builder,
+						       coord[0], coord[1], ""), "");
+	}
+	return ac_build_gather_values(&ctx->ac, coord, 4);
 }
 
 static LLVMValueRef load_tess_level(struct si_shader_context *ctx,
@@ -2141,11 +2141,10 @@
 			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_X_FLOAT),
 			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Y_FLOAT),
 			LLVMGetParam(ctx->main_fn, SI_PARAM_POS_Z_FLOAT),
-			lp_build_emit_llvm_unary(&ctx->bld_base, TGSI_OPCODE_RCP,
-						 LLVMGetParam(ctx->main_fn,
-							      SI_PARAM_POS_W_FLOAT)),
+			ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
+				      LLVMGetParam(ctx->main_fn, SI_PARAM_POS_W_FLOAT)),
 		};
-		value = lp_build_gather_values(&ctx->gallivm, pos, 4);
+		value = ac_build_gather_values(&ctx->ac, pos, 4);
 		break;
 	}
 
@@ -2164,11 +2163,9 @@
 			LLVMConstReal(ctx->f32, 0),
 			LLVMConstReal(ctx->f32, 0)
 		};
-		pos[0] = lp_build_emit_llvm_unary(&ctx->bld_base,
-						  TGSI_OPCODE_FRC, pos[0]);
-		pos[1] = lp_build_emit_llvm_unary(&ctx->bld_base,
-						  TGSI_OPCODE_FRC, pos[1]);
-		value = lp_build_gather_values(&ctx->gallivm, pos, 4);
+		pos[0] = ac_build_fract(&ctx->ac, pos[0], 32);
+		pos[1] = ac_build_fract(&ctx->ac, pos[1], 32);
+		value = ac_build_gather_values(&ctx->ac, pos, 4);
 		break;
 	}
 
@@ -2206,7 +2203,7 @@
 		for (i = 0; i < 4; i++)
 			val[i] = buffer_load_const(ctx, buf,
 						   LLVMConstInt(ctx->i32, (offset + i) * 4, 0));
-		value = lp_build_gather_values(&ctx->gallivm, val, 4);
+		value = ac_build_gather_values(&ctx->ac, val, 4);
 		break;
 	}
 
@@ -2232,7 +2229,7 @@
 				values[i] = ctx->abi.workgroup_ids[i];
 			}
 		}
-		value = lp_build_gather_values(&ctx->gallivm, values, 3);
+		value = ac_build_gather_values(&ctx->ac, values, 3);
 		break;
 	}
 
@@ -2241,10 +2238,10 @@
 		break;
 
 	case TGSI_SEMANTIC_HELPER_INVOCATION:
-		value = lp_build_intrinsic(ctx->ac.builder,
+		value = ac_build_intrinsic(&ctx->ac,
 					   "llvm.amdgcn.ps.live",
 					   ctx->i1, NULL, 0,
-					   LP_FUNC_ATTR_READNONE);
+					   AC_FUNC_ATTR_READNONE);
 		value = LLVMBuildNot(ctx->ac.builder, value, "");
 		value = LLVMBuildSExt(ctx->ac.builder, value, ctx->i32, "");
 		break;
@@ -2300,6 +2297,7 @@
 void si_declare_compute_memory(struct si_shader_context *ctx)
 {
 	struct si_shader_selector *sel = ctx->shader->selector;
+	unsigned lds_size = sel->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE];
 
 	LLVMTypeRef i8p = LLVMPointerType(ctx->i8, AC_LOCAL_ADDR_SPACE);
 	LLVMValueRef var;
@@ -2307,7 +2305,7 @@
 	assert(!ctx->ac.lds);
 
 	var = LLVMAddGlobalInAddressSpace(ctx->ac.module,
-	                                  LLVMArrayType(ctx->i8, sel->local_size),
+	                                  LLVMArrayType(ctx->i8, lds_size),
 	                                  "compute_lds",
 	                                  AC_LOCAL_ADDR_SPACE);
 	LLVMSetAlignment(var, 4);
@@ -2429,7 +2427,7 @@
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan)
 			values[chan] = fetch_constant(bld_base, reg, type, chan);
 
-		return lp_build_gather_values(&ctx->gallivm, values, 4);
+		return ac_build_gather_values(&ctx->ac, values, 4);
 	}
 
 	/* Split 64-bit loads. */
@@ -2662,9 +2660,9 @@
 				samplemask_param);
 	coverage = ac_to_integer(&ctx->ac, coverage);
 
-	coverage = lp_build_intrinsic(ctx->ac.builder, "llvm.ctpop.i32",
+	coverage = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32",
 				   ctx->i32,
-				   &coverage, 1, LP_FUNC_ATTR_READNONE);
+				   &coverage, 1, AC_FUNC_ATTR_READNONE);
 
 	coverage = LLVMBuildUIToFP(ctx->ac.builder, coverage,
 				   ctx->f32, "");
@@ -2705,9 +2703,9 @@
 				base_elt = buffer_load_const(ctx, const_resource,
 							     addr);
 				args->out[chan] =
-					lp_build_add(&ctx->bld_base.base, args->out[chan],
-						     lp_build_mul(&ctx->bld_base.base, base_elt,
-								  out_elts[const_chan]));
+					LLVMBuildFAdd(ctx->ac.builder, args->out[chan],
+						      LLVMBuildFMul(ctx->ac.builder, base_elt,
+								    out_elts[const_chan], ""), "");
 			}
 		}
 
@@ -2916,7 +2914,8 @@
 		if ((semantic_name != TGSI_SEMANTIC_GENERIC ||
 		     semantic_index < SI_MAX_IO_GENERIC) &&
 		    shader->key.opt.kill_outputs &
-		    (1ull << si_shader_io_get_unique_index(semantic_name, semantic_index)))
+		    (1ull << si_shader_io_get_unique_index(semantic_name,
+							   semantic_index, true)))
 			continue;
 
 		si_export_param(ctx, param_count, outputs[i].values);
@@ -3214,11 +3213,11 @@
 	}
 
 	/* Convert the outputs to vectors for stores. */
-	vec0 = lp_build_gather_values(&ctx->gallivm, out, MIN2(stride, 4));
+	vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
 	vec1 = NULL;
 
 	if (stride > 4)
-		vec1 = lp_build_gather_values(&ctx->gallivm, out+4, stride - 4);
+		vec1 = ac_build_gather_values(&ctx->ac, out+4, stride - 4);
 
 	/* Get the buffer. */
 	buffer = get_tess_ring_descriptor(ctx, TCS_FACTOR_RING);
@@ -3269,7 +3268,7 @@
 		tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
 					LLVMConstInt(ctx->i32, param_outer, 0));
 
-		outer_vec = lp_build_gather_values(&ctx->gallivm, outer,
+		outer_vec = ac_build_gather_values(&ctx->ac, outer,
 						   util_next_power_of_two(outer_comps));
 
 		ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec,
@@ -3282,7 +3281,7 @@
 					LLVMConstInt(ctx->i32, param_inner, 0));
 
 			inner_vec = inner_comps == 1 ? inner[0] :
-				    lp_build_gather_values(&ctx->gallivm, inner, inner_comps);
+				    ac_build_gather_values(&ctx->ac, inner, inner_comps);
 			ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec,
 						    inner_comps, tf_inner_offset,
 						    base, 0, 1, 0, true, false);
@@ -3450,7 +3449,7 @@
 				  8 + SI_SGPR_VS_STATE_BITS);
 
 #if !HAVE_32BIT_POINTERS
-	ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1,
+	ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
 				  8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
 #endif
 
@@ -3490,7 +3489,7 @@
 				  8 + SI_SGPR_BINDLESS_SAMPLERS_AND_IMAGES);
 
 #if !HAVE_32BIT_POINTERS
-	ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 1,
+	ret = si_insert_input_ptr(ctx, ret, ctx->param_vs_state_bits + 4,
 				  8 + GFX9_SGPR_2ND_SAMPLERS_AND_IMAGES);
 #endif
 
@@ -3546,7 +3545,7 @@
 		    name == TGSI_SEMANTIC_VIEWPORT_INDEX)
 			continue;
 
-		int param = si_shader_io_get_unique_index(name, index);
+		int param = si_shader_io_get_unique_index(name, index, false);
 		LLVMValueRef dw_addr = LLVMBuildAdd(ctx->ac.builder, base_dw_addr,
 					LLVMConstInt(ctx->i32, param * 4, 0), "");
 
@@ -3595,9 +3594,12 @@
 			continue;
 
 		param = si_shader_io_get_unique_index(info->output_semantic_name[i],
-						      info->output_semantic_index[i]);
+						      info->output_semantic_index[i], false);
 
 		for (chan = 0; chan < 4; chan++) {
+			if (!(info->output_usagemask[i] & (1 << chan)))
+				continue;
+
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], "");
 			out_val = ac_to_integer(&ctx->ac, out_val);
 
@@ -3674,38 +3676,36 @@
 	 * an IF statement is added that clamps all colors if the constant
 	 * is true.
 	 */
-	if (ctx->type == PIPE_SHADER_VERTEX) {
-		struct lp_build_if_state if_ctx;
-		LLVMValueRef cond = NULL;
-		LLVMValueRef addr, val;
+	struct lp_build_if_state if_ctx;
+	LLVMValueRef cond = NULL;
+	LLVMValueRef addr, val;
 
-		for (i = 0; i < info->num_outputs; i++) {
-			if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
-			    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
-				continue;
+	for (i = 0; i < info->num_outputs; i++) {
+		if (info->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
+		    info->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
+			continue;
 
-			/* We've found a color. */
-			if (!cond) {
-				/* The state is in the first bit of the user SGPR. */
-				cond = LLVMGetParam(ctx->main_fn,
-						    ctx->param_vs_state_bits);
-				cond = LLVMBuildTrunc(ctx->ac.builder, cond,
-						      ctx->i1, "");
-				lp_build_if(&if_ctx, &ctx->gallivm, cond);
-			}
-
-			for (j = 0; j < 4; j++) {
-				addr = addrs[4 * i + j];
-				val = LLVMBuildLoad(ctx->ac.builder, addr, "");
-				val = ac_build_clamp(&ctx->ac, val);
-				LLVMBuildStore(ctx->ac.builder, val, addr);
-			}
+		/* We've found a color. */
+		if (!cond) {
+			/* The state is in the first bit of the user SGPR. */
+			cond = LLVMGetParam(ctx->main_fn,
+					    ctx->param_vs_state_bits);
+			cond = LLVMBuildTrunc(ctx->ac.builder, cond,
+					      ctx->i1, "");
+			lp_build_if(&if_ctx, &ctx->gallivm, cond);
 		}
 
-		if (cond)
-			lp_build_endif(&if_ctx);
+		for (j = 0; j < 4; j++) {
+			addr = addrs[4 * i + j];
+			val = LLVMBuildLoad(ctx->ac.builder, addr, "");
+			val = ac_build_clamp(&ctx->ac, val);
+			LLVMBuildStore(ctx->ac.builder, val, addr);
+		}
 	}
 
+	if (cond)
+		lp_build_endif(&if_ctx);
+
 	for (i = 0; i < info->num_outputs; i++) {
 		outputs[i].semantic_name = info->output_semantic_name[i];
 		outputs[i].semantic_index = info->output_semantic_index[i];
@@ -4017,11 +4017,13 @@
 	for (i = 0; i < 2; i++) {
 		a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
 					    LLVMConstInt(ctx->i32, i, 0), "");
-		result[i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDX, a);
-		result[2+i] = lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_DDY, a);
+		result[i] = ac_build_ddxy(&ctx->ac, AC_TID_MASK_TOP_LEFT, 1,
+					  ac_to_integer(&ctx->ac, a)); /* DDX */
+		result[2+i] = ac_build_ddxy(&ctx->ac, AC_TID_MASK_TOP_LEFT, 2,
+					    ac_to_integer(&ctx->ac, a)); /* DDY */
 	}
 
-	return lp_build_gather_values(&ctx->gallivm, result, 4);
+	return ac_build_gather_values(&ctx->ac, result, 4);
 }
 
 static void interp_fetch_args(
@@ -4074,7 +4076,7 @@
 				ctx->ac.f32_0,
 			};
 
-			sample_position = lp_build_gather_values(&ctx->gallivm, center, 4);
+			sample_position = ac_build_gather_values(&ctx->ac, center, 4);
 		} else {
 			sample_position = load_sample_position(&ctx->abi, sample_id);
 		}
@@ -4182,7 +4184,7 @@
 
 			ij_out[i] = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
 		}
-		interp_param = lp_build_gather_values(&ctx->gallivm, ij_out, 2);
+		interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
 	}
 
 	if (interp_param)
@@ -4323,7 +4325,6 @@
 {
 	struct si_shader_context *ctx = si_shader_context_from_abi(abi);
 	struct tgsi_shader_info *info = &ctx->shader->selector->info;
-	struct lp_build_context *uint = &ctx->bld_base.uint_bld;
 	struct si_shader *shader = ctx->shader;
 	struct lp_build_if_state if_state;
 	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
@@ -4370,8 +4371,9 @@
 					     shader->selector->gs_max_out_vertices, 0);
 			offset++;
 
-			voffset = lp_build_add(uint, voffset, gs_next_vertex);
-			voffset = lp_build_mul_imm(uint, voffset, 4);
+			voffset = LLVMBuildAdd(ctx->ac.builder, voffset, gs_next_vertex, "");
+			voffset = LLVMBuildMul(ctx->ac.builder, voffset,
+					       LLVMConstInt(ctx->i32, 4, 0), "");
 
 			out_val = ac_to_integer(&ctx->ac, out_val);
 
@@ -4383,14 +4385,15 @@
 		}
 	}
 
-	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
-				      ctx->i32_1);
-
+	gs_next_vertex = LLVMBuildAdd(ctx->ac.builder, gs_next_vertex, ctx->i32_1, "");
 	LLVMBuildStore(ctx->ac.builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
 
-	/* Signal vertex emission */
-	ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
-			 si_get_gs_wave_id(ctx));
+	/* Signal vertex emission if vertex data was written. */
+	if (offset) {
+		ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (stream << 8),
+				 si_get_gs_wave_id(ctx));
+	}
+
 	if (!use_kill)
 		lp_build_endif(&if_state);
 }
@@ -4445,9 +4448,9 @@
 		return;
 	}
 
-	lp_build_intrinsic(ctx->ac.builder,
+	ac_build_intrinsic(&ctx->ac,
 			   "llvm.amdgcn.s.barrier",
-			   ctx->voidt, NULL, 0, LP_FUNC_ATTR_CONVERGENT);
+			   ctx->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
 }
 
 static const struct lp_build_tgsi_action interp_action = {
@@ -4477,10 +4480,12 @@
 		 * allows the optimization passes to move loads and reduces
 		 * SGPR spilling significantly.
 		 */
-		lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG);
+		ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
+				     AC_FUNC_ATTR_INREG);
 
 		if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
-			lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_NOALIAS);
+			ac_add_function_attr(ctx->ac.context, ctx->main_fn, i + 1,
+					     AC_FUNC_ATTR_NOALIAS);
 			ac_add_attr_dereferenceable(P, UINT64_MAX);
 		}
 	}
@@ -4631,10 +4636,10 @@
 static void declare_vs_specific_input_sgprs(struct si_shader_context *ctx,
 					    struct si_function_info *fninfo)
 {
+	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
 	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.base_vertex);
 	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.start_instance);
 	add_arg_assign(fninfo, ARG_SGPR, ctx->i32, &ctx->abi.draw_id);
-	ctx->param_vs_state_bits = add_arg(fninfo, ARG_SGPR, ctx->i32);
 }
 
 static void declare_vs_input_vgprs(struct si_shader_context *ctx,
@@ -4741,7 +4746,7 @@
 			/* no extra parameters */
 		} else {
 			if (shader->is_gs_copy_shader) {
-				fninfo.num_params = ctx->param_rw_buffers + 1;
+				fninfo.num_params = ctx->param_vs_state_bits + 1;
 				fninfo.num_sgpr_params = fninfo.num_params;
 			}
 
@@ -4862,13 +4867,12 @@
 		if (ctx->type == PIPE_SHADER_VERTEX) {
 			declare_vs_specific_input_sgprs(ctx, &fninfo);
 		} else {
+			ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 			ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 			ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
-			if (!HAVE_32BIT_POINTERS) {
-				/* Declare as many input SGPRs as the VS has. */
+			/* Declare as many input SGPRs as the VS has. */
+			if (!HAVE_32BIT_POINTERS)
 				add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
-				ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32); /* unused */
-			}
 		}
 
 		if (!HAVE_32BIT_POINTERS) {
@@ -4914,6 +4918,7 @@
 	case PIPE_SHADER_TESS_EVAL:
 		declare_global_desc_pointers(ctx, &fninfo);
 		declare_per_stage_desc_pointers(ctx, &fninfo, true);
+		ctx->param_vs_state_bits = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 		ctx->param_tcs_offchip_layout = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 		ctx->param_tes_offchip_addr = add_arg(&fninfo, ARG_SGPR, ctx->i32);
 
@@ -5039,8 +5044,7 @@
 			   si_get_max_workgroup_size(shader));
 
 	/* Reserve register locations for VGPR inputs the PS prolog may need. */
-	if (ctx->type == PIPE_SHADER_FRAGMENT &&
-	    ctx->separate_prolog) {
+	if (ctx->type == PIPE_SHADER_FRAGMENT && !ctx->shader->is_monolithic) {
 		ac_llvm_add_target_dep_function_attr(ctx->main_fn,
 						     "InitialPSInputAddr",
 						     S_0286D0_PERSP_SAMPLE_ENA(1) |
@@ -5352,8 +5356,7 @@
 	assert(!epilog || !epilog->rodata_size);
 
 	r600_resource_reference(&shader->bo, NULL);
-	shader->bo = (struct r600_resource*)
-		     si_aligned_buffer_create(&sscreen->b,
+	shader->bo = si_aligned_buffer_create(&sscreen->b,
 					      sscreen->cpdma_prefetch_writes_memory ?
 						0 : SI_RESOURCE_FLAG_READ_ONLY,
                                               PIPE_USAGE_IMMUTABLE,
@@ -5459,17 +5462,7 @@
 	unsigned lds_per_wave = 0;
 	unsigned max_simd_waves;
 
-	switch (sscreen->info.family) {
-	/* These always have 8 waves: */
-	case CHIP_POLARIS10:
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-	case CHIP_VEGAM:
-		max_simd_waves = 8;
-		break;
-	default:
-		max_simd_waves = 10;
-	}
+	max_simd_waves = ac_get_max_simd_waves(sscreen->info.family);
 
 	/* Compute LDS usage for PS. */
 	switch (shader->selector->type) {
@@ -5651,11 +5644,12 @@
 static int si_compile_llvm(struct si_screen *sscreen,
 			   struct ac_shader_binary *binary,
 			   struct si_shader_config *conf,
-			   LLVMTargetMachineRef tm,
+			   struct ac_llvm_compiler *compiler,
 			   LLVMModuleRef mod,
 			   struct pipe_debug_callback *debug,
 			   unsigned processor,
-			   const char *name)
+			   const char *name,
+			   bool less_optimized)
 {
 	int r = 0;
 	unsigned count = p_atomic_inc_return(&sscreen->num_compilations);
@@ -5677,7 +5671,8 @@
 	}
 
 	if (!si_replace_shader(count, binary)) {
-		r = si_llvm_compile(mod, binary, tm, debug);
+		r = si_llvm_compile(mod, binary, compiler, debug,
+				    less_optimized);
 		if (r)
 			return r;
 	}
@@ -5729,29 +5724,21 @@
 /* Generate code for the hardware VS shader stage to go with a geometry shader */
 struct si_shader *
 si_generate_gs_copy_shader(struct si_screen *sscreen,
-			   LLVMTargetMachineRef tm,
+			   struct ac_llvm_compiler *compiler,
 			   struct si_shader_selector *gs_selector,
 			   struct pipe_debug_callback *debug)
 {
 	struct si_shader_context ctx;
 	struct si_shader *shader;
 	LLVMBuilderRef builder;
-	struct lp_build_tgsi_context *bld_base = &ctx.bld_base;
-	struct lp_build_context *uint = &bld_base->uint_bld;
-	struct si_shader_output_values *outputs;
+	struct si_shader_output_values outputs[SI_MAX_VS_OUTPUTS];
 	struct tgsi_shader_info *gsinfo = &gs_selector->info;
 	int i, r;
 
-	outputs = MALLOC(gsinfo->num_outputs * sizeof(outputs[0]));
-
-	if (!outputs)
-		return NULL;
 
 	shader = CALLOC_STRUCT(si_shader);
-	if (!shader) {
-		FREE(outputs);
+	if (!shader)
 		return NULL;
-	}
 
 	/* We can leave the fence as permanently signaled because the GS copy
 	 * shader only becomes visible globally after it has been compiled. */
@@ -5760,7 +5747,7 @@
 	shader->selector = gs_selector;
 	shader->is_gs_copy_shader = true;
 
-	si_init_shader_ctx(&ctx, sscreen, tm);
+	si_init_shader_ctx(&ctx, sscreen, compiler);
 	ctx.shader = shader;
 	ctx.type = PIPE_SHADER_VERTEX;
 
@@ -5770,7 +5757,8 @@
 	preload_ring_buffers(&ctx);
 
 	LLVMValueRef voffset =
-		lp_build_mul_imm(uint, ctx.abi.vertex_id, 4);
+		LLVMBuildMul(ctx.ac.builder, ctx.abi.vertex_id,
+			     LLVMConstInt(ctx.i32, 4, 0), "");
 
 	/* Fetch the vertex stream ID.*/
 	LLVMValueRef stream_id;
@@ -5817,7 +5805,7 @@
 			for (unsigned chan = 0; chan < 4; chan++) {
 				if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
 				    outputs[i].vertex_stream[chan] != stream) {
-					outputs[i].values[chan] = ctx.bld_base.base.undef;
+					outputs[i].values[chan] = LLVMGetUndef(ctx.f32);
 					continue;
 				}
 
@@ -5841,8 +5829,51 @@
 					       stream);
 		}
 
-		if (stream == 0)
+		if (stream == 0) {
+			/* Vertex color clamping.
+			 *
+			 * This uses a state constant loaded in a user data SGPR and
+			 * an IF statement is added that clamps all colors if the constant
+			 * is true.
+			 */
+			struct lp_build_if_state if_ctx;
+			LLVMValueRef v[2], cond = NULL;
+			LLVMBasicBlockRef blocks[2];
+
+			for (unsigned i = 0; i < gsinfo->num_outputs; i++) {
+				if (gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_COLOR &&
+				    gsinfo->output_semantic_name[i] != TGSI_SEMANTIC_BCOLOR)
+					continue;
+
+				/* We've found a color. */
+				if (!cond) {
+					/* The state is in the first bit of the user SGPR. */
+					cond = LLVMGetParam(ctx.main_fn,
+							    ctx.param_vs_state_bits);
+					cond = LLVMBuildTrunc(ctx.ac.builder, cond,
+							      ctx.i1, "");
+					lp_build_if(&if_ctx, &ctx.gallivm, cond);
+					/* Remember blocks for Phi. */
+					blocks[0] = if_ctx.true_block;
+					blocks[1] = if_ctx.entry_block;
+				}
+
+				for (unsigned j = 0; j < 4; j++) {
+					/* Insert clamp into the true block. */
+					v[0] = ac_build_clamp(&ctx.ac, outputs[i].values[j]);
+					v[1] = outputs[i].values[j];
+
+					/* Insert Phi into the endif block. */
+					LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.merge_block);
+					outputs[i].values[j] = ac_build_phi(&ctx.ac, ctx.f32, 2, v, blocks);
+					LLVMPositionBuilderAtEnd(ctx.ac.builder, if_ctx.true_block);
+				}
+			}
+			if (cond)
+				lp_build_endif(&if_ctx);
+
 			si_llvm_export_vs(&ctx, outputs, gsinfo->num_outputs);
+		}
 
 		LLVMBuildBr(builder, end_bb);
 	}
@@ -5855,10 +5886,10 @@
 	si_llvm_optimize_module(&ctx);
 
 	r = si_compile_llvm(sscreen, &ctx.shader->binary,
-			    &ctx.shader->config, ctx.tm,
-			    ctx.gallivm.module,
+			    &ctx.shader->config, ctx.compiler,
+			    ctx.ac.module,
 			    debug, PIPE_SHADER_GEOMETRY,
-			    "GS Copy Shader");
+			    "GS Copy Shader", false);
 	if (!r) {
 		if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY))
 			fprintf(stderr, "GS Copy Shader:\n");
@@ -5869,8 +5900,6 @@
 
 	si_llvm_dispose(&ctx);
 
-	FREE(outputs);
-
 	if (r != 0) {
 		FREE(shader);
 		shader = NULL;
@@ -5977,11 +6006,11 @@
 
 static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       struct si_screen *sscreen,
-			       LLVMTargetMachineRef tm)
+			       struct ac_llvm_compiler *compiler)
 {
 	struct lp_build_tgsi_context *bld_base;
 
-	si_llvm_context_init(ctx, sscreen, tm);
+	si_llvm_context_init(ctx, sscreen, compiler);
 
 	bld_base = &ctx->bld_base;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant;
@@ -6039,9 +6068,9 @@
 		LLVMGetParam(ctx->main_fn, param),
 		LLVMConstInt(ctx->i32, bitoffset, 0),
 	};
-	lp_build_intrinsic(ctx->ac.builder,
+	ac_build_intrinsic(&ctx->ac,
 			   "llvm.amdgcn.init.exec.from.input",
-			   ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT);
+			   ctx->voidt, args, 2, AC_FUNC_ATTR_CONVERGENT);
 }
 
 static bool si_vs_needs_prolog(const struct si_shader_selector *sel,
@@ -6052,8 +6081,7 @@
 	return sel->vs_needs_prolog || key->ls_vgpr_fix;
 }
 
-static bool si_compile_tgsi_main(struct si_shader_context *ctx,
-				 bool is_monolithic)
+static bool si_compile_tgsi_main(struct si_shader_context *ctx)
 {
 	struct si_shader *shader = ctx->shader;
 	struct si_shader_selector *sel = shader->selector;
@@ -6138,7 +6166,7 @@
 	 * if-block together with its prolog in si_build_wrapper_function.
 	 */
 	if (ctx->screen->info.chip_class >= GFX9) {
-		if (!is_monolithic &&
+		if (!shader->is_monolithic &&
 		    sel->info.num_instructions > 1 && /* not empty shader */
 		    (shader->key.as_es || shader->key.as_ls) &&
 		    (ctx->type == PIPE_SHADER_TESS_EVAL ||
@@ -6148,19 +6176,27 @@
 						ctx->param_merged_wave_info, 0);
 		} else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
 			   ctx->type == PIPE_SHADER_GEOMETRY) {
-			if (!is_monolithic)
+			if (!shader->is_monolithic)
 				ac_init_exec_full_mask(&ctx->ac);
 
-			/* The barrier must execute for all shaders in a
-			 * threadgroup.
-			 */
-			si_llvm_emit_barrier(NULL, bld_base, NULL);
-
 			LLVMValueRef num_threads = si_unpack_param(ctx, ctx->param_merged_wave_info, 8, 8);
 			LLVMValueRef ena =
 				LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
 					    ac_get_thread_id(&ctx->ac), num_threads, "");
 			lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena);
+
+			/* The barrier must execute for all shaders in a
+			 * threadgroup.
+			 *
+			 * Execute the barrier inside the conditional block,
+			 * so that empty waves can jump directly to s_endpgm,
+			 * which will also signal the barrier.
+			 *
+			 * If the shader is TCS and the TCS epilog is present
+			 * and contains a barrier, it will wait there and then
+			 * reach s_endpgm.
+			 */
+			si_llvm_emit_barrier(NULL, bld_base, NULL);
 		}
 	}
 
@@ -6168,7 +6204,7 @@
 	    sel->tcs_info.tessfactors_are_def_in_all_invocs) {
 		for (unsigned i = 0; i < 6; i++) {
 			ctx->invoc0_tess_factors[i] =
-				lp_build_alloca_undef(&ctx->gallivm, ctx->i32, "");
+				ac_build_alloca_undef(&ctx->ac, ctx->i32, "");
 		}
 	}
 
@@ -6176,13 +6212,12 @@
 		int i;
 		for (i = 0; i < 4; i++) {
 			ctx->gs_next_vertex[i] =
-				lp_build_alloca(&ctx->gallivm,
-						ctx->i32, "");
+				ac_build_alloca(&ctx->ac, ctx->i32, "");
 		}
 	}
 
 	if (sel->force_correct_derivs_after_kill) {
-		ctx->postponed_kill = lp_build_alloca_undef(&ctx->gallivm, ctx->i1, "");
+		ctx->postponed_kill = ac_build_alloca_undef(&ctx->ac, ctx->i1, "");
 		/* true = don't kill. */
 		LLVMBuildStore(ctx->ac.builder, LLVMConstInt(ctx->i1, 1, 0),
 			       ctx->postponed_kill);
@@ -6549,7 +6584,8 @@
 	si_init_function_info(&fninfo);
 
 	for (unsigned i = 0; i < num_parts; ++i) {
-		lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
+		ac_add_function_attr(ctx->ac.context, parts[i], -1,
+				     AC_FUNC_ATTR_ALWAYSINLINE);
 		LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
 	}
 
@@ -6676,9 +6712,10 @@
 			param_size = ac_get_type_size(param_type) / 4;
 			is_sgpr = ac_is_sgpr_param(param);
 
-			if (is_sgpr)
-				lp_add_function_attr(parts[part], param_idx + 1, LP_FUNC_ATTR_INREG);
-			else if (out_idx < num_out_sgpr) {
+			if (is_sgpr) {
+				ac_add_function_attr(ctx->ac.context, parts[part],
+						     param_idx + 1, AC_FUNC_ATTR_INREG);
+			} else if (out_idx < num_out_sgpr) {
 				/* Skip returned SGPRs the current part doesn't
 				 * declare on the input. */
 				out_idx = num_out_sgpr;
@@ -6689,7 +6726,7 @@
 			if (param_size == 1)
 				arg = out[out_idx];
 			else
-				arg = lp_build_gather_values(&ctx->gallivm, &out[out_idx], param_size);
+				arg = ac_build_gather_values(&ctx->ac, &out[out_idx], param_size);
 
 			if (LLVMTypeOf(arg) != param_type) {
 				if (LLVMGetTypeKind(param_type) == LLVMPointerTypeKind) {
@@ -6758,10 +6795,25 @@
 	LLVMBuildRetVoid(builder);
 }
 
+static bool si_should_optimize_less(struct ac_llvm_compiler *compiler,
+				    struct si_shader_selector *sel)
+{
+	if (!compiler->low_opt_passes)
+		return false;
+
+	/* Assume a slow CPU. */
+	assert(!sel->screen->info.has_dedicated_vram &&
+	       sel->screen->info.chip_class <= VI);
+
+	/* For a crazy dEQP test containing 2597 memory opcodes, mostly
+	 * buffer stores. */
+	return sel->type == PIPE_SHADER_COMPUTE &&
+	       sel->info.num_memory_instructions > 1000;
+}
+
 int si_compile_tgsi_shader(struct si_screen *sscreen,
-			   LLVMTargetMachineRef tm,
+			   struct ac_llvm_compiler *compiler,
 			   struct si_shader *shader,
-			   bool is_monolithic,
 			   struct pipe_debug_callback *debug)
 {
 	struct si_shader_selector *sel = shader->selector;
@@ -6779,21 +6831,20 @@
 		si_dump_streamout(&sel->so);
 	}
 
-	si_init_shader_ctx(&ctx, sscreen, tm);
+	si_init_shader_ctx(&ctx, sscreen, compiler);
 	si_llvm_context_set_tgsi(&ctx, shader);
-	ctx.separate_prolog = !is_monolithic;
 
 	memset(shader->info.vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
 	       sizeof(shader->info.vs_output_param_offset));
 
 	shader->info.uses_instanceid = sel->info.uses_instanceid;
 
-	if (!si_compile_tgsi_main(&ctx, is_monolithic)) {
+	if (!si_compile_tgsi_main(&ctx)) {
 		si_llvm_dispose(&ctx);
 		return -1;
 	}
 
-	if (is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
+	if (shader->is_monolithic && ctx.type == PIPE_SHADER_VERTEX) {
 		LLVMValueRef parts[2];
 		bool need_prolog = sel->vs_needs_prolog;
 
@@ -6811,7 +6862,7 @@
 
 		si_build_wrapper_function(&ctx, parts + !need_prolog,
 					  1 + need_prolog, need_prolog, 0);
-	} else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
+	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
 		if (sscreen->info.chip_class >= GFX9) {
 			struct si_shader_selector *ls = shader->key.part.tcs.ls;
 			LLVMValueRef parts[4];
@@ -6834,9 +6885,10 @@
 			shader_ls.key.as_ls = 1;
 			shader_ls.key.mono = shader->key.mono;
 			shader_ls.key.opt = shader->key.opt;
+			shader_ls.is_monolithic = true;
 			si_llvm_context_set_tgsi(&ctx, &shader_ls);
 
-			if (!si_compile_tgsi_main(&ctx, true)) {
+			if (!si_compile_tgsi_main(&ctx)) {
 				si_llvm_dispose(&ctx);
 				return -1;
 			}
@@ -6876,7 +6928,7 @@
 
 			si_build_wrapper_function(&ctx, parts, 2, 0, 0);
 		}
-	} else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
+	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
 		if (ctx.screen->info.chip_class >= GFX9) {
 			struct si_shader_selector *es = shader->key.part.gs.es;
 			LLVMValueRef es_prolog = NULL;
@@ -6898,9 +6950,10 @@
 			shader_es.key.as_es = 1;
 			shader_es.key.mono = shader->key.mono;
 			shader_es.key.opt = shader->key.opt;
+			shader_es.is_monolithic = true;
 			si_llvm_context_set_tgsi(&ctx, &shader_es);
 
-			if (!si_compile_tgsi_main(&ctx, true)) {
+			if (!si_compile_tgsi_main(&ctx)) {
 				si_llvm_dispose(&ctx);
 				return -1;
 			}
@@ -6949,7 +7002,7 @@
 
 			si_build_wrapper_function(&ctx, parts, 2, 1, 0);
 		}
-	} else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
+	} else if (shader->is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
 		LLVMValueRef parts[3];
 		union si_shader_part_key prolog_key;
 		union si_shader_part_key epilog_key;
@@ -6989,8 +7042,9 @@
 	       LLVMPointerTypeKind);
 
 	/* Compile to bytecode. */
-	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, tm,
-			    ctx.gallivm.module, debug, ctx.type, "TGSI shader");
+	r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler,
+			    ctx.ac.module, debug, ctx.type, "TGSI shader",
+			    si_should_optimize_less(compiler, shader->selector));
 	si_llvm_dispose(&ctx);
 	if (r) {
 		fprintf(stderr, "LLVM failed to compile shader\n");
@@ -7100,7 +7154,7 @@
 		   enum pipe_shader_type type,
 		   bool prolog,
 		   union si_shader_part_key *key,
-		   LLVMTargetMachineRef tm,
+		   struct ac_llvm_compiler *compiler,
 		   struct pipe_debug_callback *debug,
 		   void (*build)(struct si_shader_context *,
 				 union si_shader_part_key *),
@@ -7125,7 +7179,7 @@
 	struct si_shader shader = {};
 	struct si_shader_context ctx;
 
-	si_init_shader_ctx(&ctx, sscreen, tm);
+	si_init_shader_ctx(&ctx, sscreen, compiler);
 	ctx.shader = &shader;
 	ctx.type = type;
 
@@ -7156,8 +7210,8 @@
 	/* Compile. */
 	si_llvm_optimize_module(&ctx);
 
-	if (si_compile_llvm(sscreen, &result->binary, &result->config, tm,
-			    ctx.ac.module, debug, ctx.type, name)) {
+	if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler,
+			    ctx.ac.module, debug, ctx.type, name, false)) {
 		FREE(result);
 		result = NULL;
 		goto out;
@@ -7191,7 +7245,7 @@
 	/* Get the pointer to rw buffers. */
 	ptr[0] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS);
 	ptr[1] = LLVMGetParam(ctx->main_fn, (is_merged_shader ? 8 : 0) + SI_SGPR_RW_BUFFERS + 1);
-	list = lp_build_gather_values(&ctx->gallivm, ptr, 2);
+	list = ac_build_gather_values(&ctx->ac, ptr, 2);
 	list = LLVMBuildBitCast(ctx->ac.builder, list, ctx->i64, "");
 	list = LLVMBuildIntToPtr(ctx->ac.builder, list,
 				 ac_array_in_const_addr_space(ctx->v4i32), "");
@@ -7346,7 +7400,7 @@
 }
 
 static bool si_get_vs_prolog(struct si_screen *sscreen,
-			     LLVMTargetMachineRef tm,
+			     struct ac_llvm_compiler *compiler,
 			     struct si_shader *shader,
 			     struct pipe_debug_callback *debug,
 			     struct si_shader *main_part,
@@ -7364,7 +7418,7 @@
 
 	shader->prolog =
 		si_get_shader_part(sscreen, &sscreen->vs_prologs,
-				   PIPE_SHADER_VERTEX, true, &prolog_key, tm,
+				   PIPE_SHADER_VERTEX, true, &prolog_key, compiler,
 				   debug, si_build_vs_prolog_function,
 				   "Vertex Shader Prolog");
 	return shader->prolog != NULL;
@@ -7374,11 +7428,11 @@
  * Select and compile (or reuse) vertex shader parts (prolog & epilog).
  */
 static bool si_shader_select_vs_parts(struct si_screen *sscreen,
-				      LLVMTargetMachineRef tm,
+				      struct ac_llvm_compiler *compiler,
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
-	return si_get_vs_prolog(sscreen, tm, shader, debug, shader,
+	return si_get_vs_prolog(sscreen, compiler, shader, debug, shader,
 				&shader->key.part.vs.prolog);
 }
 
@@ -7463,7 +7517,7 @@
  * Select and compile (or reuse) TCS parts (epilog).
  */
 static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
-				       LLVMTargetMachineRef tm,
+				       struct ac_llvm_compiler *compiler,
 				       struct si_shader *shader,
 				       struct pipe_debug_callback *debug)
 {
@@ -7471,7 +7525,7 @@
 		struct si_shader *ls_main_part =
 			shader->key.part.tcs.ls->main_shader_part_ls;
 
-		if (!si_get_vs_prolog(sscreen, tm, shader, debug, ls_main_part,
+		if (!si_get_vs_prolog(sscreen, compiler, shader, debug, ls_main_part,
 				      &shader->key.part.tcs.ls_prolog))
 			return false;
 
@@ -7485,7 +7539,7 @@
 
 	shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
 					    PIPE_SHADER_TESS_CTRL, false,
-					    &epilog_key, tm, debug,
+					    &epilog_key, compiler, debug,
 					    si_build_tcs_epilog_function,
 					    "Tessellation Control Shader Epilog");
 	return shader->epilog != NULL;
@@ -7495,7 +7549,7 @@
  * Select and compile (or reuse) GS parts (prolog).
  */
 static bool si_shader_select_gs_parts(struct si_screen *sscreen,
-				      LLVMTargetMachineRef tm,
+				      struct ac_llvm_compiler *compiler,
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
@@ -7504,7 +7558,7 @@
 			shader->key.part.gs.es->main_shader_part_es;
 
 		if (shader->key.part.gs.es->type == PIPE_SHADER_VERTEX &&
-		    !si_get_vs_prolog(sscreen, tm, shader, debug, es_main_part,
+		    !si_get_vs_prolog(sscreen, compiler, shader, debug, es_main_part,
 				      &shader->key.part.gs.vs_prolog))
 			return false;
 
@@ -7520,7 +7574,7 @@
 
 	shader->prolog2 = si_get_shader_part(sscreen, &sscreen->gs_prologs,
 					    PIPE_SHADER_GEOMETRY, true,
-					    &prolog_key, tm, debug,
+					    &prolog_key, compiler, debug,
 					    si_build_gs_prolog_function,
 					    "Geometry Shader Prolog");
 	return shader->prolog2 != NULL;
@@ -7723,7 +7777,7 @@
 							  interp_vgpr, "");
 			interp[1] = LLVMBuildExtractValue(ctx->ac.builder, ret,
 							  interp_vgpr + 1, "");
-			interp_ij = lp_build_gather_values(&ctx->gallivm, interp, 2);
+			interp_ij = ac_build_gather_values(&ctx->ac, interp, 2);
 		}
 
 		/* Use the absolute location of the input. */
@@ -7908,7 +7962,7 @@
  * Select and compile (or reuse) pixel shader parts (prolog & epilog).
  */
 static bool si_shader_select_ps_parts(struct si_screen *sscreen,
-				      LLVMTargetMachineRef tm,
+				      struct ac_llvm_compiler *compiler,
 				      struct si_shader *shader,
 				      struct pipe_debug_callback *debug)
 {
@@ -7923,7 +7977,7 @@
 		shader->prolog =
 			si_get_shader_part(sscreen, &sscreen->ps_prologs,
 					   PIPE_SHADER_FRAGMENT, true,
-					   &prolog_key, tm, debug,
+					   &prolog_key, compiler, debug,
 					   si_build_ps_prolog_function,
 					   "Fragment Shader Prolog");
 		if (!shader->prolog)
@@ -7936,7 +7990,7 @@
 	shader->epilog =
 		si_get_shader_part(sscreen, &sscreen->ps_epilogs,
 				   PIPE_SHADER_FRAGMENT, false,
-				   &epilog_key, tm, debug,
+				   &epilog_key, compiler, debug,
 				   si_build_ps_epilog_function,
 				   "Fragment Shader Epilog");
 	if (!shader->epilog)
@@ -8039,7 +8093,7 @@
 	}
 }
 
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug)
 {
@@ -8057,7 +8111,7 @@
 		/* Monolithic shader (compiled as a whole, has many variants,
 		 * may take a long time to compile).
 		 */
-		r = si_compile_tgsi_shader(sscreen, tm, shader, true, debug);
+		r = si_compile_tgsi_shader(sscreen, compiler, shader, debug);
 		if (r)
 			return r;
 	} else {
@@ -8097,21 +8151,21 @@
 		/* Select prologs and/or epilogs. */
 		switch (sel->type) {
 		case PIPE_SHADER_VERTEX:
-			if (!si_shader_select_vs_parts(sscreen, tm, shader, debug))
+			if (!si_shader_select_vs_parts(sscreen, compiler, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_CTRL:
-			if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
+			if (!si_shader_select_tcs_parts(sscreen, compiler, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_TESS_EVAL:
 			break;
 		case PIPE_SHADER_GEOMETRY:
-			if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
+			if (!si_shader_select_gs_parts(sscreen, compiler, shader, debug))
 				return -1;
 			break;
 		case PIPE_SHADER_FRAGMENT:
-			if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
+			if (!si_shader_select_ps_parts(sscreen, compiler, shader, debug))
 				return -1;
 
 			/* Make sure we have at least as many VGPRs as there
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 7734bfd..2dc4bc7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -134,20 +134,26 @@
 #include <llvm-c/Core.h> /* LLVMModuleRef */
 #include <llvm-c/TargetMachine.h>
 #include "tgsi/tgsi_scan.h"
+#include "util/u_inlines.h"
 #include "util/u_queue.h"
 
 #include "ac_binary.h"
 #include "ac_llvm_build.h"
-#include "si_state.h"
+#include "ac_llvm_util.h"
+
+#include <stdio.h>
 
 struct nir_shader;
+struct si_shader;
+struct si_context;
 
+#define SI_MAX_ATTRIBS		16
 #define SI_MAX_VS_OUTPUTS	40
 
 /* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an
  * index smaller than this.
  */
-#define SI_MAX_IO_GENERIC       46
+#define SI_MAX_IO_GENERIC       43
 
 /* SGPR user data indices */
 enum {
@@ -169,17 +175,20 @@
 #endif
 	SI_NUM_RESOURCE_SGPRS,
 
+	/* API VS, TES without GS, GS copy shader */
+	SI_SGPR_VS_STATE_BITS = SI_NUM_RESOURCE_SGPRS,
+	SI_NUM_VS_STATE_RESOURCE_SGPRS,
+
 	/* all VS variants */
-	SI_SGPR_BASE_VERTEX = SI_NUM_RESOURCE_SGPRS,
+	SI_SGPR_BASE_VERTEX = SI_NUM_VS_STATE_RESOURCE_SGPRS,
 	SI_SGPR_START_INSTANCE,
 	SI_SGPR_DRAWID,
-	SI_SGPR_VS_STATE_BITS,
 	SI_VS_NUM_USER_SGPR,
 
 	SI_SGPR_VS_BLIT_DATA = SI_SGPR_CONST_AND_SHADER_BUFFERS,
 
 	/* TES */
-	SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_RESOURCE_SGPRS,
+	SI_SGPR_TES_OFFCHIP_LAYOUT = SI_NUM_VS_STATE_RESOURCE_SGPRS,
 	SI_SGPR_TES_OFFCHIP_ADDR,
 	SI_TES_NUM_USER_SGPR,
 
@@ -220,7 +229,7 @@
 	GFX9_VSGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR,
 	GFX9_TESGS_NUM_USER_SGPR = GFX9_MERGED_NUM_USER_SGPR,
 #endif
-	SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS + (HAVE_32BIT_POINTERS ? 1 : 2),
+	SI_GSCOPY_NUM_USER_SGPR = SI_NUM_VS_STATE_RESOURCE_SGPRS,
 
 	/* PS only */
 	SI_SGPR_ALPHA_REF	= SI_NUM_RESOURCE_SGPRS,
@@ -272,6 +281,9 @@
 };
 
 enum {
+	/* Use a property enum that CS wouldn't use. */
+	TGSI_PROPERTY_CS_LOCAL_SIZE = TGSI_PROPERTY_FS_COORD_ORIGIN,
+
 	/* Use a property enum that VS wouldn't use. */
 	TGSI_PROPERTY_VS_BLIT_SGPRS = TGSI_PROPERTY_FS_COORD_ORIGIN,
 
@@ -310,7 +322,7 @@
 struct si_compiler_ctx_state {
 	/* Should only be used by si_init_shader_selector_async and
 	 * si_build_shader_variant if thread_index == -1 (non-threaded). */
-	LLVMTargetMachineRef		tm;
+	struct ac_llvm_compiler		*compiler;
 
 	/* Used if thread_index == -1 or if debug.async is true. */
 	struct pipe_debug_callback	debug;
@@ -356,7 +368,8 @@
 	ubyte		culldist_mask;
 
 	/* ES parameters. */
-	unsigned	esgs_itemsize;
+	unsigned	esgs_itemsize; /* vertex stride */
+	unsigned	lshs_vertex_stride;
 
 	/* GS parameters. */
 	unsigned	gs_input_verts_per_prim;
@@ -376,9 +389,7 @@
 	 */
 	unsigned	colors_written_4bit;
 
-	/* CS parameters */
-	unsigned local_size;
-
+	uint64_t	outputs_written_before_ps; /* "get_unique_index" bits */
 	uint64_t	outputs_written;	/* "get_unique_index" bits */
 	uint32_t	patch_outputs_written;	/* "get_unique_index_patch" bits */
 
@@ -492,7 +503,7 @@
 		unsigned	ancillary_vgpr_index:5;
 		unsigned	wqm:1;
 		char		color_attr_index[2];
-		char		color_interp_vgpr_index[2]; /* -1 == constant */
+		signed char	color_interp_vgpr_index[2]; /* -1 == constant */
 	} ps_prolog;
 	struct {
 		struct si_ps_epilog_bits states;
@@ -641,20 +652,20 @@
 /* si_shader.c */
 struct si_shader *
 si_generate_gs_copy_shader(struct si_screen *sscreen,
-			   LLVMTargetMachineRef tm,
+			   struct ac_llvm_compiler *compiler,
 			   struct si_shader_selector *gs_selector,
 			   struct pipe_debug_callback *debug);
 int si_compile_tgsi_shader(struct si_screen *sscreen,
-			   LLVMTargetMachineRef tm,
+			   struct ac_llvm_compiler *compiler,
 			   struct si_shader *shader,
-			   bool is_monolithic,
 			   struct pipe_debug_callback *debug);
-int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
+int si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compiler,
 		     struct si_shader *shader,
 		     struct pipe_debug_callback *debug);
 void si_shader_destroy(struct si_shader *shader);
 unsigned si_shader_io_get_unique_index_patch(unsigned semantic_name, unsigned index);
-unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
+unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index,
+				       unsigned is_varying);
 int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader);
 void si_shader_dump(struct si_screen *sscreen, const struct si_shader *shader,
 		    struct pipe_debug_callback *debug, unsigned processor,
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index afcc14e..3635139 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -31,8 +31,6 @@
 #include "gallivm/lp_bld_tgsi.h"
 #include "tgsi/tgsi_parse.h"
 #include "ac_shader_abi.h"
-#include "ac_llvm_util.h"
-#include "ac_llvm_build.h"
 
 #include <llvm-c/Core.h>
 #include <llvm-c/TargetMachine.h>
@@ -62,9 +60,6 @@
 	unsigned num_images;
 	unsigned num_samplers;
 
-	/* Whether the prolog will be compiled separately. */
-	bool separate_prolog;
-
 	struct ac_shader_abi abi;
 
 	/** This function is responsible for initilizing the inputs array and will be
@@ -179,7 +174,7 @@
 	/* CS */
 	int param_block_size;
 
-	LLVMTargetMachineRef tm;
+	struct ac_llvm_compiler *compiler;
 
 	/* Preloaded descriptors. */
 	LLVMValueRef esgs_ring;
@@ -221,8 +216,9 @@
 }
 
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
-			 LLVMTargetMachineRef tm,
-			 struct pipe_debug_callback *debug);
+			 struct ac_llvm_compiler *compiler,
+			 struct pipe_debug_callback *debug,
+			 bool less_optimized);
 
 LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base,
 			  enum tgsi_opcode_type type);
@@ -236,7 +232,7 @@
 
 void si_llvm_context_init(struct si_shader_context *ctx,
 			  struct si_screen *sscreen,
-			  LLVMTargetMachineRef tm);
+			  struct ac_llvm_compiler *compiler);
 void si_llvm_context_set_tgsi(struct si_shader_context *ctx,
 			      struct si_shader *shader);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index b4fba8b..4ae5b00 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -32,6 +32,24 @@
 #include "compiler/nir/nir.h"
 #include "compiler/nir_types.h"
 
+static nir_variable* tex_get_texture_var(nir_tex_instr *instr)
+{
+	for (unsigned i = 0; i < instr->num_srcs; i++) {
+		switch (instr->src[i].src_type) {
+		case nir_tex_src_texture_deref:
+			return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[i].src));
+		default:
+			break;
+		}
+	}
+
+	return NULL;
+}
+
+static nir_variable* intrinsic_get_var(nir_intrinsic_instr *instr)
+{
+	return nir_deref_instr_get_variable(nir_src_as_deref(instr->src[0]));
+}
 
 static void scan_instruction(struct tgsi_shader_info *info,
 			     nir_instr *instr)
@@ -53,12 +71,13 @@
 		}
 	} else if (instr->type == nir_instr_type_tex) {
 		nir_tex_instr *tex = nir_instr_as_tex(instr);
+		nir_variable *texture = tex_get_texture_var(tex);
 
-		if (!tex->texture) {
+		if (!texture) {
 			info->samplers_declared |=
 				u_bit_consecutive(tex->sampler_index, 1);
 		} else {
-			if (tex->texture->var->data.bindless)
+			if (texture->data.bindless)
 				info->uses_bindless_samplers = true;
 		}
 
@@ -124,25 +143,25 @@
 		case nir_intrinsic_load_tess_level_outer:
 			info->reads_tess_factors = true;
 			break;
-		case nir_intrinsic_image_var_load:
-		case nir_intrinsic_image_var_size:
-		case nir_intrinsic_image_var_samples: {
-			nir_variable *var = intr->variables[0]->var;
+		case nir_intrinsic_image_deref_load:
+		case nir_intrinsic_image_deref_size:
+		case nir_intrinsic_image_deref_samples: {
+			nir_variable *var = intrinsic_get_var(intr);
 			if (var->data.bindless)
 				info->uses_bindless_images = true;
 
 			break;
 		}
-		case nir_intrinsic_image_var_store:
-		case nir_intrinsic_image_var_atomic_add:
-		case nir_intrinsic_image_var_atomic_min:
-		case nir_intrinsic_image_var_atomic_max:
-		case nir_intrinsic_image_var_atomic_and:
-		case nir_intrinsic_image_var_atomic_or:
-		case nir_intrinsic_image_var_atomic_xor:
-		case nir_intrinsic_image_var_atomic_exchange:
-		case nir_intrinsic_image_var_atomic_comp_swap: {
-			nir_variable *var = intr->variables[0]->var;
+		case nir_intrinsic_image_deref_store:
+		case nir_intrinsic_image_deref_atomic_add:
+		case nir_intrinsic_image_deref_atomic_min:
+		case nir_intrinsic_image_deref_atomic_max:
+		case nir_intrinsic_image_deref_atomic_and:
+		case nir_intrinsic_image_deref_atomic_or:
+		case nir_intrinsic_image_deref_atomic_xor:
+		case nir_intrinsic_image_deref_atomic_exchange:
+		case nir_intrinsic_image_deref_atomic_comp_swap: {
+			nir_variable *var = intrinsic_get_var(intr);
 			if (var->data.bindless)
 				info->uses_bindless_images = true;
 
@@ -161,8 +180,8 @@
 		case nir_intrinsic_ssbo_atomic_comp_swap:
 			info->writes_memory = true;
 			break;
-		case nir_intrinsic_load_var: {
-			nir_variable *var = intr->variables[0]->var;
+		case nir_intrinsic_load_deref: {
+			nir_variable *var = intrinsic_get_var(intr);
 			nir_variable_mode mode = var->data.mode;
 			enum glsl_base_type base_type =
 				glsl_get_base_type(glsl_without_array(var->type));
@@ -195,25 +214,24 @@
 			}
 			break;
 		}
-		case nir_intrinsic_interp_var_at_centroid:
-		case nir_intrinsic_interp_var_at_sample:
-		case nir_intrinsic_interp_var_at_offset: {
-			enum glsl_interp_mode interp =
-				intr->variables[0]->var->data.interpolation;
+		case nir_intrinsic_interp_deref_at_centroid:
+		case nir_intrinsic_interp_deref_at_sample:
+		case nir_intrinsic_interp_deref_at_offset: {
+			enum glsl_interp_mode interp = intrinsic_get_var(intr)->data.interpolation;
 			switch (interp) {
 			case INTERP_MODE_SMOOTH:
 			case INTERP_MODE_NONE:
-				if (intr->intrinsic == nir_intrinsic_interp_var_at_centroid)
+				if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
 					info->uses_persp_opcode_interp_centroid = true;
-				else if (intr->intrinsic == nir_intrinsic_interp_var_at_sample)
+				else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
 					info->uses_persp_opcode_interp_sample = true;
 				else
 					info->uses_persp_opcode_interp_offset = true;
 				break;
 			case INTERP_MODE_NOPERSPECTIVE:
-				if (intr->intrinsic == nir_intrinsic_interp_var_at_centroid)
+				if (intr->intrinsic == nir_intrinsic_interp_deref_at_centroid)
 					info->uses_linear_opcode_interp_centroid = true;
-				else if (intr->intrinsic == nir_intrinsic_interp_var_at_sample)
+				else if (intr->intrinsic == nir_intrinsic_interp_deref_at_sample)
 					info->uses_linear_opcode_interp_sample = true;
 				else
 					info->uses_linear_opcode_interp_offset = true;
@@ -784,6 +802,8 @@
 
 	ac_lower_indirect_derefs(sel->nir, sel->screen->info.chip_class);
 
+	NIR_PASS_V(sel->nir, nir_lower_load_const_to_scalar);
+
 	bool progress;
 	do {
 		progress = false;
@@ -902,7 +922,7 @@
 		if (dynamic_index)
 			index = si_llvm_bound_index(ctx, index, ctx->num_images);
 
-		index = LLVMBuildSub(ctx->gallivm.builder,
+		index = LLVMBuildSub(ctx->ac.builder,
 				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 				     index, "");
 
@@ -915,7 +935,7 @@
 	if (dynamic_index)
 		index = si_llvm_bound_index(ctx, index, ctx->num_samplers);
 
-	index = LLVMBuildAdd(ctx->gallivm.builder, index,
+	index = LLVMBuildAdd(ctx->ac.builder, index,
 			     LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
 
 	return si_load_sampler_desc(ctx, list, index, desc_type);
@@ -981,7 +1001,7 @@
 	ctx->num_samplers = util_last_bit(info->samplers_declared);
 	ctx->num_images = util_last_bit(info->images_declared);
 
-	if (ctx->shader->selector->local_size) {
+	if (ctx->shader->selector->info.properties[TGSI_PROPERTY_CS_LOCAL_SIZE]) {
 		assert(nir->info.stage == MESA_SHADER_COMPUTE);
 		si_declare_compute_memory(ctx);
 	}
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
index 207f552..0d29286 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_alu.c
@@ -23,11 +23,7 @@
  */
 
 #include "si_shader_internal.h"
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_intr.h"
-#include "gallivm/lp_bld_gather.h"
-#include "tgsi/tgsi_parse.h"
-#include "amd/common/ac_llvm_build.h"
+#include "ac_llvm_util.h"
 
 static void kill_if_fetch_args(struct lp_build_tgsi_context *bld_base,
 			       struct lp_build_emit_data *emit_data)
@@ -40,7 +36,8 @@
 
 	for (i = 0; i < TGSI_NUM_CHANNELS; i++) {
 		LLVMValueRef value = lp_build_emit_fetch(bld_base, inst, 0, i);
-		conds[i] = LLVMBuildFCmp(builder, LLVMRealOGE, value,
+		/* UGE because NaN shouldn't get killed */
+		conds[i] = LLVMBuildFCmp(builder, LLVMRealUGE, value,
 					ctx->ac.f32_0, "");
 	}
 
@@ -248,7 +245,9 @@
 		     struct lp_build_emit_data *emit_data)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef floor_index =  lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_FLR, emit_data->args[0]);
+	LLVMValueRef floor_index =
+		ac_build_intrinsic(&ctx->ac, "llvm.floor.f32", ctx->f32,
+				   &emit_data->args[0], 1, AC_FUNC_ATTR_READNONE);
 	emit_data->output[emit_data->chan] = LLVMBuildFPToSI(ctx->ac.builder,
 			floor_index, ctx->i32, "");
 }
@@ -453,9 +452,9 @@
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	emit_data->output[emit_data->chan] =
-		lp_build_intrinsic(ctx->ac.builder, action->intr_name,
+		ac_build_intrinsic(&ctx->ac, action->intr_name,
 				   emit_data->dst_type, emit_data->args,
-				   emit_data->arg_count, LP_FUNC_ATTR_READNONE);
+				   emit_data->arg_count, AC_FUNC_ATTR_READNONE);
 }
 
 static void emit_bfi(const struct lp_build_tgsi_action *action,
@@ -507,18 +506,37 @@
 		     struct lp_build_emit_data *emit_data)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMValueRef bfe_sm5;
-	LLVMValueRef cond;
 
-	bfe_sm5 = ac_build_bfe(&ctx->ac, emit_data->args[0],
-			       emit_data->args[1], emit_data->args[2],
-			       emit_data->info->opcode == TGSI_OPCODE_IBFE);
+	if (HAVE_LLVM < 0x0700) {
+		LLVMValueRef bfe_sm5 =
+			ac_build_bfe(&ctx->ac, emit_data->args[0],
+				     emit_data->args[1], emit_data->args[2],
+				     emit_data->info->opcode == TGSI_OPCODE_IBFE);
 
-	/* Correct for GLSL semantics. */
-	cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntUGE, emit_data->args[2],
-			     LLVMConstInt(ctx->i32, 32, 0), "");
-	emit_data->output[emit_data->chan] =
-		LLVMBuildSelect(ctx->ac.builder, cond, emit_data->args[0], bfe_sm5, "");
+		/* Correct for GLSL semantics. */
+		LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntUGE, emit_data->args[2],
+						  LLVMConstInt(ctx->i32, 32, 0), "");
+		emit_data->output[emit_data->chan] =
+			LLVMBuildSelect(ctx->ac.builder, cond, emit_data->args[0], bfe_sm5, "");
+	} else {
+		/* FIXME: LLVM 7 returns incorrect result when count is 0.
+		 * https://bugs.freedesktop.org/show_bug.cgi?id=107276
+		 */
+		LLVMValueRef zero = ctx->i32_0;
+		LLVMValueRef bfe_sm5 =
+			ac_build_bfe(&ctx->ac, emit_data->args[0],
+				     emit_data->args[1], emit_data->args[2],
+				     emit_data->info->opcode == TGSI_OPCODE_IBFE);
+
+		/* Correct for GLSL semantics. */
+		LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntUGE, emit_data->args[2],
+						  LLVMConstInt(ctx->i32, 32, 0), "");
+		LLVMValueRef cond2 = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, emit_data->args[2],
+						   zero, "");
+		bfe_sm5 = LLVMBuildSelect(ctx->ac.builder, cond, emit_data->args[0], bfe_sm5, "");
+		emit_data->output[emit_data->chan] =
+			LLVMBuildSelect(ctx->ac.builder, cond2, zero, bfe_sm5, "");
+	}
 }
 
 /* this is ffs in C */
@@ -560,10 +578,8 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	emit_data->output[emit_data->chan] =
-		lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_IMAX,
-					  emit_data->args[0],
-					  LLVMBuildNeg(ctx->ac.builder,
-						       emit_data->args[0], ""));
+		ac_build_imax(&ctx->ac,  emit_data->args[0],
+			      LLVMBuildNeg(ctx->ac.builder, emit_data->args[0], ""));
 }
 
 static void emit_minmax_int(const struct lp_build_tgsi_action *action,
@@ -615,14 +631,17 @@
 		      struct lp_build_tgsi_context *bld_base,
 		      struct lp_build_emit_data *emit_data)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
 	/* From the GLSL 4.50 spec:
 	 *   "The rounding mode cannot be set and is undefined."
 	 *
 	 * v_cvt_pkrtz_f16 rounds to zero, but it's fastest.
 	 */
 	emit_data->output[emit_data->chan] =
-		ac_build_cvt_pkrtz_f16(&si_shader_context(bld_base)->ac,
-				       emit_data->args);
+		LLVMBuildBitCast(ctx->ac.builder,
+				 ac_build_cvt_pkrtz_f16(&ctx->ac, emit_data->args),
+				 ctx->i32, "");
 }
 
 static void up2h_fetch_args(struct lp_build_tgsi_context *bld_base,
@@ -672,12 +691,11 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	LLVMValueRef sqrt =
-		lp_build_emit_llvm_unary(bld_base, TGSI_OPCODE_SQRT,
-					 emit_data->args[0]);
+		ac_build_intrinsic(&ctx->ac, "llvm.sqrt.f32", ctx->f32,
+				   &emit_data->args[0], 1, AC_FUNC_ATTR_READNONE);
 
 	emit_data->output[emit_data->chan] =
-		lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_DIV,
-					  ctx->ac.f32_1, sqrt);
+		ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, sqrt);
 }
 
 static void dfracexp_fetch_args(struct lp_build_tgsi_context *bld_base,
@@ -694,10 +712,10 @@
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	emit_data->output[emit_data->chan] =
-		lp_build_intrinsic(ctx->ac.builder, "llvm.amdgcn.frexp.mant.f64",
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.frexp.mant.f64",
 				   ctx->ac.f64, &emit_data->args[0], 1, 0);
 	emit_data->output1[emit_data->chan] =
-		lp_build_intrinsic(ctx->ac.builder, "llvm.amdgcn.frexp.exp.i32.f64",
+		ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.frexp.exp.i32.f64",
 				   ctx->ac.i32, &emit_data->args[0], 1, 0);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index 7761e2e..427fead 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -25,12 +25,9 @@
 #include "si_shader_internal.h"
 #include "si_pipe.h"
 #include "sid.h"
-#include "gallivm/lp_bld_arit.h"
-#include "gallivm/lp_bld_gather.h"
-#include "gallivm/lp_bld_intr.h"
 #include "tgsi/tgsi_build.h"
-#include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
+#include "ac_llvm_util.h"
 
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
@@ -459,7 +456,7 @@
 		derived_ptr = LLVMBuildGEP(ctx->ac.builder, ptr, &index, 1, "");
 		channels[chan] = LLVMBuildLoad(ctx->ac.builder, derived_ptr, "");
 	}
-	emit_data->output[emit_data->chan] = lp_build_gather_values(&ctx->gallivm, channels, 4);
+	emit_data->output[emit_data->chan] = ac_build_gather_values(&ctx->ac, channels, 4);
 }
 
 /**
@@ -605,7 +602,7 @@
 	for (chan = 0; chan < 4; ++chan) {
 		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 	}
-	data = lp_build_gather_values(&ctx->gallivm, chans, 4);
+	data = ac_build_gather_values(&ctx->ac, chans, 4);
 
 	emit_data->args[emit_data->arg_count++] = data;
 
@@ -711,8 +708,8 @@
 		emit_data->args[0] = data;
 		emit_data->args[3] = offset;
 
-		lp_build_intrinsic(
-			builder, intrinsic_name, emit_data->dst_type,
+		ac_build_intrinsic(
+			&ctx->ac, intrinsic_name, emit_data->dst_type,
 			emit_data->args, emit_data->arg_count,
 			ac_get_store_intr_attribs(writeonly_memory));
 	}
@@ -747,7 +744,6 @@
 		struct lp_build_emit_data *emit_data)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMBuilderRef builder = ctx->ac.builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 	const struct tgsi_shader_info *info = &ctx->shader->selector->info;
 	unsigned target = inst->Memory.Texture;
@@ -773,8 +769,8 @@
 	}
 
 	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] = lp_build_intrinsic(
-			builder, "llvm.amdgcn.buffer.store.format.v4f32",
+		emit_data->output[emit_data->chan] = ac_build_intrinsic(
+			&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32",
 			emit_data->dst_type, emit_data->args,
 			emit_data->arg_count,
 			ac_get_store_intr_attribs(writeonly_memory));
@@ -928,7 +924,6 @@
 		struct lp_build_emit_data *emit_data)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
-	LLVMBuilderRef builder = ctx->ac.builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 	LLVMValueRef tmp;
 
@@ -942,8 +937,8 @@
 		char intrinsic_name[40];
 		snprintf(intrinsic_name, sizeof(intrinsic_name),
 			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
-		tmp = lp_build_intrinsic(
-			builder, intrinsic_name, ctx->i32,
+		tmp = ac_build_intrinsic(
+			&ctx->ac, intrinsic_name, ctx->i32,
 			emit_data->args, emit_data->arg_count, 0);
 		emit_data->output[emit_data->chan] = ac_to_float(&ctx->ac, tmp);
 	} else {
@@ -1291,8 +1286,7 @@
 						   emit_data->inst, 0,
 						   chan);
 		if (opcode == TGSI_OPCODE_TXP)
-			args.coords[chan] = lp_build_emit_llvm_binary(
-				bld_base, TGSI_OPCODE_DIV,
+			args.coords[chan] = ac_build_fdiv(&ctx->ac,
 				args.coords[chan], args.coords[3]);
 	}
 
@@ -1489,15 +1483,15 @@
 	    opcode == TGSI_OPCODE_TXF_LZ) {
 		/* add tex offsets */
 		if (inst->Texture.NumOffsets) {
-			struct lp_build_context *uint_bld = &bld_base->uint_bld;
 			const struct tgsi_texture_offset *off = inst->TexOffsets;
 
 			assert(inst->Texture.NumOffsets == 1);
 
 			switch (target) {
 			case TGSI_TEXTURE_3D:
-				args.coords[2] = lp_build_add(uint_bld, args.coords[2],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ]);
+				args.coords[2] =
+					LLVMBuildAdd(ctx->ac.builder, args.coords[2],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleZ], "");
 				/* fall through */
 			case TGSI_TEXTURE_2D:
 			case TGSI_TEXTURE_SHADOW2D:
@@ -1506,16 +1500,16 @@
 			case TGSI_TEXTURE_2D_ARRAY:
 			case TGSI_TEXTURE_SHADOW2D_ARRAY:
 				args.coords[1] =
-					lp_build_add(uint_bld, args.coords[1],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY]);
+					LLVMBuildAdd(ctx->ac.builder, args.coords[1],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleY], "");
 				/* fall through */
 			case TGSI_TEXTURE_1D:
 			case TGSI_TEXTURE_SHADOW1D:
 			case TGSI_TEXTURE_1D_ARRAY:
 			case TGSI_TEXTURE_SHADOW1D_ARRAY:
 				args.coords[0] =
-					lp_build_add(uint_bld, args.coords[0],
-						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX]);
+					LLVMBuildAdd(ctx->ac.builder, args.coords[0],
+						ctx->imms[off->Index * TGSI_NUM_CHANNELS + off->SwizzleX], "");
 				break;
 				/* texture offsets do not apply to other texture targets */
 			}
@@ -1645,9 +1639,7 @@
 				LLVMBuildExtractElement(builder, txq_emit_data.output[0],
 							LLVMConstInt(ctx->i32, c, 0), "");
 			half_texel[c] = LLVMBuildUIToFP(builder, half_texel[c], ctx->f32, "");
-			half_texel[c] =
-				lp_build_emit_llvm_unary(&ctx->bld_base,
-							 TGSI_OPCODE_RCP, half_texel[c]);
+			half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, half_texel[c]);
 			half_texel[c] = LLVMBuildFMul(builder, half_texel[c],
 						      LLVMConstReal(ctx->f32, -0.5), "");
 		}
@@ -1833,7 +1825,7 @@
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct ac_image_args args = {};
-	LLVMValueRef ptr, image, fmask, addr_vec;
+	LLVMValueRef ptr, image, fmask;
 
 	/* Ignore src0, because KHR_blend_func_extended disallows multiple render
 	 * targets.
@@ -1865,7 +1857,8 @@
 		fmask = ac_build_load_to_sgpr(&ctx->ac, ptr,
 			LLVMConstInt(ctx->i32, SI_PS_IMAGE_COLORBUF0_FMASK / 2, 0));
 
-		ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords, false);
+		ac_apply_fmask_to_sample(&ctx->ac, fmask, args.coords,
+					 ctx->shader->key.mono.u.ps.fbfetch_layered);
 	}
 
 	args.opcode = ac_image_load;
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
index acd7e0b..b9ed0fc 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c
@@ -24,26 +24,8 @@
 
 #include "si_shader_internal.h"
 #include "si_pipe.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_gather.h"
-#include "gallivm/lp_bld_flow.h"
-#include "gallivm/lp_bld_init.h"
-#include "gallivm/lp_bld_intr.h"
-#include "gallivm/lp_bld_misc.h"
-#include "gallivm/lp_bld_swizzle.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_parse.h"
-#include "util/u_math.h"
+#include "ac_llvm_util.h"
 #include "util/u_memory.h"
-#include "util/u_debug.h"
-
-#include <stdio.h>
-#include <llvm-c/Transforms/IPO.h>
-#include <llvm-c/Transforms/Scalar.h>
-#if HAVE_LLVM >= 0x0700
-#include <llvm-c/Transforms/Utils.h>
-#endif
 
 enum si_llvm_calling_convention {
 	RADEON_LLVM_AMDGPU_VS = 87,
@@ -99,16 +81,15 @@
  * @returns 0 for success, 1 for failure
  */
 unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary,
-			 LLVMTargetMachineRef tm,
-			 struct pipe_debug_callback *debug)
+			 struct ac_llvm_compiler *compiler,
+			 struct pipe_debug_callback *debug,
+			 bool less_optimized)
 {
+	struct ac_compiler_passes *passes =
+		less_optimized && compiler->low_opt_passes ?
+			compiler->low_opt_passes : compiler->passes;
 	struct si_llvm_diagnostics diag;
-	char *err;
 	LLVMContextRef llvm_ctx;
-	LLVMMemoryBufferRef out_buffer;
-	unsigned buffer_size;
-	const char *buffer_data;
-	LLVMBool mem_err;
 
 	diag.debug = debug;
 	diag.retval = 0;
@@ -118,33 +99,10 @@
 
 	LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag);
 
-	/* Compile IR*/
-	mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile, &err,
-								 &out_buffer);
-
-	/* Process Errors/Warnings */
-	if (mem_err) {
-		fprintf(stderr, "%s: %s", __FUNCTION__, err);
-		pipe_debug_message(debug, SHADER_INFO,
-				   "LLVM emit error: %s", err);
-		FREE(err);
+	/* Compile IR. */
+	if (!ac_compile_module_to_binary(passes, M, binary))
 		diag.retval = 1;
-		goto out;
-	}
 
-	/* Extract Shader Code*/
-	buffer_size = LLVMGetBufferSize(out_buffer);
-	buffer_data = LLVMGetBufferStart(out_buffer);
-
-	if (!ac_elf_read(buffer_data, buffer_size, binary)) {
-		fprintf(stderr, "radeonsi: cannot read an ELF shader binary\n");
-		diag.retval = 1;
-	}
-
-	/* Clean up */
-	LLVMDisposeMemoryBuffer(out_buffer);
-
-out:
 	if (diag.retval != 0)
 		pipe_debug_message(debug, SHADER_INFO, "LLVM compile failed");
 	return diag.retval;
@@ -512,7 +470,7 @@
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			values[chan] = si_llvm_emit_fetch(bld_base, reg, type, chan);
 		}
-		return lp_build_gather_values(&ctx->gallivm, values,
+		return ac_build_gather_values(&ctx->ac, values,
 					      TGSI_NUM_CHANNELS);
 	}
 
@@ -642,9 +600,8 @@
 		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
 			unsigned chan;
 			for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-				 ctx->addrs[idx][chan] = lp_build_alloca_undef(
-					&ctx->gallivm,
-					ctx->i32, "");
+				 ctx->addrs[idx][chan] = ac_build_alloca_undef(
+					&ctx->ac, ctx->i32, "");
 			}
 		}
 		break;
@@ -689,7 +646,7 @@
 			 */
 			if (array_size > 16 ||
 			    !ctx->screen->llvm_has_working_vgpr_indexing) {
-				array_alloca = lp_build_alloca_undef(&ctx->gallivm,
+				array_alloca = ac_build_alloca_undef(&ctx->ac,
 					LLVMArrayType(ctx->f32,
 						      array_size), "array");
 				ctx->temp_array_allocas[id] = array_alloca;
@@ -707,7 +664,7 @@
 					 first + i / 4, "xyzw"[i % 4]);
 #endif
 				ctx->temps[first * TGSI_NUM_CHANNELS + i] =
-					lp_build_alloca_undef(&ctx->gallivm,
+					ac_build_alloca_undef(&ctx->ac,
 							      ctx->f32,
 							      name);
 			}
@@ -725,9 +682,8 @@
 				 * a shader ever reads from a channel that
 				 * it never writes to.
 				 */
-				ctx->undef_alloca = lp_build_alloca_undef(
-					&ctx->gallivm,
-					ctx->f32, "undef");
+				ctx->undef_alloca = ac_build_alloca_undef(
+					&ctx->ac, ctx->f32, "undef");
 			}
 
 			for (i = 0; i < decl_size; ++i) {
@@ -791,9 +747,8 @@
 				snprintf(name, sizeof(name), "OUT%d.%c",
 					 idx, "xyzw"[chan % 4]);
 #endif
-				ctx->outputs[idx][chan] = lp_build_alloca_undef(
-					&ctx->gallivm,
-					ctx->f32, name);
+				ctx->outputs[idx][chan] = ac_build_alloca_undef(
+					&ctx->ac, ctx->f32, name);
 			}
 		}
 		break;
@@ -992,7 +947,7 @@
 
 void si_llvm_context_init(struct si_shader_context *ctx,
 			  struct si_screen *sscreen,
-			  LLVMTargetMachineRef tm)
+			  struct ac_llvm_compiler *compiler)
 {
 	struct lp_type type;
 
@@ -1003,31 +958,20 @@
 	 */
 	memset(ctx, 0, sizeof(*ctx));
 	ctx->screen = sscreen;
-	ctx->tm = tm;
+	ctx->compiler = compiler;
 
-	ctx->gallivm.context = LLVMContextCreate();
-	ctx->gallivm.module = LLVMModuleCreateWithNameInContext("tgsi",
-						ctx->gallivm.context);
-	LLVMSetTarget(ctx->gallivm.module, "amdgcn--");
+	ac_llvm_context_init(&ctx->ac, sscreen->info.chip_class, sscreen->info.family);
+	ctx->ac.module = ac_create_module(compiler->tm, ctx->ac.context);
 
-	LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
-	char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
-	LLVMSetDataLayout(ctx->gallivm.module, data_layout_str);
-	LLVMDisposeTargetData(data_layout);
-	LLVMDisposeMessage(data_layout_str);
-
-	bool unsafe_fpmath = (sscreen->debug_flags & DBG(UNSAFE_MATH)) != 0;
 	enum ac_float_mode float_mode =
-		unsafe_fpmath ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
-				AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH;
+		sscreen->debug_flags & DBG(UNSAFE_MATH) ?
+			AC_FLOAT_MODE_UNSAFE_FP_MATH :
+			AC_FLOAT_MODE_NO_SIGNED_ZEROS_FP_MATH;
+	ctx->ac.builder = ac_create_builder(ctx->ac.context, float_mode);
 
-	ctx->gallivm.builder = ac_create_builder(ctx->gallivm.context,
-						 float_mode);
-
-	ac_llvm_context_init(&ctx->ac, ctx->gallivm.context,
-			     sscreen->info.chip_class, sscreen->info.family);
-	ctx->ac.module = ctx->gallivm.module;
-	ctx->ac.builder = ctx->gallivm.builder;
+	ctx->gallivm.context = ctx->ac.context;
+	ctx->gallivm.module = ctx->ac.module;
+	ctx->gallivm.builder = ctx->ac.builder;
 
 	struct lp_build_tgsi_context *bld_base = &ctx->bld_base;
 
@@ -1190,8 +1134,7 @@
 		call_conv = RADEON_LLVM_AMDGPU_VS;
 		break;
 	case PIPE_SHADER_TESS_CTRL:
-		call_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS :
-						  RADEON_LLVM_AMDGPU_VS;
+		call_conv = RADEON_LLVM_AMDGPU_HS;
 		break;
 	case PIPE_SHADER_GEOMETRY:
 		call_conv = RADEON_LLVM_AMDGPU_GS;
@@ -1211,44 +1154,14 @@
 
 void si_llvm_optimize_module(struct si_shader_context *ctx)
 {
-	struct gallivm_state *gallivm = &ctx->gallivm;
-	const char *triple = LLVMGetTarget(gallivm->module);
-	LLVMTargetLibraryInfoRef target_library_info;
-
 	/* Dump LLVM IR before any optimization passes */
 	if (ctx->screen->debug_flags & DBG(PREOPT_IR) &&
 	    si_can_dump_shader(ctx->screen, ctx->type))
 		LLVMDumpModule(ctx->gallivm.module);
 
-	/* Create the pass manager */
-	gallivm->passmgr = LLVMCreatePassManager();
-
-	target_library_info = gallivm_create_target_library_info(triple);
-	LLVMAddTargetLibraryInfo(target_library_info, gallivm->passmgr);
-
-	if (si_extra_shader_checks(ctx->screen, ctx->type))
-		LLVMAddVerifierPass(gallivm->passmgr);
-
-	LLVMAddAlwaysInlinerPass(gallivm->passmgr);
-
-	/* This pass should eliminate all the load and store instructions */
-	LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr);
-
-	/* Add some optimization passes */
-	LLVMAddScalarReplAggregatesPass(gallivm->passmgr);
-	LLVMAddLICMPass(gallivm->passmgr);
-	LLVMAddAggressiveDCEPass(gallivm->passmgr);
-	LLVMAddCFGSimplificationPass(gallivm->passmgr);
-	/* This is recommended by the instruction combining pass. */
-	LLVMAddEarlyCSEMemSSAPass(gallivm->passmgr);
-	LLVMAddInstructionCombiningPass(gallivm->passmgr);
-
 	/* Run the pass */
-	LLVMRunPassManager(gallivm->passmgr, ctx->gallivm.module);
-
+	LLVMRunPassManager(ctx->compiler->passmgr, ctx->gallivm.module);
 	LLVMDisposeBuilder(ctx->ac.builder);
-	LLVMDisposePassManager(gallivm->passmgr);
-	gallivm_dispose_target_library_info(target_library_info);
 }
 
 void si_llvm_dispose(struct si_shader_context *ctx)
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b5fe672..fc1ec83 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -33,25 +33,6 @@
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
 
-/* Initialize an external atom (owned by ../radeon). */
-static void
-si_init_external_atom(struct si_context *sctx, struct r600_atom *atom,
-		      struct r600_atom **list_elem)
-{
-	atom->id = list_elem - sctx->atoms.array;
-	*list_elem = atom;
-}
-
-/* Initialize an atom owned by radeonsi.  */
-void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
-		  struct r600_atom **list_elem,
-		  void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
-{
-	atom->emit = emit_func;
-	atom->id = list_elem - sctx->atoms.array;
-	*list_elem = atom;
-}
-
 static unsigned si_map_swizzle(unsigned swizzle)
 {
 	switch (swizzle) {
@@ -83,9 +64,9 @@
  * CB_TARGET_MASK is emitted here to avoid a hang with dual source blending
  * if there is not enough PS outputs.
  */
-static void si_emit_cb_render_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_cb_render_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	/* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
 	 * but you never know. */
@@ -106,7 +87,8 @@
 	    (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
 		cb_target_mask = 0;
 
-	radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, cb_target_mask);
+	radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK,
+				   SI_TRACKED_CB_TARGET_MASK, cb_target_mask);
 
 	/* GFX9: Flush DFSM when CB_TARGET_MASK changes.
 	 * I think we don't have to do anything between IBs.
@@ -130,10 +112,12 @@
 				  blend->blend_enable_4bit & cb_target_mask &&
 				  sctx->framebuffer.nr_samples >= 2;
 
-		radeon_set_context_reg(cs, R_028424_CB_DCC_CONTROL,
-				       S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
-				       S_028424_OVERWRITE_COMBINER_WATERMARK(4) |
-				       S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable));
+		radeon_opt_set_context_reg(
+				sctx, R_028424_CB_DCC_CONTROL,
+				SI_TRACKED_CB_DCC_CONTROL,
+				S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(1) |
+				S_028424_OVERWRITE_COMBINER_WATERMARK(4) |
+				S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable));
 	}
 
 	/* RB+ register settings. */
@@ -146,8 +130,8 @@
 		unsigned sx_blend_opt_control = 0;
 
 		for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
-			struct r600_surface *surf =
-				(struct r600_surface*)sctx->framebuffer.state.cbufs[i];
+			struct si_surface *surf =
+				(struct si_surface*)sctx->framebuffer.state.cbufs[i];
 			unsigned format, swap, spi_format, colormask;
 			bool has_alpha, has_rgb;
 
@@ -261,10 +245,11 @@
 			}
 		}
 
-		radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 3);
-		radeon_emit(cs, sx_ps_downconvert);	/* R_028754_SX_PS_DOWNCONVERT */
-		radeon_emit(cs, sx_blend_opt_epsilon);	/* R_028758_SX_BLEND_OPT_EPSILON */
-		radeon_emit(cs, sx_blend_opt_control);	/* R_02875C_SX_BLEND_OPT_CONTROL */
+		/* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
+		radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
+					    SI_TRACKED_SX_PS_DOWNCONVERT,
+					    sx_ps_downconvert, sx_blend_opt_epsilon,
+					    sx_blend_opt_control);
 	}
 }
 
@@ -672,7 +657,7 @@
 	    (old_blend->blend_enable_4bit != blend->blend_enable_4bit &&
 	     sctx->framebuffer.nr_samples >= 2 &&
 	     sctx->screen->dcc_msaa_allowed))
-		si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
 
 	if (!old_blend ||
 	    old_blend->cb_target_mask != blend->cb_target_mask ||
@@ -688,7 +673,7 @@
 	     old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
 	     old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
 	     old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
-		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
 	if (sctx->screen->has_out_of_order_rast &&
 	    (!old_blend ||
@@ -696,7 +681,7 @@
 	      old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
 	      old_blend->commutative_4bit != blend->commutative_4bit ||
 	      old_blend->logicop_enable != blend->logicop_enable)))
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_blend_state(struct pipe_context *ctx, void *state)
@@ -713,12 +698,12 @@
 
 	sctx->blend_color.state = *state;
 	sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-	si_mark_atom_dirty(sctx, &sctx->blend_color.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
 }
 
-static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_blend_color(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
 	radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
@@ -740,7 +725,7 @@
 
 	sctx->clip_state.state = *state;
 	sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
-	si_mark_atom_dirty(sctx, &sctx->clip_state.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);
 
 	cb.buffer = NULL;
 	cb.user_buffer = state->ucp;
@@ -750,17 +735,16 @@
 	pipe_resource_reference(&cb.buffer, NULL);
 }
 
-static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_clip_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
 	radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
 }
 
-static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_clip_regs(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct si_shader_selector *vs_sel = vs->selector;
 	struct tgsi_shader_info *info = &vs_sel->info;
@@ -788,12 +772,14 @@
 	clipdist_mask &= rs->clip_plane_enable;
 	culldist_mask |= clipdist_mask;
 
-	radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
+	radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
+		SI_TRACKED_PA_CL_VS_OUT_CNTL,
 		vs_sel->pa_cl_vs_out_cntl |
 		S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
 		S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
 		clipdist_mask | (culldist_mask << 8));
-	radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
+	radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
+		SI_TRACKED_PA_CL_CLIP_CNTL,
 		rs->pa_cl_clip_cntl |
 		ucp_mask |
 		S_028810_CLIP_DISABLE(window_space));
@@ -1006,12 +992,12 @@
 		return;
 
 	if (!old_rs || old_rs->multisample_enable != rs->multisample_enable) {
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 		/* Update the small primitive filter workaround if necessary. */
 		if (sctx->screen->has_msaa_sample_loc_bug &&
 		    sctx->framebuffer.nr_samples > 1)
-			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
 	}
 
 	sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
@@ -1021,23 +1007,26 @@
 	si_update_poly_offset_state(sctx);
 
 	if (!old_rs ||
-	    (old_rs->scissor_enable != rs->scissor_enable ||
-	     old_rs->line_width != rs->line_width ||
-	     old_rs->max_point_size != rs->max_point_size)) {
+	    old_rs->scissor_enable != rs->scissor_enable) {
 		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(sctx, &sctx->scissors.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
 	}
 
 	if (!old_rs ||
+	    old_rs->line_width != rs->line_width ||
+	    old_rs->max_point_size != rs->max_point_size)
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
+
+	if (!old_rs ||
 	    old_rs->clip_halfz != rs->clip_halfz) {
 		sctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(sctx, &sctx->viewports.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);
 	}
 
 	if (!old_rs ||
 	    old_rs->clip_plane_enable != rs->clip_plane_enable ||
 	    old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
-		si_mark_atom_dirty(sctx, &sctx->clip_regs);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 
 	sctx->ia_multi_vgt_param_key.u.line_stipple_enabled =
 		rs->line_stipple_enable;
@@ -1072,9 +1061,9 @@
 /*
  * infeered state between dsa and stencil ref
  */
-static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_stencil_ref(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
 	struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
 
@@ -1098,7 +1087,7 @@
 		return;
 
 	sctx->stencil_ref.state = *state;
-	si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
 }
 
 
@@ -1286,7 +1275,7 @@
 	if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
 		   sizeof(struct si_dsa_stencil_ref_part)) != 0) {
 		sctx->stencil_ref.dsa_part = dsa->stencil_ref;
-		si_mark_atom_dirty(sctx, &sctx->stencil_ref.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
 	}
 
 	if (!old_dsa || old_dsa->alpha_func != dsa->alpha_func)
@@ -1297,13 +1286,13 @@
 	     (old_dsa->depth_enabled != dsa->depth_enabled ||
 	      old_dsa->stencil_enabled != dsa->stencil_enabled ||
 	      old_dsa->db_can_write != dsa->db_can_write)))
-		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
 	if (sctx->screen->has_out_of_order_rast &&
 	    (!old_dsa ||
 	     memcmp(old_dsa->order_invariance, dsa->order_invariance,
 		    sizeof(old_dsa->order_invariance))))
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
@@ -1337,19 +1326,19 @@
 	/* Occlusion queries. */
 	if (sctx->occlusion_queries_disabled != !enable) {
 		sctx->occlusion_queries_disabled = !enable;
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 	}
 }
 
 void si_set_occlusion_query_state(struct si_context *sctx,
 				  bool old_perfect_enable)
 {
-	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 	bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
 
 	if (perfect_enable != old_perfect_enable)
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 }
 
 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
@@ -1360,30 +1349,27 @@
 	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
 }
 
-static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
+static void si_emit_db_render_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
-	unsigned db_shader_control;
-
-	radeon_set_context_reg_seq(cs, R_028000_DB_RENDER_CONTROL, 2);
+	unsigned db_shader_control, db_render_control, db_count_control;
 
 	/* DB_RENDER_CONTROL */
 	if (sctx->dbcb_depth_copy_enabled ||
 	    sctx->dbcb_stencil_copy_enabled) {
-		radeon_emit(cs,
-			    S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
-			    S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
-			    S_028000_COPY_CENTROID(1) |
-			    S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample));
+		db_render_control =
+			S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
+			S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
+			S_028000_COPY_CENTROID(1) |
+			S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
 	} else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
-		radeon_emit(cs,
-			    S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
-			    S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace));
+		db_render_control =
+			S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
+			S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
 	} else {
-		radeon_emit(cs,
-			    S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
-			    S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear));
+		db_render_control =
+			S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
+			S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
 	}
 
 	/* DB_COUNT_CONTROL (occlusion queries) */
@@ -1392,28 +1378,41 @@
 		bool perfect = sctx->num_perfect_occlusion_queries > 0;
 
 		if (sctx->chip_class >= CIK) {
-			radeon_emit(cs,
-				    S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-				    S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
-				    S_028004_ZPASS_ENABLE(1) |
-				    S_028004_SLICE_EVEN_ENABLE(1) |
-				    S_028004_SLICE_ODD_ENABLE(1));
+			unsigned log_sample_rate = sctx->framebuffer.log_samples;
+
+			/* Stoney doesn't increment occlusion query counters
+			 * if the sample rate is 16x. Use 8x sample rate instead.
+			 */
+			if (sctx->family == CHIP_STONEY)
+				log_sample_rate = MIN2(log_sample_rate, 3);
+
+			db_count_control =
+				S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+				S_028004_SAMPLE_RATE(log_sample_rate) |
+				S_028004_ZPASS_ENABLE(1) |
+				S_028004_SLICE_EVEN_ENABLE(1) |
+				S_028004_SLICE_ODD_ENABLE(1);
 		} else {
-			radeon_emit(cs,
-				    S_028004_PERFECT_ZPASS_COUNTS(perfect) |
-				    S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples));
+			db_count_control =
+				S_028004_PERFECT_ZPASS_COUNTS(perfect) |
+				S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
 		}
 	} else {
 		/* Disable occlusion queries. */
 		if (sctx->chip_class >= CIK) {
-			radeon_emit(cs, 0);
+			db_count_control = 0;
 		} else {
-			radeon_emit(cs, S_028004_ZPASS_INCREMENT_DISABLE(1));
+			db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
 		}
 	}
 
+	radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
+				    SI_TRACKED_DB_RENDER_CONTROL, db_render_control,
+				    db_count_control);
+
 	/* DB_RENDER_OVERRIDE2 */
-	radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,
+	radeon_opt_set_context_reg(sctx,  R_028010_DB_RENDER_OVERRIDE2,
+		SI_TRACKED_DB_RENDER_OVERRIDE2,
 		S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
 		S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
 		S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
@@ -1427,15 +1426,15 @@
 	}
 
 	/* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
-	if (!rs || !rs->multisample_enable)
+	if (!rs->multisample_enable)
 		db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
 
 	if (sctx->screen->has_rbplus &&
 	    !sctx->screen->rbplus_allowed)
 		db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
 
-	radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,
-			       db_shader_control);
+	radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
+				   SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
 }
 
 /*
@@ -1592,9 +1591,6 @@
 				       int first_non_void)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	bool enable_compressed_formats = (sscreen->info.drm_major == 2 &&
-					  sscreen->info.drm_minor >= 31) ||
-					 sscreen->info.drm_major == 3;
 	bool uniform = true;
 	int i;
 
@@ -1649,7 +1645,7 @@
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
-		if (!enable_compressed_formats)
+		if (!sscreen->info.has_format_bc1_through_bc7)
 			goto out_unknown;
 
 		switch (format) {
@@ -1695,7 +1691,7 @@
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
-		if (!enable_compressed_formats)
+		if (!sscreen->info.has_format_bc1_through_bc7)
 			goto out_unknown;
 
 		switch (format) {
@@ -1724,7 +1720,7 @@
 	}
 
 	if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
-		if (!enable_compressed_formats)
+		if (!sscreen->info.has_format_bc1_through_bc7)
 			goto out_unknown;
 
 		switch (format) {
@@ -1909,10 +1905,10 @@
 	}
 }
 
-static unsigned si_tex_dim(struct si_screen *sscreen, struct r600_texture *rtex,
+static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex,
 			   unsigned view_target, unsigned nr_samples)
 {
-	unsigned res_target = rtex->resource.b.b.target;
+	unsigned res_target = tex->buffer.b.b.target;
 
 	if (view_target == PIPE_TEXTURE_CUBE ||
 	    view_target == PIPE_TEXTURE_CUBE_ARRAY)
@@ -1926,7 +1922,7 @@
 	if ((res_target == PIPE_TEXTURE_1D ||
 	     res_target == PIPE_TEXTURE_1D_ARRAY) &&
 	    sscreen->info.chip_class >= GFX9 &&
-	    rtex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
+	    tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
 		if (res_target == PIPE_TEXTURE_1D)
 			res_target = PIPE_TEXTURE_2D;
 		else
@@ -2136,8 +2132,10 @@
 				      enum pipe_format format,
 				      enum pipe_texture_target target,
 				      unsigned sample_count,
+				      unsigned storage_sample_count,
 				      unsigned usage)
 {
+	struct si_screen *sscreen = (struct si_screen *)screen;
 	unsigned retval = 0;
 
 	if (target >= PIPE_MAX_TEXTURE_TYPES) {
@@ -2145,7 +2143,7 @@
 		return false;
 	}
 
-	if (!util_format_is_supported(format, usage))
+	if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
 		return false;
 
 	if (sample_count > 1) {
@@ -2155,18 +2153,26 @@
 		if (usage & PIPE_BIND_SHADER_IMAGE)
 			return false;
 
-		switch (sample_count) {
-		case 2:
-		case 4:
-		case 8:
-			break;
-		case 16:
-			if (format == PIPE_FORMAT_NONE)
-				return true;
-			else
-				return false;
-		default:
+		/* Only power-of-two sample counts are supported. */
+		if (!util_is_power_of_two_or_zero(sample_count) ||
+		    !util_is_power_of_two_or_zero(storage_sample_count))
 			return false;
+
+		/* MSAA support without framebuffer attachments. */
+		if (format == PIPE_FORMAT_NONE && sample_count <= 16)
+			return true;
+
+		if (!sscreen->info.has_eqaa_surface_allocator ||
+		    util_format_is_depth_or_stencil(format)) {
+			/* Color without EQAA or depth/stencil. */
+			if (sample_count > 8 ||
+			    sample_count != storage_sample_count)
+				return false;
+		} else {
+			/* Color with EQAA. */
+			if (sample_count > 16 ||
+			    storage_sample_count > 8)
+				return false;
 		}
 	}
 
@@ -2221,7 +2227,7 @@
  * framebuffer handling
  */
 
-static void si_choose_spi_color_formats(struct r600_surface *surf,
+static void si_choose_spi_color_formats(struct si_surface *surf,
 					unsigned format, unsigned swap,
 					unsigned ntype, bool is_depth)
 {
@@ -2339,9 +2345,9 @@
 }
 
 static void si_initialize_color_surface(struct si_context *sctx,
-					struct r600_surface *surf)
+					struct si_surface *surf)
 {
-	struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
+	struct si_texture *tex = (struct si_texture*)surf->base.texture;
 	unsigned color_info, color_attrib;
 	unsigned format, swap, ntype, endian;
 	const struct util_format_description *desc;
@@ -2427,15 +2433,16 @@
 	color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
 						  util_format_is_intensity(surf->base.format));
 
-	if (rtex->resource.b.b.nr_samples > 1) {
-		unsigned log_samples = util_logbase2(rtex->resource.b.b.nr_samples);
+	if (tex->buffer.b.b.nr_samples > 1) {
+		unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
+		unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
 
 		color_attrib |= S_028C74_NUM_SAMPLES(log_samples) |
-				S_028C74_NUM_FRAGMENTS(log_samples);
+				S_028C74_NUM_FRAGMENTS(log_fragments);
 
-		if (rtex->fmask.size) {
+		if (tex->surface.fmask_size) {
 			color_info |= S_028C70_COMPRESSION(1);
-			unsigned fmask_bankh = util_logbase2(rtex->fmask.bank_height);
+			unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
 
 			if (sctx->chip_class == SI) {
 				/* due to a hw bug, FMASK_BANK_HEIGHT must be set on SI too */
@@ -2455,10 +2462,10 @@
 		if (!sctx->screen->info.has_dedicated_vram)
 			min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
 
-		if (rtex->resource.b.b.nr_samples > 1) {
-			if (rtex->surface.bpe == 1)
+		if (tex->buffer.b.b.nr_storage_samples > 1) {
+			if (tex->surface.bpe == 1)
 				max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
-			else if (rtex->surface.bpe == 2)
+			else if (tex->surface.bpe == 2)
 				max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
 		}
 
@@ -2468,8 +2475,8 @@
 	}
 
 	/* This must be set for fast clear to work without FMASK. */
-	if (!rtex->fmask.size && sctx->chip_class == SI) {
-		unsigned bankh = util_logbase2(rtex->surface.u.legacy.bankh);
+	if (!tex->surface.fmask_size && sctx->chip_class == SI) {
+		unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
 		color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
 	}
 
@@ -2477,14 +2484,14 @@
 			      S_028C6C_SLICE_MAX(surf->base.u.tex.last_layer);
 
 	if (sctx->chip_class >= GFX9) {
-		unsigned mip0_depth = util_max_layer(&rtex->resource.b.b, 0);
+		unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
 
 		color_view |= S_028C6C_MIP_LEVEL(surf->base.u.tex.level);
 		color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
-				S_028C74_RESOURCE_TYPE(rtex->surface.u.gfx9.resource_type);
+				S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
 		surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
 					 S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
-					 S_028C68_MAX_MIP(rtex->resource.b.b.last_level);
+					 S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
 	}
 
 	surf->cb_color_view = color_view;
@@ -2492,26 +2499,26 @@
 	surf->cb_color_attrib = color_attrib;
 
 	/* Determine pixel shader export format */
-	si_choose_spi_color_formats(surf, format, swap, ntype, rtex->is_depth);
+	si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
 
 	surf->color_initialized = true;
 }
 
 static void si_init_depth_surface(struct si_context *sctx,
-				  struct r600_surface *surf)
+				  struct si_surface *surf)
 {
-	struct r600_texture *rtex = (struct r600_texture*)surf->base.texture;
+	struct si_texture *tex = (struct si_texture*)surf->base.texture;
 	unsigned level = surf->base.u.tex.level;
 	unsigned format, stencil_format;
 	uint32_t z_info, s_info;
 
-	format = si_translate_dbformat(rtex->db_render_format);
-	stencil_format = rtex->surface.has_stencil ?
+	format = si_translate_dbformat(tex->db_render_format);
+	stencil_format = tex->surface.has_stencil ?
 				 V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
 
 	assert(format != V_028040_Z_INVALID);
 	if (format == V_028040_Z_INVALID)
-		PRINT_ERR("Invalid DB format: %d, disabling DB.\n", rtex->resource.b.b.format);
+		PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
 
 	surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
 			      S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
@@ -2519,31 +2526,31 @@
 	surf->db_htile_surface = 0;
 
 	if (sctx->chip_class >= GFX9) {
-		assert(rtex->surface.u.gfx9.surf_offset == 0);
-		surf->db_depth_base = rtex->resource.gpu_address >> 8;
-		surf->db_stencil_base = (rtex->resource.gpu_address +
-					 rtex->surface.u.gfx9.stencil_offset) >> 8;
+		assert(tex->surface.u.gfx9.surf_offset == 0);
+		surf->db_depth_base = tex->buffer.gpu_address >> 8;
+		surf->db_stencil_base = (tex->buffer.gpu_address +
+					 tex->surface.u.gfx9.stencil_offset) >> 8;
 		z_info = S_028038_FORMAT(format) |
-			 S_028038_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples)) |
-			 S_028038_SW_MODE(rtex->surface.u.gfx9.surf.swizzle_mode) |
-			 S_028038_MAXMIP(rtex->resource.b.b.last_level);
+			 S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
+			 S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
+			 S_028038_MAXMIP(tex->buffer.b.b.last_level);
 		s_info = S_02803C_FORMAT(stencil_format) |
-			 S_02803C_SW_MODE(rtex->surface.u.gfx9.stencil.swizzle_mode);
-		surf->db_z_info2 = S_028068_EPITCH(rtex->surface.u.gfx9.surf.epitch);
-		surf->db_stencil_info2 = S_02806C_EPITCH(rtex->surface.u.gfx9.stencil.epitch);
+			 S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
+		surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
+		surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
 		surf->db_depth_view |= S_028008_MIPID(level);
-		surf->db_depth_size = S_02801C_X_MAX(rtex->resource.b.b.width0 - 1) |
-				      S_02801C_Y_MAX(rtex->resource.b.b.height0 - 1);
+		surf->db_depth_size = S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) |
+				      S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
 
-		if (si_htile_enabled(rtex, level)) {
+		if (si_htile_enabled(tex, level)) {
 			z_info |= S_028038_TILE_SURFACE_ENABLE(1) |
 				  S_028038_ALLOW_EXPCLEAR(1);
 
-			if (rtex->tc_compatible_htile) {
+			if (tex->tc_compatible_htile) {
 				unsigned max_zplanes = 4;
 
-				if (rtex->db_render_format == PIPE_FORMAT_Z16_UNORM &&
-				    rtex->resource.b.b.nr_samples > 1)
+				if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM &&
+				    tex->buffer.b.b.nr_samples > 1)
 					max_zplanes = 2;
 
 				z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1) |
@@ -2551,43 +2558,43 @@
 				s_info |= S_02803C_ITERATE_FLUSH(1);
 			}
 
-			if (rtex->surface.has_stencil) {
+			if (tex->surface.has_stencil) {
 				/* Stencil buffer workaround ported from the SI-CI-VI code.
 				 * See that for explanation.
 				 */
-				s_info |= S_02803C_ALLOW_EXPCLEAR(rtex->resource.b.b.nr_samples <= 1);
+				s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
 			} else {
 				/* Use all HTILE for depth if there's no stencil. */
 				s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
 			}
 
-			surf->db_htile_data_base = (rtex->resource.gpu_address +
-						    rtex->htile_offset) >> 8;
+			surf->db_htile_data_base = (tex->buffer.gpu_address +
+						    tex->htile_offset) >> 8;
 			surf->db_htile_surface = S_028ABC_FULL_CACHE(1) |
-						 S_028ABC_PIPE_ALIGNED(rtex->surface.u.gfx9.htile.pipe_aligned) |
-						 S_028ABC_RB_ALIGNED(rtex->surface.u.gfx9.htile.rb_aligned);
+						 S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned) |
+						 S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
 		}
 	} else {
 		/* SI-CI-VI */
-		struct legacy_surf_level *levelinfo = &rtex->surface.u.legacy.level[level];
+		struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
 
 		assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
 
-		surf->db_depth_base = (rtex->resource.gpu_address +
-				       rtex->surface.u.legacy.level[level].offset) >> 8;
-		surf->db_stencil_base = (rtex->resource.gpu_address +
-					 rtex->surface.u.legacy.stencil_level[level].offset) >> 8;
+		surf->db_depth_base = (tex->buffer.gpu_address +
+				       tex->surface.u.legacy.level[level].offset) >> 8;
+		surf->db_stencil_base = (tex->buffer.gpu_address +
+					 tex->surface.u.legacy.stencil_level[level].offset) >> 8;
 
 		z_info = S_028040_FORMAT(format) |
-			 S_028040_NUM_SAMPLES(util_logbase2(rtex->resource.b.b.nr_samples));
+			 S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
 		s_info = S_028044_FORMAT(stencil_format);
-		surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!rtex->tc_compatible_htile);
+		surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
 
 		if (sctx->chip_class >= CIK) {
 			struct radeon_info *info = &sctx->screen->info;
-			unsigned index = rtex->surface.u.legacy.tiling_index[level];
-			unsigned stencil_index = rtex->surface.u.legacy.stencil_tiling_index[level];
-			unsigned macro_index = rtex->surface.u.legacy.macro_tile_index;
+			unsigned index = tex->surface.u.legacy.tiling_index[level];
+			unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
+			unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
 			unsigned tile_mode = info->si_tile_mode_array[index];
 			unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
 			unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
@@ -2602,9 +2609,9 @@
 			z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
 			s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
 		} else {
-			unsigned tile_mode_index = si_tile_mode_index(rtex, level, false);
+			unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
 			z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
-			tile_mode_index = si_tile_mode_index(rtex, level, true);
+			tile_mode_index = si_tile_mode_index(tex, level, true);
 			s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
 		}
 
@@ -2613,11 +2620,11 @@
 		surf->db_depth_slice = S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x *
 								levelinfo->nblk_y) / 64 - 1);
 
-		if (si_htile_enabled(rtex, level)) {
+		if (si_htile_enabled(tex, level)) {
 			z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
 				  S_028040_ALLOW_EXPCLEAR(1);
 
-			if (rtex->surface.has_stencil) {
+			if (tex->surface.has_stencil) {
 				/* Workaround: For a not yet understood reason, the
 				 * combination of MSAA, fast stencil clear and stencil
 				 * decompress messes with subsequent stencil buffer
@@ -2629,9 +2636,9 @@
 				 * Check piglit's arb_texture_multisample-stencil-clear
 				 * test if you want to try changing this.
 				 */
-				if (rtex->resource.b.b.nr_samples <= 1)
+				if (tex->buffer.b.b.nr_samples <= 1)
 					s_info |= S_028044_ALLOW_EXPCLEAR(1);
-			} else if (!rtex->tc_compatible_htile) {
+			} else if (!tex->tc_compatible_htile) {
 				/* Use all of the htile_buffer for depth if there's no stencil.
 				 * This must not be set when TC-compatible HTILE is enabled
 				 * due to a hw bug.
@@ -2639,16 +2646,17 @@
 				s_info |= S_028044_TILE_STENCIL_DISABLE(1);
 			}
 
-			surf->db_htile_data_base = (rtex->resource.gpu_address +
-						    rtex->htile_offset) >> 8;
+			surf->db_htile_data_base = (tex->buffer.gpu_address +
+						    tex->htile_offset) >> 8;
 			surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
 
-			if (rtex->tc_compatible_htile) {
+			if (tex->tc_compatible_htile) {
 				surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
 
-				if (rtex->resource.b.b.nr_samples <= 1)
+				/* 0 = full compression. N = only compress up to N-1 Z planes. */
+				if (tex->buffer.b.b.nr_samples <= 1)
 					z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
-				else if (rtex->resource.b.b.nr_samples <= 4)
+				else if (tex->buffer.b.b.nr_samples <= 4)
 					z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
 				else
 					z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
@@ -2669,39 +2677,39 @@
 
 	if (sctx->framebuffer.state.zsbuf) {
 		struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
-		struct r600_texture *rtex = (struct r600_texture *)surf->texture;
+		struct si_texture *tex = (struct si_texture *)surf->texture;
 
-		rtex->dirty_level_mask |= 1 << surf->u.tex.level;
+		tex->dirty_level_mask |= 1 << surf->u.tex.level;
 
-		if (rtex->surface.has_stencil)
-			rtex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
+		if (tex->surface.has_stencil)
+			tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
 	}
 
 	unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
 	while (compressed_cb_mask) {
 		unsigned i = u_bit_scan(&compressed_cb_mask);
 		struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
-		struct r600_texture *rtex = (struct r600_texture*)surf->texture;
+		struct si_texture *tex = (struct si_texture*)surf->texture;
 
-		if (rtex->fmask.size)
-			rtex->dirty_level_mask |= 1 << surf->u.tex.level;
-		if (rtex->dcc_gather_statistics)
-			rtex->separate_dcc_dirty = true;
+		if (tex->surface.fmask_size)
+			tex->dirty_level_mask |= 1 << surf->u.tex.level;
+		if (tex->dcc_gather_statistics)
+			tex->separate_dcc_dirty = true;
 	}
 }
 
 static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
 {
 	for (int i = 0; i < state->nr_cbufs; ++i) {
-		struct r600_surface *surf = NULL;
-		struct r600_texture *rtex;
+		struct si_surface *surf = NULL;
+		struct si_texture *tex;
 
 		if (!state->cbufs[i])
 			continue;
-		surf = (struct r600_surface*)state->cbufs[i];
-		rtex = (struct r600_texture*)surf->base.texture;
+		surf = (struct si_surface*)state->cbufs[i];
+		tex = (struct si_texture*)surf->base.texture;
 
-		p_atomic_dec(&rtex->framebuffers_bound);
+		p_atomic_dec(&tex->framebuffers_bound);
 	}
 }
 
@@ -2710,15 +2718,15 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct pipe_constant_buffer constbuf = {0};
-	struct r600_surface *surf = NULL;
-	struct r600_texture *rtex;
+	struct si_surface *surf = NULL;
+	struct si_texture *tex;
 	bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
 	unsigned old_nr_samples = sctx->framebuffer.nr_samples;
 	unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
 	bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
 	bool old_has_stencil =
 		old_has_zsbuf &&
-		((struct r600_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
+		((struct si_texture*)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
 	bool unbound = false;
 	int i;
 
@@ -2728,9 +2736,9 @@
 		if (!sctx->framebuffer.state.cbufs[i])
 			continue;
 
-		rtex = (struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-		if (rtex->dcc_gather_statistics)
-			vi_separate_dcc_stop_query(sctx, rtex);
+		tex = (struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+		if (tex->dcc_gather_statistics)
+			vi_separate_dcc_stop_query(sctx, tex);
 	}
 
 	/* Disable DCC if the formats are incompatible. */
@@ -2738,8 +2746,8 @@
 		if (!state->cbufs[i])
 			continue;
 
-		surf = (struct r600_surface*)state->cbufs[i];
-		rtex = (struct r600_texture*)surf->base.texture;
+		surf = (struct si_surface*)state->cbufs[i];
+		tex = (struct si_texture*)surf->base.texture;
 
 		if (!surf->dcc_incompatible)
 			continue;
@@ -2754,9 +2762,9 @@
 			unbound = true;
 		}
 
-		if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
-			if (!si_texture_disable_dcc(sctx, rtex))
-				si_decompress_dcc(sctx, rtex);
+		if (vi_dcc_enabled(tex, surf->base.u.tex.level))
+			if (!si_texture_disable_dcc(sctx, tex))
+				si_decompress_dcc(sctx, tex);
 
 		surf->dcc_incompatible = false;
 	}
@@ -2824,6 +2832,7 @@
 	sctx->framebuffer.compressed_cb_mask = 0;
 	sctx->framebuffer.uncompressed_cb_mask = 0;
 	sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
+	sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
 	sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
 	sctx->framebuffer.any_dst_linear = false;
 	sctx->framebuffer.CB_has_shader_readable_metadata = false;
@@ -2833,8 +2842,8 @@
 		if (!state->cbufs[i])
 			continue;
 
-		surf = (struct r600_surface*)state->cbufs[i];
-		rtex = (struct r600_texture*)surf->base.texture;
+		surf = (struct si_surface*)state->cbufs[i];
+		tex = (struct si_texture*)surf->base.texture;
 
 		if (!surf->color_initialized) {
 			si_initialize_color_surface(sctx, surf);
@@ -2855,33 +2864,45 @@
 		if (surf->color_is_int10)
 			sctx->framebuffer.color_is_int10 |= 1 << i;
 
-		if (rtex->fmask.size)
+		if (tex->surface.fmask_size)
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
 		else
 			sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
 
-		if (rtex->surface.is_linear)
+		/* Don't update nr_color_samples for non-AA buffers.
+		 * (e.g. destination of MSAA resolve)
+		 */
+		if (tex->buffer.b.b.nr_samples >= 2 &&
+		    tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
+			sctx->framebuffer.nr_color_samples =
+				MIN2(sctx->framebuffer.nr_color_samples,
+				     tex->buffer.b.b.nr_storage_samples);
+			sctx->framebuffer.nr_color_samples =
+				MAX2(1, sctx->framebuffer.nr_color_samples);
+		}
+
+		if (tex->surface.is_linear)
 			sctx->framebuffer.any_dst_linear = true;
 
-		if (vi_dcc_enabled(rtex, surf->base.u.tex.level))
+		if (vi_dcc_enabled(tex, surf->base.u.tex.level))
 			sctx->framebuffer.CB_has_shader_readable_metadata = true;
 
 		si_context_add_resource_size(sctx, surf->base.texture);
 
-		p_atomic_inc(&rtex->framebuffers_bound);
+		p_atomic_inc(&tex->framebuffers_bound);
 
-		if (rtex->dcc_gather_statistics) {
+		if (tex->dcc_gather_statistics) {
 			/* Dirty tracking must be enabled for DCC usage analysis. */
 			sctx->framebuffer.compressed_cb_mask |= 1 << i;
-			vi_separate_dcc_start_query(sctx, rtex);
+			vi_separate_dcc_start_query(sctx, tex);
 		}
 	}
 
-	struct r600_texture *zstex = NULL;
+	struct si_texture *zstex = NULL;
 
 	if (state->zsbuf) {
-		surf = (struct r600_surface*)state->zsbuf;
-		zstex = (struct r600_texture*)surf->base.texture;
+		surf = (struct si_surface*)state->zsbuf;
+		zstex = (struct si_texture*)surf->base.texture;
 
 		if (!surf->depth_initialized) {
 			si_init_depth_surface(sctx, surf);
@@ -2895,24 +2916,24 @@
 
 	si_update_ps_colorbuf0_slot(sctx);
 	si_update_poly_offset_state(sctx);
-	si_mark_atom_dirty(sctx, &sctx->cb_render_state);
-	si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
 
 	if (sctx->screen->dpbb_allowed)
-		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 
 	if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
 	if (sctx->screen->has_out_of_order_rast &&
 	    (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
 	     !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
 	     (zstex && zstex->surface.has_stencil != old_has_stencil)))
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
 	if (sctx->framebuffer.nr_samples != old_nr_samples) {
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 		/* Set sample locations as fragment shader constants. */
 		switch (sctx->framebuffer.nr_samples) {
@@ -2939,7 +2960,7 @@
 		constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
 		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
 
-		si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
 	}
 
 	sctx->do_update_shaders = true;
@@ -2952,59 +2973,63 @@
 	}
 }
 
-static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_framebuffer_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
 	unsigned i, nr_cbufs = state->nr_cbufs;
-	struct r600_texture *tex = NULL;
-	struct r600_surface *cb = NULL;
+	struct si_texture *tex = NULL;
+	struct si_surface *cb = NULL;
 	unsigned cb_color_info = 0;
 
 	/* Colorbuffers. */
 	for (i = 0; i < nr_cbufs; i++) {
-		uint64_t cb_color_base, cb_color_fmask, cb_dcc_base;
+		uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
 		unsigned cb_color_attrib;
 
 		if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
 			continue;
 
-		cb = (struct r600_surface*)state->cbufs[i];
+		cb = (struct si_surface*)state->cbufs[i];
 		if (!cb) {
 			radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
 					       S_028C70_FORMAT(V_028C70_COLOR_INVALID));
 			continue;
 		}
 
-		tex = (struct r600_texture *)cb->base.texture;
+		tex = (struct si_texture *)cb->base.texture;
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      &tex->resource, RADEON_USAGE_READWRITE,
-				      tex->resource.b.b.nr_samples > 1 ?
+				      &tex->buffer, RADEON_USAGE_READWRITE,
+				      tex->buffer.b.b.nr_samples > 1 ?
 					      RADEON_PRIO_COLOR_BUFFER_MSAA :
 					      RADEON_PRIO_COLOR_BUFFER);
 
-		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
+		if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
 			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
-				RADEON_PRIO_CMASK);
+				RADEON_PRIO_SEPARATE_META);
 		}
 
 		if (tex->dcc_separate_buffer)
 			radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
 						  tex->dcc_separate_buffer,
 						  RADEON_USAGE_READWRITE,
-						  RADEON_PRIO_DCC);
+						  RADEON_PRIO_SEPARATE_META);
 
 		/* Compute mutable surface parameters. */
-		cb_color_base = tex->resource.gpu_address >> 8;
+		cb_color_base = tex->buffer.gpu_address >> 8;
 		cb_color_fmask = 0;
+		cb_color_cmask = tex->cmask_base_address_reg;
 		cb_dcc_base = 0;
 		cb_color_info = cb->cb_color_info | tex->cb_color_info;
 		cb_color_attrib = cb->cb_color_attrib;
 
-		if (tex->fmask.size) {
-			cb_color_fmask = (tex->resource.gpu_address + tex->fmask.offset) >> 8;
-			cb_color_fmask |= tex->fmask.tile_swizzle;
+		if (cb->base.u.tex.level > 0)
+			cb_color_info &= C_028C70_FAST_CLEAR;
+
+		if (tex->surface.fmask_size) {
+			cb_color_fmask = (tex->buffer.gpu_address + tex->fmask_offset) >> 8;
+			cb_color_fmask |= tex->surface.fmask_tile_swizzle;
 		}
 
 		/* Set up DCC. */
@@ -3017,7 +3042,7 @@
 			if (!is_msaa_resolve_dst)
 				cb_color_info |= S_028C70_DCC_ENABLE(1);
 
-			cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->resource.gpu_address : 0) +
+			cb_dcc_base = ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) +
 				       tex->dcc_offset) >> 8;
 			cb_dcc_base |= tex->surface.tile_swizzle;
 		}
@@ -3033,8 +3058,10 @@
 			/* Set mutable surface parameters. */
 			cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
 			cb_color_base |= tex->surface.tile_swizzle;
-			if (!tex->fmask.size)
+			if (!tex->surface.fmask_size)
 				cb_color_fmask = cb_color_base;
+			if (cb->base.u.tex.level > 0)
+				cb_color_cmask = cb_color_base;
 			cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
 					   S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
 					   S_028C74_RB_ALIGNED(meta.rb_aligned) |
@@ -3048,8 +3075,8 @@
 			radeon_emit(cs, cb_color_info);		/* CB_COLOR0_INFO */
 			radeon_emit(cs, cb_color_attrib);	/* CB_COLOR0_ATTRIB */
 			radeon_emit(cs, cb->cb_dcc_control);	/* CB_COLOR0_DCC_CONTROL */
-			radeon_emit(cs, tex->cmask.base_address_reg); /* CB_COLOR0_CMASK */
-			radeon_emit(cs, S_028C80_BASE_256B(tex->cmask.base_address_reg >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
+			radeon_emit(cs, cb_color_cmask);	/* CB_COLOR0_CMASK */
+			radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
 			radeon_emit(cs, cb_color_fmask);	/* CB_COLOR0_FMASK */
 			radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
 			radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
@@ -3071,8 +3098,10 @@
 			if (level_info->mode == RADEON_SURF_MODE_2D)
 				cb_color_base |= tex->surface.tile_swizzle;
 
-			if (!tex->fmask.size)
+			if (!tex->surface.fmask_size)
 				cb_color_fmask = cb_color_base;
+			if (cb->base.u.tex.level > 0)
+				cb_color_cmask = cb_color_base;
 			if (cb_dcc_base)
 				cb_dcc_base += level_info->dcc_offset >> 8;
 
@@ -3085,11 +3114,11 @@
 			cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
 			cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
 
-			if (tex->fmask.size) {
+			if (tex->surface.fmask_size) {
 				if (sctx->chip_class >= CIK)
-					cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->fmask.pitch_in_pixels / 8 - 1);
-				cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->fmask.tile_mode_index);
-				cb_color_fmask_slice = S_028C88_TILE_MAX(tex->fmask.slice_tile_max);
+					cb_color_pitch |= S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
+				cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
+				cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
 			} else {
 				/* This must be set for fast clear to work without FMASK. */
 				if (sctx->chip_class >= CIK)
@@ -3107,8 +3136,8 @@
 			radeon_emit(cs, cb_color_info);		/* CB_COLOR0_INFO */
 			radeon_emit(cs, cb_color_attrib);	/* CB_COLOR0_ATTRIB */
 			radeon_emit(cs, cb->cb_dcc_control);	/* CB_COLOR0_DCC_CONTROL */
-			radeon_emit(cs, tex->cmask.base_address_reg);	/* CB_COLOR0_CMASK */
-			radeon_emit(cs, tex->cmask.slice_tile_max);	/* CB_COLOR0_CMASK_SLICE */
+			radeon_emit(cs, cb_color_cmask);	/* CB_COLOR0_CMASK */
+			radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
 			radeon_emit(cs, cb_color_fmask);		/* CB_COLOR0_FMASK */
 			radeon_emit(cs, cb_color_fmask_slice);		/* CB_COLOR0_FMASK_SLICE */
 			radeon_emit(cs, tex->color_clear_value[0]);	/* CB_COLOR0_CLEAR_WORD0 */
@@ -3124,11 +3153,11 @@
 
 	/* ZS buffer. */
 	if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
-		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
-		struct r600_texture *rtex = (struct r600_texture*)zb->base.texture;
+		struct si_surface *zb = (struct si_surface*)state->zsbuf;
+		struct si_texture *tex = (struct si_texture*)zb->base.texture;
 
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      &rtex->resource, RADEON_USAGE_READWRITE,
+				      &tex->buffer, RADEON_USAGE_READWRITE,
 				      zb->base.texture->nr_samples > 1 ?
 					      RADEON_PRIO_DEPTH_BUFFER_MSAA :
 					      RADEON_PRIO_DEPTH_BUFFER);
@@ -3141,7 +3170,7 @@
 
 			radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
 			radeon_emit(cs, zb->db_z_info |			/* DB_Z_INFO */
-				    S_028038_ZRANGE_PRECISION(rtex->depth_clear_value != 0));
+				    S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
 			radeon_emit(cs, zb->db_stencil_info);		/* DB_STENCIL_INFO */
 			radeon_emit(cs, zb->db_depth_base);		/* DB_Z_READ_BASE */
 			radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
@@ -3161,7 +3190,7 @@
 			radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
 			radeon_emit(cs, zb->db_depth_info);	/* DB_DEPTH_INFO */
 			radeon_emit(cs, zb->db_z_info |		/* DB_Z_INFO */
-				    S_028040_ZRANGE_PRECISION(rtex->depth_clear_value != 0));
+				    S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
 			radeon_emit(cs, zb->db_stencil_info);	/* DB_STENCIL_INFO */
 			radeon_emit(cs, zb->db_depth_base);	/* DB_Z_READ_BASE */
 			radeon_emit(cs, zb->db_stencil_base);	/* DB_STENCIL_READ_BASE */
@@ -3172,8 +3201,8 @@
 		}
 
 		radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
-		radeon_emit(cs, rtex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
-		radeon_emit(cs, fui(rtex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
+		radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
+		radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
 
 		radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
 		radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
@@ -3201,10 +3230,9 @@
 	sctx->framebuffer.dirty_zsbuf = false;
 }
 
-static void si_emit_msaa_sample_locs(struct si_context *sctx,
-				     struct r600_atom *atom)
+static void si_emit_msaa_sample_locs(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned nr_samples = sctx->framebuffer.nr_samples;
 	bool has_msaa_sample_loc_bug = sctx->screen->has_msaa_sample_loc_bug;
 
@@ -3220,8 +3248,8 @@
 	if (has_msaa_sample_loc_bug)
 		nr_samples = MAX2(nr_samples, 1);
 
-	if (nr_samples != sctx->msaa_sample_locs.nr_samples) {
-		sctx->msaa_sample_locs.nr_samples = nr_samples;
+	if (nr_samples != sctx->sample_locs_num_samples) {
+		sctx->sample_locs_num_samples = nr_samples;
 		si_emit_sample_locations(cs, nr_samples);
 	}
 
@@ -3238,11 +3266,13 @@
 		 */
 		if (has_msaa_sample_loc_bug &&
 		    sctx->framebuffer.nr_samples > 1 &&
-		    rs && !rs->multisample_enable)
+		    !rs->multisample_enable)
 			small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
 
-		radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
-				       small_prim_filter_cntl);
+		radeon_opt_set_context_reg(sctx,
+					   R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+					   SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
+					   small_prim_filter_cntl);
 	}
 }
 
@@ -3271,8 +3301,8 @@
 	};
 
 	if (sctx->framebuffer.state.zsbuf) {
-		struct r600_texture *zstex =
-			(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+		struct si_texture *zstex =
+			(struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
 		bool has_stencil = zstex->surface.has_stencil;
 		dsa_order_invariant = dsa->order_invariance[has_stencil];
 		if (!dsa_order_invariant.zs)
@@ -3313,9 +3343,9 @@
 	return true;
 }
 
-static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_msaa_config(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
 	/* 33% faster rendering to linear color buffers */
 	bool dst_is_linear = sctx->framebuffer.any_dst_linear;
@@ -3333,9 +3363,68 @@
 		S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
 		S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
 		S_028A4C_FORCE_EOV_REZ_ENABLE(1);
+	unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
+			   S_028804_INCOHERENT_EQAA_READS(1) |
+			   S_028804_INTERPOLATE_COMP_Z(1) |
+			   S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
+	unsigned coverage_samples, color_samples, z_samples;
 
-	int setup_samples = sctx->framebuffer.nr_samples > 1 ? sctx->framebuffer.nr_samples :
-			    sctx->smoothing_enabled ? SI_NUM_SMOOTH_AA_SAMPLES : 0;
+	/* S: Coverage samples (up to 16x):
+	 * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
+	 * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
+	 *
+	 * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
+	 * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
+	 * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
+	 * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
+	 * # from the closest defined sample if Z is uncompressed (same quality as the number of
+	 * # Z samples).
+	 *
+	 * F: Color samples (up to 8x, must be <= coverage samples):
+	 * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
+	 * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
+	 *
+	 * Can be anything between coverage and color samples:
+	 * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
+	 * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
+	 * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
+	 * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
+	 * # All are currently set the same as coverage samples.
+	 *
+	 * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
+	 * flag for undefined color samples. A shader-based resolve must handle unknowns
+	 * or mask them out with AND. Unknowns can also be guessed from neighbors via
+	 * an edge-detect shader-based resolve, which is required to make "color samples = 1"
+	 * useful. The CB resolve always drops unknowns.
+	 *
+	 * Sensible AA configurations:
+	 *   EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
+	 *   EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
+	 *   EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
+	 *   EQAA  8s 8z 8f = 8x MSAA
+	 *   EQAA  8s 8z 4f - might look the same as 8x MSAA
+	 *   EQAA  8s 8z 2f - might look the same as 8x MSAA with low-density geometry
+	 *   EQAA  8s 4z 4f - might look the same as 8x MSAA if Z is compressed
+	 *   EQAA  8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
+	 *   EQAA  4s 4z 4f = 4x MSAA
+	 *   EQAA  4s 4z 2f - might look the same as 4x MSAA with low-density geometry
+	 *   EQAA  2s 2z 2f = 2x MSAA
+	 */
+	if (sctx->framebuffer.nr_samples > 1) {
+		coverage_samples = sctx->framebuffer.nr_samples;
+		color_samples = sctx->framebuffer.nr_color_samples;
+
+		if (sctx->framebuffer.state.zsbuf) {
+			z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
+			z_samples = MAX2(1, z_samples);
+		} else {
+			z_samples = coverage_samples;
+		}
+	} else if (sctx->smoothing_enabled) {
+		coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+	} else {
+		coverage_samples = color_samples = z_samples = 1;
+	}
 
 	/* Required by OpenGL line rasterization.
 	 *
@@ -3345,8 +3434,9 @@
 	 *       endcaps.
 	 */
 	unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
+	unsigned sc_aa_config = 0;
 
-	if (setup_samples > 1) {
+	if (coverage_samples > 1) {
 		/* distance from the pixel center, indexed by log2(nr_samples) */
 		static unsigned max_dist[] = {
 			0, /* unused */
@@ -3355,49 +3445,38 @@
 			7, /* 8x MSAA */
 			8, /* 16x MSAA */
 		};
-		unsigned log_samples = util_logbase2(setup_samples);
+		unsigned log_samples = util_logbase2(coverage_samples);
+		unsigned log_z_samples = util_logbase2(z_samples);
 		unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
-		unsigned log_ps_iter_samples =
-			util_logbase2(util_next_power_of_two(ps_iter_samples));
+		unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
 
-		radeon_set_context_reg_seq(cs, R_028BDC_PA_SC_LINE_CNTL, 2);
-		radeon_emit(cs, sc_line_cntl |
-			    S_028BDC_EXPAND_LINE_WIDTH(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */
-		radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
-			    S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
-			    S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples)); /* CM_R_028BE0_PA_SC_AA_CONFIG */
+		sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
+		sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
+			       S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
+			       S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
 
 		if (sctx->framebuffer.nr_samples > 1) {
-			radeon_set_context_reg(cs, R_028804_DB_EQAA,
-					       S_028804_MAX_ANCHOR_SAMPLES(log_samples) |
-					       S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
-					       S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
-					       S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) |
-					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
-					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-			radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1,
-					       S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
-					       sc_mode_cntl_1);
+			db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
+				   S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
+				   S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
+				   S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
+			sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
 		} else if (sctx->smoothing_enabled) {
-			radeon_set_context_reg(cs, R_028804_DB_EQAA,
-					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
-					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
-					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1,
-					       sc_mode_cntl_1);
+			db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
 		}
-	} else {
-		radeon_set_context_reg_seq(cs, R_028BDC_PA_SC_LINE_CNTL, 2);
-		radeon_emit(cs, sc_line_cntl); /* CM_R_028BDC_PA_SC_LINE_CNTL */
-		radeon_emit(cs, 0); /* CM_R_028BE0_PA_SC_AA_CONFIG */
-
-		radeon_set_context_reg(cs, R_028804_DB_EQAA,
-				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
-				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1,
-				       sc_mode_cntl_1);
 	}
 
+	/* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
+	radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL,
+				    SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
+				    sc_aa_config);
+	/* R_028804_DB_EQAA */
+	radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
+				   db_eqaa);
+	/* R_028A4C_PA_SC_MODE_CNTL_1 */
+	radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
+				   SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
+
 	/* GFX9: Flush DFSM when the AA mode changes. */
 	if (sctx->screen->dfsm_allowed) {
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
@@ -3408,15 +3487,18 @@
 void si_update_ps_iter_samples(struct si_context *sctx)
 {
 	if (sctx->framebuffer.nr_samples > 1)
-		si_mark_atom_dirty(sctx, &sctx->msaa_config);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 	if (sctx->screen->dpbb_allowed)
-		si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 }
 
 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
+	/* The hardware can only do sample shading with 2^n samples. */
+	min_samples = util_next_power_of_two(min_samples);
+
 	if (sctx->ps_iter_samples == min_samples)
 		return;
 
@@ -3536,7 +3618,7 @@
  */
 void
 si_make_texture_descriptor(struct si_screen *screen,
-			   struct r600_texture *tex,
+			   struct si_texture *tex,
 			   bool sampler,
 			   enum pipe_texture_target target,
 			   enum pipe_format pipe_format,
@@ -3547,15 +3629,19 @@
 			   uint32_t *state,
 			   uint32_t *fmask_state)
 {
-	struct pipe_resource *res = &tex->resource.b.b;
+	struct pipe_resource *res = &tex->buffer.b.b;
 	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
-	unsigned num_format, data_format, type;
+	unsigned num_format, data_format, type, num_samples;
 	uint64_t va;
 
 	desc = util_format_description(pipe_format);
 
+	num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ?
+			MAX2(1, res->nr_samples) :
+			MAX2(1, res->nr_storage_samples);
+
 	if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
 		const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
 		const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
@@ -3678,7 +3764,7 @@
 
 		assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
 	} else {
-		type = si_tex_dim(screen, tex, target, res->nr_samples);
+		type = si_tex_dim(screen, tex, target, num_samples);
 	}
 
 	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
@@ -3701,10 +3787,9 @@
 		    S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
 		    S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
 		    S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
-		    S_008F1C_BASE_LEVEL(res->nr_samples > 1 ?
-					0 : first_level) |
-		    S_008F1C_LAST_LEVEL(res->nr_samples > 1 ?
-					util_logbase2(res->nr_samples) :
+		    S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
+		    S_008F1C_LAST_LEVEL(num_samples > 1 ?
+					util_logbase2(num_samples) :
 					last_level) |
 		    S_008F1C_TYPE(type));
 	state[4] = 0;
@@ -3724,9 +3809,9 @@
 			state[4] |= S_008F20_DEPTH(last_layer);
 
 		state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
-		state[5] |= S_008F24_MAX_MIP(res->nr_samples > 1 ?
-					     util_logbase2(res->nr_samples) :
-					     tex->resource.b.b.last_level);
+		state[5] |= S_008F24_MAX_MIP(num_samples > 1 ?
+					     util_logbase2(num_samples) :
+					     tex->buffer.b.b.last_level);
 	} else {
 		state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
 		state[4] |= S_008F20_DEPTH(depth - 1);
@@ -3734,9 +3819,7 @@
 	}
 
 	if (tex->dcc_offset) {
-		unsigned swap = si_translate_colorswap(pipe_format, false);
-
-		state[6] = S_008F28_ALPHA_IS_ON_MSB(swap <= 1);
+		state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(pipe_format));
 	} else {
 		/* The last dword is unused by hw. The shader uses it to clear
 		 * bits in the first dword of sampler state.
@@ -3750,44 +3833,106 @@
 	}
 
 	/* Initialize the sampler view for FMASK. */
-	if (tex->fmask.size) {
+	if (tex->surface.fmask_size) {
 		uint32_t data_format, num_format;
 
-		va = tex->resource.gpu_address + tex->fmask.offset;
+		va = tex->buffer.gpu_address + tex->fmask_offset;
 
+#define FMASK(s,f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
 		if (screen->info.chip_class >= GFX9) {
 			data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
-			switch (res->nr_samples) {
-			case 2:
+			switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+			case FMASK(2,1):
+				num_format = V_008F14_IMG_FMASK_8_2_1;
+				break;
+			case FMASK(2,2):
 				num_format = V_008F14_IMG_FMASK_8_2_2;
 				break;
-			case 4:
+			case FMASK(4,1):
+				num_format = V_008F14_IMG_FMASK_8_4_1;
+				break;
+			case FMASK(4,2):
+				num_format = V_008F14_IMG_FMASK_8_4_2;
+				break;
+			case FMASK(4,4):
 				num_format = V_008F14_IMG_FMASK_8_4_4;
 				break;
-			case 8:
+			case FMASK(8,1):
+				num_format = V_008F14_IMG_FMASK_8_8_1;
+				break;
+			case FMASK(8,2):
+				num_format = V_008F14_IMG_FMASK_16_8_2;
+				break;
+			case FMASK(8,4):
+				num_format = V_008F14_IMG_FMASK_32_8_4;
+				break;
+			case FMASK(8,8):
 				num_format = V_008F14_IMG_FMASK_32_8_8;
 				break;
+			case FMASK(16,1):
+				num_format = V_008F14_IMG_FMASK_16_16_1;
+				break;
+			case FMASK(16,2):
+				num_format = V_008F14_IMG_FMASK_32_16_2;
+				break;
+			case FMASK(16,4):
+				num_format = V_008F14_IMG_FMASK_64_16_4;
+				break;
+			case FMASK(16,8):
+				num_format = V_008F14_IMG_FMASK_64_16_8;
+				break;
 			default:
 				unreachable("invalid nr_samples");
 			}
 		} else {
-			switch (res->nr_samples) {
-			case 2:
+			switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
+			case FMASK(2,1):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
+				break;
+			case FMASK(2,2):
 				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
 				break;
-			case 4:
+			case FMASK(4,1):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
+				break;
+			case FMASK(4,2):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
+				break;
+			case FMASK(4,4):
 				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
 				break;
-			case 8:
+			case FMASK(8,1):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
+				break;
+			case FMASK(8,2):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
+				break;
+			case FMASK(8,4):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
+				break;
+			case FMASK(8,8):
 				data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
 				break;
+			case FMASK(16,1):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
+				break;
+			case FMASK(16,2):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
+				break;
+			case FMASK(16,4):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
+				break;
+			case FMASK(16,8):
+				data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
+				break;
 			default:
 				unreachable("invalid nr_samples");
 			}
 			num_format = V_008F14_IMG_NUM_FORMAT_UINT;
 		}
+#undef FMASK
 
-		fmask_state[0] = (va >> 8) | tex->fmask.tile_swizzle;
+		fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
 		fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) |
 				 S_008F14_DATA_FORMAT_GFX6(data_format) |
 				 S_008F14_NUM_FORMAT_GFX6(num_format);
@@ -3810,9 +3955,9 @@
 			fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
 					  S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
 		} else {
-			fmask_state[3] |= S_008F1C_TILING_INDEX(tex->fmask.tile_mode_index);
+			fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
 			fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
-					  S_008F20_PITCH_GFX6(tex->fmask.pitch_in_pixels - 1);
+					  S_008F20_PITCH_GFX6(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
 			fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
 		}
 	}
@@ -3837,7 +3982,7 @@
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
-	struct r600_texture *tmp = (struct r600_texture*)texture;
+	struct si_texture *tex = (struct si_texture*)texture;
 	unsigned base_level, first_level, last_level;
 	unsigned char state_swizzle[4];
 	unsigned height, depth, width;
@@ -3866,7 +4011,7 @@
 	/* Buffer resource. */
 	if (texture->target == PIPE_BUFFER) {
 		si_make_buffer_descriptor(sctx->screen,
-					  (struct r600_resource *)texture,
+					  r600_resource(texture),
 					  state->format,
 					  state->u.buf.offset,
 					  state->u.buf.size,
@@ -3908,30 +4053,30 @@
 	pipe_format = state->format;
 
 	/* Depth/stencil texturing sometimes needs separate texture. */
-	if (tmp->is_depth && !si_can_sample_zs(tmp, view->is_stencil_sampler)) {
-		if (!tmp->flushed_depth_texture &&
+	if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
+		if (!tex->flushed_depth_texture &&
 		    !si_init_flushed_depth_texture(ctx, texture, NULL)) {
 			pipe_resource_reference(&view->base.texture, NULL);
 			FREE(view);
 			return NULL;
 		}
 
-		assert(tmp->flushed_depth_texture);
+		assert(tex->flushed_depth_texture);
 
 		/* Override format for the case where the flushed texture
 		 * contains only Z or only S.
 		 */
-		if (tmp->flushed_depth_texture->resource.b.b.format != tmp->resource.b.b.format)
-			pipe_format = tmp->flushed_depth_texture->resource.b.b.format;
+		if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
+			pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
 
-		tmp = tmp->flushed_depth_texture;
+		tex = tex->flushed_depth_texture;
 	}
 
-	surflevel = tmp->surface.u.legacy.level;
+	surflevel = tex->surface.u.legacy.level;
 
-	if (tmp->db_compatible) {
+	if (tex->db_compatible) {
 		if (!view->is_stencil_sampler)
-			pipe_format = tmp->db_render_format;
+			pipe_format = tex->db_render_format;
 
 		switch (pipe_format) {
 		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
@@ -3948,7 +4093,7 @@
 		case PIPE_FORMAT_S8X24_UINT:
 		case PIPE_FORMAT_X32_S8X24_UINT:
 			pipe_format = PIPE_FORMAT_S8_UINT;
-			surflevel = tmp->surface.u.legacy.stencil_level;
+			surflevel = tex->surface.u.legacy.stencil_level;
 			break;
 		default:;
 		}
@@ -3959,7 +4104,7 @@
 						state->u.tex.first_level,
 						state->format);
 
-	si_make_texture_descriptor(sctx->screen, tmp, true,
+	si_make_texture_descriptor(sctx->screen, tex, true,
 				   state->target, pipe_format, state_swizzle,
 				   first_level, last_level,
 				   state->u.tex.first_layer, last_layer,
@@ -4164,17 +4309,17 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	if (sctx->sample_mask.sample_mask == (uint16_t)sample_mask)
+	if (sctx->sample_mask == (uint16_t)sample_mask)
 		return;
 
-	sctx->sample_mask.sample_mask = sample_mask;
-	si_mark_atom_dirty(sctx, &sctx->sample_mask.atom);
+	sctx->sample_mask = sample_mask;
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
 }
 
-static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_sample_mask(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
-	unsigned mask = sctx->sample_mask.sample_mask;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
+	unsigned mask = sctx->sample_mask;
 
 	/* Needed for line and polygon smoothing as well as for the Polaris
 	 * small primitive filter. We expect the state tracker to take care of
@@ -4524,23 +4669,17 @@
 
 void si_init_state_functions(struct si_context *sctx)
 {
-	si_init_external_atom(sctx, &sctx->render_cond_atom, &sctx->atoms.s.render_cond);
-	si_init_external_atom(sctx, &sctx->streamout.begin_atom, &sctx->atoms.s.streamout_begin);
-	si_init_external_atom(sctx, &sctx->streamout.enable_atom, &sctx->atoms.s.streamout_enable);
-	si_init_external_atom(sctx, &sctx->scissors.atom, &sctx->atoms.s.scissors);
-	si_init_external_atom(sctx, &sctx->viewports.atom, &sctx->atoms.s.viewports);
-
-	si_init_atom(sctx, &sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state);
-	si_init_atom(sctx, &sctx->msaa_sample_locs.atom, &sctx->atoms.s.msaa_sample_locs, si_emit_msaa_sample_locs);
-	si_init_atom(sctx, &sctx->db_render_state, &sctx->atoms.s.db_render_state, si_emit_db_render_state);
-	si_init_atom(sctx, &sctx->dpbb_state, &sctx->atoms.s.dpbb_state, si_emit_dpbb_state);
-	si_init_atom(sctx, &sctx->msaa_config, &sctx->atoms.s.msaa_config, si_emit_msaa_config);
-	si_init_atom(sctx, &sctx->sample_mask.atom, &sctx->atoms.s.sample_mask, si_emit_sample_mask);
-	si_init_atom(sctx, &sctx->cb_render_state, &sctx->atoms.s.cb_render_state, si_emit_cb_render_state);
-	si_init_atom(sctx, &sctx->blend_color.atom, &sctx->atoms.s.blend_color, si_emit_blend_color);
-	si_init_atom(sctx, &sctx->clip_regs, &sctx->atoms.s.clip_regs, si_emit_clip_regs);
-	si_init_atom(sctx, &sctx->clip_state.atom, &sctx->atoms.s.clip_state, si_emit_clip_state);
-	si_init_atom(sctx, &sctx->stencil_ref.atom, &sctx->atoms.s.stencil_ref, si_emit_stencil_ref);
+	sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
+	sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
+	sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
+	sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
+	sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
+	sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
+	sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
+	sctx->atoms.s.blend_color.emit = si_emit_blend_color;
+	sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
+	sctx->atoms.s.clip_state.emit = si_emit_clip_state;
+	sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
 
 	sctx->b.create_blend_state = si_create_blend_state;
 	sctx->b.bind_blend_state = si_bind_blend_state;
@@ -4621,213 +4760,35 @@
 				  unsigned raster_config,
 				  unsigned raster_config_1)
 {
-	unsigned sh_per_se = MAX2(sctx->screen->info.max_sh_per_se, 1);
 	unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
-	unsigned rb_mask = sctx->screen->info.enabled_rb_mask;
-	unsigned num_rb = MIN2(sctx->screen->info.num_render_backends, 16);
-	unsigned rb_per_pkr = MIN2(num_rb / num_se / sh_per_se, 2);
-	unsigned rb_per_se = num_rb / num_se;
-	unsigned se_mask[4];
+	unsigned raster_config_se[4];
 	unsigned se;
 
-	se_mask[0] = ((1 << rb_per_se) - 1);
-	se_mask[1] = (se_mask[0] << rb_per_se);
-	se_mask[2] = (se_mask[1] << rb_per_se);
-	se_mask[3] = (se_mask[2] << rb_per_se);
-
-	se_mask[0] &= rb_mask;
-	se_mask[1] &= rb_mask;
-	se_mask[2] &= rb_mask;
-	se_mask[3] &= rb_mask;
-
-	assert(num_se == 1 || num_se == 2 || num_se == 4);
-	assert(sh_per_se == 1 || sh_per_se == 2);
-	assert(rb_per_pkr == 1 || rb_per_pkr == 2);
-
-	/* XXX: I can't figure out what the *_XSEL and *_YSEL
-	 * fields are for, so I'm leaving them as their default
-	 * values. */
+	ac_get_harvested_configs(&sctx->screen->info,
+				 raster_config,
+				 &raster_config_1,
+				 raster_config_se);
 
 	for (se = 0; se < num_se; se++) {
-		unsigned raster_config_se = raster_config;
-		unsigned pkr0_mask = ((1 << rb_per_pkr) - 1) << (se * rb_per_se);
-		unsigned pkr1_mask = pkr0_mask << rb_per_pkr;
-		int idx = (se / 2) * 2;
-
-		if ((num_se > 1) && (!se_mask[idx] || !se_mask[idx + 1])) {
-			raster_config_se &= C_028350_SE_MAP;
-
-			if (!se_mask[idx]) {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_SE_MAP(V_028350_RASTER_CONFIG_SE_MAP_0);
-			}
-		}
-
-		pkr0_mask &= rb_mask;
-		pkr1_mask &= rb_mask;
-		if (rb_per_se > 2 && (!pkr0_mask || !pkr1_mask)) {
-			raster_config_se &= C_028350_PKR_MAP;
-
-			if (!pkr0_mask) {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_3);
-			} else {
-				raster_config_se |=
-					S_028350_PKR_MAP(V_028350_RASTER_CONFIG_PKR_MAP_0);
-			}
-		}
-
-		if (rb_per_se >= 2) {
-			unsigned rb0_mask = 1 << (se * rb_per_se);
-			unsigned rb1_mask = rb0_mask << 1;
-
-			rb0_mask &= rb_mask;
-			rb1_mask &= rb_mask;
-			if (!rb0_mask || !rb1_mask) {
-				raster_config_se &= C_028350_RB_MAP_PKR0;
-
-				if (!rb0_mask) {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_3);
-				} else {
-					raster_config_se |=
-						S_028350_RB_MAP_PKR0(V_028350_RASTER_CONFIG_RB_MAP_0);
-				}
-			}
-
-			if (rb_per_se > 2) {
-				rb0_mask = 1 << (se * rb_per_se + rb_per_pkr);
-				rb1_mask = rb0_mask << 1;
-				rb0_mask &= rb_mask;
-				rb1_mask &= rb_mask;
-				if (!rb0_mask || !rb1_mask) {
-					raster_config_se &= C_028350_RB_MAP_PKR1;
-
-					if (!rb0_mask) {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_3);
-					} else {
-						raster_config_se |=
-							S_028350_RB_MAP_PKR1(V_028350_RASTER_CONFIG_RB_MAP_0);
-					}
-				}
-			}
-		}
-
 		si_set_grbm_gfx_index_se(sctx, pm4, se);
-		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se);
+		si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
 	}
 	si_set_grbm_gfx_index(sctx, pm4, ~0);
 
 	if (sctx->chip_class >= CIK) {
-		if ((num_se > 2) && ((!se_mask[0] && !se_mask[1]) ||
-		                     (!se_mask[2] && !se_mask[3]))) {
-			raster_config_1 &= C_028354_SE_PAIR_MAP;
-
-			if (!se_mask[0] && !se_mask[1]) {
-				raster_config_1 |=
-					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_3);
-			} else {
-				raster_config_1 |=
-					S_028354_SE_PAIR_MAP(V_028354_RASTER_CONFIG_SE_PAIR_MAP_0);
-			}
-		}
-
 		si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
 	}
 }
 
 static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
 {
-	struct si_screen *sscreen = sctx->screen;
 	unsigned num_rb = MIN2(sctx->screen->info.num_render_backends, 16);
 	unsigned rb_mask = sctx->screen->info.enabled_rb_mask;
 	unsigned raster_config, raster_config_1;
 
-	switch (sctx->family) {
-	case CHIP_TAHITI:
-	case CHIP_PITCAIRN:
-		raster_config = 0x2a00126a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_VERDE:
-		raster_config = 0x0000124a;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_OLAND:
-		raster_config = 0x00000082;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAINAN:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_BONAIRE:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_HAWAII:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_FIJI:
-		if (sscreen->info.cik_macrotile_mode_array[0] == 0x000000e8) {
-			/* old kernels with old tiling config */
-			raster_config = 0x16000012;
-			raster_config_1 = 0x0000002a;
-		} else {
-			raster_config = 0x3a00161a;
-			raster_config_1 = 0x0000002e;
-		}
-		break;
-	case CHIP_VEGAM:
-		raster_config = 0x3a00161a;
-		raster_config_1 = 0x0000002e;
-		break;
-	case CHIP_POLARIS10:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_POLARIS11:
-	case CHIP_POLARIS12:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_TONGA:
-		raster_config = 0x16000012;
-		raster_config_1 = 0x0000002a;
-		break;
-	case CHIP_ICELAND:
-		if (num_rb == 1)
-			raster_config = 0x00000000;
-		else
-			raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_CARRIZO:
-		raster_config = 0x00000002;
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KAVERI:
-		/* KV should be 0x00000002, but that causes problems with radeon */
-		raster_config = 0x00000000; /* 0x00000002 */
-		raster_config_1 = 0x00000000;
-		break;
-	case CHIP_KABINI:
-	case CHIP_MULLINS:
-	case CHIP_STONEY:
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-		break;
-	default:
-		fprintf(stderr,
-			"radeonsi: Unknown GPU, using 0 for raster_config\n");
-		raster_config = 0x00000000;
-		raster_config_1 = 0x00000000;
-	}
+	ac_get_raster_config(&sctx->screen->info,
+			     &raster_config,
+			     &raster_config_1);
 
 	if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
 		/* Always use the default config when all backends are enabled
@@ -4893,9 +4854,6 @@
 		si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) |
 			       S_008A14_CLIP_VTX_REORDER_ENA(1));
 
-	si_pm4_set_reg(pm4, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210);
-	si_pm4_set_reg(pm4, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98);
-
 	if (!has_clear_state)
 		si_pm4_set_reg(pm4, R_02882C_PA_SU_PRIM_FILTER_CNTL, 0);
 
@@ -5045,6 +5003,7 @@
 		switch (sctx->family) {
 		case CHIP_VEGA10:
 		case CHIP_VEGA12:
+		case CHIP_VEGA20:
 			pc_lines = 4096;
 			break;
 		case CHIP_RAVEN:
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index e9849a9..1edf5d6 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -26,14 +26,13 @@
 #define SI_STATE_H
 
 #include "si_pm4.h"
-#include "radeon/r600_pipe_common.h"
 
 #include "pipebuffer/pb_slab.h"
+#include "util/u_blitter.h"
 
 #define SI_NUM_GRAPHICS_SHADERS (PIPE_SHADER_TESS_EVAL+1)
 #define SI_NUM_SHADERS (PIPE_SHADER_COMPUTE+1)
 
-#define SI_MAX_ATTRIBS			16
 #define SI_NUM_VERTEX_BUFFERS		SI_MAX_ATTRIBS
 #define SI_NUM_SAMPLERS			32 /* OpenGL textures units per shader */
 #define SI_NUM_CONST_BUFFERS		16
@@ -43,6 +42,15 @@
 struct si_screen;
 struct si_shader;
 struct si_shader_selector;
+struct si_texture;
+struct si_qbo_state;
+
+/* State atoms are callbacks which write a sequence of packets into a GPU
+ * command buffer (AKA indirect buffer, AKA IB, AKA command stream, AKA CS).
+ */
+struct si_atom {
+	void (*emit)(struct si_context *ctx); /* emit callback; receives only the context */
+};
 
 struct si_state_blend {
 	struct si_pm4_state	pm4;
@@ -124,7 +132,6 @@
 };
 
 struct si_stencil_ref {
-	struct r600_atom		atom;
 	struct pipe_stencil_ref		state;
 	struct si_dsa_stencil_ref_part	dsa_part;
 };
@@ -165,41 +172,124 @@
 	struct si_pm4_state	*array[0];
 };
 
+#define SI_STATE_IDX(name) \
+	(offsetof(union si_state, named.name) / sizeof(struct si_pm4_state *)) /* index of "name" among the si_pm4_state pointers of union si_state */
+#define SI_STATE_BIT(name) (1 << SI_STATE_IDX(name)) /* one-bit mask at that index; si_pm4_bind_state ORs it into dirty_states */
 #define SI_NUM_STATES (sizeof(union si_state) / sizeof(struct si_pm4_state *))
 
+static inline unsigned si_states_that_roll_context(void) /* returns the OR of SI_STATE_BIT() for the nine states listed below */
+{
+	return (SI_STATE_BIT(blend) |
+		SI_STATE_BIT(rasterizer) |
+		SI_STATE_BIT(dsa) |
+		SI_STATE_BIT(poly_offset) |
+		SI_STATE_BIT(es) |
+		SI_STATE_BIT(gs) |
+		SI_STATE_BIT(vgt_shader_config) |
+		SI_STATE_BIT(vs) |
+		SI_STATE_BIT(ps));
+}
+
 union si_state_atoms {
 	struct {
 		/* The order matters. */
-		struct r600_atom *render_cond;
-		struct r600_atom *streamout_begin;
-		struct r600_atom *streamout_enable; /* must be after streamout_begin */
-		struct r600_atom *framebuffer;
-		struct r600_atom *msaa_sample_locs;
-		struct r600_atom *db_render_state;
-		struct r600_atom *dpbb_state;
-		struct r600_atom *msaa_config;
-		struct r600_atom *sample_mask;
-		struct r600_atom *cb_render_state;
-		struct r600_atom *blend_color;
-		struct r600_atom *clip_regs;
-		struct r600_atom *clip_state;
-		struct r600_atom *shader_pointers;
-		struct r600_atom *scissors;
-		struct r600_atom *viewports;
-		struct r600_atom *stencil_ref;
-		struct r600_atom *spi_map;
-		struct r600_atom *scratch_state;
+		struct si_atom render_cond;
+		struct si_atom streamout_begin;
+		struct si_atom streamout_enable; /* must be after streamout_begin */
+		struct si_atom framebuffer;
+		struct si_atom msaa_sample_locs;
+		struct si_atom db_render_state;
+		struct si_atom dpbb_state;
+		struct si_atom msaa_config;
+		struct si_atom sample_mask;
+		struct si_atom cb_render_state;
+		struct si_atom blend_color;
+		struct si_atom clip_regs;
+		struct si_atom clip_state;
+		struct si_atom shader_pointers;
+		struct si_atom guardband;
+		struct si_atom scissors;
+		struct si_atom viewports;
+		struct si_atom stencil_ref;
+		struct si_atom spi_map;
+		struct si_atom scratch_state;
 	} s;
-	struct r600_atom *array[0];
+	struct si_atom array[0];
 };
 
-#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct r600_atom*))
+#define SI_ATOM_BIT(name) (1 << (offsetof(union si_state_atoms, s.name) / \
+			         sizeof(struct si_atom))) /* one-bit mask at the position of atom "name" in the union */
+#define SI_NUM_ATOMS (sizeof(union si_state_atoms)/sizeof(struct si_atom)) /* atom count: divide by the element type (atoms are stored by value), not by a pointer to it */
+
+static inline unsigned si_atoms_that_roll_context(void) /* returns the OR of SI_ATOM_BIT() for every atom of si_state_atoms except render_cond and shader_pointers */
+{
+	return (SI_ATOM_BIT(streamout_begin) |
+		SI_ATOM_BIT(streamout_enable) |
+		SI_ATOM_BIT(framebuffer) |
+		SI_ATOM_BIT(msaa_sample_locs) |
+		SI_ATOM_BIT(db_render_state) |
+		SI_ATOM_BIT(dpbb_state) |
+		SI_ATOM_BIT(msaa_config) |
+		SI_ATOM_BIT(sample_mask) |
+		SI_ATOM_BIT(cb_render_state) |
+		SI_ATOM_BIT(blend_color) |
+		SI_ATOM_BIT(clip_regs) |
+		SI_ATOM_BIT(clip_state) |
+		SI_ATOM_BIT(guardband) |
+		SI_ATOM_BIT(scissors) |
+		SI_ATOM_BIT(viewports) |
+		SI_ATOM_BIT(stencil_ref) |
+		SI_ATOM_BIT(spi_map) |
+		SI_ATOM_BIT(scratch_state));
+}
 
 struct si_shader_data {
-	struct r600_atom	atom;
 	uint32_t		sh_base[SI_NUM_SHADERS];
 };
 
+/* The list of registers whose emitted values are remembered by si_context. */
+enum si_tracked_reg {
+	SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
+	SI_TRACKED_DB_COUNT_CONTROL,
+
+	SI_TRACKED_DB_RENDER_OVERRIDE2,
+	SI_TRACKED_DB_SHADER_CONTROL,
+
+	SI_TRACKED_CB_TARGET_MASK,
+	SI_TRACKED_CB_DCC_CONTROL,
+
+	SI_TRACKED_SX_PS_DOWNCONVERT, /* 3 consecutive registers */
+	SI_TRACKED_SX_BLEND_OPT_EPSILON,
+	SI_TRACKED_SX_BLEND_OPT_CONTROL,
+
+	SI_TRACKED_PA_SC_LINE_CNTL, /* 2 consecutive registers */
+	SI_TRACKED_PA_SC_AA_CONFIG,
+
+	SI_TRACKED_DB_EQAA,
+	SI_TRACKED_PA_SC_MODE_CNTL_1,
+
+	SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL,
+
+	SI_TRACKED_PA_CL_VS_OUT_CNTL,
+	SI_TRACKED_PA_CL_CLIP_CNTL,
+
+	SI_TRACKED_PA_SC_BINNER_CNTL_0,
+	SI_TRACKED_DB_DFSM_CONTROL,
+
+	SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ, /* 4 consecutive registers */
+	SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ,
+	SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ,
+	SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ,
+
+	SI_NUM_TRACKED_REGS, /* number of entries above; sizes si_tracked_regs.reg_value[] */
+};
+
+struct si_tracked_regs {
+	uint32_t		reg_saved; /* NOTE(review): presumably a bitmask of valid reg_value[] entries - confirm against users */
+	uint32_t		reg_value[SI_NUM_TRACKED_REGS]; /* one slot per enum si_tracked_reg entry */
+	uint32_t		spi_ps_input_cntl[32];
+};
+
 /* Private read-write buffer slots. */
 enum {
 	SI_ES_RING_ESGS,
@@ -300,9 +390,6 @@
 	unsigned			enabled_mask;
 };
 
-#define si_pm4_block_idx(member) \
-	(offsetof(union si_state, named.member) / sizeof(struct si_pm4_state *))
-
 #define si_pm4_state_changed(sctx, member) \
 	((sctx)->queued.named.member != (sctx)->emitted.named.member)
 
@@ -312,7 +399,7 @@
 #define si_pm4_bind_state(sctx, member, value) \
 	do { \
 		(sctx)->queued.named.member = (value); \
-		(sctx)->dirty_states |= 1 << si_pm4_block_idx(member); \
+		(sctx)->dirty_states |= SI_STATE_BIT(member); \
 	} while(0)
 
 #define si_pm4_delete_state(sctx, member, value) \
@@ -321,12 +408,12 @@
 			(sctx)->queued.named.member = NULL; \
 		} \
 		si_pm4_free_state(sctx, (struct si_pm4_state *)(value), \
-				  si_pm4_block_idx(member)); \
+				  SI_STATE_IDX(member)); \
 	} while(0)
 
 /* si_descriptors.c */
 void si_set_mutable_tex_desc_fields(struct si_screen *sscreen,
-				    struct r600_texture *tex,
+				    struct si_texture *tex,
 				    const struct legacy_surf_level *base_level_info,
 				    unsigned base_level, unsigned first_level,
 				    unsigned block_width, bool is_stencil,
@@ -355,8 +442,7 @@
 void si_update_all_texture_descriptors(struct si_context *sctx);
 void si_shader_change_notify(struct si_context *sctx);
 void si_update_needs_color_decompress_masks(struct si_context *sctx);
-void si_emit_graphics_shader_pointers(struct si_context *sctx,
-                                      struct r600_atom *atom);
+void si_emit_graphics_shader_pointers(struct si_context *sctx);
 void si_emit_compute_shader_pointers(struct si_context *sctx);
 void si_set_rw_buffer(struct si_context *sctx,
 		      uint slot, const struct pipe_constant_buffer *input);
@@ -373,11 +459,6 @@
 void si_rebind_buffer(struct si_context *sctx, struct pipe_resource *buf,
 		      uint64_t old_va);
 /* si_state.c */
-struct si_shader_selector;
-
-void si_init_atom(struct si_context *sctx, struct r600_atom *atom,
-		  struct r600_atom **list_elem,
-		  void (*emit_func)(struct si_context *ctx, struct r600_atom *state));
 void si_init_state_functions(struct si_context *sctx);
 void si_init_screen_state_functions(struct si_screen *sscreen);
 void
@@ -387,7 +468,7 @@
 			  uint32_t *state);
 void
 si_make_texture_descriptor(struct si_screen *screen,
-			   struct r600_texture *tex,
+			   struct si_texture *tex,
 			   bool sampler,
 			   enum pipe_texture_target target,
 			   enum pipe_format pipe_format,
@@ -410,13 +491,23 @@
 				  bool old_perfect_enable);
 
 /* si_state_binning.c */
-void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state);
+void si_emit_dpbb_state(struct si_context *sctx);
 
 /* si_state_shaders.c */
+void *si_get_ir_binary(struct si_shader_selector *sel);
+bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+				 struct si_shader *shader);
+bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+				   struct si_shader *shader,
+				   bool insert_into_disk_cache);
 bool si_update_shaders(struct si_context *sctx);
 void si_init_shader_functions(struct si_context *sctx);
 bool si_init_shader_cache(struct si_screen *sscreen);
 void si_destroy_shader_cache(struct si_screen *sscreen);
+void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
+				 struct util_queue_fence *ready_fence,
+				 struct si_compiler_ctx_state *compiler_ctx_state,
+				 void *job, util_queue_execute_func execute);
 void si_get_active_slot_masks(const struct tgsi_shader_info *info,
 			      uint32_t *const_and_shader_buffers,
 			      uint64_t *samplers_and_images);
@@ -438,7 +529,7 @@
 
 /* si_state_msaa.c */
 void si_init_msaa_functions(struct si_context *sctx);
-void si_emit_sample_locations(struct radeon_winsys_cs *cs, int nr_samples);
+void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples);
 
 /* si_state_streamout.c */
 void si_streamout_buffers_dirty(struct si_context *sctx);
@@ -448,15 +539,6 @@
 void si_init_streamout_functions(struct si_context *sctx);
 
 
-static inline unsigned
-si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil)
-{
-	if (stencil)
-		return rtex->surface.u.legacy.stencil_tiling_index[level];
-	else
-		return rtex->surface.u.legacy.tiling_index[level];
-}
-
 static inline unsigned si_get_constbuf_slot(unsigned slot)
 {
 	/* Constant buffers are in slots [16..31], ascending */
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 87b89e8..4aad94d 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -37,7 +37,7 @@
 	unsigned bin_size_y;
 };
 
-typedef struct si_bin_size_map si_bin_size_subtable[3][9];
+typedef struct si_bin_size_map si_bin_size_subtable[3][10];
 
 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
 static struct uvec2 si_find_bin_size(struct si_screen *sscreen,
@@ -54,7 +54,7 @@
 	const struct si_bin_size_map *subtable =
 		&table[log_num_rb_per_se][log_num_se][0];
 
-	for (i = 0; subtable[i].start != UINT_MAX; i++) {
+	for (i = 0; subtable[i].bin_size_x != 0; i++) {
 		if (sum >= subtable[i].start && sum < subtable[i + 1].start)
 			break;
 	}
@@ -66,7 +66,7 @@
 static struct uvec2 si_get_color_bin_size(struct si_context *sctx,
 					  unsigned cb_target_enabled_4bit)
 {
-	unsigned nr_samples = sctx->framebuffer.nr_samples;
+	unsigned num_fragments = sctx->framebuffer.nr_color_samples;
 	unsigned sum = 0;
 
 	/* Compute the sum of all Bpp. */
@@ -74,15 +74,15 @@
 		if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
 			continue;
 
-		struct r600_texture *rtex =
-			(struct r600_texture*)sctx->framebuffer.state.cbufs[i]->texture;
-		sum += rtex->surface.bpe;
+		struct si_texture *tex =
+			(struct si_texture*)sctx->framebuffer.state.cbufs[i]->texture;
+		sum += tex->surface.bpe;
 	}
 
 	/* Multiply the sum by some function of the number of samples. */
-	if (nr_samples >= 2) {
+	if (num_fragments >= 2) {
 		if (si_get_ps_iter_samples(sctx) >= 2)
-			sum *= nr_samples;
+			sum *= num_fragments;
 		else
 			sum *= 2;
 	}
@@ -97,7 +97,6 @@
 				{        2,   32,  128 },
 				{        3,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -106,7 +105,6 @@
 				{        3,   32,  128 },
 				{        5,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
@@ -114,7 +112,6 @@
 				{        3,   64,  128 },
 				{        5,   16,  128 },
 				{       17,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -124,9 +121,8 @@
 				{        0,  128,  128 },
 				{        2,   64,  128 },
 				{        3,   32,  128 },
-				{        5,   16,  128 },
+				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -135,7 +131,6 @@
 				{        5,   32,  128 },
 				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
@@ -145,7 +140,6 @@
 				{        5,   64,  128 },
 				{        9,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -157,8 +151,7 @@
 				{        3,   64,  128 },
 				{        5,   32,  128 },
 				{        9,   16,  128 },
-				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
+				{       17,    0,    0 },
 			},
 			{
 				/* Two shader engines */
@@ -169,18 +162,16 @@
 				{        9,   32,  128 },
 				{       17,   16,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				/* Four shader engines */
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        3,  128,  256 },
-				{        5,  128,  128 },
-				{        9,   64,  128 },
-				{       17,   16,  128 },
+				{        2,  128,  512 },
+				{        3,   64,  512 },
+				{        5,   32,  512 },
+				{        9,   32,  256 },
+				{       17,   32,  128 },
 				{       33,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 	};
@@ -199,86 +190,80 @@
 		return size;
 	}
 
-	struct r600_texture *rtex =
-		(struct r600_texture*)sctx->framebuffer.state.zsbuf->texture;
+	struct si_texture *tex =
+		(struct si_texture*)sctx->framebuffer.state.zsbuf->texture;
 	unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
-	unsigned stencil_coeff = rtex->surface.has_stencil &&
+	unsigned stencil_coeff = tex->surface.has_stencil &&
 				 dsa->stencil_enabled ? 1 : 0;
 	unsigned sum = 4 * (depth_coeff + stencil_coeff) *
-		       sctx->framebuffer.nr_samples;
+		       tex->buffer.b.b.nr_samples;
 
 	static const si_bin_size_subtable table[] = {
 		{
 			// One RB / SE
 			{
 				// One shader engine
-				{        0,  128,  256 },
-				{        2,  128,  128 },
+				{        0,   64,  512 },
+				{        2,   64,  256 },
 				{        4,   64,  128 },
 				{        7,   32,  128 },
 				{       13,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Two shader engines
-				{        0,  256,  256 },
-				{        2,  128,  256 },
-				{        4,  128,  128 },
+				{        0,  128,  512 },
+				{        2,   64,  512 },
+				{        4,   64,  256 },
 				{        7,   64,  128 },
 				{       13,   32,  128 },
 				{       25,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   16,  128 },
 				{       49,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
 			// Two RB / SE
 			{
 				// One shader engine
-				{        0,  256,  256 },
-				{        2,  128,  256 },
-				{        4,  128,  128 },
+				{        0,  128,  512 },
+				{        2,   64,  512 },
+				{        4,   64,  256 },
 				{        7,   64,  128 },
 				{       13,   32,  128 },
 				{       25,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Two shader engines
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   32,  128 },
 				{       49,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  512,  512 },
 				{        2,  256,  512 },
-				{        4,  256,  256 },
-				{        7,  128,  256 },
-				{       13,  128,  128 },
+				{        4,  128,  512 },
+				{        7,   64,  512 },
+				{       13,   64,  256 },
 				{       25,   64,  128 },
 				{       49,   16,  128 },
 				{       97,    0,    0 },
-				{ UINT_MAX,    0,    0 },
 			},
 		},
 		{
@@ -286,36 +271,36 @@
 			{
 				// One shader engine
 				{        0,  256,  512 },
-				{        2,  256,  256 },
-				{        4,  128,  256 },
-				{        7,  128,  128 },
+				{        2,  128,  512 },
+				{        4,   64,  512 },
+				{        7,   64,  256 },
 				{       13,   64,  128 },
 				{       25,   32,  128 },
 				{       49,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 			{
 				// Two shader engines
 				{        0,  512,  512 },
 				{        2,  256,  512 },
-				{        4,  256,  256 },
-				{        7,  128,  256 },
-				{       13,  128,  128 },
+				{        4,  128,  512 },
+				{        7,   64,  512 },
+				{       13,   64,  256 },
 				{       25,   64,  128 },
 				{       49,   32,  128 },
 				{       97,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 			{
 				// Four shader engines
 				{        0,  512,  512 },
 				{        4,  256,  512 },
-				{        7,  256,  256 },
-				{       13,  128,  256 },
-				{       25,  128,  128 },
-				{       49,   64,  128 },
+				{        7,  128,  512 },
+				{       13,   64,  512 },
+				{       25,   32,  512 },
+				{       49,   32,  256 },
 				{       97,   16,  128 },
-				{ UINT_MAX,    0,    0 },
+				{      193,    0,    0 },
 			},
 		},
 	};
@@ -325,16 +310,17 @@
 
 static void si_emit_dpbb_disable(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
-
-	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-			       S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
-			       S_028C44_DISABLE_START_OF_PRIM(1));
-	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-			       S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF));
+	radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+		SI_TRACKED_PA_SC_BINNER_CNTL_0,
+		S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
+		S_028C44_DISABLE_START_OF_PRIM(1));
+	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+				   SI_TRACKED_DB_DFSM_CONTROL,
+				   S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
+				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
 }
 
-void si_emit_dpbb_state(struct si_context *sctx, struct r600_atom *state)
+void si_emit_dpbb_state(struct si_context *sctx)
 {
 	struct si_screen *sscreen = sctx->screen;
 	struct si_state_blend *blend = sctx->queued.named.blend;
@@ -343,7 +329,7 @@
 
 	assert(sctx->chip_class >= GFX9);
 
-	if (!sscreen->dpbb_allowed || !blend || !dsa) {
+	if (!sscreen->dpbb_allowed || !blend || !dsa || sctx->dpbb_force_off) {
 		si_emit_dpbb_disable(sctx);
 		return;
 	}
@@ -353,18 +339,14 @@
 			   G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) ||
 			   blend->alpha_to_coverage;
 
-	/* This is ported from Vulkan, but it doesn't make much sense to me.
-	 * Maybe it's for RE-Z? But Vulkan doesn't use RE-Z. TODO: Clarify this.
-	 */
-	bool ps_can_reject_z_trivially =
+	bool db_can_reject_z_trivially =
 		!G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
-		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control);
+		G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control) ||
+		G_02880C_DEPTH_BEFORE_SHADER(db_shader_control);
 
-	/* Disable binning if PS can kill trivially with DB writes.
-	 * Ported from Vulkan. (heuristic?)
-	 */
+	/* Disable DPBB when it's believed to be inefficient. */
 	if (ps_can_kill &&
-	    ps_can_reject_z_trivially &&
+	    db_can_reject_z_trivially &&
 	    sctx->framebuffer.state.zsbuf &&
 	    dsa->db_can_write) {
 		si_emit_dpbb_disable(sctx);
@@ -393,8 +375,13 @@
 	/* Enable DFSM if it's preferred. */
 	unsigned punchout_mode = V_028060_FORCE_OFF;
 	bool disable_start_of_prim = true;
+	bool zs_eqaa_dfsm_bug = sctx->chip_class == GFX9 &&
+				sctx->framebuffer.state.zsbuf &&
+				sctx->framebuffer.nr_samples !=
+				MAX2(1, sctx->framebuffer.state.zsbuf->texture->nr_samples);
 
 	if (sscreen->dfsm_allowed &&
+	    !zs_eqaa_dfsm_bug &&
 	    cb_target_enabled_4bit &&
 	    !G_02880C_KILL_ENABLE(db_shader_control) &&
 	    /* These two also imply that DFSM is disabled when PS writes to memory. */
@@ -414,6 +401,7 @@
 	switch (sctx->family) {
 	case CHIP_VEGA10:
 	case CHIP_VEGA12:
+	case CHIP_VEGA20:
 	case CHIP_RAVEN:
 		/* Tuned for Raven. Vega might need different values. */
 		context_states_per_bin = 5;
@@ -431,18 +419,21 @@
 	if (bin_size.y >= 32)
 		bin_size_extend.y = util_logbase2(bin_size.y) - 5;
 
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
-	radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
-			       S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
-			       S_028C44_BIN_SIZE_X(bin_size.x == 16) |
-			       S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
-			       S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
-			       S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
-			       S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
-			       S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
-			       S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
-			       S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
-			       S_028C44_OPTIMAL_BIN_SELECTION(1));
-	radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
-			       S_028060_PUNCHOUT_MODE(punchout_mode));
+	radeon_opt_set_context_reg(
+		sctx, R_028C44_PA_SC_BINNER_CNTL_0,
+		SI_TRACKED_PA_SC_BINNER_CNTL_0,
+		S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
+		S_028C44_BIN_SIZE_X(bin_size.x == 16) |
+		S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
+		S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
+		S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
+		S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
+		S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
+		S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
+		S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
+		S_028C44_OPTIMAL_BIN_SELECTION(1));
+	radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
+				   SI_TRACKED_DB_DFSM_CONTROL,
+				   S_028060_PUNCHOUT_MODE(punchout_mode) |
+				   S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 81019e1..4157e5e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -59,31 +59,6 @@
 	return prim_conv[mode];
 }
 
-static unsigned si_conv_prim_to_gs_out(unsigned mode)
-{
-	static const int prim_conv[] = {
-		[PIPE_PRIM_POINTS]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
-		[PIPE_PRIM_LINES]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_LOOP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_STRIP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_TRIANGLES]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_STRIP]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_FAN]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_QUADS]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_QUAD_STRIP]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_POLYGON]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_LINES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
-		[SI_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
-	};
-	assert(mode < ARRAY_SIZE(prim_conv));
-
-	return prim_conv[mode];
-}
-
 /**
  * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
  * LS.LDS_SIZE is shared by all 3 shader stages.
@@ -91,11 +66,11 @@
  * The information about LDS and other non-compile-time parameters is then
  * written to userdata SGPRs.
  */
-static void si_emit_derived_tess_state(struct si_context *sctx,
+static bool si_emit_derived_tess_state(struct si_context *sctx,
 				       const struct pipe_draw_info *info,
 				       unsigned *num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_shader *ls_current;
 	struct si_shader_selector *ls;
 	/* The TES pointer will only be used for sctx->last_tcs.
@@ -135,7 +110,7 @@
 	    (!has_primid_instancing_bug ||
 	     (sctx->last_tess_uses_primid == tess_uses_primid))) {
 		*num_patches = sctx->last_num_patches;
-		return;
+		return false;
 	}
 
 	sctx->last_ls = ls_current;
@@ -159,7 +134,7 @@
 		num_tcs_patch_outputs = 2; /* TESSINNER + TESSOUTER */
 	}
 
-	input_vertex_size = num_tcs_inputs * 16;
+	input_vertex_size = ls->lshs_vertex_stride;
 	output_vertex_size = num_tcs_outputs * 16;
 
 	input_patch_size = num_tcs_input_cp * input_vertex_size;
@@ -171,7 +146,8 @@
 	 * resource usage. Also ensures that the number of tcs in and out
 	 * vertices per threadgroup are at most 256.
 	 */
-	*num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+	unsigned max_verts_per_patch = MAX2(num_tcs_input_cp, num_tcs_output_cp);
+	*num_patches = 256 / max_verts_per_patch;
 
 	/* Make sure that the data fits in LDS. This assumes the shaders only
 	 * use LDS for the inputs and outputs.
@@ -189,16 +165,31 @@
 			    (sctx->screen->tess_offchip_block_dw_size * 4) /
 			    output_patch_size);
 
-	/* Not necessary for correctness, but improves performance. The
-	 * specific value is taken from the proprietary driver.
+	/* Not necessary for correctness, but improves performance.
+	 * The hardware can do more, but the radeonsi shader constant is
+	 * limited to 6 bits.
 	 */
-	*num_patches = MIN2(*num_patches, 40);
+	*num_patches = MIN2(*num_patches, 63); /* triangles: 3 full waves except 3 lanes */
+
+	/* When distributed tessellation is unsupported, switch between SEs
+	 * at a higher frequency to compensate for it.
+	 */
+	if (!sctx->screen->has_distributed_tess && sctx->screen->info.max_se > 1)
+		*num_patches = MIN2(*num_patches, 16); /* recommended */
+
+	/* Make sure that vector lanes are reasonably occupied. It probably
+	 * doesn't matter much because this is LS-HS, and TES is likely to
+	 * occupy significantly more CUs.
+	 */
+	unsigned temp_verts_per_tg = *num_patches * max_verts_per_patch;
+	if (temp_verts_per_tg > 64 && temp_verts_per_tg % 64 < 48)
+		*num_patches = (temp_verts_per_tg & ~63) / max_verts_per_patch;
 
 	if (sctx->chip_class == SI) {
 		/* SI bug workaround, related to power management. Limit LS-HS
 		 * threadgroups to only one wave.
 		 */
-		unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+		unsigned one_wave = 64 / max_verts_per_patch;
 		*num_patches = MIN2(*num_patches, one_wave);
 	}
 
@@ -305,12 +296,18 @@
 		       S_028B58_HS_NUM_INPUT_CP(num_tcs_input_cp) |
 		       S_028B58_HS_NUM_OUTPUT_CP(num_tcs_output_cp);
 
-	if (sctx->chip_class >= CIK)
-		radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
-					   ls_hs_config);
-	else
-		radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
-				       ls_hs_config);
+	if (sctx->last_ls_hs_config != ls_hs_config) {
+		if (sctx->chip_class >= CIK) {
+			radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
+						   ls_hs_config);
+		} else {
+			radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
+					       ls_hs_config);
+		}
+		sctx->last_ls_hs_config = ls_hs_config;
+		return true; /* true if the context rolls */
+	}
+	return false;
 }
 
 static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
@@ -386,7 +383,7 @@
 		 * Polaris supports primitive restart with WD_SWITCH_ON_EOP=0
 		 * for points, line strips, and tri strips.
 		 */
-		if (sscreen->info.max_se < 4 ||
+		if (sscreen->info.max_se <= 2 ||
 		    key->u.prim == PIPE_PRIM_POLYGON ||
 		    key->u.prim == PIPE_PRIM_LINE_LOOP ||
 		    key->u.prim == PIPE_PRIM_TRIANGLE_FAN ||
@@ -417,7 +414,7 @@
 			wd_switch_on_eop = true;
 
 		/* Required on CIK and later. */
-		if (sscreen->info.max_se > 2 && !wd_switch_on_eop)
+		if (sscreen->info.max_se == 4 && !wd_switch_on_eop)
 			ia_switch_on_eoi = true;
 
 		/* Required by Hawaii and, for some special cases, by VI. */
@@ -432,6 +429,12 @@
 		    key->u.uses_instancing)
 			partial_vs_wave = true;
 
+		/* This only applies to Polaris10 and later 4 SE chips.
+		 * wd_switch_on_eop is already true on all other chips.
+		 */
+		if (!wd_switch_on_eop && key->u.primitive_restart)
+			partial_vs_wave = true;
+
 		/* If the WD switch is false, the IA switch must be false too. */
 		assert(wd_switch_on_eop || !ia_switch_on_eop);
 	}
@@ -533,23 +536,19 @@
 }
 
 /* rast_prim is the primitive type after GS. */
-static void si_emit_rasterizer_prim_state(struct si_context *sctx)
+static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	enum pipe_prim_type rast_prim = sctx->current_rast_prim;
-	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
+	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
 	/* Skip this if not rendering lines. */
-	if (rast_prim != PIPE_PRIM_LINES &&
-	    rast_prim != PIPE_PRIM_LINE_LOOP &&
-	    rast_prim != PIPE_PRIM_LINE_STRIP &&
-	    rast_prim != PIPE_PRIM_LINES_ADJACENCY &&
-	    rast_prim != PIPE_PRIM_LINE_STRIP_ADJACENCY)
-		return;
+	if (!util_prim_is_lines(rast_prim))
+		return false;
 
 	if (rast_prim == sctx->last_rast_prim &&
 	    rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
-		return;
+		return false;
 
 	/* For lines, reset the stipple pattern at each primitive. Otherwise,
 	 * reset the stipple pattern at each packet (line strips, line loops).
@@ -560,6 +559,7 @@
 
 	sctx->last_rast_prim = rast_prim;
 	sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
+	return true; /* true if the context rolls */
 }
 
 static void si_emit_vs_state(struct si_context *sctx,
@@ -575,24 +575,42 @@
 	}
 
 	if (sctx->current_vs_state != sctx->last_vs_state) {
-		struct radeon_winsys_cs *cs = sctx->gfx_cs;
+		struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
+		/* For the API vertex shader (VS_STATE_INDEXED). */
 		radeon_set_sh_reg(cs,
 			sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX] +
 			SI_SGPR_VS_STATE_BITS * 4,
 			sctx->current_vs_state);
 
+		/* For vertex color clamping, which is done in the last stage
+		 * before the rasterizer. */
+		if (sctx->gs_shader.cso || sctx->tes_shader.cso) {
+			/* GS copy shader or TES if GS is missing. */
+			radeon_set_sh_reg(cs,
+				R_00B130_SPI_SHADER_USER_DATA_VS_0 +
+				SI_SGPR_VS_STATE_BITS * 4,
+				sctx->current_vs_state);
+		}
+
 		sctx->last_vs_state = sctx->current_vs_state;
 	}
 }
 
+static inline bool si_prim_restart_index_changed(struct si_context *sctx,
+						 const struct pipe_draw_info *info)
+{ /* Returns true when this draw uses primitive restart and its index differs from, or was never recorded in, sctx. */
+	return info->primitive_restart && /* always false while restart is disabled for the draw */
+	       (info->restart_index != sctx->last_restart_index || /* index differs from the recorded one */
+		sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN); /* also covers an index equal to the sentinel */
+}
+
 static void si_emit_draw_registers(struct si_context *sctx,
 				   const struct pipe_draw_info *info,
 				   unsigned num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
-	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param;
 
 	ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches);
@@ -617,11 +635,6 @@
 		sctx->last_prim = prim;
 	}
 
-	if (gs_out_prim != sctx->last_gs_out_prim) {
-		radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
-		sctx->last_gs_out_prim = gs_out_prim;
-	}
-
 	/* Primitive restart. */
 	if (info->primitive_restart != sctx->last_primitive_restart_en) {
 		if (sctx->chip_class >= GFX9)
@@ -634,9 +647,7 @@
 		sctx->last_primitive_restart_en = info->primitive_restart;
 
 	}
-	if (info->primitive_restart &&
-	    (info->restart_index != sctx->last_restart_index ||
-	     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
+	if (si_prim_restart_index_changed(sctx, info)) {
 		radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
 				       info->restart_index);
 		sctx->last_restart_index = info->restart_index;
@@ -650,7 +661,7 @@
 				 unsigned index_offset)
 {
 	struct pipe_draw_indirect_info *indirect = info->indirect;
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
 	bool render_cond_bit = sctx->render_cond && !sctx->render_cond_force_off;
 	uint32_t index_max_size = 0;
@@ -720,7 +731,7 @@
 		index_va = r600_resource(indexbuf)->gpu_address + index_offset;
 
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      (struct r600_resource *)indexbuf,
+				      r600_resource(indexbuf),
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
 	} else {
 		/* On CI and later, non-indexed draws overwrite VGT_INDEX_TYPE,
@@ -743,7 +754,7 @@
 		radeon_emit(cs, indirect_va >> 32);
 
 		radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
-				      (struct r600_resource *)indirect->buffer,
+				      r600_resource(indirect->buffer),
 				      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 
 		unsigned di_src_sel = index_size ? V_0287F0_DI_SRC_SEL_DMA
@@ -773,7 +784,7 @@
 
 			if (indirect->indirect_draw_count) {
 				struct r600_resource *params_buf =
-					(struct r600_resource *)indirect->indirect_draw_count;
+					r600_resource(indirect->indirect_draw_count);
 
 				radeon_add_to_buffer_list(
 					sctx, sctx->gfx_cs, params_buf,
@@ -852,7 +863,7 @@
 static void si_emit_surface_sync(struct si_context *sctx,
 				 unsigned cp_coher_cntl)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	if (sctx->chip_class >= GFX9) {
 		/* Flush caches and wait for the caches to assert idle. */
@@ -875,7 +886,7 @@
 
 void si_emit_cache_flush(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint32_t flags = sctx->flags;
 	uint32_t cp_coher_cntl = 0;
 	uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
@@ -1179,13 +1190,33 @@
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
 			       unsigned skip_atom_mask)
 {
+	unsigned num_patches = 0;
+	bool context_roll = false; /* set correctly for GFX9 only */
+
+	context_roll |= si_emit_rasterizer_prim_state(sctx);
+	if (sctx->tes_shader.cso)
+		context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
+	if (info->count_from_stream_output)
+		context_roll = true;
+
+	/* Vega10/Raven scissor bug workaround. When any context register is
+	 * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+	 * registers must be written too.
+	 */
+	if ((sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
+	    (context_roll ||
+	     sctx->dirty_atoms & si_atoms_that_roll_context() ||
+	     sctx->dirty_states & si_states_that_roll_context() ||
+	     si_prim_restart_index_changed(sctx, info))) {
+		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);
+	}
+
 	/* Emit state atoms. */
 	unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
-	while (mask) {
-		struct r600_atom *atom = sctx->atoms.array[u_bit_scan(&mask)];
+	while (mask)
+		sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
 
-		atom->emit(sctx, atom);
-	}
 	sctx->dirty_atoms &= skip_atom_mask;
 
 	/* Emit states. */
@@ -1203,11 +1234,6 @@
 	sctx->dirty_states = 0;
 
 	/* Emit draw states. */
-	unsigned num_patches = 0;
-
-	si_emit_rasterizer_prim_state(sctx);
-	if (sctx->tes_shader.cso)
-		si_emit_derived_tess_state(sctx, info, &num_patches);
 	si_emit_vs_state(sctx, info);
 	si_emit_draw_registers(sctx, info, num_patches);
 }
@@ -1236,15 +1262,10 @@
 			return;
 	}
 
-	if (unlikely(!sctx->vs_shader.cso)) {
-		assert(0);
-		return;
-	}
-	if (unlikely(!sctx->ps_shader.cso && (!rs || !rs->rasterizer_discard))) {
-		assert(0);
-		return;
-	}
-	if (unlikely(!!sctx->tes_shader.cso != (info->mode == PIPE_PRIM_PATCHES))) {
+	if (unlikely(!sctx->vs_shader.cso ||
+		     !rs ||
+		     (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
+		     (!!sctx->tes_shader.cso != (info->mode == PIPE_PRIM_PATCHES)))) {
 		assert(0);
 		return;
 	}
@@ -1256,7 +1277,7 @@
 		sctx->framebuffer.dirty_cbufs |=
 			((1 << sctx->framebuffer.state.nr_cbufs) - 1);
 		sctx->framebuffer.dirty_zsbuf = true;
-		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
 		si_update_all_texture_descriptors(sctx);
 	}
 
@@ -1278,12 +1299,9 @@
 		rast_prim = info->mode;
 
 	if (rast_prim != sctx->current_rast_prim) {
-		bool old_is_poly = sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES;
-		bool new_is_poly = rast_prim >= PIPE_PRIM_TRIANGLES;
-		if (old_is_poly != new_is_poly) {
-			sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-			si_mark_atom_dirty(sctx, &sctx->scissors.atom);
-		}
+		if (util_prim_is_points_or_lines(sctx->current_rast_prim) !=
+		    util_prim_is_points_or_lines(rast_prim))
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);
 
 		sctx->current_rast_prim = rast_prim;
 		sctx->do_update_shaders = true;
@@ -1413,16 +1431,6 @@
 	if (!si_upload_vertex_buffer_descriptors(sctx))
 		return;
 
-	/* Vega10/Raven scissor bug workaround. This must be done before VPORT
-	 * scissor registers are changed. There is also a more efficient but
-	 * more involved alternative workaround.
-	 */
-	if ((sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
-	    si_is_atom_dirty(sctx, &sctx->scissors.atom)) {
-		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
-		si_emit_cache_flush(sctx);
-	}
-
 	/* Use optimal packet order based on whether we need to sync the pipeline. */
 	if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
 				      SI_CONTEXT_FLUSH_AND_INV_DB |
@@ -1436,7 +1444,7 @@
 		unsigned masked_atoms = 0;
 
 		if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
-			masked_atoms |= 1u << sctx->render_cond_atom.id;
+			masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
 
 		if (!si_upload_graphics_shader_descriptors(sctx))
 			return;
@@ -1446,8 +1454,8 @@
 		si_emit_cache_flush(sctx);
 		/* <-- CUs are idle here. */
 
-		if (si_is_atom_dirty(sctx, &sctx->render_cond_atom))
-			sctx->render_cond_atom.emit(sctx, NULL);
+		if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
+			sctx->atoms.s.render_cond.emit(sctx);
 		sctx->dirty_atoms = 0;
 
 		si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
@@ -1557,7 +1565,7 @@
 
 void si_trace_emit(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	uint64_t va = sctx->current_saved_cs->trace_buf->gpu_address;
 	uint32_t trace_id = ++sctx->current_saved_cs->trace_id;
 
diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c
index 19bed09..10232a5 100644
--- a/src/gallium/drivers/radeonsi/si_state_msaa.c
+++ b/src/gallium/drivers/radeonsi/si_state_msaa.c
@@ -26,162 +26,159 @@
 
 /* For MSAA sample positions. */
 #define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)  \
-	(((s0x) & 0xf) | (((unsigned)(s0y) & 0xf) << 4) |		   \
-	(((unsigned)(s1x) & 0xf) << 8) | (((unsigned)(s1y) & 0xf) << 12) |	   \
-	(((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) |	   \
+	((((unsigned)(s0x) & 0xf) << 0)  | (((unsigned)(s0y) & 0xf) << 4)  | \
+	 (((unsigned)(s1x) & 0xf) << 8)  | (((unsigned)(s1y) & 0xf) << 12) | \
+	 (((unsigned)(s2x) & 0xf) << 16) | (((unsigned)(s2y) & 0xf) << 20) | \
 	 (((unsigned)(s3x) & 0xf) << 24) | (((unsigned)(s3y) & 0xf) << 28))
 
-/* 2xMSAA
- * There are two locations (4, 4), (-4, -4). */
-static const uint32_t sample_locs_2x[4] = {
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-	FILL_SREG(4, 4, -4, -4, 4, 4, -4, -4),
-};
-/* 4xMSAA
- * There are 4 locations: (-2, -6), (6, -2), (-6, 2), (2, 6). */
-static const uint32_t sample_locs_4x[4] = {
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-	FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6),
-};
+/* For obtaining signed 4-bit location coordinates from registers packed by FILL_SREG */
+#define SEXT4(x)		((int)((x) | ((x) & 0x8 ? 0xfffffff0 : 0))) /* sign-extend from bit 3; evaluates x twice */
+#define GET_SFIELD(reg, index)	SEXT4(((reg) >> ((index) * 4)) & 0xf) /* signed nibble number 'index' of one dword */
+#define GET_SX(reg, index)	GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2) /* 4 samples per dword; X is the even nibble */
+#define GET_SY(reg, index)	GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1) /* Y is the nibble right after X */
 
-/* Cayman 8xMSAA */
-static const uint32_t sample_locs_8x[] = {
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG( 1, -3, -1,  3, 5,  1, -3, -5),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
-	FILL_SREG(-5,  5, -7, -1, 3,  7,  7, -7),
+/* The following sample ordering is required by EQAA.
+ *
+ * Sample 0 is approx. in the top-left quadrant.
+ * Sample 1 is approx. in the bottom-right quadrant.
+ *
+ * Sample 2 is approx. in the bottom-left quadrant.
+ * Sample 3 is approx. in the top-right quadrant.
+ * (sample I={2,3} adds more detail to the vicinity of sample I-2)
+ *
+ * Sample 4 is approx. in the same quadrant as sample 0. (top-left)
+ * Sample 5 is approx. in the same quadrant as sample 1. (bottom-right)
+ * Sample 6 is approx. in the same quadrant as sample 2. (bottom-left)
+ * Sample 7 is approx. in the same quadrant as sample 3. (top-right)
+ * (sample I={4,5,6,7} adds more detail to the vicinity of sample I-4)
+ *
+ * The next 8 samples add more detail to the vicinity of the previous samples.
+ * (sample I (I >= 8) adds more detail to the vicinity of sample I-8)
+ *
+ * The ordering is specified such that:
+ *   If we take the first 2 samples, we should get good 2x MSAA.
+ *   If we add 2 more samples, we should get good 4x MSAA with the same sample locations.
+ *   If we add 4 more samples, we should get good 8x MSAA with the same sample locations.
+ *   If we add 8 more samples, we should get perfect 16x MSAA with the same sample locations.
+ *
+ * The ordering also allows finding samples in the same vicinity.
+ *
+ * Group N of 2 samples in the same vicinity in 16x MSAA: {N,N+8}
+ * Group N of 2 samples in the same vicinity in 8x MSAA: {N,N+4}
+ * Group N of 2 samples in the same vicinity in 4x MSAA: {N,N+2}
+ *
+ * Groups of 4 samples in the same vicinity in 16x MSAA:
+ *   Top left:     {0,4,8,12}
+ *   Bottom right: {1,5,9,13}
+ *   Bottom left:  {2,6,10,14}
+ *   Top right:    {3,7,11,15}
+ *
+ * Groups of 4 samples in the same vicinity in 8x MSAA:
+ *   Left half:  {0,2,4,6}
+ *   Right half: {1,3,5,7}
+ *
+ * Groups of 8 samples in the same vicinity in 16x MSAA:
+ *   Left half:  {0,2,4,6,8,10,12,14}
+ *   Right half: {1,3,5,7,9,11,13,15}
+ */
+
+/* 1x MSAA */
+static const uint32_t sample_locs_1x =
+	FILL_SREG( 0, 0,   0, 0,   0, 0,   0, 0); /* S1, S2, S3 fields are not used by 1x */
+static const uint64_t centroid_priority_1x = 0x0000000000000000ull;
+
+/* 2x MSAA */
+static const uint32_t sample_locs_2x =
+	FILL_SREG(-4,-4,   4, 4,   0, 0,   0, 0); /* S2 & S3 fields are not used by 2x MSAA */
+static const uint64_t centroid_priority_2x = 0x1010101010101010ull;
+
+/* 4x, 8x, and 16x MSAA
+ * - The first 4 locations happen to be optimal for 4x MSAA, better than
+ *   the standard DX 4x locations.
+ * - The first 8 locations happen to be almost as good as 8x DX locations,
+ *   but the DX locations are horrible for worst-case EQAA 8s4f and 8s2f.
+ */
+static const uint32_t sample_locs_4x_8x_16x[] = {
+	FILL_SREG(-5,-2,   5, 3,  -2, 6,   3,-5),
+	FILL_SREG(-6,-7,   1, 1,  -6, 4,   7,-3),
+	FILL_SREG(-1,-3,   6, 7,  -3, 2,   0,-7),
+	FILL_SREG(-4,-6,   2, 5,  -8, 0,   4,-1),
 };
-/* Cayman 16xMSAA */
-static const uint32_t sample_locs_16x[] = {
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG( 1,  1, -1, -3, -3,  2,  4, -1),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-5, -2,  2,  5,  5,  3,  3, -5),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-2,  6,  0, -7, -4, -6, -6,  4),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-	FILL_SREG(-8,  0,  7, -4,  6,  7, -7, -8),
-};
+static const uint64_t centroid_priority_4x = 0x2310231023102310ull;
+static const uint64_t centroid_priority_8x = 0x4762310547623105ull;
+static const uint64_t centroid_priority_16x = 0x49e7c6b231d0fa85ull;
 
 static void si_get_sample_position(struct pipe_context *ctx, unsigned sample_count,
 				   unsigned sample_index, float *out_value)
 {
-	int offset, index;
-	struct {
-		int idx:4;
-	} val;
+	const uint32_t *sample_locs;
 
 	switch (sample_count) {
 	case 1:
 	default:
-		out_value[0] = out_value[1] = 0.5;
+		sample_locs = &sample_locs_1x;
 		break;
 	case 2:
-		offset = 4 * (sample_index * 2);
-		val.idx = (sample_locs_2x[0] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_2x[0] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = &sample_locs_2x;
 		break;
 	case 4:
-		offset = 4 * (sample_index * 2);
-		val.idx = (sample_locs_4x[0] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_4x[0] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
-		break;
 	case 8:
-		offset = 4 * (sample_index % 4 * 2);
-		index = (sample_index / 4) * 4;
-		val.idx = (sample_locs_8x[index] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_8x[index] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
-		break;
 	case 16:
-		offset = 4 * (sample_index % 4 * 2);
-		index = (sample_index / 4) * 4;
-		val.idx = (sample_locs_16x[index] >> offset) & 0xf;
-		out_value[0] = (float)(val.idx + 8) / 16.0f;
-		val.idx = (sample_locs_16x[index] >> (offset + 4)) & 0xf;
-		out_value[1] = (float)(val.idx + 8) / 16.0f;
+		sample_locs = sample_locs_4x_8x_16x;
 		break;
 	}
+
+	out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
+	out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
 }
 
-void si_emit_sample_locations(struct radeon_winsys_cs *cs, int nr_samples)
+static void si_emit_max_4_sample_locs(struct radeon_cmdbuf *cs,
+				      uint64_t centroid_priority,
+				      uint32_t sample_locs)
+{ /* Emits the 64-bit centroid priority, then the same packed sample_locs dword to four *_0 location registers. */
+	radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); /* 2 dwords follow */
+	radeon_emit(cs, centroid_priority); /* presumably truncated to the low 32 bits — confirm radeon_emit's parameter type */
+	radeon_emit(cs, centroid_priority >> 32); /* high 32 bits */
+	radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs);
+	radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs);
+}
+
+static void si_emit_max_16_sample_locs(struct radeon_cmdbuf *cs,
+				       uint64_t centroid_priority,
+				       const uint32_t *sample_locs,
+				       unsigned num_samples)
+{ /* Emits the 64-bit centroid priority, then the sample_locs table four times in one register sequence. */
+	radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); /* 2 dwords follow */
+	radeon_emit(cs, centroid_priority); /* presumably truncated to the low 32 bits — confirm radeon_emit's parameter type */
+	radeon_emit(cs, centroid_priority >> 32); /* high 32 bits */
+	radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0,
+				   num_samples == 8 ? 14 : 16); /* must equal the dwords emitted below: 4+4+4+2 or 4+4+4+4 */
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, 4);
+	radeon_emit_array(cs, sample_locs, num_samples == 8 ? 2 : 4); /* 8 samples: the last copy stops after 2 dwords */
+}
+
+void si_emit_sample_locations(struct radeon_cmdbuf *cs, int nr_samples)
 {
 	switch (nr_samples) {
 	default:
 	case 1:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0);
+		si_emit_max_4_sample_locs(cs, centroid_priority_1x, sample_locs_1x);
 		break;
 	case 2:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x[0]);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x[1]);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x[2]);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x[3]);
+		si_emit_max_4_sample_locs(cs, centroid_priority_2x, sample_locs_2x);
 		break;
 	case 4:
-		radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x[0]);
-		radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x[1]);
-		radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x[2]);
-		radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x[3]);
+		si_emit_max_4_sample_locs(cs, centroid_priority_4x, sample_locs_4x_8x_16x[0]);
 		break;
 	case 8:
-		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
-		radeon_emit(cs, sample_locs_8x[0]);
-		radeon_emit(cs, sample_locs_8x[4]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[1]);
-		radeon_emit(cs, sample_locs_8x[5]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[2]);
-		radeon_emit(cs, sample_locs_8x[6]);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, 0);
-		radeon_emit(cs, sample_locs_8x[3]);
-		radeon_emit(cs, sample_locs_8x[7]);
+		si_emit_max_16_sample_locs(cs, centroid_priority_8x, sample_locs_4x_8x_16x, 8);
 		break;
 	case 16:
-		radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 16);
-		radeon_emit(cs, sample_locs_16x[0]);
-		radeon_emit(cs, sample_locs_16x[4]);
-		radeon_emit(cs, sample_locs_16x[8]);
-		radeon_emit(cs, sample_locs_16x[12]);
-		radeon_emit(cs, sample_locs_16x[1]);
-		radeon_emit(cs, sample_locs_16x[5]);
-		radeon_emit(cs, sample_locs_16x[9]);
-		radeon_emit(cs, sample_locs_16x[13]);
-		radeon_emit(cs, sample_locs_16x[2]);
-		radeon_emit(cs, sample_locs_16x[6]);
-		radeon_emit(cs, sample_locs_16x[10]);
-		radeon_emit(cs, sample_locs_16x[14]);
-		radeon_emit(cs, sample_locs_16x[3]);
-		radeon_emit(cs, sample_locs_16x[7]);
-		radeon_emit(cs, sample_locs_16x[11]);
-		radeon_emit(cs, sample_locs_16x[15]);
+		si_emit_max_16_sample_locs(cs, centroid_priority_16x, sample_locs_4x_8x_16x, 16);
 		break;
 	}
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index b8dddd3..de33d25 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -45,7 +45,7 @@
  * Return the IR binary in a buffer. For TGSI the first 4 bytes contain its
  * size as integer.
  */
-static void *si_get_ir_binary(struct si_shader_selector *sel)
+void *si_get_ir_binary(struct si_shader_selector *sel)
 {
 	struct blob blob;
 	unsigned ir_size;
@@ -202,10 +202,9 @@
  *
  * Returns false on failure, in which case the ir_binary should be freed.
  */
-static bool si_shader_cache_insert_shader(struct si_screen *sscreen,
-					  void *ir_binary,
-					  struct si_shader *shader,
-					  bool insert_into_disk_cache)
+bool si_shader_cache_insert_shader(struct si_screen *sscreen, void *ir_binary,
+				   struct si_shader *shader,
+				   bool insert_into_disk_cache)
 {
 	void *hw_binary;
 	struct hash_entry *entry;
@@ -235,9 +234,8 @@
 	return true;
 }
 
-static bool si_shader_cache_load_shader(struct si_screen *sscreen,
-					void *ir_binary,
-				        struct si_shader *shader)
+bool si_shader_cache_load_shader(struct si_screen *sscreen, void *ir_binary,
+				 struct si_shader *shader)
 {
 	struct hash_entry *entry =
 		_mesa_hash_table_search(sscreen->shader_cache, ir_binary);
@@ -605,6 +603,30 @@
 	polaris_set_vgt_vertex_reuse(sscreen, shader->selector, shader, pm4);
 }
 
+static unsigned si_conv_prim_to_gs_out(unsigned mode)
+{
+	static const int prim_conv[] = {
+		[PIPE_PRIM_POINTS]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+		[PIPE_PRIM_LINES]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_LOOP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_FAN]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUADS]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUAD_STRIP]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_POLYGON]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_LINES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+	};
+	assert(mode < ARRAY_SIZE(prim_conv));
+
+	return prim_conv[mode];
+}
+
 struct gfx9_gs_info {
 	unsigned es_verts_per_subgroup;
 	unsigned gs_prims_per_subgroup;
@@ -735,6 +757,8 @@
 	if (max_stream >= 2)
 		offset += num_components[2] * sel->gs_max_out_vertices;
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
+	si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
+		       si_conv_prim_to_gs_out(sel->gs_output_prim));
 	if (max_stream >= 3)
 		offset += num_components[3] * sel->gs_max_out_vertices;
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
@@ -1223,12 +1247,13 @@
 	}
 
 	/* Find out which VS outputs aren't used by the PS. */
-	uint64_t outputs_written = vs->outputs_written;
+	uint64_t outputs_written = vs->outputs_written_before_ps;
 	uint64_t inputs_read = 0;
 
-	/* ignore POSITION, PSIZE */
-	outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0)) |
-			     (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0)));
+	/* Ignore outputs that are not passed from VS to PS. */
+	outputs_written &= ~((1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_POSITION, 0, true)) |
+			     (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_PSIZE, 0, true)) |
+			     (1ull << si_shader_io_get_unique_index(TGSI_SEMANTIC_CLIPVERTEX, 0, true)));
 
 	if (!ps_disabled) {
 		inputs_read = ps->inputs_read;
@@ -1391,67 +1416,63 @@
 			key->part.ps.epilog.color_is_int10 &= sel->info.colors_written;
 		}
 
-		if (rs) {
-			bool is_poly = (sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES &&
-					sctx->current_rast_prim <= PIPE_PRIM_POLYGON) ||
-				       sctx->current_rast_prim >= PIPE_PRIM_TRIANGLES_ADJACENCY;
-			bool is_line = !is_poly && sctx->current_rast_prim != PIPE_PRIM_POINTS;
+		bool is_poly = !util_prim_is_points_or_lines(sctx->current_rast_prim);
+		bool is_line = util_prim_is_lines(sctx->current_rast_prim);
 
-			key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
-			key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
+		key->part.ps.prolog.color_two_side = rs->two_side && sel->info.colors_read;
+		key->part.ps.prolog.flatshade_colors = rs->flatshade && sel->info.colors_read;
 
-			if (sctx->queued.named.blend) {
-				key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
-							      rs->multisample_enable;
-			}
+		if (sctx->queued.named.blend) {
+			key->part.ps.epilog.alpha_to_one = sctx->queued.named.blend->alpha_to_one &&
+							   rs->multisample_enable;
+		}
 
-			key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
-			key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
-							      (is_line && rs->line_smooth)) &&
-							     sctx->framebuffer.nr_samples <= 1;
-			key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
+		key->part.ps.prolog.poly_stipple = rs->poly_stipple_enable && is_poly;
+		key->part.ps.epilog.poly_line_smoothing = ((is_poly && rs->poly_smooth) ||
+							   (is_line && rs->line_smooth)) &&
+							  sctx->framebuffer.nr_samples <= 1;
+		key->part.ps.epilog.clamp_color = rs->clamp_fragment_color;
 
-			if (sctx->ps_iter_samples > 1 &&
-			    sel->info.reads_samplemask) {
-				key->part.ps.prolog.samplemask_log_ps_iter =
-					util_logbase2(util_next_power_of_two(sctx->ps_iter_samples));
-			}
+		if (sctx->ps_iter_samples > 1 &&
+		    sel->info.reads_samplemask) {
+			key->part.ps.prolog.samplemask_log_ps_iter =
+				util_logbase2(sctx->ps_iter_samples);
+		}
 
-			if (rs->force_persample_interp &&
-			    rs->multisample_enable &&
-			    sctx->framebuffer.nr_samples > 1 &&
-			    sctx->ps_iter_samples > 1) {
-				key->part.ps.prolog.force_persp_sample_interp =
-					sel->info.uses_persp_center ||
-					sel->info.uses_persp_centroid;
+		if (rs->force_persample_interp &&
+		    rs->multisample_enable &&
+		    sctx->framebuffer.nr_samples > 1 &&
+		    sctx->ps_iter_samples > 1) {
+			key->part.ps.prolog.force_persp_sample_interp =
+				sel->info.uses_persp_center ||
+				sel->info.uses_persp_centroid;
 
-				key->part.ps.prolog.force_linear_sample_interp =
-					sel->info.uses_linear_center ||
-					sel->info.uses_linear_centroid;
-			} else if (rs->multisample_enable &&
-				   sctx->framebuffer.nr_samples > 1) {
-				key->part.ps.prolog.bc_optimize_for_persp =
-					sel->info.uses_persp_center &&
-					sel->info.uses_persp_centroid;
-				key->part.ps.prolog.bc_optimize_for_linear =
-					sel->info.uses_linear_center &&
-					sel->info.uses_linear_centroid;
-			} else {
-				/* Make sure SPI doesn't compute more than 1 pair
-				 * of (i,j), which is the optimization here. */
-				key->part.ps.prolog.force_persp_center_interp =
-					sel->info.uses_persp_center +
-					sel->info.uses_persp_centroid +
-					sel->info.uses_persp_sample > 1;
+			key->part.ps.prolog.force_linear_sample_interp =
+				sel->info.uses_linear_center ||
+				sel->info.uses_linear_centroid;
+		} else if (rs->multisample_enable &&
+			   sctx->framebuffer.nr_samples > 1) {
+			key->part.ps.prolog.bc_optimize_for_persp =
+				sel->info.uses_persp_center &&
+				sel->info.uses_persp_centroid;
+			key->part.ps.prolog.bc_optimize_for_linear =
+				sel->info.uses_linear_center &&
+				sel->info.uses_linear_centroid;
+		} else {
+			/* Make sure SPI doesn't compute more than 1 pair
+			 * of (i,j), which is the optimization here. */
+			key->part.ps.prolog.force_persp_center_interp =
+				sel->info.uses_persp_center +
+				sel->info.uses_persp_centroid +
+				sel->info.uses_persp_sample > 1;
 
-				key->part.ps.prolog.force_linear_center_interp =
-					sel->info.uses_linear_center +
-					sel->info.uses_linear_centroid +
-					sel->info.uses_linear_sample > 1;
+			key->part.ps.prolog.force_linear_center_interp =
+				sel->info.uses_linear_center +
+				sel->info.uses_linear_centroid +
+				sel->info.uses_linear_sample > 1;
 
-				if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
-					key->mono.u.ps.interpolate_at_sample_force_center = 1;
-			}
+			if (sel->info.opcode_count[TGSI_OPCODE_INTERP_SAMPLE])
+				key->mono.u.ps.interpolate_at_sample_force_center = 1;
 		}
 
 		key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
@@ -1488,26 +1509,26 @@
 {
 	struct si_shader_selector *sel = shader->selector;
 	struct si_screen *sscreen = sel->screen;
-	LLVMTargetMachineRef tm;
+	struct ac_llvm_compiler *compiler;
 	struct pipe_debug_callback *debug = &shader->compiler_ctx_state.debug;
 	int r;
 
 	if (thread_index >= 0) {
 		if (low_priority) {
-			assert(thread_index < ARRAY_SIZE(sscreen->tm_low_priority));
-			tm = sscreen->tm_low_priority[thread_index];
+			assert(thread_index < ARRAY_SIZE(sscreen->compiler_lowp));
+			compiler = &sscreen->compiler_lowp[thread_index];
 		} else {
-			assert(thread_index < ARRAY_SIZE(sscreen->tm));
-			tm = sscreen->tm[thread_index];
+			assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+			compiler = &sscreen->compiler[thread_index];
 		}
 		if (!debug->async)
 			debug = NULL;
 	} else {
 		assert(!low_priority);
-		tm = shader->compiler_ctx_state.tm;
+		compiler = shader->compiler_ctx_state.compiler;
 	}
 
-	r = si_shader_create(sscreen, tm, shader, debug);
+	r = si_shader_create(sscreen, compiler, shader, debug);
 	if (unlikely(r)) {
 		PRINT_ERR("Failed to build shader variant (type=%u) %d\n",
 			 sel->type, r);
@@ -1559,10 +1580,10 @@
 		main_part->selector = sel;
 		main_part->key.as_es = key->as_es;
 		main_part->key.as_ls = key->as_ls;
+		main_part->is_monolithic = false;
 
-		if (si_compile_tgsi_shader(sscreen, compiler_state->tm,
-					   main_part, false,
-					   &compiler_state->debug) != 0) {
+		if (si_compile_tgsi_shader(sscreen, compiler_state->compiler,
+					   main_part, &compiler_state->debug) != 0) {
 			FREE(main_part);
 			return false;
 		}
@@ -1835,13 +1856,13 @@
 {
 	struct si_shader_selector *sel = (struct si_shader_selector *)job;
 	struct si_screen *sscreen = sel->screen;
-	LLVMTargetMachineRef tm;
+	struct ac_llvm_compiler *compiler;
 	struct pipe_debug_callback *debug = &sel->compiler_ctx_state.debug;
 
 	assert(!debug->debug_message || debug->async);
 	assert(thread_index >= 0);
-	assert(thread_index < ARRAY_SIZE(sscreen->tm));
-	tm = sscreen->tm[thread_index];
+	assert(thread_index < ARRAY_SIZE(sscreen->compiler));
+	compiler = &sscreen->compiler[thread_index];
 
 	/* Compile the main shader part for use with a prolog and/or epilog.
 	 * If this fails, the driver will try to compile a monolithic shader
@@ -1861,6 +1882,7 @@
 		util_queue_fence_init(&shader->ready);
 
 		shader->selector = sel;
+		shader->is_monolithic = false;
 		si_parse_next_shader_property(&sel->info,
 					      sel->so.num_outputs != 0,
 					      &shader->key);
@@ -1879,7 +1901,7 @@
 			mtx_unlock(&sscreen->shader_cache_mutex);
 
 			/* Compile the shader if it hasn't been loaded from the cache. */
-			if (si_compile_tgsi_shader(sscreen, tm, shader, false,
+			if (si_compile_tgsi_shader(sscreen, compiler, shader,
 						   debug) != 0) {
 				FREE(shader);
 				FREE(ir_binary);
@@ -1927,8 +1949,8 @@
 						break;
 					/* fall through */
 				default:
-					id = si_shader_io_get_unique_index(name, index);
-					sel->outputs_written &= ~(1ull << id);
+					id = si_shader_io_get_unique_index(name, index, true);
+					sel->outputs_written_before_ps &= ~(1ull << id);
 					break;
 				case TGSI_SEMANTIC_POSITION: /* ignore these */
 				case TGSI_SEMANTIC_PSIZE:
@@ -1942,7 +1964,7 @@
 
 	/* The GS copy shader is always pre-compiled. */
 	if (sel->type == PIPE_SHADER_GEOMETRY) {
-		sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, tm, sel, debug);
+		sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, compiler, sel, debug);
 		if (!sel->gs_copy_shader) {
 			fprintf(stderr, "radeonsi: can't create GS copy shader\n");
 			return;
@@ -1952,6 +1974,34 @@
 	}
 }
 
+void si_schedule_initial_compile(struct si_context *sctx, unsigned processor,
+				 struct util_queue_fence *ready_fence,
+				 struct si_compiler_ctx_state *compiler_ctx_state,
+				 void *job, util_queue_execute_func execute)
+{
+	util_queue_fence_init(ready_fence);
+
+	struct util_async_debug_callback async_debug;
+	bool wait =
+		(sctx->debug.debug_message && !sctx->debug.async) ||
+		sctx->is_debug ||
+		si_can_dump_shader(sctx->screen, processor);
+
+	if (wait) {
+		u_async_debug_init(&async_debug);
+		compiler_ctx_state->debug = async_debug.base;
+	}
+
+	util_queue_add_job(&sctx->screen->shader_compiler_queue, job,
+			   ready_fence, execute, NULL);
+
+	if (wait) {
+		util_queue_fence_wait(ready_fence);
+		u_async_debug_drain(&async_debug, &sctx->debug);
+		u_async_debug_cleanup(&async_debug);
+	}
+}
+
 /* Return descriptor slot usage masks from the given shader info. */
 void si_get_active_slot_masks(const struct tgsi_shader_info *info,
 			      uint32_t *const_and_shader_buffers,
@@ -2101,20 +2151,30 @@
 				/* fall through */
 			default:
 				sel->outputs_written |=
-					1ull << si_shader_io_get_unique_index(name, index);
+					1ull << si_shader_io_get_unique_index(name, index, false);
+				sel->outputs_written_before_ps |=
+					1ull << si_shader_io_get_unique_index(name, index, true);
 				break;
-			case TGSI_SEMANTIC_CLIPVERTEX: /* ignore these */
 			case TGSI_SEMANTIC_EDGEFLAG:
 				break;
 			}
 		}
 		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
+		sel->lshs_vertex_stride = sel->esgs_itemsize;
+
+		/* Add 1 dword to reduce LDS bank conflicts, so that each vertex
+		 * will start on a different bank. (except for the maximum 32*16).
+		 */
+		if (sel->lshs_vertex_stride < 32*16)
+			sel->lshs_vertex_stride += 4;
 
 		/* For the ESGS ring in LDS, add 1 dword to reduce LDS bank
 		 * conflicts, i.e. each vertex will start at a different bank.
 		 */
 		if (sctx->chip_class >= GFX9)
 			sel->esgs_itemsize += 4;
+
+		assert(((sel->esgs_itemsize / 4) & C_028AAC_ITEMSIZE) == 0);
 		break;
 
 	case PIPE_SHADER_FRAGMENT:
@@ -2130,7 +2190,7 @@
 				/* fall through */
 			default:
 				sel->inputs_read |=
-					1ull << si_shader_io_get_unique_index(name, index);
+					1ull << si_shader_io_get_unique_index(name, index, true);
 				break;
 			case TGSI_SEMANTIC_PCOORD: /* ignore this */
 				break;
@@ -2217,29 +2277,10 @@
 	}
 
 	(void) mtx_init(&sel->mutex, mtx_plain);
-	util_queue_fence_init(&sel->ready);
 
-	struct util_async_debug_callback async_debug;
-	bool wait =
-		(sctx->debug.debug_message && !sctx->debug.async) ||
-		sctx->is_debug ||
-		si_can_dump_shader(sscreen, sel->info.processor);
-
-	if (wait) {
-		u_async_debug_init(&async_debug);
-		sel->compiler_ctx_state.debug = async_debug.base;
-	}
-
-	util_queue_add_job(&sscreen->shader_compiler_queue, sel,
-			   &sel->ready, si_init_shader_selector_async,
-			   NULL);
-
-	if (wait) {
-		util_queue_fence_wait(&sel->ready);
-		u_async_debug_drain(&async_debug, &sctx->debug);
-		u_async_debug_cleanup(&async_debug);
-	}
-
+	si_schedule_initial_compile(sctx, sel->info.processor, &sel->ready,
+				    &sel->compiler_ctx_state, sel,
+				    si_init_shader_selector_async);
 	return sel;
 }
 
@@ -2272,7 +2313,7 @@
 	     !next_hw_vs_variant ||
 	     old_hw_vs_variant->key.opt.clip_disable !=
 	     next_hw_vs_variant->key.opt.clip_disable))
-		si_mark_atom_dirty(sctx, &sctx->clip_regs);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 }
 
 static void si_update_common_shader_state(struct si_context *sctx)
@@ -2428,14 +2469,14 @@
 
 		if (!old_sel ||
 		    old_sel->info.colors_written != sel->info.colors_written)
-			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
 
 		if (sctx->screen->has_out_of_order_rast &&
 		    (!old_sel ||
 		     old_sel->info.writes_memory != sel->info.writes_memory ||
 		     old_sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] !=
 		     sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
-			si_mark_atom_dirty(sctx, &sctx->msaa_config);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 	}
 	si_set_active_descriptors_for_shader(sctx, sel);
 	si_update_ps_colorbuf0_slot(sctx);
@@ -2598,29 +2639,27 @@
 	return ps_input_cntl;
 }
 
-static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_spi_map(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo = ps ? &ps->selector->info : NULL;
 	unsigned i, num_interp, num_written = 0, bcol_interp[2];
+	unsigned spi_ps_input_cntl[32];
 
 	if (!ps || !ps->selector->info.num_inputs)
 		return;
 
 	num_interp = si_get_ps_num_interp(ps);
 	assert(num_interp > 0);
-	radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, num_interp);
 
 	for (i = 0; i < psinfo->num_inputs; i++) {
 		unsigned name = psinfo->input_semantic_name[i];
 		unsigned index = psinfo->input_semantic_index[i];
 		unsigned interpolate = psinfo->input_interpolate[i];
 
-		radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, name, index,
-						     interpolate));
-		num_written++;
+		spi_ps_input_cntl[num_written++] = si_get_ps_input_cntl(sctx, vs, name,
+							    index, interpolate);
 
 		if (name == TGSI_SEMANTIC_COLOR) {
 			assert(index < ARRAY_SIZE(bcol_interp));
@@ -2635,12 +2674,19 @@
 			if (!(psinfo->colors_read & (0xf << (i * 4))))
 				continue;
 
-			radeon_emit(cs, si_get_ps_input_cntl(sctx, vs, bcol,
-							     i, bcol_interp[i]));
-			num_written++;
+			spi_ps_input_cntl[num_written++] =
+			  si_get_ps_input_cntl(sctx, vs, bcol, i, bcol_interp[i]);
+
 		}
 	}
 	assert(num_interp == num_written);
+
+	/* R_028644_SPI_PS_INPUT_CNTL_0 */
+	/* Dota 2: Only ~16% of SPI map updates set different values. */
+	/* Talos: Only ~9% of SPI map updates set different values. */
+	radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
+				    spi_ps_input_cntl,
+				    sctx->tracked_regs.spi_ps_input_cntl, num_interp);
 }
 
 /**
@@ -2720,7 +2766,7 @@
 	if (update_esgs) {
 		pipe_resource_reference(&sctx->esgs_ring, NULL);
 		sctx->esgs_ring =
-			si_aligned_buffer_create(sctx->b.screen,
+			pipe_aligned_buffer_create(sctx->b.screen,
 						   SI_RESOURCE_FLAG_UNMAPPABLE,
 						   PIPE_USAGE_DEFAULT,
 						   esgs_ring_size, alignment);
@@ -2731,7 +2777,7 @@
 	if (update_gsvs) {
 		pipe_resource_reference(&sctx->gsvs_ring, NULL);
 		sctx->gsvs_ring =
-			si_aligned_buffer_create(sctx->b.screen,
+			pipe_aligned_buffer_create(sctx->b.screen,
 						   SI_RESOURCE_FLAG_UNMAPPABLE,
 						   PIPE_USAGE_DEFAULT,
 						   gsvs_ring_size, alignment);
@@ -2972,7 +3018,7 @@
 			/* Create a bigger scratch buffer */
 			r600_resource_reference(&sctx->scratch_buffer, NULL);
 
-			sctx->scratch_buffer = (struct r600_resource*)
+			sctx->scratch_buffer =
 				si_aligned_buffer_create(&sctx->screen->b,
 							   SI_RESOURCE_FLAG_UNMAPPABLE,
 							   PIPE_USAGE_DEFAULT,
@@ -2980,7 +3026,7 @@
 			if (!sctx->scratch_buffer)
 				return false;
 
-			si_mark_atom_dirty(sctx, &sctx->scratch_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
 			si_context_add_resource_size(sctx,
 						     &sctx->scratch_buffer->b.b);
 		}
@@ -2997,7 +3043,7 @@
 			   S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
 	if (spi_tmpring_size != sctx->spi_tmpring_size) {
 		sctx->spi_tmpring_size = spi_tmpring_size;
-		si_mark_atom_dirty(sctx, &sctx->scratch_state);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
 	}
 	return true;
 }
@@ -3009,7 +3055,7 @@
 	/* The address must be aligned to 2^19, because the shader only
 	 * receives the high 13 bits.
 	 */
-	sctx->tess_rings = si_aligned_buffer_create(sctx->b.screen,
+	sctx->tess_rings = pipe_aligned_buffer_create(sctx->b.screen,
 						    SI_RESOURCE_FLAG_32BIT,
 						    PIPE_USAGE_DEFAULT,
 						    sctx->screen->tess_offchip_ring_size +
@@ -3134,7 +3180,7 @@
 		old_ps ? old_ps->key.part.ps.epilog.spi_shader_col_format : 0;
 	int r;
 
-	compiler_state.tm = sctx->tm;
+	compiler_state.compiler = &sctx->compiler;
 	compiler_state.debug = sctx->debug;
 	compiler_state.is_debug_context = sctx->is_debug;
 
@@ -3234,7 +3280,7 @@
 	si_update_vgt_shader_config(sctx);
 
 	if (old_clip_disable != si_get_vs_state(sctx)->key.opt.clip_disable)
-		si_mark_atom_dirty(sctx, &sctx->clip_regs);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);
 
 	if (sctx->ps_shader.cso) {
 		unsigned db_shader_control;
@@ -3253,7 +3299,7 @@
 		    sctx->flatshade != rs->flatshade) {
 			sctx->sprite_coord_enable = rs->sprite_coord_enable;
 			sctx->flatshade = rs->flatshade;
-			si_mark_atom_dirty(sctx, &sctx->spi_map);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
 		}
 
 		if (sctx->screen->rbplus_allowed &&
@@ -3261,24 +3307,24 @@
 		    (!old_ps ||
 		     old_spi_shader_col_format !=
 		     sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format))
-			si_mark_atom_dirty(sctx, &sctx->cb_render_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
 
 		if (sctx->ps_db_shader_control != db_shader_control) {
 			sctx->ps_db_shader_control = db_shader_control;
-			si_mark_atom_dirty(sctx, &sctx->db_render_state);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 			if (sctx->screen->dpbb_allowed)
-				si_mark_atom_dirty(sctx, &sctx->dpbb_state);
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
 		}
 
 		if (sctx->smoothing_enabled != sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing) {
 			sctx->smoothing_enabled = sctx->ps_shader.current->key.part.ps.epilog.poly_line_smoothing;
-			si_mark_atom_dirty(sctx, &sctx->msaa_config);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
 
 			if (sctx->chip_class == SI)
-				si_mark_atom_dirty(sctx, &sctx->db_render_state);
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
 
 			if (sctx->framebuffer.nr_samples <= 1)
-				si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+				si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
 		}
 	}
 
@@ -3328,10 +3374,9 @@
 	return true;
 }
 
-static void si_emit_scratch_state(struct si_context *sctx,
-				  struct r600_atom *atom)
+static void si_emit_scratch_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 
 	radeon_set_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
 			       sctx->spi_tmpring_size);
@@ -3409,9 +3454,8 @@
 
 void si_init_shader_functions(struct si_context *sctx)
 {
-	si_init_atom(sctx, &sctx->spi_map, &sctx->atoms.s.spi_map, si_emit_spi_map);
-	si_init_atom(sctx, &sctx->scratch_state, &sctx->atoms.s.scratch_state,
-		     si_emit_scratch_state);
+	sctx->atoms.s.spi_map.emit = si_emit_spi_map;
+	sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
 
 	sctx->b.create_vs_state = si_create_shader_selector;
 	sctx->b.create_tcs_state = si_create_shader_selector;
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index e77eafe..fd7e843 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -25,6 +25,7 @@
 #include "si_build_pm4.h"
 
 #include "util/u_memory.h"
+#include "util/u_suballoc.h"
 
 static void si_set_streamout_enable(struct si_context *sctx, bool enable);
 
@@ -42,7 +43,7 @@
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_streamout_target *t;
-	struct r600_resource *rbuffer = (struct r600_resource*)buffer;
+	struct r600_resource *rbuffer = r600_resource(buffer);
 
 	t = CALLOC_STRUCT(si_streamout_target);
 	if (!t) {
@@ -82,7 +83,7 @@
 	if (!sctx->streamout.enabled_mask)
 		return;
 
-	si_mark_atom_dirty(sctx, &sctx->streamout.begin_atom);
+	si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
 	si_set_streamout_enable(sctx, true);
 }
 
@@ -115,9 +116,9 @@
 		/* Invalidate the scalar cache in case a streamout buffer is
 		 * going to be used as a constant buffer.
 		 *
-		 * Invalidate TC L1, because streamout bypasses it (done by
-		 * setting GLC=1 in the store instruction), but it can contain
-		 * outdated data of streamout buffers.
+		 * Invalidate vL1, because streamout bypasses it (done by
+		 * setting GLC=1 in the store instruction), but vL1 in other
+		 * CUs can contain outdated data of streamout buffers.
 		 *
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
@@ -168,7 +169,7 @@
 	if (num_targets) {
 		si_streamout_buffers_dirty(sctx);
 	} else {
-		si_set_atom_dirty(sctx, &sctx->streamout.begin_atom, false);
+		si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
 		si_set_streamout_enable(sctx, false);
 	}
 
@@ -200,7 +201,7 @@
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						buffer);
 			radeon_add_to_gfx_buffer_list_check_mem(sctx,
-							    (struct r600_resource*)buffer,
+							    r600_resource(buffer),
 							    buffers->shader_usage,
 							    RADEON_PRIO_SHADER_RW_BUFFER,
 							    true);
@@ -229,7 +230,7 @@
 
 static void si_flush_vgt_streamout(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	unsigned reg_strmout_cntl;
 
 	/* The register is at different places on different ASICs. */
@@ -253,9 +254,9 @@
 	radeon_emit(cs, 4); /* poll interval */
 }
 
-static void si_emit_streamout_begin(struct si_context *sctx, struct r600_atom *atom)
+static void si_emit_streamout_begin(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_streamout_target **t = sctx->streamout.targets;
 	uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
 	unsigned i;
@@ -310,7 +311,7 @@
 
 void si_emit_streamout_end(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->gfx_cs;
+	struct radeon_cmdbuf *cs = sctx->gfx_cs;
 	struct si_streamout_target **t = sctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
@@ -355,8 +356,7 @@
  * are no buffers bound.
  */
 
-static void si_emit_streamout_enable(struct si_context *sctx,
-				     struct r600_atom *atom)
+static void si_emit_streamout_enable(struct si_context *sctx)
 {
 	radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
 	radeon_emit(sctx->gfx_cs,
@@ -384,7 +384,7 @@
 
 	if ((old_strmout_en != si_get_strmout_en(sctx)) ||
             (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
-		si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+		si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 }
 
 void si_update_prims_generated_query_state(struct si_context *sctx,
@@ -400,7 +400,7 @@
 			sctx->streamout.num_prims_gen_queries != 0;
 
 		if (old_strmout_en != si_get_strmout_en(sctx))
-			si_mark_atom_dirty(sctx, &sctx->streamout.enable_atom);
+			si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
 	}
 }
 
@@ -409,6 +409,6 @@
 	sctx->b.create_stream_output_target = si_create_so_target;
 	sctx->b.stream_output_target_destroy = si_so_target_destroy;
 	sctx->b.set_stream_output_targets = si_set_streamout_targets;
-	sctx->streamout.begin_atom.emit = si_emit_streamout_begin;
-	sctx->streamout.enable_atom.emit = si_emit_streamout_enable;
+	sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
+	sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
 }
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index 09ea584..4183be0 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -44,7 +44,7 @@
 		return;
 
 	ctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
 /* Since the guard band disables clipping, we have to clip per-pixel
@@ -110,7 +110,7 @@
 }
 
 static void si_emit_one_scissor(struct si_context *ctx,
-				struct radeon_winsys_cs *cs,
+				struct radeon_cmdbuf *cs,
 				struct si_signed_scissor *vp_scissor,
 				struct pipe_scissor_state *scissor)
 {
@@ -136,14 +136,27 @@
 /* the range is [-MAX, MAX] */
 #define SI_MAX_VIEWPORT_RANGE 32768
 
-static void si_emit_guardband(struct si_context *ctx,
-			      struct si_signed_scissor *vp_as_scissor)
+static void si_emit_guardband(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	const struct si_signed_scissor *vp_as_scissor;
+	struct si_signed_scissor max_vp_scissor;
 	struct pipe_viewport_state vp;
 	float left, top, right, bottom, max_range, guardband_x, guardband_y;
 	float discard_x, discard_y;
 
+	if (ctx->vs_writes_viewport_index) {
+		/* Shaders can draw to any viewport. Make a union of all
+		 * viewports. */
+		max_vp_scissor = ctx->viewports.as_scissor[0];
+		for (unsigned i = 1; i < SI_MAX_VIEWPORTS; i++) {
+			si_scissor_make_union(&max_vp_scissor,
+					      &ctx->viewports.as_scissor[i]);
+		}
+		vp_as_scissor = &max_vp_scissor;
+	} else {
+		vp_as_scissor = &ctx->viewports.as_scissor[0];
+	}
+
 	/* Reconstruct the viewport transformation from the scissor. */
 	vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
 	vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0;
@@ -179,8 +192,7 @@
 	discard_x = 1.0;
 	discard_y = 1.0;
 
-	if (unlikely(ctx->current_rast_prim < PIPE_PRIM_TRIANGLES) &&
-	    ctx->queued.named.rasterizer) {
+	if (unlikely(util_prim_is_points_or_lines(ctx->current_rast_prim))) {
 		/* When rendering wide points or lines, we need to be more
 		 * conservative about when to discard them entirely. */
 		const struct si_state_rasterizer *rs = ctx->queued.named.rasterizer;
@@ -201,26 +213,22 @@
 		discard_y = MIN2(discard_y, guardband_y);
 	}
 
-	/* If any of the GB registers is updated, all of them must be updated. */
-	radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
-
-	radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
-	radeon_emit(cs, fui(discard_y));   /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
-	radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
-	radeon_emit(cs, fui(discard_x));   /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+	/* If any of the GB registers is updated, all of them must be updated.
+	 * R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, R_028BEC_PA_CL_GB_VERT_DISC_ADJ
+	 * R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ
+	 */
+	radeon_opt_set_context_reg4(ctx, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ,
+				    SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ,
+				    fui(guardband_y), fui(discard_y),
+				    fui(guardband_x), fui(discard_x));
 }
 
-static void si_emit_scissors(struct si_context *ctx, struct r600_atom *atom)
+static void si_emit_scissors(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_scissor_state *states = ctx->scissors.states;
 	unsigned mask = ctx->scissors.dirty_mask;
-	bool scissor_enabled = false;
-	struct si_signed_scissor max_vp_scissor;
-	int i;
-
-	if (ctx->queued.named.rasterizer)
-		scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
+	bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
 
 	/* The simple case: Only 1 viewport is active. */
 	if (!ctx->vs_writes_viewport_index) {
@@ -231,17 +239,10 @@
 
 		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
 		si_emit_one_scissor(ctx, cs, vp, scissor_enabled ? &states[0] : NULL);
-		si_emit_guardband(ctx, vp);
 		ctx->scissors.dirty_mask &= ~1; /* clear one bit */
 		return;
 	}
 
-	/* Shaders can draw to any viewport. Make a union of all viewports. */
-	max_vp_scissor = ctx->viewports.as_scissor[0];
-	for (i = 1; i < SI_MAX_VIEWPORTS; i++)
-		si_scissor_make_union(&max_vp_scissor,
-				      &ctx->viewports.as_scissor[i]);
-
 	while (mask) {
 		int start, count, i;
 
@@ -254,7 +255,6 @@
 					    scissor_enabled ? &states[i] : NULL);
 		}
 	}
-	si_emit_guardband(ctx, &max_vp_scissor);
 	ctx->scissors.dirty_mask = 0;
 }
 
@@ -279,14 +279,15 @@
 	ctx->viewports.dirty_mask |= mask;
 	ctx->viewports.depth_range_dirty_mask |= mask;
 	ctx->scissors.dirty_mask |= mask;
-	si_mark_atom_dirty(ctx, &ctx->viewports.atom);
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 }
 
 static void si_emit_one_viewport(struct si_context *ctx,
 				 struct pipe_viewport_state *state)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 
 	radeon_emit(cs, fui(state->scale[0]));
 	radeon_emit(cs, fui(state->translate[0]));
@@ -298,7 +299,7 @@
 
 static void si_emit_viewports(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_viewport_state *states = ctx->viewports.states;
 	unsigned mask = ctx->viewports.dirty_mask;
 
@@ -340,16 +341,13 @@
 
 static void si_emit_depth_ranges(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->gfx_cs;
+	struct radeon_cmdbuf *cs = ctx->gfx_cs;
 	struct pipe_viewport_state *states = ctx->viewports.states;
 	unsigned mask = ctx->viewports.depth_range_dirty_mask;
-	bool clip_halfz = false;
+	bool clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
 	bool window_space = ctx->vs_disables_clipping_viewport;
 	float zmin, zmax;
 
-	if (ctx->queued.named.rasterizer)
-		clip_halfz = ctx->queued.named.rasterizer->clip_halfz;
-
 	/* The simple case: Only 1 viewport is active. */
 	if (!ctx->vs_writes_viewport_index) {
 		if (!(mask & 1))
@@ -382,8 +380,7 @@
 	ctx->viewports.depth_range_dirty_mask = 0;
 }
 
-static void si_emit_viewport_states(struct si_context *ctx,
-				    struct r600_atom *atom)
+static void si_emit_viewport_states(struct si_context *ctx)
 {
 	si_emit_viewports(ctx);
 	si_emit_depth_ranges(ctx);
@@ -415,27 +412,34 @@
 		ctx->vs_disables_clipping_viewport = vs_window_space;
 		ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
 		ctx->viewports.depth_range_dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(ctx, &ctx->scissors.atom);
-		si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
+		si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 	}
 
 	/* Viewport index handling. */
+	if (ctx->vs_writes_viewport_index == info->writes_viewport_index)
+		return;
+
+	/* This changes how the guardband is computed. */
 	ctx->vs_writes_viewport_index = info->writes_viewport_index;
+	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
+
 	if (!ctx->vs_writes_viewport_index)
 		return;
 
 	if (ctx->scissors.dirty_mask)
-	    si_mark_atom_dirty(ctx, &ctx->scissors.atom);
+	    si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
 
 	if (ctx->viewports.dirty_mask ||
 	    ctx->viewports.depth_range_dirty_mask)
-	    si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+	    si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);
 }
 
 void si_init_viewport_functions(struct si_context *ctx)
 {
-	ctx->scissors.atom.emit = si_emit_scissors;
-	ctx->viewports.atom.emit = si_emit_viewport_states;
+	ctx->atoms.s.guardband.emit = si_emit_guardband;
+	ctx->atoms.s.scissors.emit = si_emit_scissors;
+	ctx->atoms.s.viewports.emit = si_emit_viewport_states;
 
 	ctx->b.set_scissor_states = si_set_scissor_states;
 	ctx->b.set_viewport_states = si_set_viewport_states;
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index 6c3cde4..ee6ab7c 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -220,8 +220,8 @@
 	 */
 	for (i = 0; i < iterations; i++) {
 		struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
-		struct r600_texture *rdst;
-		struct r600_texture *rsrc;
+		struct si_texture *sdst;
+		struct si_texture *ssrc;
 		struct cpu_texture src_cpu, dst_cpu;
 		unsigned bpp, max_width, max_height, max_depth, j, num;
 		unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen;
@@ -290,24 +290,24 @@
 		dst = screen->resource_create(screen, &tdst);
 		assert(src);
 		assert(dst);
-		rdst = (struct r600_texture*)dst;
-		rsrc = (struct r600_texture*)src;
+		sdst = (struct si_texture*)dst;
+		ssrc = (struct si_texture*)src;
 		alloc_cpu_texture(&src_cpu, &tsrc, bpp);
 		alloc_cpu_texture(&dst_cpu, &tdst, bpp);
 
 		printf("%4u: dst = (%5u x %5u x %u, %s), "
 		       " src = (%5u x %5u x %u, %s), bpp = %2u, ",
 		       i, tdst.width0, tdst.height0, tdst.array_size,
-		       array_mode_to_string(sscreen, &rdst->surface),
+		       array_mode_to_string(sscreen, &sdst->surface),
 		       tsrc.width0, tsrc.height0, tsrc.array_size,
-		       array_mode_to_string(sscreen, &rsrc->surface), bpp);
+		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
 		fflush(stdout);
 
 		/* set src pixels */
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
-		si_clear_buffer(sctx, dst, 0, rdst->surface.surf_size, 0, true);
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0, true);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
@@ -337,8 +337,8 @@
 				dstz = rand() % (tdst.array_size - depth + 1);
 
 				/* special code path to hit the tiled partial copies */
-				if (!rsrc->surface.is_linear &&
-				    !rdst->surface.is_linear &&
+				if (!ssrc->surface.is_linear &&
+				    !sdst->surface.is_linear &&
 				    rand() & 1) {
 					if (max_width < 8 || max_height < 8)
 						continue;
@@ -365,8 +365,8 @@
 				}
 
 				/* special code path to hit out-of-bounds reads in L2T */
-				if (rsrc->surface.is_linear &&
-				    !rdst->surface.is_linear &&
+				if (ssrc->surface.is_linear &&
+				    !sdst->surface.is_linear &&
 				    rand() % 4 == 0) {
 					srcx = 0;
 					srcy = 0;
diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c
index af606d7..bcff226 100644
--- a/src/gallium/drivers/radeonsi/si_texture.c
+++ b/src/gallium/drivers/radeonsi/si_texture.c
@@ -31,6 +31,7 @@
 #include "util/u_pack_color.h"
 #include "util/u_resource.h"
 #include "util/u_surface.h"
+#include "util/u_transfer.h"
 #include "util/os_time.h"
 #include <errno.h>
 #include <inttypes.h>
@@ -43,37 +44,37 @@
 
 
 bool si_prepare_for_dma_blit(struct si_context *sctx,
-			     struct r600_texture *rdst,
+			     struct si_texture *dst,
 			     unsigned dst_level, unsigned dstx,
 			     unsigned dsty, unsigned dstz,
-			     struct r600_texture *rsrc,
+			     struct si_texture *src,
 			     unsigned src_level,
 			     const struct pipe_box *src_box)
 {
 	if (!sctx->dma_cs)
 		return false;
 
-	if (rdst->surface.bpe != rsrc->surface.bpe)
+	if (dst->surface.bpe != src->surface.bpe)
 		return false;
 
 	/* MSAA: Blits don't exist in the real world. */
-	if (rsrc->resource.b.b.nr_samples > 1 ||
-	    rdst->resource.b.b.nr_samples > 1)
+	if (src->buffer.b.b.nr_samples > 1 ||
+	    dst->buffer.b.b.nr_samples > 1)
 		return false;
 
 	/* Depth-stencil surfaces:
 	 *   When dst is linear, the DB->CB copy preserves HTILE.
 	 *   When dst is tiled, the 3D path must be used to update HTILE.
 	 */
-	if (rsrc->is_depth || rdst->is_depth)
+	if (src->is_depth || dst->is_depth)
 		return false;
 
 	/* DCC as:
 	 *   src: Use the 3D path. DCC decompression is expensive.
 	 *   dst: Use the 3D path to compress the pixels with DCC.
 	 */
-	if (vi_dcc_enabled(rsrc, src_level) ||
-	    vi_dcc_enabled(rdst, dst_level))
+	if (vi_dcc_enabled(src, src_level) ||
+	    vi_dcc_enabled(dst, dst_level))
 		return false;
 
 	/* CMASK as:
@@ -81,23 +82,23 @@
 	 *   dst: If overwriting the whole texture, discard CMASK and use
 	 *        SDMA. Otherwise, use the 3D path.
 	 */
-	if (rdst->cmask.size && rdst->dirty_level_mask & (1 << dst_level)) {
+	if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
 		/* The CMASK clear is only enabled for the first level. */
 		assert(dst_level == 0);
-		if (!util_texrange_covers_whole_level(&rdst->resource.b.b, dst_level,
+		if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level,
 						      dstx, dsty, dstz, src_box->width,
 						      src_box->height, src_box->depth))
 			return false;
 
-		si_texture_discard_cmask(sctx->screen, rdst);
+		si_texture_discard_cmask(sctx->screen, dst);
 	}
 
 	/* All requirements are met. Prepare textures for SDMA. */
-	if (rsrc->cmask.size && rsrc->dirty_level_mask & (1 << src_level))
-		sctx->b.flush_resource(&sctx->b, &rsrc->resource.b.b);
+	if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
+		sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
 
-	assert(!(rsrc->dirty_level_mask & (1 << src_level)));
-	assert(!(rdst->dirty_level_mask & (1 << dst_level)));
+	assert(!(src->dirty_level_mask & (1 << src_level)));
+	assert(!(dst->dirty_level_mask & (1 << dst_level)));
 
 	return true;
 }
@@ -137,11 +138,11 @@
 }
 
 /* Copy from a full GPU texture to a transfer's staging one. */
-static void si_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
-	struct pipe_resource *dst = &rtransfer->staging->b.b;
+	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
+	struct pipe_resource *dst = &stransfer->staging->b.b;
 	struct pipe_resource *src = transfer->resource;
 
 	if (src->nr_samples > 1) {
@@ -155,12 +156,12 @@
 }
 
 /* Copy from a transfer's staging texture to a full GPU one. */
-static void si_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer)
+static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer;
+	struct pipe_transfer *transfer = (struct pipe_transfer*)stransfer;
 	struct pipe_resource *dst = transfer->resource;
-	struct pipe_resource *src = &rtransfer->staging->b.b;
+	struct pipe_resource *src = &stransfer->staging->b.b;
 	struct pipe_box sbox;
 
 	u_box_3d(0, 0, 0, transfer->box.width, transfer->box.height, transfer->box.depth, &sbox);
@@ -178,41 +179,41 @@
 }
 
 static unsigned si_texture_get_offset(struct si_screen *sscreen,
-				      struct r600_texture *rtex, unsigned level,
+				      struct si_texture *tex, unsigned level,
 				      const struct pipe_box *box,
 				      unsigned *stride,
 				      unsigned *layer_stride)
 {
 	if (sscreen->info.chip_class >= GFX9) {
-		*stride = rtex->surface.u.gfx9.surf_pitch * rtex->surface.bpe;
-		*layer_stride = rtex->surface.u.gfx9.surf_slice_size;
+		*stride = tex->surface.u.gfx9.surf_pitch * tex->surface.bpe;
+		*layer_stride = tex->surface.u.gfx9.surf_slice_size;
 
 		if (!box)
 			return 0;
 
 		/* Each texture is an array of slices. Each slice is an array
 		 * of mipmap levels. */
-		return box->z * rtex->surface.u.gfx9.surf_slice_size +
-		       rtex->surface.u.gfx9.offset[level] +
-		       (box->y / rtex->surface.blk_h *
-			rtex->surface.u.gfx9.surf_pitch +
-			box->x / rtex->surface.blk_w) * rtex->surface.bpe;
+		return box->z * tex->surface.u.gfx9.surf_slice_size +
+		       tex->surface.u.gfx9.offset[level] +
+		       (box->y / tex->surface.blk_h *
+			tex->surface.u.gfx9.surf_pitch +
+			box->x / tex->surface.blk_w) * tex->surface.bpe;
 	} else {
-		*stride = rtex->surface.u.legacy.level[level].nblk_x *
-			  rtex->surface.bpe;
-		assert((uint64_t)rtex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
-		*layer_stride = (uint64_t)rtex->surface.u.legacy.level[level].slice_size_dw * 4;
+		*stride = tex->surface.u.legacy.level[level].nblk_x *
+			  tex->surface.bpe;
+		assert((uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 <= UINT_MAX);
+		*layer_stride = (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4;
 
 		if (!box)
-			return rtex->surface.u.legacy.level[level].offset;
+			return tex->surface.u.legacy.level[level].offset;
 
 		/* Each texture is an array of mipmap levels. Each level is
 		 * an array of slices. */
-		return rtex->surface.u.legacy.level[level].offset +
-		       box->z * (uint64_t)rtex->surface.u.legacy.level[level].slice_size_dw * 4 +
-		       (box->y / rtex->surface.blk_h *
-		        rtex->surface.u.legacy.level[level].nblk_x +
-		        box->x / rtex->surface.blk_w) * rtex->surface.bpe;
+		return tex->surface.u.legacy.level[level].offset +
+		       box->z * (uint64_t)tex->surface.u.legacy.level[level].slice_size_dw * 4 +
+		       (box->y / tex->surface.blk_h *
+		        tex->surface.u.legacy.level[level].nblk_x +
+		        box->x / tex->surface.blk_w) * tex->surface.bpe;
 	}
 }
 
@@ -238,7 +239,7 @@
 
 	if (!is_flushed_depth &&
 	    ptex->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
-		bpe = 4; /* stencil is allocated separately on evergreen */
+		bpe = 4; /* stencil is allocated separately */
 	} else {
 		bpe = util_format_get_blocksize(ptex->format);
 		assert(util_is_power_of_two_or_zero(bpe));
@@ -268,9 +269,23 @@
 	if (sscreen->info.chip_class >= VI &&
 	    (ptex->flags & SI_RESOURCE_FLAG_DISABLE_DCC ||
 	     ptex->format == PIPE_FORMAT_R9G9B9E5_FLOAT ||
-	     /* DCC MSAA array textures are disallowed due to incomplete clear impl. */
-	     (ptex->nr_samples >= 2 &&
-	      (!sscreen->dcc_msaa_allowed || ptex->array_size > 1))))
+	     (ptex->nr_samples >= 2 && !sscreen->dcc_msaa_allowed)))
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* Stoney: 128bpp MSAA textures randomly fail piglit tests with DCC. */
+	if (sscreen->info.family == CHIP_STONEY &&
+	    bpe == 16 && ptex->nr_samples >= 2)
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* VI: DCC clear for 4x and 8x MSAA array textures unimplemented. */
+	if (sscreen->info.chip_class == VI &&
+	    ptex->nr_storage_samples >= 4 &&
+	    ptex->array_size > 1)
+		flags |= RADEON_SURF_DISABLE_DCC;
+
+	/* GFX9: DCC clear for 4x and 8x MSAA textures unimplemented. */
+	if (sscreen->info.chip_class >= GFX9 &&
+	    ptex->nr_storage_samples >= 4)
 		flags |= RADEON_SURF_DISABLE_DCC;
 
 	if (ptex->bind & PIPE_BIND_SCANOUT || is_scanout) {
@@ -321,10 +336,10 @@
 }
 
 static void si_texture_init_metadata(struct si_screen *sscreen,
-				     struct r600_texture *rtex,
+				     struct si_texture *tex,
 				     struct radeon_bo_metadata *metadata)
 {
-	struct radeon_surf *surface = &rtex->surface;
+	struct radeon_surf *surface = &tex->surface;
 
 	memset(metadata, 0, sizeof(*metadata));
 
@@ -382,7 +397,7 @@
 }
 
 void si_eliminate_fast_color_clear(struct si_context *sctx,
-				   struct r600_texture *rtex)
+				   struct si_texture *tex)
 {
 	struct si_screen *sscreen = sctx->screen;
 	struct pipe_context *ctx = &sctx->b;
@@ -391,7 +406,7 @@
 		mtx_lock(&sscreen->aux_context_lock);
 
 	unsigned n = sctx->num_decompress_calls;
-	ctx->flush_resource(ctx, &rtex->resource.b.b);
+	ctx->flush_resource(ctx, &tex->buffer.b.b);
 
 	/* Flush only if any fast clear elimination took place. */
 	if (n != sctx->num_decompress_calls)
@@ -402,46 +417,47 @@
 }
 
 void si_texture_discard_cmask(struct si_screen *sscreen,
-			      struct r600_texture *rtex)
+			      struct si_texture *tex)
 {
-	if (!rtex->cmask.size)
+	if (!tex->cmask_buffer)
 		return;
 
-	assert(rtex->resource.b.b.nr_samples <= 1);
+	assert(tex->buffer.b.b.nr_samples <= 1);
 
 	/* Disable CMASK. */
-	memset(&rtex->cmask, 0, sizeof(rtex->cmask));
-	rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8;
-	rtex->dirty_level_mask = 0;
+	tex->cmask_base_address_reg = tex->buffer.gpu_address >> 8;
+	tex->dirty_level_mask = 0;
 
-	rtex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
+	tex->cb_color_info &= ~S_028C70_FAST_CLEAR(1);
 
-	if (rtex->cmask_buffer != &rtex->resource)
-	    r600_resource_reference(&rtex->cmask_buffer, NULL);
+	if (tex->cmask_buffer != &tex->buffer)
+	    r600_resource_reference(&tex->cmask_buffer, NULL);
+
+	tex->cmask_buffer = NULL;
 
 	/* Notify all contexts about the change. */
 	p_atomic_inc(&sscreen->dirty_tex_counter);
 	p_atomic_inc(&sscreen->compressed_colortex_counter);
 }
 
-static bool si_can_disable_dcc(struct r600_texture *rtex)
+static bool si_can_disable_dcc(struct si_texture *tex)
 {
 	/* We can't disable DCC if it can be written by another process. */
-	return rtex->dcc_offset &&
-	       (!rtex->resource.b.is_shared ||
-		!(rtex->resource.external_usage & PIPE_HANDLE_USAGE_WRITE));
+	return tex->dcc_offset &&
+	       (!tex->buffer.b.is_shared ||
+		!(tex->buffer.external_usage & PIPE_HANDLE_USAGE_WRITE));
 }
 
 static bool si_texture_discard_dcc(struct si_screen *sscreen,
-				   struct r600_texture *rtex)
+				   struct si_texture *tex)
 {
-	if (!si_can_disable_dcc(rtex))
+	if (!si_can_disable_dcc(tex))
 		return false;
 
-	assert(rtex->dcc_separate_buffer == NULL);
+	assert(tex->dcc_separate_buffer == NULL);
 
 	/* Disable DCC. */
-	rtex->dcc_offset = 0;
+	tex->dcc_offset = 0;
 
 	/* Notify all contexts about the change. */
 	p_atomic_inc(&sscreen->dirty_tex_counter);
@@ -470,43 +486,43 @@
  *              if you don't.
  */
 bool si_texture_disable_dcc(struct si_context *sctx,
-			    struct r600_texture *rtex)
+			    struct si_texture *tex)
 {
 	struct si_screen *sscreen = sctx->screen;
 
-	if (!si_can_disable_dcc(rtex))
+	if (!si_can_disable_dcc(tex))
 		return false;
 
 	if (&sctx->b == sscreen->aux_context)
 		mtx_lock(&sscreen->aux_context_lock);
 
 	/* Decompress DCC. */
-	si_decompress_dcc(sctx, rtex);
+	si_decompress_dcc(sctx, tex);
 	sctx->b.flush(&sctx->b, NULL, 0);
 
 	if (&sctx->b == sscreen->aux_context)
 		mtx_unlock(&sscreen->aux_context_lock);
 
-	return si_texture_discard_dcc(sscreen, rtex);
+	return si_texture_discard_dcc(sscreen, tex);
 }
 
 static void si_reallocate_texture_inplace(struct si_context *sctx,
-					  struct r600_texture *rtex,
+					  struct si_texture *tex,
 					  unsigned new_bind_flag,
 					  bool invalidate_storage)
 {
 	struct pipe_screen *screen = sctx->b.screen;
-	struct r600_texture *new_tex;
-	struct pipe_resource templ = rtex->resource.b.b;
+	struct si_texture *new_tex;
+	struct pipe_resource templ = tex->buffer.b.b;
 	unsigned i;
 
 	templ.bind |= new_bind_flag;
 
-	if (rtex->resource.b.is_shared)
+	if (tex->buffer.b.is_shared)
 		return;
 
 	if (new_bind_flag == PIPE_BIND_LINEAR) {
-		if (rtex->surface.is_linear)
+		if (tex->surface.is_linear)
 			return;
 
 		/* This fails with MSAA, depth, and compressed textures. */
@@ -515,7 +531,7 @@
 			return;
 	}
 
-	new_tex = (struct r600_texture*)screen->resource_create(screen, &templ);
+	new_tex = (struct si_texture*)screen->resource_create(screen, &templ);
 	if (!new_tex)
 		return;
 
@@ -528,52 +544,82 @@
 				 u_minify(templ.width0, i), u_minify(templ.height0, i),
 				 util_num_layers(&templ, i), &box);
 
-			sctx->dma_copy(&sctx->b, &new_tex->resource.b.b, i, 0, 0, 0,
-				       &rtex->resource.b.b, i, &box);
+			sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0,
+				       &tex->buffer.b.b, i, &box);
 		}
 	}
 
 	if (new_bind_flag == PIPE_BIND_LINEAR) {
-		si_texture_discard_cmask(sctx->screen, rtex);
-		si_texture_discard_dcc(sctx->screen, rtex);
+		si_texture_discard_cmask(sctx->screen, tex);
+		si_texture_discard_dcc(sctx->screen, tex);
 	}
 
-	/* Replace the structure fields of rtex. */
-	rtex->resource.b.b.bind = templ.bind;
-	pb_reference(&rtex->resource.buf, new_tex->resource.buf);
-	rtex->resource.gpu_address = new_tex->resource.gpu_address;
-	rtex->resource.vram_usage = new_tex->resource.vram_usage;
-	rtex->resource.gart_usage = new_tex->resource.gart_usage;
-	rtex->resource.bo_size = new_tex->resource.bo_size;
-	rtex->resource.bo_alignment = new_tex->resource.bo_alignment;
-	rtex->resource.domains = new_tex->resource.domains;
-	rtex->resource.flags = new_tex->resource.flags;
-	rtex->size = new_tex->size;
-	rtex->db_render_format = new_tex->db_render_format;
-	rtex->db_compatible = new_tex->db_compatible;
-	rtex->can_sample_z = new_tex->can_sample_z;
-	rtex->can_sample_s = new_tex->can_sample_s;
-	rtex->surface = new_tex->surface;
-	rtex->fmask = new_tex->fmask;
-	rtex->cmask = new_tex->cmask;
-	rtex->cb_color_info = new_tex->cb_color_info;
-	rtex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
-	rtex->htile_offset = new_tex->htile_offset;
-	rtex->tc_compatible_htile = new_tex->tc_compatible_htile;
-	rtex->depth_cleared = new_tex->depth_cleared;
-	rtex->stencil_cleared = new_tex->stencil_cleared;
-	rtex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
-	rtex->framebuffers_bound = new_tex->framebuffers_bound;
+	/* Replace the structure fields of tex. */
+	tex->buffer.b.b.bind = templ.bind;
+	pb_reference(&tex->buffer.buf, new_tex->buffer.buf);
+	tex->buffer.gpu_address = new_tex->buffer.gpu_address;
+	tex->buffer.vram_usage = new_tex->buffer.vram_usage;
+	tex->buffer.gart_usage = new_tex->buffer.gart_usage;
+	tex->buffer.bo_size = new_tex->buffer.bo_size;
+	tex->buffer.bo_alignment = new_tex->buffer.bo_alignment;
+	tex->buffer.domains = new_tex->buffer.domains;
+	tex->buffer.flags = new_tex->buffer.flags;
+
+	tex->surface = new_tex->surface;
+	tex->size = new_tex->size;
+	si_texture_reference(&tex->flushed_depth_texture,
+			     new_tex->flushed_depth_texture);
+
+	tex->fmask_offset = new_tex->fmask_offset;
+	tex->cmask_offset = new_tex->cmask_offset;
+	tex->cmask_base_address_reg = new_tex->cmask_base_address_reg;
+
+	if (tex->cmask_buffer == &tex->buffer)
+		tex->cmask_buffer = NULL;
+	else
+		r600_resource_reference(&tex->cmask_buffer, NULL);
+
+	if (new_tex->cmask_buffer == &new_tex->buffer)
+		tex->cmask_buffer = &tex->buffer;
+	else
+		r600_resource_reference(&tex->cmask_buffer, new_tex->cmask_buffer);
+
+	tex->dcc_offset = new_tex->dcc_offset;
+	tex->cb_color_info = new_tex->cb_color_info;
+	memcpy(tex->color_clear_value, new_tex->color_clear_value,
+	       sizeof(tex->color_clear_value));
+	tex->last_msaa_resolve_target_micro_mode = new_tex->last_msaa_resolve_target_micro_mode;
+
+	tex->htile_offset = new_tex->htile_offset;
+	tex->depth_clear_value = new_tex->depth_clear_value;
+	tex->dirty_level_mask = new_tex->dirty_level_mask;
+	tex->stencil_dirty_level_mask = new_tex->stencil_dirty_level_mask;
+	tex->db_render_format = new_tex->db_render_format;
+	tex->stencil_clear_value = new_tex->stencil_clear_value;
+	tex->tc_compatible_htile = new_tex->tc_compatible_htile;
+	tex->depth_cleared = new_tex->depth_cleared;
+	tex->stencil_cleared = new_tex->stencil_cleared;
+	tex->upgraded_depth = new_tex->upgraded_depth;
+	tex->db_compatible = new_tex->db_compatible;
+	tex->can_sample_z = new_tex->can_sample_z;
+	tex->can_sample_s = new_tex->can_sample_s;
+
+	tex->separate_dcc_dirty = new_tex->separate_dcc_dirty;
+	tex->dcc_gather_statistics = new_tex->dcc_gather_statistics;
+	r600_resource_reference(&tex->dcc_separate_buffer,
+				new_tex->dcc_separate_buffer);
+	r600_resource_reference(&tex->last_dcc_separate_buffer,
+				new_tex->last_dcc_separate_buffer);
 
 	if (new_bind_flag == PIPE_BIND_LINEAR) {
-		assert(!rtex->htile_offset);
-		assert(!rtex->cmask.size);
-		assert(!rtex->fmask.size);
-		assert(!rtex->dcc_offset);
-		assert(!rtex->is_depth);
+		assert(!tex->htile_offset);
+		assert(!tex->cmask_buffer);
+		assert(!tex->surface.fmask_size);
+		assert(!tex->dcc_offset);
+		assert(!tex->is_depth);
 	}
 
-	r600_texture_reference(&new_tex, NULL);
+	si_texture_reference(&new_tex, NULL);
 
 	p_atomic_inc(&sctx->screen->dirty_tex_counter);
 }
@@ -584,10 +630,10 @@
 }
 
 static void si_query_opaque_metadata(struct si_screen *sscreen,
-				     struct r600_texture *rtex,
+				     struct si_texture *tex,
 			             struct radeon_bo_metadata *md)
 {
-	struct pipe_resource *res = &rtex->resource.b.b;
+	struct pipe_resource *res = &tex->buffer.b.b;
 	static const unsigned char swizzle[] = {
 		PIPE_SWIZZLE_X,
 		PIPE_SWIZZLE_Y,
@@ -597,12 +643,11 @@
 	uint32_t desc[8], i;
 	bool is_array = util_texture_is_array(res->target);
 
-	/* DRM 2.x.x doesn't support this. */
-	if (sscreen->info.drm_major != 3)
+	if (!sscreen->info.has_bo_metadata)
 		return;
 
-	assert(rtex->dcc_separate_buffer == NULL);
-	assert(rtex->fmask.size == 0);
+	assert(tex->dcc_separate_buffer == NULL);
+	assert(tex->surface.fmask_size == 0);
 
 	/* Metadata image format format version 1:
 	 * [0] = 1 (metadata format identifier)
@@ -619,20 +664,20 @@
 	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
 	md->metadata[1] = si_get_bo_metadata_word1(sscreen);
 
-	si_make_texture_descriptor(sscreen, rtex, true,
+	si_make_texture_descriptor(sscreen, tex, true,
 				   res->target, res->format,
 				   swizzle, 0, res->last_level, 0,
 				   is_array ? res->array_size - 1 : 0,
 				   res->width0, res->height0, res->depth0,
 				   desc, NULL);
 
-	si_set_mutable_tex_desc_fields(sscreen, rtex, &rtex->surface.u.legacy.level[0],
-				       0, 0, rtex->surface.blk_w, false, desc);
+	si_set_mutable_tex_desc_fields(sscreen, tex, &tex->surface.u.legacy.level[0],
+				       0, 0, tex->surface.blk_w, false, desc);
 
 	/* Clear the base address and set the relative DCC offset. */
 	desc[0] = 0;
 	desc[1] &= C_008F14_BASE_ADDRESS_HI;
-	desc[7] = rtex->dcc_offset >> 8;
+	desc[7] = tex->dcc_offset >> 8;
 
 	/* Dwords [2:9] contain the image descriptor. */
 	memcpy(&md->metadata[2], desc, sizeof(desc));
@@ -641,14 +686,14 @@
 	/* Dwords [10:..] contain the mipmap level offsets. */
 	if (sscreen->info.chip_class <= VI) {
 		for (i = 0; i <= res->last_level; i++)
-			md->metadata[10+i] = rtex->surface.u.legacy.level[i].offset >> 8;
+			md->metadata[10+i] = tex->surface.u.legacy.level[i].offset >> 8;
 
 		md->size_metadata += (1 + res->last_level) * 4;
 	}
 }
 
 static void si_apply_opaque_metadata(struct si_screen *sscreen,
-				     struct r600_texture *rtex,
+				     struct si_texture *tex,
 			             struct radeon_bo_metadata *md)
 {
 	uint32_t *desc = &md->metadata[2];
@@ -663,14 +708,14 @@
 	    md->metadata[0] != 0 &&
 	    md->metadata[1] == si_get_bo_metadata_word1(sscreen) &&
 	    G_008F28_COMPRESSION_EN(desc[6])) {
-		rtex->dcc_offset = (uint64_t)desc[7] << 8;
+		tex->dcc_offset = (uint64_t)desc[7] << 8;
 		return;
 	}
 
 	/* Disable DCC. These are always set by texture_from_handle and must
 	 * be cleared here.
 	 */
-	rtex->dcc_offset = 0;
+	tex->dcc_offset = 0;
 }
 
 static boolean si_texture_get_handle(struct pipe_screen* screen,
@@ -681,8 +726,8 @@
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
 	struct si_context *sctx;
-	struct r600_resource *res = (struct r600_resource*)resource;
-	struct r600_texture *rtex = (struct r600_texture*)resource;
+	struct r600_resource *res = r600_resource(resource);
+	struct si_texture *tex = (struct si_texture*)resource;
 	struct radeon_bo_metadata metadata;
 	bool update_metadata = false;
 	unsigned stride, offset, slice_size;
@@ -695,31 +740,31 @@
 		/* This is not supported now, but it might be required for OpenCL
 		 * interop in the future.
 		 */
-		if (resource->nr_samples > 1 || rtex->is_depth)
+		if (resource->nr_samples > 1 || tex->is_depth)
 			return false;
 
 		/* Move a suballocated texture into a non-suballocated allocation. */
 		if (sscreen->ws->buffer_is_suballocated(res->buf) ||
-		    rtex->surface.tile_swizzle ||
-		    (rtex->resource.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+		    tex->surface.tile_swizzle ||
+		    (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
 		     sscreen->info.has_local_buffers &&
-		     whandle->type != DRM_API_HANDLE_TYPE_KMS)) {
+		     whandle->type != WINSYS_HANDLE_TYPE_KMS)) {
 			assert(!res->b.is_shared);
-			si_reallocate_texture_inplace(sctx, rtex,
+			si_reallocate_texture_inplace(sctx, tex,
 							PIPE_BIND_SHARED, false);
 			flush = true;
 			assert(res->b.b.bind & PIPE_BIND_SHARED);
 			assert(res->flags & RADEON_FLAG_NO_SUBALLOC);
 			assert(!(res->flags & RADEON_FLAG_NO_INTERPROCESS_SHARING));
-			assert(rtex->surface.tile_swizzle == 0);
+			assert(tex->surface.tile_swizzle == 0);
 		}
 
 		/* Since shader image stores don't support DCC on VI,
 		 * disable it for external clients that want write
 		 * access.
 		 */
-		if (usage & PIPE_HANDLE_USAGE_WRITE && rtex->dcc_offset) {
-			if (si_texture_disable_dcc(sctx, rtex)) {
+		if (usage & PIPE_HANDLE_USAGE_WRITE && tex->dcc_offset) {
+			if (si_texture_disable_dcc(sctx, tex)) {
 				update_metadata = true;
 				/* si_texture_disable_dcc flushes the context */
 				flush = false;
@@ -727,44 +772,44 @@
 		}
 
 		if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) &&
-		    (rtex->cmask.size || rtex->dcc_offset)) {
+		    (tex->cmask_buffer || tex->dcc_offset)) {
 			/* Eliminate fast clear (both CMASK and DCC) */
-			si_eliminate_fast_color_clear(sctx, rtex);
+			si_eliminate_fast_color_clear(sctx, tex);
 			/* eliminate_fast_color_clear flushes the context */
 			flush = false;
 
 			/* Disable CMASK if flush_resource isn't going
 			 * to be called.
 			 */
-			if (rtex->cmask.size)
-				si_texture_discard_cmask(sscreen, rtex);
+			if (tex->cmask_buffer)
+				si_texture_discard_cmask(sscreen, tex);
 		}
 
 		/* Set metadata. */
 		if (!res->b.is_shared || update_metadata) {
-			si_texture_init_metadata(sscreen, rtex, &metadata);
-			si_query_opaque_metadata(sscreen, rtex, &metadata);
+			si_texture_init_metadata(sscreen, tex, &metadata);
+			si_query_opaque_metadata(sscreen, tex, &metadata);
 
 			sscreen->ws->buffer_set_metadata(res->buf, &metadata);
 		}
 
 		if (sscreen->info.chip_class >= GFX9) {
-			offset = rtex->surface.u.gfx9.surf_offset;
-			stride = rtex->surface.u.gfx9.surf_pitch *
-				 rtex->surface.bpe;
-			slice_size = rtex->surface.u.gfx9.surf_slice_size;
+			offset = tex->surface.u.gfx9.surf_offset;
+			stride = tex->surface.u.gfx9.surf_pitch *
+				 tex->surface.bpe;
+			slice_size = tex->surface.u.gfx9.surf_slice_size;
 		} else {
-			offset = rtex->surface.u.legacy.level[0].offset;
-			stride = rtex->surface.u.legacy.level[0].nblk_x *
-				 rtex->surface.bpe;
-			slice_size = (uint64_t)rtex->surface.u.legacy.level[0].slice_size_dw * 4;
+			offset = tex->surface.u.legacy.level[0].offset;
+			stride = tex->surface.u.legacy.level[0].nblk_x *
+				 tex->surface.bpe;
+			slice_size = (uint64_t)tex->surface.u.legacy.level[0].slice_size_dw * 4;
 		}
 	} else {
 		/* Buffer exports are for the OpenCL interop. */
 		/* Move a suballocated buffer into a non-suballocated allocation. */
 		if (sscreen->ws->buffer_is_suballocated(res->buf) ||
 		    /* A DMABUF export always fails if the BO is local. */
-		    (rtex->resource.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
+		    (tex->buffer.flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
 		     sscreen->info.has_local_buffers)) {
 			assert(!res->b.is_shared);
 
@@ -819,155 +864,24 @@
 static void si_texture_destroy(struct pipe_screen *screen,
 			       struct pipe_resource *ptex)
 {
-	struct r600_texture *rtex = (struct r600_texture*)ptex;
-	struct r600_resource *resource = &rtex->resource;
+	struct si_texture *tex = (struct si_texture*)ptex;
+	struct r600_resource *resource = &tex->buffer;
 
-	r600_texture_reference(&rtex->flushed_depth_texture, NULL);
+	si_texture_reference(&tex->flushed_depth_texture, NULL);
 
-	if (rtex->cmask_buffer != &rtex->resource) {
-	    r600_resource_reference(&rtex->cmask_buffer, NULL);
+	if (tex->cmask_buffer != &tex->buffer) {
+	    r600_resource_reference(&tex->cmask_buffer, NULL);
 	}
 	pb_reference(&resource->buf, NULL);
-	r600_resource_reference(&rtex->dcc_separate_buffer, NULL);
-	r600_resource_reference(&rtex->last_dcc_separate_buffer, NULL);
-	FREE(rtex);
+	r600_resource_reference(&tex->dcc_separate_buffer, NULL);
+	r600_resource_reference(&tex->last_dcc_separate_buffer, NULL);
+	FREE(tex);
 }
 
 static const struct u_resource_vtbl si_texture_vtbl;
 
-/* The number of samples can be specified independently of the texture. */
-void si_texture_get_fmask_info(struct si_screen *sscreen,
-			       struct r600_texture *rtex,
-			       unsigned nr_samples,
-			       struct r600_fmask_info *out)
-{
-	/* FMASK is allocated like an ordinary texture. */
-	struct pipe_resource templ = rtex->resource.b.b;
-	struct radeon_surf fmask = {};
-	unsigned flags, bpe;
-
-	memset(out, 0, sizeof(*out));
-
-	if (sscreen->info.chip_class >= GFX9) {
-		out->alignment = rtex->surface.u.gfx9.fmask_alignment;
-		out->size = rtex->surface.u.gfx9.fmask_size;
-		out->tile_swizzle = rtex->surface.u.gfx9.fmask_tile_swizzle;
-		return;
-	}
-
-	templ.nr_samples = 1;
-	flags = rtex->surface.flags | RADEON_SURF_FMASK;
-
-	switch (nr_samples) {
-	case 2:
-	case 4:
-		bpe = 1;
-		break;
-	case 8:
-		bpe = 4;
-		break;
-	default:
-		PRINT_ERR("Invalid sample count for FMASK allocation.\n");
-		return;
-	}
-
-	if (sscreen->ws->surface_init(sscreen->ws, &templ, flags, bpe,
-				      RADEON_SURF_MODE_2D, &fmask)) {
-		PRINT_ERR("Got error in surface_init while allocating FMASK.\n");
-		return;
-	}
-
-	assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
-
-	out->slice_tile_max = (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
-	if (out->slice_tile_max)
-		out->slice_tile_max -= 1;
-
-	out->tile_mode_index = fmask.u.legacy.tiling_index[0];
-	out->pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
-	out->bank_height = fmask.u.legacy.bankh;
-	out->tile_swizzle = fmask.tile_swizzle;
-	out->alignment = MAX2(256, fmask.surf_alignment);
-	out->size = fmask.surf_size;
-}
-
-static void si_texture_allocate_fmask(struct si_screen *sscreen,
-				      struct r600_texture *rtex)
-{
-	si_texture_get_fmask_info(sscreen, rtex,
-				    rtex->resource.b.b.nr_samples, &rtex->fmask);
-
-	rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment);
-	rtex->size = rtex->fmask.offset + rtex->fmask.size;
-}
-
-void si_texture_get_cmask_info(struct si_screen *sscreen,
-			       struct r600_texture *rtex,
-			       struct r600_cmask_info *out)
-{
-	unsigned pipe_interleave_bytes = sscreen->info.pipe_interleave_bytes;
-	unsigned num_pipes = sscreen->info.num_tile_pipes;
-	unsigned cl_width, cl_height;
-
-	if (sscreen->info.chip_class >= GFX9) {
-		out->alignment = rtex->surface.u.gfx9.cmask_alignment;
-		out->size = rtex->surface.u.gfx9.cmask_size;
-		return;
-	}
-
-	switch (num_pipes) {
-	case 2:
-		cl_width = 32;
-		cl_height = 16;
-		break;
-	case 4:
-		cl_width = 32;
-		cl_height = 32;
-		break;
-	case 8:
-		cl_width = 64;
-		cl_height = 32;
-		break;
-	case 16: /* Hawaii */
-		cl_width = 64;
-		cl_height = 64;
-		break;
-	default:
-		assert(0);
-		return;
-	}
-
-	unsigned base_align = num_pipes * pipe_interleave_bytes;
-
-	unsigned width = align(rtex->resource.b.b.width0, cl_width*8);
-	unsigned height = align(rtex->resource.b.b.height0, cl_height*8);
-	unsigned slice_elements = (width * height) / (8*8);
-
-	/* Each element of CMASK is a nibble. */
-	unsigned slice_bytes = slice_elements / 2;
-
-	out->slice_tile_max = (width * height) / (128*128);
-	if (out->slice_tile_max)
-		out->slice_tile_max -= 1;
-
-	out->alignment = MAX2(256, base_align);
-	out->size = util_num_layers(&rtex->resource.b.b, 0) *
-		    align(slice_bytes, base_align);
-}
-
-static void si_texture_allocate_cmask(struct si_screen *sscreen,
-				      struct r600_texture *rtex)
-{
-	si_texture_get_cmask_info(sscreen, rtex, &rtex->cmask);
-
-	rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment);
-	rtex->size = rtex->cmask.offset + rtex->cmask.size;
-
-	rtex->cb_color_info |= S_028C70_FAST_CLEAR(1);
-}
-
 static void si_texture_get_htile_size(struct si_screen *sscreen,
-				      struct r600_texture *rtex)
+				      struct si_texture *tex)
 {
 	unsigned cl_width, cl_height, width, height;
 	unsigned slice_elements, slice_bytes, pipe_interleave_bytes, base_align;
@@ -975,12 +889,10 @@
 
 	assert(sscreen->info.chip_class <= VI);
 
-	rtex->surface.htile_size = 0;
+	tex->surface.htile_size = 0;
 
-	/* HTILE is broken with 1D tiling on old kernels and CIK. */
-	if (sscreen->info.chip_class >= CIK &&
-	    rtex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
-	    sscreen->info.drm_major == 2 && sscreen->info.drm_minor < 38)
+	if (tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_1D &&
+	    !sscreen->info.htile_cmask_support_1d_tiling)
 		return;
 
 	/* Overalign HTILE on P2 configs to work around GPU hangs in
@@ -1019,8 +931,8 @@
 		return;
 	}
 
-	width = align(rtex->surface.u.legacy.level[0].nblk_x, cl_width * 8);
-	height = align(rtex->surface.u.legacy.level[0].nblk_y, cl_height * 8);
+	width = align(tex->surface.u.legacy.level[0].nblk_x, cl_width * 8);
+	height = align(tex->surface.u.legacy.level[0].nblk_y, cl_height * 8);
 
 	slice_elements = (width * height) / (8 * 8);
 	slice_bytes = slice_elements * 4;
@@ -1028,27 +940,27 @@
 	pipe_interleave_bytes = sscreen->info.pipe_interleave_bytes;
 	base_align = num_pipes * pipe_interleave_bytes;
 
-	rtex->surface.htile_alignment = base_align;
-	rtex->surface.htile_size =
-		util_num_layers(&rtex->resource.b.b, 0) *
+	tex->surface.htile_alignment = base_align;
+	tex->surface.htile_size =
+		util_num_layers(&tex->buffer.b.b, 0) *
 		align(slice_bytes, base_align);
 }
 
 static void si_texture_allocate_htile(struct si_screen *sscreen,
-				      struct r600_texture *rtex)
+				      struct si_texture *tex)
 {
-	if (sscreen->info.chip_class <= VI && !rtex->tc_compatible_htile)
-		si_texture_get_htile_size(sscreen, rtex);
+	if (sscreen->info.chip_class <= VI && !tex->tc_compatible_htile)
+		si_texture_get_htile_size(sscreen, tex);
 
-	if (!rtex->surface.htile_size)
+	if (!tex->surface.htile_size)
 		return;
 
-	rtex->htile_offset = align(rtex->size, rtex->surface.htile_alignment);
-	rtex->size = rtex->htile_offset + rtex->surface.htile_size;
+	tex->htile_offset = align(tex->size, tex->surface.htile_alignment);
+	tex->size = tex->htile_offset + tex->surface.htile_size;
 }
 
 void si_print_texture_info(struct si_screen *sscreen,
-			   struct r600_texture *rtex, struct u_log_context *log)
+			   struct si_texture *tex, struct u_log_context *log)
 {
 	int i;
 
@@ -1056,161 +968,163 @@
 	u_log_printf(log, "  Info: npix_x=%u, npix_y=%u, npix_z=%u, blk_w=%u, "
 		"blk_h=%u, array_size=%u, last_level=%u, "
 		"bpe=%u, nsamples=%u, flags=0x%x, %s\n",
-		rtex->resource.b.b.width0, rtex->resource.b.b.height0,
-		rtex->resource.b.b.depth0, rtex->surface.blk_w,
-		rtex->surface.blk_h,
-		rtex->resource.b.b.array_size, rtex->resource.b.b.last_level,
-		rtex->surface.bpe, rtex->resource.b.b.nr_samples,
-		rtex->surface.flags, util_format_short_name(rtex->resource.b.b.format));
+		tex->buffer.b.b.width0, tex->buffer.b.b.height0,
+		tex->buffer.b.b.depth0, tex->surface.blk_w,
+		tex->surface.blk_h,
+		tex->buffer.b.b.array_size, tex->buffer.b.b.last_level,
+		tex->surface.bpe, tex->buffer.b.b.nr_samples,
+		tex->surface.flags, util_format_short_name(tex->buffer.b.b.format));
 
 	if (sscreen->info.chip_class >= GFX9) {
 		u_log_printf(log, "  Surf: size=%"PRIu64", slice_size=%"PRIu64", "
 			"alignment=%u, swmode=%u, epitch=%u, pitch=%u\n",
-			rtex->surface.surf_size,
-			rtex->surface.u.gfx9.surf_slice_size,
-			rtex->surface.surf_alignment,
-			rtex->surface.u.gfx9.surf.swizzle_mode,
-			rtex->surface.u.gfx9.surf.epitch,
-			rtex->surface.u.gfx9.surf_pitch);
+			tex->surface.surf_size,
+			tex->surface.u.gfx9.surf_slice_size,
+			tex->surface.surf_alignment,
+			tex->surface.u.gfx9.surf.swizzle_mode,
+			tex->surface.u.gfx9.surf.epitch,
+			tex->surface.u.gfx9.surf_pitch);
 
-		if (rtex->fmask.size) {
+		if (tex->surface.fmask_size) {
 			u_log_printf(log, "  FMASK: offset=%"PRIu64", size=%"PRIu64", "
 				"alignment=%u, swmode=%u, epitch=%u\n",
-				rtex->fmask.offset,
-				rtex->surface.u.gfx9.fmask_size,
-				rtex->surface.u.gfx9.fmask_alignment,
-				rtex->surface.u.gfx9.fmask.swizzle_mode,
-				rtex->surface.u.gfx9.fmask.epitch);
+				tex->fmask_offset,
+				tex->surface.fmask_size,
+				tex->surface.fmask_alignment,
+				tex->surface.u.gfx9.fmask.swizzle_mode,
+				tex->surface.u.gfx9.fmask.epitch);
 		}
 
-		if (rtex->cmask.size) {
-			u_log_printf(log, "  CMask: offset=%"PRIu64", size=%"PRIu64", "
+		if (tex->cmask_buffer) {
+			u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, "
 				"alignment=%u, rb_aligned=%u, pipe_aligned=%u\n",
-				rtex->cmask.offset,
-				rtex->surface.u.gfx9.cmask_size,
-				rtex->surface.u.gfx9.cmask_alignment,
-				rtex->surface.u.gfx9.cmask.rb_aligned,
-				rtex->surface.u.gfx9.cmask.pipe_aligned);
+				tex->cmask_offset,
+				tex->surface.cmask_size,
+				tex->surface.cmask_alignment,
+				tex->surface.u.gfx9.cmask.rb_aligned,
+				tex->surface.u.gfx9.cmask.pipe_aligned);
 		}
 
-		if (rtex->htile_offset) {
+		if (tex->htile_offset) {
 			u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, alignment=%u, "
 				"rb_aligned=%u, pipe_aligned=%u\n",
-				rtex->htile_offset,
-				rtex->surface.htile_size,
-				rtex->surface.htile_alignment,
-				rtex->surface.u.gfx9.htile.rb_aligned,
-				rtex->surface.u.gfx9.htile.pipe_aligned);
+				tex->htile_offset,
+				tex->surface.htile_size,
+				tex->surface.htile_alignment,
+				tex->surface.u.gfx9.htile.rb_aligned,
+				tex->surface.u.gfx9.htile.pipe_aligned);
 		}
 
-		if (rtex->dcc_offset) {
+		if (tex->dcc_offset) {
 			u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, "
 				"alignment=%u, pitch_max=%u, num_dcc_levels=%u\n",
-				rtex->dcc_offset, rtex->surface.dcc_size,
-				rtex->surface.dcc_alignment,
-				rtex->surface.u.gfx9.dcc_pitch_max,
-				rtex->surface.num_dcc_levels);
+				tex->dcc_offset, tex->surface.dcc_size,
+				tex->surface.dcc_alignment,
+				tex->surface.u.gfx9.dcc_pitch_max,
+				tex->surface.num_dcc_levels);
 		}
 
-		if (rtex->surface.u.gfx9.stencil_offset) {
+		if (tex->surface.u.gfx9.stencil_offset) {
 			u_log_printf(log, "  Stencil: offset=%"PRIu64", swmode=%u, epitch=%u\n",
-				rtex->surface.u.gfx9.stencil_offset,
-				rtex->surface.u.gfx9.stencil.swizzle_mode,
-				rtex->surface.u.gfx9.stencil.epitch);
+				tex->surface.u.gfx9.stencil_offset,
+				tex->surface.u.gfx9.stencil.swizzle_mode,
+				tex->surface.u.gfx9.stencil.epitch);
 		}
 		return;
 	}
 
 	u_log_printf(log, "  Layout: size=%"PRIu64", alignment=%u, bankw=%u, "
 		"bankh=%u, nbanks=%u, mtilea=%u, tilesplit=%u, pipeconfig=%u, scanout=%u\n",
-		rtex->surface.surf_size, rtex->surface.surf_alignment, rtex->surface.u.legacy.bankw,
-		rtex->surface.u.legacy.bankh, rtex->surface.u.legacy.num_banks, rtex->surface.u.legacy.mtilea,
-		rtex->surface.u.legacy.tile_split, rtex->surface.u.legacy.pipe_config,
-		(rtex->surface.flags & RADEON_SURF_SCANOUT) != 0);
+		tex->surface.surf_size, tex->surface.surf_alignment, tex->surface.u.legacy.bankw,
+		tex->surface.u.legacy.bankh, tex->surface.u.legacy.num_banks, tex->surface.u.legacy.mtilea,
+		tex->surface.u.legacy.tile_split, tex->surface.u.legacy.pipe_config,
+		(tex->surface.flags & RADEON_SURF_SCANOUT) != 0);
 
-	if (rtex->fmask.size)
+	if (tex->surface.fmask_size)
 		u_log_printf(log, "  FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
 			"bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
-			rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment,
-			rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height,
-			rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index);
+			tex->fmask_offset, tex->surface.fmask_size, tex->surface.fmask_alignment,
+			tex->surface.u.legacy.fmask.pitch_in_pixels,
+			tex->surface.u.legacy.fmask.bankh,
+			tex->surface.u.legacy.fmask.slice_tile_max,
+			tex->surface.u.legacy.fmask.tiling_index);
 
-	if (rtex->cmask.size)
-		u_log_printf(log, "  CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, "
+	if (tex->cmask_buffer)
+		u_log_printf(log, "  CMask: offset=%"PRIu64", size=%u, alignment=%u, "
 			"slice_tile_max=%u\n",
-			rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
-			rtex->cmask.slice_tile_max);
+			tex->cmask_offset, tex->surface.cmask_size, tex->surface.cmask_alignment,
+			tex->surface.u.legacy.cmask_slice_tile_max);
 
-	if (rtex->htile_offset)
+	if (tex->htile_offset)
 		u_log_printf(log, "  HTile: offset=%"PRIu64", size=%u, "
 			"alignment=%u, TC_compatible = %u\n",
-			rtex->htile_offset, rtex->surface.htile_size,
-			rtex->surface.htile_alignment,
-			rtex->tc_compatible_htile);
+			tex->htile_offset, tex->surface.htile_size,
+			tex->surface.htile_alignment,
+			tex->tc_compatible_htile);
 
-	if (rtex->dcc_offset) {
+	if (tex->dcc_offset) {
 		u_log_printf(log, "  DCC: offset=%"PRIu64", size=%u, alignment=%u\n",
-			rtex->dcc_offset, rtex->surface.dcc_size,
-			rtex->surface.dcc_alignment);
-		for (i = 0; i <= rtex->resource.b.b.last_level; i++)
+			tex->dcc_offset, tex->surface.dcc_size,
+			tex->surface.dcc_alignment);
+		for (i = 0; i <= tex->buffer.b.b.last_level; i++)
 			u_log_printf(log, "  DCCLevel[%i]: enabled=%u, offset=%u, "
 				"fast_clear_size=%u\n",
-				i, i < rtex->surface.num_dcc_levels,
-				rtex->surface.u.legacy.level[i].dcc_offset,
-				rtex->surface.u.legacy.level[i].dcc_fast_clear_size);
+				i, i < tex->surface.num_dcc_levels,
+				tex->surface.u.legacy.level[i].dcc_offset,
+				tex->surface.u.legacy.level[i].dcc_fast_clear_size);
 	}
 
-	for (i = 0; i <= rtex->resource.b.b.last_level; i++)
+	for (i = 0; i <= tex->buffer.b.b.last_level; i++)
 		u_log_printf(log, "  Level[%i]: offset=%"PRIu64", slice_size=%"PRIu64", "
 			"npix_x=%u, npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
 			"mode=%u, tiling_index = %u\n",
-			i, rtex->surface.u.legacy.level[i].offset,
-			(uint64_t)rtex->surface.u.legacy.level[i].slice_size_dw * 4,
-			u_minify(rtex->resource.b.b.width0, i),
-			u_minify(rtex->resource.b.b.height0, i),
-			u_minify(rtex->resource.b.b.depth0, i),
-			rtex->surface.u.legacy.level[i].nblk_x,
-			rtex->surface.u.legacy.level[i].nblk_y,
-			rtex->surface.u.legacy.level[i].mode,
-			rtex->surface.u.legacy.tiling_index[i]);
+			i, tex->surface.u.legacy.level[i].offset,
+			(uint64_t)tex->surface.u.legacy.level[i].slice_size_dw * 4,
+			u_minify(tex->buffer.b.b.width0, i),
+			u_minify(tex->buffer.b.b.height0, i),
+			u_minify(tex->buffer.b.b.depth0, i),
+			tex->surface.u.legacy.level[i].nblk_x,
+			tex->surface.u.legacy.level[i].nblk_y,
+			tex->surface.u.legacy.level[i].mode,
+			tex->surface.u.legacy.tiling_index[i]);
 
-	if (rtex->surface.has_stencil) {
+	if (tex->surface.has_stencil) {
 		u_log_printf(log, "  StencilLayout: tilesplit=%u\n",
-			rtex->surface.u.legacy.stencil_tile_split);
-		for (i = 0; i <= rtex->resource.b.b.last_level; i++) {
+			tex->surface.u.legacy.stencil_tile_split);
+		for (i = 0; i <= tex->buffer.b.b.last_level; i++) {
 			u_log_printf(log, "  StencilLevel[%i]: offset=%"PRIu64", "
 				"slice_size=%"PRIu64", npix_x=%u, "
 				"npix_y=%u, npix_z=%u, nblk_x=%u, nblk_y=%u, "
 				"mode=%u, tiling_index = %u\n",
-				i, rtex->surface.u.legacy.stencil_level[i].offset,
-				(uint64_t)rtex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
-				u_minify(rtex->resource.b.b.width0, i),
-				u_minify(rtex->resource.b.b.height0, i),
-				u_minify(rtex->resource.b.b.depth0, i),
-				rtex->surface.u.legacy.stencil_level[i].nblk_x,
-				rtex->surface.u.legacy.stencil_level[i].nblk_y,
-				rtex->surface.u.legacy.stencil_level[i].mode,
-				rtex->surface.u.legacy.stencil_tiling_index[i]);
+				i, tex->surface.u.legacy.stencil_level[i].offset,
+				(uint64_t)tex->surface.u.legacy.stencil_level[i].slice_size_dw * 4,
+				u_minify(tex->buffer.b.b.width0, i),
+				u_minify(tex->buffer.b.b.height0, i),
+				u_minify(tex->buffer.b.b.depth0, i),
+				tex->surface.u.legacy.stencil_level[i].nblk_x,
+				tex->surface.u.legacy.stencil_level[i].nblk_y,
+				tex->surface.u.legacy.stencil_level[i].mode,
+				tex->surface.u.legacy.stencil_tiling_index[i]);
 		}
 	}
 }
 
-/* Common processing for r600_texture_create and r600_texture_from_handle */
-static struct r600_texture *
+/* Common processing for si_texture_create and si_texture_from_handle */
+static struct si_texture *
 si_texture_create_object(struct pipe_screen *screen,
 			 const struct pipe_resource *base,
 			 struct pb_buffer *buf,
 			 struct radeon_surf *surface)
 {
-	struct r600_texture *rtex;
+	struct si_texture *tex;
 	struct r600_resource *resource;
 	struct si_screen *sscreen = (struct si_screen*)screen;
 
-	rtex = CALLOC_STRUCT(r600_texture);
-	if (!rtex)
+	tex = CALLOC_STRUCT(si_texture);
+	if (!tex)
 		return NULL;
 
-	resource = &rtex->resource;
+	resource = &tex->buffer;
 	resource->b.b = *base;
 	resource->b.b.next = NULL;
 	resource->b.vtbl = &si_texture_vtbl;
@@ -1218,66 +1132,73 @@
 	resource->b.b.screen = screen;
 
 	/* don't include stencil-only formats which we don't support for rendering */
-	rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
+	tex->is_depth = util_format_has_depth(util_format_description(tex->buffer.b.b.format));
 
-	rtex->surface = *surface;
-	rtex->size = rtex->surface.surf_size;
+	tex->surface = *surface;
+	tex->size = tex->surface.surf_size;
 
-	rtex->tc_compatible_htile = rtex->surface.htile_size != 0 &&
-				    (rtex->surface.flags &
-				     RADEON_SURF_TC_COMPATIBLE_HTILE);
+	tex->tc_compatible_htile = tex->surface.htile_size != 0 &&
+				   (tex->surface.flags &
+				    RADEON_SURF_TC_COMPATIBLE_HTILE);
 
 	/* TC-compatible HTILE:
 	 * - VI only supports Z32_FLOAT.
 	 * - GFX9 only supports Z32_FLOAT and Z16_UNORM. */
-	if (rtex->tc_compatible_htile) {
+	if (tex->tc_compatible_htile) {
 		if (sscreen->info.chip_class >= GFX9 &&
 		    base->format == PIPE_FORMAT_Z16_UNORM)
-			rtex->db_render_format = base->format;
+			tex->db_render_format = base->format;
 		else {
-			rtex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
-			rtex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
+			tex->db_render_format = PIPE_FORMAT_Z32_FLOAT;
+			tex->upgraded_depth = base->format != PIPE_FORMAT_Z32_FLOAT &&
 					       base->format != PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
 		}
 	} else {
-		rtex->db_render_format = base->format;
+		tex->db_render_format = base->format;
 	}
 
 	/* Applies to GCN. */
-	rtex->last_msaa_resolve_target_micro_mode = rtex->surface.micro_tile_mode;
+	tex->last_msaa_resolve_target_micro_mode = tex->surface.micro_tile_mode;
 
 	/* Disable separate DCC at the beginning. DRI2 doesn't reuse buffers
 	 * between frames, so the only thing that can enable separate DCC
 	 * with DRI2 is multiple slow clears within a frame.
 	 */
-	rtex->ps_draw_ratio = 0;
+	tex->ps_draw_ratio = 0;
 
-	if (rtex->is_depth) {
+	if (tex->is_depth) {
 		if (sscreen->info.chip_class >= GFX9) {
-			rtex->can_sample_z = true;
-			rtex->can_sample_s = true;
+			tex->can_sample_z = true;
+			tex->can_sample_s = true;
 		} else {
-			rtex->can_sample_z = !rtex->surface.u.legacy.depth_adjusted;
-			rtex->can_sample_s = !rtex->surface.u.legacy.stencil_adjusted;
+			tex->can_sample_z = !tex->surface.u.legacy.depth_adjusted;
+			tex->can_sample_s = !tex->surface.u.legacy.stencil_adjusted;
 		}
 
 		if (!(base->flags & (SI_RESOURCE_FLAG_TRANSFER |
 				     SI_RESOURCE_FLAG_FLUSHED_DEPTH))) {
-			rtex->db_compatible = true;
+			tex->db_compatible = true;
 
 			if (!(sscreen->debug_flags & DBG(NO_HYPERZ)))
-				si_texture_allocate_htile(sscreen, rtex);
+				si_texture_allocate_htile(sscreen, tex);
 		}
 	} else {
 		if (base->nr_samples > 1 &&
 		    !buf &&
 		    !(sscreen->debug_flags & DBG(NO_FMASK))) {
-			si_texture_allocate_fmask(sscreen, rtex);
-			si_texture_allocate_cmask(sscreen, rtex);
-			rtex->cmask_buffer = &rtex->resource;
+			/* Allocate FMASK. */
+			tex->fmask_offset = align64(tex->size,
+						     tex->surface.fmask_alignment);
+			tex->size = tex->fmask_offset + tex->surface.fmask_size;
 
-			if (!rtex->fmask.size || !rtex->cmask.size) {
-				FREE(rtex);
+			/* Allocate CMASK. */
+			tex->cmask_offset = align64(tex->size, tex->surface.cmask_alignment);
+			tex->size = tex->cmask_offset + tex->surface.cmask_size;
+			tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
+			tex->cmask_buffer = &tex->buffer;
+
+			if (!tex->surface.fmask_size || !tex->surface.cmask_size) {
+				FREE(tex);
 				return NULL;
 			}
 		}
@@ -1286,22 +1207,22 @@
 		 * If it's not present, it will be disabled by
 		 * apply_opaque_metadata later.
 		 */
-		if (rtex->surface.dcc_size &&
+		if (tex->surface.dcc_size &&
 		    (buf || !(sscreen->debug_flags & DBG(NO_DCC))) &&
-		    !(rtex->surface.flags & RADEON_SURF_SCANOUT)) {
+		    !(tex->surface.flags & RADEON_SURF_SCANOUT)) {
 			/* Reserve space for the DCC buffer. */
-			rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment);
-			rtex->size = rtex->dcc_offset + rtex->surface.dcc_size;
+			tex->dcc_offset = align64(tex->size, tex->surface.dcc_alignment);
+			tex->size = tex->dcc_offset + tex->surface.dcc_size;
 		}
 	}
 
 	/* Now create the backing buffer. */
 	if (!buf) {
-		si_init_resource_fields(sscreen, resource, rtex->size,
-					  rtex->surface.surf_alignment);
+		si_init_resource_fields(sscreen, resource, tex->size,
+					  tex->surface.surf_alignment);
 
 		if (!si_alloc_resource(sscreen, resource)) {
-			FREE(rtex);
+			FREE(tex);
 			return NULL;
 		}
 	} else {
@@ -1316,40 +1237,40 @@
 			resource->gart_usage = buf->size;
 	}
 
-	if (rtex->cmask.size) {
+	if (tex->cmask_buffer) {
 		/* Initialize the cmask to 0xCC (= compressed state). */
-		si_screen_clear_buffer(sscreen, &rtex->cmask_buffer->b.b,
-					 rtex->cmask.offset, rtex->cmask.size,
+		si_screen_clear_buffer(sscreen, &tex->cmask_buffer->b.b,
+					 tex->cmask_offset, tex->surface.cmask_size,
 					 0xCCCCCCCC);
 	}
-	if (rtex->htile_offset) {
+	if (tex->htile_offset) {
 		uint32_t clear_value = 0;
 
-		if (sscreen->info.chip_class >= GFX9 || rtex->tc_compatible_htile)
+		if (sscreen->info.chip_class >= GFX9 || tex->tc_compatible_htile)
 			clear_value = 0x0000030F;
 
-		si_screen_clear_buffer(sscreen, &rtex->resource.b.b,
-					 rtex->htile_offset,
-					 rtex->surface.htile_size,
+		si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+					 tex->htile_offset,
+					 tex->surface.htile_size,
 					 clear_value);
 	}
 
 	/* Initialize DCC only if the texture is not being imported. */
-	if (!buf && rtex->dcc_offset) {
-		si_screen_clear_buffer(sscreen, &rtex->resource.b.b,
-					 rtex->dcc_offset,
-					 rtex->surface.dcc_size,
+	if (!buf && tex->dcc_offset) {
+		si_screen_clear_buffer(sscreen, &tex->buffer.b.b,
+					 tex->dcc_offset,
+					 tex->surface.dcc_size,
 					 0xFFFFFFFF);
 	}
 
 	/* Initialize the CMASK base register value. */
-	rtex->cmask.base_address_reg =
-		(rtex->resource.gpu_address + rtex->cmask.offset) >> 8;
+	tex->cmask_base_address_reg =
+		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;
 
 	if (sscreen->debug_flags & DBG(VM)) {
 		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Texture %ix%ix%i, %i levels, %i samples, %s\n",
-			rtex->resource.gpu_address,
-			rtex->resource.gpu_address + rtex->resource.buf->size,
+			tex->buffer.gpu_address,
+			tex->buffer.gpu_address + tex->buffer.buf->size,
 			base->width0, base->height0, util_num_layers(base, 0), base->last_level+1,
 			base->nr_samples ? base->nr_samples : 1, util_format_short_name(base->format));
 	}
@@ -1358,13 +1279,13 @@
 		puts("Texture:");
 		struct u_log_context log;
 		u_log_context_init(&log);
-		si_print_texture_info(sscreen, rtex, &log);
+		si_print_texture_info(sscreen, tex, &log);
 		u_log_new_page_print(&log, stdout);
 		fflush(stdout);
 		u_log_context_destroy(&log);
 	}
 
-	return rtex;
+	return tex;
 }
 
 static enum radeon_surf_mode
@@ -1438,6 +1359,25 @@
 					const struct pipe_resource *templ)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
+	bool is_zs = util_format_is_depth_or_stencil(templ->format);
+
+	if (templ->nr_samples >= 2) {
+		/* This is hackish (overwriting the const pipe_resource template),
+		 * but should be harmless and state trackers can also see
+		 * the overridden number of samples in the created pipe_resource.
+		 */
+		if (is_zs && sscreen->eqaa_force_z_samples) {
+			((struct pipe_resource*)templ)->nr_samples =
+			((struct pipe_resource*)templ)->nr_storage_samples =
+				sscreen->eqaa_force_z_samples;
+		} else if (!is_zs && sscreen->eqaa_force_color_samples) {
+			((struct pipe_resource*)templ)->nr_samples =
+				sscreen->eqaa_force_coverage_samples;
+			((struct pipe_resource*)templ)->nr_storage_samples =
+				sscreen->eqaa_force_color_samples;
+		}
+	}
+
 	struct radeon_surf surface = {0};
 	bool is_flushed_depth = templ->flags & SI_RESOURCE_FLAG_FLUSHED_DEPTH;
 	bool tc_compatible_htile =
@@ -1453,8 +1393,7 @@
 		!(sscreen->debug_flags & DBG(NO_HYPERZ)) &&
 		!is_flushed_depth &&
 		templ->nr_samples <= 1 && /* TC-compat HTILE is less efficient with MSAA */
-		util_format_is_depth_or_stencil(templ->format);
-
+		is_zs;
 	int r;
 
 	r = si_init_surface(sscreen, &surface, templ,
@@ -1469,6 +1408,71 @@
 	       si_texture_create_object(screen, templ, NULL, &surface);
 }
 
+static struct pipe_resource *si_texture_from_winsys_buffer(struct si_screen *sscreen,
+							   const struct pipe_resource *templ,
+							   struct pb_buffer *buf,
+							   unsigned stride,
+							   unsigned offset,
+							   unsigned usage,
+							   bool dedicated)
+{
+	enum radeon_surf_mode array_mode;
+	struct radeon_surf surface = {};
+	struct radeon_bo_metadata metadata = {};
+	struct si_texture *tex;
+	bool is_scanout;
+	int r;
+
+	if (dedicated) {
+		sscreen->ws->buffer_get_metadata(buf, &metadata);
+		si_surface_import_metadata(sscreen, &surface, &metadata,
+					   &array_mode, &is_scanout);
+	} else {
+		/**
+		 * The bo metadata is unset for un-dedicated images. So we fall
+		 * back to linear. See answer to question 5 of the
+		 * VK_KHX_external_memory spec for some details.
+		 *
+		 * It is possible that this case isn't going to work if the
+		 * surface pitch isn't correctly aligned by default.
+		 *
+		 * In order to support it correctly we require multi-image
+		 * metadata to be synchronized between radv and radeonsi. The
+		 * semantics of associating multiple image metadata to a memory
+		 * object on the vulkan export side are not concretely defined
+		 * either.
+		 *
+		 * All the use cases we are aware of at the moment for memory
+		 * objects use dedicated allocations. So let's keep the initial
+		 * implementation simple.
+		 *
+		 * A possible alternative is to attempt to reconstruct the
+		 * tiling information when the TexParameter TEXTURE_TILING_EXT
+		 * is set.
+		 */
+		array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
+		is_scanout = false;
+	}
+
+	r = si_init_surface(sscreen, &surface, templ,
+			    array_mode, stride, offset, true, is_scanout,
+			    false, false);
+	if (r)
+		return NULL;
+
+	tex = si_texture_create_object(&sscreen->b, templ, buf, &surface);
+	if (!tex)
+		return NULL;
+
+	tex->buffer.b.is_shared = true;
+	tex->buffer.external_usage = usage;
+
+	si_apply_opaque_metadata(sscreen, tex, &metadata);
+
+	assert(tex->surface.tile_swizzle == 0);
+	return &tex->buffer.b.b;
+}
+
 static struct pipe_resource *si_texture_from_handle(struct pipe_screen *screen,
 						    const struct pipe_resource *templ,
 						    struct winsys_handle *whandle,
@@ -1477,12 +1481,6 @@
 	struct si_screen *sscreen = (struct si_screen*)screen;
 	struct pb_buffer *buf = NULL;
 	unsigned stride = 0, offset = 0;
-	enum radeon_surf_mode array_mode;
-	struct radeon_surf surface = {};
-	int r;
-	struct radeon_bo_metadata metadata = {};
-	struct r600_texture *rtex;
-	bool is_scanout;
 
 	/* Support only 2D textures without mipmaps */
 	if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) ||
@@ -1493,44 +1491,25 @@
 	if (!buf)
 		return NULL;
 
-	sscreen->ws->buffer_get_metadata(buf, &metadata);
-	si_surface_import_metadata(sscreen, &surface, &metadata,
-				     &array_mode, &is_scanout);
-
-	r = si_init_surface(sscreen, &surface, templ, array_mode, stride,
-			      offset, true, is_scanout, false, false);
-	if (r) {
-		return NULL;
-	}
-
-	rtex = si_texture_create_object(screen, templ, buf, &surface);
-	if (!rtex)
-		return NULL;
-
-	rtex->resource.b.is_shared = true;
-	rtex->resource.external_usage = usage;
-
-	si_apply_opaque_metadata(sscreen, rtex, &metadata);
-
-	assert(rtex->surface.tile_swizzle == 0);
-	return &rtex->resource.b.b;
+	return si_texture_from_winsys_buffer(sscreen, templ, buf, stride,
+					     offset, usage, true);
 }
 
 bool si_init_flushed_depth_texture(struct pipe_context *ctx,
 				   struct pipe_resource *texture,
-				   struct r600_texture **staging)
+				   struct si_texture **staging)
 {
-	struct r600_texture *rtex = (struct r600_texture*)texture;
+	struct si_texture *tex = (struct si_texture*)texture;
 	struct pipe_resource resource;
-	struct r600_texture **flushed_depth_texture = staging ?
-			staging : &rtex->flushed_depth_texture;
+	struct si_texture **flushed_depth_texture = staging ?
+			staging : &tex->flushed_depth_texture;
 	enum pipe_format pipe_format = texture->format;
 
 	if (!staging) {
-		if (rtex->flushed_depth_texture)
+		if (tex->flushed_depth_texture)
 			return true; /* it's ready */
 
-		if (!rtex->can_sample_z && rtex->can_sample_s) {
+		if (!tex->can_sample_z && tex->can_sample_s) {
 			switch (pipe_format) {
 			case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
 				/* Save memory by not allocating the S plane. */
@@ -1551,7 +1530,7 @@
 				break;
 			default:;
 			}
-		} else if (!rtex->can_sample_s && rtex->can_sample_z) {
+		} else if (!tex->can_sample_s && tex->can_sample_z) {
 			assert(util_format_has_stencil(util_format_description(pipe_format)));
 
 			/* DB->CB copies to an 8bpp surface don't work. */
@@ -1575,7 +1554,7 @@
 	if (staging)
 		resource.flags |= SI_RESOURCE_FLAG_TRANSFER;
 
-	*flushed_depth_texture = (struct r600_texture *)ctx->screen->resource_create(ctx->screen, &resource);
+	*flushed_depth_texture = (struct si_texture *)ctx->screen->resource_create(ctx->screen, &resource);
 	if (*flushed_depth_texture == NULL) {
 		PRINT_ERR("failed to create temporary texture to hold flushed depth\n");
 		return false;
@@ -1612,38 +1591,38 @@
 }
 
 static bool si_can_invalidate_texture(struct si_screen *sscreen,
-				      struct r600_texture *rtex,
+				      struct si_texture *tex,
 				      unsigned transfer_usage,
 				      const struct pipe_box *box)
 {
-	return !rtex->resource.b.is_shared &&
+	return !tex->buffer.b.is_shared &&
 		!(transfer_usage & PIPE_TRANSFER_READ) &&
-		rtex->resource.b.b.last_level == 0 &&
-		util_texrange_covers_whole_level(&rtex->resource.b.b, 0,
+		tex->buffer.b.b.last_level == 0 &&
+		util_texrange_covers_whole_level(&tex->buffer.b.b, 0,
 						 box->x, box->y, box->z,
 						 box->width, box->height,
 						 box->depth);
 }
 
 static void si_texture_invalidate_storage(struct si_context *sctx,
-					  struct r600_texture *rtex)
+					  struct si_texture *tex)
 {
 	struct si_screen *sscreen = sctx->screen;
 
 	/* There is no point in discarding depth and tiled buffers. */
-	assert(!rtex->is_depth);
-	assert(rtex->surface.is_linear);
+	assert(!tex->is_depth);
+	assert(tex->surface.is_linear);
 
 	/* Reallocate the buffer in the same pipe_resource. */
-	si_alloc_resource(sscreen, &rtex->resource);
+	si_alloc_resource(sscreen, &tex->buffer);
 
 	/* Initialize the CMASK base address (needed even without CMASK). */
-	rtex->cmask.base_address_reg =
-		(rtex->resource.gpu_address + rtex->cmask.offset) >> 8;
+	tex->cmask_base_address_reg =
+		(tex->buffer.gpu_address + tex->cmask_offset) >> 8;
 
 	p_atomic_inc(&sscreen->dirty_tex_counter);
 
-	sctx->num_alloc_tex_transfer_bytes += rtex->size;
+	sctx->num_alloc_tex_transfer_bytes += tex->size;
 }
 
 static void *si_texture_transfer_map(struct pipe_context *ctx,
@@ -1654,8 +1633,8 @@
 				     struct pipe_transfer **ptransfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_texture *rtex = (struct r600_texture*)texture;
-	struct r600_transfer *trans;
+	struct si_texture *tex = (struct si_texture*)texture;
+	struct si_transfer *trans;
 	struct r600_resource *buf;
 	unsigned offset = 0;
 	char *map;
@@ -1665,7 +1644,7 @@
 	assert(box->width && box->height && box->depth);
 
 	/* Depth textures use staging unconditionally. */
-	if (!rtex->is_depth) {
+	if (!tex->is_depth) {
 		/* Degrade the tile mode if we get too many transfers on APUs.
 		 * On dGPUs, the staging texture is always faster.
 		 * Only count uploads that are at least 4x4 pixels large.
@@ -1673,12 +1652,12 @@
 		if (!sctx->screen->info.has_dedicated_vram &&
 		    level == 0 &&
 		    box->width >= 4 && box->height >= 4 &&
-		    p_atomic_inc_return(&rtex->num_level0_transfers) == 10) {
+		    p_atomic_inc_return(&tex->num_level0_transfers) == 10) {
 			bool can_invalidate =
-				si_can_invalidate_texture(sctx->screen, rtex,
+				si_can_invalidate_texture(sctx->screen, tex,
 							    usage, box);
 
-			si_reallocate_texture_inplace(sctx, rtex,
+			si_reallocate_texture_inplace(sctx, tex,
 							PIPE_BIND_LINEAR,
 							can_invalidate);
 		}
@@ -1692,27 +1671,27 @@
 		 * Use the staging texture for uploads if the underlying BO
 		 * is busy.
 		 */
-		if (!rtex->surface.is_linear)
+		if (!tex->surface.is_linear)
 			use_staging_texture = true;
 		else if (usage & PIPE_TRANSFER_READ)
 			use_staging_texture =
-				rtex->resource.domains & RADEON_DOMAIN_VRAM ||
-				rtex->resource.flags & RADEON_FLAG_GTT_WC;
+				tex->buffer.domains & RADEON_DOMAIN_VRAM ||
+				tex->buffer.flags & RADEON_FLAG_GTT_WC;
 		/* Write & linear only: */
-		else if (si_rings_is_buffer_referenced(sctx, rtex->resource.buf,
+		else if (si_rings_is_buffer_referenced(sctx, tex->buffer.buf,
 						       RADEON_USAGE_READWRITE) ||
-			 !sctx->ws->buffer_wait(rtex->resource.buf, 0,
+			 !sctx->ws->buffer_wait(tex->buffer.buf, 0,
 						RADEON_USAGE_READWRITE)) {
 			/* It's busy. */
-			if (si_can_invalidate_texture(sctx->screen, rtex,
+			if (si_can_invalidate_texture(sctx->screen, tex,
 							usage, box))
-				si_texture_invalidate_storage(sctx, rtex);
+				si_texture_invalidate_storage(sctx, tex);
 			else
 				use_staging_texture = true;
 		}
 	}
 
-	trans = CALLOC_STRUCT(r600_transfer);
+	trans = CALLOC_STRUCT(si_transfer);
 	if (!trans)
 		return NULL;
 	pipe_resource_reference(&trans->b.b.resource, texture);
@@ -1720,10 +1699,10 @@
 	trans->b.b.usage = usage;
 	trans->b.b.box = *box;
 
-	if (rtex->is_depth) {
-		struct r600_texture *staging_depth;
+	if (tex->is_depth) {
+		struct si_texture *staging_depth;
 
-		if (rtex->resource.b.b.nr_samples > 1) {
+		if (tex->buffer.b.b.nr_samples > 1) {
 			/* MSAA depth buffers need to be converted to single sample buffers.
 			 *
 			 * Mapping MSAA depth buffers can occur if ReadPixels is called
@@ -1751,7 +1730,7 @@
 				}
 
 				si_copy_region_with_blit(ctx, temp, 0, 0, 0, 0, texture, level, box);
-				si_blit_decompress_depth(ctx, (struct r600_texture*)temp, staging_depth,
+				si_blit_decompress_depth(ctx, (struct si_texture*)temp, staging_depth,
 							 0, 0, 0, box->depth, 0, 0);
 				pipe_resource_reference(&temp, NULL);
 			}
@@ -1768,7 +1747,7 @@
 				goto fail_trans;
 			}
 
-			si_blit_decompress_depth(ctx, rtex, staging_depth,
+			si_blit_decompress_depth(ctx, tex, staging_depth,
 						 level, level,
 						 box->z, box->z + box->depth - 1,
 						 0, 0);
@@ -1779,11 +1758,11 @@
 							 &trans->b.b.layer_stride);
 		}
 
-		trans->staging = (struct r600_resource*)staging_depth;
+		trans->staging = &staging_depth->buffer;
 		buf = trans->staging;
 	} else if (use_staging_texture) {
 		struct pipe_resource resource;
-		struct r600_texture *staging;
+		struct si_texture *staging;
 
 		si_init_temp_resource_from_box(&resource, texture, box, level,
 						 SI_RESOURCE_FLAG_TRANSFER);
@@ -1791,12 +1770,12 @@
 			PIPE_USAGE_STAGING : PIPE_USAGE_STREAM;
 
 		/* Create the temporary texture. */
-		staging = (struct r600_texture*)ctx->screen->resource_create(ctx->screen, &resource);
+		staging = (struct si_texture*)ctx->screen->resource_create(ctx->screen, &resource);
 		if (!staging) {
 			PRINT_ERR("failed to create temporary texture to hold untiled copy\n");
 			goto fail_trans;
 		}
-		trans->staging = &staging->resource;
+		trans->staging = &staging->buffer;
 
 		/* Just get the strides. */
 		si_texture_get_offset(sctx->screen, staging, 0, NULL,
@@ -1811,10 +1790,10 @@
 		buf = trans->staging;
 	} else {
 		/* the resource is mapped directly */
-		offset = si_texture_get_offset(sctx->screen, rtex, level, box,
+		offset = si_texture_get_offset(sctx->screen, tex, level, box,
 						 &trans->b.b.stride,
 						 &trans->b.b.layer_stride);
-		buf = &rtex->resource;
+		buf = &tex->buffer;
 	}
 
 	if (!(map = si_buffer_map_sync_with_rings(sctx, buf, usage)))
@@ -1834,24 +1813,24 @@
 				      struct pipe_transfer* transfer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct r600_transfer *rtransfer = (struct r600_transfer*)transfer;
+	struct si_transfer *stransfer = (struct si_transfer*)transfer;
 	struct pipe_resource *texture = transfer->resource;
-	struct r600_texture *rtex = (struct r600_texture*)texture;
+	struct si_texture *tex = (struct si_texture*)texture;
 
-	if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtransfer->staging) {
-		if (rtex->is_depth && rtex->resource.b.b.nr_samples <= 1) {
+	if ((transfer->usage & PIPE_TRANSFER_WRITE) && stransfer->staging) {
+		if (tex->is_depth && tex->buffer.b.b.nr_samples <= 1) {
 			ctx->resource_copy_region(ctx, texture, transfer->level,
 						  transfer->box.x, transfer->box.y, transfer->box.z,
-						  &rtransfer->staging->b.b, transfer->level,
+						  &stransfer->staging->b.b, transfer->level,
 						  &transfer->box);
 		} else {
-			si_copy_from_staging_texture(ctx, rtransfer);
+			si_copy_from_staging_texture(ctx, stransfer);
 		}
 	}
 
-	if (rtransfer->staging) {
-		sctx->num_alloc_tex_transfer_bytes += rtransfer->staging->buf->size;
-		r600_resource_reference(&rtransfer->staging, NULL);
+	if (stransfer->staging) {
+		sctx->num_alloc_tex_transfer_bytes += stransfer->staging->buf->size;
+		r600_resource_reference(&stransfer->staging, NULL);
 	}
 
 	/* Heuristic for {upload, draw, upload, draw, ..}:
@@ -1885,86 +1864,72 @@
 	si_texture_transfer_unmap,	/* transfer_unmap */
 };
 
-/* DCC channel type categories within which formats can be reinterpreted
- * while keeping the same DCC encoding. The swizzle must also match. */
-enum dcc_channel_type {
-	dcc_channel_float,
-	/* uint and sint can be merged if we never use TC-compatible DCC clear
-	 * encoding with the clear value of 1. */
-	dcc_channel_uint,
-	dcc_channel_sint,
-	dcc_channel_uint_10_10_10_2,
-	dcc_channel_incompatible,
-};
-
-/* Return the type of DCC encoding. */
-static enum dcc_channel_type
-vi_get_dcc_channel_type(const struct util_format_description *desc)
-{
-	int i;
-
-	/* Find the first non-void channel. */
-	for (i = 0; i < desc->nr_channels; i++)
-		if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID)
-			break;
-	if (i == desc->nr_channels)
-		return dcc_channel_incompatible;
-
-	switch (desc->channel[i].size) {
-	case 32:
-	case 16:
-	case 8:
-		if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT)
-			return dcc_channel_float;
-		if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED)
-			return dcc_channel_uint;
-		return dcc_channel_sint;
-	case 10:
-		return dcc_channel_uint_10_10_10_2;
-	default:
-		return dcc_channel_incompatible;
-	}
-}
-
-/* Return if it's allowed to reinterpret one format as another with DCC enabled. */
+/* Return if it's allowed to reinterpret one format as another with DCC enabled.
+ */
 bool vi_dcc_formats_compatible(enum pipe_format format1,
 			       enum pipe_format format2)
 {
 	const struct util_format_description *desc1, *desc2;
-	enum dcc_channel_type type1, type2;
-	int i;
 
+	/* No format change - exit early. */
+	if (format1 == format2)
+		return true;
+
+	format1 = si_simplify_cb_format(format1);
+	format2 = si_simplify_cb_format(format2);
+
+	/* Check again after format adjustments. */
 	if (format1 == format2)
 		return true;
 
 	desc1 = util_format_description(format1);
 	desc2 = util_format_description(format2);
 
-	if (desc1->nr_channels != desc2->nr_channels)
+	if (desc1->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
+	    desc2->layout != UTIL_FORMAT_LAYOUT_PLAIN)
 		return false;
 
-	/* Swizzles must be the same. */
-	for (i = 0; i < desc1->nr_channels; i++)
-		if (desc1->swizzle[i] <= PIPE_SWIZZLE_W &&
-		    desc2->swizzle[i] <= PIPE_SWIZZLE_W &&
-		    desc1->swizzle[i] != desc2->swizzle[i])
-			return false;
+	/* Float and non-float are totally incompatible. */
+	if ((desc1->channel[0].type == UTIL_FORMAT_TYPE_FLOAT) !=
+	    (desc2->channel[0].type == UTIL_FORMAT_TYPE_FLOAT))
+		return false;
 
-	type1 = vi_get_dcc_channel_type(desc1);
-	type2 = vi_get_dcc_channel_type(desc2);
+	/* Channel sizes must match across DCC formats.
+	 * Comparing just the first 2 channels should be enough.
+	 */
+	if (desc1->channel[0].size != desc2->channel[0].size ||
+	    (desc1->nr_channels >= 2 &&
+	     desc1->channel[1].size != desc2->channel[1].size))
+		return false;
 
-	return type1 != dcc_channel_incompatible &&
-	       type2 != dcc_channel_incompatible &&
-	       type1 == type2;
+	/* Everything below is not needed if the driver never uses the DCC
+	 * clear code with the value of 1.
+	 */
+
+	/* If the clear values are all 1 or all 0, this constraint can be
+	 * ignored. */
+	if (vi_alpha_is_on_msb(format1) != vi_alpha_is_on_msb(format2))
+		return false;
+
+	/* Channel types must match if the clear value of 1 is used.
+	 * The type categories are only float, signed, unsigned.
+	 * NORM and INT are always compatible.
+	 */
+	if (desc1->channel[0].type != desc2->channel[0].type ||
+	    (desc1->nr_channels >= 2 &&
+	     desc1->channel[1].type != desc2->channel[1].type))
+		return false;
+
+	return true;
 }
 
 bool vi_dcc_formats_are_incompatible(struct pipe_resource *tex,
 				     unsigned level,
 				     enum pipe_format view_format)
 {
-	struct r600_texture *rtex = (struct r600_texture *)tex;
+	struct si_texture *stex = (struct si_texture *)tex;
 
-	return vi_dcc_enabled(rtex, level) &&
+	return vi_dcc_enabled(stex, level) &&
 	       !vi_dcc_formats_compatible(tex->format, view_format);
 }
 
@@ -1975,11 +1940,11 @@
 					   unsigned level,
 					   enum pipe_format view_format)
 {
-	struct r600_texture *rtex = (struct r600_texture *)tex;
+	struct si_texture *stex = (struct si_texture *)tex;
 
 	if (vi_dcc_formats_are_incompatible(tex, level, view_format))
-		if (!si_texture_disable_dcc(sctx, (struct r600_texture*)tex))
-			si_decompress_dcc(sctx, rtex);
+		if (!si_texture_disable_dcc(sctx, stex))
+			si_decompress_dcc(sctx, stex);
 }
 
 struct pipe_surface *si_create_surface_custom(struct pipe_context *pipe,
@@ -1988,7 +1953,7 @@
 					      unsigned width0, unsigned height0,
 					      unsigned width, unsigned height)
 {
-	struct r600_surface *surface = CALLOC_STRUCT(r600_surface);
+	struct si_surface *surface = CALLOC_STRUCT(si_surface);
 
 	if (!surface)
 		return NULL;
@@ -2137,21 +2102,21 @@
 			sctx->dcc_stats[slot].ps_stats[i] = NULL;
 		}
 
-	r600_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
+	si_texture_reference(&sctx->dcc_stats[slot].tex, NULL);
 }
 
 /**
  * Return the per-context slot where DCC statistics queries for the texture live.
  */
 static unsigned vi_get_context_dcc_stats_index(struct si_context *sctx,
-					       struct r600_texture *tex)
+					       struct si_texture *tex)
 {
 	int i, empty_slot = -1;
 
 	/* Remove zombie textures (textures kept alive by this array only). */
 	for (i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++)
 		if (sctx->dcc_stats[i].tex &&
-		    sctx->dcc_stats[i].tex->resource.b.b.reference.count == 1)
+		    sctx->dcc_stats[i].tex->buffer.b.b.reference.count == 1)
 			vi_dcc_clean_up_context_slot(sctx, i);
 
 	/* Find the texture. */
@@ -2183,7 +2148,7 @@
 	}
 
 	/* Add the texture to the new slot. */
-	r600_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
+	si_texture_reference(&sctx->dcc_stats[empty_slot].tex, tex);
 	sctx->dcc_stats[empty_slot].last_use_timestamp = os_time_get();
 	return empty_slot;
 }
@@ -2202,7 +2167,7 @@
  * Called when binding a color buffer.
  */
 void vi_separate_dcc_start_query(struct si_context *sctx,
-				 struct r600_texture *tex)
+				 struct si_texture *tex)
 {
 	unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
 
@@ -2220,7 +2185,7 @@
  * Called when unbinding a color buffer.
  */
 void vi_separate_dcc_stop_query(struct si_context *sctx,
-				struct r600_texture *tex)
+				struct si_texture *tex)
 {
 	unsigned i = vi_get_context_dcc_stats_index(sctx, tex);
 
@@ -2232,7 +2197,7 @@
 	sctx->dcc_stats[i].query_active = false;
 }
 
-static bool vi_should_enable_separate_dcc(struct r600_texture *tex)
+static bool vi_should_enable_separate_dcc(struct si_texture *tex)
 {
 	/* The minimum number of fullscreen draws per frame that is required
 	 * to enable DCC. */
@@ -2241,18 +2206,22 @@
 
 /* Called by fast clear. */
 void vi_separate_dcc_try_enable(struct si_context *sctx,
-				struct r600_texture *tex)
+				struct si_texture *tex)
 {
 	/* The intent is to use this with shared displayable back buffers,
 	 * but it's not strictly limited only to them.
 	 */
-	if (!tex->resource.b.is_shared ||
-	    !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
-	    tex->resource.b.b.target != PIPE_TEXTURE_2D ||
-	    tex->resource.b.b.last_level > 0 ||
-	    !tex->surface.dcc_size)
+	if (!tex->buffer.b.is_shared ||
+	    !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH) ||
+	    tex->buffer.b.b.target != PIPE_TEXTURE_2D ||
+	    tex->buffer.b.b.last_level > 0 ||
+	    !tex->surface.dcc_size ||
+	    sctx->screen->debug_flags & DBG(NO_DCC) ||
+	    sctx->screen->debug_flags & DBG(NO_DCC_FB))
 		return;
 
+	assert(sctx->chip_class >= VI);
+
 	if (tex->dcc_offset)
 		return; /* already enabled */
 
@@ -2277,7 +2246,7 @@
 		tex->dcc_separate_buffer = tex->last_dcc_separate_buffer;
 		tex->last_dcc_separate_buffer = NULL;
 	} else {
-		tex->dcc_separate_buffer = (struct r600_resource*)
+		tex->dcc_separate_buffer =
 			si_aligned_buffer_create(sctx->b.screen,
 						   SI_RESOURCE_FLAG_UNMAPPABLE,
 						   PIPE_USAGE_DEFAULT,
@@ -2300,7 +2269,7 @@
  * takes place.
  */
 void vi_separate_dcc_process_and_reset_stats(struct pipe_context *ctx,
-					     struct r600_texture *tex)
+					     struct si_texture *tex)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
 	struct pipe_query *tmp;
@@ -2321,7 +2290,7 @@
 		/* Compute the approximate number of fullscreen draws. */
 		tex->ps_draw_ratio =
 			result.pipeline_statistics.ps_invocations /
-			(tex->resource.b.b.width0 * tex->resource.b.b.height0);
+			(tex->buffer.b.b.width0 * tex->buffer.b.b.height0);
 		sctx->last_tex_ps_draw_ratio = tex->ps_draw_ratio;
 
 		disable = tex->dcc_separate_buffer &&
@@ -2361,7 +2330,7 @@
 		      bool dedicated)
 {
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	struct r600_memory_object *memobj = CALLOC_STRUCT(r600_memory_object);
+	struct si_memory_object *memobj = CALLOC_STRUCT(si_memory_object);
 	struct pb_buffer *buf = NULL;
 	uint32_t stride, offset;
 
@@ -2378,7 +2347,6 @@
 	memobj->b.dedicated = dedicated;
 	memobj->buf = buf;
 	memobj->stride = stride;
-	memobj->offset = offset;
 
 	return (struct pipe_memory_object *)memobj;
 
@@ -2388,7 +2356,7 @@
 si_memobj_destroy(struct pipe_screen *screen,
 		  struct pipe_memory_object *_memobj)
 {
-	struct r600_memory_object *memobj = (struct r600_memory_object *)_memobj;
+	struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
 
 	pb_reference(&memobj->buf, NULL);
 	free(memobj);
@@ -2400,77 +2368,29 @@
 		       struct pipe_memory_object *_memobj,
 		       uint64_t offset)
 {
-	int r;
 	struct si_screen *sscreen = (struct si_screen*)screen;
-	struct r600_memory_object *memobj = (struct r600_memory_object *)_memobj;
-	struct r600_texture *rtex;
-	struct radeon_surf surface = {};
-	struct radeon_bo_metadata metadata = {};
-	enum radeon_surf_mode array_mode;
-	bool is_scanout;
-	struct pb_buffer *buf = NULL;
-
-	if (memobj->b.dedicated) {
-		sscreen->ws->buffer_get_metadata(memobj->buf, &metadata);
-		si_surface_import_metadata(sscreen, &surface, &metadata,
-				     &array_mode, &is_scanout);
-	} else {
-		/**
-		 * The bo metadata is unset for un-dedicated images. So we fall
-		 * back to linear. See answer to question 5 of the
-		 * VK_KHX_external_memory spec for some details.
-		 *
-		 * It is possible that this case isn't going to work if the
-		 * surface pitch isn't correctly aligned by default.
-		 *
-		 * In order to support it correctly we require multi-image
-		 * metadata to be syncrhonized between radv and radeonsi. The
-		 * semantics of associating multiple image metadata to a memory
-		 * object on the vulkan export side are not concretely defined
-		 * either.
-		 *
-		 * All the use cases we are aware of at the moment for memory
-		 * objects use dedicated allocations. So lets keep the initial
-		 * implementation simple.
-		 *
-		 * A possible alternative is to attempt to reconstruct the
-		 * tiling information when the TexParameter TEXTURE_TILING_EXT
-		 * is set.
-		 */
-		array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED;
-		is_scanout = false;
-
-	}
-
-	r = si_init_surface(sscreen, &surface, templ,
-			      array_mode, memobj->stride,
-			      offset, true, is_scanout,
-			      false, false);
-	if (r)
+	struct si_memory_object *memobj = (struct si_memory_object *)_memobj;
+	struct pipe_resource *tex =
+		si_texture_from_winsys_buffer(sscreen, templ, memobj->buf,
+					      memobj->stride, offset,
+					      PIPE_HANDLE_USAGE_READ_WRITE,
+					      memobj->b.dedicated);
+	if (!tex)
 		return NULL;
 
-	rtex = si_texture_create_object(screen, templ, memobj->buf, &surface);
-	if (!rtex)
-		return NULL;
-
-	/* r600_texture_create_object doesn't increment refcount of
+	/* si_texture_from_winsys_buffer doesn't increment refcount of
 	 * memobj->buf, so increment it here.
 	 */
+	struct pb_buffer *buf = NULL;
 	pb_reference(&buf, memobj->buf);
-
-	rtex->resource.b.is_shared = true;
-	rtex->resource.external_usage = PIPE_HANDLE_USAGE_READ_WRITE;
-
-	si_apply_opaque_metadata(sscreen, rtex, &metadata);
-
-	return &rtex->resource.b.b;
+	return tex;
 }
 
 static bool si_check_resource_capability(struct pipe_screen *screen,
 					 struct pipe_resource *resource,
 					 unsigned bind)
 {
-	struct r600_texture *tex = (struct r600_texture*)resource;
+	struct si_texture *tex = (struct si_texture*)resource;
 
 	/* Buffers only support the linear flag. */
 	if (resource->target == PIPE_BUFFER)
diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c
index 4165725..1a9d8f8 100644
--- a/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/src/gallium/drivers/radeonsi/si_uvd.c
@@ -41,7 +41,7 @@
 						 const struct pipe_video_buffer *tmpl)
 {
 	struct si_context *ctx = (struct si_context *)pipe;
-	struct r600_texture *resources[VL_NUM_COMPONENTS] = {};
+	struct si_texture *resources[VL_NUM_COMPONENTS] = {};
 	struct radeon_surf *surfaces[VL_NUM_COMPONENTS] = {};
 	struct pb_buffer **pbs[VL_NUM_COMPONENTS] = {};
 	const enum pipe_format *resource_formats;
@@ -68,11 +68,11 @@
 			vl_video_buffer_template(&templ, &vidtemplate,
 			                         resource_formats[i], 1,
 			                         array_size, PIPE_USAGE_DEFAULT, i);
-			/* Set PIPE_BIND_SHARED to avoid reallocation in r600_texture_get_handle,
+			/* Set PIPE_BIND_SHARED to avoid reallocation in si_texture_get_handle,
 			 * which can't handle joined surfaces. */
 			/* TODO: get tiling working */
 			templ.bind = PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
-			resources[i] = (struct r600_texture *)
+			resources[i] = (struct si_texture *)
 			                pipe->screen->resource_create(pipe->screen, &templ);
 			if (!resources[i])
 				goto error;
@@ -84,7 +84,7 @@
 			continue;
 
 		surfaces[i] = & resources[i]->surface;
-		pbs[i] = &resources[i]->resource.buf;
+		pbs[i] = &resources[i]->buffer.buf;
 	}
 
 	si_vid_join_surfaces(ctx, pbs, surfaces);
@@ -94,8 +94,8 @@
 			continue;
 
 		/* reset the address */
-		resources[i]->resource.gpu_address = ctx->ws->buffer_get_virtual_address(
-			resources[i]->resource.buf);
+		resources[i]->buffer.gpu_address = ctx->ws->buffer_get_virtual_address(
+			resources[i]->buffer.buf);
 	}
 
 	vidtemplate.height *= array_size;
@@ -103,7 +103,7 @@
 
 error:
 	for (i = 0; i < VL_NUM_COMPONENTS; ++i)
-		r600_texture_reference(&resources[i], NULL);
+		si_texture_reference(&resources[i], NULL);
 
 	return NULL;
 }
@@ -112,8 +112,8 @@
 static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
 	struct si_screen *sscreen = (struct si_screen*)buf->base.context->screen;
-	struct r600_texture *luma = (struct r600_texture *)buf->resources[0];
-	struct r600_texture *chroma = (struct r600_texture *)buf->resources[1];
+	struct si_texture *luma = (struct si_texture *)buf->resources[0];
+	struct si_texture *chroma = (struct si_texture *)buf->resources[1];
 	enum ruvd_surface_type type =  (sscreen->info.chip_class >= GFX9) ?
 					RUVD_SURFACE_TYPE_GFX9 :
 					RUVD_SURFACE_TYPE_LEGACY;
@@ -122,7 +122,7 @@
 
 	si_uvd_set_dt_surfaces(msg, &luma->surface, (chroma) ? &chroma->surface : NULL, type);
 
-	return luma->resource.buf;
+	return luma->buffer.buf;
 }
 
 /* get the radeon resources for VCE */
@@ -130,10 +130,10 @@
 			      struct pb_buffer **handle,
 			      struct radeon_surf **surface)
 {
-	struct r600_texture *res = (struct r600_texture *)resource;
+	struct si_texture *res = (struct si_texture *)resource;
 
 	if (handle)
-		*handle = res->resource.buf;
+		*handle = res->buffer.buf;
 
 	if (surface)
 		*surface = &res->surface;
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 25f6f74..302bebf 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -151,6 +151,8 @@
       return 0;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return 330;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return 140;
    case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
    case PIPE_CAP_TGSI_TEX_TXF_LZ:
       return 0;
@@ -309,11 +311,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
       return 4;
@@ -362,6 +372,12 @@
       return 16.0;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 16.0; /* arbitrary */
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+      return 0.0;
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+      return 0.0;
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0;
    }
    /* should only get here on unhandled cases */
    debug_printf("Unexpected PIPE_CAPF %d query\n", param);
@@ -378,6 +394,7 @@
                               enum pipe_format format,
                               enum pipe_texture_target target,
                               unsigned sample_count,
+                              unsigned storage_sample_count,
                               unsigned bind)
 {
    struct sw_winsys *winsys = softpipe_screen(screen)->winsys;
@@ -393,6 +410,9 @@
           target == PIPE_TEXTURE_CUBE ||
           target == PIPE_TEXTURE_CUBE_ARRAY);
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    format_desc = util_format_description(format);
    if (!format_desc)
       return FALSE;
@@ -426,8 +446,7 @@
          return FALSE;
    }
 
-   if (format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC ||
-       format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_ASTC) {
       /* Software decoding is not hooked up. */
       return FALSE;
    }
diff --git a/src/gallium/drivers/svga/meson.build b/src/gallium/drivers/svga/meson.build
index 2976212..7981e29 100644
--- a/src/gallium/drivers/svga/meson.build
+++ b/src/gallium/drivers/svga/meson.build
@@ -79,7 +79,7 @@
 
 libsvga = static_library(
   'svga',
-  files_svga,
+  [files_svga, sha1_h],
   c_args : [c_vis_args, c_msvc_compat_args],
   include_directories : [
     inc_src, inc_include, inc_gallium, inc_gallium_aux,
diff --git a/src/gallium/drivers/svga/svga_format.c b/src/gallium/drivers/svga/svga_format.c
index c9adee1..0b20260 100644
--- a/src/gallium/drivers/svga/svga_format.c
+++ b/src/gallium/drivers/svga/svga_format.c
@@ -2082,6 +2082,7 @@
                          enum pipe_format format,
                          enum pipe_texture_target target,
                          unsigned sample_count,
+                         unsigned storage_sample_count,
                          unsigned bindings)
 {
    struct svga_screen *ss = svga_screen(screen);
@@ -2091,6 +2092,9 @@
 
    assert(bindings);
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    if (sample_count > 1) {
       /* In ms_samples, if bit N is set it means that we support
        * multisample with N+1 samples per pixel.
diff --git a/src/gallium/drivers/svga/svga_format.h b/src/gallium/drivers/svga/svga_format.h
index c063589..11e7e41 100644
--- a/src/gallium/drivers/svga/svga_format.h
+++ b/src/gallium/drivers/svga/svga_format.h
@@ -127,6 +127,7 @@
                          enum pipe_format format,
                          enum pipe_texture_target target,
                          unsigned sample_count,
+                         unsigned storage_sample_count,
                          unsigned bindings);
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_clear.c b/src/gallium/drivers/svga/svga_pipe_clear.c
index e234ef5..d128717 100644
--- a/src/gallium/drivers/svga/svga_pipe_clear.c
+++ b/src/gallium/drivers/svga/svga_pipe_clear.c
@@ -422,6 +422,7 @@
              pipe->screen->is_format_supported(pipe->screen, rtv->format,
                                                rtv->texture->target,
                                                rtv->texture->nr_samples,
+                                               rtv->texture->nr_storage_samples,
                                                PIPE_BIND_RENDER_TARGET)) {
             /* clear with quad drawing */
             util_blitter_save_framebuffer(svga->blitter,
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 104cb6d..9daae9c 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -1003,9 +1003,14 @@
             return NULL;
       }
    } else {
-      if (!sbuf->bind_flags) {
+      /* If there is no resource handle yet, then combine the buffer bind
+       * flags and the tobind_flags if they are compatible.
+       * If not, just use the tobind_flags for creating the resource handle.
+       */
+      if (compatible_bind_flags(sbuf->bind_flags, tobind_flags))
+         sbuf->bind_flags = sbuf->bind_flags | tobind_flags;
+      else
          sbuf->bind_flags = tobind_flags;
-      }
 
       assert((sbuf->bind_flags & tobind_flags) == tobind_flags);
 
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 71b8ebe..068862b 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -1040,6 +1040,7 @@
       if (screen->is_format_supported(screen, template->format,
                                       template->target,
                                       template->nr_samples,
+                                      template->nr_storage_samples,
                                       PIPE_BIND_SAMPLER_VIEW)) {
          bindings |= PIPE_BIND_SAMPLER_VIEW;
       }
@@ -1054,6 +1055,7 @@
          if (screen->is_format_supported(screen, template->format,
                                          template->target,
                                          template->nr_samples,
+                                         template->nr_storage_samples,
                                          PIPE_BIND_RENDER_TARGET)) {
             bindings |= PIPE_BIND_RENDER_TARGET;
          }
@@ -1064,6 +1066,7 @@
          if (screen->is_format_supported(screen, template->format,
                                          template->target,
                                          template->nr_samples,
+                                         template->nr_storage_samples,
                                          PIPE_BIND_DEPTH_STENCIL)) {
             bindings |= PIPE_BIND_DEPTH_STENCIL;
          }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index f5f07fa..5706216 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -91,8 +91,6 @@
     */
    build = "build: DEBUG;";
    mutex = "mutex: " PIPE_ATOMIC ";";
-#elif defined(VMX86_STATS)
-   build = "build: OPT;";
 #else
    build = "build: RELEASE;";
 #endif
@@ -167,6 +165,13 @@
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 15.0;
 
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+      /* fall-through */
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+      /* fall-through */
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
+
    }
 
    debug_printf("Unexpected PIPE_CAPF_ query %u\n", param);
@@ -271,6 +276,9 @@
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return sws->have_vgpu10 ? 330 : 120;
 
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return sws->have_vgpu10 ? 140 : 120;
+
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return 0;
 
@@ -373,6 +381,12 @@
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
@@ -447,11 +461,13 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
    }
 
@@ -539,6 +555,8 @@
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
          return 0;
+      case PIPE_SHADER_CAP_SCALAR_ISA:
+         return 1;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
       }
@@ -606,6 +624,8 @@
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
          return 0;
+      case PIPE_SHADER_CAP_SCALAR_ISA:
+         return 1;
       case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
          return 32;
       }
@@ -707,6 +727,8 @@
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
    case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_SCALAR_ISA:
+      return 1;
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
    default:
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 42a3593e..0ecabc7 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -42,11 +42,11 @@
                          unsigned *idx)
 {
    switch (semantic.Name) {
-   case TGSI_SEMANTIC_POSITION:  
+   case TGSI_SEMANTIC_POSITION:
       *idx = semantic.Index;
       *usage = SVGA3D_DECLUSAGE_POSITION;
       break;
-   case TGSI_SEMANTIC_COLOR:     
+   case TGSI_SEMANTIC_COLOR:
       *idx = semantic.Index;
       *usage = SVGA3D_DECLUSAGE_COLOR;
       break;
@@ -54,21 +54,21 @@
       *idx = semantic.Index + 2; /* sharing with COLOR */
       *usage = SVGA3D_DECLUSAGE_COLOR;
       break;
-   case TGSI_SEMANTIC_FOG:       
+   case TGSI_SEMANTIC_FOG:
       *idx = 0;
       assert(semantic.Index == 0);
       *usage = SVGA3D_DECLUSAGE_TEXCOORD;
       break;
-   case TGSI_SEMANTIC_PSIZE:     
+   case TGSI_SEMANTIC_PSIZE:
       *idx = semantic.Index;
       *usage = SVGA3D_DECLUSAGE_PSIZE;
       break;
-   case TGSI_SEMANTIC_GENERIC:   
+   case TGSI_SEMANTIC_GENERIC:
       *idx = svga_remap_generic_index(emit->key.generic_remap_table,
                                       semantic.Index);
       *usage = SVGA3D_DECLUSAGE_TEXCOORD;
       break;
-   case TGSI_SEMANTIC_NORMAL:    
+   case TGSI_SEMANTIC_NORMAL:
       *idx = semantic.Index;
       *usage = SVGA3D_DECLUSAGE_NORMAL;
       break;
@@ -98,7 +98,7 @@
 static boolean
 emit_decl(struct svga_shader_emitter *emit,
           SVGA3dShaderDestToken reg,
-          unsigned usage, 
+          unsigned usage,
           unsigned index)
 {
    SVGA3DOpDclArgs dcl;
@@ -108,7 +108,7 @@
    assert(index < 16);
    assert(usage <= SVGA3D_DECLUSAGE_MAX);
 
-   opcode = inst_token( SVGA3DOP_DCL );
+   opcode = inst_token(SVGA3DOP_DCL);
    dcl.values[0] = 0;
    dcl.values[1] = 0;
 
@@ -118,7 +118,7 @@
    dcl.values[0] |= 1<<31;
 
    return (emit_instruction(emit, opcode) &&
-           svga_shader_emit_dwords( emit, dcl.values, ARRAY_SIZE(dcl.values)));
+           svga_shader_emit_dwords(emit, dcl.values, ARRAY_SIZE(dcl.values)));
 }
 
 
@@ -132,7 +132,7 @@
       SVGA3dShaderDestToken reg =
          dst_register(SVGA3DREG_MISCTYPE, SVGA3DMISCREG_FACE);
 
-      if (!emit_decl( emit, reg, 0, 0 ))
+      if (!emit_decl(emit, reg, 0, 0))
          return FALSE;
 
       emit->emitted_vface = TRUE;
@@ -146,8 +146,8 @@
  * Note that this always goes into texcoord[0].
  */
 static boolean
-ps30_input_emit_depth_fog( struct svga_shader_emitter *emit,
-                           struct src_register *out )
+ps30_input_emit_depth_fog(struct svga_shader_emitter *emit,
+                          struct src_register *out)
 {
    struct src_register reg;
 
@@ -159,14 +159,14 @@
    if (emit->ps30_input_count >= SVGA3D_INPUTREG_MAX)
       return FALSE;
 
-   reg = src_register( SVGA3DREG_INPUT,
-                       emit->ps30_input_count++ );
+   reg = src_register(SVGA3DREG_INPUT,
+                       emit->ps30_input_count++);
 
    *out = emit->ps_depth_fog = reg;
 
    emit->emitted_depth_fog = TRUE;
 
-   return emit_decl( emit, dst( reg ), SVGA3D_DECLUSAGE_TEXCOORD, 0 );
+   return emit_decl(emit, dst(reg), SVGA3D_DECLUSAGE_TEXCOORD, 0);
 }
 
 
@@ -184,73 +184,74 @@
 
    if (semantic.Name == TGSI_SEMANTIC_POSITION) {
 
-      emit->ps_true_pos = src_register( SVGA3DREG_MISCTYPE,
-                                        SVGA3DMISCREG_POSITION );
-      emit->ps_true_pos.base.swizzle = TRANSLATE_SWIZZLE( TGSI_SWIZZLE_X,
+      emit->ps_true_pos = src_register(SVGA3DREG_MISCTYPE,
+                                        SVGA3DMISCREG_POSITION);
+      emit->ps_true_pos.base.swizzle = TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,
                                                           TGSI_SWIZZLE_Y,
                                                           TGSI_SWIZZLE_Y,
-                                                          TGSI_SWIZZLE_Y );
-      reg = writemask( dst(emit->ps_true_pos),
-                       TGSI_WRITEMASK_XY );
+                                                          TGSI_SWIZZLE_Y);
+      reg = writemask(dst(emit->ps_true_pos),
+                       TGSI_WRITEMASK_XY);
       emit->ps_reads_pos = TRUE;
 
       if (emit->info.reads_z) {
-         emit->ps_temp_pos = dst_register( SVGA3DREG_TEMP,
-                                           emit->nr_hw_temp );
+         emit->ps_temp_pos = dst_register(SVGA3DREG_TEMP,
+                                           emit->nr_hw_temp);
 
-         emit->input_map[idx] = src_register( SVGA3DREG_TEMP,
-                                              emit->nr_hw_temp );
+         emit->input_map[idx] = src_register(SVGA3DREG_TEMP,
+                                              emit->nr_hw_temp);
          emit->nr_hw_temp++;
 
-         if (!ps30_input_emit_depth_fog( emit, &emit->ps_depth_pos ))
+         if (!ps30_input_emit_depth_fog(emit, &emit->ps_depth_pos))
             return FALSE;
 
-         emit->ps_depth_pos.base.swizzle = TRANSLATE_SWIZZLE( TGSI_SWIZZLE_Z,
+         emit->ps_depth_pos.base.swizzle = TRANSLATE_SWIZZLE(TGSI_SWIZZLE_Z,
                                                               TGSI_SWIZZLE_Z,
                                                               TGSI_SWIZZLE_Z,
-                                                              TGSI_SWIZZLE_W );
+                                                              TGSI_SWIZZLE_W);
       }
       else {
          emit->input_map[idx] = emit->ps_true_pos;
       }
 
-      return emit_decl( emit, reg, 0, 0 );
+      return emit_decl(emit, reg, 0, 0);
    }
    else if (emit->key.fs.light_twoside &&
             (semantic.Name == TGSI_SEMANTIC_COLOR)) {
 
-      if (!translate_vs_ps_semantic( emit, semantic, &usage, &index ))
+      if (!translate_vs_ps_semantic(emit, semantic, &usage, &index))
          return FALSE;
 
       emit->internal_color_idx[emit->internal_color_count] = idx;
-      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count );
+      emit->input_map[idx] =
+         src_register(SVGA3DREG_INPUT, emit->ps30_input_count);
       emit->ps30_input_count++;
       emit->internal_color_count++;
 
-      reg = dst( emit->input_map[idx] );
+      reg = dst(emit->input_map[idx]);
 
-      if (!emit_decl( emit, reg, usage, index ))
+      if (!emit_decl(emit, reg, usage, index))
          return FALSE;
 
       semantic.Name = TGSI_SEMANTIC_BCOLOR;
-      if (!translate_vs_ps_semantic( emit, semantic, &usage, &index ))
+      if (!translate_vs_ps_semantic(emit, semantic, &usage, &index))
          return FALSE;
 
       if (emit->ps30_input_count >= SVGA3D_INPUTREG_MAX)
          return FALSE;
 
-      reg = dst_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
+      reg = dst_register(SVGA3DREG_INPUT, emit->ps30_input_count++);
 
-      if (!emit_decl( emit, reg, usage, index ))
+      if (!emit_decl(emit, reg, usage, index))
          return FALSE;
 
-      if (!emit_vface_decl( emit ))
+      if (!emit_vface_decl(emit))
          return FALSE;
 
       return TRUE;
    }
    else if (semantic.Name == TGSI_SEMANTIC_FACE) {
-      if (!emit_vface_decl( emit ))
+      if (!emit_vface_decl(emit))
          return FALSE;
       emit->emit_frontface = TRUE;
       emit->internal_frontface_idx = idx;
@@ -260,28 +261,29 @@
 
       assert(semantic.Index == 0);
 
-      if (!ps30_input_emit_depth_fog( emit, &emit->input_map[idx] ))
+      if (!ps30_input_emit_depth_fog(emit, &emit->input_map[idx]))
          return FALSE;
 
-      emit->input_map[idx].base.swizzle = TRANSLATE_SWIZZLE( TGSI_SWIZZLE_X,
+      emit->input_map[idx].base.swizzle = TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,
                                                              TGSI_SWIZZLE_X,
                                                              TGSI_SWIZZLE_X,
-                                                             TGSI_SWIZZLE_X );
-
+                                                             TGSI_SWIZZLE_X);
       return TRUE;
    }
    else {
 
-      if (!translate_vs_ps_semantic( emit, semantic, &usage, &index ))
+      if (!translate_vs_ps_semantic(emit, semantic, &usage, &index))
          return FALSE;
 
       if (emit->ps30_input_count >= SVGA3D_INPUTREG_MAX)
          return FALSE;
 
-      emit->input_map[idx] = src_register( SVGA3DREG_INPUT, emit->ps30_input_count++ );
-      reg = dst( emit->input_map[idx] );
+      emit->input_map[idx] =
+         src_register(SVGA3DREG_INPUT, emit->ps30_input_count++);
 
-      if (!emit_decl( emit, reg, usage, index ))
+      reg = dst(emit->input_map[idx]);
+
+      if (!emit_decl(emit, reg, usage, index))
          return FALSE;
 
       if (semantic.Name == TGSI_SEMANTIC_GENERIC &&
@@ -331,10 +333,10 @@
       if (emit->unit == PIPE_SHADER_FRAGMENT) {
          if (emit->key.fs.white_fragments) {
             /* Used for XOR logicop mode */
-            emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
-                                                  emit->nr_hw_temp++ );
+            emit->output_map[idx] = dst_register(SVGA3DREG_TEMP,
+                                                  emit->nr_hw_temp++);
             emit->temp_color_output[idx] = emit->output_map[idx];
-            emit->true_color_output[idx] = dst_register(SVGA3DREG_COLOROUT, 
+            emit->true_color_output[idx] = dst_register(SVGA3DREG_COLOROUT,
                                                         semantic.Index);
          }
          else if (emit->key.fs.write_color0_to_n_cbufs) {
@@ -359,21 +361,21 @@
          }
       }
       else {
-         emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 
-                                               semantic.Index );
+         emit->output_map[idx] = dst_register(SVGA3DREG_COLOROUT,
+                                               semantic.Index);
       }
       break;
    case TGSI_SEMANTIC_POSITION:
-      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
-                                            emit->nr_hw_temp++ );
+      emit->output_map[idx] = dst_register(SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++);
       emit->temp_pos = emit->output_map[idx];
-      emit->true_pos = dst_register( SVGA3DREG_DEPTHOUT, 
-                                     semantic.Index );
+      emit->true_pos = dst_register(SVGA3DREG_DEPTHOUT,
+                                     semantic.Index);
       break;
    default:
       assert(0);
       /* A wild stab in the dark. */
-      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 0 );
+      emit->output_map[idx] = dst_register(SVGA3DREG_COLOROUT, 0);
       break;
    }
 
@@ -394,23 +396,23 @@
    SVGA3dShaderInstToken opcode;
    unsigned usage, index;
 
-   opcode = inst_token( SVGA3DOP_DCL );
+   opcode = inst_token(SVGA3DOP_DCL);
    dcl.values[0] = 0;
    dcl.values[1] = 0;
 
-   emit->input_map[idx] = src_register( SVGA3DREG_INPUT, idx );
-   dcl.dst = dst_register( SVGA3DREG_INPUT, idx );
+   emit->input_map[idx] = src_register(SVGA3DREG_INPUT, idx);
+   dcl.dst = dst_register(SVGA3DREG_INPUT, idx);
 
    assert(dcl.dst.reserved0);
 
-   svga_generate_vdecl_semantics( idx, &usage, &index );
+   svga_generate_vdecl_semantics(idx, &usage, &index);
 
    dcl.usage = usage;
    dcl.index = index;
    dcl.values[0] |= 1<<31;
 
    return (emit_instruction(emit, opcode) &&
-           svga_shader_emit_dwords( emit, dcl.values, ARRAY_SIZE(dcl.values)));
+           svga_shader_emit_dwords(emit, dcl.values, ARRAY_SIZE(dcl.values)));
 }
 
 
@@ -428,13 +430,13 @@
       return TRUE;
    }
 
-   reg = dst_register( SVGA3DREG_OUTPUT, emit->vs30_output_count++ );
+   reg = dst_register(SVGA3DREG_OUTPUT, emit->vs30_output_count++);
 
    *out = emit->vs_depth_fog = reg;
 
    emit->emitted_depth_fog = TRUE;
 
-   return emit_decl( emit, reg, SVGA3D_DECLUSAGE_TEXCOORD, 0 );
+   return emit_decl(emit, reg, SVGA3D_DECLUSAGE_TEXCOORD, 0);
 }
 
 
@@ -452,39 +454,39 @@
    SVGA3dShaderInstToken opcode;
    unsigned usage, index;
 
-   opcode = inst_token( SVGA3DOP_DCL );
+   opcode = inst_token(SVGA3DOP_DCL);
    dcl.values[0] = 0;
    dcl.values[1] = 0;
 
-   if (!translate_vs_ps_semantic( emit, semantic, &usage, &index ))
+   if (!translate_vs_ps_semantic(emit, semantic, &usage, &index))
       return FALSE;
 
    if (emit->vs30_output_count >= SVGA3D_OUTPUTREG_MAX)
       return FALSE;
 
-   dcl.dst = dst_register( SVGA3DREG_OUTPUT, emit->vs30_output_count++ );
+   dcl.dst = dst_register(SVGA3DREG_OUTPUT, emit->vs30_output_count++);
    dcl.usage = usage;
    dcl.index = index;
    dcl.values[0] |= 1<<31;
 
    if (semantic.Name == TGSI_SEMANTIC_POSITION) {
       assert(idx == 0);
-      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
-                                            emit->nr_hw_temp++ );
+      emit->output_map[idx] = dst_register(SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++);
       emit->temp_pos = emit->output_map[idx];
       emit->true_pos = dcl.dst;
 
       /* Grab an extra output for the depth output */
-      if (!vs30_output_emit_depth_fog( emit, &emit->depth_pos ))
+      if (!vs30_output_emit_depth_fog(emit, &emit->depth_pos))
          return FALSE;
 
    }
    else if (semantic.Name == TGSI_SEMANTIC_PSIZE) {
-      emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
-                                            emit->nr_hw_temp++ );
+      emit->output_map[idx] = dst_register(SVGA3DREG_TEMP,
+                                            emit->nr_hw_temp++);
       emit->temp_psiz = emit->output_map[idx];
 
-      /* This has the effect of not declaring psiz (below) and not 
+      /* This has the effect of not declaring psiz (below) and not
        * emitting the final MOV to true_psiz in the postamble.
        */
       if (!emit->key.vs.allow_psiz)
@@ -499,7 +501,7 @@
        */
       emit->vs30_output_count--;
 
-      if (!vs30_output_emit_depth_fog( emit, &emit->output_map[idx] ))
+      if (!vs30_output_emit_depth_fog(emit, &emit->output_map[idx]))
          return FALSE;
 
       return TRUE;
@@ -509,7 +511,7 @@
    }
 
    return (emit_instruction(emit, opcode) &&
-           svga_shader_emit_dwords( emit, dcl.values, ARRAY_SIZE(dcl.values)));
+           svga_shader_emit_dwords(emit, dcl.values, ARRAY_SIZE(dcl.values)));
 }
 
 
@@ -534,26 +536,27 @@
 
 
 static boolean
-ps30_sampler( struct svga_shader_emitter *emit,
-              unsigned idx )
+ps30_sampler(struct svga_shader_emitter *emit,
+              unsigned idx)
 {
    SVGA3DOpDclArgs dcl;
    SVGA3dShaderInstToken opcode;
 
-   opcode = inst_token( SVGA3DOP_DCL );
+   opcode = inst_token(SVGA3DOP_DCL);
    dcl.values[0] = 0;
    dcl.values[1] = 0;
 
-   dcl.dst = dst_register( SVGA3DREG_SAMPLER, idx );
-   dcl.type = svga_tgsi_sampler_type( emit, idx );
+   dcl.dst = dst_register(SVGA3DREG_SAMPLER, idx);
+   dcl.type = svga_tgsi_sampler_type(emit, idx);
    dcl.values[0] |= 1<<31;
 
    return (emit_instruction(emit, opcode) &&
-           svga_shader_emit_dwords( emit, dcl.values, ARRAY_SIZE(dcl.values)));
+           svga_shader_emit_dwords(emit, dcl.values, ARRAY_SIZE(dcl.values)));
 }
 
+
 boolean
-svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit )
+svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit)
 {
    unsigned i;
 
@@ -564,15 +567,16 @@
    return TRUE;
 }
 
+
 boolean
-svga_translate_decl_sm30( struct svga_shader_emitter *emit,
-                          const struct tgsi_full_declaration *decl )
+svga_translate_decl_sm30(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_declaration *decl)
 {
    unsigned first = decl->Range.First;
    unsigned last = decl->Range.Last;
    unsigned idx;
 
-   for( idx = first; idx <= last; idx++ ) {
+   for(idx = first; idx <= last; idx++) {
       boolean ok = TRUE;
 
       switch (decl->Declaration.File) {
@@ -586,16 +590,16 @@
 
       case TGSI_FILE_INPUT:
          if (emit->unit == PIPE_SHADER_VERTEX)
-            ok = vs30_input( emit, decl->Semantic, idx );
+            ok = vs30_input(emit, decl->Semantic, idx);
          else
-            ok = ps30_input( emit, decl->Semantic, idx );
+            ok = ps30_input(emit, decl->Semantic, idx);
          break;
 
       case TGSI_FILE_OUTPUT:
          if (emit->unit == PIPE_SHADER_VERTEX)
-            ok = vs30_output( emit, decl->Semantic, idx );
+            ok = vs30_output(emit, decl->Semantic, idx);
          else
-            ok = ps30_output( emit, decl->Semantic, idx );
+            ok = ps30_output(emit, decl->Semantic, idx);
          break;
 
       case TGSI_FILE_SAMPLER_VIEW:
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index c22f09e..5cc3f77 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -104,6 +104,7 @@
 		$(srcdir)/rasterizer/codegen/gen_llvm_types.py \
 		--input $(srcdir)/swr_context.h \
 		--output ./gen_swr_context_llvm.h
+	$(AM_V_GEN)touch $@
 
 rasterizer/codegen/gen_knobs.cpp: rasterizer/codegen/gen_knobs.py rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.cpp rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -111,6 +112,7 @@
 		$(srcdir)/rasterizer/codegen/gen_knobs.py \
 		--output rasterizer/codegen/gen_knobs.cpp \
 		--gen_cpp
+	$(AM_V_GEN)touch $@
 
 rasterizer/codegen/gen_knobs.h: rasterizer/codegen/gen_knobs.py rasterizer/codegen/knob_defs.py rasterizer/codegen/templates/gen_knobs.h rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -118,6 +120,7 @@
 		$(srcdir)/rasterizer/codegen/gen_knobs.py \
 		--output rasterizer/codegen/gen_knobs.h \
 		--gen_h
+	$(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_state_llvm.h: rasterizer/codegen/gen_llvm_types.py rasterizer/codegen/templates/gen_llvm.hpp rasterizer/core/state.h rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -125,6 +128,7 @@
 		$(srcdir)/rasterizer/codegen/gen_llvm_types.py \
 		--input $(srcdir)/rasterizer/core/state.h \
 		--output rasterizer/jitter/gen_state_llvm.h
+	$(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -133,6 +137,7 @@
 		--input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \
 		--output rasterizer/jitter \
 		--gen_h
+	$(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder_meta.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -140,6 +145,7 @@
 		$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
 		--output rasterizer/jitter \
 		--gen_meta_h
+	$(AM_V_GEN)touch $@
 
 rasterizer/jitter/gen_builder_intrin.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -147,6 +153,7 @@
 		$(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
 		--output rasterizer/jitter \
 		--gen_intrin_h
+	$(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_event.hpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_event.hpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -156,6 +163,7 @@
 		--proto_private $(srcdir)/rasterizer/archrast/events_private.proto \
 		--output rasterizer/archrast/gen_ar_event.hpp \
 		--gen_event_hpp
+	$(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_event.cpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_event.cpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -165,6 +173,7 @@
 		--proto_private $(srcdir)/rasterizer/archrast/events_private.proto \
 		--output rasterizer/archrast/gen_ar_event.cpp \
 		--gen_event_cpp
+	$(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_eventhandler.hpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_eventhandler.hpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -174,6 +183,7 @@
 		--proto_private $(srcdir)/rasterizer/archrast/events_private.proto \
 		--output rasterizer/archrast/gen_ar_eventhandler.hpp \
 		--gen_eventhandler_hpp
+	$(AM_V_GEN)touch $@
 
 rasterizer/archrast/gen_ar_eventhandlerfile.hpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
 	$(MKDIR_GEN)
@@ -183,6 +193,7 @@
 		--proto_private $(srcdir)/rasterizer/archrast/events_private.proto \
 		--output rasterizer/archrast/gen_ar_eventhandlerfile.hpp \
 		--gen_eventhandlerfile_hpp
+	$(AM_V_GEN)touch $@
 
 rasterizer/core/backends/gen_BackendPixelRate0.cpp \
 rasterizer/core/backends/gen_BackendPixelRate1.cpp \
@@ -363,9 +374,9 @@
 # created with the oldest supported version of LLVM.
 dist-hook:
 if SWR_INVALID_LLVM_VERSION
-	@echo "*******************************************************"
-	@echo "LLVM 4.0.0 or LLVM 4.0.1 required to create the tarball"
-	@echo "*******************************************************"
+	@echo "*****************************************"
+	@echo "LLVM 5.0.x required to create the tarball"
+	@echo "*****************************************"
 	@test
 endif
 
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index a06d1d7..b298356 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -132,6 +132,7 @@
 	rasterizer/core/threads.h \
 	rasterizer/core/tilemgr.cpp \
 	rasterizer/core/tilemgr.h \
+	rasterizer/core/tileset.h \
 	rasterizer/core/utils.h
 
 JITTER_CXX_SOURCES := \
@@ -176,4 +177,6 @@
 	rasterizer/memory/StoreTile_TileY2.cpp \
 	rasterizer/memory/StoreTile_TileY.cpp \
 	rasterizer/memory/TilingFunctions.h \
-	rasterizer/memory/tilingtraits.h
+	rasterizer/memory/tilingtraits.h \
+	rasterizer/memory/InitMemory.cpp \
+	rasterizer/memory/InitMemory.h
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index 528cfac..224372e 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -12,8 +12,8 @@
     env['swr'] = False
     Return()
 
-if env['LLVM_VERSION'] < distutils.version.LooseVersion('4.0'):
-    print("warning: swr requires LLVM >= 4.0: not building swr")
+if env['LLVM_VERSION'] < distutils.version.LooseVersion('5.0'):
+    print("warning: swr requires LLVM >= 5.0: not building swr")
     env['swr'] = False
     Return()
 
diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build
index 575133d..b95c8bc 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -131,6 +131,7 @@
   'rasterizer/core/threads.h',
   'rasterizer/core/tilemgr.cpp',
   'rasterizer/core/tilemgr.h',
+  'rasterizer/core/tileset.h',
   'rasterizer/core/utils.h',
   'rasterizer/memory/ClearTile.cpp',
   'rasterizer/memory/Convert.h',
@@ -150,6 +151,8 @@
   'rasterizer/memory/StoreTile_TileY.cpp',
   'rasterizer/memory/TilingFunctions.h',
   'rasterizer/memory/tilingtraits.h',
+  'rasterizer/memory/InitMemory.h',
+  'rasterizer/memory/InitMemory.cpp',
 )
 
 swr_context_files = files('swr_context.h')
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
index 871db79..ceb06ae 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file archrast.cpp
-*
-* @brief Implementation for archrast.
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file archrast.cpp
+ *
+ * @brief Implementation for archrast.
+ *
+ ******************************************************************************/
 #include <atomic>
 
 #include "common/os.h"
@@ -38,14 +38,14 @@
     /// @brief struct that keeps track of depth and stencil event information
     struct DepthStencilStats
     {
-        uint32_t earlyZTestPassCount = 0;
-        uint32_t earlyZTestFailCount = 0;
-        uint32_t lateZTestPassCount = 0;
-        uint32_t lateZTestFailCount = 0;
+        uint32_t earlyZTestPassCount       = 0;
+        uint32_t earlyZTestFailCount       = 0;
+        uint32_t lateZTestPassCount        = 0;
+        uint32_t lateZTestFailCount        = 0;
         uint32_t earlyStencilTestPassCount = 0;
         uint32_t earlyStencilTestFailCount = 0;
-        uint32_t lateStencilTestPassCount = 0;
-        uint32_t lateStencilTestFailCount = 0;
+        uint32_t lateStencilTestPassCount  = 0;
+        uint32_t lateStencilTestFailCount  = 0;
     };
 
     struct CStats
@@ -76,12 +76,12 @@
     struct CullStats
     {
         uint32_t degeneratePrimCount = 0;
-        uint32_t backfacePrimCount = 0;
+        uint32_t backfacePrimCount   = 0;
     };
 
     struct AlphaStats
     {
-        uint32_t alphaTestCount = 0;
+        uint32_t alphaTestCount  = 0;
         uint32_t alphaBlendCount = 0;
     };
 
@@ -93,22 +93,78 @@
     class EventHandlerApiStats : public EventHandlerFile
     {
     public:
-        EventHandlerApiStats(uint32_t id) : EventHandlerFile(id) {}
+        EventHandlerApiStats(uint32_t id) : EventHandlerFile(id)
+        {
+#if defined(_WIN32)
+            // Windows only: best-effort copy of events.proto from KNOB_DEBUG_OUTPUT_DIR into
+            // this handler's output dir (mOutputDir minus its last character) so the proto
+            // definition is packaged with the stats. If the source file is missing or cannot
+            // be opened, nothing is copied and the user would need to specify the
+            // events.proto location when parsing the stats in post.
+            std::stringstream eventsProtoSrcFilename, eventsProtoDstFilename;
+            eventsProtoSrcFilename << KNOB_DEBUG_OUTPUT_DIR << "\\events.proto";
+            eventsProtoDstFilename << mOutputDir.substr(0, mOutputDir.size() - 1)
+                                   << "\\events.proto";
+
+            // If events.proto already exists at the destination we're done; else do the copy
+            struct stat buf; // Posix stat returns 0 when the path exists, non-zero otherwise
+            if (stat(eventsProtoDstFilename.str().c_str(), &buf) != 0)
+            {
+                // Now check to make sure the events.proto source exists
+                if (stat(eventsProtoSrcFilename.str().c_str(), &buf) == 0)
+                {
+                    std::ifstream srcFile;
+                    srcFile.open(eventsProtoSrcFilename.str().c_str(), std::ios::binary);
+                    if (srcFile.is_open())
+                    {
+                        // Just do a binary buffer copy
+                        std::ofstream dstFile;
+                        dstFile.open(eventsProtoDstFilename.str().c_str(), std::ios::binary);
+                        dstFile << srcFile.rdbuf();
+                        dstFile.close();
+                    }
+                    srcFile.close();
+                }
+            }
+#endif
+        }
 
         virtual void Handle(const DrawInstancedEvent& event)
         {
-            DrawInfoEvent e(event.data.drawId, ArchRast::Instanced, event.data.topology, 
-                event.data.numVertices, 0, 0, event.data.startVertex, event.data.numInstances, 
-                event.data.startInstance, event.data.tsEnable, event.data.gsEnable, event.data.soEnable, event.data.soTopology, event.data.splitId);
-            
+            DrawInfoEvent e(event.data.drawId,
+                            ArchRast::Instanced,
+                            event.data.topology,
+                            event.data.numVertices,
+                            0,
+                            0,
+                            event.data.startVertex,
+                            event.data.numInstances,
+                            event.data.startInstance,
+                            event.data.tsEnable,
+                            event.data.gsEnable,
+                            event.data.soEnable,
+                            event.data.soTopology,
+                            event.data.splitId);
+
             EventHandlerFile::Handle(e);
         }
 
         virtual void Handle(const DrawIndexedInstancedEvent& event)
         {
-            DrawInfoEvent e(event.data.drawId, ArchRast::IndexedInstanced, event.data.topology, 0,
-                event.data.numIndices, event.data.indexOffset, event.data.baseVertex, event.data.numInstances,
-                event.data.startInstance, event.data.tsEnable, event.data.gsEnable, event.data.soEnable, event.data.soTopology, event.data.splitId);
+            DrawInfoEvent e(event.data.drawId,
+                            ArchRast::IndexedInstanced,
+                            event.data.topology,
+                            0,
+                            event.data.numIndices,
+                            event.data.indexOffset,
+                            event.data.baseVertex,
+                            event.data.numInstances,
+                            event.data.startInstance,
+                            event.data.tsEnable,
+                            event.data.gsEnable,
+                            event.data.soEnable,
+                            event.data.soTopology,
+                            event.data.splitId);
 
             EventHandlerFile::Handle(e);
         }
@@ -121,131 +177,155 @@
     class EventHandlerWorkerStats : public EventHandlerFile
     {
     public:
-        EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false) {}
+        EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
+        {
+            memset(mShaderStats, 0, sizeof(mShaderStats));
+        }
 
         virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
         {
-            //earlyZ test compute
+            // earlyZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSSingleSample.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSingleSample.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSSingleSample.earlyZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //earlyStencil test compute
+            // earlyStencil: same pass/fail split, using stencilPassMask
             mDSSingleSample.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSingleSample.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSSingleSample.earlyStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
-            //earlyZ test single and multi sample
+            // earlyZ, also accumulated into the combined single/multi sample totals
             mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSCombined.earlyZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //earlyStencil test single and multi sample
+            // earlyStencil, also accumulated into the combined single/multi sample totals
             mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSCombined.earlyStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
             mNeedFlush = true;
         }
 
         virtual void Handle(const EarlyDepthStencilInfoSampleRate& event)
         {
-            //earlyZ test compute
+            // earlyZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSSampleRate.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSampleRate.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSSampleRate.earlyZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //earlyStencil test compute
+            // earlyStencil: same pass/fail split, using stencilPassMask
             mDSSampleRate.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSampleRate.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSSampleRate.earlyStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
-            //earlyZ test single and multi sample
+            // earlyZ, also accumulated into the combined single/multi sample totals
             mDSCombined.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSCombined.earlyZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //earlyStencil test single and multi sample
+            // earlyStencil, also accumulated into the combined single/multi sample totals
             mDSCombined.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSCombined.earlyStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
             mNeedFlush = true;
         }
 
         virtual void Handle(const EarlyDepthStencilInfoNullPS& event)
         {
-            //earlyZ test compute
+            // earlyZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSNullPS.earlyZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSNullPS.earlyZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSNullPS.earlyZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //earlyStencil test compute
+            // earlyStencil: same pass/fail split, using stencilPassMask
             mDSNullPS.earlyStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSNullPS.earlyStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSNullPS.earlyStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
             mNeedFlush = true;
         }
 
         virtual void Handle(const LateDepthStencilInfoSingleSample& event)
         {
-            //lateZ test compute
+            // lateZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSSingleSample.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSingleSample.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSSingleSample.lateZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //lateStencil test compute
+            // lateStencil: same pass/fail split, using stencilPassMask
             mDSSingleSample.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSingleSample.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSSingleSample.lateStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
-            //lateZ test single and multi sample
+            // lateZ, also accumulated into the combined single/multi sample totals
             mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSCombined.lateZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //lateStencil test single and multi sample
+            // lateStencil, also accumulated into the combined single/multi sample totals
             mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSCombined.lateStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
             mNeedFlush = true;
         }
 
         virtual void Handle(const LateDepthStencilInfoSampleRate& event)
         {
-            //lateZ test compute
+            // lateZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSSampleRate.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSSampleRate.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSSampleRate.lateZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //lateStencil test compute
+            // lateStencil: same pass/fail split, using stencilPassMask
             mDSSampleRate.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSSampleRate.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSSampleRate.lateStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
-
-            //lateZ test single and multi sample
+            // lateZ, also accumulated into the combined single/multi sample totals
             mDSCombined.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSCombined.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSCombined.lateZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //lateStencil test single and multi sample
+            // lateStencil, also accumulated into the combined single/multi sample totals
             mDSCombined.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSCombined.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSCombined.lateStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
 
             mNeedFlush = true;
         }
 
         virtual void Handle(const LateDepthStencilInfoNullPS& event)
         {
-            //lateZ test compute
+            // lateZ: pass = bits set in depthPassMask; fail = covered bits that did not pass
             mDSNullPS.lateZTestPassCount += _mm_popcnt_u32(event.data.depthPassMask);
-            mDSNullPS.lateZTestFailCount += _mm_popcnt_u32((!event.data.depthPassMask) & event.data.coverageMask);
+            mDSNullPS.lateZTestFailCount +=
+                _mm_popcnt_u32((~event.data.depthPassMask) & event.data.coverageMask);
 
-            //lateStencil test compute
+            // lateStencil: same pass/fail split, using stencilPassMask
             mDSNullPS.lateStencilTestPassCount += _mm_popcnt_u32(event.data.stencilPassMask);
-            mDSNullPS.lateStencilTestFailCount += _mm_popcnt_u32((!event.data.stencilPassMask) & event.data.coverageMask);
+            mDSNullPS.lateStencilTestFailCount +=
+                _mm_popcnt_u32((~event.data.stencilPassMask) & event.data.coverageMask);
             mNeedFlush = true;
         }
 
         virtual void Handle(const EarlyDepthInfoPixelRate& event)
         {
-            //earlyZ test compute
+            // earlyZ test compute
             mDSPixelRate.earlyZTestPassCount += event.data.depthPassCount;
-            mDSPixelRate.earlyZTestFailCount += (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
+            mDSPixelRate.earlyZTestFailCount +=
+                (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
             mNeedFlush = true;
         }
 
 
         virtual void Handle(const LateDepthInfoPixelRate& event)
         {
-            //lateZ test compute
+            // lateZ test compute
             mDSPixelRate.lateZTestPassCount += event.data.depthPassCount;
-            mDSPixelRate.lateZTestFailCount += (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
+            mDSPixelRate.lateZTestFailCount +=
+                (_mm_popcnt_u32(event.data.activeLanes) - event.data.depthPassCount);
             mNeedFlush = true;
         }
 
@@ -253,8 +333,10 @@
         virtual void Handle(const ClipInfoEvent& event)
         {
             mClipper.mustClipCount += _mm_popcnt_u32(event.data.clipMask);
-            mClipper.trivialRejectCount += event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
-            mClipper.trivialAcceptCount += _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
+            mClipper.trivialRejectCount +=
+                event.data.numInvocations - _mm_popcnt_u32(event.data.validMask);
+            mClipper.trivialAcceptCount +=
+                _mm_popcnt_u32(event.data.validMask & ~event.data.clipMask);
         }
 
         struct ShaderStats
@@ -297,58 +379,86 @@
         // Flush cached events for this draw
         virtual void FlushDraw(uint32_t drawId)
         {
-            if (mNeedFlush == false) return;
+            if (mNeedFlush == false)
+                return;
 
             EventHandlerFile::Handle(PSInfo(drawId, mShaderStats[SHADER_PIXEL].numInstExecuted));
             EventHandlerFile::Handle(CSInfo(drawId, mShaderStats[SHADER_COMPUTE].numInstExecuted));
 
-            //singleSample
-            EventHandlerFile::Handle(EarlyZSingleSample(drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
-            EventHandlerFile::Handle(LateZSingleSample(drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilSingleSample(drawId, mDSSingleSample.earlyStencilTestPassCount, mDSSingleSample.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencilSingleSample(drawId, mDSSingleSample.lateStencilTestPassCount, mDSSingleSample.lateStencilTestFailCount));
+            // singleSample
+            EventHandlerFile::Handle(EarlyZSingleSample(
+                drawId, mDSSingleSample.earlyZTestPassCount, mDSSingleSample.earlyZTestFailCount));
+            EventHandlerFile::Handle(LateZSingleSample(
+                drawId, mDSSingleSample.lateZTestPassCount, mDSSingleSample.lateZTestFailCount));
+            EventHandlerFile::Handle(
+                EarlyStencilSingleSample(drawId,
+                                         mDSSingleSample.earlyStencilTestPassCount,
+                                         mDSSingleSample.earlyStencilTestFailCount));
+            EventHandlerFile::Handle(
+                LateStencilSingleSample(drawId,
+                                        mDSSingleSample.lateStencilTestPassCount,
+                                        mDSSingleSample.lateStencilTestFailCount));
 
-            //sampleRate
-            EventHandlerFile::Handle(EarlyZSampleRate(drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount));
-            EventHandlerFile::Handle(LateZSampleRate(drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilSampleRate(drawId, mDSSampleRate.earlyStencilTestPassCount, mDSSampleRate.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencilSampleRate(drawId, mDSSampleRate.lateStencilTestPassCount, mDSSampleRate.lateStencilTestFailCount));
+            // sampleRate
+            EventHandlerFile::Handle(EarlyZSampleRate(
+                drawId, mDSSampleRate.earlyZTestPassCount, mDSSampleRate.earlyZTestFailCount));
+            EventHandlerFile::Handle(LateZSampleRate(
+                drawId, mDSSampleRate.lateZTestPassCount, mDSSampleRate.lateZTestFailCount));
+            EventHandlerFile::Handle(
+                EarlyStencilSampleRate(drawId,
+                                       mDSSampleRate.earlyStencilTestPassCount,
+                                       mDSSampleRate.earlyStencilTestFailCount));
+            EventHandlerFile::Handle(LateStencilSampleRate(drawId,
+                                                           mDSSampleRate.lateStencilTestPassCount,
+                                                           mDSSampleRate.lateStencilTestFailCount));
 
-            //combined
-            EventHandlerFile::Handle(EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount));
-            EventHandlerFile::Handle(LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencil(drawId, mDSCombined.earlyStencilTestPassCount, mDSCombined.earlyStencilTestFailCount));
-            EventHandlerFile::Handle(LateStencil(drawId, mDSCombined.lateStencilTestPassCount, mDSCombined.lateStencilTestFailCount));
+            // combined
+            EventHandlerFile::Handle(
+                EarlyZ(drawId, mDSCombined.earlyZTestPassCount, mDSCombined.earlyZTestFailCount));
+            EventHandlerFile::Handle(
+                LateZ(drawId, mDSCombined.lateZTestPassCount, mDSCombined.lateZTestFailCount));
+            EventHandlerFile::Handle(EarlyStencil(drawId,
+                                                  mDSCombined.earlyStencilTestPassCount,
+                                                  mDSCombined.earlyStencilTestFailCount));
+            EventHandlerFile::Handle(LateStencil(drawId,
+                                                 mDSCombined.lateStencilTestPassCount,
+                                                 mDSCombined.lateStencilTestFailCount));
 
-            //pixelRate
-            EventHandlerFile::Handle(EarlyZPixelRate(drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
-            EventHandlerFile::Handle(LateZPixelRate(drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
+            // pixelRate
+            EventHandlerFile::Handle(EarlyZPixelRate(
+                drawId, mDSPixelRate.earlyZTestPassCount, mDSPixelRate.earlyZTestFailCount));
+            EventHandlerFile::Handle(LateZPixelRate(
+                drawId, mDSPixelRate.lateZTestPassCount, mDSPixelRate.lateZTestFailCount));
 
 
-            //NullPS
-            EventHandlerFile::Handle(EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
-            EventHandlerFile::Handle(EarlyStencilNullPS(drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
+            // NullPS
+            EventHandlerFile::Handle(
+                EarlyZNullPS(drawId, mDSNullPS.earlyZTestPassCount, mDSNullPS.earlyZTestFailCount));
+            EventHandlerFile::Handle(EarlyStencilNullPS(
+                drawId, mDSNullPS.earlyStencilTestPassCount, mDSNullPS.earlyStencilTestFailCount));
 
             // Rasterized Subspans
             EventHandlerFile::Handle(RasterTiles(drawId, rastStats.rasterTiles));
 
             // Alpha Subspans
-            EventHandlerFile::Handle(AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
+            EventHandlerFile::Handle(
+                AlphaEvent(drawId, mAlphaStats.alphaTestCount, mAlphaStats.alphaBlendCount));
 
             // Primitive Culling
-            EventHandlerFile::Handle(CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
+            EventHandlerFile::Handle(
+                CullEvent(drawId, mCullStats.backfacePrimCount, mCullStats.degeneratePrimCount));
 
             mDSSingleSample = {};
-            mDSSampleRate = {};
-            mDSCombined = {};
-            mDSPixelRate = {};
+            mDSSampleRate   = {};
+            mDSCombined     = {};
+            mDSPixelRate    = {};
             mDSNullPS = {};
 
-            rastStats = {};
-            mCullStats = {};
+            rastStats   = {};
+            mCullStats  = {};
             mAlphaStats = {};
 
-            mShaderStats[SHADER_PIXEL] = {};
+            mShaderStats[SHADER_PIXEL]   = {};
             mShaderStats[SHADER_COMPUTE] = {};
 
             mNeedFlush = false;
@@ -356,31 +466,38 @@
 
         virtual void Handle(const FrontendDrawEndEvent& event)
         {
-            //Clipper
-            EventHandlerFile::Handle(ClipperEvent(event.data.drawId, mClipper.trivialRejectCount, mClipper.trivialAcceptCount, mClipper.mustClipCount));
+            // Clipper
+            EventHandlerFile::Handle(ClipperEvent(event.data.drawId,
+                                                  mClipper.trivialRejectCount,
+                                                  mClipper.trivialAcceptCount,
+                                                  mClipper.mustClipCount));
 
-            //Tesselator
+            // Tesselator
             EventHandlerFile::Handle(TessPrims(event.data.drawId, mTS.inputPrims));
 
-            //Geometry Shader
+            // Geometry Shader
             EventHandlerFile::Handle(GSInputPrims(event.data.drawId, mGS.inputPrimCount));
             EventHandlerFile::Handle(GSPrimsGen(event.data.drawId, mGS.primGeneratedCount));
             EventHandlerFile::Handle(GSVertsInput(event.data.drawId, mGS.vertsInput));
 
-            EventHandlerFile::Handle(VSInfo(event.data.drawId, mShaderStats[SHADER_VERTEX].numInstExecuted));
-            EventHandlerFile::Handle(HSInfo(event.data.drawId, mShaderStats[SHADER_HULL].numInstExecuted));
-            EventHandlerFile::Handle(DSInfo(event.data.drawId, mShaderStats[SHADER_DOMAIN].numInstExecuted));
-            EventHandlerFile::Handle(GSInfo(event.data.drawId, mShaderStats[SHADER_GEOMETRY].numInstExecuted));
+            EventHandlerFile::Handle(
+                VSInfo(event.data.drawId, mShaderStats[SHADER_VERTEX].numInstExecuted));
+            EventHandlerFile::Handle(
+                HSInfo(event.data.drawId, mShaderStats[SHADER_HULL].numInstExecuted));
+            EventHandlerFile::Handle(
+                DSInfo(event.data.drawId, mShaderStats[SHADER_DOMAIN].numInstExecuted));
+            EventHandlerFile::Handle(
+                GSInfo(event.data.drawId, mShaderStats[SHADER_GEOMETRY].numInstExecuted));
 
-            mShaderStats[SHADER_VERTEX] = {};
-            mShaderStats[SHADER_HULL] = {};
-            mShaderStats[SHADER_DOMAIN] = {};
+            mShaderStats[SHADER_VERTEX]   = {};
+            mShaderStats[SHADER_HULL]     = {};
+            mShaderStats[SHADER_DOMAIN]   = {};
             mShaderStats[SHADER_GEOMETRY] = {};
 
-            //Reset Internal Counters
+            // Reset Internal Counters
             mClipper = {};
-            mTS = {};
-            mGS = {};
+            mTS      = {};
+            mGS      = {};
         }
 
         virtual void Handle(const GSPrimInfo& event)
@@ -390,10 +507,7 @@
             mGS.vertsInput += event.data.vertsInput;
         }
 
-        virtual void Handle(const TessPrimCount& event)
-        {
-            mTS.inputPrims += event.data.primCount;
-        }
+        virtual void Handle(const TessPrimCount& event) { mTS.inputPrims += event.data.primCount; }
 
         virtual void Handle(const RasterTileCount& event)
         {
@@ -402,13 +516,15 @@
 
         virtual void Handle(const CullInfoEvent& event)
         {
-            mCullStats.degeneratePrimCount += _mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask));
-            mCullStats.backfacePrimCount   += _mm_popcnt_u32(event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask));
+            mCullStats.degeneratePrimCount += _mm_popcnt_u32(
+                event.data.validMask ^ (event.data.validMask & ~event.data.degeneratePrimMask));
+            mCullStats.backfacePrimCount += _mm_popcnt_u32(
+                event.data.validMask ^ (event.data.validMask & ~event.data.backfacePrimMask));
         }
 
         virtual void Handle(const AlphaInfoEvent& event)
         {
-            mAlphaStats.alphaTestCount  += event.data.alphaTestEnable;
+            mAlphaStats.alphaTestCount += event.data.alphaTestEnable;
             mAlphaStats.alphaBlendCount += event.data.alphaBlendEnable;
         }
 
@@ -416,17 +532,17 @@
         bool mNeedFlush;
         // Per draw stats
         DepthStencilStats mDSSingleSample = {};
-        DepthStencilStats mDSSampleRate = {};
-        DepthStencilStats mDSPixelRate = {};
-        DepthStencilStats mDSCombined = {};
-        DepthStencilStats mDSNullPS = {};
-        DepthStencilStats mDSOmZ = {};
-        CStats mClipper = {};
-        TEStats mTS = {};
-        GSStateInfo mGS = {};
-        RastStats rastStats = {};
-        CullStats mCullStats = {};
-        AlphaStats mAlphaStats = {};
+        DepthStencilStats mDSSampleRate   = {};
+        DepthStencilStats mDSPixelRate    = {};
+        DepthStencilStats mDSCombined     = {};
+        DepthStencilStats mDSNullPS       = {};
+        DepthStencilStats mDSOmZ          = {};
+        CStats            mClipper        = {};
+        TEStats           mTS             = {};
+        GSStateInfo       mGS             = {};
+        RastStats         rastStats       = {};
+        CullStats         mCullStats      = {};
+        AlphaStats        mAlphaStats     = {};
 
         ShaderStats mShaderStats[NUM_SHADER_TYPES];
 
@@ -442,7 +558,7 @@
     {
         // Can we assume single threaded here?
         static std::atomic<uint32_t> counter(0);
-        uint32_t id = counter.fetch_add(1);
+        uint32_t                     id = counter.fetch_add(1);
 
         EventManager* pManager = new EventManager();
 
@@ -497,4 +613,4 @@
 
         pManager->FlushDraw(drawId);
     }
-}
+} // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
index c74d6ad..d42c197 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
+++ b/src/gallium/drivers/swr/rasterizer/archrast/archrast.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file archrast.h
-*
-* @brief Definitions for archrast.
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file archrast.h
+ *
+ * @brief Definitions for archrast.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/os.h"
@@ -35,15 +35,14 @@
 {
     enum class AR_THREAD
     {
-        API = 0,
+        API    = 0,
         WORKER = 1
     };
 
     HANDLE CreateThreadContext(AR_THREAD type);
-    void DestroyThreadContext(HANDLE hThreadContext);
+    void   DestroyThreadContext(HANDLE hThreadContext);
 
     // Dispatch event for this thread.
     void Dispatch(HANDLE hThreadContext, const Event& event);
     void FlushDraw(HANDLE hThreadContext, uint32_t drawId);
-};
-
+}; // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h
index 10e0dce..118a100 100644
--- a/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h
+++ b/src/gallium/drivers/swr/rasterizer/archrast/eventmanager.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file archrast.h
-*
-* @brief Definitions for the event manager.
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file eventmanager.h
+ *
+ * @brief Definitions for the event manager.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/os.h"
@@ -78,12 +78,11 @@
                 pHandler->FlushDraw(drawId);
             }
         }
-    private:
 
+    private:
         // Handlers stay registered for life
         void Detach(EventHandler* pHandler) { SWR_INVALID("Should not be called"); }
 
         std::vector<EventHandler*> mHandlers;
     };
-};
-
+}; // namespace ArchRast
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
index 44a0cc8..60b749d 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_common.py
@@ -32,6 +32,12 @@
 from mako.exceptions import RichTraceback
 
 #==============================================================================
+def ConcatLists(list_of_lists):
+    '''Return a new list with the items of every entry of list_of_lists, in order.'''
+    # Flattens exactly one level; items inside each entry are copied as-is.
+    return [item for sub_list in list_of_lists for item in sub_list]
+
+#==============================================================================
 def MakeTmpDir(suffix=''):
     '''
         Create temporary directory for use in codegen scripts.
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index bced657..485403a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -53,11 +53,10 @@
     ['VPERMPS',     ['idx', 'a'], 'a'],
     ['VCVTPD2PS',   ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'],
     ['VCVTPH2PS',   ['a'], 'VectorType::get(mFP32Ty, a->getType()->getVectorNumElements())'],
-    ['VCVTPS2PH',   ['a', 'round'], 'mSimdFP16Ty'],
+    ['VCVTPS2PH',   ['a', 'round'], 'mSimdInt16Ty'],
     ['VHSUBPS',     ['a', 'b'], 'a'],
     ['VPTESTC',     ['a', 'b'], 'mInt32Ty'],
     ['VPTESTZ',     ['a', 'b'], 'mInt32Ty'],
-    ['VFMADDPS',    ['a', 'b', 'c'], 'a'],
     ['VPHADDD',     ['a', 'b'], 'a'],
     ['PDEP32',      ['a', 'b'], 'a'],
     ['RDTSC',       [], 'mInt64Ty'],
@@ -71,6 +70,7 @@
     ['STACKRESTORE', 'stackrestore', ['a'], []],
     ['VMINPS', 'minnum', ['a', 'b'], ['a']],
     ['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
+    ['VFMADDPS', 'fmuladd', ['a', 'b', 'c'], ['a']],
     ['DEBUGTRAP', 'debugtrap', [], []],
     ['POPCNT', 'ctpop', ['a'], ['a']],
     ['LOG2', 'log2', ['a'], ['a']],
@@ -161,7 +161,8 @@
                         func_name == 'CreateAlignmentAssumptionHelper' or
                         func_name == 'CreateGEP' or
                         func_name == 'CreateLoad' or
-                        func_name == 'CreateMaskedLoad'):
+                        func_name == 'CreateMaskedLoad' or
+                        func_name == 'CreateElementUnorderedAtomicMemCpy'):
                         ignore = True
 
                     # Convert CamelCase to CAMEL_CASE
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp
index 1ecb455..e696dd2 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.cpp
@@ -1,35 +1,36 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}
-*
-* @brief Implementation for events.  auto-generated file
-* 
-* DO NOT EDIT
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}
+ *
+ * @brief Implementation for events.  auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 #include "common/os.h"
 #include "gen_ar_event.hpp"
 #include "gen_ar_eventhandler.hpp"
@@ -42,3 +43,5 @@
     pHandler->Handle(*this);
 }
 % endfor
+// clang-format on
+
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp
index 685a10b..fe3f261 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_event.hpp
@@ -1,35 +1,36 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}
-*
-* @brief Definitions for events.  auto-generated file
-* 
-* DO NOT EDIT
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-* 
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}
+ *
+ * @brief Definitions for events.  auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 #pragma once
 
 #include "common/os.h"
@@ -46,7 +47,7 @@
     };
 % endfor
 
-    //Forward decl
+    // Forward decl
     class EventHandler;
 
     //////////////////////////////////////////////////////////////////////////
@@ -104,5 +105,6 @@
 
         virtual void Accept(EventHandler* pHandler) const;
     };
-% endfor
-}
\ No newline at end of file
+    % endfor
+} // namespace ArchRast
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp
index 87d0ef4..140dd00 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandler.hpp
@@ -1,35 +1,36 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}
-*
-* @brief Event handler interface.  auto-generated file
-* 
-* DO NOT EDIT
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}
+ *
+ * @brief Event handler interface.  auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 #pragma once
 
 #include "${event_header}"
@@ -51,4 +52,5 @@
         virtual void Handle(const ${name}& event) {}
 % endfor
     };
-}
+} // namespace ArchRast
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
index d1852b3..7c10e12 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_ar_eventhandlerfile.hpp
@@ -1,41 +1,43 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}
-*
-* @brief Event handler interface.  auto-generated file
-*
-* DO NOT EDIT
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}
+ *
+ * @brief Event handler interface.  auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 #pragma once
 
 #include "common/os.h"
 #include "${event_header}"
 #include <fstream>
 #include <sstream>
+#include <iostream>
 #include <thread>
 
 namespace ArchRast
@@ -46,17 +48,23 @@
     class EventHandlerFile : public EventHandler
     {
     public:
-        EventHandlerFile(uint32_t id)
-        : mBufOffset(0)
+        EventHandlerFile(uint32_t id) : mBufOffset(0)
         {
 #if defined(_WIN32)
             DWORD pid = GetCurrentProcessId();
             TCHAR procname[MAX_PATH];
             GetModuleFileName(NULL, procname, MAX_PATH);
-            const char* pBaseName = strrchr(procname, '\\');
+            const char*       pBaseName = strrchr(procname, '\\');
             std::stringstream outDir;
             outDir << KNOB_DEBUG_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
-            CreateDirectory(outDir.str().c_str(), NULL);
+            mOutputDir = outDir.str();
+            if (CreateDirectory(mOutputDir.c_str(), NULL))
+            {
+                std::cout << std::endl
+                          << "ArchRast Dir:       " << mOutputDir << std::endl
+                          << std::endl
+                          << std::flush;
+            }
 
             // There could be multiple threads creating thread pools. We
             // want to make sure they are uniquly identified by adding in
@@ -76,10 +84,7 @@
 #endif
         }
 
-        virtual ~EventHandlerFile()
-        {
-            FlushBuffer();
-        }
+        virtual ~EventHandlerFile() { FlushBuffer(); }
 
         //////////////////////////////////////////////////////////////////////////
         /// @brief Flush buffer to file.
@@ -105,7 +110,7 @@
                 file.write((char*)mBuffer, mBufOffset);
                 file.close();
 
-                mBufOffset = 0;
+                mBufOffset       = 0;
                 mHeaderBufOffset = 0; // Reset header offset so its no longer considered.
             }
             return true;
@@ -120,7 +125,8 @@
                 if (!FlushBuffer())
                 {
                     // Don't corrupt what's already in the buffer?
-                    /// @todo Maybe add corrupt marker to buffer here in case we can open file in future?
+                    /// @todo Maybe add corrupt marker to buffer here in case we can open file in
+                    /// future?
                     return;
                 }
             }
@@ -152,10 +158,12 @@
         }
 
         std::string mFilename;
+        std::string mOutputDir;
 
         static const uint32_t mBufferSize = 1024;
-        uint8_t mBuffer[mBufferSize];
+        uint8_t               mBuffer[mBufferSize];
         uint32_t mBufOffset{0};
         uint32_t mHeaderBufOffset{0};
     };
-}
+} // namespace ArchRast
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
index 088b1cd..b8da529 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_backend.cpp
@@ -19,11 +19,11 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
-// 
+//
 // @file BackendPixelRate${fileNum}.cpp
-// 
+//
 // @brief auto-generated file
-// 
+//
 // DO NOT EDIT
 //
 // Generation Command Line:
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
index bcbcb30..5182bc4 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
@@ -30,6 +30,7 @@
 //  ${'\n//    '.join(cmdline)}
 //
 //============================================================================
+// clang-format off
 #pragma once
 
 //============================================================================
@@ -57,10 +58,10 @@
     %for arg in func['types']:
     args.push_back(${arg}->getType());
     %endfor
-    Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
+    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
     return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
     %else:
-    Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
+    Function* pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
     return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
     %endif
 %else:
@@ -68,4 +69,5 @@
 %endif
 }
 
-%endfor
+% endfor
+    // clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
index 5625ef8..d0682c5 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_header_init.hpp
@@ -19,11 +19,11 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
-// 
+//
 // @file ${filename}
-// 
+//
 // @brief auto-generated file
-// 
+//
 // DO NOT EDIT
 //
 // Generation Command Line:
@@ -31,6 +31,8 @@
 //
 //============================================================================
 
+// clang-format off
+
 %for num in range(numFiles):
 void Init${tableName}${num}();
 %endfor
@@ -41,3 +43,4 @@
     Init${tableName}${num}();
     %endfor
 }
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
index cfdc370..9375569 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.cpp
@@ -1,35 +1,36 @@
 /******************************************************************************
-* Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}.cpp
-*
-* @brief Dynamic Knobs for Core.
-*
-* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}.cpp
+ *
+ * @brief Dynamic Knobs for Core.
+ *
+ * ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 <% calc_max_knob_len(knobs) %>
 % for inc in includes:
 #include <${inc}>
@@ -40,13 +41,14 @@
 //========================================================
 // Implementation
 //========================================================
-void KnobBase::autoExpandEnvironmentVariables(std::string &text)
+void KnobBase::autoExpandEnvironmentVariables(std::string& text)
 {
 #if (__GNUC__) && (GCC_VERSION < 409000)
     // <regex> isn't implemented prior to gcc-4.9.0
     // unix style variable replacement
     size_t start;
-    while ((start = text.find("${'${'}")) != std::string::npos) {
+    while ((start = text.find("${'${'}")) != std::string::npos)
+    {
         size_t end = text.find("}");
         if (end == std::string::npos)
             break;
@@ -54,7 +56,8 @@
         text.replace(start, end - start + 1, var);
     }
     // win32 style variable replacement
-    while ((start = text.find("%")) != std::string::npos) {
+    while ((start = text.find("%")) != std::string::npos)
+    {
         size_t end = text.find("%", start + 1);
         if (end == std::string::npos)
             break;
@@ -65,7 +68,7 @@
     {
         // unix style variable replacement
         static std::regex env("\\$\\{([^}]+)\\}");
-        std::smatch match;
+        std::smatch       match;
         while (std::regex_search(text, match, env))
         {
             const std::string var = GetEnv(match[1].str());
@@ -77,7 +80,7 @@
     {
         // win32 style variable replacement
         static std::regex env("\\%([^}]+)\\%");
-        std::smatch match;
+        std::smatch       match;
         while (std::regex_search(text, match, env))
         {
             const std::string var = GetEnv(match[1].str());
@@ -89,7 +92,6 @@
 #endif
 }
 
-
 //========================================================
 // Static Data Members
 //========================================================
@@ -113,7 +115,10 @@
     std::basic_stringstream<char> str;
     str << std::showbase << std::setprecision(1) << std::fixed;
 
-    if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; }
+    if (optPerLinePrefix == nullptr)
+    {
+        optPerLinePrefix = "";
+    }
 
     % for knob in knobs:
     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
@@ -157,3 +162,4 @@
         name_len = len(name)
         return ' '*(max_len - name_len)
 %>
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
index 4213f33..71dbdac 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_knobs.h
@@ -1,35 +1,36 @@
 /******************************************************************************
-* Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}.h
-*
-* @brief Dynamic Knobs for Core.
-*
-* ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
-*
-* Generation Command Line:
-*  ${'\n*    '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2015-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}.h
+ *
+ * @brief Dynamic Knobs for Core.
+ *
+ * ======================= AUTO GENERATED: DO NOT EDIT !!! ====================
+ *
+ * Generation Command Line:
+ *  ${'\n *    '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
 <% calc_max_knob_len(knobs) %>
 #pragma once
 #include <string>
@@ -38,11 +39,11 @@
 {
 private:
     // Update the input string.
-    static void autoExpandEnvironmentVariables(std::string &text);
+    static void autoExpandEnvironmentVariables(std::string& text);
 
 protected:
     // Leave input alone and return new string.
-    static std::string expandEnvironmentVariables(std::string const &input)
+    static std::string expandEnvironmentVariables(std::string const& input)
     {
         std::string text = input;
         autoExpandEnvironmentVariables(text);
@@ -50,7 +51,7 @@
     }
 
     template <typename T>
-    static T expandEnvironmentVariables(T const &input)
+    static T expandEnvironmentVariables(T const& input)
     {
         return input;
     }
@@ -60,8 +61,8 @@
 struct Knob : KnobBase
 {
 public:
-    const   T&  Value() const               { return m_Value; }
-    const   T&  Value(T const &newValue)
+    const T& Value() const { return m_Value; }
+    const T& Value(T const& newValue)
     {
         m_Value = expandEnvironmentVariables(newValue);
         return Value();
@@ -150,3 +151,4 @@
         name_len = len(name)
         return ' '*(max_len - name_len)
 %>
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
index 190e660..df2934f 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_llvm.hpp
@@ -1,35 +1,37 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file ${filename}
-*
-* @brief auto-generated file
-*
-* DO NOT EDIT
-*
-* Generation Command Line:
-*   ${'\n*     '.join(cmdline)}
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ${filename}
+ *
+ * @brief auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ * Generation Command Line:
+ *   ${'\n *     '.join(cmdline)}
+ *
+ ******************************************************************************/
+// clang-format off
+
 #pragma once
 
 namespace SwrJit
@@ -37,7 +39,7 @@
     using namespace llvm;
 
 %for type in types:
-    INLINE static StructType *Gen_${type['name']}(JitManager* pJitMgr)
+    INLINE static StructType* Gen_${type['name']}(JitManager* pJitMgr)
     {
         %if needs_ctx(type):
         LLVMContext& ctx = pJitMgr->mContext;
@@ -76,7 +78,7 @@
     %endfor
 
 %endfor
-} // ns SwrJit
+} // namespace SwrJit
 
 <%! # Global function definitions
     import os
@@ -98,3 +100,4 @@
         pad_amt = max_len - cur_len
         return ' '*pad_amt
 %>
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
index 06c8762..92e0f40 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_rasterizer.cpp
@@ -19,17 +19,18 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 // IN THE SOFTWARE.
-// 
+//
 // @file gen_rasterizer${fileNum}.cpp
-// 
+//
 // @brief auto-generated file
-// 
+//
 // DO NOT EDIT
 //
 // Generation Command Line:
 //  ${'\n//    '.join(cmdline)}
 //
 //============================================================================
+// clang-format off
 
 #include "core/rasterizer.h"
 #include "core/rasterizer_impl.h"
@@ -40,3 +41,4 @@
     ${func}
     %endfor
 }
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
index 1c086ff..e0800f5 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file formats.cpp
-*
-* @brief auto-generated file
-*
-* DO NOT EDIT
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file formats.cpp
+ *
+ * @brief auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ ******************************************************************************/
 
 #include "formats.h"
 
@@ -72,6842 +72,9227 @@
     // R32G32B32A32_FLOAT (0x0)
     {
         "R32G32B32A32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32A32_SINT (0x1)
     {
         "R32G32B32A32_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32A32_UINT (0x2)
     {
         "R32G32B32A32_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R64G64_FLOAT (0x5)
     {
         "R64G64_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 64, 64, 0, 0 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {64, 64, 0, 0},               // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32X32_FLOAT (0x6)
     {
         "R32G32B32X32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32A32_SSCALED (0x7)
     {
         "R32G32B32A32_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32A32_USCALED (0x8)
     {
         "R32G32B32A32_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x10)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x11)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x14)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x18)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x19)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R32G32B32A32_SFIXED (0x20)
     {
         "R32G32B32A32_SFIXED",
-        { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 32, 32 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 32, 32},             // Bits per component
+        128,                          // Bits per element
+        16,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x21)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x22)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x23)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x24)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x25)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x26)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x27)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x28)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x29)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x2F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x30)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x31)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x32)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x33)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x34)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x35)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x36)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x37)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x38)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x39)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x3F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R32G32B32_FLOAT (0x40)
     {
         "R32G32B32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32_SINT (0x41)
     {
         "R32G32B32_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32_UINT (0x42)
     {
         "R32G32B32_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x43)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x44)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R32G32B32_SSCALED (0x45)
     {
         "R32G32B32_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32B32_USCALED (0x46)
     {
         "R32G32B32_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x47)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x48)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x49)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x4F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R32G32B32_SFIXED (0x50)
     {
         "R32G32B32_SFIXED",
-        { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 32, 32, 32, 0 }, // Bits per component
-        96, // Bits per element
-        12, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {32, 32, 32, 0},              // Bits per component
+        96,                           // Bits per element
+        12,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x51)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x52)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x53)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x54)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x55)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x56)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x57)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x58)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x59)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x5F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x60)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x61)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x62)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x63)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x64)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x65)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x66)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x67)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x68)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x69)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x6F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x70)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x71)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x72)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x73)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x74)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x75)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x76)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x77)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x78)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x79)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x7F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R16G16B16A16_UNORM (0x80)
     {
         "R16G16B16A16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {16, 16, 16, 16},         // Bits per component
+        64,                       // Bits per element
+        8,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 65535.0f,
+         1.0f / 65535.0f,
+         1.0f / 65535.0f,
+         1.0f / 65535.0f}, // To float scale factor
+        1,                 // bcWidth
+        1,                 // bcHeight
     },
 
     // R16G16B16A16_SNORM (0x81)
     {
         "R16G16B16A16_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {16, 16, 16, 16},         // Bits per component
+        64,                       // Bits per element
+        8,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 32767.0f,
+         1.0f / 32767.0f,
+         1.0f / 32767.0f,
+         1.0f / 32767.0f}, // To float scale factor
+        1,                 // bcWidth
+        1,                 // bcHeight
     },
 
     // R16G16B16A16_SINT (0x82)
     {
         "R16G16B16A16_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16A16_UINT (0x83)
     {
         "R16G16B16A16_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16A16_FLOAT (0x84)
     {
         "R16G16B16A16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32_FLOAT (0x85)
     {
         "R32G32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32_SINT (0x86)
     {
         "R32G32_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32_UINT (0x87)
     {
         "R32G32_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_FLOAT_X8X24_TYPELESS (0x88)
     {
         "R32_FLOAT_X8X24_TYPELESS",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // X32_TYPELESS_G8X24_UINT (0x89)
     {
         "X32_TYPELESS_G8X24_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNUSED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L32A32_FLOAT (0x8A)
     {
         "L32A32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x8B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x8C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R64_FLOAT (0x8D)
     {
         "R64_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 64, 0, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {64, 0, 0, 0},                // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16X16_UNORM (0x8E)
     {
         "R16G16B16X16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},     // Defaults for missing components
+        {0, 1, 2, 3},              // Swizzle
+        {16, 16, 16, 16},          // Bits per component
+        64,                        // Bits per element
+        8,                         // Bytes per element
+        4,                         // Num components
+        false,                     // isSRGB
+        false,                     // isBC
+        false,                     // isSubsampled
+        false,                     // isLuminance
+        {true, true, true, false}, // Is normalized?
+        {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f}, // To float scale factor
+        1,                                                         // bcWidth
+        1,                                                         // bcHeight
     },
 
     // R16G16B16X16_FLOAT (0x8F)
     {
         "R16G16B16X16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x90)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L32X32_FLOAT (0x91)
     {
         "L32X32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // I32X32_FLOAT (0x92)
     {
         "I32X32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16A16_SSCALED (0x93)
     {
         "R16G16B16A16_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16A16_USCALED (0x94)
     {
         "R16G16B16A16_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 16, 16, 16, 16 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {16, 16, 16, 16},             // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32_SSCALED (0x95)
     {
         "R32G32_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32G32_USCALED (0x96)
     {
         "R32G32_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x97)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x98)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x99)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x9F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R32G32_SFIXED (0xA0)
     {
         "R32G32_SFIXED",
-        { SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 32, 32, 0, 0 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SFIXED, SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {32, 32, 0, 0},               // Bits per component
+        64,                           // Bits per element
+        8,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xA1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xA9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xAF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xB9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xBF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // B8G8R8A8_UNORM (0xC0)
     {
         "B8G8R8A8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {2, 1, 0, 3},             // Swizzle
+        {8, 8, 8, 8},             // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
+        1,                                                            // bcWidth
+        1,                                                            // bcHeight
     },
 
     // B8G8R8A8_UNORM_SRGB (0xC1)
     {
         "B8G8R8A8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {2, 1, 0, 3},             // Swizzle
+        {8, 8, 8, 8},             // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        true,                     // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
+        1,                                                            // bcWidth
+        1,                                                            // bcHeight
     },
 
     // R10G10B10A2_UNORM (0xC2)
     {
         "R10G10B10A2_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
+        1,                                                             // bcWidth
+        1,                                                             // bcHeight
     },
 
     // R10G10B10A2_UNORM_SRGB (0xC3)
     {
         "R10G10B10A2_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        true,                     // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
+        1,                                                             // bcWidth
+        1,                                                             // bcHeight
     },
 
     // R10G10B10A2_UINT (0xC4)
     {
         "R10G10B10A2_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xC5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xC6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8G8B8A8_UNORM (0xC7)
     {
         "R8G8B8A8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {8, 8, 8, 8},             // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
+        1,                                                            // bcWidth
+        1,                                                            // bcHeight
     },
 
     // R8G8B8A8_UNORM_SRGB (0xC8)
     {
         "R8G8B8A8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {8, 8, 8, 8},             // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        true,                     // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f}, // To float scale factor
+        1,                                                            // bcWidth
+        1,                                                            // bcHeight
     },
 
     // R8G8B8A8_SNORM (0xC9)
     {
         "R8G8B8A8_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {8, 8, 8, 8},             // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f}, // To float scale factor
+        1,                                                            // bcWidth
+        1,                                                            // bcHeight
     },
 
     // R8G8B8A8_SINT (0xCA)
     {
         "R8G8B8A8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8B8A8_UINT (0xCB)
     {
         "R8G8B8A8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16_UNORM (0xCC)
     {
         "R16G16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
+        {0, 1, 0, 0},                             // Swizzle
+        {16, 16, 0, 0},                           // Bits per component
+        32,                                       // Bits per element
+        4,                                        // Bytes per element
+        2,                                        // Num components
+        false,                                    // isSRGB
+        false,                                    // isBC
+        false,                                    // isSubsampled
+        false,                                    // isLuminance
+        {true, true, false, false},               // Is normalized?
+        {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor
+        1,                                        // bcWidth
+        1,                                        // bcHeight
     },
 
     // R16G16_SNORM (0xCD)
     {
         "R16G16_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
+        {0, 1, 0, 0},                             // Swizzle
+        {16, 16, 0, 0},                           // Bits per component
+        32,                                       // Bits per element
+        4,                                        // Bytes per element
+        2,                                        // Num components
+        false,                                    // isSRGB
+        false,                                    // isBC
+        false,                                    // isSubsampled
+        false,                                    // isLuminance
+        {true, true, false, false},               // Is normalized?
+        {1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0}, // To float scale factor
+        1,                                        // bcWidth
+        1,                                        // bcHeight
     },
 
     // R16G16_SINT (0xCE)
     {
         "R16G16_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16_UINT (0xCF)
     {
         "R16G16_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16_FLOAT (0xD0)
     {
         "R16G16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10A2_UNORM (0xD1)
     {
         "B10G10R10A2_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {2, 1, 0, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
+        1,                                                             // bcWidth
+        1,                                                             // bcHeight
     },
 
     // B10G10R10A2_UNORM_SRGB (0xD2)
     {
         "B10G10R10A2_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {2, 1, 0, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        true,                     // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f}, // To float scale factor
+        1,                                                             // bcWidth
+        1,                                                             // bcHeight
     },
 
     // R11G11B10_FLOAT (0xD3)
     {
         "R11G11B10_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 11, 11, 10, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {11, 11, 10, 0},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xD4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
 
     // R10G10B10_FLOAT_A2_UNORM (0xD5)
     {
         "R10G10B10_FLOAT_A2_UNORM",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f / 3.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},           // Defaults for missing components
+        {0, 1, 2, 3},                    // Swizzle
+        {10, 10, 10, 2},                 // Bits per component
+        32,                              // Bits per element
+        4,                               // Bytes per element
+        4,                               // Num components
+        false,                           // isSRGB
+        false,                           // isBC
+        false,                           // isSubsampled
+        false,                           // isLuminance
+        {false, false, false, false},    // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f / 3.0f}, // To float scale factor
+        1,                               // bcWidth
+        1,                               // bcHeight
     },
 
     // R32_SINT (0xD6)
     {
         "R32_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_UINT (0xD7)
     {
         "R32_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_FLOAT (0xD8)
     {
         "R32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R24_UNORM_X8_TYPELESS (0xD9)
     {
         "R24_UNORM_X8_TYPELESS",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 24, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},         // Defaults for missing components
+        {0, 1, 2, 3},                  // Swizzle
+        {24, 0, 0, 0},                 // Bits per component
+        32,                            // Bits per element
+        4,                             // Bytes per element
+        1,                             // Num components
+        false,                         // isSRGB
+        false,                         // isBC
+        false,                         // isSubsampled
+        false,                         // isLuminance
+        {true, false, false, false},   // Is normalized?
+        {1.0f / 16777215.0f, 0, 0, 0}, // To float scale factor
+        1,                             // bcWidth
+        1,                             // bcHeight
     },
 
     // X24_TYPELESS_G8_UINT (0xDA)
     {
         "X24_TYPELESS_G8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 1, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {1, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xDB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xDC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L32_UNORM (0xDD)
     {
         "L32_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 4294967295.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},           // Defaults for missing components
+        {0, 0, 0, 0},                    // Swizzle
+        {32, 0, 0, 0},                   // Bits per component
+        32,                              // Bits per element
+        4,                               // Bytes per element
+        1,                               // Num components
+        false,                           // isSRGB
+        false,                           // isBC
+        false,                           // isSubsampled
+        true,                            // isLuminance
+        {true, false, false, false},     // Is normalized?
+        {1.0f / 4294967295.0f, 0, 0, 0}, // To float scale factor
+        1,                               // bcWidth
+        1,                               // bcHeight
     },
 
     // padding (0xDE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L16A16_UNORM (0xDF)
     {
         "L16A16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                    // Defaults for missing components
+        {0, 3, 0, 0},                             // Swizzle
+        {16, 16, 0, 0},                           // Bits per component
+        32,                                       // Bits per element
+        4,                                        // Bytes per element
+        2,                                        // Num components
+        false,                                    // isSRGB
+        false,                                    // isBC
+        false,                                    // isSubsampled
+        true,                                     // isLuminance
+        {true, true, false, false},               // Is normalized?
+        {1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0}, // To float scale factor
+        1,                                        // bcWidth
+        1,                                        // bcHeight
     },
 
     // I24X8_UNORM (0xE0)
     {
         "I24X8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 24, 8, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                     // Defaults for missing components
+        {0, 3, 0, 0},                              // Swizzle
+        {24, 8, 0, 0},                             // Bits per component
+        32,                                        // Bits per element
+        4,                                         // Bytes per element
+        2,                                         // Num components
+        false,                                     // isSRGB
+        false,                                     // isBC
+        false,                                     // isSubsampled
+        true,                                      // isLuminance
+        {true, true, false, false},                // Is normalized?
+        {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
+        1,                                         // bcWidth
+        1,                                         // bcHeight
     },
 
     // L24X8_UNORM (0xE1)
     {
         "L24X8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 24, 8, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                     // Defaults for missing components
+        {0, 3, 0, 0},                              // Swizzle
+        {24, 8, 0, 0},                             // Bits per component
+        32,                                        // Bits per element
+        4,                                         // Bytes per element
+        2,                                         // Num components
+        false,                                     // isSRGB
+        false,                                     // isBC
+        false,                                     // isSubsampled
+        true,                                      // isLuminance
+        {true, true, false, false},                // Is normalized?
+        {1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
+        1,                                         // bcWidth
+        1,                                         // bcHeight
     },
 
     // padding (0xE2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // I32_FLOAT (0xE3)
     {
         "I32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L32_FLOAT (0xE4)
     {
         "L32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // A32_FLOAT (0xE5)
     {
         "A32_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {3, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xE6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xE7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xE8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // B8G8R8X8_UNORM (0xE9)
     {
         "B8G8R8X8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
+        {2, 1, 0, 3},                                        // Swizzle
+        {8, 8, 8, 8},                                        // Bits per component
+        32,                                                  // Bits per element
+        4,                                                   // Bytes per element
+        4,                                                   // Num components
+        false,                                               // isSRGB
+        false,                                               // isBC
+        false,                                               // isSubsampled
+        false,                                               // isLuminance
+        {true, true, true, false},                           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
+        1,                                                   // bcWidth
+        1,                                                   // bcHeight
     },
 
     // B8G8R8X8_UNORM_SRGB (0xEA)
     {
         "B8G8R8X8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
+        {2, 1, 0, 3},                                        // Swizzle
+        {8, 8, 8, 8},                                        // Bits per component
+        32,                                                  // Bits per element
+        4,                                                   // Bytes per element
+        4,                                                   // Num components
+        true,                                                // isSRGB
+        false,                                               // isBC
+        false,                                               // isSubsampled
+        false,                                               // isLuminance
+        {true, true, true, false},                           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
+        1,                                                   // bcWidth
+        1,                                                   // bcHeight
     },
 
     // R8G8B8X8_UNORM (0xEB)
     {
         "R8G8B8X8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
+        {0, 1, 2, 3},                                        // Swizzle
+        {8, 8, 8, 8},                                        // Bits per component
+        32,                                                  // Bits per element
+        4,                                                   // Bytes per element
+        4,                                                   // Num components
+        false,                                               // isSRGB
+        false,                                               // isBC
+        false,                                               // isSubsampled
+        false,                                               // isLuminance
+        {true, true, true, false},                           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
+        1,                                                   // bcWidth
+        1,                                                   // bcHeight
     },
 
     // R8G8B8X8_UNORM_SRGB (0xEC)
     {
         "R8G8B8X8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                               // Defaults for missing components
+        {0, 1, 2, 3},                                        // Swizzle
+        {8, 8, 8, 8},                                        // Bits per component
+        32,                                                  // Bits per element
+        4,                                                   // Bytes per element
+        4,                                                   // Num components
+        true,                                                // isSRGB
+        false,                                               // isBC
+        false,                                               // isSubsampled
+        false,                                               // isLuminance
+        {true, true, true, false},                           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f}, // To float scale factor
+        1,                                                   // bcWidth
+        1,                                                   // bcHeight
     },
 
     // R9G9B9E5_SHAREDEXP (0xED)
     {
         "R9G9B9E5_SHAREDEXP",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 9, 9, 9, 5 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {9, 9, 9, 5},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10X2_UNORM (0xEE)
     {
         "B10G10R10X2_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
+        {2, 1, 0, 3},                                           // Swizzle
+        {10, 10, 10, 2},                                        // Bits per component
+        32,                                                     // Bits per element
+        4,                                                      // Bytes per element
+        4,                                                      // Num components
+        false,                                                  // isSRGB
+        false,                                                  // isBC
+        false,                                                  // isSubsampled
+        false,                                                  // isLuminance
+        {true, true, true, false},                              // Is normalized?
+        {1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f}, // To float scale factor
+        1,                                                      // bcWidth
+        1,                                                      // bcHeight
     },
 
     // padding (0xEF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L16A16_FLOAT (0xF0)
     {
         "L16A16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xF1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xF2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R10G10B10X2_USCALED (0xF3)
     {
         "R10G10B10X2_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8B8A8_SSCALED (0xF4)
     {
         "R8G8B8A8_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8B8A8_USCALED (0xF5)
     {
         "R8G8B8A8_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16_SSCALED (0xF6)
     {
         "R16G16_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16_USCALED (0xF7)
     {
         "R16G16_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 16, 16, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {16, 16, 0, 0},               // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_SSCALED (0xF8)
     {
         "R32_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_USCALED (0xF9)
     {
         "R32_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0xFA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xFB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xFC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xFD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xFE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0xFF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // B5G6R5_UNORM (0x100)
     {
         "B5G6R5_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 0 }, // Swizzle
-        { 5, 6, 5, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                         // Defaults for missing components
+        {2, 1, 0, 0},                                  // Swizzle
+        {5, 6, 5, 0},                                  // Bits per component
+        16,                                            // Bits per element
+        2,                                             // Bytes per element
+        3,                                             // Num components
+        false,                                         // isSRGB
+        false,                                         // isBC
+        false,                                         // isSubsampled
+        false,                                         // isLuminance
+        {true, true, true, false},                     // Is normalized?
+        {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
+        1,                                             // bcWidth
+        1,                                             // bcHeight
     },
 
     // B5G6R5_UNORM_SRGB (0x101)
     {
         "B5G6R5_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 0 }, // Swizzle
-        { 5, 6, 5, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        3, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                         // Defaults for missing components
+        {2, 1, 0, 0},                                  // Swizzle
+        {5, 6, 5, 0},                                  // Bits per component
+        16,                                            // Bits per element
+        2,                                             // Bytes per element
+        3,                                             // Num components
+        true,                                          // isSRGB
+        false,                                         // isBC
+        false,                                         // isSubsampled
+        false,                                         // isLuminance
+        {true, true, true, false},                     // Is normalized?
+        {1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0}, // To float scale factor
+        1,                                             // bcWidth
+        1,                                             // bcHeight
     },
 
     // B5G5R5A1_UNORM (0x102)
     {
         "B5G5R5A1_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 5, 5, 5, 1 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
+        {2, 1, 0, 3},                                            // Swizzle
+        {5, 5, 5, 1},                                            // Bits per component
+        16,                                                      // Bits per element
+        2,                                                       // Bytes per element
+        4,                                                       // Num components
+        false,                                                   // isSRGB
+        false,                                                   // isBC
+        false,                                                   // isSubsampled
+        false,                                                   // isLuminance
+        {true, true, true, true},                                // Is normalized?
+        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor
+        1,                                                       // bcWidth
+        1,                                                       // bcHeight
     },
 
     // B5G5R5A1_UNORM_SRGB (0x103)
     {
         "B5G5R5A1_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 5, 5, 5, 1 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
+        {2, 1, 0, 3},                                            // Swizzle
+        {5, 5, 5, 1},                                            // Bits per component
+        16,                                                      // Bits per element
+        2,                                                       // Bytes per element
+        4,                                                       // Num components
+        true,                                                    // isSRGB
+        false,                                                   // isBC
+        false,                                                   // isSubsampled
+        false,                                                   // isLuminance
+        {true, true, true, true},                                // Is normalized?
+        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f}, // To float scale factor
+        1,                                                       // bcWidth
+        1,                                                       // bcHeight
     },
 
     // B4G4R4A4_UNORM (0x104)
     {
         "B4G4R4A4_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 4, 4, 4, 4 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
+        {2, 1, 0, 3},                                             // Swizzle
+        {4, 4, 4, 4},                                             // Bits per component
+        16,                                                       // Bits per element
+        2,                                                        // Bytes per element
+        4,                                                        // Num components
+        false,                                                    // isSRGB
+        false,                                                    // isBC
+        false,                                                    // isSubsampled
+        false,                                                    // isLuminance
+        {true, true, true, true},                                 // Is normalized?
+        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
+        1,                                                        // bcWidth
+        1,                                                        // bcHeight
     },
 
     // B4G4R4A4_UNORM_SRGB (0x105)
     {
         "B4G4R4A4_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 4, 4, 4, 4 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
+        {2, 1, 0, 3},                                             // Swizzle
+        {4, 4, 4, 4},                                             // Bits per component
+        16,                                                       // Bits per element
+        2,                                                        // Bytes per element
+        4,                                                        // Num components
+        true,                                                     // isSRGB
+        false,                                                    // isBC
+        false,                                                    // isSubsampled
+        false,                                                    // isLuminance
+        {true, true, true, true},                                 // Is normalized?
+        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
+        1,                                                        // bcWidth
+        1,                                                        // bcHeight
     },
 
     // R8G8_UNORM (0x106)
     {
         "R8G8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                // Defaults for missing components
+        {0, 1, 0, 0},                         // Swizzle
+        {8, 8, 0, 0},                         // Bits per component
+        16,                                   // Bits per element
+        2,                                    // Bytes per element
+        2,                                    // Num components
+        false,                                // isSRGB
+        false,                                // isBC
+        false,                                // isSubsampled
+        false,                                // isLuminance
+        {true, true, false, false},           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
+        1,                                    // bcWidth
+        1,                                    // bcHeight
     },
 
     // R8G8_SNORM (0x107)
     {
         "R8G8_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                // Defaults for missing components
+        {0, 1, 0, 0},                         // Swizzle
+        {8, 8, 0, 0},                         // Bits per component
+        16,                                   // Bits per element
+        2,                                    // Bytes per element
+        2,                                    // Num components
+        false,                                // isSRGB
+        false,                                // isBC
+        false,                                // isSubsampled
+        false,                                // isLuminance
+        {true, true, false, false},           // Is normalized?
+        {1.0f / 127.0f, 1.0f / 127.0f, 0, 0}, // To float scale factor
+        1,                                    // bcWidth
+        1,                                    // bcHeight
     },
 
     // R8G8_SINT (0x108)
     {
         "R8G8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8_UINT (0x109)
     {
         "R8G8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16_UNORM (0x10A)
     {
         "R16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {16, 0, 0, 0},               // Bits per component
+        16,                          // Bits per element
+        2,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // R16_SNORM (0x10B)
     {
         "R16_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {16, 0, 0, 0},               // Bits per component
+        16,                          // Bits per element
+        2,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 32767.0f, 0, 0, 0},  // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // R16_SINT (0x10C)
     {
         "R16_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16_UINT (0x10D)
     {
         "R16_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16_FLOAT (0x10E)
     {
         "R16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x10F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x110)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // I16_UNORM (0x111)
     {
         "I16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {16, 0, 0, 0},               // Bits per component
+        16,                          // Bits per element
+        2,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        true,                        // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // L16_UNORM (0x112)
     {
         "L16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {16, 0, 0, 0},               // Bits per component
+        16,                          // Bits per element
+        2,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        true,                        // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // A16_UNORM (0x113)
     {
         "A16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {3, 0, 0, 0},                // Swizzle
+        {16, 0, 0, 0},               // Bits per component
+        16,                          // Bits per element
+        2,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 65535.0f, 0, 0, 0},  // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // L8A8_UNORM (0x114)
     {
         "L8A8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                // Defaults for missing components
+        {0, 3, 0, 0},                         // Swizzle
+        {8, 8, 0, 0},                         // Bits per component
+        16,                                   // Bits per element
+        2,                                    // Bytes per element
+        2,                                    // Num components
+        false,                                // isSRGB
+        false,                                // isBC
+        false,                                // isSubsampled
+        true,                                 // isLuminance
+        {true, true, false, false},           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
+        1,                                    // bcWidth
+        1,                                    // bcHeight
     },
 
     // I16_FLOAT (0x115)
     {
         "I16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L16_FLOAT (0x116)
     {
         "L16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // A16_FLOAT (0x117)
     {
         "A16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {3, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L8A8_UNORM_SRGB (0x118)
     {
         "L8A8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, true, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                // Defaults for missing components
+        {0, 3, 0, 0},                         // Swizzle
+        {8, 8, 0, 0},                         // Bits per component
+        16,                                   // Bits per element
+        2,                                    // Bytes per element
+        2,                                    // Num components
+        true,                                 // isSRGB
+        false,                                // isBC
+        false,                                // isSubsampled
+        true,                                 // isLuminance
+        {true, true, false, false},           // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 0, 0}, // To float scale factor
+        1,                                    // bcWidth
+        1,                                    // bcHeight
     },
 
     // padding (0x119)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // B5G5R5X1_UNORM (0x11A)
     {
         "B5G5R5X1_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 5, 5, 5, 1 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
+        {2, 1, 0, 3},                                     // Swizzle
+        {5, 5, 5, 1},                                     // Bits per component
+        16,                                               // Bits per element
+        2,                                                // Bytes per element
+        4,                                                // Num components
+        false,                                            // isSRGB
+        false,                                            // isBC
+        false,                                            // isSubsampled
+        false,                                            // isLuminance
+        {true, true, true, false},                        // Is normalized?
+        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor
+        1,                                                // bcWidth
+        1,                                                // bcHeight
     },
 
     // B5G5R5X1_UNORM_SRGB (0x11B)
     {
         "B5G5R5X1_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 5, 5, 5, 1 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED},
+        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
+        {2, 1, 0, 3},                                     // Swizzle
+        {5, 5, 5, 1},                                     // Bits per component
+        16,                                               // Bits per element
+        2,                                                // Bytes per element
+        4,                                                // Num components
+        true,                                             // isSRGB
+        false,                                            // isBC
+        false,                                            // isSubsampled
+        false,                                            // isLuminance
+        {true, true, true, false},                        // Is normalized?
+        {1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f}, // To float scale factor
+        1,                                                // bcWidth
+        1,                                                // bcHeight
     },
 
     // R8G8_SSCALED (0x11C)
     {
         "R8G8_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8_USCALED (0x11D)
     {
         "R8G8_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16_SSCALED (0x11E)
     {
         "R16_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16_USCALED (0x11F)
     {
         "R16_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 16, 0, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {16, 0, 0, 0},                // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x120)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x121)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x122)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x123)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // A1B5G5R5_UNORM (0x124)
     {
         "A1B5G5R5_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 2, 1, 0 }, // Swizzle
-        { 1, 5, 5, 5 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                   // Defaults for missing components
+        {3, 2, 1, 0},                                            // Swizzle
+        {1, 5, 5, 5},                                            // Bits per component
+        16,                                                      // Bits per element
+        2,                                                       // Bytes per element
+        4,                                                       // Num components
+        false,                                                   // isSRGB
+        false,                                                   // isBC
+        false,                                                   // isSubsampled
+        false,                                                   // isLuminance
+        {true, true, true, true},                                // Is normalized?
+        {1.0f / 1.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f}, // To float scale factor
+        1,                                                       // bcWidth
+        1,                                                       // bcHeight
     },
 
     // A4B4G4R4_UNORM (0x125)
     {
         "A4B4G4R4_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 2, 1, 0 }, // Swizzle
-        { 4, 4, 4, 4 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM},
+        {0, 0, 0, 0x3f800000},                                    // Defaults for missing components
+        {3, 2, 1, 0},                                             // Swizzle
+        {4, 4, 4, 4},                                             // Bits per component
+        16,                                                       // Bits per element
+        2,                                                        // Bytes per element
+        4,                                                        // Num components
+        false,                                                    // isSRGB
+        false,                                                    // isBC
+        false,                                                    // isSubsampled
+        false,                                                    // isLuminance
+        {true, true, true, true},                                 // Is normalized?
+        {1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f}, // To float scale factor
+        1,                                                        // bcWidth
+        1,                                                        // bcHeight
     },
 
     // L8A8_UINT (0x126)
     {
         "L8A8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L8A8_SINT (0x127)
     {
         "L8A8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 3, 0, 0 }, // Swizzle
-        { 8, 8, 0, 0 }, // Bits per component
-        16, // Bits per element
-        2, // Bytes per element
-        2, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 3, 0, 0},                 // Swizzle
+        {8, 8, 0, 0},                 // Bits per component
+        16,                           // Bits per element
+        2,                            // Bytes per element
+        2,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 0, 0},           // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x128)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x129)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x12F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x130)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x131)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x132)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x133)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x134)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x135)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x136)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x137)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x138)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x139)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x13F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8_UNORM (0x140)
     {
         "R8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // R8_SNORM (0x141)
     {
         "R8_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // R8_SINT (0x142)
     {
         "R8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8_UINT (0x143)
     {
         "R8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // A8_UNORM (0x144)
     {
         "A8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 3, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {3, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // I8_UNORM (0x145)
     {
         "I8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        true,                        // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // L8_UNORM (0x146)
     {
         "L8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        true,                        // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // padding (0x147)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x148)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8_SSCALED (0x149)
     {
         "R8_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8_USCALED (0x14A)
     {
         "R8_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x14B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L8_UNORM_SRGB (0x14C)
     {
         "L8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 0, 0, 0},                // Swizzle
+        {8, 0, 0, 0},                // Bits per component
+        8,                           // Bits per element
+        1,                           // Bytes per element
+        1,                           // Num components
+        true,                        // isSRGB
+        false,                       // isBC
+        false,                       // isSubsampled
+        true,                        // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        1,                           // bcWidth
+        1,                           // bcHeight
     },
 
     // padding (0x14D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x14E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x14F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x150)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x151)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // L8_UINT (0x152)
     {
         "L8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // L8_SINT (0x153)
     {
         "L8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // I8_UINT (0x154)
     {
         "I8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // I8_SINT (0x155)
     {
         "I8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        true, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        true,                         // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x156)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x157)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x158)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x159)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x15F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x160)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x161)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x162)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x163)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x164)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x165)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x166)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x167)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x168)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x169)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x16F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x170)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x171)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x172)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x173)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x174)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x175)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x176)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x177)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x178)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x179)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17A)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17B)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17C)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17D)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x17F)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // DXT1_RGB_SRGB (0x180)
     {
         "DXT1_RGB_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // padding (0x181)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x182)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // YCRCB_SWAPUVY (0x183)
     {
         "YCRCB_SWAPUVY",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        true, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        2, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        true,                         // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        2,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x184)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x185)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // BC1_UNORM (0x186)
     {
         "BC1_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC2_UNORM (0x187)
     {
         "BC2_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC3_UNORM (0x188)
     {
         "BC3_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC4_UNORM (0x189)
     {
         "BC4_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC5_UNORM (0x18A)
     {
         "BC5_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC1_UNORM_SRGB (0x18B)
     {
         "BC1_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        true, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        true,                        // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC2_UNORM_SRGB (0x18C)
     {
         "BC2_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        true, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        true,                        // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC3_UNORM_SRGB (0x18D)
     {
         "BC3_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        true, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        true,                        // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // padding (0x18E)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // YCRCB_SWAPUV (0x18F)
     {
         "YCRCB_SWAPUV",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        true, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        2, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 8, 8, 8},                 // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        true,                         // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        2,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x190)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // DXT1_RGB (0x191)
     {
         "DXT1_RGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // padding (0x192)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8G8B8_UNORM (0x193)
     {
         "R8G8B8_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
+        {0, 1, 2, 0},                                     // Swizzle
+        {8, 8, 8, 0},                                     // Bits per component
+        24,                                               // Bits per element
+        3,                                                // Bytes per element
+        3,                                                // Num components
+        false,                                            // isSRGB
+        false,                                            // isBC
+        false,                                            // isSubsampled
+        false,                                            // isLuminance
+        {true, true, true, false},                        // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor
+        1,                                                // bcWidth
+        1,                                                // bcHeight
     },
 
     // R8G8B8_SNORM (0x194)
     {
         "R8G8B8_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
+        {0, 1, 2, 0},                                     // Swizzle
+        {8, 8, 8, 0},                                     // Bits per component
+        24,                                               // Bits per element
+        3,                                                // Bytes per element
+        3,                                                // Num components
+        false,                                            // isSRGB
+        false,                                            // isBC
+        false,                                            // isSubsampled
+        false,                                            // isLuminance
+        {true, true, true, false},                        // Is normalized?
+        {1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0}, // To float scale factor
+        1,                                                // bcWidth
+        1,                                                // bcHeight
     },
 
     // R8G8B8_SSCALED (0x195)
     {
         "R8G8B8_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {8, 8, 8, 0},                 // Bits per component
+        24,                           // Bits per element
+        3,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8B8_USCALED (0x196)
     {
         "R8G8B8_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {8, 8, 8, 0},                 // Bits per component
+        24,                           // Bits per element
+        3,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R64G64B64A64_FLOAT (0x197)
     {
         "R64G64B64A64_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 64, 64, 64, 64 }, // Bits per component
-        256, // Bits per element
-        32, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {64, 64, 64, 64},             // Bits per component
+        256,                          // Bits per element
+        32,                           // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R64G64B64_FLOAT (0x198)
     {
         "R64G64B64_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 64, 64, 64, 0 }, // Bits per component
-        192, // Bits per element
-        24, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {64, 64, 64, 0},              // Bits per component
+        192,                          // Bits per element
+        24,                           // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // BC4_SNORM (0x199)
     {
         "BC4_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        64, // Bits per element
-        8, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        64,                          // Bits per element
+        8,                           // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC5_SNORM (0x19A)
     {
         "BC5_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // R16G16B16_FLOAT (0x19B)
     {
         "R16G16B16_FLOAT",
-        { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {16, 16, 16, 0},              // Bits per component
+        48,                           // Bits per element
+        6,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16_UNORM (0x19C)
     {
         "R16G16B16_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
+        {0, 1, 2, 0},                                           // Swizzle
+        {16, 16, 16, 0},                                        // Bits per component
+        48,                                                     // Bits per element
+        6,                                                      // Bytes per element
+        3,                                                      // Num components
+        false,                                                  // isSRGB
+        false,                                                  // isBC
+        false,                                                  // isSubsampled
+        false,                                                  // isLuminance
+        {true, true, true, false},                              // Is normalized?
+        {1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0}, // To float scale factor
+        1,                                                      // bcWidth
+        1,                                                      // bcHeight
     },
 
     // R16G16B16_SNORM (0x19D)
     {
         "R16G16B16_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                                  // Defaults for missing components
+        {0, 1, 2, 0},                                           // Swizzle
+        {16, 16, 16, 0},                                        // Bits per component
+        48,                                                     // Bits per element
+        6,                                                      // Bytes per element
+        3,                                                      // Num components
+        false,                                                  // isSRGB
+        false,                                                  // isBC
+        false,                                                  // isSubsampled
+        false,                                                  // isLuminance
+        {true, true, true, false},                              // Is normalized?
+        {1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0}, // To float scale factor
+        1,                                                      // bcWidth
+        1,                                                      // bcHeight
     },
 
     // R16G16B16_SSCALED (0x19E)
     {
         "R16G16B16_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {16, 16, 16, 0},              // Bits per component
+        48,                           // Bits per element
+        6,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16_USCALED (0x19F)
     {
         "R16G16B16_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {16, 16, 16, 0},              // Bits per component
+        48,                           // Bits per element
+        6,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x1A0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // BC6H_SF16 (0x1A1)
     {
         "BC6H_SF16",
-        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 127.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC7_UNORM (0x1A2)
     {
         "BC7_UNORM",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC7_UNORM_SRGB (0x1A3)
     {
         "BC7_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        true, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        true,                        // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // BC6H_UF16 (0x1A4)
     {
         "BC6H_UF16",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 8, 8, 8 }, // Bits per component
-        128, // Bits per element
-        16, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        true, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, false, false, false }, // Is normalized?
-        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
-        4, // bcWidth
-        4, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},       // Defaults for missing components
+        {0, 1, 2, 3},                // Swizzle
+        {8, 8, 8, 8},                // Bits per component
+        128,                         // Bits per element
+        16,                          // Bytes per element
+        1,                           // Num components
+        false,                       // isSRGB
+        true,                        // isBC
+        false,                       // isSubsampled
+        false,                       // isLuminance
+        {true, false, false, false}, // Is normalized?
+        {1.0f / 255.0f, 0, 0, 0},    // To float scale factor
+        4,                           // bcWidth
+        4,                           // bcHeight
     },
 
     // padding (0x1A5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1A6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1A7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8G8B8_UNORM_SRGB (0x1A8)
     {
         "R8G8B8_UNORM_SRGB",
-        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        true, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, false }, // Is normalized?
-        { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},                            // Defaults for missing components
+        {0, 1, 2, 0},                                     // Swizzle
+        {8, 8, 8, 0},                                     // Bits per component
+        24,                                               // Bits per element
+        3,                                                // Bytes per element
+        3,                                                // Num components
+        true,                                             // isSRGB
+        false,                                            // isBC
+        false,                                            // isSubsampled
+        false,                                            // isLuminance
+        {true, true, true, false},                        // Is normalized?
+        {1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0}, // To float scale factor
+        1,                                                // bcWidth
+        1,                                                // bcHeight
     },
 
     // padding (0x1A9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1AF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R16G16B16_UINT (0x1B0)
     {
         "R16G16B16_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {16, 16, 16, 0},              // Bits per component
+        48,                           // Bits per element
+        6,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R16G16B16_SINT (0x1B1)
     {
         "R16G16B16_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 16, 16, 16, 0 }, // Bits per component
-        48, // Bits per element
-        6, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {16, 16, 16, 0},              // Bits per component
+        48,                           // Bits per element
+        6,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R32_SFIXED (0x1B2)
     {
         "R32_SFIXED",
-        { SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 0, 0, 0 }, // Swizzle
-        { 32, 0, 0, 0 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SFIXED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 0, 0, 0},                 // Swizzle
+        {32, 0, 0, 0},                // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R10G10B10A2_SNORM (0x1B3)
     {
         "R10G10B10A2_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {0, 1, 2, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor
+        1,                                                          // bcWidth
+        1,                                                          // bcHeight
     },
 
     // R10G10B10A2_USCALED (0x1B4)
     {
         "R10G10B10A2_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R10G10B10A2_SSCALED (0x1B5)
     {
         "R10G10B10A2_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R10G10B10A2_SINT (0x1B6)
     {
         "R10G10B10A2_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10A2_SNORM (0x1B7)
     {
         "B10G10R10A2_SNORM",
-        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { true, true, true, true }, // Is normalized?
-        { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM},
+        {0, 0, 0, 0x3f800000},    // Defaults for missing components
+        {2, 1, 0, 3},             // Swizzle
+        {10, 10, 10, 2},          // Bits per component
+        32,                       // Bits per element
+        4,                        // Bytes per element
+        4,                        // Num components
+        false,                    // isSRGB
+        false,                    // isBC
+        false,                    // isSubsampled
+        false,                    // isLuminance
+        {true, true, true, true}, // Is normalized?
+        {1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f}, // To float scale factor
+        1,                                                          // bcWidth
+        1,                                                          // bcHeight
     },
 
     // B10G10R10A2_USCALED (0x1B8)
     {
         "B10G10R10A2_USCALED",
-        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {2, 1, 0, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10A2_SSCALED (0x1B9)
     {
         "B10G10R10A2_SSCALED",
-        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
-        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED},
+        {0, 0, 0, 0x3f800000},        // Defaults for missing components
+        {2, 1, 0, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10A2_UINT (0x1BA)
     {
         "B10G10R10A2_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {2, 1, 0, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // B10G10R10A2_SINT (0x1BB)
     {
         "B10G10R10A2_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 2, 1, 0, 3 }, // Swizzle
-        { 10, 10, 10, 2 }, // Bits per component
-        32, // Bits per element
-        4, // Bytes per element
-        4, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {2, 1, 0, 3},                 // Swizzle
+        {10, 10, 10, 2},              // Bits per component
+        32,                           // Bits per element
+        4,                            // Bytes per element
+        4,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 1.0f},     // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x1BC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1BD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1BE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1BF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1C7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // R8G8B8_UINT (0x1C8)
     {
         "R8G8B8_UINT",
-        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {8, 8, 8, 0},                 // Bits per component
+        24,                           // Bits per element
+        3,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // R8G8B8_SINT (0x1C9)
     {
         "R8G8B8_SINT",
-        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 0 }, // Swizzle
-        { 8, 8, 8, 0 }, // Bits per component
-        24, // Bits per element
-        3, // Bytes per element
-        3, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 0},                 // Swizzle
+        {8, 8, 8, 0},                 // Bits per component
+        24,                           // Bits per element
+        3,                            // Bytes per element
+        3,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 1.0f, 1.0f, 0},        // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 
     // padding (0x1CA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1CB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1CC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1CD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1CE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1CF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1D9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1DF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1E9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1EA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1EB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1EC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1ED)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1EE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1EF)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F0)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F1)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F2)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F3)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F4)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F5)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F6)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F7)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F8)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1F9)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1FA)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1FB)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1FC)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1FD)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // padding (0x1FE)
-    {
-        nullptr,
-        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0 },{ 0, 0, 0, 0 },{ 0, 0, 0, 0 },
-        0, 0, 0, false, false, false, false,
-        { false, false, false, false },
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        1, 1
-    },
+    {nullptr,
+     {SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     {0, 0, 0, 0},
+     0,
+     0,
+     0,
+     false,
+     false,
+     false,
+     false,
+     {false, false, false, false},
+     {0.0f, 0.0f, 0.0f, 0.0f},
+     1,
+     1},
     // RAW (0x1FF)
     {
         "RAW",
-        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
-        { 0, 0, 0, 0x1 }, // Defaults for missing components
-        { 0, 1, 2, 3 }, // Swizzle
-        { 8, 0, 0, 0 }, // Bits per component
-        8, // Bits per element
-        1, // Bytes per element
-        1, // Num components
-        false, // isSRGB
-        false, // isBC
-        false, // isSubsampled
-        false, // isLuminance
-        { false, false, false, false }, // Is normalized?
-        { 1.0f, 0, 0, 0 }, // To float scale factor
-        1, // bcWidth
-        1, // bcHeight
+        {SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN},
+        {0, 0, 0, 0x1},               // Defaults for missing components
+        {0, 1, 2, 3},                 // Swizzle
+        {8, 0, 0, 0},                 // Bits per component
+        8,                            // Bits per element
+        1,                            // Bytes per element
+        1,                            // Num components
+        false,                        // isSRGB
+        false,                        // isBC
+        false,                        // isSubsampled
+        false,                        // isLuminance
+        {false, false, false, false}, // Is normalized?
+        {1.0f, 0, 0, 0},              // To float scale factor
+        1,                            // bcWidth
+        1,                            // bcHeight
     },
 };
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h
index f13f338..b7a3e53 100644
--- a/src/gallium/drivers/swr/rasterizer/common/formats.h
+++ b/src/gallium/drivers/swr/rasterizer/common/formats.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file formats.h
-* 
-* @brief auto-generated file
-* 
-* DO NOT EDIT
-* 
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file formats.h
+ *
+ * @brief auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ ******************************************************************************/
 
 #pragma once
 
@@ -54,179 +54,179 @@
 //////////////////////////////////////////////////////////////////////////
 enum SWR_FORMAT
 {
-    R32G32B32A32_FLOAT          = 0x0,
-    R32G32B32A32_SINT           = 0x1,
-    R32G32B32A32_UINT           = 0x2,
-    R64G64_FLOAT                = 0x5,
-    R32G32B32X32_FLOAT          = 0x6,
-    R32G32B32A32_SSCALED        = 0x7,
-    R32G32B32A32_USCALED        = 0x8,
-    R32G32B32A32_SFIXED         = 0x20,
-    R32G32B32_FLOAT             = 0x40,
-    R32G32B32_SINT              = 0x41,
-    R32G32B32_UINT              = 0x42,
-    R32G32B32_SSCALED           = 0x45,
-    R32G32B32_USCALED           = 0x46,
-    R32G32B32_SFIXED            = 0x50,
-    R16G16B16A16_UNORM          = 0x80,
-    R16G16B16A16_SNORM          = 0x81,
-    R16G16B16A16_SINT           = 0x82,
-    R16G16B16A16_UINT           = 0x83,
-    R16G16B16A16_FLOAT          = 0x84,
-    R32G32_FLOAT                = 0x85,
-    R32G32_SINT                 = 0x86,
-    R32G32_UINT                 = 0x87,
-    R32_FLOAT_X8X24_TYPELESS    = 0x88,
-    X32_TYPELESS_G8X24_UINT     = 0x89,
-    L32A32_FLOAT                = 0x8A,
-    R64_FLOAT                   = 0x8D,
-    R16G16B16X16_UNORM          = 0x8E,
-    R16G16B16X16_FLOAT          = 0x8F,
-    L32X32_FLOAT                = 0x91,
-    I32X32_FLOAT                = 0x92,
-    R16G16B16A16_SSCALED        = 0x93,
-    R16G16B16A16_USCALED        = 0x94,
-    R32G32_SSCALED              = 0x95,
-    R32G32_USCALED              = 0x96,
-    R32G32_SFIXED               = 0xA0,
-    B8G8R8A8_UNORM              = 0xC0,
-    B8G8R8A8_UNORM_SRGB         = 0xC1,
-    R10G10B10A2_UNORM           = 0xC2,
-    R10G10B10A2_UNORM_SRGB      = 0xC3,
-    R10G10B10A2_UINT            = 0xC4,
-    R8G8B8A8_UNORM              = 0xC7,
-    R8G8B8A8_UNORM_SRGB         = 0xC8,
-    R8G8B8A8_SNORM              = 0xC9,
-    R8G8B8A8_SINT               = 0xCA,
-    R8G8B8A8_UINT               = 0xCB,
-    R16G16_UNORM                = 0xCC,
-    R16G16_SNORM                = 0xCD,
-    R16G16_SINT                 = 0xCE,
-    R16G16_UINT                 = 0xCF,
-    R16G16_FLOAT                = 0xD0,
-    B10G10R10A2_UNORM           = 0xD1,
-    B10G10R10A2_UNORM_SRGB      = 0xD2,
-    R11G11B10_FLOAT             = 0xD3,
-    R10G10B10_FLOAT_A2_UNORM    = 0xD5,
-    R32_SINT                    = 0xD6,
-    R32_UINT                    = 0xD7,
-    R32_FLOAT                   = 0xD8,
-    R24_UNORM_X8_TYPELESS       = 0xD9,
-    X24_TYPELESS_G8_UINT        = 0xDA,
-    L32_UNORM                   = 0xDD,
-    L16A16_UNORM                = 0xDF,
-    I24X8_UNORM                 = 0xE0,
-    L24X8_UNORM                 = 0xE1,
-    I32_FLOAT                   = 0xE3,
-    L32_FLOAT                   = 0xE4,
-    A32_FLOAT                   = 0xE5,
-    B8G8R8X8_UNORM              = 0xE9,
-    B8G8R8X8_UNORM_SRGB         = 0xEA,
-    R8G8B8X8_UNORM              = 0xEB,
-    R8G8B8X8_UNORM_SRGB         = 0xEC,
-    R9G9B9E5_SHAREDEXP          = 0xED,
-    B10G10R10X2_UNORM           = 0xEE,
-    L16A16_FLOAT                = 0xF0,
-    R10G10B10X2_USCALED         = 0xF3,
-    R8G8B8A8_SSCALED            = 0xF4,
-    R8G8B8A8_USCALED            = 0xF5,
-    R16G16_SSCALED              = 0xF6,
-    R16G16_USCALED              = 0xF7,
-    R32_SSCALED                 = 0xF8,
-    R32_USCALED                 = 0xF9,
-    B5G6R5_UNORM                = 0x100,
-    B5G6R5_UNORM_SRGB           = 0x101,
-    B5G5R5A1_UNORM              = 0x102,
-    B5G5R5A1_UNORM_SRGB         = 0x103,
-    B4G4R4A4_UNORM              = 0x104,
-    B4G4R4A4_UNORM_SRGB         = 0x105,
-    R8G8_UNORM                  = 0x106,
-    R8G8_SNORM                  = 0x107,
-    R8G8_SINT                   = 0x108,
-    R8G8_UINT                   = 0x109,
-    R16_UNORM                   = 0x10A,
-    R16_SNORM                   = 0x10B,
-    R16_SINT                    = 0x10C,
-    R16_UINT                    = 0x10D,
-    R16_FLOAT                   = 0x10E,
-    I16_UNORM                   = 0x111,
-    L16_UNORM                   = 0x112,
-    A16_UNORM                   = 0x113,
-    L8A8_UNORM                  = 0x114,
-    I16_FLOAT                   = 0x115,
-    L16_FLOAT                   = 0x116,
-    A16_FLOAT                   = 0x117,
-    L8A8_UNORM_SRGB             = 0x118,
-    B5G5R5X1_UNORM              = 0x11A,
-    B5G5R5X1_UNORM_SRGB         = 0x11B,
-    R8G8_SSCALED                = 0x11C,
-    R8G8_USCALED                = 0x11D,
-    R16_SSCALED                 = 0x11E,
-    R16_USCALED                 = 0x11F,
-    A1B5G5R5_UNORM              = 0x124,
-    A4B4G4R4_UNORM              = 0x125,
-    L8A8_UINT                   = 0x126,
-    L8A8_SINT                   = 0x127,
-    R8_UNORM                    = 0x140,
-    R8_SNORM                    = 0x141,
-    R8_SINT                     = 0x142,
-    R8_UINT                     = 0x143,
-    A8_UNORM                    = 0x144,
-    I8_UNORM                    = 0x145,
-    L8_UNORM                    = 0x146,
-    R8_SSCALED                  = 0x149,
-    R8_USCALED                  = 0x14A,
-    L8_UNORM_SRGB               = 0x14C,
-    L8_UINT                     = 0x152,
-    L8_SINT                     = 0x153,
-    I8_UINT                     = 0x154,
-    I8_SINT                     = 0x155,
-    DXT1_RGB_SRGB               = 0x180,
-    YCRCB_SWAPUVY               = 0x183,
-    BC1_UNORM                   = 0x186,
-    BC2_UNORM                   = 0x187,
-    BC3_UNORM                   = 0x188,
-    BC4_UNORM                   = 0x189,
-    BC5_UNORM                   = 0x18A,
-    BC1_UNORM_SRGB              = 0x18B,
-    BC2_UNORM_SRGB              = 0x18C,
-    BC3_UNORM_SRGB              = 0x18D,
-    YCRCB_SWAPUV                = 0x18F,
-    DXT1_RGB                    = 0x191,
-    R8G8B8_UNORM                = 0x193,
-    R8G8B8_SNORM                = 0x194,
-    R8G8B8_SSCALED              = 0x195,
-    R8G8B8_USCALED              = 0x196,
-    R64G64B64A64_FLOAT          = 0x197,
-    R64G64B64_FLOAT             = 0x198,
-    BC4_SNORM                   = 0x199,
-    BC5_SNORM                   = 0x19A,
-    R16G16B16_FLOAT             = 0x19B,
-    R16G16B16_UNORM             = 0x19C,
-    R16G16B16_SNORM             = 0x19D,
-    R16G16B16_SSCALED           = 0x19E,
-    R16G16B16_USCALED           = 0x19F,
-    BC6H_SF16                   = 0x1A1,
-    BC7_UNORM                   = 0x1A2,
-    BC7_UNORM_SRGB              = 0x1A3,
-    BC6H_UF16                   = 0x1A4,
-    R8G8B8_UNORM_SRGB           = 0x1A8,
-    R16G16B16_UINT              = 0x1B0,
-    R16G16B16_SINT              = 0x1B1,
-    R32_SFIXED                  = 0x1B2,
-    R10G10B10A2_SNORM           = 0x1B3,
-    R10G10B10A2_USCALED         = 0x1B4,
-    R10G10B10A2_SSCALED         = 0x1B5,
-    R10G10B10A2_SINT            = 0x1B6,
-    B10G10R10A2_SNORM           = 0x1B7,
-    B10G10R10A2_USCALED         = 0x1B8,
-    B10G10R10A2_SSCALED         = 0x1B9,
-    B10G10R10A2_UINT            = 0x1BA,
-    B10G10R10A2_SINT            = 0x1BB,
-    R8G8B8_UINT                 = 0x1C8,
-    R8G8B8_SINT                 = 0x1C9,
-    RAW                         = 0x1FF,
-    NUM_SWR_FORMATS             = 0x200,
+    R32G32B32A32_FLOAT       = 0x0,
+    R32G32B32A32_SINT        = 0x1,
+    R32G32B32A32_UINT        = 0x2,
+    R64G64_FLOAT             = 0x5,
+    R32G32B32X32_FLOAT       = 0x6,
+    R32G32B32A32_SSCALED     = 0x7,
+    R32G32B32A32_USCALED     = 0x8,
+    R32G32B32A32_SFIXED      = 0x20,
+    R32G32B32_FLOAT          = 0x40,
+    R32G32B32_SINT           = 0x41,
+    R32G32B32_UINT           = 0x42,
+    R32G32B32_SSCALED        = 0x45,
+    R32G32B32_USCALED        = 0x46,
+    R32G32B32_SFIXED         = 0x50,
+    R16G16B16A16_UNORM       = 0x80,
+    R16G16B16A16_SNORM       = 0x81,
+    R16G16B16A16_SINT        = 0x82,
+    R16G16B16A16_UINT        = 0x83,
+    R16G16B16A16_FLOAT       = 0x84,
+    R32G32_FLOAT             = 0x85,
+    R32G32_SINT              = 0x86,
+    R32G32_UINT              = 0x87,
+    R32_FLOAT_X8X24_TYPELESS = 0x88,
+    X32_TYPELESS_G8X24_UINT  = 0x89,
+    L32A32_FLOAT             = 0x8A,
+    R64_FLOAT                = 0x8D,
+    R16G16B16X16_UNORM       = 0x8E,
+    R16G16B16X16_FLOAT       = 0x8F,
+    L32X32_FLOAT             = 0x91,
+    I32X32_FLOAT             = 0x92,
+    R16G16B16A16_SSCALED     = 0x93,
+    R16G16B16A16_USCALED     = 0x94,
+    R32G32_SSCALED           = 0x95,
+    R32G32_USCALED           = 0x96,
+    R32G32_SFIXED            = 0xA0,
+    B8G8R8A8_UNORM           = 0xC0,
+    B8G8R8A8_UNORM_SRGB      = 0xC1,
+    R10G10B10A2_UNORM        = 0xC2,
+    R10G10B10A2_UNORM_SRGB   = 0xC3,
+    R10G10B10A2_UINT         = 0xC4,
+    R8G8B8A8_UNORM           = 0xC7,
+    R8G8B8A8_UNORM_SRGB      = 0xC8,
+    R8G8B8A8_SNORM           = 0xC9,
+    R8G8B8A8_SINT            = 0xCA,
+    R8G8B8A8_UINT            = 0xCB,
+    R16G16_UNORM             = 0xCC,
+    R16G16_SNORM             = 0xCD,
+    R16G16_SINT              = 0xCE,
+    R16G16_UINT              = 0xCF,
+    R16G16_FLOAT             = 0xD0,
+    B10G10R10A2_UNORM        = 0xD1,
+    B10G10R10A2_UNORM_SRGB   = 0xD2,
+    R11G11B10_FLOAT          = 0xD3,
+    R10G10B10_FLOAT_A2_UNORM = 0xD5,
+    R32_SINT                 = 0xD6,
+    R32_UINT                 = 0xD7,
+    R32_FLOAT                = 0xD8,
+    R24_UNORM_X8_TYPELESS    = 0xD9,
+    X24_TYPELESS_G8_UINT     = 0xDA,
+    L32_UNORM                = 0xDD,
+    L16A16_UNORM             = 0xDF,
+    I24X8_UNORM              = 0xE0,
+    L24X8_UNORM              = 0xE1,
+    I32_FLOAT                = 0xE3,
+    L32_FLOAT                = 0xE4,
+    A32_FLOAT                = 0xE5,
+    B8G8R8X8_UNORM           = 0xE9,
+    B8G8R8X8_UNORM_SRGB      = 0xEA,
+    R8G8B8X8_UNORM           = 0xEB,
+    R8G8B8X8_UNORM_SRGB      = 0xEC,
+    R9G9B9E5_SHAREDEXP       = 0xED,
+    B10G10R10X2_UNORM        = 0xEE,
+    L16A16_FLOAT             = 0xF0,
+    R10G10B10X2_USCALED      = 0xF3,
+    R8G8B8A8_SSCALED         = 0xF4,
+    R8G8B8A8_USCALED         = 0xF5,
+    R16G16_SSCALED           = 0xF6,
+    R16G16_USCALED           = 0xF7,
+    R32_SSCALED              = 0xF8,
+    R32_USCALED              = 0xF9,
+    B5G6R5_UNORM             = 0x100,
+    B5G6R5_UNORM_SRGB        = 0x101,
+    B5G5R5A1_UNORM           = 0x102,
+    B5G5R5A1_UNORM_SRGB      = 0x103,
+    B4G4R4A4_UNORM           = 0x104,
+    B4G4R4A4_UNORM_SRGB      = 0x105,
+    R8G8_UNORM               = 0x106,
+    R8G8_SNORM               = 0x107,
+    R8G8_SINT                = 0x108,
+    R8G8_UINT                = 0x109,
+    R16_UNORM                = 0x10A,
+    R16_SNORM                = 0x10B,
+    R16_SINT                 = 0x10C,
+    R16_UINT                 = 0x10D,
+    R16_FLOAT                = 0x10E,
+    I16_UNORM                = 0x111,
+    L16_UNORM                = 0x112,
+    A16_UNORM                = 0x113,
+    L8A8_UNORM               = 0x114,
+    I16_FLOAT                = 0x115,
+    L16_FLOAT                = 0x116,
+    A16_FLOAT                = 0x117,
+    L8A8_UNORM_SRGB          = 0x118,
+    B5G5R5X1_UNORM           = 0x11A,
+    B5G5R5X1_UNORM_SRGB      = 0x11B,
+    R8G8_SSCALED             = 0x11C,
+    R8G8_USCALED             = 0x11D,
+    R16_SSCALED              = 0x11E,
+    R16_USCALED              = 0x11F,
+    A1B5G5R5_UNORM           = 0x124,
+    A4B4G4R4_UNORM           = 0x125,
+    L8A8_UINT                = 0x126,
+    L8A8_SINT                = 0x127,
+    R8_UNORM                 = 0x140,
+    R8_SNORM                 = 0x141,
+    R8_SINT                  = 0x142,
+    R8_UINT                  = 0x143,
+    A8_UNORM                 = 0x144,
+    I8_UNORM                 = 0x145,
+    L8_UNORM                 = 0x146,
+    R8_SSCALED               = 0x149,
+    R8_USCALED               = 0x14A,
+    L8_UNORM_SRGB            = 0x14C,
+    L8_UINT                  = 0x152,
+    L8_SINT                  = 0x153,
+    I8_UINT                  = 0x154,
+    I8_SINT                  = 0x155,
+    DXT1_RGB_SRGB            = 0x180,
+    YCRCB_SWAPUVY            = 0x183,
+    BC1_UNORM                = 0x186,
+    BC2_UNORM                = 0x187,
+    BC3_UNORM                = 0x188,
+    BC4_UNORM                = 0x189,
+    BC5_UNORM                = 0x18A,
+    BC1_UNORM_SRGB           = 0x18B,
+    BC2_UNORM_SRGB           = 0x18C,
+    BC3_UNORM_SRGB           = 0x18D,
+    YCRCB_SWAPUV             = 0x18F,
+    DXT1_RGB                 = 0x191,
+    R8G8B8_UNORM             = 0x193,
+    R8G8B8_SNORM             = 0x194,
+    R8G8B8_SSCALED           = 0x195,
+    R8G8B8_USCALED           = 0x196,
+    R64G64B64A64_FLOAT       = 0x197,
+    R64G64B64_FLOAT          = 0x198,
+    BC4_SNORM                = 0x199,
+    BC5_SNORM                = 0x19A,
+    R16G16B16_FLOAT          = 0x19B,
+    R16G16B16_UNORM          = 0x19C,
+    R16G16B16_SNORM          = 0x19D,
+    R16G16B16_SSCALED        = 0x19E,
+    R16G16B16_USCALED        = 0x19F,
+    BC6H_SF16                = 0x1A1,
+    BC7_UNORM                = 0x1A2,
+    BC7_UNORM_SRGB           = 0x1A3,
+    BC6H_UF16                = 0x1A4,
+    R8G8B8_UNORM_SRGB        = 0x1A8,
+    R16G16B16_UINT           = 0x1B0,
+    R16G16B16_SINT           = 0x1B1,
+    R32_SFIXED               = 0x1B2,
+    R10G10B10A2_SNORM        = 0x1B3,
+    R10G10B10A2_USCALED      = 0x1B4,
+    R10G10B10A2_SSCALED      = 0x1B5,
+    R10G10B10A2_SINT         = 0x1B6,
+    B10G10R10A2_SNORM        = 0x1B7,
+    B10G10R10A2_USCALED      = 0x1B8,
+    B10G10R10A2_SSCALED      = 0x1B9,
+    B10G10R10A2_UINT         = 0x1BA,
+    B10G10R10A2_SINT         = 0x1BB,
+    R8G8B8_UINT              = 0x1C8,
+    R8G8B8_SINT              = 0x1C9,
+    RAW                      = 0x1FF,
+    NUM_SWR_FORMATS          = 0x200,
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -266,4 +266,3 @@
 
 // lookup table for unorm8 srgb -> float conversion
 extern const uint32_t srgb8Table[256];
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h
index 59d66bc..4c413ca 100644
--- a/src/gallium/drivers/swr/rasterizer/common/intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #ifndef __SWR_INTRIN_H__
 #define __SWR_INTRIN_H__
@@ -28,34 +28,34 @@
 
 #if !defined(SIMD_ARCH)
 #define SIMD_ARCH KNOB_ARCH
-#endif 
+#endif
 
 #include "simdlib_types.hpp"
 
-typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
-typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
-typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
-typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
-typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
+typedef SIMDImpl::SIMD128Impl::Float   simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double  simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4    simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask    simd4mask;
 
-typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
-typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
-typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
-typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
-typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
+typedef SIMDImpl::SIMD256Impl::Float   simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double  simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4    simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask    simd8mask;
 
-typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
-typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
-typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
-typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
-typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
+typedef SIMDImpl::SIMD512Impl::Float   simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double  simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4    simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask    simd16mask;
 
-#if KNOB_SIMD_WIDTH == 8 
-typedef simd8scalar     simdscalar;
-typedef simd8scalard    simdscalard;
-typedef simd8scalari    simdscalari;
-typedef simd8vector     simdvector;
-typedef simd8mask       simdmask;
+#if KNOB_SIMD_WIDTH == 8
+typedef simd8scalar  simdscalar;
+typedef simd8scalard simdscalard;
+typedef simd8scalari simdscalari;
+typedef simd8vector  simdvector;
+typedef simd8mask    simdmask;
 #else
 #error Unsupported vector width
 #endif
@@ -68,7 +68,7 @@
 #else
     UINT result = 0;
 
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 
+    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
     // using bsf instead of funky loop
     DWORD maskIndex;
     while (_BitScanForward(&maskIndex, mask))
@@ -99,8 +99,8 @@
 #if KNOB_ARCH >= KNOB_ARCH_AVX2
     return _pext_u32(a, mask);
 #else
-    UINT result = 0;
-    DWORD maskIndex;
+    UINT     result = 0;
+    DWORD    maskIndex;
     uint32_t currentBit = 0;
     while (_BitScanForward(&maskIndex, mask))
     {
@@ -117,4 +117,4 @@
 #endif
 }
 
-#endif//__SWR_INTRIN_H__
+#endif //__SWR_INTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp
index a62350f..aea5740 100644
--- a/src/gallium/drivers/swr/rasterizer/common/isa.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #pragma once
 
@@ -44,7 +44,7 @@
 class InstructionSet
 {
 public:
-    InstructionSet() : CPU_Rep() {};
+    InstructionSet() : CPU_Rep(){};
 
     // getters
     std::string Vendor(void) { return CPU_Rep.vendor_; }
@@ -113,21 +113,11 @@
     class InstructionSet_Internal
     {
     public:
-        InstructionSet_Internal()
-            : nIds_{ 0 },
-            nExIds_{ 0 },
-            isIntel_{ false },
-            isAMD_{ false },
-            f_1_ECX_{ 0 },
-            f_1_EDX_{ 0 },
-            f_7_EBX_{ 0 },
-            f_7_ECX_{ 0 },
-            f_81_ECX_{ 0 },
-            f_81_EDX_{ 0 },
-            data_{},
-            extdata_{}
+        InstructionSet_Internal() :
+            nIds_{0}, nExIds_{0}, isIntel_{false}, isAMD_{false}, f_1_ECX_{0}, f_1_EDX_{0},
+            f_7_EBX_{0}, f_7_ECX_{0}, f_81_ECX_{0}, f_81_EDX_{0}, data_{}, extdata_{}
         {
-            //int cpuInfo[4] = {-1};
+            // int cpuInfo[4] = {-1};
             std::array<int, 4> cpui;
 
             // Calling __cpuid with 0x0 as the function_id argument
@@ -144,7 +134,7 @@
 #if defined(_MSC_VER) && !defined(__clang__)
                 __cpuidex(cpui.data(), i, 0);
 #else
-                int *data = cpui.data();
+                int* data = cpui.data();
                 __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
 #endif
                 data_.push_back(cpui);
@@ -153,10 +143,10 @@
             // Capture vendor string
             char vendor[0x20];
             memset(vendor, 0, sizeof(vendor));
-            *reinterpret_cast<int*>(vendor) = data_[0][1];
+            *reinterpret_cast<int*>(vendor)     = data_[0][1];
             *reinterpret_cast<int*>(vendor + 4) = data_[0][3];
             *reinterpret_cast<int*>(vendor + 8) = data_[0][2];
-            vendor_ = vendor;
+            vendor_                             = vendor;
             if (vendor_ == "GenuineIntel")
             {
                 isIntel_ = true;
@@ -197,7 +187,7 @@
 #if defined(_MSC_VER) && !defined(__clang__)
                 __cpuidex(cpui.data(), i, 0);
 #else
-                int *data = cpui.data();
+                int* data = cpui.data();
                 __cpuid_count(i, 0, data[0], data[1], data[2], data[3]);
 #endif
                 extdata_.push_back(cpui);
@@ -220,18 +210,18 @@
             }
         };
 
-        int nIds_;
-        unsigned nExIds_;
-        std::string vendor_;
-        std::string brand_;
-        bool isIntel_;
-        bool isAMD_;
-        std::bitset<32> f_1_ECX_;
-        std::bitset<32> f_1_EDX_;
-        std::bitset<32> f_7_EBX_;
-        std::bitset<32> f_7_ECX_;
-        std::bitset<32> f_81_ECX_;
-        std::bitset<32> f_81_EDX_;
+        int                             nIds_;
+        unsigned                        nExIds_;
+        std::string                     vendor_;
+        std::string                     brand_;
+        bool                            isIntel_;
+        bool                            isAMD_;
+        std::bitset<32>                 f_1_ECX_;
+        std::bitset<32>                 f_1_EDX_;
+        std::bitset<32>                 f_7_EBX_;
+        std::bitset<32>                 f_7_ECX_;
+        std::bitset<32>                 f_81_ECX_;
+        std::bitset<32>                 f_81_EDX_;
         std::vector<std::array<int, 4>> data_;
         std::vector<std::array<int, 4>> extdata_;
     };
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.cpp b/src/gallium/drivers/swr/rasterizer/common/os.cpp
index 2d97270..aa817d4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/os.cpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #include "common/os.h"
 #include <vector>
@@ -34,28 +34,26 @@
 #include <pthread.h>
 #endif // Linux
 
-
-
 #if defined(_WIN32)
 static const DWORD MS_VC_EXCEPTION = 0x406D1388;
 
-#pragma pack(push,8)  
+#pragma pack(push, 8)
 typedef struct tagTHREADNAME_INFO
 {
-    DWORD dwType; // Must be 0x1000.  
-    LPCSTR szName; // Pointer to name (in user addr space).  
-    DWORD dwThreadID; // Thread ID (-1=caller thread).  
-    DWORD dwFlags; // Reserved for future use, must be zero.  
+    DWORD  dwType;     // Must be 0x1000.
+    LPCSTR szName;     // Pointer to name (in user addr space).
+    DWORD  dwThreadID; // Thread ID (-1=caller thread).
+    DWORD  dwFlags;    // Reserved for future use, must be zero.
 } THREADNAME_INFO;
 #pragma pack(pop)
 
 void LegacySetThreadName(const char* pThreadName)
 {
     THREADNAME_INFO info;
-    info.dwType = 0x1000;
-    info.szName = pThreadName;
+    info.dwType     = 0x1000;
+    info.szName     = pThreadName;
     info.dwThreadID = GetCurrentThreadId();
-    info.dwFlags = 0;
+    info.dwFlags    = 0;
 
     if (!IsDebuggerPresent())
     {
@@ -63,14 +61,16 @@
         return;
     }
 
-#pragma warning(push)  
-#pragma warning(disable: 6320 6322)  
-    __try {
+#pragma warning(push)
+#pragma warning(disable : 6320 6322)
+    __try
+    {
         RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR), (ULONG_PTR*)&info);
     }
-    __except (EXCEPTION_EXECUTE_HANDLER) {
+    __except (EXCEPTION_EXECUTE_HANDLER)
+    {
     }
-#pragma warning(pop)  
+#pragma warning(pop)
 }
 #endif // _WIN32
 
@@ -78,23 +78,21 @@
 {
 #if defined(_WIN32)
     // The SetThreadDescription API was brought in version 1607 of Windows 10.
-    typedef HRESULT(WINAPI* PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
+    typedef HRESULT(WINAPI * PFNSetThreadDescription)(HANDLE hThread, PCWSTR lpThreadDescription);
     // The SetThreadDescription API works even if no debugger is attached.
-    auto pfnSetThreadDescription =
-        reinterpret_cast<PFNSetThreadDescription>(
-            GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
+    auto pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
+        GetProcAddress(GetModuleHandleA("Kernel32.dll"), "SetThreadDescription"));
 
     if (!pfnSetThreadDescription)
     {
         // try KernelBase.dll
-        pfnSetThreadDescription =
-            reinterpret_cast<PFNSetThreadDescription>(
-                GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
+        pfnSetThreadDescription = reinterpret_cast<PFNSetThreadDescription>(
+            GetProcAddress(GetModuleHandleA("KernelBase.dll"), "SetThreadDescription"));
     }
 
     if (pfnSetThreadDescription)
     {
-        std::string utf8Name = pThreadName;
+        std::string  utf8Name = pThreadName;
         std::wstring wideName;
         wideName.resize(utf8Name.size() + 1);
         swprintf_s(&(wideName.front()), wideName.size(), L"%S", utf8Name.c_str());
@@ -113,12 +111,13 @@
 #endif // Linux
 }
 
-static void SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
+static void
+SplitString(std::vector<std::string>& out_segments, const std::string& input, char splitToken)
 {
     out_segments.clear();
 
     std::istringstream f(input);
-    std::string s;
+    std::string        s;
     while (std::getline(f, s, splitToken))
     {
         if (s.size())
@@ -155,12 +154,11 @@
 
 /// Execute Command (block until finished)
 /// @returns process exit value
-int SWR_API  ExecCmd(
-    const std::string&  cmd,            ///< (In) Command line string
-    const char*         pOptEnvStrings, ///< (Optional In) Environment block for new process
-    std::string*        pOptStdOut,     ///< (Optional Out) Standard Output text
-    std::string*        pOptStdErr,     ///< (Optional Out) Standard Error text
-    const std::string*  pOptStdIn)      ///< (Optional In) Standard Input text
+int SWR_API ExecCmd(const std::string& cmd,     ///< (In) Command line string
+                    const char* pOptEnvStrings, ///< (Optional In) Environment block for new process
+                    std::string*       pOptStdOut, ///< (Optional Out) Standard Output text
+                    std::string*       pOptStdErr, ///< (Optional Out) Standard Error text
+                    const std::string* pOptStdIn)  ///< (Optional In) Standard Input text
 {
     int rvalue = -1;
 
@@ -172,8 +170,8 @@
     };
     std::array<WinPipe, 3> hPipes = {};
 
-    SECURITY_ATTRIBUTES saAttr = { sizeof(SECURITY_ATTRIBUTES) };
-    saAttr.bInheritHandle = TRUE;   //Pipe handles are inherited by child process.
+    SECURITY_ATTRIBUTES saAttr  = {sizeof(SECURITY_ATTRIBUTES)};
+    saAttr.bInheritHandle       = TRUE; // Pipe handles are inherited by child process.
     saAttr.lpSecurityDescriptor = NULL;
 
     {
@@ -198,7 +196,7 @@
     }
 
     STARTUPINFOA StartupInfo{};
-    StartupInfo.cb = sizeof(STARTUPINFOA);
+    StartupInfo.cb      = sizeof(STARTUPINFOA);
     StartupInfo.dwFlags = STARTF_USESTDHANDLES;
     StartupInfo.dwFlags |= STARTF_USESHOWWINDOW;
     StartupInfo.wShowWindow = SW_HIDE;
@@ -207,30 +205,28 @@
         StartupInfo.hStdInput = hPipes[0].hRead;
     }
     StartupInfo.hStdOutput = hPipes[1].hWrite;
-    StartupInfo.hStdError = hPipes[2].hWrite;
+    StartupInfo.hStdError  = hPipes[2].hWrite;
     PROCESS_INFORMATION procInfo{};
 
     // CreateProcess can modify the string
     std::string local_cmd = cmd;
 
-    BOOL ProcessValue = CreateProcessA(
-        NULL,
-        (LPSTR)local_cmd.c_str(),
-        NULL,
-        NULL,
-        TRUE,
-        0,
-        (LPVOID)pOptEnvStrings,
-        NULL,
-        &StartupInfo,
-        &procInfo);
+    BOOL ProcessValue = CreateProcessA(NULL,
+                                       (LPSTR)local_cmd.c_str(),
+                                       NULL,
+                                       NULL,
+                                       TRUE,
+                                       0,
+                                       (LPVOID)pOptEnvStrings,
+                                       NULL,
+                                       &StartupInfo,
+                                       &procInfo);
 
     if (ProcessValue && procInfo.hProcess)
     {
-        auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr)
-        {
-            char buf[1024];
-            DWORD dwRead = 0;
+        auto ReadFromPipe = [](HANDLE hPipe, std::string* pOutStr) {
+            char  buf[1024];
+            DWORD dwRead  = 0;
             DWORD dwAvail = 0;
             while (true)
             {
@@ -244,7 +240,12 @@
                     break;
                 }
 
-                if (!::ReadFile(hPipe, buf, std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)), &dwRead, NULL) || !dwRead)
+                if (!::ReadFile(hPipe,
+                                buf,
+                                std::min<size_t>(sizeof(buf) - 1, size_t(dwAvail)),
+                                &dwRead,
+                                NULL) ||
+                    !dwRead)
                 {
                     // error, the child process might ended
                     break;
@@ -257,17 +258,18 @@
                 }
             }
         };
-        bool bProcessEnded = false;
-        size_t bytesWritten = 0;
+        bool   bProcessEnded = false;
+        size_t bytesWritten  = 0;
         do
         {
             if (pOptStdIn && (pOptStdIn->size() > bytesWritten))
             {
                 DWORD bytesToWrite = static_cast<DWORD>(pOptStdIn->size()) - bytesWritten;
-                if (!::WriteFile(
-                    hPipes[0].hWrite,
-                    pOptStdIn->data() + bytesWritten,
-                    bytesToWrite, &bytesToWrite, nullptr))
+                if (!::WriteFile(hPipes[0].hWrite,
+                                 pOptStdIn->data() + bytesWritten,
+                                 bytesToWrite,
+                                 &bytesToWrite,
+                                 nullptr))
                 {
                     // Failed to write to pipe
                     break;
@@ -280,8 +282,7 @@
 
             ReadFromPipe(hPipes[1].hRead, pOptStdOut);
             ReadFromPipe(hPipes[2].hRead, pOptStdErr);
-        }
-        while (!bProcessEnded);
+        } while (!bProcessEnded);
 
         DWORD exitVal = 0;
         if (!GetExitCodeProcess(procInfo.hProcess, &exitVal))
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 5cfd12f..d33c873 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #ifndef __SWR_OS_H__
 #define __SWR_OS_H__
@@ -30,7 +30,7 @@
 #if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX)
 
 #define SWR_API __cdecl
-#define SWR_VISIBLE  __declspec(dllexport)
+#define SWR_VISIBLE __declspec(dllexport)
 
 #ifndef NOMINMAX
 #define NOMINMAX
@@ -64,12 +64,12 @@
 #define DEBUGBREAK __debugbreak()
 
 #define PRAGMA_WARNING_PUSH_DISABLE(...) \
-    __pragma(warning(push));\
-    __pragma(warning(disable:__VA_ARGS__));
+    __pragma(warning(push));             \
+    __pragma(warning(disable : __VA_ARGS__));
 
 #define PRAGMA_WARNING_POP() __pragma(warning(pop))
 
-static inline void *AlignedMalloc(size_t _Size, size_t _Alignment)
+static inline void* AlignedMalloc(size_t _Size, size_t _Alignment)
 {
     return _aligned_malloc(_Size, _Alignment);
 }
@@ -104,13 +104,13 @@
 #include <stdio.h>
 #include <limits.h>
 
-typedef void            VOID;
-typedef void*           LPVOID;
-typedef int             INT;
-typedef unsigned int    UINT;
-typedef void*           HANDLE;
-typedef int             LONG;
-typedef unsigned int    DWORD;
+typedef void         VOID;
+typedef void*        LPVOID;
+typedef int          INT;
+typedef unsigned int UINT;
+typedef void*        HANDLE;
+typedef int          LONG;
+typedef unsigned int DWORD;
 
 #undef FALSE
 #define FALSE 0
@@ -124,7 +124,7 @@
 #ifndef INLINE
 #define INLINE __inline
 #endif
-#define DEBUGBREAK asm ("int $3")
+#define DEBUGBREAK asm("int $3")
 
 #if !defined(__CYGWIN__)
 
@@ -136,28 +136,25 @@
 #endif
 
 #if defined(__GNUC__) && !defined(__INTEL_COMPILER)
-    #define __declspec(x)           __declspec_##x
-    #define __declspec_align(y)     __attribute__((aligned(y)))
-    #define __declspec_deprecated   __attribute__((deprecated))
-    #define __declspec_dllexport
-    #define __declspec_dllimport
-    #define __declspec_noinline     __attribute__((__noinline__))
-    #define __declspec_nothrow      __attribute__((nothrow))
-    #define __declspec_novtable
-    #define __declspec_thread       __thread
+#define __declspec(x) __declspec_##x
+#define __declspec_align(y) __attribute__((aligned(y)))
+#define __declspec_deprecated __attribute__((deprecated))
+#define __declspec_dllexport
+#define __declspec_dllimport
+#define __declspec_noinline __attribute__((__noinline__))
+#define __declspec_nothrow __attribute__((nothrow))
+#define __declspec_novtable
+#define __declspec_thread __thread
 #else
-    #define __declspec(X)
+#define __declspec(X)
 #endif
 
 #endif
 
-#define GCC_VERSION (__GNUC__ * 10000 \
-                     + __GNUC_MINOR__ * 100 \
-                     + __GNUC_PATCHLEVEL__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
 
 #if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500)
-inline
-uint64_t __rdtsc()
+inline uint64_t      __rdtsc()
 {
     long low, high;
     asm volatile("rdtsc" : "=a"(low), "=d"(high));
@@ -165,10 +162,9 @@
 }
 #endif
 
-#if !defined( __clang__) && !defined(__INTEL_COMPILER)
+#if !defined(__clang__) && !defined(__INTEL_COMPILER)
 // Intrinsic not defined in gcc
-static INLINE
-void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a)
+static INLINE void _mm256_storeu2_m128i(__m128i* hi, __m128i* lo, __m256i a)
 {
     _mm_storeu_si128((__m128i*)lo, _mm256_castsi256_si128(a));
     _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1));
@@ -181,38 +177,36 @@
 #endif
 #endif
 
-inline
-unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask)
+inline unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask)
 {
     *Index = __builtin_ctz(Mask);
     return (Mask != 0);
 }
 
-inline
-unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask)
+inline unsigned char _BitScanForward(unsigned int* Index, unsigned int Mask)
 {
     *Index = __builtin_ctz(Mask);
     return (Mask != 0);
 }
 
-inline
-unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask)
+inline unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask)
 {
     *Index = __builtin_clz(Mask);
     return (Mask != 0);
 }
 
-inline
-unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask)
+inline unsigned char _BitScanReverse(unsigned int* Index, unsigned int Mask)
 {
     *Index = __builtin_clz(Mask);
     return (Mask != 0);
 }
 
-inline
-void *AlignedMalloc(size_t size, size_t alignment)
+#define _BitScanForward64 _BitScanForward
+#define _BitScanReverse64 _BitScanReverse
+
+inline void* AlignedMalloc(size_t size, size_t alignment)
 {
-    void *ret;
+    void* ret;
     if (posix_memalign(&ret, alignment, size))
     {
         return NULL;
@@ -220,19 +214,19 @@
     return ret;
 }
 
-static inline
-void AlignedFree(void* p)
+static inline void AlignedFree(void* p)
 {
     free(p);
 }
 
-#define _countof(a) (sizeof(a)/sizeof(*(a)))
+#define _countof(a) (sizeof(a) / sizeof(*(a)))
 
 #define sprintf_s sprintf
-#define strcpy_s(dst,size,src) strncpy(dst,src,size)
+#define strcpy_s(dst, size, src) strncpy(dst, src, size)
 #define GetCurrentProcessId getpid
 
-#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
+#define InterlockedCompareExchange(Dest, Exchange, Comparand) \
+    __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
 #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
 #define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
@@ -254,9 +248,9 @@
 #define THREAD thread_local
 
 // Universal types
-typedef uint8_t     KILOBYTE[1024];
-typedef KILOBYTE    MEGABYTE[1024];
-typedef MEGABYTE    GIGABYTE[1024];
+typedef uint8_t  KILOBYTE[1024];
+typedef KILOBYTE MEGABYTE[1024];
+typedef MEGABYTE GIGABYTE[1024];
 
 #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
 #define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
@@ -272,9 +266,9 @@
 #define ATTR_UNUSED
 #endif
 
-#define SWR_FUNC(_retType, _funcName, /* args */...)   \
-   typedef _retType (SWR_API * PFN##_funcName)(__VA_ARGS__); \
-  _retType SWR_API _funcName(__VA_ARGS__);
+#define SWR_FUNC(_retType, _funcName, /* args */...)        \
+    typedef _retType(SWR_API* PFN##_funcName)(__VA_ARGS__); \
+    _retType SWR_API _funcName(__VA_ARGS__);
 
 // Defined in os.cpp
 void SWR_API SetCurrentThreadName(const char* pThreadName);
@@ -282,11 +276,11 @@
 
 /// Execute Command (block until finished)
 /// @returns process exit value
-int SWR_API  ExecCmd(
-    const std::string&  cmd,                        ///< (In) Command line string
-    const char*         pOptEnvStrings = nullptr,   ///< (Optional In) Environment block for new process
-    std::string*        pOptStdOut = nullptr,       ///< (Optional Out) Standard Output text
-    std::string*        pOptStdErr = nullptr,       ///< (Optional Out) Standard Error text
-    const std::string*  pOptStdIn = nullptr);       ///< (Optional In) Standard Input text
+int SWR_API
+    ExecCmd(const std::string& cmd,                ///< (In) Command line string
+            const char*  pOptEnvStrings = nullptr, ///< (Optional In) Environment block for new process
+            std::string* pOptStdOut     = nullptr,   ///< (Optional Out) Standard Output text
+            std::string* pOptStdErr     = nullptr,   ///< (Optional Out) Standard Error text
+            const std::string* pOptStdIn = nullptr); ///< (Optional In) Standard Input text
 
-#endif//__SWR_OS_H__
+#endif //__SWR_OS_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 79e82c4..e19a2d1 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file rdtsc_buckets.cpp
-* 
-* @brief implementation of rdtsc buckets.
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rdtsc_buckets.cpp
+ *
+ * @brief implementation of rdtsc buckets.
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "rdtsc_buckets.h"
 #include <inttypes.h>
 
@@ -50,16 +50,16 @@
     BUCKET_THREAD newThread;
     newThread.name = name;
     newThread.root.children.reserve(mBuckets.size());
-    newThread.root.id = 0;
+    newThread.root.id      = 0;
     newThread.root.pParent = nullptr;
-    newThread.pCurrent = &newThread.root;
+    newThread.pCurrent     = &newThread.root;
 
     mThreadMutex.lock();
 
     // assign unique thread id for this thread
-    size_t id = mThreads.size();
+    size_t id    = mThreads.size();
     newThread.id = (UINT)id;
-    tlsThreadId = (UINT)id;
+    tlsThreadId  = (UINT)id;
 
     // store new thread
     mThreads.push_back(newThread);
@@ -76,9 +76,10 @@
     return (UINT)id;
 }
 
-void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
+void BucketManager::PrintBucket(
+    FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
 {
-    const char *arrows[] = {
+    const char* arrows[] = {
         "",
         "|-> ",
         "    |-> ",
@@ -99,7 +100,7 @@
     // compute average cycle count per invocation
     uint64_t CPE = bucket.elapsed / bucket.count;
 
-    BUCKET_DESC &desc = mBuckets[bucket.id];
+    BUCKET_DESC& desc = mBuckets[bucket.id];
 
     // construct hierarchy visualization
     char hier[80];
@@ -107,16 +108,16 @@
     strcat(hier, desc.name.c_str());
 
     // print out
-    fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", 
-        percentTotal, 
-        percentParent, 
-        bucket.elapsed, 
-        CPE, 
-        bucket.count, 
-        (unsigned long)0, 
-        (uint32_t)0, 
-        hier
-    );
+    fprintf(f,
+            "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n",
+            percentTotal,
+            percentParent,
+            bucket.elapsed,
+            CPE,
+            bucket.count,
+            (unsigned long)0,
+            (uint32_t)0,
+            hier);
 
     // dump all children of this bucket
     for (const BUCKET& child : bucket.children)
@@ -135,8 +136,8 @@
     fprintf(f, " %%Tot   %%Par  Cycles     CPE        NumEvent   CPE2       NumEvent2  Bucket\n");
 
     // compute thread level total cycle counts across all buckets from root
-    const BUCKET& root = thread.root;
-    uint64_t totalCycles = 0;
+    const BUCKET& root        = thread.root;
+    uint64_t      totalCycles = 0;
     for (const BUCKET& child : root.children)
     {
         totalCycles += child.elapsed;
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index 48042ac..bbc9538 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file rdtsc_buckets.h
-* 
-* @brief declaration for rdtsc buckets.
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rdtsc_buckets.h
+ *
+ * @brief declaration for rdtsc buckets.
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "os.h"
@@ -48,7 +48,7 @@
 class BucketManager
 {
 public:
-    BucketManager() { }
+    BucketManager() {}
     ~BucketManager();
 
     // removes all registered thread data
@@ -112,7 +112,8 @@
     // @param id generated by RegisterBucket
     INLINE void StartBucket(UINT id)
     {
-        if (!mCapturing) return;
+        if (!mCapturing)
+            return;
 
         SWR_ASSERT(tlsThreadId < mThreads.size());
 
@@ -125,10 +126,10 @@
             {
                 bt.pCurrent->children.resize(mBuckets.size());
             }
-            BUCKET &child = bt.pCurrent->children[id];
+            BUCKET& child = bt.pCurrent->children[id];
             child.pParent = bt.pCurrent;
-            child.id = id;
-            child.start = tsc;
+            child.id      = id;
+            child.start   = tsc;
 
             // update thread's currently executing bucket
             bt.pCurrent = &child;
@@ -142,7 +143,7 @@
     INLINE void StopBucket(UINT id)
     {
         SWR_ASSERT(tlsThreadId < mThreads.size());
-        BUCKET_THREAD &bt = mThreads[tlsThreadId];
+        BUCKET_THREAD& bt = mThreads[tlsThreadId];
 
         if (bt.level == 0)
         {
@@ -152,7 +153,8 @@
         uint64_t tsc = __rdtsc();
 
         {
-            if (bt.pCurrent->start == 0) return;
+            if (bt.pCurrent->start == 0)
+                return;
             SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected");
 
             bt.pCurrent->elapsed += (tsc - bt.pCurrent->start);
@@ -167,7 +169,8 @@
 
     INLINE void AddEvent(uint32_t id, uint32_t count)
     {
-        if (!mCapturing) return;
+        if (!mCapturing)
+            return;
 
         SWR_ASSERT(tlsThreadId < mThreads.size());
 
@@ -179,15 +182,16 @@
             {
                 bt.pCurrent->children.resize(mBuckets.size());
             }
-            BUCKET &child = bt.pCurrent->children[id];
+            BUCKET& child = bt.pCurrent->children[id];
             child.pParent = bt.pCurrent;
-            child.id = id;
+            child.id      = id;
             child.count += count;
         }
     }
 
 private:
-    void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
+    void PrintBucket(
+        FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
     void PrintThread(FILE* f, const BUCKET_THREAD& thread);
 
     // list of active threads that have registered with this manager
@@ -197,10 +201,10 @@
     std::vector<BUCKET_DESC> mBuckets;
 
     // is capturing currently enabled
-    volatile bool mCapturing{ false };
+    volatile bool mCapturing{false};
 
     // has capturing completed
-    volatile bool mDoneCapturing{ false };
+    volatile bool mDoneCapturing{false};
 
     std::mutex mThreadMutex;
 
@@ -208,7 +212,6 @@
 
 };
 
-
 // C helpers for jitter
 void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
 void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
index f6e75cd..fd3b1df 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file rdtsc_buckets.h
-* 
-* @brief declaration for rdtsc buckets.
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rdtsc_buckets_shared.h
+ *
+ * @brief declaration for rdtsc buckets.
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <vector>
@@ -34,12 +34,12 @@
 
 struct BUCKET
 {
-    uint32_t id{ 0 };
-    uint64_t start{ 0 };
-    uint64_t elapsed{ 0 };
-    uint32_t count{ 0 };
+    uint32_t id{0};
+    uint64_t start{0};
+    uint64_t elapsed{0};
+    uint32_t count{0};
 
-    BUCKET* pParent{ nullptr };
+    BUCKET*             pParent{nullptr};
     std::vector<BUCKET> children;
 };
 
@@ -65,29 +65,29 @@
     std::string name;
 
     // id for this thread, assigned by the thread manager
-    uint32_t id{ 0 };
+    uint32_t id{0};
 
     // root of the bucket hierarchy for this thread
     BUCKET root;
 
     // currently executing bucket somewhere in the hierarchy
-    BUCKET* pCurrent{ nullptr };
+    BUCKET* pCurrent{nullptr};
 
     // currently executing hierarchy level
-    uint32_t level{ 0 };
+    uint32_t level{0};
 
     // threadviz file object
-    FILE* vizFile{ nullptr };
+    FILE* vizFile{nullptr};
 
 
     BUCKET_THREAD() {}
     BUCKET_THREAD(const BUCKET_THREAD& that)
     {
-        name = that.name;
-        id = that.id;
-        root = that.root;
+        name     = that.name;
+        id       = that.id;
+        root     = that.root;
         pCurrent = &root;
-        vizFile = that.vizFile;
+        vizFile  = that.vizFile;
     }
 };
 
@@ -100,14 +100,14 @@
 
 struct VIZ_START_DATA
 {
-    uint8_t type;
+    uint8_t  type;
     uint32_t bucketId;
     uint64_t timestamp;
 };
 
 struct VIZ_STOP_DATA
 {
-    uint8_t type;
+    uint8_t  type;
     uint64_t timestamp;
 };
 
@@ -144,7 +144,7 @@
 
 inline void Deserialize(FILE* f, std::string& string)
 {
-    char cstr[256];
+    char    cstr[256];
     uint8_t length;
     fread(&length, sizeof(length), 1, f);
     fread(cstr, length, 1, f);
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index 98a8b9b..b08fb2e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #ifndef __SWR_SIMD16INTRIN_H__
 #define __SWR_SIMD16INTRIN_H__
@@ -27,144 +27,146 @@
 #if ENABLE_AVX512_SIMD16
 
 #if KNOB_SIMD16_WIDTH == 16
-typedef SIMD512                             SIMD16;
+typedef SIMD512 SIMD16;
 #else
 #error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
+#endif // KNOB_SIMD16_WIDTH == 16
 
-#define _simd16_setzero_ps                  SIMD16::setzero_ps
-#define _simd16_setzero_si                  SIMD16::setzero_si
-#define _simd16_set1_ps                     SIMD16::set1_ps
-#define _simd16_set1_epi8                   SIMD16::set1_epi8
-#define _simd16_set1_epi32                  SIMD16::set1_epi32
-#define _simd16_set_ps                      SIMD16::set_ps
-#define _simd16_set_epi32                   SIMD16::set_epi32
-#define _simd16_load_ps                     SIMD16::load_ps
-#define _simd16_loadu_ps                    SIMD16::loadu_ps
-#if 1                                       
-#define _simd16_load1_ps                    SIMD16::broadcast_ss
-#endif                                      
-#define _simd16_load_si                     SIMD16::load_si
-#define _simd16_loadu_si                    SIMD16::loadu_si
-#define _simd16_broadcast_ss(m)             SIMD16::broadcast_ss((float const*)m)
-#define _simd16_store_ps                    SIMD16::store_ps
-#define _simd16_store_si                    SIMD16::store_si
-#define _simd16_extract_ps(a, imm8)         SIMD16::extract_ps<imm8>(a)
-#define _simd16_extract_si(a, imm8)         SIMD16::extract_si<imm8>(a)
-#define _simd16_insert_ps(a, b, imm8)       SIMD16::insert_ps<imm8>(a, b)
-#define _simd16_insert_si(a, b, imm8)       SIMD16::insert_si<imm8>(a, b)
-#define _simd16_maskstore_ps                SIMD16::maskstore_ps
-#define _simd16_blend_ps(a, b, mask)        SIMD16::blend_ps<mask>(a, b)
-#define _simd16_blendv_ps                   SIMD16::blendv_ps
-#define _simd16_blendv_epi32                SIMD16::blendv_epi32
-#define _simd16_mul_ps                      SIMD16::mul_ps
-#define _simd16_div_ps                      SIMD16::div_ps
-#define _simd16_add_ps                      SIMD16::add_ps
-#define _simd16_sub_ps                      SIMD16::sub_ps
-#define _simd16_rsqrt_ps                    SIMD16::rsqrt_ps
-#define _simd16_min_ps                      SIMD16::min_ps
-#define _simd16_max_ps                      SIMD16::max_ps
-#define _simd16_movemask_ps                 SIMD16::movemask_ps
-#define _simd16_movemask_pd                 SIMD16::movemask_pd
-#define _simd16_cvtps_epi32                 SIMD16::cvtps_epi32
-#define _simd16_cvttps_epi32                SIMD16::cvttps_epi32
-#define _simd16_cvtepi32_ps                 SIMD16::cvtepi32_ps
-#define _simd16_cmp_ps(a, b, comp)          SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
-#define _simd16_cmplt_ps                    SIMD16::cmplt_ps
-#define _simd16_cmpgt_ps                    SIMD16::cmpgt_ps
-#define _simd16_cmpneq_ps                   SIMD16::cmpneq_ps
-#define _simd16_cmpeq_ps                    SIMD16::cmpeq_ps
-#define _simd16_cmpge_ps                    SIMD16::cmpge_ps
-#define _simd16_cmple_ps                    SIMD16::cmple_ps
-#define _simd16_castsi_ps                   SIMD16::castsi_ps
-#define _simd16_castps_si                   SIMD16::castps_si
-#define _simd16_castsi_pd                   SIMD16::castsi_pd
-#define _simd16_castpd_si                   SIMD16::castpd_si
-#define _simd16_castpd_ps                   SIMD16::castpd_ps
-#define _simd16_castps_pd                   SIMD16::castps_pd
-#define _simd16_and_ps                      SIMD16::and_ps
-#define _simd16_andnot_ps                   SIMD16::andnot_ps
-#define _simd16_or_ps                       SIMD16::or_ps
-#define _simd16_xor_ps                      SIMD16::xor_ps
-#define _simd16_round_ps(a, mode)           SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
-#define _simd16_mul_epi32                   SIMD16::mul_epi32
-#define _simd16_mullo_epi32                 SIMD16::mullo_epi32
-#define _simd16_sub_epi32                   SIMD16::sub_epi32
-#define _simd16_sub_epi64                   SIMD16::sub_epi64
-#define _simd16_min_epi32                   SIMD16::min_epi32
-#define _simd16_max_epi32                   SIMD16::max_epi32
-#define _simd16_min_epu32                   SIMD16::min_epu32
-#define _simd16_max_epu32                   SIMD16::max_epu32
-#define _simd16_add_epi32                   SIMD16::add_epi32
-#define _simd16_and_si                      SIMD16::and_si
-#define _simd16_andnot_si                   SIMD16::andnot_si
-#define _simd16_or_si                       SIMD16::or_si
-#define _simd16_xor_si                      SIMD16::xor_si
-#define _simd16_cmpeq_epi32                 SIMD16::cmpeq_epi32
-#define _simd16_cmpgt_epi32                 SIMD16::cmpgt_epi32
-#define _simd16_cmplt_epi32                 SIMD16::cmplt_epi32
-#define _simd16_testz_ps                    SIMD16::testz_ps
-#define _simd16_unpacklo_ps                 SIMD16::unpacklo_ps
-#define _simd16_unpackhi_ps                 SIMD16::unpackhi_ps
-#define _simd16_unpacklo_pd                 SIMD16::unpacklo_pd
-#define _simd16_unpackhi_pd                 SIMD16::unpackhi_pd
-#define _simd16_unpacklo_epi8               SIMD16::unpacklo_epi8
-#define _simd16_unpackhi_epi8               SIMD16::unpackhi_epi8
-#define _simd16_unpacklo_epi16              SIMD16::unpacklo_epi16
-#define _simd16_unpackhi_epi16              SIMD16::unpackhi_epi16
-#define _simd16_unpacklo_epi32              SIMD16::unpacklo_epi32
-#define _simd16_unpackhi_epi32              SIMD16::unpackhi_epi32
-#define _simd16_unpacklo_epi64              SIMD16::unpacklo_epi64
-#define _simd16_unpackhi_epi64              SIMD16::unpackhi_epi64
-#define _simd16_slli_epi32(a, i)            SIMD16::slli_epi32<i>(a)
-#define _simd16_srli_epi32(a, i)            SIMD16::srli_epi32<i>(a)
-#define _simd16_srai_epi32(a, i)            SIMD16::srai_epi32<i>(a)
-#define _simd16_fmadd_ps                    SIMD16::fmadd_ps
-#define _simd16_fmsub_ps                    SIMD16::fmsub_ps
-#define _simd16_adds_epu8                   SIMD16::adds_epu8
-#define _simd16_subs_epu8                   SIMD16::subs_epu8
-#define _simd16_add_epi8                    SIMD16::add_epi8
-#define _simd16_shuffle_epi8                SIMD16::shuffle_epi8
+#define _simd16_setzero_ps SIMD16::setzero_ps
+#define _simd16_setzero_si SIMD16::setzero_si
+#define _simd16_set1_ps SIMD16::set1_ps
+#define _simd16_set1_epi8 SIMD16::set1_epi8
+#define _simd16_set1_epi32 SIMD16::set1_epi32
+#define _simd16_set_ps SIMD16::set_ps
+#define _simd16_set_epi32 SIMD16::set_epi32
+#define _simd16_load_ps SIMD16::load_ps
+#define _simd16_loadu_ps SIMD16::loadu_ps
+#if 1
+#define _simd16_load1_ps SIMD16::broadcast_ss
+#endif
+#define _simd16_load_si SIMD16::load_si
+#define _simd16_loadu_si SIMD16::loadu_si
+#define _simd16_broadcast_ss(m) SIMD16::broadcast_ss((float const*)m)
+#define _simd16_store_ps SIMD16::store_ps
+#define _simd16_store_si SIMD16::store_si
+#define _simd16_extract_ps(a, imm8) SIMD16::extract_ps<imm8>(a)
+#define _simd16_extract_si(a, imm8) SIMD16::extract_si<imm8>(a)
+#define _simd16_insert_ps(a, b, imm8) SIMD16::insert_ps<imm8>(a, b)
+#define _simd16_insert_si(a, b, imm8) SIMD16::insert_si<imm8>(a, b)
+#define _simd16_maskstore_ps SIMD16::maskstore_ps
+#define _simd16_blend_ps(a, b, mask) SIMD16::blend_ps<mask>(a, b)
+#define _simd16_blendv_ps SIMD16::blendv_ps
+#define _simd16_blendv_epi32 SIMD16::blendv_epi32
+#define _simd16_mul_ps SIMD16::mul_ps
+#define _simd16_div_ps SIMD16::div_ps
+#define _simd16_add_ps SIMD16::add_ps
+#define _simd16_sub_ps SIMD16::sub_ps
+#define _simd16_rsqrt_ps SIMD16::rsqrt_ps
+#define _simd16_min_ps SIMD16::min_ps
+#define _simd16_max_ps SIMD16::max_ps
+#define _simd16_movemask_ps SIMD16::movemask_ps
+#define _simd16_movemask_pd SIMD16::movemask_pd
+#define _simd16_cvtps_epi32 SIMD16::cvtps_epi32
+#define _simd16_cvttps_epi32 SIMD16::cvttps_epi32
+#define _simd16_cvtepi32_ps SIMD16::cvtepi32_ps
+#define _simd16_cmp_ps(a, b, comp) SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
+#define _simd16_cmplt_ps SIMD16::cmplt_ps
+#define _simd16_cmpgt_ps SIMD16::cmpgt_ps
+#define _simd16_cmpneq_ps SIMD16::cmpneq_ps
+#define _simd16_cmpeq_ps SIMD16::cmpeq_ps
+#define _simd16_cmpge_ps SIMD16::cmpge_ps
+#define _simd16_cmple_ps SIMD16::cmple_ps
+#define _simd16_castsi_ps SIMD16::castsi_ps
+#define _simd16_castps_si SIMD16::castps_si
+#define _simd16_castsi_pd SIMD16::castsi_pd
+#define _simd16_castpd_si SIMD16::castpd_si
+#define _simd16_castpd_ps SIMD16::castpd_ps
+#define _simd16_castps_pd SIMD16::castps_pd
+#define _simd16_and_ps SIMD16::and_ps
+#define _simd16_andnot_ps SIMD16::andnot_ps
+#define _simd16_or_ps SIMD16::or_ps
+#define _simd16_xor_ps SIMD16::xor_ps
+#define _simd16_round_ps(a, mode) SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
+#define _simd16_mul_epi32 SIMD16::mul_epi32
+#define _simd16_mullo_epi32 SIMD16::mullo_epi32
+#define _simd16_sub_epi32 SIMD16::sub_epi32
+#define _simd16_sub_epi64 SIMD16::sub_epi64
+#define _simd16_min_epi32 SIMD16::min_epi32
+#define _simd16_max_epi32 SIMD16::max_epi32
+#define _simd16_min_epu32 SIMD16::min_epu32
+#define _simd16_max_epu32 SIMD16::max_epu32
+#define _simd16_add_epi32 SIMD16::add_epi32
+#define _simd16_and_si SIMD16::and_si
+#define _simd16_andnot_si SIMD16::andnot_si
+#define _simd16_or_si SIMD16::or_si
+#define _simd16_xor_si SIMD16::xor_si
+#define _simd16_cmpeq_epi32 SIMD16::cmpeq_epi32
+#define _simd16_cmpgt_epi32 SIMD16::cmpgt_epi32
+#define _simd16_cmplt_epi32 SIMD16::cmplt_epi32
+#define _simd16_testz_ps SIMD16::testz_ps
+#define _simd16_unpacklo_ps SIMD16::unpacklo_ps
+#define _simd16_unpackhi_ps SIMD16::unpackhi_ps
+#define _simd16_unpacklo_pd SIMD16::unpacklo_pd
+#define _simd16_unpackhi_pd SIMD16::unpackhi_pd
+#define _simd16_unpacklo_epi8 SIMD16::unpacklo_epi8
+#define _simd16_unpackhi_epi8 SIMD16::unpackhi_epi8
+#define _simd16_unpacklo_epi16 SIMD16::unpacklo_epi16
+#define _simd16_unpackhi_epi16 SIMD16::unpackhi_epi16
+#define _simd16_unpacklo_epi32 SIMD16::unpacklo_epi32
+#define _simd16_unpackhi_epi32 SIMD16::unpackhi_epi32
+#define _simd16_unpacklo_epi64 SIMD16::unpacklo_epi64
+#define _simd16_unpackhi_epi64 SIMD16::unpackhi_epi64
+#define _simd16_slli_epi32(a, i) SIMD16::slli_epi32<i>(a)
+#define _simd16_srli_epi32(a, i) SIMD16::srli_epi32<i>(a)
+#define _simd16_srai_epi32(a, i) SIMD16::srai_epi32<i>(a)
+#define _simd16_fmadd_ps SIMD16::fmadd_ps
+#define _simd16_fmsub_ps SIMD16::fmsub_ps
+#define _simd16_adds_epu8 SIMD16::adds_epu8
+#define _simd16_subs_epu8 SIMD16::subs_epu8
+#define _simd16_add_epi8 SIMD16::add_epi8
+#define _simd16_shuffle_epi8 SIMD16::shuffle_epi8
 
-#define _simd16_i32gather_ps(m, index, scale)               SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
+#define _simd16_i32gather_ps(m, index, scale) \
+    SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(m, index)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) \
+    SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
 
-#define _simd16_abs_epi32                   SIMD16::abs_epi32
+#define _simd16_abs_epi32 SIMD16::abs_epi32
 
-#define _simd16_cmpeq_epi64                 SIMD16::cmpeq_epi64
-#define _simd16_cmpgt_epi64                 SIMD16::cmpgt_epi64
-#define _simd16_cmpeq_epi16                 SIMD16::cmpeq_epi16
-#define _simd16_cmpgt_epi16                 SIMD16::cmpgt_epi16
-#define _simd16_cmpeq_epi8                  SIMD16::cmpeq_epi8
-#define _simd16_cmpgt_epi8                  SIMD16::cmpgt_epi8
+#define _simd16_cmpeq_epi64 SIMD16::cmpeq_epi64
+#define _simd16_cmpgt_epi64 SIMD16::cmpgt_epi64
+#define _simd16_cmpeq_epi16 SIMD16::cmpeq_epi16
+#define _simd16_cmpgt_epi16 SIMD16::cmpgt_epi16
+#define _simd16_cmpeq_epi8 SIMD16::cmpeq_epi8
+#define _simd16_cmpgt_epi8 SIMD16::cmpgt_epi8
 
-#define _simd16_permute_ps_i(a, i)          SIMD16::permute_ps<i>(a)
-#define _simd16_permute_ps                  SIMD16::permute_ps
-#define _simd16_permute_epi32               SIMD16::permute_epi32
-#define _simd16_sllv_epi32                  SIMD16::sllv_epi32
-#define _simd16_srlv_epi32                  SIMD16::sllv_epi32
-#define _simd16_permute2f128_ps(a, b, i)    SIMD16::permute2f128_ps<i>(a, b)
-#define _simd16_permute2f128_pd(a, b, i)    SIMD16::permute2f128_pd<i>(a, b)
-#define _simd16_permute2f128_si(a, b, i)    SIMD16::permute2f128_si<i>(a, b)
-#define _simd16_shuffle_ps(a, b, i)         SIMD16::shuffle_ps<i>(a, b)
-#define _simd16_shuffle_pd(a, b, i)         SIMD16::shuffle_pd<i>(a, b)
-#define _simd16_shuffle_epi32(a, b, imm8)   SIMD16::shuffle_epi32<imm8>(a, b)
-#define _simd16_shuffle_epi64(a, b, imm8)   SIMD16::shuffle_epi64<imm8>(a, b)
-#define _simd16_cvtepu8_epi16               SIMD16::cvtepu8_epi16
-#define _simd16_cvtepu8_epi32               SIMD16::cvtepu8_epi32
-#define _simd16_cvtepu16_epi32              SIMD16::cvtepu16_epi32
-#define _simd16_cvtepu16_epi64              SIMD16::cvtepu16_epi64
-#define _simd16_cvtepu32_epi64              SIMD16::cvtepu32_epi64
-#define _simd16_packus_epi16                SIMD16::packus_epi16
-#define _simd16_packs_epi16                 SIMD16::packs_epi16
-#define _simd16_packus_epi32                SIMD16::packus_epi32
-#define _simd16_packs_epi32                 SIMD16::packs_epi32
-#define _simd16_cmplt_ps_mask               SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
-#define _simd16_cmpeq_ps_mask               SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
-#define _simd16_int2mask(mask)              simd16mask(mask)
-#define _simd16_mask2int(mask)              int(mask)
-#define _simd16_vmask_ps                    SIMD16::vmask_ps
+#define _simd16_permute_ps_i(a, i) SIMD16::permute_ps<i>(a)
+#define _simd16_permute_ps SIMD16::permute_ps
+#define _simd16_permute_epi32 SIMD16::permute_epi32
+#define _simd16_sllv_epi32 SIMD16::sllv_epi32
+#define _simd16_srlv_epi32 SIMD16::srlv_epi32 // per-lane variable logical RIGHT shift (was sllv)
+#define _simd16_permute2f128_ps(a, b, i) SIMD16::permute2f128_ps<i>(a, b)
+#define _simd16_permute2f128_pd(a, b, i) SIMD16::permute2f128_pd<i>(a, b)
+#define _simd16_permute2f128_si(a, b, i) SIMD16::permute2f128_si<i>(a, b)
+#define _simd16_shuffle_ps(a, b, i) SIMD16::shuffle_ps<i>(a, b)
+#define _simd16_shuffle_pd(a, b, i) SIMD16::shuffle_pd<i>(a, b)
+#define _simd16_shuffle_epi32(a, b, imm8) SIMD16::shuffle_epi32<imm8>(a, b)
+#define _simd16_shuffle_epi64(a, b, imm8) SIMD16::shuffle_epi64<imm8>(a, b)
+#define _simd16_cvtepu8_epi16 SIMD16::cvtepu8_epi16
+#define _simd16_cvtepu8_epi32 SIMD16::cvtepu8_epi32
+#define _simd16_cvtepu16_epi32 SIMD16::cvtepu16_epi32
+#define _simd16_cvtepu16_epi64 SIMD16::cvtepu16_epi64
+#define _simd16_cvtepu32_epi64 SIMD16::cvtepu32_epi64
+#define _simd16_packus_epi16 SIMD16::packus_epi16
+#define _simd16_packs_epi16 SIMD16::packs_epi16
+#define _simd16_packus_epi32 SIMD16::packus_epi32
+#define _simd16_packs_epi32 SIMD16::packs_epi32
+#define _simd16_cmplt_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
+#define _simd16_cmpeq_ps_mask SIMD16::cmp_ps_mask<SIMD16::CompareType::EQ_OQ>
+#define _simd16_int2mask(mask) simd16mask(mask)
+#define _simd16_mask2int(mask) int(mask)
+#define _simd16_vmask_ps SIMD16::vmask_ps
 
-#endif//ENABLE_AVX512_SIMD16
+#endif // ENABLE_AVX512_SIMD16
 
-#endif//__SWR_SIMD16INTRIN_H_
+#endif // __SWR_SIMD16INTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index b1471a9..8ffda3f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #ifndef __SWR_SIMDINTRIN_H__
 #define __SWR_SIMDINTRIN_H__
@@ -28,176 +28,177 @@
 #include "common/simdlib.hpp"
 
 #if KNOB_SIMD_WIDTH == 8
-typedef SIMD256                             SIMD;
+typedef SIMD256 SIMD;
 #else
 #error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
+#endif // KNOB_SIMD_WIDTH == 8
 
+#define _simd128_maskstore_ps SIMD128::maskstore_ps
+#define _simd128_fmadd_ps SIMD128::fmadd_ps
 
-#define _simd128_maskstore_ps               SIMD128::maskstore_ps
-#define _simd128_fmadd_ps                   SIMD128::fmadd_ps
+#define _simd_load_ps SIMD::load_ps
+#define _simd_load1_ps SIMD::broadcast_ss
+#define _simd_loadu_ps SIMD::loadu_ps
+#define _simd_setzero_ps SIMD::setzero_ps
+#define _simd_set1_ps SIMD::set1_ps
+#define _simd_blend_ps(a, b, i) SIMD::blend_ps<i>(a, b)
+#define _simd_blend_epi32(a, b, i) SIMD::blend_epi32<i>(a, b)
+#define _simd_blendv_ps SIMD::blendv_ps
+#define _simd_store_ps SIMD::store_ps
+#define _simd_mul_ps SIMD::mul_ps
+#define _simd_add_ps SIMD::add_ps
+#define _simd_sub_ps SIMD::sub_ps
+#define _simd_rsqrt_ps SIMD::rsqrt_ps
+#define _simd_min_ps SIMD::min_ps
+#define _simd_max_ps SIMD::max_ps
+#define _simd_movemask_ps SIMD::movemask_ps
+#define _simd_cvtps_epi32 SIMD::cvtps_epi32
+#define _simd_cvttps_epi32 SIMD::cvttps_epi32
+#define _simd_cvtepi32_ps SIMD::cvtepi32_ps
+#define _simd_cmplt_ps SIMD::cmplt_ps
+#define _simd_cmpgt_ps SIMD::cmpgt_ps
+#define _simd_cmpneq_ps SIMD::cmpneq_ps
+#define _simd_cmpeq_ps SIMD::cmpeq_ps
+#define _simd_cmpge_ps SIMD::cmpge_ps
+#define _simd_cmple_ps SIMD::cmple_ps
+#define _simd_cmp_ps(a, b, imm) SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
+#define _simd_and_ps SIMD::and_ps
+#define _simd_or_ps SIMD::or_ps
+#define _simd_rcp_ps SIMD::rcp_ps
+#define _simd_div_ps SIMD::div_ps
+#define _simd_castsi_ps SIMD::castsi_ps
+#define _simd_castps_pd SIMD::castps_pd
+#define _simd_castpd_ps SIMD::castpd_ps
+#define _simd_andnot_ps SIMD::andnot_ps
+#define _simd_round_ps(a, i) SIMD::round_ps<SIMD::RoundMode(i)>(a)
+#define _simd_castpd_ps SIMD::castpd_ps
+#define _simd_broadcast_ps(a) SIMD::broadcast_ps((SIMD128::Float const*)(a))
+#define _simd_stream_ps SIMD::stream_ps
 
-#define _simd_load_ps                       SIMD::load_ps
-#define _simd_load1_ps                      SIMD::broadcast_ss
-#define _simd_loadu_ps                      SIMD::loadu_ps
-#define _simd_setzero_ps                    SIMD::setzero_ps
-#define _simd_set1_ps                       SIMD::set1_ps
-#define _simd_blend_ps(a, b, i)             SIMD::blend_ps<i>(a, b)
-#define _simd_blend_epi32(a, b, i)          SIMD::blend_epi32<i>(a, b)
-#define _simd_blendv_ps                     SIMD::blendv_ps
-#define _simd_store_ps                      SIMD::store_ps
-#define _simd_mul_ps                        SIMD::mul_ps
-#define _simd_add_ps                        SIMD::add_ps
-#define _simd_sub_ps                        SIMD::sub_ps
-#define _simd_rsqrt_ps                      SIMD::rsqrt_ps
-#define _simd_min_ps                        SIMD::min_ps
-#define _simd_max_ps                        SIMD::max_ps
-#define _simd_movemask_ps                   SIMD::movemask_ps
-#define _simd_cvtps_epi32                   SIMD::cvtps_epi32
-#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
-#define _simd_cvtepi32_ps                   SIMD::cvtepi32_ps
-#define _simd_cmplt_ps                      SIMD::cmplt_ps
-#define _simd_cmpgt_ps                      SIMD::cmpgt_ps
-#define _simd_cmpneq_ps                     SIMD::cmpneq_ps
-#define _simd_cmpeq_ps                      SIMD::cmpeq_ps
-#define _simd_cmpge_ps                      SIMD::cmpge_ps
-#define _simd_cmple_ps                      SIMD::cmple_ps
-#define _simd_cmp_ps(a, b, imm)             SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
-#define _simd_and_ps                        SIMD::and_ps
-#define _simd_or_ps                         SIMD::or_ps
-#define _simd_rcp_ps                        SIMD::rcp_ps
-#define _simd_div_ps                        SIMD::div_ps
-#define _simd_castsi_ps                     SIMD::castsi_ps
-#define _simd_castps_pd                     SIMD::castps_pd
-#define _simd_castpd_ps                     SIMD::castpd_ps
-#define _simd_andnot_ps                     SIMD::andnot_ps
-#define _simd_round_ps(a, i)                SIMD::round_ps<SIMD::RoundMode(i)>(a)
-#define _simd_castpd_ps                     SIMD::castpd_ps
-#define _simd_broadcast_ps(a)               SIMD::broadcast_ps((SIMD128::Float const *)(a))
-#define _simd_stream_ps                     SIMD::stream_ps
+#define _simd_movemask_pd SIMD::movemask_pd
+#define _simd_castsi_pd SIMD::castsi_pd
 
-#define _simd_movemask_pd                   SIMD::movemask_pd
-#define _simd_castsi_pd                     SIMD::castsi_pd
+#define _simd_mul_epi32 SIMD::mul_epi32
+#define _simd_mullo_epi32 SIMD::mullo_epi32
+#define _simd_sub_epi32 SIMD::sub_epi32
+#define _simd_sub_epi64 SIMD::sub_epi64
+#define _simd_min_epi32 SIMD::min_epi32
+#define _simd_min_epu32 SIMD::min_epu32
+#define _simd_max_epi32 SIMD::max_epi32
+#define _simd_max_epu32 SIMD::max_epu32
+#define _simd_add_epi32 SIMD::add_epi32
+#define _simd_and_si SIMD::and_si
+#define _simd_andnot_si SIMD::andnot_si
+#define _simd_cmpeq_epi32 SIMD::cmpeq_epi32
+#define _simd_cmplt_epi32 SIMD::cmplt_epi32
+#define _simd_cmpgt_epi32 SIMD::cmpgt_epi32
+#define _simd_or_si SIMD::or_si
+#define _simd_xor_si SIMD::xor_si
+#define _simd_castps_si SIMD::castps_si
+#define _simd_adds_epu8 SIMD::adds_epu8
+#define _simd_subs_epu8 SIMD::subs_epu8
+#define _simd_add_epi8 SIMD::add_epi8
+#define _simd_cmpeq_epi64 SIMD::cmpeq_epi64
+#define _simd_cmpgt_epi64 SIMD::cmpgt_epi64
+#define _simd_cmpgt_epi8 SIMD::cmpgt_epi8
+#define _simd_cmpeq_epi8 SIMD::cmpeq_epi8
+#define _simd_cmpgt_epi16 SIMD::cmpgt_epi16
+#define _simd_cmpeq_epi16 SIMD::cmpeq_epi16
+#define _simd_movemask_epi8 SIMD::movemask_epi8
+#define _simd_permute_ps_i(a, i) SIMD::permute_ps<i>(a)
+#define _simd_permute_ps SIMD::permute_ps
+#define _simd_permute_epi32 SIMD::permute_epi32
+#define _simd_srlv_epi32 SIMD::srlv_epi32
+#define _simd_sllv_epi32 SIMD::sllv_epi32
 
-#define _simd_mul_epi32                     SIMD::mul_epi32
-#define _simd_mullo_epi32                   SIMD::mullo_epi32
-#define _simd_sub_epi32                     SIMD::sub_epi32
-#define _simd_sub_epi64                     SIMD::sub_epi64
-#define _simd_min_epi32                     SIMD::min_epi32
-#define _simd_min_epu32                     SIMD::min_epu32
-#define _simd_max_epi32                     SIMD::max_epi32
-#define _simd_max_epu32                     SIMD::max_epu32
-#define _simd_add_epi32                     SIMD::add_epi32
-#define _simd_and_si                        SIMD::and_si
-#define _simd_andnot_si                     SIMD::andnot_si
-#define _simd_cmpeq_epi32                   SIMD::cmpeq_epi32
-#define _simd_cmplt_epi32                   SIMD::cmplt_epi32
-#define _simd_cmpgt_epi32                   SIMD::cmpgt_epi32
-#define _simd_or_si                         SIMD::or_si
-#define _simd_xor_si                        SIMD::xor_si
-#define _simd_castps_si                     SIMD::castps_si
-#define _simd_adds_epu8                     SIMD::adds_epu8
-#define _simd_subs_epu8                     SIMD::subs_epu8
-#define _simd_add_epi8                      SIMD::add_epi8
-#define _simd_cmpeq_epi64                   SIMD::cmpeq_epi64
-#define _simd_cmpgt_epi64                   SIMD::cmpgt_epi64
-#define _simd_cmpgt_epi8                    SIMD::cmpgt_epi8
-#define _simd_cmpeq_epi8                    SIMD::cmpeq_epi8
-#define _simd_cmpgt_epi16                   SIMD::cmpgt_epi16
-#define _simd_cmpeq_epi16                   SIMD::cmpeq_epi16
-#define _simd_movemask_epi8                 SIMD::movemask_epi8
-#define _simd_permute_ps_i(a, i)            SIMD::permute_ps<i>(a)
-#define _simd_permute_ps                    SIMD::permute_ps
-#define _simd_permute_epi32                 SIMD::permute_epi32
-#define _simd_srlv_epi32                    SIMD::srlv_epi32
-#define _simd_sllv_epi32                    SIMD::sllv_epi32
+#define _simd_unpacklo_epi8 SIMD::unpacklo_epi8
+#define _simd_unpackhi_epi8 SIMD::unpackhi_epi8
+#define _simd_unpacklo_epi16 SIMD::unpacklo_epi16
+#define _simd_unpackhi_epi16 SIMD::unpackhi_epi16
+#define _simd_unpacklo_epi32 SIMD::unpacklo_epi32
+#define _simd_unpackhi_epi32 SIMD::unpackhi_epi32
+#define _simd_unpacklo_epi64 SIMD::unpacklo_epi64
+#define _simd_unpackhi_epi64 SIMD::unpackhi_epi64
 
-#define _simd_unpacklo_epi8                 SIMD::unpacklo_epi8
-#define _simd_unpackhi_epi8                 SIMD::unpackhi_epi8
-#define _simd_unpacklo_epi16                SIMD::unpacklo_epi16
-#define _simd_unpackhi_epi16                SIMD::unpackhi_epi16
-#define _simd_unpacklo_epi32                SIMD::unpacklo_epi32
-#define _simd_unpackhi_epi32                SIMD::unpackhi_epi32
-#define _simd_unpacklo_epi64                SIMD::unpacklo_epi64
-#define _simd_unpackhi_epi64                SIMD::unpackhi_epi64
+#define _simd_slli_epi32(a, i) SIMD::slli_epi32<i>(a)
+#define _simd_srai_epi32(a, i) SIMD::srai_epi32<i>(a)
+#define _simd_srli_epi32(a, i) SIMD::srli_epi32<i>(a)
+#define _simd_srlisi_ps(a, i) SIMD::srlisi_ps<i>(a)
 
-#define _simd_slli_epi32(a,i)               SIMD::slli_epi32<i>(a)
-#define _simd_srai_epi32(a,i)               SIMD::srai_epi32<i>(a)
-#define _simd_srli_epi32(a,i)               SIMD::srli_epi32<i>(a)
-#define _simd_srlisi_ps(a,i)                SIMD::srlisi_ps<i>(a)
+#define _simd_fmadd_ps SIMD::fmadd_ps
+#define _simd_fmsub_ps SIMD::fmsub_ps
+#define _simd_shuffle_epi8 SIMD::shuffle_epi8
 
-#define _simd_fmadd_ps                      SIMD::fmadd_ps
-#define _simd_fmsub_ps                      SIMD::fmsub_ps
-#define _simd_shuffle_epi8                  SIMD::shuffle_epi8
+#define _simd_i32gather_ps(p, o, s) SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
+#define _simd_mask_i32gather_ps(r, p, o, m, s) \
+    SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
+#define _simd_abs_epi32 SIMD::abs_epi32
 
-#define _simd_i32gather_ps(p, o, s)         SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
-#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
-#define _simd_abs_epi32                     SIMD::abs_epi32
+#define _simd_cvtepu8_epi16 SIMD::cvtepu8_epi16
+#define _simd_cvtepu8_epi32 SIMD::cvtepu8_epi32
+#define _simd_cvtepu16_epi32 SIMD::cvtepu16_epi32
+#define _simd_cvtepu16_epi64 SIMD::cvtepu16_epi64
+#define _simd_cvtepu32_epi64 SIMD::cvtepu32_epi64
 
-#define _simd_cvtepu8_epi16                 SIMD::cvtepu8_epi16
-#define _simd_cvtepu8_epi32                 SIMD::cvtepu8_epi32
-#define _simd_cvtepu16_epi32                SIMD::cvtepu16_epi32
-#define _simd_cvtepu16_epi64                SIMD::cvtepu16_epi64
-#define _simd_cvtepu32_epi64                SIMD::cvtepu32_epi64
+#define _simd_packus_epi16 SIMD::packus_epi16
+#define _simd_packs_epi16 SIMD::packs_epi16
+#define _simd_packus_epi32 SIMD::packus_epi32
+#define _simd_packs_epi32 SIMD::packs_epi32
 
-#define _simd_packus_epi16                  SIMD::packus_epi16
-#define _simd_packs_epi16                   SIMD::packs_epi16
-#define _simd_packus_epi32                  SIMD::packus_epi32
-#define _simd_packs_epi32                   SIMD::packs_epi32
+#define _simd_unpacklo_ps SIMD::unpacklo_ps
+#define _simd_unpackhi_ps SIMD::unpackhi_ps
+#define _simd_unpacklo_pd SIMD::unpacklo_pd
+#define _simd_unpackhi_pd SIMD::unpackhi_pd
+#define _simd_insertf128_ps SIMD::insertf128_ps
+#define _simd_insertf128_pd SIMD::insertf128_pd
+#define _simd_insertf128_si(a, b, i) SIMD::insertf128_si<i>(a, b)
+#define _simd_extractf128_ps(a, i) SIMD::extractf128_ps<i>(a)
+#define _simd_extractf128_pd(a, i) SIMD::extractf128_pd<i>(a)
+#define _simd_extractf128_si(a, i) SIMD::extractf128_si<i>(a)
+#define _simd_permute2f128_ps(a, b, i) SIMD::permute2f128_ps<i>(a, b)
+#define _simd_permute2f128_pd(a, b, i) SIMD::permute2f128_pd<i>(a, b)
+#define _simd_permute2f128_si(a, b, i) SIMD::permute2f128_si<i>(a, b)
+#define _simd_shuffle_ps(a, b, i) SIMD::shuffle_ps<i>(a, b)
+#define _simd_shuffle_pd(a, b, i) SIMD::shuffle_pd<i>(a, b)
+#define _simd_shuffle_epi32(a, b, imm8) SIMD::shuffle_epi32<imm8>(a, b)
+#define _simd_shuffle_epi64(a, b, imm8) SIMD::shuffle_epi64<imm8>(a, b)
+#define _simd_set1_epi32 SIMD::set1_epi32
+#define _simd_set_epi32 SIMD::set_epi32
+#define _simd_set_ps SIMD::set_ps
+#define _simd_set1_epi8 SIMD::set1_epi8
+#define _simd_setzero_si SIMD::setzero_si
+#define _simd_cvttps_epi32 SIMD::cvttps_epi32
+#define _simd_store_si SIMD::store_si
+#define _simd_broadcast_ss SIMD::broadcast_ss
+#define _simd_maskstore_ps SIMD::maskstore_ps
+#define _simd_load_si SIMD::load_si
+#define _simd_loadu_si SIMD::loadu_si
+#define _simd_sub_ps SIMD::sub_ps
+#define _simd_testz_ps SIMD::testz_ps
+#define _simd_testz_si SIMD::testz_si
+#define _simd_xor_ps SIMD::xor_ps
 
-#define _simd_unpacklo_ps                   SIMD::unpacklo_ps
-#define _simd_unpackhi_ps                   SIMD::unpackhi_ps
-#define _simd_unpacklo_pd                   SIMD::unpacklo_pd
-#define _simd_unpackhi_pd                   SIMD::unpackhi_pd
-#define _simd_insertf128_ps                 SIMD::insertf128_ps
-#define _simd_insertf128_pd                 SIMD::insertf128_pd
-#define _simd_insertf128_si(a, b, i)        SIMD::insertf128_si<i>(a, b)
-#define _simd_extractf128_ps(a, i)          SIMD::extractf128_ps<i>(a)
-#define _simd_extractf128_pd(a, i)          SIMD::extractf128_pd<i>(a)
-#define _simd_extractf128_si(a, i)          SIMD::extractf128_si<i>(a)
-#define _simd_permute2f128_ps(a, b, i)      SIMD::permute2f128_ps<i>(a, b)
-#define _simd_permute2f128_pd(a, b, i)      SIMD::permute2f128_pd<i>(a, b)
-#define _simd_permute2f128_si(a, b, i)      SIMD::permute2f128_si<i>(a, b)
-#define _simd_shuffle_ps(a, b, i)           SIMD::shuffle_ps<i>(a, b)
-#define _simd_shuffle_pd(a, b, i)           SIMD::shuffle_pd<i>(a, b)
-#define _simd_shuffle_epi32(a, b, imm8)     SIMD::shuffle_epi32<imm8>(a, b)
-#define _simd_shuffle_epi64(a, b, imm8)     SIMD::shuffle_epi64<imm8>(a, b)
-#define _simd_set1_epi32                    SIMD::set1_epi32
-#define _simd_set_epi32                     SIMD::set_epi32
-#define _simd_set_ps                        SIMD::set_ps
-#define _simd_set1_epi8                     SIMD::set1_epi8
-#define _simd_setzero_si                    SIMD::setzero_si
-#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
-#define _simd_store_si                      SIMD::store_si
-#define _simd_broadcast_ss                  SIMD::broadcast_ss
-#define _simd_maskstore_ps                  SIMD::maskstore_ps
-#define _simd_load_si                       SIMD::load_si
-#define _simd_loadu_si                      SIMD::loadu_si
-#define _simd_sub_ps                        SIMD::sub_ps
-#define _simd_testz_ps                      SIMD::testz_ps
-#define _simd_testz_si                      SIMD::testz_si
-#define _simd_xor_ps                        SIMD::xor_ps
+#define _simd_loadu2_si SIMD::loadu2_si
+#define _simd_storeu2_si SIMD::storeu2_si
 
-#define _simd_loadu2_si                     SIMD::loadu2_si
-#define _simd_storeu2_si                    SIMD::storeu2_si
+#define _simd_blendv_epi32 SIMD::blendv_epi32
+#define _simd_vmask_ps SIMD::vmask_ps
 
-#define _simd_blendv_epi32                  SIMD::blendv_epi32
-#define _simd_vmask_ps                      SIMD::vmask_ps
-
-template<int mask> SIMDINLINE
-SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const &a, SIMD128::Integer const &b)
+template <int mask>
+SIMDINLINE SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const& a, SIMD128::Integer const& b)
 {
-    return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
+    return SIMD128::castps_si(
+        SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
 }
 
 SIMDINLINE
-void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
+void _simd_mov(simdscalar& r, unsigned int rlane, simdscalar& s, unsigned int slane)
 {
     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
     SIMD256::store_ps(rArray, r);
     SIMD256::store_ps(sArray, s);
     rArray[rlane] = sArray[slane];
-    r = SIMD256::load_ps(rArray);
+    r             = SIMD256::load_ps(rArray);
 }
 
 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
@@ -228,34 +229,42 @@
 
 #endif
 
-#define _simdvec_dp3_ps                 SIMD::vec4_dp3_ps
-#define _simdvec_dp4_ps                 SIMD::vec4_dp4_ps
-#define _simdvec_rcp_length_ps          SIMD::vec4_rcp_length_ps
-#define _simdvec_normalize_ps           SIMD::vec4_normalize_ps
-#define _simdvec_mul_ps                 SIMD::vec4_mul_ps
-#define _simdvec_add_ps                 SIMD::vec4_add_ps
-#define _simdvec_min_ps                 SIMD::vec4_min_ps
-#define _simdvec_max_ps                 SIMD::vec4_max_ps
-#define _simd_mat4x4_vec4_multiply      SIMD::mat4x4_vec4_multiply
-#define _simd_mat3x3_vec3_w0_multiply   SIMD::mat3x3_vec3_w0_multiply
-#define _simd_mat4x4_vec3_w1_multiply   SIMD::mat4x4_vec3_w1_multiply
-#define _simd_mat4x3_vec3_w1_multiply   SIMD::mat4x3_vec3_w1_multiply
+#define _simdvec_dp3_ps SIMD::vec4_dp3_ps
+#define _simdvec_dp4_ps SIMD::vec4_dp4_ps
+#define _simdvec_rcp_length_ps SIMD::vec4_rcp_length_ps
+#define _simdvec_normalize_ps SIMD::vec4_normalize_ps
+#define _simdvec_mul_ps SIMD::vec4_mul_ps
+#define _simdvec_add_ps SIMD::vec4_add_ps
+#define _simdvec_min_ps SIMD::vec4_min_ps
+#define _simdvec_max_ps SIMD::vec4_max_ps
+#define _simd_mat4x4_vec4_multiply SIMD::mat4x4_vec4_multiply
+#define _simd_mat3x3_vec3_w0_multiply SIMD::mat3x3_vec3_w0_multiply
+#define _simd_mat4x4_vec3_w1_multiply SIMD::mat4x4_vec3_w1_multiply
+#define _simd_mat4x3_vec3_w1_multiply SIMD::mat4x3_vec3_w1_multiply
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simdscalar vplaneps(simdscalar const &vA, simdscalar const &vB, simdscalar const &vC, simdscalar const &vX, simdscalar const &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar const& vA,
+                               simdscalar const& vB,
+                               simdscalar const& vC,
+                               simdscalar const& vX,
+                               simdscalar const& vY)
 {
     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
-    vOut = _simd_fmadd_ps(vB, vY, vOut);
+    vOut            = _simd_fmadd_ps(vB, vY, vOut);
     return vOut;
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, simd4scalar const &vC, simd4scalar const &vX, simd4scalar const &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar const& vA,
+                                simd4scalar const& vB,
+                                simd4scalar const& vC,
+                                simd4scalar const& vX,
+                                simd4scalar const& vY)
 {
     simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
-    vOut = _simd128_fmadd_ps(vB, vY, vOut);
+    vOut             = _simd128_fmadd_ps(vB, vY, vOut);
     return vOut;
 }
 
@@ -264,30 +273,32 @@
 /// @param vI - barycentric I
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponent(simdscalar const &vI, simdscalar const &vJ, const float *pInterpBuffer)
+template <UINT Attrib, UINT Comp, UINT numComponents = 4>
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar const& vI,
+                                                  simdscalar const& vJ,
+                                                  const float*      pInterpBuffer)
 {
-    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
-    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
-    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
+    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
+    const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
+    const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
 
     simdscalar vA = _simd_broadcast_ss(pInterpA);
     simdscalar vB = _simd_broadcast_ss(pInterpB);
     simdscalar vC = _simd_broadcast_ss(pInterpC);
 
     simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
-    vC = _simd_mul_ps(vk, vC);
-    
+    vC            = _simd_mul_ps(vk, vC);
+
     return vplaneps(vA, vB, vC, vI, vJ);
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Interpolates a single component (flat shade).
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
+template <UINT Attrib, UINT Comp, UINT numComponents = 4>
+static SIMDINLINE simdscalar InterpolateComponentFlat(const float* pInterpBuffer)
 {
-    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
+    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
 
     simdscalar vA = _simd_broadcast_ss(pInterpA);
 
@@ -299,38 +310,39 @@
 /// @param vI - barycentric I
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
-template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const &vI, simd4scalar const &vJ, const float *pInterpBuffer)
+template <UINT Attrib, UINT Comp, UINT numComponents = 4>
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const& vI,
+                                                   simd4scalar const& vJ,
+                                                   const float*       pInterpBuffer)
 {
-    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
-    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
-    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
+    const float* pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
+    const float* pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
+    const float* pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
 
     simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
     simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
     simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
 
     simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
-    vC = SIMD128::mul_ps(vk, vC);
+    vC             = SIMD128::mul_ps(vk, vC);
 
     return vplaneps(vA, vB, vC, vI, vJ);
 }
 
-static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const &a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const& a)
 {
     simd4scalari ai = SIMD128::castps_si(a);
     return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
 }
 
-static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const &a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
 {
     simdscalari ai = _simd_castps_si(a);
     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
 }
 
-
 #if ENABLE_AVX512_SIMD16
 #include "simd16intrin.h"
-#endif//ENABLE_AVX512_SIMD16
+#endif // ENABLE_AVX512_SIMD16
 
-#endif//__SWR_SIMDINTRIN_H__
+#endif // __SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
index 4114645..bd48fb2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #pragma once
 
 #include "simdlib_types.hpp"
@@ -38,8 +38,7 @@
 #include "simdlib_128_avx.inl"
 #undef __SIMD_LIB_AVX_HPP__
         }; // struct AVXImpl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
-
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX2
         struct AVX2Impl : AVXImpl
@@ -48,7 +47,7 @@
 #include "simdlib_128_avx2.inl"
 #undef __SIMD_LIB_AVX2_HPP__
         }; // struct AVX2Impl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
         struct AVX512Impl : AVX2Impl
@@ -62,9 +61,9 @@
 #include "simdlib_128_avx512_core.inl"
 #endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
-#endif // SIMD_OPT_128_AVX512
+#endif     // SIMD_OPT_128_AVX512
         }; // struct AVX2Impl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 
         struct Traits : SIMDImpl::Traits
         {
@@ -78,13 +77,13 @@
 #error Invalid value for SIMD_ARCH
 #endif
 
-            using Float     = SIMD128Impl::Float;
-            using Double    = SIMD128Impl::Double;
-            using Integer   = SIMD128Impl::Integer;
-            using Vec4      = SIMD128Impl::Vec4;
-            using Mask      = SIMD128Impl::Mask;
+            using Float   = SIMD128Impl::Float;
+            using Double  = SIMD128Impl::Double;
+            using Integer = SIMD128Impl::Integer;
+            using Vec4    = SIMD128Impl::Vec4;
+            using Mask    = SIMD128Impl::Mask;
         };
-    } // ns SIMD128Impl
+    } // namespace SIMD128Impl
 
     namespace SIMD256Impl
     {
@@ -95,8 +94,7 @@
 #include "simdlib_256_avx.inl"
 #undef __SIMD_LIB_AVX_HPP__
         }; // struct AVXImpl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
-
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX2
         struct AVX2Impl : AVXImpl
@@ -105,7 +103,7 @@
 #include "simdlib_256_avx2.inl"
 #undef __SIMD_LIB_AVX2_HPP__
         }; // struct AVX2Impl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX2
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
         struct AVX512Impl : AVX2Impl
@@ -119,9 +117,9 @@
 #include "simdlib_256_avx512_core.inl"
 #endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
-#endif // SIMD_OPT_256_AVX512
+#endif     // SIMD_OPT_256_AVX512
         }; // struct AVX2Impl
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 
         struct Traits : SIMDImpl::Traits
         {
@@ -135,18 +133,18 @@
 #error Invalid value for SIMD_ARCH
 #endif
 
-            using Float     = SIMD256Impl::Float;
-            using Double    = SIMD256Impl::Double;
-            using Integer   = SIMD256Impl::Integer;
-            using Vec4      = SIMD256Impl::Vec4;
-            using Mask      = SIMD256Impl::Mask;
+            using Float   = SIMD256Impl::Float;
+            using Double  = SIMD256Impl::Double;
+            using Integer = SIMD256Impl::Integer;
+            using Vec4    = SIMD256Impl::Vec4;
+            using Mask    = SIMD256Impl::Mask;
         };
-    } // ns SIMD256Impl
+    } // namespace SIMD256Impl
 
     namespace SIMD512Impl
     {
 #if SIMD_ARCH >= SIMD_ARCH_AVX
-        template<typename SIMD256T>
+        template <typename SIMD256T>
         struct AVXImplBase
         {
 #define __SIMD_LIB_AVX_HPP__
@@ -157,12 +155,10 @@
         using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
 
-
 #if SIMD_ARCH >= SIMD_ARCH_AVX2
         using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
 #endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
 
-
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
         struct AVX512Impl : AVXImplBase<SIMD256Impl::AVX512Impl>
         {
@@ -178,7 +174,7 @@
 #endif // defined(SIMD_ARCH_KNIGHTS)
 #undef __SIMD_LIB_AVX512_HPP__
         }; // struct AVX512ImplBase
-#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+#endif     // #if SIMD_ARCH >= SIMD_ARCH_AVX512
 
         struct Traits : SIMDImpl::Traits
         {
@@ -192,33 +188,32 @@
 #error Invalid value for SIMD_ARCH
 #endif
 
-            using Float     = SIMD512Impl::Float;
-            using Double    = SIMD512Impl::Double;
-            using Integer   = SIMD512Impl::Integer;
-            using Vec4      = SIMD512Impl::Vec4;
-            using Mask      = SIMD512Impl::Mask;
+            using Float   = SIMD512Impl::Float;
+            using Double  = SIMD512Impl::Double;
+            using Integer = SIMD512Impl::Integer;
+            using Vec4    = SIMD512Impl::Vec4;
+            using Mask    = SIMD512Impl::Mask;
         };
-    } // ns SIMD512Impl
-} // ns SIMDImpl
+    } // namespace SIMD512Impl
+} // namespace SIMDImpl
 
 template <typename Traits>
 struct SIMDBase : Traits::IsaImpl
 {
-    using CompareType   = typename Traits::CompareType;
-    using ScaleFactor   = typename Traits::ScaleFactor;
-    using RoundMode     = typename Traits::RoundMode;
-    using SIMD          = typename Traits::IsaImpl;
-    using Float         = typename Traits::Float;
-    using Double        = typename Traits::Double;
-    using Integer       = typename Traits::Integer;
-    using Vec4          = typename Traits::Vec4;
-    using Mask          = typename Traits::Mask;
+    using CompareType = typename Traits::CompareType;
+    using ScaleFactor = typename Traits::ScaleFactor;
+    using RoundMode   = typename Traits::RoundMode;
+    using SIMD        = typename Traits::IsaImpl;
+    using Float       = typename Traits::Float;
+    using Double      = typename Traits::Double;
+    using Integer     = typename Traits::Integer;
+    using Vec4        = typename Traits::Vec4;
+    using Mask        = typename Traits::Mask;
 
     static const size_t VECTOR_BYTES = sizeof(Float);
 
     // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-    static SIMDINLINE
-    void vec4_load1_ps(Vec4& r, const float *p)
+    static SIMDINLINE void vec4_load1_ps(Vec4& r, const float* p)
     {
         r[0] = SIMD::set1_ps(p[0]);
         r[1] = SIMD::set1_ps(p[1]);
@@ -226,8 +221,7 @@
         r[3] = SIMD::set1_ps(p[3]);
     }
 
-    static SIMDINLINE
-    void vec4_set1_vps(Vec4& r, Float const &s)
+    static SIMDINLINE void vec4_set1_vps(Vec4& r, Float const& s)
     {
         r[0] = s;
         r[1] = s;
@@ -235,48 +229,44 @@
         r[3] = s;
     }
 
-    static SIMDINLINE
-    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
+    static SIMDINLINE Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
     {
         Float tmp, r;
-        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+        r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
 
-        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
-        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+        tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y)
 
-        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
-        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+        tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 
         return r;
     }
 
-    static SIMDINLINE
-    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
+    static SIMDINLINE Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
     {
         Float tmp, r;
-        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+        r = SIMD::mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
 
-        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
-        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+        tmp = SIMD::mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y)
 
-        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
-        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+        tmp = SIMD::mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 
-        tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
-        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+        tmp = SIMD::mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
+        r   = SIMD::add_ps(r, tmp);       // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
 
         return r;
     }
 
-    static SIMDINLINE
-    Float vec4_rcp_length_ps(const Vec4& v)
+    static SIMDINLINE Float vec4_rcp_length_ps(const Vec4& v)
     {
         Float length = vec4_dp4_ps(v, v);
         return SIMD::rsqrt_ps(length);
     }
 
-    static SIMDINLINE
-    void vec4_normalize_ps(Vec4& r, const Vec4& v)
+    static SIMDINLINE void vec4_normalize_ps(Vec4& r, const Vec4& v)
     {
         Float rcpLength = vec4_rcp_length_ps(v);
 
@@ -286,8 +276,7 @@
         r[3] = SIMD::mul_ps(v[3], rcpLength);
     }
 
-    static SIMDINLINE
-    void vec4_mul_ps(Vec4& r, const Vec4& v, Float const &s)
+    static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v, Float const& s)
     {
         r[0] = SIMD::mul_ps(v[0], s);
         r[1] = SIMD::mul_ps(v[1], s);
@@ -295,8 +284,7 @@
         r[3] = SIMD::mul_ps(v[3], s);
     }
 
-    static SIMDINLINE
-    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    static SIMDINLINE void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
     {
         r[0] = SIMD::mul_ps(v0[0], v1[0]);
         r[1] = SIMD::mul_ps(v0[1], v1[1]);
@@ -304,8 +292,7 @@
         r[3] = SIMD::mul_ps(v0[3], v1[3]);
     }
 
-    static SIMDINLINE
-    void vec4_add_ps(Vec4& r, const Vec4& v0, Float const &s)
+    static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, Float const& s)
     {
         r[0] = SIMD::add_ps(v0[0], s);
         r[1] = SIMD::add_ps(v0[1], s);
@@ -313,8 +300,7 @@
         r[3] = SIMD::add_ps(v0[3], s);
     }
 
-    static SIMDINLINE
-    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    static SIMDINLINE void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
     {
         r[0] = SIMD::add_ps(v0[0], v1[0]);
         r[1] = SIMD::add_ps(v0[1], v1[1]);
@@ -322,8 +308,7 @@
         r[3] = SIMD::add_ps(v0[3], v1[3]);
     }
 
-    static SIMDINLINE
-    void vec4_min_ps(Vec4& r, const Vec4& v0, Float const &s)
+    static SIMDINLINE void vec4_min_ps(Vec4& r, const Vec4& v0, Float const& s)
     {
         r[0] = SIMD::min_ps(v0[0], s);
         r[1] = SIMD::min_ps(v0[1], s);
@@ -331,8 +316,7 @@
         r[3] = SIMD::min_ps(v0[3], s);
     }
 
-    static SIMDINLINE
-    void vec4_max_ps(Vec4& r, const Vec4& v0, Float const &s)
+    static SIMDINLINE void vec4_max_ps(Vec4& r, const Vec4& v0, Float const& s)
     {
         r[0] = SIMD::max_ps(v0[0], s);
         r[1] = SIMD::max_ps(v0[1], s);
@@ -345,66 +329,64 @@
     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
-    static SIMDINLINE
-    void SIMDCALL mat4x4_vec4_multiply(
-        Vec4& result,
-        const float *pMatrix,
-        const Vec4& v)
+    static SIMDINLINE void SIMDCALL mat4x4_vec4_multiply(Vec4&        result,
+                                                         const float* pMatrix,
+                                                         const Vec4&  v)
     {
         Float m;
         Float r0;
         Float r1;
 
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
-        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
+        r1        = SIMD::mul_ps(m, v[3]);               // (m3 * v.w)
+        r0        = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
         result[0] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
-        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
+        r1        = SIMD::mul_ps(m, v[3]);               // (m3 * v.w)
+        r0        = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
         result[1] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
-        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
+        r1        = SIMD::mul_ps(m, v[3]);               // (m3 * v.w)
+        r0        = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
         result[2] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
-        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
+        r1        = SIMD::mul_ps(m, v[3]);               // (m3 * v.w)
+        r0        = SIMD::add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
         result[3] = r0;
     }
 
@@ -413,44 +395,42 @@
     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
-    static SIMDINLINE
-    void SIMDCALL mat3x3_vec3_w0_multiply(
-        Vec4& result,
-        const float *pMatrix,
-        const Vec4& v)
+    static SIMDINLINE void SIMDCALL mat3x3_vec3_w0_multiply(Vec4&        result,
+                                                            const float* pMatrix,
+                                                            const Vec4&  v)
     {
         Float m;
         Float r0;
         Float r1;
 
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
         result[0] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
         result[1] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
         result[2] = r0;
 
         result[3] = SIMD::setzero_ps();
@@ -461,108 +441,104 @@
     //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
     //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
     //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
-    static SIMDINLINE
-    void SIMDCALL mat4x4_vec3_w1_multiply(
-        Vec4& result,
-        const float *pMatrix,
-        const Vec4& v)
+    static SIMDINLINE void SIMDCALL mat4x4_vec3_w1_multiply(Vec4&        result,
+                                                            const float* pMatrix,
+                                                            const Vec4&  v)
     {
         Float m;
         Float r0;
         Float r1;
 
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[0] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[1] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[2] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
-        result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 3 * 4 + 3); // m[row][3]
+        result[3] = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
     }
 
-    static SIMDINLINE
-    void SIMDCALL mat4x3_vec3_w1_multiply(
-        Vec4& result,
-        const float *pMatrix,
-        const Vec4& v)
+    static SIMDINLINE void SIMDCALL mat4x3_vec3_w1_multiply(Vec4&        result,
+                                                            const float* pMatrix,
+                                                            const Vec4&  v)
     {
         Float m;
         Float r0;
         Float r1;
 
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 0 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[0] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 1 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[1] = r0;
 
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
-        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
-        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
-        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
-        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
-        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 0); // m[row][0]
+        r0        = SIMD::mul_ps(m, v[0]);               // (m00 * v.x)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 1); // m[row][1]
+        r1        = SIMD::mul_ps(m, v[1]);               // (m1 * v.y)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 2); // m[row][2]
+        r1        = SIMD::mul_ps(m, v[2]);               // (m2 * v.z)
+        r0        = SIMD::add_ps(r0, r1);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m         = SIMD::load1_ps(pMatrix + 2 * 4 + 3); // m[row][3]
+        r0        = SIMD::add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
         result[2] = r0;
         result[3] = SIMD::set1_ps(1.0f);
     }
@@ -572,11 +548,85 @@
 using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
 using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
 
-template <typename SIMD_T> using CompareType    = typename SIMD_T::CompareType;
-template <typename SIMD_T> using ScaleFactor    = typename SIMD_T::ScaleFactor;
-template <typename SIMD_T> using RoundMode      = typename SIMD_T::RoundMode;
-template <typename SIMD_T> using Float          = typename SIMD_T::Float;
-template <typename SIMD_T> using Double         = typename SIMD_T::Double;
-template <typename SIMD_T> using Integer        = typename SIMD_T::Integer;
-template <typename SIMD_T> using Vec4           = typename SIMD_T::Vec4;
-template <typename SIMD_T> using Mask           = typename SIMD_T::Mask;
+template <typename SIMD_T>
+using CompareType = typename SIMD_T::CompareType;
+template <typename SIMD_T>
+using ScaleFactor = typename SIMD_T::ScaleFactor;
+template <typename SIMD_T>
+using RoundMode = typename SIMD_T::RoundMode;
+template <typename SIMD_T>
+using Float = typename SIMD_T::Float;
+template <typename SIMD_T>
+using Double = typename SIMD_T::Double;
+template <typename SIMD_T>
+using Integer = typename SIMD_T::Integer;
+template <typename SIMD_T>
+using Vec4 = typename SIMD_T::Vec4;
+template <typename SIMD_T>
+using Mask = typename SIMD_T::Mask;
+
+template <typename SIMD_T>
+struct SIMDVecEqual
+{
+    INLINE bool operator()(Integer<SIMD_T> a, Integer<SIMD_T> b) const
+    {
+        Integer<SIMD_T> c = SIMD_T::xor_si(a, b);
+        return SIMD_T::testz_si(c, c);
+    }
+
+    INLINE bool operator()(Float<SIMD_T> a, Float<SIMD_T> b) const
+    {
+        return this->operator()(SIMD_T::castps_si(a), SIMD_T::castps_si(b));
+    }
+
+    INLINE bool operator()(Double<SIMD_T> a, Double<SIMD_T> b) const
+    {
+        return this->operator()(SIMD_T::castpd_si(a), SIMD_T::castpd_si(b));
+    }
+};
+
+template <typename SIMD_T>
+struct SIMDVecHash
+{
+    INLINE uint32_t operator()(Integer<SIMD_T> val) const
+    {
+#if defined(_WIN64) || !defined(_WIN32) // assume non-Windows is always 64-bit
+        static_assert(sizeof(void*) == 8, "This path only meant for 64-bit code");
+
+        uint64_t              crc32          = 0;
+        const uint64_t*       pData          = reinterpret_cast<const uint64_t*>(&val);
+        static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
+        static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size");
+
+        for (uint32_t i = 0; i < loopIterations; ++i)
+        {
+            crc32 = _mm_crc32_u64(crc32, pData[i]);
+        }
+
+        return static_cast<uint32_t>(crc32);
+#else
+        static_assert(sizeof(void*) == 4, "This path only meant for 32-bit code");
+
+        uint32_t crc32 = 0;
+        const uint32_t* pData = reinterpret_cast<const uint32_t*>(&val);
+        static const uint32_t loopIterations = sizeof(val) / sizeof(void*);
+        static_assert(loopIterations * sizeof(void*) == sizeof(val), "bad vector size");
+
+        for (uint32_t i = 0; i < loopIterations; ++i)
+        {
+            crc32 = _mm_crc32_u32(crc32, pData[i]);
+        }
+
+        return crc32;
+#endif
+    };
+
+    INLINE uint32_t operator()(Float<SIMD_T> val) const
+    {
+        return operator()(SIMD_T::castps_si(val));
+    };
+    INLINE uint32_t operator()(Double<SIMD_T> val) const
+    {
+        return operator()(SIMD_T::castpd_si(val));
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
index b1511c6..0c5795c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -28,100 +28,79 @@
 // SIMD128 AVX (1) implementation
 //============================================================================
 
-#define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return _mm_##op(a);\
+#define SIMD_WRAPPER_1(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a) { return _mm_##op(a); }
+
+#define SIMD_WRAPPER_2(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm_##op(a, b); }
+
+#define SIMD_DWRAPPER_2(op) \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm_##op(a, b); }
+
+#define SIMD_WRAPPER_2I(op)                               \
+    template <int ImmT>                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    {                                                     \
+        return _mm_##op(a, b, ImmT);                      \
     }
 
-#define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm_##op(a, b);\
+#define SIMD_DWRAPPER_2I(op)                                 \
+    template <int ImmT>                                      \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    {                                                        \
+        return _mm_##op(a, b, ImmT);                         \
     }
 
-#define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm_##op(a, b);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm_##op(a, b, ImmT);\
-    }
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm_##op(a); }
 
-#define SIMD_DWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm_##op(a, b, ImmT);\
-    }
-
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm_##op(a, b, c);\
-    }
-
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return _mm_##op(a);\
-    }
-
-#define SIMD_IWRAPPER_1I_(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return intrin(a, ImmT);\
+#define SIMD_IWRAPPER_1I_(op, intrin)                \
+    template <int ImmT>                              \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    {                                                \
+        return intrin(a, ImmT);                      \
     }
 #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
 
-#define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return intrin(a, b);\
+#define SIMD_IWRAPPER_2_(op, intrin) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return intrin(a, b); }
+
+#define SIMD_IWRAPPER_2(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm_##op(a, b); }
+
+#define SIMD_IFWRAPPER_2(op, intrin)                            \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    {                                                           \
+        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));   \
     }
 
-#define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm_##op(a, b);\
-    }
-
-#define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
-    }
-
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm_##op(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I(op)                                    \
+    template <int ImmT>                                         \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    {                                                           \
+        return _mm_##op(a, b, ImmT);                            \
     }
 
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(add_ps);   // return a + b
+SIMD_WRAPPER_2(div_ps);   // return a / b
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
+SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);   // return a - b
 
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)    // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
 {
     return add_ps(mul_ps(a, b), c);
 }
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)    // return (a * b) - c
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) // return (a * b) - c
 {
     return sub_ps(mul_ps(a, b), c);
 }
@@ -132,8 +111,14 @@
     return _mm_round_ps(a, static_cast<int>(RMT));
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
@@ -141,7 +126,7 @@
 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -160,41 +145,40 @@
 //-----------------------------------------------------------------------
 // Logical operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(and_ps);                             // return a & b       (float treated as int)
-SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
-SIMD_WRAPPER_2(andnot_ps);                          // return (~a) & b    (float treated as int)
-SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
-SIMD_WRAPPER_2(or_ps);                              // return a | b       (float treated as int)
-SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
-SIMD_WRAPPER_2(xor_ps);                             // return a ^ b       (float treated as int)
-SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)
-
+SIMD_WRAPPER_2(and_ps);                        // return a & b       (float treated as int)
+SIMD_IWRAPPER_2_(and_si, _mm_and_si128);       // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);                     // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);                         // return a | b       (float treated as int)
+SIMD_IWRAPPER_2_(or_si, _mm_or_si128);         // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);                        // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);       // return a ^ b       (int)
 
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
-SIMD_IWRAPPER_1I(slli_epi64);               // return a << ImmT
+SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_1I(slli_epi64); // return a << ImmT
 
 static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
 {
     int32_t a, count;
-    a = _mm_extract_epi32(vA, 0);
+    a     = _mm_extract_epi32(vA, 0);
     count = _mm_extract_epi32(vB, 0);
     a <<= count;
     vA = _mm_insert_epi32(vA, a, 0);
 
-    a = _mm_extract_epi32(vA, 1);
+    a     = _mm_extract_epi32(vA, 1);
     count = _mm_extract_epi32(vB, 1);
     a <<= count;
     vA = _mm_insert_epi32(vA, a, 1);
 
-    a = _mm_extract_epi32(vA, 2);
+    a     = _mm_extract_epi32(vA, 2);
     count = _mm_extract_epi32(vB, 2);
     a <<= count;
     vA = _mm_insert_epi32(vA, a, 2);
 
-    a = _mm_extract_epi32(vA, 3);
+    a     = _mm_extract_epi32(vA, 3);
     count = _mm_extract_epi32(vB, 3);
     a <<= count;
     vA = _mm_insert_epi32(vA, a, 3);
@@ -211,7 +195,7 @@
     return _mm_srl_epi64(a, n);
 }
 
-template<int ImmT>                              // same as srli_si, but with Float cast to int
+template <int ImmT> // same as srli_si, but with Float cast to int
 static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
 {
     return castsi_ps(srli_si<ImmT>(castps_si(a)));
@@ -220,22 +204,22 @@
 static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
 {
     int32_t a, count;
-    a = _mm_extract_epi32(vA, 0);
+    a     = _mm_extract_epi32(vA, 0);
     count = _mm_extract_epi32(vB, 0);
     a >>= count;
     vA = _mm_insert_epi32(vA, a, 0);
 
-    a = _mm_extract_epi32(vA, 1);
+    a     = _mm_extract_epi32(vA, 1);
     count = _mm_extract_epi32(vB, 1);
     a >>= count;
     vA = _mm_insert_epi32(vA, a, 1);
 
-    a = _mm_extract_epi32(vA, 2);
+    a     = _mm_extract_epi32(vA, 2);
     count = _mm_extract_epi32(vB, 2);
     a >>= count;
     vA = _mm_insert_epi32(vA, a, 2);
 
-    a = _mm_extract_epi32(vA, 3);
+    a     = _mm_extract_epi32(vA, 3);
     count = _mm_extract_epi32(vB, 3);
     a >>= count;
     vA = _mm_insert_epi32(vA, a, 3);
@@ -243,32 +227,30 @@
     return vA;
 }
 
-
-
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
 {
     return _mm_castpd_ps(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
 {
     return _mm_castps_si128(a);
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
 {
     return _mm_castsi128_pd(a);
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
 {
     return _mm_castps_pd(a);
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
 {
     return _mm_castsi128_ps(a);
 }
@@ -288,18 +270,19 @@
     return _mm_cvtsi32_si128(n);
 }
 
-SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
-SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+SIMD_IWRAPPER_1(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32)
 {
     return _mm_cvtps_epi32(a);
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
 {
     return _mm_cvttps_epi32(a);
 }
@@ -307,77 +290,104 @@
 //-----------------------------------------------------------------------
 // Comparison operations
 //-----------------------------------------------------------------------
-template<CompareType CmpTypeT>
+template <CompareType CmpTypeT>
 static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
 {
     return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
 }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
-
-SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
-SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
 {
-    return  0 != _mm_testz_ps(a, b);
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
 }
 
-static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a,
+                                         Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
-    return  0 != _mm_testz_si128(a, b);
+    return 0 != _mm_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a,
+                                         Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return 0 != _mm_testz_si128(a, b);
 }
 
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
-SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+SIMD_WRAPPER_2I(blend_ps); // return ImmT ? b : a  (float)
+SIMD_WRAPPER_3(blendv_ps); // return mask ? b : a  (float)
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
+                                                Integer b,
+                                                Float   mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
+                                                Integer b,
+                                                Integer mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 }
 
-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 {
     return _mm_broadcast_ss(p);
 }
 
-SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Integer SIMDCALL
+                          permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int)
 {
     return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 {
     return _mm_permutevar_ps(a, swiz);
 }
 
 SIMD_IWRAPPER_1I(shuffle_epi32);
 
-template<int ImmT>
+template <int ImmT>
 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
 
 SIMD_IWRAPPER_2(shuffle_epi8);
@@ -385,7 +395,7 @@
 SIMD_WRAPPER_2I(shuffle_ps);
 SIMD_IWRAPPER_2(unpackhi_epi16);
 
-//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
 static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
 {
     return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
@@ -405,68 +415,74 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult;
-    float* pResult = (float*)&vResult;
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult;
+    float*    pResult = (float*)&vResult;
     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
     {
         uint32_t offset = pOffsets[i];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
     }
 
     return vResult;
 }
 
-static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+static SIMDINLINE Float SIMDCALL
+                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 {
     return broadcast_ss(p);
 }
 
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
     return _mm_load_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
     return _mm_load_si128(&p->v);
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
     return _mm_loadu_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
     return _mm_lddqu_si128(&p->v);
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 {
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult = old;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t umask = movemask_ps(mask);
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult  = old;
+    float*    pResult  = (float*)&vResult;
+    DWORD     index;
+    uint32_t  umask = movemask_ps(mask);
     while (_BitScanForward(&index, umask))
     {
         umask &= ~(1 << index);
         uint32_t offset = pOffsets[index];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
     }
 
     return vResult;
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
 {
     _mm_maskstore_ps(p, mask, src);
 }
@@ -495,37 +511,40 @@
     return _mm_set1_epi8(i);
 }
 
-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 {
     return _mm_set1_ps(f);
 }
 
-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 {
     return _mm_setzero_ps();
 }
 
-static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 {
     return _mm_setzero_si128();
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
 {
     _mm_store_ps(p, a);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
 {
     _mm_store_si128(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+static SIMDINLINE void SIMDCALL
+                       storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
 {
     _mm_storeu_si128(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+                       stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     _mm_stream_ps(p, a);
 }
@@ -549,11 +568,10 @@
 
 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 {
-    Integer vec = set1_epi32(mask);
-    const Integer bit = set_epi32(
-        0x08, 0x04, 0x02, 0x01);
-    vec = and_si(vec, bit);
-    vec = cmplt_epi32(setzero_si(), vec);
+    Integer       vec = set1_epi32(mask);
+    const Integer bit = set_epi32(0x08, 0x04, 0x02, 0x01);
+    vec               = and_si(vec, bit);
+    vec               = cmplt_epi32(setzero_si(), vec);
     return castsi_ps(vec);
 }
 
@@ -573,4 +591,3 @@
 #undef SIMD_IWRAPPER_2
 #undef SIMD_IWRAPPER_2_
 #undef SIMD_IWRAPPER_2I
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
index e8ee0b4..35f9175 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX2_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -32,14 +32,11 @@
 // Only 2 shifts and 2 gathers were introduced with AVX 2
 // Also, add native support for FMA operations
 //============================================================================
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm_##op(a, b, c);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm_##op(a, b, c); }
 
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
 
 static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
 {
@@ -51,18 +48,19 @@
     return _mm_srlv_epi32(vA, vB);
 }
 
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 {
     return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
 }
 
 #undef SIMD_WRAPPER_3
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
index b70a769..2ce3caa 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -34,120 +34,138 @@
 //============================================================================
 
 private:
-    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps128_ps512(r.v); }
-    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
-    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
-    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps128(r); }
-    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
-    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
+static SIMDINLINE __m512 __conv(Float r)
+{
+    return _mm512_castps128_ps512(r.v);
+}
+static SIMDINLINE __m512d __conv(Double r)
+{
+    return _mm512_castpd128_pd512(r.v);
+}
+static SIMDINLINE __m512i __conv(Integer r)
+{
+    return _mm512_castsi128_si512(r.v);
+}
+static SIMDINLINE Float __conv(__m512 r)
+{
+    return _mm512_castps512_ps128(r);
+}
+static SIMDINLINE Double __conv(__m512d r)
+{
+    return _mm512_castpd512_pd128(r);
+}
+static SIMDINLINE Integer __conv(__m512i r)
+{
+    return _mm512_castsi512_si128(r);
+}
+
 public:
-
-#define SIMD_WRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
+    static SIMDINLINE Float SIMDCALL op(Float a)                 \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
+    template <int ImmT>                                                \
+    static SIMDINLINE Float SIMDCALL op(Float a)                       \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_WRAPPER_2I(op)                                                \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                  \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_WRAPPER_3_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
+    {                                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
     }
-#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
 
-#define SIMD_DWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+#define SIMD_DWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
 
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
 
-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_IWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)            \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
     }
 
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_2(add_ps);                                // return a + b
+SIMD_WRAPPER_2(div_ps);                                // return a / b
+SIMD_WRAPPER_3(fmadd_ps);                              // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);                              // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);                                // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);                                // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);                                // return a * b
 SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xf));     // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf));   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xf)); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);                                // return a - b
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
-SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
 
 // SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 
 // return (a * b) & 0xFFFFFFFF
 //
 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
 // and store the low 32 bits of the intermediate integers in dst.
 SIMD_IWRAPPER_2_32(mullo_epi32);
-SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
 
 // SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
 // SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
@@ -155,23 +173,22 @@
 //-----------------------------------------------------------------------
 // Logical operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xf));    // return a & b       (int)
+SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xf));       // return a & b       (int)
 SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xf));     // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xf));    // return a ^ b       (int)
-
+SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xf));         // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xf));       // return a ^ b       (int)
 
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
-SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
-SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)
 
 // use AVX2 version
-//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+// SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
 
 //-----------------------------------------------------------------------
 // Conversion operations (Use AVX2 versions)
@@ -185,16 +202,16 @@
 //-----------------------------------------------------------------------
 // Comparison operations (Use AVX2 versions
 //-----------------------------------------------------------------------
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
 //
-//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
 //{
 //    return cmpgt_epi32(b, a);
 //}
@@ -202,24 +219,27 @@
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
-// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+// Disabled wrappers. For each pack operation, see the documentation for the corresponding
+// _mm256_* and _mm512_* intrinsics (packs_epi16, packs_epi32, packus_epi16, packus_epi32).
+// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8
+// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16
+// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8
+// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
-//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+// return a[swiz[i]] for each 32-bit lane i (float)
+// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
 //{
 //    return _mm256_permutevar8x32_ps(a, swiz);
 //}
 
 SIMD_IWRAPPER_1I_32(shuffle_epi32);
-//template<int ImmT>
-//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+// template<int ImmT>
+// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
 //{
 //    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 //}
-//SIMD_IWRAPPER_2(shuffle_epi8);
+// SIMD_IWRAPPER_2(shuffle_epi8);
 SIMD_IWRAPPER_2_32(unpackhi_epi32);
 SIMD_IWRAPPER_2_32(unpacklo_epi32);
 
@@ -233,50 +253,47 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
     return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
     return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
     return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
     return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
 }
 
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return __conv(_mm512_mask_i32gather_ps(
-                    _mm512_setzero_ps(),
-                    __mmask16(0xf),
-                    __conv(idx),
-                    p,
-                    static_cast<int>(ScaleT)));
+        _mm512_setzero_ps(), __mmask16(0xf), __conv(idx), p, static_cast<int>(ScaleT)));
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 {
     __mmask16 m = 0xf;
-    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-                                _mm512_set1_epi32(0x80000000));
-    return __conv(_mm512_mask_i32gather_ps(
-                    __conv(old),
-                    m,
-                    __conv(idx),
-                    p,
-                    static_cast<int>(ScaleT)));
+    m           = _mm512_mask_test_epi32_mask(
+        m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
+    return __conv(
+        _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
 }
 
 // static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
@@ -286,19 +303,20 @@
 //         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
 // }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
 {
     __mmask16 m = 0xf;
-    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    m           = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
     _mm512_mask_storeu_ps(p, m, __conv(src));
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
 {
     _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a));
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
 {
     _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a));
 }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
index a4ecd09..16e59c4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_core.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -33,114 +33,118 @@
 // register set.
 //============================================================================
 
-#define SIMD_WRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
+    static SIMDINLINE Float SIMDCALL op(Float a)                 \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
+    template <int ImmT>                                                \
+    static SIMDINLINE Float SIMDCALL op(Float a)                       \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_WRAPPER_2I(op)                                                \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                  \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_WRAPPER_3_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
+    {                                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
     }
-#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
 
-#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_DWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Double SIMDCALL op(Double a)               \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
 
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Double SIMDCALL op(Double a)                     \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
 
-#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_DWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)                \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
 
-#define SIMD_DWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+#define SIMD_DWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
 
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
 
-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
-#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
-#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_IWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)            \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
     }
 
-SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_8(add_epi8);      // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64);    // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);     // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and
+                                  // _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and
+                                  // _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and
+                                  // _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
+                                  // _mm512_packus_epi32
 SIMD_IWRAPPER_2_16(unpackhi_epi16);
 SIMD_IWRAPPER_2_64(unpackhi_epi64);
 SIMD_IWRAPPER_2_8(unpackhi_epi8);
@@ -151,8 +155,7 @@
 static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
 {
     __mmask64 m = 0xffffull;
-    return static_cast<uint32_t>(
-        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
 }
 
 #undef SIMD_WRAPPER_1_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
index b0cae50..1b6592e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512_knights.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -32,4 +32,3 @@
 // These use native AVX512 instructions with masking to enable a larger
 // register set.
 //============================================================================
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
index 00c094a..4ac0f95 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -30,178 +30,172 @@
 // SIMD256 AVX (1) implementation
 //============================================================================
 
-#define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
-    {\
-        return _mm256_##op(a);\
+#define SIMD_WRAPPER_1(op) \
+    static SIMDINLINE Float SIMDCALL op(Float const& a) { return _mm256_##op(a); }
+
+#define SIMD_WRAPPER_2(op)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return _mm256_##op(a, b);                                       \
     }
 
-#define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_DWRAPPER_2(op)                                                \
+    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
+    {                                                                      \
+        return _mm256_##op(a, b);                                          \
     }
 
-#define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_WRAPPER_2I(op)                                             \
+    template <int ImmT>                                                 \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return _mm256_##op(a, b, ImmT);                                 \
     }
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return  _mm256_##op(a, b, ImmT);\
+#define SIMD_DWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Double SIMDCALL op(Double const& a, Double const& b) \
+    {                                                                      \
+        return _mm256_##op(a, b, ImmT);                                    \
     }
 
-#define SIMD_DWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double const &a, Double const &b)   \
-    {\
-        return _mm256_##op(a, b, ImmT);\
+#define SIMD_WRAPPER_3(op)                                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
+    {                                                                                   \
+        return _mm256_##op(a, b, c);                                                    \
     }
 
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
-    {\
-        return _mm256_##op(a, b, c);\
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
+
+#define SIMD_IWRAPPER_2(op)                                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##op(a, b);                                             \
     }
 
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##op(a);\
+#define SIMD_IFWRAPPER_2(op, intrin)                                          \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return castps_si(intrin(castsi_ps(a), castsi_ps(b)));                 \
     }
 
-#define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_IFWRAPPER_2I(op, intrin)                                         \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return castps_si(intrin(castsi_ps(a), castsi_ps(b), ImmT));           \
     }
 
-#define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+#define SIMD_IWRAPPER_2I_(op, intrin)                                         \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##intrin(a, b, ImmT);                                   \
     }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
-#define SIMD_IFWRAPPER_2I(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
-    }
-
-#define SIMD_IWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##intrin(a, b, ImmT);\
-    }
-#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
-
-#define SIMD_IWRAPPER_3(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
-    {\
-        return _mm256_##op(a, b, c);\
+#define SIMD_IWRAPPER_3(op)                                                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
+    {                                                                                           \
+        return _mm256_##op(a, b, c);                                                            \
     }
 
 // emulated integer simd
-#define SIMD_EMU_IWRAPPER_1(op) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::op(a.v4[0]),\
-            SIMD128T::op(a.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_1(op)                             \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return Integer{                                     \
+            SIMD128T::op(a.v4[0]),                          \
+            SIMD128T::op(a.v4[1]),                          \
+        };                                                  \
     }
-#define SIMD_EMU_IWRAPPER_1L(op, shift) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer \
-        {\
-            SIMD128T::op(a.v4[0]), \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
-        };\
-    }\
-    static SIMDINLINE \
-    Integer SIMDCALL op(SIMD128Impl::Integer const &a)\
-    {\
-        return Integer \
-        {\
-            SIMD128T::op(a), \
-            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
-        };\
+#define SIMD_EMU_IWRAPPER_1L(op, shift)                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a)              \
+    {                                                                    \
+        return Integer{                                                  \
+            SIMD128T::op(a.v4[0]),                                       \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])),    \
+        };                                                               \
+    }                                                                    \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer const& a) \
+    {                                                                    \
+        return Integer{                                                  \
+            SIMD128T::op(a),                                             \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a)),          \
+        };                                                               \
     }
 
-#define SIMD_EMU_IWRAPPER_1I(op) \
-    template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::template op<ImmT>(a.v4[0]),\
-            SIMD128T::template op<ImmT>(a.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_1I(op)                            \
+    template <int ImmT>                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return Integer{                                     \
+            SIMD128T::template op<ImmT>(a.v4[0]),           \
+            SIMD128T::template op<ImmT>(a.v4[1]),           \
+        };                                                  \
     }
 
-#define SIMD_EMU_IWRAPPER_2(op) \
-    static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a, Integer const &b)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::op(a.v4[0], b.v4[0]),\
-            SIMD128T::op(a.v4[1], b.v4[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_2(op)                                               \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD128T::op(a.v4[0], b.v4[0]),                                   \
+            SIMD128T::op(a.v4[1], b.v4[1]),                                   \
+        };                                                                    \
     }
 
-#define SIMD_EMU_IWRAPPER_2I(op) \
-    template <int ImmT> static SIMDINLINE \
-    Integer SIMDCALL op(Integer const &a, Integer const &b)\
-    {\
-        return Integer\
-        {\
-            SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),\
-            SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),\
-        };\
+#define SIMD_EMU_IWRAPPER_2I(op)                                              \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),                    \
+            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),                    \
+        };                                                                    \
     }
 
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_2(add_ps); // return a + b
+SIMD_WRAPPER_2(div_ps); // return a / b
 
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c) // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
+                                          Float const& b,
+                                          Float const& c) // return (a * b) + c
 {
     return add_ps(mul_ps(a, b), c);
 }
 
-static SIMDINLINE Float SIMDCALL fmsub_ps(Float const &a, Float const &b, Float const &c) // return (a * b) - c
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float const& a,
+                                          Float const& b,
+                                          Float const& c) // return (a * b) - c
 {
     return sub_ps(mul_ps(a, b), c);
 }
 
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
+SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);   // return a - b
 
 template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
 {
     return _mm256_round_ps(a, static_cast<int>(RMT));
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
@@ -209,7 +203,7 @@
 SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
 SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -237,182 +231,184 @@
 SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
 SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 
-
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_EMU_IWRAPPER_1I(slli_epi32); // return a << ImmT
 
-static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const &vA, Integer const &vCount) // return a << b      (uint32)
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer const& vA,
+                                              Integer const& vCount) // return a << b      (uint32)
 {
     int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
 
-    aHi = _mm_extract_epi32(vAHi, 0);
+    aHi     = _mm_extract_epi32(vAHi, 0);
     countHi = _mm_extract_epi32(vCountHi, 0);
     aHi <<= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
 
-    aLow = _mm_extract_epi32(vALow, 0);
+    aLow     = _mm_extract_epi32(vALow, 0);
     countLow = _mm_extract_epi32(vCountLow, 0);
     aLow <<= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 0);
 
-    aHi = _mm_extract_epi32(vAHi, 1);
+    aHi     = _mm_extract_epi32(vAHi, 1);
     countHi = _mm_extract_epi32(vCountHi, 1);
     aHi <<= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
 
-    aLow = _mm_extract_epi32(vALow, 1);
+    aLow     = _mm_extract_epi32(vALow, 1);
     countLow = _mm_extract_epi32(vCountLow, 1);
     aLow <<= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 1);
 
-    aHi = _mm_extract_epi32(vAHi, 2);
+    aHi     = _mm_extract_epi32(vAHi, 2);
     countHi = _mm_extract_epi32(vCountHi, 2);
     aHi <<= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
 
-    aLow = _mm_extract_epi32(vALow, 2);
+    aLow     = _mm_extract_epi32(vALow, 2);
     countLow = _mm_extract_epi32(vCountLow, 2);
     aLow <<= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 2);
 
-    aHi = _mm_extract_epi32(vAHi, 3);
+    aHi     = _mm_extract_epi32(vAHi, 3);
     countHi = _mm_extract_epi32(vCountHi, 3);
     aHi <<= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
 
-    aLow = _mm_extract_epi32(vALow, 3);
+    aLow     = _mm_extract_epi32(vALow, 3);
     countLow = _mm_extract_epi32(vCountLow, 3);
     aLow <<= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 3);
 
     __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret         = _mm256_insertf128_si256(ret, vALow, 0);
     return ret;
 }
 
-SIMD_EMU_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32)
-SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)
-SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint)
+SIMD_EMU_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
+SIMD_EMU_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
+SIMD_EMU_IWRAPPER_1I(srli_si);    // return a >> (ImmT*8) (uint)
 
-template<int ImmT>                              // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
+template <int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
 {
     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 }
 
-static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const &vA, Integer const &vCount) // return a >> b      (uint32)
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer const& vA,
+                                              Integer const& vCount) // return a >> b      (uint32)
 {
     int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vAHi      = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow     = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi  = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
 
-    aHi = _mm_extract_epi32(vAHi, 0);
+    aHi     = _mm_extract_epi32(vAHi, 0);
     countHi = _mm_extract_epi32(vCountHi, 0);
     aHi >>= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
 
-    aLow = _mm_extract_epi32(vALow, 0);
+    aLow     = _mm_extract_epi32(vALow, 0);
     countLow = _mm_extract_epi32(vCountLow, 0);
     aLow >>= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 0);
 
-    aHi = _mm_extract_epi32(vAHi, 1);
+    aHi     = _mm_extract_epi32(vAHi, 1);
     countHi = _mm_extract_epi32(vCountHi, 1);
     aHi >>= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
 
-    aLow = _mm_extract_epi32(vALow, 1);
+    aLow     = _mm_extract_epi32(vALow, 1);
     countLow = _mm_extract_epi32(vCountLow, 1);
     aLow >>= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 1);
 
-    aHi = _mm_extract_epi32(vAHi, 2);
+    aHi     = _mm_extract_epi32(vAHi, 2);
     countHi = _mm_extract_epi32(vCountHi, 2);
     aHi >>= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
 
-    aLow = _mm_extract_epi32(vALow, 2);
+    aLow     = _mm_extract_epi32(vALow, 2);
     countLow = _mm_extract_epi32(vCountLow, 2);
     aLow >>= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 2);
 
-    aHi = _mm_extract_epi32(vAHi, 3);
+    aHi     = _mm_extract_epi32(vAHi, 3);
     countHi = _mm_extract_epi32(vCountHi, 3);
     aHi >>= countHi;
     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
 
-    aLow = _mm_extract_epi32(vALow, 3);
+    aLow     = _mm_extract_epi32(vALow, 3);
     countLow = _mm_extract_epi32(vCountLow, 3);
     aLow >>= countLow;
     vALow = _mm_insert_epi32(vALow, aLow, 3);
 
     __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    ret         = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret         = _mm256_insertf128_si256(ret, vALow, 0);
     return ret;
 }
 
-
-
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
 {
     return _mm256_castpd_ps(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
 {
     return _mm256_castps_si256(a);
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
 {
     return _mm256_castsi256_pd(a);
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
 {
     return _mm256_castps_pd(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castpd_si(Double const &a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castpd_si(Double const& a) // return *(Integer*)(&a)
 {
     return _mm256_castpd_si256(a);
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
 {
     return _mm256_castsi256_ps(a);
 }
 
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a) // return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL
+                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
 {
     return _mm256_cvtepi32_ps(a);
 }
 
-SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);                  // return (int16)a    (uint8 --> int16)
-SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);                  // return (int32)a    (uint8 --> int32)
-SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (uint16 --> int32)
-SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
-SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);  // return (int16)a    (uint8 --> int16)
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);  // return (int32)a    (uint8 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8); // return (int32)a    (uint16 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4); // return (int64)a    (uint16 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8); // return (int64)a    (uint32 --> int64)
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
 {
     return _mm256_cvtps_epi32(a);
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
 {
     return _mm256_cvttps_epi32(a);
 }
@@ -420,79 +416,107 @@
 //-----------------------------------------------------------------------
 // Comparison operations
 //-----------------------------------------------------------------------
-template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
+template <CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
 {
     return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
 }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
-
-SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
-SIMD_EMU_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
-SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
-SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
 {
-    return  0 != _mm256_testz_ps(a, b);
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
 }
 
-static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_EMU_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL
+                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
-    return  0 != _mm256_testz_si256(a, b);
+    return 0 != _mm256_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL
+                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return 0 != _mm256_testz_si256(a, b);
 }
 
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
-SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32)
-SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+SIMD_WRAPPER_2I(blend_ps);                       // return ImmT ? b : a  (float)
+SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps); // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);                       // return mask ? b : a  (float)
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Float const&   mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Integer const& mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 }
 
-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 {
     return _mm256_broadcast_ss(p);
 }
 
-SIMD_EMU_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_EMU_IWRAPPER_2(packs_epi16); // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_EMU_IWRAPPER_2(packs_epi32); // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_EMU_IWRAPPER_2(
+    packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_EMU_IWRAPPER_2(
+    packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
 {
     return _mm256_permute_ps(a, ImmT);
 }
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(
+    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
     Integer result;
 
     // Ugly slow implementation
-    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
-    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
-    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+    uint32_t const* pA      = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t*       pResult = reinterpret_cast<uint32_t*>(&result);
 
     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
     {
@@ -502,14 +526,15 @@
     return result;
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 {
     Float result;
 
     // Ugly slow implementation
-    float const *pA = reinterpret_cast<float const*>(&a);
-    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
-    float *pResult = reinterpret_cast<float *>(&result);
+    float const*    pA      = reinterpret_cast<float const*>(&a);
+    uint32_t const* pSwiz   = reinterpret_cast<uint32_t const*>(&swiz);
+    float*          pResult = reinterpret_cast<float*>(&result);
 
     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
     {
@@ -523,11 +548,10 @@
 SIMD_DWRAPPER_2I(permute2f128_pd);
 SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
 
-
 SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
 
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
 {
     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 }
@@ -550,83 +574,88 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult;
-    float* pResult = (float*)&vResult;
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult;
+    float*    pResult = (float*)&vResult;
     for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
     {
         uint32_t offset = pOffsets[i];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i]      = *(float const*)(((uint8_t const*)p + offset));
     }
 
     return vResult;
 }
 
-static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+static SIMDINLINE Float SIMDCALL
+                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 {
     return broadcast_ss(p);
 }
 
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
     return _mm256_load_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
     return _mm256_load_si256(&p->v);
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
     return _mm256_loadu_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
     return _mm256_lddqu_si256(&p->v);
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 {
-    uint32_t *pOffsets = (uint32_t*)&idx;
-    Float vResult = old;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t umask = movemask_ps(mask);
+    uint32_t* pOffsets = (uint32_t*)&idx;
+    Float     vResult  = old;
+    float*    pResult  = (float*)&vResult;
+    DWORD     index;
+    uint32_t  umask = movemask_ps(mask);
     while (_BitScanForward(&index, umask))
     {
         umask &= ~(1 << index);
         uint32_t offset = pOffsets[index];
-        offset = offset * static_cast<uint32_t>(ScaleT);
-        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+        offset          = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index]  = *(float const*)(((uint8_t const*)p + offset));
     }
 
     return vResult;
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
 {
     _mm256_maskstore_ps(p, mask, src);
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
 {
-    return SIMD128T::movemask_epi8(a.v4[0]) |
-           (SIMD128T::movemask_epi8(a.v4[1]) << 16);
+    return SIMD128T::movemask_epi8(a.v4[0]) | (SIMD128T::movemask_epi8(a.v4[1]) << 16);
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
 {
     return static_cast<uint32_t>(_mm256_movemask_pd(a));
 }
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
 {
     return static_cast<uint32_t>(_mm256_movemask_ps(a));
 }
@@ -641,32 +670,34 @@
     return _mm256_set1_epi8(i);
 }
 
-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 {
     return _mm256_set1_ps(f);
 }
 
-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 {
     return _mm256_setzero_ps();
 }
 
-static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 {
     return _mm256_setzero_si256();
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
 {
     _mm256_store_ps(p, a);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
 {
     _mm256_store_si256(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     _mm256_stream_ps(p, a);
 }
@@ -675,43 +706,43 @@
 // Legacy interface (available only in SIMD256 width)
 //=======================================================================
 
-static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
+static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const* p)
 {
     return _mm256_broadcast_ps(&p->v);
 }
 
-template<int ImmT>
-static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const &a)
+template <int ImmT>
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double const& a)
 {
     return _mm256_extractf128_pd(a, ImmT);
 }
 
-template<int ImmT>
-static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float const &a)
+template <int ImmT>
+static SIMDINLINE SIMD128Impl::Float SIMDCALL extractf128_ps(Float const& a)
 {
     return _mm256_extractf128_ps(a, ImmT);
 }
 
-template<int ImmT>
-static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const &a)
+template <int ImmT>
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer const& a)
 {
     return _mm256_extractf128_si256(a, ImmT);
 }
 
-template<int ImmT>
-static SIMDINLINE Double SIMDCALL insertf128_pd(Double const &a, SIMD128Impl::Double const &b)
+template <int ImmT>
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double const& a, SIMD128Impl::Double const& b)
 {
     return _mm256_insertf128_pd(a, b, ImmT);
 }
 
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL insertf128_ps(Float const &a, SIMD128Impl::Float const &b)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float const& a, SIMD128Impl::Float const& b)
 {
     return _mm256_insertf128_ps(a, b, ImmT);
 }
 
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const &a, SIMD128Impl::Integer const &b)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer const& a, SIMD128Impl::Integer const& b)
 {
     return _mm256_insertf128_si256(a, b, ImmT);
 }
@@ -727,33 +758,37 @@
     _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
 #endif
 
-static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
+static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi,
+                                             SIMD128Impl::Integer const* plo)
 {
     return _mm256_loadu2_m128i(&phi->v, &plo->v);
 }
 
-static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL
+                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
 {
     return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL
+                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
 {
     return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer const &src)
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer* phi,
+                                           SIMD128Impl::Integer* plo,
+                                           Integer const&        src)
 {
     _mm256_storeu2_m128i(&phi->v, &plo->v, src);
 }
 
 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 {
-    Integer vec = set1_epi32(mask);
-    const Integer bit = set_epi32(
-        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = and_si(vec, bit);
-    vec = cmplt_epi32(setzero_si(), vec);
+    Integer       vec = set1_epi32(mask);
+    const Integer bit = set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec               = and_si(vec, bit);
+    vec               = cmplt_epi32(setzero_si(), vec);
     return castsi_ps(vec);
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
index 96c24ff..59a61cf 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX2_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -32,62 +32,61 @@
 // Mostly these are integer operations that are no longer emulated with SSE
 //============================================================================
 
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##op(a);\
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) { return _mm256_##op(a); }
+
+#define SIMD_IWRAPPER_1L(op)                                \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return _mm256_##op(_mm256_castsi256_si128(a));      \
     }
 
-#define SIMD_IWRAPPER_1L(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##op(_mm256_castsi256_si128(a));\
-    }\
-
-#define SIMD_IWRAPPER_1I(op)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##op(a, ImmT);\
+#define SIMD_IWRAPPER_1I(op)                                \
+    template <int ImmT>                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return _mm256_##op(a, ImmT);                        \
     }
 
-#define SIMD_IWRAPPER_1I_(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return _mm256_##intrin(a, ImmT);\
+#define SIMD_IWRAPPER_1I_(op, intrin)                       \
+    template <int ImmT>                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return _mm256_##intrin(a, ImmT);                    \
     }
 
-#define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##intrin(a, b);\
+#define SIMD_IWRAPPER_2_(op, intrin)                                          \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##intrin(a, b);                                         \
     }
 
-#define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##op(a, b);\
+#define SIMD_IWRAPPER_2(op)                                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##op(a, b);                                             \
     }
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##op(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I(op)                                                  \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##op(a, b, ImmT);                                       \
     }
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return _mm256_##op(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I(op)                                                  \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return _mm256_##op(a, b, ImmT);                                       \
     }
 
 //-----------------------------------------------------------------------
 // Floating point arithmetic operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL fmadd_ps(Float const &a, Float const &b, Float const &c)   // return (a * b) + c
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float const& a,
+                                          Float const& b,
+                                          Float const& c) // return (a * b) + c
 {
     return _mm256_fmadd_ps(a, b, c);
 }
@@ -98,7 +97,7 @@
 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -117,51 +116,50 @@
 //-----------------------------------------------------------------------
 // Logical operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si,    and_si256);     // return a & b       (int)
-SIMD_IWRAPPER_2_(andnot_si, andnot_si256);  // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si,     or_si256);      // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si,    xor_si256);     // return a ^ b       (int)
-
+SIMD_IWRAPPER_2_(and_si, and_si256);       // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si256); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si, or_si256);         // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si256);       // return a ^ b       (int)
 
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
-SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (uint32)
-SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32)
-SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+SIMD_IWRAPPER_1I(slli_epi32);           // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);            // return a << b      (uint32)
+SIMD_IWRAPPER_1I(srai_epi32);           // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);           // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2(srlv_epi32);            // return a >> b      (uint32)
+SIMD_IWRAPPER_1I_(srli_si, srli_si256); // return a >> (ImmT*8) (uint)
 
-template<int ImmT>                          // same as srli_si, but with Float cast to int
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)
+template <int ImmT> // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float const& a)
 {
     return castsi_ps(srli_si<ImmT>(castps_si(a)));
 }
 
-
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1L(cvtepu8_epi16);    // return (int16)a    (uint8 --> int16)
-SIMD_IWRAPPER_1L(cvtepu8_epi32);    // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1L(cvtepu16_epi32);   // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1L(cvtepu16_epi64);   // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1L(cvtepu32_epi64);   // return (int64)a    (uint32 --> int64)
+SIMD_IWRAPPER_1L(cvtepu8_epi16);  // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1L(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1L(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
 
 //-----------------------------------------------------------------------
 // Comparison operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
 
-static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const &a, Integer const &b)   // return a < b (int32)
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer const& a,
+                                               Integer const& b) // return a < b (int32)
 {
     return cmpgt_epi32(b, a);
 }
@@ -169,28 +167,29 @@
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
-SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
 {
     return _mm256_permute_ps(a, ImmT);
 }
 
 SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 {
     return _mm256_permutevar8x32_ps(a, swiz);
 }
 
 SIMD_IWRAPPER_1I(shuffle_epi32);
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const &a, Integer const &b)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer const& a, Integer const& b)
 {
     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 }
@@ -207,22 +206,24 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 {
-	// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
-	// Only for this intrinsic - not sure why. :(
+    // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
+    // Only for this intrinsic - not sure why. :(
     return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer const& a)
 {
     return static_cast<uint32_t>(_mm256_movemask_epi8(a));
 }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
index 3fcfd25..7906098 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -34,120 +34,138 @@
 //============================================================================
 
 private:
-    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps256_ps512(r.v); }
-    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }
-    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
-    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps256(r); }
-    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
-    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
+static SIMDINLINE __m512 __conv(Float r)
+{
+    return _mm512_castps256_ps512(r.v);
+}
+static SIMDINLINE __m512d __conv(Double r)
+{
+    return _mm512_castpd256_pd512(r.v);
+}
+static SIMDINLINE __m512i __conv(Integer r)
+{
+    return _mm512_castsi256_si512(r.v);
+}
+static SIMDINLINE Float __conv(__m512 r)
+{
+    return _mm512_castps512_ps256(r);
+}
+static SIMDINLINE Double __conv(__m512d r)
+{
+    return _mm512_castpd512_pd256(r);
+}
+static SIMDINLINE Integer __conv(__m512i r)
+{
+    return _mm512_castsi512_si256(r);
+}
+
 public:
-
-#define SIMD_WRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_WRAPPER_1_(op, intrin, mask)                        \
+    static SIMDINLINE Float SIMDCALL op(Float a)                 \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
 
-#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_WRAPPER_1I_(op, intrin, mask)                             \
+    template <int ImmT>                                                \
+    static SIMDINLINE Float SIMDCALL op(Float a)                       \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
+#define SIMD_WRAPPER_1I(op) SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
 
-#define SIMD_WRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_WRAPPER_2_(op, intrin, mask)                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+#define SIMD_WRAPPER_2I(op)                                                 \
+    template <int ImmT>                                                     \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                   \
+    {                                                                       \
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_WRAPPER_3_(op, intrin, mask)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+#define SIMD_WRAPPER_3_(op, intrin, mask)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)                     \
+    {                                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c))); \
     }
-#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
+#define SIMD_WRAPPER_3(op) SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
 
-#define SIMD_DWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+#define SIMD_DWRAPPER_2I(op)                                               \
+    template <int ImmT>                                                    \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)               \
+    {                                                                      \
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT)); \
     }
 
-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
+#define SIMD_IWRAPPER_1_32(op) SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
 
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
+#define SIMD_IWRAPPER_1I_32(op) SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
 
-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
+#define SIMD_IWRAPPER_2_32(op) SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+#define SIMD_IWRAPPER_2I(op)                                                \
+    template <int ImmT>                                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
+    {                                                                       \
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT)); \
     }
 
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_2(add_ps);                                 // return a + b
+SIMD_WRAPPER_2(div_ps);                                 // return a / b
+SIMD_WRAPPER_3(fmadd_ps);                               // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);                               // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);                                 // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);                                 // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);                                 // return a * b
 SIMD_WRAPPER_1_(rcp_ps, rcp14_ps, __mmask16(0xff));     // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff));   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt14_ps, __mmask16(0xff)); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);                                 // return a - b
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
-SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
-SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
-SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
-SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+SIMD_IWRAPPER_1_32(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32); // return a * b (int32)
 
 // SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+// SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 
 // return (a * b) & 0xFFFFFFFF
 //
 // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
 // and store the low 32 bits of the intermediate integers in dst.
 SIMD_IWRAPPER_2_32(mullo_epi32);
-SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+SIMD_IWRAPPER_2_32(sub_epi32); // return a - b (int32)
 
 // SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
 // SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
@@ -155,23 +173,22 @@
 //-----------------------------------------------------------------------
 // Logical operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xff));    // return a & b       (int)
+SIMD_IWRAPPER_2_(and_si, and_epi32, __mmask16(0xff));       // return a & b       (int)
 SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xff));     // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xff));    // return a ^ b       (int)
-
+SIMD_IWRAPPER_2_(or_si, or_epi32, __mmask16(0xff));         // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_epi32, __mmask16(0xff));       // return a ^ b       (int)
 
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
-SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
-SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
-SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+SIMD_IWRAPPER_1I_32(slli_epi32); // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);  // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32); // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32); // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);  // return a >> b      (uint32)
 
 // use AVX2 version
-//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+// SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
 
 //-----------------------------------------------------------------------
 // Conversion operations (Use AVX2 versions)
@@ -185,16 +202,16 @@
 //-----------------------------------------------------------------------
 // Comparison operations (Use AVX2 versions
 //-----------------------------------------------------------------------
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
 //
-//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+// static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
 //{
 //    return cmpgt_epi32(b, a);
 //}
@@ -202,25 +219,28 @@
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-// SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-// SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-// SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+// SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8
+// SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16
+// SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8
+// SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16
+// See documentation for the _mm256_ and _mm512_ forms of packs_epi16, packs_epi32,
+// packus_epi16 and packus_epi32.
 
 // SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
 
-//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+// return a[swiz[i]] for each 32-bit lane i (float)
+// static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
 //{
 //    return _mm256_permutevar8x32_ps(a, swiz);
 //}
 
 SIMD_IWRAPPER_1I_32(shuffle_epi32);
-//template<int ImmT>
-//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+// template<int ImmT>
+// static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
 //{
 //    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
 //}
-//SIMD_IWRAPPER_2(shuffle_epi8);
+// SIMD_IWRAPPER_2(shuffle_epi8);
 SIMD_IWRAPPER_2_32(unpackhi_epi32);
 SIMD_IWRAPPER_2_32(unpacklo_epi32);
 
@@ -234,50 +254,47 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
     return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
     return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
     return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
     return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
 }
 
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return __conv(_mm512_mask_i32gather_ps(
-                    _mm512_setzero_ps(),
-                    __mmask16(0xff),
-                    __conv(idx),
-                    p,
-                    static_cast<int>(ScaleT)));
+        _mm512_setzero_ps(), __mmask16(0xff), __conv(idx), p, static_cast<int>(ScaleT)));
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 {
     __mmask16 m = 0xff;
-    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
-                                _mm512_set1_epi32(0x80000000));
-    return __conv(_mm512_mask_i32gather_ps(
-                    __conv(old),
-                    m,
-                    __conv(idx),
-                    p,
-                    static_cast<int>(ScaleT)));
+    m           = _mm512_mask_test_epi32_mask(
+        m, _mm512_castps_si512(__conv(mask)), _mm512_set1_epi32(0x80000000));
+    return __conv(
+        _mm512_mask_i32gather_ps(__conv(old), m, __conv(idx), p, static_cast<int>(ScaleT)));
 }
 
 // static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
@@ -287,19 +304,20 @@
 //         _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
 // }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
 {
     __mmask16 m = 0xff;
-    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    m           = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
     _mm512_mask_storeu_ps(p, m, __conv(src));
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
 {
     _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
 {
     _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
 }
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
index 6ffe7c2..1acdc7e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_core.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -33,65 +33,68 @@
 // register set.
 //============================================================================
 
-#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_DWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Double SIMDCALL op(Double a)               \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
+#define SIMD_DWRAPPER_1(op) SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
 
-#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Double SIMDCALL op(Double a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Double SIMDCALL op(Double a)                     \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
+#define SIMD_DWRAPPER_1I(op) SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
 
-#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_DWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)                \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
+#define SIMD_DWRAPPER_2(op) SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
 
-#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+#define SIMD_IWRAPPER_1_(op, intrin, mask)                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)             \
+    {                                                            \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a))); \
     }
-#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
+#define SIMD_IWRAPPER_1_8(op) SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1_16(op) SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1_64(op) SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
 
-#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)                            \
+    template <int ImmT>                                                \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)                   \
+    {                                                                  \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT)); \
     }
-#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
+#define SIMD_IWRAPPER_1I_8(op) SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1I_16(op) SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1I_64(op) SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
 
-#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+#define SIMD_IWRAPPER_2_(op, intrin, mask)                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)             \
+    {                                                                       \
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b))); \
     }
-#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
-#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
-#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
+#define SIMD_IWRAPPER_2_8(op) SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_2_16(op) SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_2_64(op) SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
 
-
-SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
-SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
-SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
-SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_8(add_epi8);      // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);     // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2_64(sub_epi64);    // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);     // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_IWRAPPER_2_8(packs_epi16);   // int16 --> int8    See documentation for _mm256_packs_epi16 and
+                                  // _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);  // int32 --> int16   See documentation for _mm256_packs_epi32 and
+                                  // _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);  // uint16 --> uint8  See documentation for _mm256_packus_epi16 and
+                                  // _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32); // uint32 --> uint16 See documentation for _mm256_packus_epi32 and
+                                  // _mm512_packus_epi32
 SIMD_IWRAPPER_2_16(unpackhi_epi16);
 SIMD_IWRAPPER_2_64(unpackhi_epi64);
 SIMD_IWRAPPER_2_8(unpackhi_epi8);
@@ -102,8 +105,7 @@
 static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
 {
     __mmask64 m = 0xffffffffull;
-    return static_cast<uint32_t>(
-        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+    return static_cast<uint32_t>(_mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
 }
 
 #undef SIMD_DWRAPPER_1_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
index acd8ffd..52b6ca2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512_knights.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -32,4 +32,3 @@
 // These use native AVX512 instructions with masking to enable a larger
 // register set.
 //============================================================================
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
index dfe19d3..e9e908a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -1,41 +1,41 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
 
-#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
 // gcc as of 7.1 was missing these intrinsics
 #ifndef _mm512_cmpneq_ps_mask
-#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
+#define _mm512_cmpneq_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_NEQ_UQ)
 #endif
 
 #ifndef _mm512_cmplt_ps_mask
-#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
+#define _mm512_cmplt_ps_mask(a, b) _mm512_cmp_ps_mask((a), (b), _CMP_LT_OS)
 #endif
 
 #ifndef _mm512_cmplt_pd_mask
-#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
+#define _mm512_cmplt_pd_mask(a, b) _mm512_cmp_pd_mask((a), (b), _CMP_LT_OS)
 #endif
 
 #endif
@@ -47,136 +47,108 @@
 //============================================================================
 
 static const int TARGET_SIMD_WIDTH = 16;
-using SIMD256T = SIMD256Impl::AVX2Impl;
+using SIMD256T                     = SIMD256Impl::AVX2Impl;
 
-#define SIMD_WRAPPER_1_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return intrin(a);\
-    }
+#define SIMD_WRAPPER_1_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
 
-#define SIMD_WRAPPER_1(op)  \
-    SIMD_WRAPPER_1_(op, _mm512_##op)
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
 
-#define SIMD_WRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
+#define SIMD_WRAPPER_2_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
 #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
 
-#define SIMD_WRAPPERI_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_castsi512_ps(_mm512_##intrin(\
-            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+#define SIMD_WRAPPERI_2_(op, intrin)                                          \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
+    {                                                                         \
+        return _mm512_castsi512_ps(                                           \
+            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
     }
 
-#define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##op(a, b);\
-    }
+#define SIMD_DWRAPPER_2(op) \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
 
-#define SIMD_WRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_WRAPPER_2I_(op, intrin)                      \
+    template <int ImmT>                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    {                                                     \
+        return _mm512_##intrin(a, b, ImmT);               \
     }
-#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
 
-#define SIMD_DWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_DWRAPPER_2I_(op, intrin)                        \
+    template <int ImmT>                                      \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    {                                                        \
+        return _mm512_##intrin(a, b, ImmT);                  \
     }
-#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
 
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm512_##op(a, b, c);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
 
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
-#define SIMD_IWRAPPER_1_8(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
+#define SIMD_IWRAPPER_1_8(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1_4(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1_4(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1I_(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return intrin(a, ImmT);\
+#define SIMD_IWRAPPER_1I_(op, intrin)                \
+    template <int ImmT>                              \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    {                                                \
+        return intrin(a, ImmT);                      \
     }
 #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
 
-#define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
-#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+#define SIMD_IWRAPPER_2_(op, intrin) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
 
-#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return cmp(a, b);\
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
+
+#define SIMD_IFWRAPPER_2(op, intrin)                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
+    {                                                                  \
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
     }
 
-#define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
-    }
-
-#define SIMD_IWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I_(op, intrin)                           \
+    template <int ImmT>                                         \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    {                                                           \
+        return _mm512_##intrin(a, b, ImmT);                     \
     }
 #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
 private:
-    static SIMDINLINE Integer vmask(__mmask16 m)
-    {
-        return _mm512_maskz_set1_epi32(m, -1);
-    }
+static SIMDINLINE Integer vmask(__mmask16 m)
+{
+    return _mm512_maskz_set1_epi32(m, -1);
+}
 
-    static SIMDINLINE Integer vmask(__mmask8 m)
-    {
-        return _mm512_maskz_set1_epi64(m, -1LL);
-    }
+static SIMDINLINE Integer vmask(__mmask8 m)
+{
+    return _mm512_maskz_set1_epi64(m, -1LL);
+}
 
 public:
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);       // return 1.0f / a
-SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(add_ps);                       // return a + b
+SIMD_WRAPPER_2(div_ps);                       // return a / b
+SIMD_WRAPPER_3(fmadd_ps);                     // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);                     // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);                       // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);                       // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);                       // return a * b
+SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);     // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);                       // return a - b
 
 template <RoundMode RMT>
 static SIMDINLINE Float SIMDCALL round_ps(Float a)
@@ -184,52 +156,57 @@
     return _mm512_roundscale_ps(a, static_cast<int>(RMT));
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
 //-----------------------------------------------------------------------
 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
-//SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-//SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+// SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+// SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
 
-                            // return (a * b) & 0xFFFFFFFF
-                            //
-                            // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
-                            // and store the low 32 bits of the intermediate integers in dst.
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
 SIMD_IWRAPPER_2(mullo_epi32);
 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
-//SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+// SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 
 //-----------------------------------------------------------------------
 // Logical operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
-SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
-SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
-SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
+SIMD_IWRAPPER_2_(and_si, and_si512);       // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si512); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si, or_si512);         // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si512);       // return a ^ b       (int)
 
 // SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
 // SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
 // SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
 // SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
 
-
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_1I(slli_epi32); // return a << ImmT
 SIMD_IWRAPPER_2(sllv_epi32);
-SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
-SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_1I(srai_epi32); // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32); // return a >> ImmT   (uint32)
 
 #if 0
 SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)
@@ -246,32 +223,32 @@
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a) // return *(Float*)(&a)
 {
     return _mm512_castpd_ps(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float a) // return *(Integer*)(&a)
 {
     return _mm512_castps_si512(a);
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a) // return *(Double*)(&a)
 {
     return _mm512_castsi512_pd(a);
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float a) // return *(Double*)(&a)
 {
     return _mm512_castps_pd(a);
 }
 
-static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a) // return *(Integer*)(&a)
 {
     return _mm512_castpd_si512(a);
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a) // return *(Float*)(&a)
 {
     return _mm512_castsi512_ps(a);
 }
@@ -281,18 +258,19 @@
     return _mm512_cvtepi32_ps(a);
 }
 
-//SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
-SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
-SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
-SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
-SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+// SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1_4(cvtepu8_epi32);  // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1_8(cvtepu16_epi32); // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1_4(cvtepu16_epi64); // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1_8(cvtepu32_epi64); // return (int64)a    (uint32 --> int64)
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a) // return (int32)a    (float --> int32)
 {
     return _mm512_cvtps_epi32(a);
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvttps_epi32(Float a) // return (int32)a    (rnd_to_zero(float) --> int32)
 {
     return _mm512_cvttps_epi32(a);
 }
@@ -300,13 +278,13 @@
 //-----------------------------------------------------------------------
 // Comparison operations
 //-----------------------------------------------------------------------
-template<CompareType CmpTypeT>
+template <CompareType CmpTypeT>
 static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
 {
     return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
 }
 
-template<CompareType CmpTypeT>
+template <CompareType CmpTypeT>
 static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
 {
     // Legacy vector mask generator
@@ -314,21 +292,39 @@
     return castsi_ps(vmask(result));
 }
 
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
+}
 
-template<CompareTypeInt CmpTypeT>
+template <CompareTypeInt CmpTypeT>
 static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
 {
     // Legacy vector mask generator
     __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
     return vmask(result);
 }
-template<CompareTypeInt CmpTypeT>
+template <CompareTypeInt CmpTypeT>
 static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
 {
     // Legacy vector mask generator
@@ -336,22 +332,24 @@
     return vmask(result);
 }
 
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
-//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
-SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>); // return a == b (int32)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>); // return a == b (int64)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
+// SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>); // return a > b (int32)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>); // return a > b (int64)
+SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>); // return a < b (int32)
 
-static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL testz_ps(Float a,
+                                         Float b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
     return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
 }
 
-static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL testz_si(Integer a,
+                                         Integer b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
 {
     return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
 }
@@ -376,75 +374,82 @@
     return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
 }
 
-
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
+                                                Integer b,
+                                                Float   mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a,
+                                                Integer b,
+                                                Integer mask) // return mask ? b : a (int)
 {
     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
 }
 
-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 {
     return _mm512_set1_ps(*p);
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
 {
     return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
 {
     return _mm512_extractf64x4_pd(a, imm);
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
 {
     return _mm512_extracti64x4_epi64(a, imm);
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
 {
     return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
 {
     return _mm512_insertf64x4(a, b, imm);
 }
 
-template<int imm>
+template <int imm>
 static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
 {
     return _mm512_inserti64x4(a, b, imm);
 }
 
-// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
-// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
-// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
-// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
+// Pack wrappers (currently commented out):
+// SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
+// SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
+// SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
+// SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
 
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
 {
     return _mm512_permute_ps(a, ImmT);
 }
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Integer SIMDCALL
+                          permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int)
 {
     return _mm512_permutexvar_epi32(swiz, a);
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 {
     return _mm512_permutexvar_ps(swiz, a);
 }
@@ -455,11 +460,11 @@
 
 SIMD_IWRAPPER_1I(shuffle_epi32);
 
-//SIMD_IWRAPPER_2(shuffle_epi8);
+// SIMD_IWRAPPER_2(shuffle_epi8);
 SIMD_DWRAPPER_2I(shuffle_pd);
 SIMD_WRAPPER_2I(shuffle_ps);
 
-template<int ImmT>
+template <int ImmT>
 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
 {
     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
@@ -467,73 +472,79 @@
 
 SIMD_IWRAPPER_2(unpackhi_epi16);
 
-//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
+// SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
 static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
 {
     return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
 }
 
 SIMD_IWRAPPER_2(unpackhi_epi64);
-//SIMD_IWRAPPER_2(unpackhi_epi8);
+// SIMD_IWRAPPER_2(unpackhi_epi8);
 SIMD_DWRAPPER_2(unpackhi_pd);
 SIMD_WRAPPER_2(unpackhi_ps);
-//SIMD_IWRAPPER_2(unpacklo_epi16);
+// SIMD_IWRAPPER_2(unpacklo_epi16);
 SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
 SIMD_IWRAPPER_2(unpacklo_epi64);
-//SIMD_IWRAPPER_2(unpacklo_epi8);
+// SIMD_IWRAPPER_2(unpacklo_epi8);
 SIMD_DWRAPPER_2(unpacklo_pd);
 SIMD_WRAPPER_2(unpacklo_ps);
 
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
     return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
 }
 
-static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+static SIMDINLINE Float SIMDCALL
+                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 {
     return broadcast_ss(p);
 }
 
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
     return _mm512_load_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
     return _mm512_load_si512(&p->v);
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
     return _mm512_loadu_ps(p);
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
     return _mm512_loadu_si512(p);
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
 {
     __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
 
     return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer mask, Float src)
 {
     Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
     _mm512_mask_store_ps(p, m, src);
 }
 
-//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+// static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
 //{
 //    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
 //    return static_cast<uint64_t>(m);
@@ -565,78 +576,99 @@
     return _mm512_set1_epi8(i);
 }
 
-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 {
     return _mm512_set1_ps(f);
 }
 
-static SIMDINLINE Double SIMDCALL setzero_pd()      // return 0 (double)
+static SIMDINLINE Double SIMDCALL setzero_pd() // return 0 (double)
 {
     return _mm512_setzero_pd();
 }
 
-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 {
     return _mm512_setzero_ps();
 }
 
-static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 {
     return _mm512_setzero_si512();
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float a) // *p = a   (stores all elements contiguously in memory)
 {
     _mm512_store_ps(p, a);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer a) // *p = a
 {
     _mm512_store_si512(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+static SIMDINLINE void SIMDCALL
+                       storeu_si(Integer* p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
 {
     _mm512_storeu_si512(&p->v, a);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+                       stream_ps(float* p, Float a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     _mm512_stream_ps(p, a);
 }
 
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
+                                             int i14,
+                                             int i13,
+                                             int i12,
+                                             int i11,
+                                             int i10,
+                                             int i9,
+                                             int i8,
+                                             int i7,
+                                             int i6,
+                                             int i5,
+                                             int i4,
+                                             int i3,
+                                             int i2,
+                                             int i1,
+                                             int i0)
 {
-    return _mm512_set_epi32(
-        i15, i14, i13, i12, i11, i10, i9, i8,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return _mm512_set_epi32(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL
+                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
 {
-    return set_epi32(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL set_ps(float i15,
+                                        float i14,
+                                        float i13,
+                                        float i12,
+                                        float i11,
+                                        float i10,
+                                        float i9,
+                                        float i8,
+                                        float i7,
+                                        float i6,
+                                        float i5,
+                                        float i4,
+                                        float i3,
+                                        float i2,
+                                        float i1,
+                                        float i0)
 {
-    return _mm512_set_ps(
-        i15, i14, i13, i12, i11, i10, i9, i8,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return _mm512_set_ps(i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL
+                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
 {
-    return set_ps(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
@@ -665,4 +697,3 @@
 #undef SIMD_IWRAPPER_2
 #undef SIMD_IWRAPPER_2_
 #undef SIMD_IWRAPPER_2I
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
index fed6307..82aa2bb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_core.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -29,139 +29,111 @@
 //
 //============================================================================
 
-#define SIMD_WRAPPER_1_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return intrin(a);\
-    }
+#define SIMD_WRAPPER_1_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
 
-#define SIMD_WRAPPER_1(op)  \
-    SIMD_WRAPPER_1_(op, _mm512_##op)
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
 
-#define SIMD_WRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
+#define SIMD_WRAPPER_2_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
 #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
 
-#define SIMD_WRAPPERI_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_castsi512_ps(_mm512_##intrin(\
-            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+#define SIMD_WRAPPERI_2_(op, intrin)                                          \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
+    {                                                                         \
+        return _mm512_castsi512_ps(                                           \
+            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
     }
 
-#define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##op(a, b);\
-    }
+#define SIMD_DWRAPPER_2(op) \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
 
-#define SIMD_WRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_WRAPPER_2I_(op, intrin)                      \
+    template <int ImmT>                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    {                                                     \
+        return _mm512_##intrin(a, b, ImmT);               \
     }
-#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
 
-#define SIMD_DWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_DWRAPPER_2I_(op, intrin)                        \
+    template <int ImmT>                                      \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    {                                                        \
+        return _mm512_##intrin(a, b, ImmT);                  \
     }
-#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
 
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm512_##op(a, b, c);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
 
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
-#define SIMD_IWRAPPER_1_8(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
+#define SIMD_IWRAPPER_1_8(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1_4(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1_4(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1I_(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return intrin(a, ImmT);\
+#define SIMD_IWRAPPER_1I_(op, intrin)                \
+    template <int ImmT>                              \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    {                                                \
+        return intrin(a, ImmT);                      \
     }
 #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
 
-#define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
-#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+#define SIMD_IWRAPPER_2_(op, intrin) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
 
-#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return cmp(a, b);\
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
+
+#define SIMD_IFWRAPPER_2(op, intrin)                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
+    {                                                                  \
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
     }
 
-#define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
-    }
-
-#define SIMD_IWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I_(op, intrin)                           \
+    template <int ImmT>                                         \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    {                                                           \
+        return _mm512_##intrin(a, b, ImmT);                     \
     }
 #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
 private:
-    static SIMDINLINE Integer vmask(__mmask32 m)
-    {
-        return _mm512_maskz_set1_epi16(m, -1);
-    }
-    static SIMDINLINE Integer vmask(__mmask64 m)
-    {
-        return _mm512_maskz_set1_epi8(m, -1);
-    }
+static SIMDINLINE Integer vmask(__mmask32 m)
+{
+    return _mm512_maskz_set1_epi16(m, -1);
+}
+static SIMDINLINE Integer vmask(__mmask64 m)
+{
+    return _mm512_maskz_set1_epi8(m, -1);
+}
+
 public:
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
 
-SIMD_IWRAPPER_2(add_epi8);                  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8);                 // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
-SIMD_IWRAPPER_2(subs_epu8);                 // return (b > a) ? 0 : (a - b) (uint8)
+SIMD_WRAPPER_2(and_ps);    // return a & b       (float treated as int)
+SIMD_WRAPPER_2(andnot_ps); // return (~a) & b    (float treated as int)
+SIMD_WRAPPER_2(or_ps);     // return a | b       (float treated as int)
+SIMD_WRAPPER_2(xor_ps);    // return a ^ b       (float treated as int)
 
-SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
-SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
-SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
-SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_1_8(cvtepu8_epi16); // return (int16)a    (uint8 --> int16)
 
-SIMD_IWRAPPER_1_8(cvtepu8_epi16);           // return (int16)a    (uint8 --> int16)
-
-template<CompareTypeInt CmpTypeT>
+template <CompareTypeInt CmpTypeT>
 static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
 {
     // Legacy vector mask generator
     __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
     return vmask(result);
 }
-template<CompareTypeInt CmpTypeT>
+template <CompareTypeInt CmpTypeT>
 static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
 {
     // Legacy vector mask generator
@@ -169,19 +141,19 @@
     return vmask(result);
 }
 
-SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
-SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
-SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi8, cmp_epi8<CompareTypeInt::EQ>);   // return a == b (int8)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>); // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi8, cmp_epi8<CompareTypeInt::GT>);   // return a > b (int8)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>); // return a > b (int16)
 
-SIMD_IWRAPPER_2(packs_epi16);               // See documentation for _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);               // See documentation for _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);              // See documentation for _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);              // See documentation for _mm512_packus_epi32
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm512_packus_epi32
 
-SIMD_IWRAPPER_2(unpackhi_epi8);             // See documentation for _mm512_unpackhi_epi8
-SIMD_IWRAPPER_2(unpacklo_epi16);            // See documentation for _mm512_unpacklo_epi16
-SIMD_IWRAPPER_2(unpacklo_epi8);             // See documentation for _mm512_unpacklo_epi8
+SIMD_IWRAPPER_2(unpackhi_epi8);  // See documentation for _mm512_unpackhi_epi8
+SIMD_IWRAPPER_2(unpacklo_epi16); // See documentation for _mm512_unpacklo_epi16
+SIMD_IWRAPPER_2(unpacklo_epi8);  // See documentation for _mm512_unpacklo_epi8
 
 SIMD_IWRAPPER_2(shuffle_epi8);
 
@@ -191,8 +163,6 @@
     return static_cast<uint64_t>(m);
 }
 
-
-
 #undef SIMD_WRAPPER_1_
 #undef SIMD_WRAPPER_1
 #undef SIMD_WRAPPER_2
@@ -214,4 +184,3 @@
 #undef SIMD_IWRAPPER_2
 #undef SIMD_IWRAPPER_2_
 #undef SIMD_IWRAPPER_2I
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
index 690ab38..9ec3ff6 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_knights.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -29,113 +29,85 @@
 //
 //============================================================================
 
-#define SIMD_WRAPPER_1_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a)   \
-    {\
-        return intrin(a);\
-    }
+#define SIMD_WRAPPER_1_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a) { return intrin(a); }
 
-#define SIMD_WRAPPER_1(op)  \
-    SIMD_WRAPPER_1_(op, _mm512_##op)
+#define SIMD_WRAPPER_1(op) SIMD_WRAPPER_1_(op, _mm512_##op)
 
-#define SIMD_WRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
+#define SIMD_WRAPPER_2_(op, intrin) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) { return _mm512_##intrin(a, b); }
 #define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
 
-#define SIMD_WRAPPERI_2_(op, intrin)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_castsi512_ps(_mm512_##intrin(\
-            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+#define SIMD_WRAPPERI_2_(op, intrin)                                          \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)                     \
+    {                                                                         \
+        return _mm512_castsi512_ps(                                           \
+            _mm512_##intrin(_mm512_castps_si512(a), _mm512_castps_si512(b))); \
     }
 
-#define SIMD_DWRAPPER_2(op)  \
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##op(a, b);\
-    }
+#define SIMD_DWRAPPER_2(op) \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) { return _mm512_##op(a, b); }
 
-#define SIMD_WRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_WRAPPER_2I_(op, intrin)                      \
+    template <int ImmT>                                   \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b) \
+    {                                                     \
+        return _mm512_##intrin(a, b, ImmT);               \
     }
-#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+#define SIMD_WRAPPER_2I(op) SIMD_WRAPPER_2I_(op, op)
 
-#define SIMD_DWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_DWRAPPER_2I_(op, intrin)                        \
+    template <int ImmT>                                      \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b) \
+    {                                                        \
+        return _mm512_##intrin(a, b, ImmT);                  \
     }
-#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+#define SIMD_DWRAPPER_2I(op) SIMD_DWRAPPER_2I_(op, op)
 
-#define SIMD_WRAPPER_3(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
-    {\
-        return _mm512_##op(a, b, c);\
-    }
+#define SIMD_WRAPPER_3(op) \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) { return _mm512_##op(a, b, c); }
 
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
-#define SIMD_IWRAPPER_1_8(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1(op) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) { return _mm512_##op(a); }
+#define SIMD_IWRAPPER_1_8(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1_4(op)  \
-    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
-    {\
-        return _mm512_##op(a);\
-    }
+#define SIMD_IWRAPPER_1_4(op) \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a) { return _mm512_##op(a); }
 
-#define SIMD_IWRAPPER_1I_(op, intrin)  \
-    template<int ImmT> \
-    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
-    {\
-        return intrin(a, ImmT);\
+#define SIMD_IWRAPPER_1I_(op, intrin)                \
+    template <int ImmT>                              \
+    static SIMDINLINE Integer SIMDCALL op(Integer a) \
+    {                                                \
+        return intrin(a, ImmT);                      \
     }
 #define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
 
-#define SIMD_IWRAPPER_2_(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b);\
-    }
-#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+#define SIMD_IWRAPPER_2_(op, intrin) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return _mm512_##intrin(a, b); }
+#define SIMD_IWRAPPER_2(op) SIMD_IWRAPPER_2_(op, op)
 
-#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return cmp(a, b);\
+#define SIMD_IWRAPPER_2_CMP(op, cmp) \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) { return cmp(a, b); }
+
+#define SIMD_IFWRAPPER_2(op, intrin)                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)        \
+    {                                                                  \
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b))); \
     }
 
-#define SIMD_IFWRAPPER_2(op, intrin)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
-    }
-
-#define SIMD_IWRAPPER_2I_(op, intrin)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
-    {\
-        return _mm512_##intrin(a, b, ImmT);\
+#define SIMD_IWRAPPER_2I_(op, intrin)                           \
+    template <int ImmT>                                         \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b) \
+    {                                                           \
+        return _mm512_##intrin(a, b, ImmT);                     \
     }
 #define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
 
-SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
-SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
-SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
-SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
+SIMD_WRAPPERI_2_(and_ps, and_epi32);       // return a & b       (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32); // return (~a) & b    (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32);         // return a | b       (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32);       // return a ^ b       (float treated as int)
 
 #undef SIMD_WRAPPER_1_
 #undef SIMD_WRAPPER_1
@@ -158,4 +130,3 @@
 #undef SIMD_IWRAPPER_2
 #undef SIMD_IWRAPPER_2_
 #undef SIMD_IWRAPPER_2I
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
index 3e36ce5..f9d4b8c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
index 3e36ce5..f9d4b8c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_core.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
index 3e36ce5..f9d4b8c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks_knights.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX512_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
index 55981dc..91705f2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
@@ -29,149 +29,143 @@
 //============================================================================
 
 static const int TARGET_SIMD_WIDTH = 8;
-using SIMD128T = SIMD128Impl::AVXImpl;
+using SIMD128T                     = SIMD128Impl::AVXImpl;
 
-#define SIMD_WRAPPER_1(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::op(a.v8[0]),\
-            SIMD256T::op(a.v8[1]),\
-        };\
+#define SIMD_WRAPPER_1(op)                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a) \
+    {                                                   \
+        return Float{                                   \
+            SIMD256T::op(a.v8[0]),                      \
+            SIMD256T::op(a.v8[1]),                      \
+        };                                              \
     }
 
-#define SIMD_WRAPPER_2(op)  \
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)    \
-    {\
-        return Float\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2(op)                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return Float{                                                   \
+            SIMD256T::op(a.v8[0], b.v8[0]),                             \
+            SIMD256T::op(a.v8[1], b.v8[1]),                             \
+        };                                                              \
     }
 
-#define SIMD_WRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2I(op)                                                              \
+    template <int ImmT>                                                                  \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b)                  \
+    {                                                                                    \
+        return Float{                                                                    \
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
+        };                                                                               \
     }
 
-#define SIMD_WRAPPER_2I_1(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b)   \
-    {\
-        return Float\
-        {\
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_WRAPPER_2I_1(op)                                           \
+    template <int ImmT>                                                 \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b) \
+    {                                                                   \
+        return Float{                                                   \
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),              \
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),              \
+        };                                                              \
     }
 
-#define SIMD_WRAPPER_3(op)  \
-        static SIMDINLINE Float SIMDCALL op(Float const &a, Float const &b, Float const &c)   \
-        {\
-            return Float\
-            {\
-                SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
-                SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
-            };\
-        }
-
-#define SIMD_IWRAPPER_1(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0]),\
-            SIMD256T::op(a.v8[1]),\
-        };\
+#define SIMD_WRAPPER_3(op)                                                              \
+    static SIMDINLINE Float SIMDCALL op(Float const& a, Float const& b, Float const& c) \
+    {                                                                                   \
+        return Float{                                                                   \
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                    \
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                    \
+        };                                                                              \
     }
 
-#define SIMD_IWRAPPER_2(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_1(op)                                 \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a) \
+    {                                                       \
+        return Integer{                                     \
+            SIMD256T::op(a.v8[0]),                          \
+            SIMD256T::op(a.v8[1]),                          \
+        };                                                  \
     }
 
-#define SIMD_IWRAPPER_2I(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2(op)                                                   \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::op(a.v8[0], b.v8[0]),                                   \
+            SIMD256T::op(a.v8[1], b.v8[1]),                                   \
+        };                                                                    \
     }
 
-#define SIMD_IWRAPPER_2I_1(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I(op)                                                             \
+    template <int ImmT>                                                                  \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b)            \
+    {                                                                                    \
+        return Integer{                                                                  \
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),                        \
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]), \
+        };                                                                               \
     }
 
-#define SIMD_IWRAPPER_2I_2(op)  \
-    template<int ImmT>\
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
-            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I_1(op)                                                \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),                    \
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),                    \
+        };                                                                    \
     }
 
-#define SIMD_IWRAPPER_3(op)  \
-    static SIMDINLINE Integer SIMDCALL op(Integer const &a, Integer const &b, Integer const &c)   \
-    {\
-        return Integer\
-        {\
-            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
-            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
-        };\
+#define SIMD_IWRAPPER_2I_2(op)                                                \
+    template <int ImmT>                                                       \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b) \
+    {                                                                         \
+        return Integer{                                                       \
+            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),              \
+            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),       \
+        };                                                                    \
+    }
+
+#define SIMD_IWRAPPER_3(op)                                                                     \
+    static SIMDINLINE Integer SIMDCALL op(Integer const& a, Integer const& b, Integer const& c) \
+    {                                                                                           \
+        return Integer{                                                                         \
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),                                            \
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),                                            \
+        };                                                                                      \
     }
 
 //-----------------------------------------------------------------------
 // Single precision floating point arithmetic operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2(add_ps);     // return a + b
-SIMD_WRAPPER_2(div_ps);     // return a / b
-SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
-SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
-SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
-SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
-SIMD_WRAPPER_2(mul_ps);     // return a * b
-SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
-SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
-SIMD_WRAPPER_2(sub_ps);     // return a - b
+SIMD_WRAPPER_2(add_ps);   // return a + b
+SIMD_WRAPPER_2(div_ps);   // return a / b
+SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);   // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);   // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);   // return a * b
+SIMD_WRAPPER_1(rcp_ps);   // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps); // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);   // return a - b
 
 template <RoundMode RMT>
-static SIMDINLINE Float SIMDCALL round_ps(Float const &a)
+static SIMDINLINE Float SIMDCALL round_ps(Float const& a)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template round_ps<RMT>(a.v8[0]),
         SIMD256T::template round_ps<RMT>(a.v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL ceil_ps(Float const &a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
-static SIMDINLINE Float SIMDCALL floor_ps(Float const &a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL ceil_ps(Float const& a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+static SIMDINLINE Float SIMDCALL floor_ps(Float const& a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
 
 //-----------------------------------------------------------------------
 // Integer (various width) arithmetic operations
@@ -179,7 +173,7 @@
 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
 SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
-SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
@@ -207,178 +201,168 @@
 SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
 SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
 
-
 //-----------------------------------------------------------------------
 // Shift operations
 //-----------------------------------------------------------------------
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const &a)      // return a << ImmT
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer const& a) // return a << ImmT
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
         SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
     };
 }
 
-SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
+SIMD_IWRAPPER_2(sllv_epi32); // return a << b      (uint32)
 
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const &a)      // return a >> ImmT   (int32)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer const& a) // return a >> ImmT   (int32)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
         SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
     };
 }
 
-template<int ImmT>
-static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const &a)      // return a >> ImmT   (uint32)
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer const& a) // return a >> ImmT   (uint32)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
         SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
     };
 }
 
-template<int ImmT>                                          // for each 128-bit lane:
-static SIMDINLINE Integer SIMDCALL srli_si(Integer const &a)         //  return a >> (ImmT*8) (uint)
+template <int ImmT>                                          // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer const& a) //  return a >> (ImmT*8) (uint)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::template srli_si<ImmT>(a.v8[0]),
         SIMD256T::template srli_si<ImmT>(a.v8[1]),
     };
 }
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL srlisi_ps(Float const &a)       // same as srli_si, but with Float cast to int
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL
+                        srlisi_ps(Float const& a) // same as srli_si, but with Float cast to int
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
         SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
     };
 }
 
-SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
+SIMD_IWRAPPER_2(srlv_epi32); // return a >> b      (uint32)
 
 //-----------------------------------------------------------------------
 // Conversion operations
 //-----------------------------------------------------------------------
-static SIMDINLINE Float SIMDCALL castpd_ps(Double const &a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double const& a) // return *(Float*)(&a)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::castpd_ps(a.v8[0]),
         SIMD256T::castpd_ps(a.v8[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL castps_si(Float const &a)              // return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float const& a) // return *(Integer*)(&a)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::castps_si(a.v8[0]),
         SIMD256T::castps_si(a.v8[1]),
     };
 }
 
-static SIMDINLINE Double SIMDCALL castsi_pd(Integer const &a)              // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer const& a) // return *(Double*)(&a)
 {
-    return Double
-    {
+    return Double{
         SIMD256T::castsi_pd(a.v8[0]),
         SIMD256T::castsi_pd(a.v8[1]),
     };
 }
 
-static SIMDINLINE Double SIMDCALL castps_pd(Float const &a)   // return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float const& a) // return *(Double*)(&a)
 {
-    return Double
-    {
+    return Double{
         SIMD256T::castps_pd(a.v8[0]),
         SIMD256T::castps_pd(a.v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL castsi_ps(Integer const &a)              // return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer const& a) // return *(Float*)(&a)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::castsi_ps(a.v8[0]),
         SIMD256T::castsi_ps(a.v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer const &a)            // return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL
+                        cvtepi32_ps(Integer const& a) // return (float)a    (int32 --> float)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::cvtepi32_ps(a.v8[0]),
         SIMD256T::cvtepi32_ps(a.v8[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer const &a)          // return (int16)a    (uint8 --> int16)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu8_epi16(SIMD256Impl::Integer const& a) // return (int16)a    (uint8 --> int16)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtepu8_epi16(a.v4[0]),
         SIMD256T::cvtepu8_epi16(a.v4[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer const &a)          // return (int32)a    (uint8 --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu8_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint8 --> int32)
 {
-    return Integer
-	{
+    return Integer{
         SIMD256T::cvtepu8_epi32(a.v4[0]),
         SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
-	};
+    };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer const &a)         // return (int32)a    (uint16 --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu16_epi32(SIMD256Impl::Integer const& a) // return (int32)a    (uint16 --> int32)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtepu16_epi32(a.v4[0]),
         SIMD256T::cvtepu16_epi32(a.v4[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint16 --> int64)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu16_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint16 --> int64)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtepu16_epi64(a.v4[0]),
         SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer const &a)         // return (int64)a    (uint32 --> int64)
+static SIMDINLINE Integer SIMDCALL
+                          cvtepu32_epi64(SIMD256Impl::Integer const& a) // return (int64)a    (uint32 --> int64)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtepu32_epi64(a.v4[0]),
         SIMD256T::cvtepu32_epi64(a.v4[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float const &a)            // return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvtps_epi32(Float const& a) // return (int32)a    (float --> int32)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtps_epi32(a.v8[0]),
         SIMD256T::cvtps_epi32(a.v8[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float const &a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+static SIMDINLINE Integer SIMDCALL
+                          cvttps_epi32(Float const& a) // return (int32)a    (rnd_to_zero(float) --> int32)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::cvtps_epi32(a.v8[0]),
         SIMD256T::cvtps_epi32(a.v8[1]),
     };
@@ -387,126 +371,144 @@
 //-----------------------------------------------------------------------
 // Comparison operations
 //-----------------------------------------------------------------------
-template<CompareType CmpTypeT>
-static SIMDINLINE Float SIMDCALL cmp_ps(Float const &a, Float const &b) // return a (CmpTypeT) b
+template <CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float const& a, Float const& b) // return a (CmpTypeT) b
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
         SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
     };
 }
-static SIMDINLINE Float SIMDCALL cmplt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmpge_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
-static SIMDINLINE Float SIMDCALL cmple_ps(Float const &a, Float const &b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float const& a, Float const& b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
+}
 
-template<CompareType CmpTypeT>
-static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const &a, Float const &b)
+template <CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float const& a, Float const& b)
 {
     return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
 }
 
+SIMD_IWRAPPER_2(cmpeq_epi8);  // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16); // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32); // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64); // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);  // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16); // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32); // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64); // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32); // return a < b (int32)
 
-SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
-SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
-SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
-SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
-SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
-SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
-SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
-SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
-SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
-
-static SIMDINLINE bool SIMDCALL testz_ps(Float const &a, Float const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+static SIMDINLINE bool SIMDCALL
+                       testz_ps(Float const& a, Float const& b) // return all_lanes_zero(a & b) ? 1 : 0 (float)
 {
-    return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
-                  SIMD256T::testz_ps(a.v8[1], b.v8[1]));
+    return 0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) & SIMD256T::testz_ps(a.v8[1], b.v8[1]));
 }
 
-static SIMDINLINE bool SIMDCALL testz_si(Integer const &a, Integer const &b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+static SIMDINLINE bool SIMDCALL
+                       testz_si(Integer const& a, Integer const& b) // return all_lanes_zero(a & b) ? 1 : 0 (int)
 {
-    return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
-                  SIMD256T::testz_si(a.v8[1], b.v8[1]));
+    return 0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) & SIMD256T::testz_si(a.v8[1], b.v8[1]));
 }
 
 //-----------------------------------------------------------------------
 // Blend / shuffle / permute operations
 //-----------------------------------------------------------------------
-SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
-SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
-SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Float const &mask) // return mask ? b : a (int)
+SIMD_WRAPPER_2I(blend_ps);     // return ImmT ? b : a  (float)
+SIMD_IWRAPPER_2I(blend_epi32); // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);     // return mask ? b : a  (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Float const&   mask) // return mask ? b : a (int)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
         SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const &a, Integer const &b, Integer const &mask) // return mask ? b : a (int)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer const& a,
+                                                Integer const& b,
+                                                Integer const& mask) // return mask ? b : a (int)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
         SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL
+                        broadcast_ss(float const* p) // return *p (all elements in vector get same value)
 {
     float f = *p;
-    return Float
-    {
+    return Float{
         SIMD256T::set1_ps(f),
         SIMD256T::set1_ps(f),
     };
 }
 
-template<int imm>
-static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float const& a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
-template<int imm>
-static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double const& a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
-template<int imm>
-static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const &a)
+template <int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer const& a)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     return a.v8[imm];
 }
 
-template<int imm>
-static SIMDINLINE Float SIMDCALL insert_ps(Float const &a, SIMD256Impl::Float const &b)
+template <int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float const& a, SIMD256Impl::Float const& b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Float r = a;
+    Float r   = a;
     r.v8[imm] = b;
     return r;
 }
 
-template<int imm>
-static SIMDINLINE Double SIMDCALL insert_pd(Double const &a, SIMD256Impl::Double const &b)
+template <int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double const& a, SIMD256Impl::Double const& b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
-    Double r = a;
+    Double r  = a;
     r.v8[imm] = b;
     return r;
 }
 
-template<int imm>
-static SIMDINLINE Integer SIMDCALL insert_si(Integer const &a, SIMD256Impl::Integer const &b)
+template <int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer const& a, SIMD256Impl::Integer const& b)
 {
     SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
     Integer r = a;
@@ -514,27 +516,28 @@
     return r;
 }
 
-SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
-SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
-SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
-SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2(packs_epi16);  // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);  // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16); // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32); // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
 
-template<int ImmT>
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a)
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL permute_ps(Float const& a)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template permute_ps<ImmT>(a.v8[0]),
         SIMD256T::template permute_ps<ImmT>(a.v8[1]),
     };
 }
 
-static SIMDINLINE Integer SIMDCALL permute_epi32(Integer const &a, Integer const &swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+static SIMDINLINE Integer SIMDCALL permute_epi32(
+    Integer const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
 {
     return castps_si(permute_ps(castsi_ps(a), swiz));
 }
 
-static SIMDINLINE Float SIMDCALL permute_ps(Float const &a, Integer const &swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+static SIMDINLINE Float SIMDCALL
+                        permute_ps(Float const& a, Integer const& swiz) // return a[swiz[i]] for each 32-bit lane i (float)
 {
     const auto mask = SIMD256T::set1_epi32(7);
 
@@ -544,10 +547,11 @@
     auto hilo = SIMD256T::permute_ps(a.v8[0], SIMD256T::and_si(swiz.v8[1], mask));
     auto hihi = SIMD256T::permute_ps(a.v8[1], SIMD256T::and_si(swiz.v8[1], mask));
 
-    return Float
-    {
-        SIMD256T::blendv_ps(lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
-        SIMD256T::blendv_ps(hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
+    return Float{
+        SIMD256T::blendv_ps(
+            lolo, lohi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[0], mask))),
+        SIMD256T::blendv_ps(
+            hilo, hihi, SIMD256T::castsi_ps(SIMD256T::cmpgt_epi32(swiz.v8[1], mask))),
     };
 }
 
@@ -562,7 +566,7 @@
 //              ESAC
 //              RETURN tmp[127:0]
 //      }
-//      
+//
 //      dst[127:0]   : = SELECT4(a[511:0], imm8[1:0])
 //      dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
 //      dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
@@ -574,32 +578,35 @@
 // AVX instructions for emulation.
 //
 template <int shuf>
-static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const &a, Float const &b)
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float const& a, Float const& b)
 {
-    return Float
-    {
-        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    return Float{
+        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
     };
 }
 
 template <int shuf>
-static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const &a, Double const &b)
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double const& a, Double const& b)
 {
-    return Double
-    {
-        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    return Double{
+        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
     };
 }
 
 template <int shuf>
-static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const &a, Integer const &b)
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer const& a, Integer const& b)
 {
-    return Integer
-    {
-        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
-        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    return Integer{
+        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0],
+                                                                                        a.v8[1]),
+        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0],
+                                                                                        b.v8[1]),
     };
 }
 
@@ -624,209 +631,193 @@
 //-----------------------------------------------------------------------
 // Load / store operations
 //-----------------------------------------------------------------------
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer const &idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        i32gather_ps(float const* p, Integer const& idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
         SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+static SIMDINLINE Float SIMDCALL
+                        load1_ps(float const* p) // return *p    (broadcast 1 value to all elements)
 {
     return broadcast_ss(p);
 }
 
-static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+static SIMDINLINE Float SIMDCALL
+                        load_ps(float const* p) // return *p    (loads SIMD width elements from memory)
 {
-    return Float
-    {
-        SIMD256T::load_ps(p),
-        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::load_ps(p), SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)};
 }
 
-static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+static SIMDINLINE Integer SIMDCALL load_si(Integer const* p) // return *p
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::load_si(&p->v8[0]),
         SIMD256T::load_si(&p->v8[1]),
     };
 }
 
-static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL
+                        loadu_ps(float const* p) // return *p    (same as load_ps but allows for unaligned mem)
 {
-    return Float
-    {
-        SIMD256T::loadu_ps(p),
-        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::loadu_ps(p), SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)};
 }
 
-static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL
+                          loadu_si(Integer const* p) // return *p    (same as load_si but allows for unaligned mem)
 {
-    return Integer
-    {
+    return Integer{
         SIMD256T::loadu_si(&p->v8[0]),
         SIMD256T::loadu_si(&p->v8[1]),
     };
 }
 
 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
-template<ScaleFactor ScaleT>
-static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float const &old, float const* p, Integer const &idx, Float const &mask)
+template <ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL
+                        mask_i32gather_ps(Float const& old, float const* p, Integer const& idx, Float const& mask)
 {
-    return Float
-    {
+    return Float{
         SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
         SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
     };
 }
 
-static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer const &mask, Float const &src)
+static SIMDINLINE void SIMDCALL maskstore_ps(float* p, Integer const& mask, Float const& src)
 {
     SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
     SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
 }
 
-static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const &a)
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer const& a)
 {
     uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
-             mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
+    mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
 
     return mask;
 }
 
-static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double const& a)
 {
     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
-             mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
+    mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
 
     return mask;
 }
-static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const &a)
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float const& a)
 {
     uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
-             mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
+    mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
 
     return mask;
 }
 
 static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
 {
-    return Integer
-    {
-        SIMD256T::set1_epi32(i),
-        SIMD256T::set1_epi32(i)
-    };
+    return Integer{SIMD256T::set1_epi32(i), SIMD256T::set1_epi32(i)};
 }
 
 static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
 {
-    return Integer
-    {
-        SIMD256T::set1_epi8(i),
-        SIMD256T::set1_epi8(i)
-    };
+    return Integer{SIMD256T::set1_epi8(i), SIMD256T::set1_epi8(i)};
 }
 
-static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f) // return f (all elements are same value)
 {
-    return Float
-    {
-        SIMD256T::set1_ps(f),
-        SIMD256T::set1_ps(f)
-    };
+    return Float{SIMD256T::set1_ps(f), SIMD256T::set1_ps(f)};
 }
 
-static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps() // return 0 (float)
 {
-    return Float
-    {
-        SIMD256T::setzero_ps(),
-        SIMD256T::setzero_ps()
-    };
+    return Float{SIMD256T::setzero_ps(), SIMD256T::setzero_ps()};
 }
 
-static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si() // return 0 (integer)
 {
-    return Integer
-    {
-        SIMD256T::setzero_si(),
-        SIMD256T::setzero_si()
-    };
+    return Integer{SIMD256T::setzero_si(), SIMD256T::setzero_si()};
 }
 
-static SIMDINLINE void SIMDCALL store_ps(float *p, Float const &a)    // *p = a   (stores all elements contiguously in memory)
+static SIMDINLINE void SIMDCALL
+                       store_ps(float* p, Float const& a) // *p = a   (stores all elements contiguously in memory)
 {
     SIMD256T::store_ps(p, a.v8[0]);
     SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
 }
 
-static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer const &a)   // *p = a
+static SIMDINLINE void SIMDCALL store_si(Integer* p, Integer const& a) // *p = a
 {
     SIMD256T::store_si(&p->v8[0], a.v8[0]);
     SIMD256T::store_si(&p->v8[1], a.v8[1]);
 }
 
-static SIMDINLINE void SIMDCALL stream_ps(float *p, Float const &a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL
+                       stream_ps(float* p, Float const& a) // *p = a   (same as store_ps, but doesn't keep memory in cache)
 {
     SIMD256T::stream_ps(p, a.v8[0]);
     SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
 }
 
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL set_epi32(int i15,
+                                             int i14,
+                                             int i13,
+                                             int i12,
+                                             int i11,
+                                             int i10,
+                                             int i9,
+                                             int i8,
+                                             int i7,
+                                             int i6,
+                                             int i5,
+                                             int i4,
+                                             int i3,
+                                             int i2,
+                                             int i1,
+                                             int i0)
 {
-    return Integer
-    {
-        SIMD256T::set_epi32(
-            i7, i6, i5, i4, i3, i2, i1, i0),
-        SIMD256T::set_epi32(
-            i15, i14, i13, i12, i11, i10, i9, i8)
-    };
+    return Integer{SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0),
+                   SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8)};
 }
 
-static SIMDINLINE Integer SIMDCALL set_epi32(
-    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+static SIMDINLINE Integer SIMDCALL
+                          set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
 {
-    return set_epi32(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_epi32(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL set_ps(float i15,
+                                        float i14,
+                                        float i13,
+                                        float i12,
+                                        float i11,
+                                        float i10,
+                                        float i9,
+                                        float i8,
+                                        float i7,
+                                        float i6,
+                                        float i5,
+                                        float i4,
+                                        float i3,
+                                        float i2,
+                                        float i1,
+                                        float i0)
 {
-    return Float
-    {
-        SIMD256T::set_ps(
-            i7, i6, i5, i4, i3, i2, i1, i0),
-        SIMD256T::set_ps(
-            i15, i14, i13, i12, i11, i10, i9, i8)
-    };
+    return Float{SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0),
+                 SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8)};
 }
 
-static SIMDINLINE Float SIMDCALL set_ps(
-    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+static SIMDINLINE Float SIMDCALL
+                        set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
 {
-    return set_ps(
-        0, 0, 0, 0, 0, 0, 0, 0,
-        i7, i6, i5, i4, i3, i2, i1, i0);
+    return set_ps(0, 0, 0, 0, 0, 0, 0, 0, i7, i6, i5, i4, i3, i2, i1, i0);
 }
 
 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
 {
-    return Float
-    {
-        SIMD256T::vmask_ps(mask),
-        SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)
-    };
+    return Float{SIMD256T::vmask_ps(mask), SIMD256T::vmask_ps(mask >> TARGET_SIMD_WIDTH)};
 }
 
 #undef SIMD_WRAPPER_1
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
index bc5bff4..4739348 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
@@ -1,28 +1,27 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #if !defined(__SIMD_LIB_AVX_HPP__)
 #error Do not include this file directly, use "simdlib.hpp" instead.
 #endif
 
 // no backwards compatibility for simd mask-enabled functions
-
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
index df2df1b..7902bcb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #pragma once
 #if 0
 //===========================================================================
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
index 0fad0e1..944c3c2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 #pragma once
 
 #if !defined(__cplusplus)
@@ -30,9 +30,9 @@
 #include <inttypes.h>
 #include <stdint.h>
 
-#define SIMD_ARCH_AVX       0
-#define SIMD_ARCH_AVX2      1
-#define SIMD_ARCH_AVX512    2
+#define SIMD_ARCH_AVX 0
+#define SIMD_ARCH_AVX2 1
+#define SIMD_ARCH_AVX512 2
 
 #if !defined(SIMD_ARCH)
 #define SIMD_ARCH SIMD_ARCH_AVX
@@ -55,81 +55,81 @@
 {
     enum class CompareType
     {
-        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
-        LT_OS      = 0x01, // Less-than (ordered, signaling)
-        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
-        UNORD_Q    = 0x03, // Unordered (nonsignaling)
-        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
-        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
-        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
-        ORD_Q      = 0x07, // Ordered (nonsignaling)
-        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
-        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
-        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
-        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
-        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
-        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
-        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
-        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
-        EQ_OS      = 0x10, // Equal (ordered, signaling)
-        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
-        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
-        UNORD_S    = 0x13, // Unordered (signaling)
-        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
-        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
-        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
-        ORD_S      = 0x17, // Ordered (signaling)
-        EQ_US      = 0x18, // Equal (unordered, signaling)
-        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
-        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
-        FALSE_OS   = 0x1B, // False (ordered, signaling)
-        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
-        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
-        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
-        TRUE_US    = 0x1F, // True (unordered, signaling)
+        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS    = 0x01, // Less-than (ordered, signaling)
+        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q  = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q    = 0x07, // Ordered (nonsignaling)
+        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
+        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
+        EQ_OS    = 0x10, // Equal (ordered, signaling)
+        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S  = 0x13, // Unordered (signaling)
+        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S    = 0x17, // Ordered (signaling)
+        EQ_US    = 0x18, // Equal (unordered, signaling)
+        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS = 0x1B, // False (ordered, signaling)
+        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US  = 0x1F, // True (unordered, signaling)
     };
 
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
     enum class CompareTypeInt
     {
-        EQ  = _MM_CMPINT_EQ,    // Equal
-        LT  = _MM_CMPINT_LT,    // Less than
-        LE  = _MM_CMPINT_LE,    // Less than or Equal
-        NE  = _MM_CMPINT_NE,    // Not Equal
-        GE  = _MM_CMPINT_GE,    // Greater than or Equal
-        GT  = _MM_CMPINT_GT,    // Greater than
+        EQ = _MM_CMPINT_EQ, // Equal
+        LT = _MM_CMPINT_LT, // Less than
+        LE = _MM_CMPINT_LE, // Less than or Equal
+        NE = _MM_CMPINT_NE, // Not Equal
+        GE = _MM_CMPINT_GE, // Greater than or Equal
+        GT = _MM_CMPINT_GT, // Greater than
     };
 #endif // SIMD_ARCH >= SIMD_ARCH_AVX512
 
     enum class ScaleFactor
     {
-        SF_1 = 1,   // No scaling
-        SF_2 = 2,   // Scale offset by 2
-        SF_4 = 4,   // Scale offset by 4
-        SF_8 = 8,   // Scale offset by 8
+        SF_1 = 1, // No scaling
+        SF_2 = 2, // Scale offset by 2
+        SF_4 = 4, // Scale offset by 4
+        SF_8 = 8, // Scale offset by 8
     };
 
     enum class RoundMode
     {
-        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
-        TO_NEG_INF      = 0x01, // Round to negative infinity
-        TO_POS_INF      = 0x02, // Round to positive infinity
-        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
-        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
-        
-        RAISE_EXC       = 0x00, // Raise exception on overflow
-        NO_EXC          = 0x08, // Suppress exceptions
-        
-        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
-        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
-        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
-        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
-        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
-        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
-        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
-        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
-        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
-        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+        TO_NEAREST_INT = 0x00, // Round to nearest integer (ties round to even)
+        TO_NEG_INF     = 0x01, // Round to negative infinity
+        TO_POS_INF     = 0x02, // Round to positive infinity
+        TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register
+
+        RAISE_EXC = 0x00, // Do not suppress the precision (inexact) exception
+        NO_EXC    = 0x08, // Suppress exceptions
+
+        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
+        FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
+        CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
+        TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
+        RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
+        NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
     };
 
     struct Traits
@@ -140,7 +140,7 @@
     };
 
     // Attribute, 4-dimensional attribute in SIMD SOA layout
-    template<typename Float, typename Integer, typename Double>
+    template <typename Float, typename Integer, typename Double>
     union Vec4
     {
         Float   v[4];
@@ -148,14 +148,14 @@
         Double  vd[4];
         struct
         {
-            Float  x;
-            Float  y;
-            Float  z;
-            Float  w;
+            Float x;
+            Float y;
+            Float z;
+            Float w;
         };
-        SIMDINLINE Float& SIMDCALL operator[] (const int i) { return v[i]; }
-        SIMDINLINE Float const & SIMDCALL operator[] (const int i) const { return v[i]; }
-        SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const & in)
+        SIMDINLINE Float& SIMDCALL operator[](const int i) { return v[i]; }
+        SIMDINLINE Float const& SIMDCALL operator[](const int i) const { return v[i]; }
+        SIMDINLINE Vec4& SIMDCALL operator=(Vec4 const& in)
         {
             v[0] = in.v[0];
             v[1] = in.v[1];
@@ -171,8 +171,16 @@
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m128 in) : v(in) {}
-            SIMDINLINE Float& SIMDCALL operator=(__m128 in) { v = in; return *this; }
-            SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE Float& SIMDCALL operator=(__m128 in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Float& SIMDCALL operator=(Float const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m128() const { return v; }
 
             SIMDALIGN(__m128, 16) v;
@@ -182,8 +190,16 @@
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m128i in) : v(in) {}
-            SIMDINLINE Integer& SIMDCALL operator=(__m128i in) { v = in; return *this; }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE Integer& SIMDCALL operator=(__m128i in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m128i() const { return v; }
 
             SIMDALIGN(__m128i, 16) v;
@@ -193,8 +209,16 @@
         {
             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m128d in) : v(in) {}
-            SIMDINLINE Double& SIMDCALL operator=(__m128d in) { v = in; return *this; }
-            SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE Double& SIMDCALL operator=(__m128d in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Double& SIMDCALL operator=(Double const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m128d() const { return v; }
 
             SIMDALIGN(__m128d, 16) v;
@@ -204,7 +228,7 @@
         using Mask = uint8_t;
 
         static const uint32_t SIMD_WIDTH = 4;
-    } // ns SIMD128Impl
+    } // namespace SIMD128Impl
 
     namespace SIMD256Impl
     {
@@ -212,12 +236,21 @@
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m256 in) : v(in) {}
-            SIMDINLINE Float(SIMD128Impl::Float const &in_lo, SIMD128Impl::Float const &in_hi = _mm_setzero_ps())
+            SIMDINLINE Float(SIMD128Impl::Float const& in_lo,
+                             SIMD128Impl::Float const& in_hi = _mm_setzero_ps())
             {
                 v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Float& SIMDCALL operator=(__m256 in) { v = in; return *this; }
-            SIMDINLINE Float& SIMDCALL operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE Float& SIMDCALL operator=(__m256 in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Float& SIMDCALL operator=(Float const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m256() const { return v; }
 
             SIMDALIGN(__m256, 32) v;
@@ -228,12 +261,21 @@
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m256i in) : v(in) {}
-            SIMDINLINE Integer(SIMD128Impl::Integer const &in_lo, SIMD128Impl::Integer const &in_hi = _mm_setzero_si128())
+            SIMDINLINE Integer(SIMD128Impl::Integer const& in_lo,
+                               SIMD128Impl::Integer const& in_hi = _mm_setzero_si128())
             {
                 v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Integer& SIMDCALL operator=(__m256i in) { v = in; return *this; }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE Integer& SIMDCALL operator=(__m256i in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m256i() const { return v; }
 
             SIMDALIGN(__m256i, 32) v;
@@ -243,13 +285,22 @@
         union Double
         {
             SIMDINLINE Double() = default;
-            SIMDINLINE Double(__m256d const &in) : v(in) {}
-            SIMDINLINE Double(SIMD128Impl::Double const &in_lo, SIMD128Impl::Double const &in_hi = _mm_setzero_pd())
+            SIMDINLINE Double(__m256d const& in) : v(in) {}
+            SIMDINLINE Double(SIMD128Impl::Double const& in_lo,
+                              SIMD128Impl::Double const& in_hi = _mm_setzero_pd())
             {
                 v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
             }
-            SIMDINLINE Double& SIMDCALL operator=(__m256d in) { v = in; return *this; }
-            SIMDINLINE Double& SIMDCALL operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE Double& SIMDCALL operator=(__m256d in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Double& SIMDCALL operator=(Double const& in)
+            {
+                v = in.v;
+                return *this;
+            }
             SIMDINLINE SIMDCALL operator __m256d() const { return v; }
 
             SIMDALIGN(__m256d, 32) v;
@@ -260,7 +311,7 @@
         using Mask = uint8_t;
 
         static const uint32_t SIMD_WIDTH = 8;
-    } // ns SIMD256Impl
+    } // namespace SIMD256Impl
 
     namespace SIMD512Impl
     {
@@ -282,14 +333,14 @@
         union __m512i
         {
         private:
-            int8_t              m512i_i8[64];
-            int16_t             m512i_i16[32];
-            int32_t             m512i_i32[16];
-            int64_t             m512i_i64[8];
-            uint8_t             m512i_u8[64];
-            uint16_t            m512i_u16[32];
-            uint32_t            m512i_u32[16];
-            uint64_t            m512i_u64[8];
+            int8_t   m512i_i8[64];
+            int16_t  m512i_i16[32];
+            int32_t  m512i_i32[16];
+            int64_t  m512i_i64[8];
+            uint8_t  m512i_u8[64];
+            uint16_t m512i_u16[32];
+            uint32_t m512i_u32[16];
+            uint64_t m512i_u64[8];
         };
 
         using __mmask16 = uint16_t;
@@ -305,9 +356,18 @@
         {
             SIMDINLINE Float() = default;
             SIMDINLINE Float(__m512 in) : v(in) {}
-            SIMDINLINE Float(SIMD256Impl::Float const &in_lo, SIMD256Impl::Float const &in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Float& SIMDCALL operator=(__m512 in) { v = in; return *this; }
-            SIMDINLINE Float& SIMDCALL operator=(Float const & in)
+            SIMDINLINE Float(SIMD256Impl::Float const& in_lo,
+                             SIMD256Impl::Float const& in_hi = _mm256_setzero_ps())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+            SIMDINLINE Float& SIMDCALL operator=(__m512 in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Float& SIMDCALL operator=(Float const& in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -327,9 +387,18 @@
         {
             SIMDINLINE Integer() = default;
             SIMDINLINE Integer(__m512i in) : v(in) {}
-            SIMDINLINE Integer(SIMD256Impl::Integer const &in_lo, SIMD256Impl::Integer const &in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Integer& SIMDCALL operator=(__m512i in) { v = in; return *this; }
-            SIMDINLINE Integer& SIMDCALL operator=(Integer const & in)
+            SIMDINLINE Integer(SIMD256Impl::Integer const& in_lo,
+                               SIMD256Impl::Integer const& in_hi = _mm256_setzero_si256())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+            SIMDINLINE Integer& SIMDCALL operator=(__m512i in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Integer& SIMDCALL operator=(Integer const& in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -350,9 +419,18 @@
         {
             SIMDINLINE Double() = default;
             SIMDINLINE Double(__m512d in) : v(in) {}
-            SIMDINLINE Double(SIMD256Impl::Double const &in_lo, SIMD256Impl::Double const &in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
-            SIMDINLINE Double& SIMDCALL operator=(__m512d in) { v = in; return *this; }
-            SIMDINLINE Double& SIMDCALL operator=(Double const & in)
+            SIMDINLINE Double(SIMD256Impl::Double const& in_lo,
+                              SIMD256Impl::Double const& in_hi = _mm256_setzero_pd())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+            SIMDINLINE Double& SIMDCALL operator=(__m512d in)
+            {
+                v = in;
+                return *this;
+            }
+            SIMDINLINE Double& SIMDCALL operator=(Double const& in)
             {
 #if SIMD_ARCH >= SIMD_ARCH_AVX512
                 v = in.v;
@@ -375,5 +453,5 @@
         static const uint32_t SIMD_WIDTH = 16;
 
 #undef SIMD_ALIGNMENT_BYTES
-    } // ns SIMD512Impl
-} // ns SIMDImpl
+    } // namespace SIMD512Impl
+} // namespace SIMDImpl
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
index 43b74a6..8e874fb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #include "common/os.h"
 #include <stdarg.h>
@@ -38,29 +38,32 @@
 {
     enum class TextColor
     {
-        BLACK      = 0,
+        BLACK = 0,
 #if defined(_WIN32)
-        RED        = 4,
-        GREEN      = 2,
-        BLUE       = 1,
+        RED   = 4,
+        GREEN = 2,
+        BLUE  = 1,
 #else
-        RED        = 1,
-        GREEN      = 2,
-        BLUE       = 4,
+        RED   = 1,
+        GREEN = 2,
+        BLUE  = 4,
 #endif // _WIN32
-        PURPLE     = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE),
-        CYAN       = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
-        YELLOW     = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN),
-        WHITE      = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
+        PURPLE = static_cast<uint32_t>(RED) | static_cast<uint32_t>(BLUE),
+        CYAN   = static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
+        YELLOW = static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN),
+        WHITE =
+            static_cast<uint32_t>(RED) | static_cast<uint32_t>(GREEN) | static_cast<uint32_t>(BLUE),
     };
 
     enum class TextStyle
     {
-        NORMAL     = 0,
-        INTENSITY  = 1,
+        NORMAL    = 0,
+        INTENSITY = 1,
     };
 
-    void SetTextColor(FILE* stream, TextColor color = TextColor::WHITE, TextStyle style = TextStyle::NORMAL)
+    void SetTextColor(FILE*     stream,
+                      TextColor color = TextColor::WHITE,
+                      TextStyle style = TextStyle::NORMAL)
     {
 #if defined(_WIN32)
 
@@ -89,7 +92,8 @@
 #else // !_WIN32
 
         // Print ANSI codes
-        uint32_t cc = 30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color);
+        uint32_t cc =
+            30 + ((style == TextStyle::INTENSITY) ? 60 : 0) + static_cast<uint32_t>(color);
         fprintf(stream, "\033[0m\033[%d;%dm", static_cast<uint32_t>(style), cc);
 
 #endif
@@ -110,17 +114,16 @@
     }
 
     static std::mutex g_stderrMutex;
-} // ns ConsoleUtils
+} // namespace ConsoleUtils
 
-bool SwrAssert(
-        bool        chkDebugger,
-        bool&       enabled,
-        const char* pExpression,
-        const char* pFileName,
-        uint32_t    lineNum,
-        const char* pFunction,
-        const char* pFmtString,
-        ...)
+bool SwrAssert(bool        chkDebugger,
+               bool&       enabled,
+               const char* pExpression,
+               const char* pFileName,
+               uint32_t    lineNum,
+               const char* pFunction,
+               const char* pFmtString,
+               ...)
 {
     using namespace ConsoleUtils;
     std::lock_guard<std::mutex> l(g_stderrMutex);
@@ -151,7 +154,7 @@
 
 #if defined(_WIN32)
     static const int MAX_MESSAGE_LEN = 2048;
-    char msgBuf[MAX_MESSAGE_LEN];
+    char             msgBuf[MAX_MESSAGE_LEN];
 
     sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression);
     msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
@@ -169,15 +172,13 @@
     {
         va_list args;
         va_start(args, pFmtString);
-        offset = _vsnprintf_s(
-                msgBuf,
-                sizeof(msgBuf),
-                sizeof(msgBuf),
-                pFmtString,
-                args);
+        offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
         va_end(args);
 
-        if (offset < 0) { return true; }
+        if (offset < 0)
+        {
+            return true;
+        }
 
         OutputDebugStringA("\t");
         OutputDebugStringA(msgBuf);
@@ -186,46 +187,51 @@
 
     if (enabled && KNOB_ENABLE_ASSERT_DIALOGS)
     {
-        int retval = sprintf_s(
-                &msgBuf[offset],
-                MAX_MESSAGE_LEN - offset,
-                "\n\n"
-                "File: %s\n"
-                "Line: %d\n"
-                "\n"
-                "Expression: %s\n\n"
-                "Cancel: Disable this assert for the remainder of the process\n"
-                "Try Again: Break into the debugger\n"
-                "Continue: Continue execution (but leave assert enabled)",
-                pFileName,
-                lineNum,
-                pExpression);
+        int retval = sprintf_s(&msgBuf[offset],
+                               MAX_MESSAGE_LEN - offset,
+                               "\n\n"
+                               "File: %s\n"
+                               "Line: %d\n"
+                               "\n"
+                               "Expression: %s\n\n"
+                               "Cancel: Disable this assert for the remainder of the process\n"
+                               "Try Again: Break into the debugger\n"
+                               "Continue: Continue execution (but leave assert enabled)",
+                               pFileName,
+                               lineNum,
+                               pExpression);
 
-        if (retval < 0) { return true; }
+        if (retval < 0)
+        {
+            return true;
+        }
 
         offset += retval;
 
         if (!IsDebuggerPresent())
         {
-            sprintf_s(
-                    &msgBuf[offset],
-                    MAX_MESSAGE_LEN - offset,
-                    "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!");
+            sprintf_s(&msgBuf[offset],
+                      MAX_MESSAGE_LEN - offset,
+                      "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a "
+                      "program crash!");
         }
 
-        retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
+        retval = MessageBoxA(nullptr,
+                             msgBuf,
+                             "Assert Failed",
+                             MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION | MB_SETFOREGROUND);
 
         switch (retval)
         {
-            case IDCANCEL:
-                enabled = false;
-                return false;
+        case IDCANCEL:
+            enabled = false;
+            return false;
 
-            case IDTRYAGAIN:
-                return true;
+        case IDTRYAGAIN:
+            return true;
 
-            case IDCONTINUE:
-                return false;
+        case IDCONTINUE:
+            return false;
         }
     }
     else
@@ -238,11 +244,7 @@
 }
 
 void SwrTrace(
-        const char* pFileName,
-        uint32_t    lineNum,
-        const char* pFunction,
-        const char* pFmtString,
-        ...)
+    const char* pFileName, uint32_t lineNum, const char* pFunction, const char* pFmtString, ...)
 {
     using namespace ConsoleUtils;
     std::lock_guard<std::mutex> l(g_stderrMutex);
@@ -266,7 +268,7 @@
 
 #if defined(_WIN32)
     static const int MAX_MESSAGE_LEN = 2048;
-    char msgBuf[MAX_MESSAGE_LEN];
+    char             msgBuf[MAX_MESSAGE_LEN];
 
     sprintf_s(msgBuf, "%s(%d): TRACE in %s\n", pFileName, lineNum, pFunction);
     msgBuf[MAX_MESSAGE_LEN - 2] = '\n';
@@ -279,15 +281,13 @@
     {
         va_list args;
         va_start(args, pFmtString);
-        offset = _vsnprintf_s(
-                msgBuf,
-                sizeof(msgBuf),
-                sizeof(msgBuf),
-                pFmtString,
-                args);
+        offset = _vsnprintf_s(msgBuf, sizeof(msgBuf), sizeof(msgBuf), pFmtString, args);
         va_end(args);
 
-        if (offset < 0) { return; }
+        if (offset < 0)
+        {
+            return;
+        }
 
         OutputDebugStringA("\t");
         OutputDebugStringA(msgBuf);
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
index a9e5bb4..d74b798 100644
--- a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #ifndef __SWR_ASSERT_H__
 #define __SWR_ASSERT_H__
@@ -55,28 +55,38 @@
 
 // Stupid preprocessor tricks to avoid -Wall / -W4 warnings
 #if defined(_MSC_VER)
-#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable:4127))
+#define _SWR_WARN_DISABLE __pragma(warning(push)) __pragma(warning(disable : 4127))
 #define _SWR_WARN_RESTORE __pragma(warning(pop))
 #else // ! MSVC compiler
 #define _SWR_WARN_DISABLE
 #define _SWR_WARN_RESTORE
 #endif
 
-#define _SWR_MACRO_START do {
-#define _SWR_MACRO_END  \
-    _SWR_WARN_DISABLE   \
-    } while(0)          \
+#define _SWR_MACRO_START \
+    do                   \
+    {
+#define _SWR_MACRO_END \
+    _SWR_WARN_DISABLE  \
+    }                  \
+    while (0)          \
     _SWR_WARN_RESTORE
 
-
 #if defined(_WIN32)
-#define SWR_ASSUME(e, ...) _SWR_MACRO_START __assume(e); _SWR_MACRO_END
+#define SWR_ASSUME(e, ...)        \
+    _SWR_MACRO_START __assume(e); \
+    _SWR_MACRO_END
 #elif defined(__clang__)
-#define SWR_ASSUME(e, ...) _SWR_MACRO_START __builtin_assume(e); _SWR_MACRO_END
+#define SWR_ASSUME(e, ...)                \
+    _SWR_MACRO_START __builtin_assume(e); \
+    _SWR_MACRO_END
 #elif defined(__GNUC__)
-#define SWR_ASSUME(e, ...) _SWR_MACRO_START ((e) ? ((void)0) : __builtin_unreachable()); _SWR_MACRO_END
+#define SWR_ASSUME(e, ...)                                       \
+    _SWR_MACRO_START((e) ? ((void)0) : __builtin_unreachable()); \
+    _SWR_MACRO_END
 #else
-#define SWR_ASSUME(e, ...) _SWR_MACRO_START ASSUME(e); _SWR_MACRO_END
+#define SWR_ASSUME(e, ...)      \
+    _SWR_MACRO_START ASSUME(e); \
+    _SWR_MACRO_END
 #endif
 
 #if !defined(SWR_ENABLE_ASSERTS)
@@ -110,47 +120,50 @@
 
 #else
 
-bool SwrAssert(
-    bool        chkDebugger,
-    bool&       enabled,
-    const char* pExpression,
-    const char* pFileName,
-    uint32_t    lineNum,
-    const char* function,
-    const char* pFmtString = nullptr,
-    ...);
+bool SwrAssert(bool        chkDebugger,
+               bool&       enabled,
+               const char* pExpression,
+               const char* pFileName,
+               uint32_t    lineNum,
+               const char* function,
+               const char* pFmtString = nullptr,
+               ...);
 
 void SwrTrace(
-    const char* pFileName,
-    uint32_t    lineNum,
-    const char* function,
-    const char* pFmtString,
-    ...);
+    const char* pFileName, uint32_t lineNum, const char* function, const char* pFmtString, ...);
 
-#define _SWR_ASSERT(chkDebugger, e, ...)    \
-    _SWR_MACRO_START \
-    bool expFailed = !(e);\
-    if (expFailed) {\
-        static bool swrAssertEnabled = true;\
-        expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\
-        if (expFailed) { DEBUGBREAK; }\
-    }\
+#define _SWR_ASSERT(chkDebugger, e, ...)                                                                            \
+    _SWR_MACRO_START                                                                                                \
+    bool expFailed = !(e);                                                                                          \
+    if (expFailed)                                                                                                  \
+    {                                                                                                               \
+        static bool swrAssertEnabled = true;                                                                        \
+        expFailed                    = SwrAssert(                                                                   \
+            chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
+        if (expFailed)                                                                                              \
+        {                                                                                                           \
+            DEBUGBREAK;                                                                                             \
+        }                                                                                                           \
+    }                                                                                                               \
     _SWR_MACRO_END
 
-#define _SWR_INVALID(chkDebugger, ...)    \
-    _SWR_MACRO_START \
-    static bool swrAssertEnabled = true;\
-    bool expFailed = SwrAssert(chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\
-    if (expFailed) { DEBUGBREAK; }\
+#define _SWR_INVALID(chkDebugger, ...)                                                                     \
+    _SWR_MACRO_START                                                                                       \
+    static bool swrAssertEnabled = true;                                                                   \
+    bool        expFailed        = SwrAssert(                                                              \
+        chkDebugger, swrAssertEnabled, "", __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__); \
+    if (expFailed)                                                                                         \
+    {                                                                                                      \
+        DEBUGBREAK;                                                                                        \
+    }                                                                                                      \
     _SWR_MACRO_END
 
-#define _SWR_TRACE(_fmtstr, ...) \
-    SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
+#define _SWR_TRACE(_fmtstr, ...) SwrTrace(__FILE__, __LINE__, __FUNCTION__, _fmtstr, ##__VA_ARGS__);
 
 #if SWR_ENABLE_ASSERTS
-#define SWR_ASSERT(e, ...)              _SWR_ASSERT(true, e, ##__VA_ARGS__)
-#define SWR_ASSUME_ASSERT(e, ...)       SWR_ASSERT(e, ##__VA_ARGS__)
-#define SWR_TRACE(_fmtstr, ...)         _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
+#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__)
+#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSERT(e, ##__VA_ARGS__)
+#define SWR_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
 
 #if defined(assert)
 #undef assert
@@ -160,24 +173,25 @@
 #endif // SWR_ENABLE_ASSERTS
 
 #if SWR_ENABLE_REL_ASSERTS
-#define SWR_REL_ASSERT(e, ...)          _SWR_ASSERT(false, e, ##__VA_ARGS__)
-#define SWR_REL_ASSUME_ASSERT(e, ...)   SWR_REL_ASSERT(e, ##__VA_ARGS__)
-#define SWR_REL_TRACE(_fmtstr, ...)     _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
+#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__)
+#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_REL_ASSERT(e, ##__VA_ARGS__)
+#define SWR_REL_TRACE(_fmtstr, ...) _SWR_TRACE(_fmtstr, ##__VA_ARGS__)
 
 // SWR_INVALID is always enabled
 // Funky handling to allow 0 arguments with g++/gcc
 // This is needed because you can't "swallow commas" with ##_VA_ARGS__ unless
 // there is a first argument to the macro.  So having a macro that can optionally
 // accept 0 arguments is tricky.
-#define _SWR_INVALID_0()                _SWR_INVALID(false)
-#define _SWR_INVALID_1(...)             _SWR_INVALID(false, ##__VA_ARGS__)
+#define _SWR_INVALID_0() _SWR_INVALID(false)
+#define _SWR_INVALID_1(...) _SWR_INVALID(false, ##__VA_ARGS__)
 #define _SWR_INVALID_VARGS_(_10, _9, _8, _7, _6, _5, _4, _3, _2, _1, N, ...) N
-#define _SWR_INVALID_VARGS(...)         _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
-#define _SWR_INVALID_VARGS_0()          1, 2, 3, 4, 5, 6, 7, 9, 9, 10
-#define _SWR_INVALID_CONCAT_(a, b)      a##b
-#define _SWR_INVALID_CONCAT(a, b)       _SWR_INVALID_CONCAT_(a, b)
-#define SWR_INVALID(...)                \
-    _SWR_INVALID_CONCAT(_SWR_INVALID_,_SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__ ()))(__VA_ARGS__)
+#define _SWR_INVALID_VARGS(...) _SWR_INVALID_VARGS_(__VA_ARGS__, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1)
+#define _SWR_INVALID_VARGS_0() 1, 2, 3, 4, 5, 6, 7, 9, 9, 10
+#define _SWR_INVALID_CONCAT_(a, b) a##b
+#define _SWR_INVALID_CONCAT(a, b) _SWR_INVALID_CONCAT_(a, b)
+#define SWR_INVALID(...)                                                                       \
+    _SWR_INVALID_CONCAT(_SWR_INVALID_, _SWR_INVALID_VARGS(_SWR_INVALID_VARGS_0 __VA_ARGS__())) \
+    (__VA_ARGS__)
 #endif
 
 #endif // C++
@@ -185,20 +199,33 @@
 #endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS
 
 // Needed to allow passing bitfield members to sizeof() in disabled asserts
-template<typename T>
-static bool SwrSizeofWorkaround(T) {return false;}
+template <typename T>
+static bool SwrSizeofWorkaround(T)
+{
+    return false;
+}
 
 #if !SWR_ENABLE_ASSERTS
-#define SWR_ASSERT(e, ...)              _SWR_MACRO_START (void)sizeof(SwrSizeofWorkaround(e)); _SWR_MACRO_END
-#define SWR_ASSUME_ASSERT(e, ...)       SWR_ASSUME(e, ##__VA_ARGS__)
-#define SWR_TRACE(_fmtstr, ...)         _SWR_MACRO_START (void)(0); _SWR_MACRO_END
+#define SWR_ASSERT(e, ...)                                 \
+    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
+    _SWR_MACRO_END
+#define SWR_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
+#define SWR_TRACE(_fmtstr, ...) \
+    _SWR_MACRO_START(void)(0);  \
+    _SWR_MACRO_END
 #endif
 
 #if !SWR_ENABLE_REL_ASSERTS
-#define SWR_REL_ASSERT(e, ...)          _SWR_MACRO_START (void)sizeof(SwrSizeofWorkaround(e)); _SWR_MACRO_END
-#define SWR_INVALID(...)                _SWR_MACRO_START (void)(0); _SWR_MACRO_END
-#define SWR_REL_ASSUME_ASSERT(e, ...)   SWR_ASSUME(e, ##__VA_ARGS__)
-#define SWR_REL_TRACE(_fmtstr, ...)     _SWR_MACRO_START (void)(0); _SWR_MACRO_END
+#define SWR_REL_ASSERT(e, ...)                             \
+    _SWR_MACRO_START(void) sizeof(SwrSizeofWorkaround(e)); \
+    _SWR_MACRO_END
+#define SWR_INVALID(...)       \
+    _SWR_MACRO_START(void)(0); \
+    _SWR_MACRO_END
+#define SWR_REL_ASSUME_ASSERT(e, ...) SWR_ASSUME(e, ##__VA_ARGS__)
+#define SWR_REL_TRACE(_fmtstr, ...) \
+    _SWR_MACRO_START(void)(0);      \
+    _SWR_MACRO_END
 #endif
 
 #if defined(_MSC_VER)
@@ -211,4 +238,4 @@
 
 #define SWR_NOT_IMPL SWR_INVALID("%s not implemented", SWR_FUNCTION_DECL)
 
-#endif//__SWR_ASSERT_H__
+#endif //__SWR_ASSERT_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index e37e2e4..00f3313 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file api.cpp
-*
-* @brief API implementation
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file api.cpp
+ *
+ * @brief API implementation
+ *
+ ******************************************************************************/
 
 #include <cfloat>
 #include <cmath>
@@ -42,19 +42,20 @@
 #include "core/tilemgr.h"
 #include "core/clip.h"
 #include "core/utils.h"
+#include "core/tileset.h"
 
 #include "common/os.h"
 
-static const SWR_RECT g_MaxScissorRect = { 0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y };
+static const SWR_RECT g_MaxScissorRect = {0, 0, KNOB_MAX_SCISSOR_X, KNOB_MAX_SCISSOR_Y};
 
-void SetupDefaultState(SWR_CONTEXT *pContext);
+void SetupDefaultState(SWR_CONTEXT* pContext);
 
 static INLINE SWR_CONTEXT* GetContext(HANDLE hContext)
 {
     return (SWR_CONTEXT*)hContext;
 }
 
-void WakeAllThreads(SWR_CONTEXT *pContext)
+void WakeAllThreads(SWR_CONTEXT* pContext)
 {
     pContext->FifosNotEmpty.notify_all();
 }
@@ -62,15 +63,14 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
-HANDLE SwrCreateContext(
-    SWR_CREATECONTEXT_INFO* pCreateInfo)
+HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
 {
     RDTSC_RESET();
     RDTSC_INIT(0);
 
     void* pContextMem = AlignedMalloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4);
     memset(pContextMem, 0, sizeof(SWR_CONTEXT));
-    SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT();
+    SWR_CONTEXT* pContext = new (pContextMem) SWR_CONTEXT();
 
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
@@ -83,8 +83,10 @@
     pContext->dcRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
     pContext->dsRing.Init(pContext->MAX_DRAWS_IN_FLIGHT);
 
-    pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
-    pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pMacroTileManagerArray =
+        (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray =
+        (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * pContext->MAX_DRAWS_IN_FLIGHT, 64);
 
     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
     {
@@ -101,14 +103,14 @@
     }
     else
     {
-        pContext->threadInfo.MAX_WORKER_THREADS         = KNOB_MAX_WORKER_THREADS;
-        pContext->threadInfo.BASE_NUMA_NODE             = KNOB_BASE_NUMA_NODE;
-        pContext->threadInfo.BASE_CORE                  = KNOB_BASE_CORE;
-        pContext->threadInfo.BASE_THREAD                = KNOB_BASE_THREAD;
-        pContext->threadInfo.MAX_NUMA_NODES             = KNOB_MAX_NUMA_NODES;
-        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE    = KNOB_MAX_CORES_PER_NUMA_NODE;
-        pContext->threadInfo.MAX_THREADS_PER_CORE       = KNOB_MAX_THREADS_PER_CORE;
-        pContext->threadInfo.SINGLE_THREADED            = KNOB_SINGLE_THREADED;
+        pContext->threadInfo.MAX_WORKER_THREADS      = KNOB_MAX_WORKER_THREADS;
+        pContext->threadInfo.BASE_NUMA_NODE          = KNOB_BASE_NUMA_NODE;
+        pContext->threadInfo.BASE_CORE               = KNOB_BASE_CORE;
+        pContext->threadInfo.BASE_THREAD             = KNOB_BASE_THREAD;
+        pContext->threadInfo.MAX_NUMA_NODES          = KNOB_MAX_NUMA_NODES;
+        pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
+        pContext->threadInfo.MAX_THREADS_PER_CORE    = KNOB_MAX_THREADS_PER_CORE;
+        pContext->threadInfo.SINGLE_THREADED         = KNOB_SINGLE_THREADED;
     }
 
     if (pCreateInfo->pApiThreadInfo)
@@ -117,9 +119,9 @@
     }
     else
     {
-        pContext->apiThreadInfo.bindAPIThread0          = true;
-        pContext->apiThreadInfo.numAPIReservedThreads   = 1;
-        pContext->apiThreadInfo.numAPIThreadsPerCore    = 1;
+        pContext->apiThreadInfo.bindAPIThread0        = true;
+        pContext->apiThreadInfo.numAPIReservedThreads = 1;
+        pContext->apiThreadInfo.numAPIThreadsPerCore  = 1;
     }
 
     if (pCreateInfo->pWorkerPrivateState)
@@ -139,13 +141,20 @@
         BindApiThread(pContext, 0);
     }
 
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        pContext->pSingleThreadLockedTiles = new TileSet();
+    }
+
     pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
-    pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
+    pContext->pStats =
+        (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
 
 #if defined(KNOB_ENABLE_AR)
     // Setup ArchRast thread contexts which includes +1 for API thread.
-    pContext->pArContext = new HANDLE[pContext->NumWorkerThreads+1];
-    pContext->pArContext[pContext->NumWorkerThreads] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
+    pContext->pArContext = new HANDLE[pContext->NumWorkerThreads + 1];
+    pContext->pArContext[pContext->NumWorkerThreads] =
+        ArchRast::CreateThreadContext(ArchRast::AR_THREAD::API);
 #endif
 
     // Allocate scratch space for workers.
@@ -153,14 +162,17 @@
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
 #if defined(_WIN32)
-        uint32_t numaNode = pContext->threadPool.pThreadData ?
-            pContext->threadPool.pThreadData[i].numaId : 0;
-        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
-            GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
-            MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
-            numaNode);
+        uint32_t numaNode =
+            pContext->threadPool.pThreadData ? pContext->threadPool.pThreadData[i].numaId : 0;
+        pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(GetCurrentProcess(),
+                                                              nullptr,
+                                                              32 * sizeof(KILOBYTE),
+                                                              MEM_RESERVE | MEM_COMMIT,
+                                                              PAGE_READWRITE,
+                                                              numaNode);
 #else
-        pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+        pContext->ppScratch[i] =
+            (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
 #endif
 
 #if defined(KNOB_ENABLE_AR)
@@ -181,13 +193,13 @@
     pContext->pHotTileMgr = new HotTileMgr();
 
     // initialize callback functions
-    pContext->pfnLoadTile = pCreateInfo->pfnLoadTile;
-    pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
-    pContext->pfnClearTile = pCreateInfo->pfnClearTile;
+    pContext->pfnLoadTile            = pCreateInfo->pfnLoadTile;
+    pContext->pfnStoreTile           = pCreateInfo->pfnStoreTile;
+    pContext->pfnClearTile           = pCreateInfo->pfnClearTile;
     pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
-    pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
-    pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
-    
+    pContext->pfnUpdateStats         = pCreateInfo->pfnUpdateStats;
+    pContext->pfnUpdateStatsFE       = pCreateInfo->pfnUpdateStatsFE;
+
 
     // pass pointer to bucket manager back to caller
 #ifdef KNOB_ENABLE_RDTSC
@@ -206,11 +218,11 @@
     memcpy(&dst.state, &src.state, sizeof(API_STATE));
 }
 
-template<bool IsDraw>
-void QueueWork(SWR_CONTEXT *pContext)
+template <bool IsDraw>
+void QueueWork(SWR_CONTEXT* pContext)
 {
-    DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
-    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
+    DRAW_CONTEXT* pDC     = pContext->pCurDrawContext;
+    uint32_t      dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
 
     if (IsDraw)
     {
@@ -243,9 +255,10 @@
 
         if (IsDraw)
         {
-            uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+            uint32_t curDraw[2] = {pContext->pCurDrawContext->drawId,
+                                   pContext->pCurDrawContext->drawId};
             WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
         }
         else
         {
@@ -253,8 +266,11 @@
             WorkOnCompute(pContext, 0, curDispatch);
         }
 
-        // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
-        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
+        // Dequeue the work here, if not already done, since we're single threaded (i.e. no
+        // workers).
+        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0)
+        {
+        }
 
         // restore csr
         _mm_setcsr(mxcsr);
@@ -266,9 +282,10 @@
         RDTSC_END(APIDrawWakeAllThreads, 1);
     }
 
-    // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
+    // Set current draw context to NULL so that next state call forces a new draw context to be
+    // created and populated.
     pContext->pPrevDrawContext = pContext->pCurDrawContext;
-    pContext->pCurDrawContext = nullptr;
+    pContext->pCurDrawContext  = nullptr;
 }
 
 INLINE void QueueDraw(SWR_CONTEXT* pContext)
@@ -281,7 +298,7 @@
     QueueWork<false>(pContext);
 }
 
-DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
+DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT* pContext, bool isSplitDraw = false)
 {
     RDTSC_BEGIN(APIGetDrawContext, 0);
     // If current draw context is null then need to obtain a new draw context to use from ring.
@@ -303,14 +320,14 @@
             pContext->cachingArenaAllocator.FreeOldBlocks();
 
             pContext->lastFrameChecked = pContext->frameCount;
-            pContext->lastDrawChecked = curDraw;
+            pContext->lastDrawChecked  = curDraw;
         }
 
         DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
-        pContext->pCurDrawContext = pCurDrawContext;
+        pContext->pCurDrawContext     = pCurDrawContext;
 
         // Assign next available entry in DS ring to this DC.
-        uint32_t dsIndex = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
+        uint32_t dsIndex        = pContext->curStateId % pContext->MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
         // Copy previous state to current state.
@@ -330,7 +347,7 @@
 
                 pCurDrawContext->pState->pPrivateState = nullptr;
 
-                pContext->curStateId++;  // Progress state ring index forward.
+                pContext->curStateId++; // Progress state ring index forward.
             }
             else
             {
@@ -343,21 +360,21 @@
         else
         {
             SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
-            pContext->curStateId++;  // Progress state ring index forward.
+            pContext->curStateId++; // Progress state ring index forward.
         }
 
         SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
 
         // Reset dependency
-        pCurDrawContext->dependent = false;
+        pCurDrawContext->dependent   = false;
         pCurDrawContext->dependentFE = false;
 
-        pCurDrawContext->pContext = pContext;
+        pCurDrawContext->pContext  = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
 
-        pCurDrawContext->doneFE = false;
-        pCurDrawContext->FeLock = 0;
-        pCurDrawContext->threadsDone = 0;
+        pCurDrawContext->doneFE                         = false;
+        pCurDrawContext->FeLock                         = 0;
+        pCurDrawContext->threadsDone                    = 0;
         pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
 
         pCurDrawContext->dynState.Reset(pContext->NumWorkerThreads);
@@ -376,7 +393,7 @@
     return pContext->pCurDrawContext;
 }
 
-API_STATE* GetDrawState(SWR_CONTEXT *pContext)
+API_STATE* GetDrawState(SWR_CONTEXT* pContext)
 {
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     SWR_ASSERT(pDC->pState != nullptr);
@@ -386,13 +403,13 @@
 
 void SwrDestroyContext(HANDLE hContext)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
-    pDC->FeWork.type = SHUTDOWN;
+    pDC->FeWork.type    = SHUTDOWN;
     pDC->FeWork.pfnWork = ProcessShutdown;
 
-    //enqueue
+    // enqueue
     QueueDraw(pContext);
 
     DestroyThreadPool(pContext, &pContext->threadPool);
@@ -427,7 +444,8 @@
     delete[] pContext->ppScratch;
     AlignedFree(pContext->pStats);
 
-    delete(pContext->pHotTileMgr);
+    delete pContext->pHotTileMgr;
+    delete pContext->pSingleThreadLockedTiles;
 
     pContext->~SWR_CONTEXT();
     AlignedFree(GetContext(hContext));
@@ -435,67 +453,65 @@
 
 void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
     BindApiThread(pContext, apiThreadId);
 }
 
-void SWR_API SwrSaveState(
-    HANDLE hContext,
-    void* pOutputStateBlock,
-    size_t memSize)
+void SWR_API SwrSaveState(HANDLE hContext, void* pOutputStateBlock, size_t memSize)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    auto pSrc = GetDrawState(pContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    auto         pSrc     = GetDrawState(pContext);
     SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
 
     memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
 }
 
-void SWR_API SwrRestoreState(
-    HANDLE hContext,
-    const void* pStateBlock,
-    size_t memSize)
+void SWR_API SwrRestoreState(HANDLE hContext, const void* pStateBlock, size_t memSize)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    auto pDst = GetDrawState(pContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    auto         pDst     = GetDrawState(pContext);
     SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
 
     memcpy(pDst, pStateBlock, sizeof(*pDst));
 }
 
-void SetupDefaultState(SWR_CONTEXT *pContext)
+void SetupDefaultState(SWR_CONTEXT* pContext)
 {
     API_STATE* pState = GetDrawState(pContext);
 
-    pState->rastState.cullMode = SWR_CULLMODE_NONE;
+    pState->rastState.cullMode     = SWR_CULLMODE_NONE;
     pState->rastState.frontWinding = SWR_FRONTWINDING_CCW;
 
-    pState->depthBoundsState.depthBoundsTestEnable = false;
+    pState->depthBoundsState.depthBoundsTestEnable   = false;
     pState->depthBoundsState.depthBoundsTestMinValue = 0.0f;
     pState->depthBoundsState.depthBoundsTestMaxValue = 1.0f;
 }
 
-void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3)
+void SWR_API SwrSync(HANDLE            hContext,
+                     PFN_CALLBACK_FUNC pfnFunc,
+                     uint64_t          userData,
+                     uint64_t          userData2,
+                     uint64_t          userData3)
 {
     SWR_ASSERT(pfnFunc != nullptr);
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     RDTSC_BEGIN(APISync, 0);
 
-    pDC->FeWork.type = SYNC;
+    pDC->FeWork.type    = SYNC;
     pDC->FeWork.pfnWork = ProcessSync;
 
     // Setup callback function
     pDC->retireCallback.pfnCallbackFunc = pfnFunc;
-    pDC->retireCallback.userData = userData;
-    pDC->retireCallback.userData2 = userData2;
-    pDC->retireCallback.userData3 = userData3;
+    pDC->retireCallback.userData        = userData;
+    pDC->retireCallback.userData2       = userData2;
+    pDC->retireCallback.userData3       = userData3;
 
     AR_API_EVENT(SwrSyncEvent(pDC->drawId));
 
-    //enqueue
+    // enqueue
     QueueDraw(pContext);
 
     RDTSC_END(APISync, 1);
@@ -503,15 +519,15 @@
 
 void SwrStallBE(HANDLE hContext)
 {
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     pDC->dependent = true;
 }
 
 void SwrWaitForIdle(HANDLE hContext)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
 
     RDTSC_BEGIN(APIWaitForIdle, 0);
 
@@ -525,7 +541,7 @@
 
 void SwrWaitForIdleFE(HANDLE hContext)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
 
     RDTSC_BEGIN(APIWaitForIdle, 0);
 
@@ -537,42 +553,34 @@
     RDTSC_END(APIWaitForIdle, 1);
 }
 
-void SwrSetVertexBuffers(
-    HANDLE hContext,
-    uint32_t numBuffers,
-    const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
+void SwrSetVertexBuffers(HANDLE                         hContext,
+                         uint32_t                       numBuffers,
+                         const SWR_VERTEX_BUFFER_STATE* pVertexBuffers)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     for (uint32_t i = 0; i < numBuffers; ++i)
     {
-        const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i];
-        pState->vertexBuffers[pVB->index] = *pVB;
+        const SWR_VERTEX_BUFFER_STATE* pVB = &pVertexBuffers[i];
+        pState->vertexBuffers[pVB->index]  = *pVB;
     }
 }
 
-void SwrSetIndexBuffer(
-    HANDLE hContext,
-    const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
+void SwrSetIndexBuffer(HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->indexBuffer = *pIndexBuffer;
 }
 
-void SwrSetFetchFunc(
-    HANDLE hContext,
-    PFN_FETCH_FUNC    pfnFetchFunc)
+void SwrSetFetchFunc(HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->pfnFetchFunc = pfnFetchFunc;
 }
 
-void SwrSetSoFunc(
-    HANDLE hContext,
-    PFN_SO_FUNC    pfnSoFunc,
-    uint32_t streamIndex)
+void SwrSetSoFunc(HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
@@ -581,19 +589,14 @@
     pState->pfnSoFunc[streamIndex] = pfnSoFunc;
 }
 
-void SwrSetSoState(
-    HANDLE hContext,
-    SWR_STREAMOUT_STATE* pSoState)
+void SwrSetSoState(HANDLE hContext, SWR_STREAMOUT_STATE* pSoState)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->soState = *pSoState;
 }
 
-void SwrSetSoBuffers(
-    HANDLE hContext,
-    SWR_STREAMOUT_BUFFER* pSoBuffer,
-    uint32_t slot)
+void SwrSetSoBuffers(HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
@@ -602,168 +605,136 @@
     pState->soBuffer[slot] = *pSoBuffer;
 }
 
-void SwrSetVertexFunc(
-    HANDLE hContext,
-    PFN_VERTEX_FUNC pfnVertexFunc)
+void SwrSetVertexFunc(HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->pfnVertexFunc = pfnVertexFunc;
 }
 
-void SwrSetFrontendState(
-    HANDLE hContext,
-    SWR_FRONTEND_STATE *pFEState)
+void SwrSetFrontendState(HANDLE hContext, SWR_FRONTEND_STATE* pFEState)
 {
-    API_STATE* pState = GetDrawState(GetContext(hContext));
+    API_STATE* pState     = GetDrawState(GetContext(hContext));
     pState->frontendState = *pFEState;
 }
 
-void SwrSetGsState(
-    HANDLE hContext,
-    SWR_GS_STATE *pGSState)
+void SwrSetGsState(HANDLE hContext, SWR_GS_STATE* pGSState)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
-    pState->gsState = *pGSState;
+    pState->gsState   = *pGSState;
 }
 
-void SwrSetGsFunc(
-    HANDLE hContext,
-    PFN_GS_FUNC pfnGsFunc)
+void SwrSetGsFunc(HANDLE hContext, PFN_GS_FUNC pfnGsFunc)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
     pState->pfnGsFunc = pfnGsFunc;
 }
 
-void SwrSetCsFunc(
-    HANDLE hContext,
-    PFN_CS_FUNC pfnCsFunc,
-    uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize,
-    uint32_t scratchSpaceSizePerInstance,
-    uint32_t numInstances)
+void SwrSetCsFunc(HANDLE      hContext,
+                  PFN_CS_FUNC pfnCsFunc,
+                  uint32_t    totalThreadsInGroup,
+                  uint32_t    totalSpillFillSize,
+                  uint32_t    scratchSpaceSizePerInstance,
+                  uint32_t    numInstances)
 {
-    API_STATE* pState = GetDrawState(GetContext(hContext));
-    pState->pfnCsFunc = pfnCsFunc;
-    pState->totalThreadsInGroup = totalThreadsInGroup;
-    pState->totalSpillFillSize = totalSpillFillSize;
-    pState->scratchSpaceSize = scratchSpaceSizePerInstance;
+    API_STATE* pState                = GetDrawState(GetContext(hContext));
+    pState->pfnCsFunc                = pfnCsFunc;
+    pState->totalThreadsInGroup      = totalThreadsInGroup;
+    pState->totalSpillFillSize       = totalSpillFillSize;
+    pState->scratchSpaceSize         = scratchSpaceSizePerInstance;
     pState->scratchSpaceNumInstances = numInstances;
 }
 
-void SwrSetTsState(
-    HANDLE hContext,
-    SWR_TS_STATE *pState)
+void SwrSetTsState(HANDLE hContext, SWR_TS_STATE* pState)
 {
     API_STATE* pApiState = GetDrawState(GetContext(hContext));
-    pApiState->tsState = *pState;
+    pApiState->tsState   = *pState;
 }
 
-void SwrSetHsFunc(
-    HANDLE hContext,
-    PFN_HS_FUNC pfnFunc)
+void SwrSetHsFunc(HANDLE hContext, PFN_HS_FUNC pfnFunc)
 {
     API_STATE* pApiState = GetDrawState(GetContext(hContext));
     pApiState->pfnHsFunc = pfnFunc;
 }
 
-void SwrSetDsFunc(
-    HANDLE hContext,
-    PFN_DS_FUNC pfnFunc)
+void SwrSetDsFunc(HANDLE hContext, PFN_DS_FUNC pfnFunc)
 {
     API_STATE* pApiState = GetDrawState(GetContext(hContext));
     pApiState->pfnDsFunc = pfnFunc;
 }
 
-void SwrSetDepthStencilState(
-    HANDLE hContext,
-    SWR_DEPTH_STENCIL_STATE *pDSState)
+void SwrSetDepthStencilState(HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pDSState)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->depthStencilState = *pDSState;
 }
 
-void SwrSetBackendState(
-    HANDLE hContext,
-    SWR_BACKEND_STATE *pBEState)
+void SwrSetBackendState(HANDLE hContext, SWR_BACKEND_STATE* pBEState)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->backendState = *pBEState;
 }
 
-void SwrSetDepthBoundsState(
-    HANDLE hContext,
-    SWR_DEPTH_BOUNDS_STATE *pDBState)
+void SwrSetDepthBoundsState(HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pDBState)
 {
     API_STATE* pState = GetDrawState(GetContext(hContext));
 
     pState->depthBoundsState = *pDBState;
 }
 
-void SwrSetPixelShaderState(
-    HANDLE hContext,
-    SWR_PS_STATE *pPSState)
+void SwrSetPixelShaderState(HANDLE hContext, SWR_PS_STATE* pPSState)
 {
-    API_STATE *pState = GetDrawState(GetContext(hContext));
-    pState->psState = *pPSState;
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+    pState->psState   = *pPSState;
 }
 
-void SwrSetBlendState(
-    HANDLE hContext,
-    SWR_BLEND_STATE *pBlendState)
+void SwrSetBlendState(HANDLE hContext, SWR_BLEND_STATE* pBlendState)
 {
-    API_STATE *pState = GetDrawState(GetContext(hContext));
+    API_STATE* pState = GetDrawState(GetContext(hContext));
     memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE));
 }
 
-void SwrSetBlendFunc(
-    HANDLE hContext,
-    uint32_t renderTarget,
-    PFN_BLEND_JIT_FUNC pfnBlendFunc)
+void SwrSetBlendFunc(HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc)
 {
     SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS);
-    API_STATE *pState = GetDrawState(GetContext(hContext));
+    API_STATE* pState                  = GetDrawState(GetContext(hContext));
     pState->pfnBlendFunc[renderTarget] = pfnBlendFunc;
 }
 
 // update guardband multipliers for the viewport
-void updateGuardbands(API_STATE *pState)
+void updateGuardbands(API_STATE* pState)
 {
     uint32_t numGbs = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
 
-    for(uint32_t i = 0; i < numGbs; ++i)
+    for (uint32_t i = 0; i < numGbs; ++i)
     {
         // guardband center is viewport center
-        pState->gbState.left[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
-        pState->gbState.right[i] = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
-        pState->gbState.top[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
+        pState->gbState.left[i]   = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
+        pState->gbState.right[i]  = KNOB_GUARDBAND_WIDTH / pState->vp[i].width;
+        pState->gbState.top[i]    = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
         pState->gbState.bottom[i] = KNOB_GUARDBAND_HEIGHT / pState->vp[i].height;
     }
 }
 
-void SwrSetRastState(
-    HANDLE hContext,
-    const SWR_RASTSTATE *pRastState)
+void SwrSetRastState(HANDLE hContext, const SWR_RASTSTATE* pRastState)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    API_STATE* pState = GetDrawState(pContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    API_STATE*   pState   = GetDrawState(pContext);
 
     memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE));
 }
 
-void SwrSetViewports(
-    HANDLE hContext,
-    uint32_t numViewports,
-    const SWR_VIEWPORT* pViewports,
-    const SWR_VIEWPORT_MATRICES* pMatrices)
+void SwrSetViewports(HANDLE                       hContext,
+                     uint32_t                     numViewports,
+                     const SWR_VIEWPORT*          pViewports,
+                     const SWR_VIEWPORT_MATRICES* pMatrices)
 {
-    SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS,
-        "Invalid number of viewports.");
+    SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of viewports.");
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    API_STATE* pState = GetDrawState(pContext);
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    API_STATE*   pState   = GetDrawState(pContext);
 
     memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports);
     // @todo Faster to copy portions of the SOA or just copy all of it?
@@ -772,27 +743,24 @@
     updateGuardbands(pState);
 }
 
-void SwrSetScissorRects(
-    HANDLE hContext,
-    uint32_t numScissors,
-    const SWR_RECT* pScissors)
+void SwrSetScissorRects(HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors)
 {
-    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
-        "Invalid number of scissor rects.");
+    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS, "Invalid number of scissor rects.");
 
     API_STATE* pState = GetDrawState(GetContext(hContext));
     memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(pScissors[0]));
 };
 
-void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
+void SetupMacroTileScissors(DRAW_CONTEXT* pDC)
 {
-    API_STATE *pState = &pDC->pState->state;
-    uint32_t numScissors = pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
+    API_STATE* pState = &pDC->pState->state;
+    uint32_t numScissors =
+        pState->backendState.readViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;
     pState->scissorsTileAligned = true;
 
     for (uint32_t index = 0; index < numScissors; ++index)
     {
-        SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];
+        SWR_RECT& scissorInFixedPoint = pState->scissorsInFixedPoint[index];
 
         // Set up scissor dimensions based on scissor or viewport
         if (pState->rastState.scissorEnable)
@@ -801,8 +769,9 @@
         }
         else
         {
-            // the vp width and height must be added to origin un-rounded then the result round to -inf.
-            // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
+            // the vp width and height must be added to origin un-rounded then the result round to
+            // -inf. The cast to int works for rounding assuming all [left, right, top, bottom] are
+            // positive.
             scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
             scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
             scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
@@ -814,7 +783,7 @@
 
         // Test for tile alignment
         bool tileAligned;
-        tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
+        tileAligned = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
         tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
         tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
         tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;
@@ -836,12 +805,12 @@
 
 // templated backend function tables
 
-void SetupPipeline(DRAW_CONTEXT *pDC)
+void SetupPipeline(DRAW_CONTEXT* pDC)
 {
-    DRAW_STATE* pState = pDC->pState;
-    const SWR_RASTSTATE &rastState = pState->state.rastState;
-    const SWR_PS_STATE &psState = pState->state.psState;
-    BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
+    DRAW_STATE*          pState       = pDC->pState;
+    const SWR_RASTSTATE& rastState    = pState->state.rastState;
+    const SWR_PS_STATE&  psState      = pState->state.psState;
+    BACKEND_FUNCS&       backendFuncs = pState->backendFuncs;
 
     // setup backend
     if (psState.pfnPixelShader == nullptr)
@@ -851,35 +820,46 @@
     else
     {
         const uint32_t forcedSampleCount = (rastState.forcedSampleCount) ? 1 : 0;
-        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
-        const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
-        const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
+        const bool     bMultisampleEnable =
+            ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || forcedSampleCount) ? 1 : 0;
+        const uint32_t centroid =
+            ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+        const uint32_t canEarlyZ =
+            (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesUAV)) ? 1 : 0;
         SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
-        
+
         // select backend function
-        switch(psState.shadingRate)
+        switch (psState.shadingRate)
         {
         case SWR_SHADING_RATE_PIXEL:
-            if(bMultisampleEnable)
+            if (bMultisampleEnable)
             {
                 // always need to generate I & J per sample for Z interpolation
-                barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern][psState.inputCoverage]
-                                                                [centroid][forcedSampleCount][canEarlyZ]
+                barycentricsMask =
+                    (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
+                backendFuncs.pfnBackend =
+                    gBackendPixelRateTable[rastState.sampleCount][rastState.bIsCenterPattern]
+                                          [psState.inputCoverage][centroid][forcedSampleCount]
+                                          [canEarlyZ]
                     ;
             }
             else
             {
                 // always need to generate I & J per pixel for Z interpolation
-                barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
+                barycentricsMask =
+                    (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
+                backendFuncs.pfnBackend =
+                    gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ];
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
             SWR_ASSERT(rastState.bIsCenterPattern != true);
             // always need to generate I & J per sample for Z interpolation
-            barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ];
+            barycentricsMask =
+                (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
+            backendFuncs.pfnBackend =
+                gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]
+                                       [canEarlyZ];
             break;
         default:
             SWR_ASSERT(0 && "Invalid shading rate");
@@ -897,10 +877,10 @@
     {
     case TOP_POINT_LIST:
         pState->pfnProcessPrims = ClipPoints;
-        pfnBinner = BinPoints;
+        pfnBinner               = BinPoints;
 #if USE_SIMD16_FRONTEND
         pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
-        pfnBinner_simd16 = BinPoints_simd16;
+        pfnBinner_simd16               = BinPoints_simd16;
 #endif
         break;
     case TOP_LINE_LIST:
@@ -909,15 +889,15 @@
     case TOP_LINE_LIST_ADJ:
     case TOP_LISTSTRIP_ADJ:
         pState->pfnProcessPrims = ClipLines;
-        pfnBinner = BinLines;
+        pfnBinner               = BinLines;
 #if USE_SIMD16_FRONTEND
         pState->pfnProcessPrims_simd16 = ClipLines_simd16;
-        pfnBinner_simd16 = BinLines_simd16;
+        pfnBinner_simd16               = BinLines_simd16;
 #endif
         break;
     default:
         pState->pfnProcessPrims = ClipTriangles;
-        pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+        pfnBinner               = GetBinTrianglesFunc((rastState.conservativeRast > 0));
 #if USE_SIMD16_FRONTEND
         pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
         pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
@@ -959,14 +939,16 @@
 
 
     // set up the frontend attribute count
-    pState->state.feNumAttributes = 0;
+    pState->state.feNumAttributes         = 0;
     const SWR_BACKEND_STATE& backendState = pState->state.backendState;
     if (backendState.swizzleEnable)
     {
         // attribute swizzling is enabled, iterate over the map and record the max attribute used
         for (uint32_t i = 0; i < backendState.numAttributes; ++i)
         {
-            pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
+            pState->state.feNumAttributes =
+                std::max(pState->state.feNumAttributes,
+                         (uint32_t)backendState.swizzleMap[i].sourceAttrib + 1);
         }
     }
     else
@@ -976,46 +958,53 @@
 
     if (pState->state.soState.soEnable)
     {
-        uint32_t streamMasks = 0;
+        uint64_t streamMasks = 0;
         for (uint32_t i = 0; i < 4; ++i)
         {
             streamMasks |= pState->state.soState.streamMasks[i];
         }
 
         DWORD maxAttrib;
-        if (_BitScanReverse(&maxAttrib, streamMasks))
+        if (_BitScanReverse64(&maxAttrib, streamMasks))
         {
-            pState->state.feNumAttributes = std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
+            pState->state.feNumAttributes =
+                std::max(pState->state.feNumAttributes, (uint32_t)(maxAttrib + 1));
         }
     }
 
     // complicated logic to test for cases where we don't need backing hottile memory for a draw
-    // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled.
-    pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable &&
-                                           !pState->state.depthStencilState.depthWriteEnable &&
-                                           !pState->state.depthBoundsState.depthBoundsTestEnable &&
-                                           pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && 
-                                        (pState->state.depthStencilState.depthTestEnable || 
-                                         pState->state.depthStencilState.depthWriteEnable ||
-                                         pState->state.depthBoundsState.depthBoundsTestEnable)) ? true : false;
+    // have to check for the special case where depth/stencil test is enabled but depthwrite is
+    // disabled.
+    pState->state.depthHottileEnable =
+        ((!(pState->state.depthStencilState.depthTestEnable &&
+            !pState->state.depthStencilState.depthWriteEnable &&
+            !pState->state.depthBoundsState.depthBoundsTestEnable &&
+            pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) &&
+         (pState->state.depthStencilState.depthTestEnable ||
+          pState->state.depthStencilState.depthWriteEnable ||
+          pState->state.depthBoundsState.depthBoundsTestEnable))
+            ? true
+            : false;
 
-    pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable &&
-                                             !pState->state.depthStencilState.stencilWriteEnable &&
-                                              pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
-                                          // for stencil we have to check the double sided state as well
-                                          (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
-                                             !pState->state.depthStencilState.stencilWriteEnable &&
-                                              pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && 
-                                          (pState->state.depthStencilState.stencilTestEnable  ||
-                                           pState->state.depthStencilState.stencilWriteEnable)) ? true : false;
-
+    pState->state.stencilHottileEnable =
+        (((!(pState->state.depthStencilState.stencilTestEnable &&
+             !pState->state.depthStencilState.stencilWriteEnable &&
+             pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) ||
+          // for stencil we have to check the double sided state as well
+          (!(pState->state.depthStencilState.doubleSidedStencilTestEnable &&
+             !pState->state.depthStencilState.stencilWriteEnable &&
+             pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) &&
+         (pState->state.depthStencilState.stencilTestEnable ||
+          pState->state.depthStencilState.stencilWriteEnable))
+            ? true
+            : false;
 
     uint32_t hotTileEnable = pState->state.psState.renderTargetMask;
 
     // Disable hottile for surfaces with no writes
     if (psState.pfnPixelShader != nullptr)
     {
-        DWORD rt;
+        DWORD    rt;
         uint32_t rtMask = pState->state.psState.renderTargetMask;
         while (_BitScanForward(&rt, rtMask))
         {
@@ -1033,33 +1022,39 @@
 
     pState->state.colorHottileEnable = hotTileEnable;
 
-
     // Setup depth quantization function
     if (pState->state.depthHottileEnable)
     {
         switch (pState->state.rastState.depthFormat)
         {
-        case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
-        case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
-        case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
-        case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
-        default: SWR_INVALID("Unsupported depth format for depth quantiztion.");
-            pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+        case R32_FLOAT_X8X24_TYPELESS:
+            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>;
+            break;
+        case R32_FLOAT:
+            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
+            break;
+        case R24_UNORM_X8_TYPELESS:
+            pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>;
+            break;
+        case R16_UNORM:
+            pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>;
+            break;
+        default:
+            SWR_INVALID("Unsupported depth format for depth quantiztion.");
+            pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
         }
     }
     else
     {
         // set up pass-through quantize if depth isn't enabled
-        pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+        pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
     }
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief InitDraw
 /// @param pDC - Draw context to initialize for this draw.
-void InitDraw(
-    DRAW_CONTEXT *pDC,
-    bool isSplitDraw)
+void InitDraw(DRAW_CONTEXT* pDC, bool isSplitDraw)
 {
     // We don't need to re-setup the scissors/pipeline state again for split draw.
     if (isSplitDraw == false)
@@ -1067,7 +1062,6 @@
         SetupMacroTileScissors(pDC);
         SetupPipeline(pDC);
     }
-    
 
 }
 
@@ -1075,10 +1069,7 @@
 /// @brief We can split the draw for certain topologies for better performance.
 /// @param totalVerts - Total vertices for draw
 /// @param topology - Topology used for draw
-uint32_t MaxVertsPerDraw(
-    DRAW_CONTEXT* pDC,
-    uint32_t totalVerts,
-    PRIMITIVE_TOPOLOGY topology)
+uint32_t MaxVertsPerDraw(DRAW_CONTEXT* pDC, uint32_t totalVerts, PRIMITIVE_TOPOLOGY topology)
 {
     API_STATE& state = pDC->pState->state;
 
@@ -1145,7 +1136,7 @@
         if (pDC->pState->state.tsState.tsEnable)
         {
             uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE;
-            vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
+            vertsPerDraw          = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW;
         }
         break;
     default:
@@ -1156,7 +1147,6 @@
     return vertsPerDraw;
 }
 
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief DrawInstanced
 /// @param hContext - Handle passed back from SwrCreateContext
@@ -1164,31 +1154,31 @@
 /// @param numVerts - How many vertices to read sequentially from vertex data (per instance).
 /// @param startVertex - Specifies start vertex for draw. (vertex data)
 /// @param numInstances - How many instances to render.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void DrawInstanced(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numVertices,
-    uint32_t startVertex,
-    uint32_t numInstances = 1,
-    uint32_t startInstance = 0)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+void DrawInstanced(HANDLE             hContext,
+                   PRIMITIVE_TOPOLOGY topology,
+                   uint32_t           numVertices,
+                   uint32_t           startVertex,
+                   uint32_t           numInstances  = 1,
+                   uint32_t           startInstance = 0)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     RDTSC_BEGIN(APIDraw, pDC->drawId);
 
     uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
-    uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
-    uint32_t remainingVerts = numVertices;
+    uint32_t primsPerDraw    = GetNumPrims(topology, maxVertsPerDraw);
+    uint32_t remainingVerts  = numVertices;
 
-    API_STATE    *pState = &pDC->pState->state;
-    pState->topology = topology;
+    API_STATE* pState  = &pDC->pState->state;
+    pState->topology   = topology;
     pState->forceFront = false;
 
     // disable culling for points/lines
@@ -1196,7 +1186,7 @@
     if (topology == TOP_POINT_LIST)
     {
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
-        pState->forceFront = true;
+        pState->forceFront         = true;
     }
     else if (topology == TOP_RECT_LIST)
     {
@@ -1206,42 +1196,50 @@
     int draw = 0;
     while (remainingVerts)
     {
-        uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ?
-        remainingVerts : maxVertsPerDraw;
+        uint32_t numVertsForDraw =
+            (remainingVerts < maxVertsPerDraw) ? remainingVerts : maxVertsPerDraw;
 
-        bool isSplitDraw = (draw > 0) ? true : false;
-        DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw);
+        bool          isSplitDraw = (draw > 0) ? true : false;
+        DRAW_CONTEXT* pDC         = GetDrawContext(pContext, isSplitDraw);
         InitDraw(pDC, isSplitDraw);
 
-        pDC->FeWork.type = DRAW;
-        pDC->FeWork.pfnWork = GetProcessDrawFunc(
-            false,  // IsIndexed
-            false, // bEnableCutIndex
-            pState->tsState.tsEnable,
-            pState->gsState.gsEnable,
-            pState->soState.soEnable,
-            pDC->pState->pfnProcessPrims != nullptr);
-        pDC->FeWork.desc.draw.numVerts = numVertsForDraw;
-        pDC->FeWork.desc.draw.startVertex = startVertex;
-        pDC->FeWork.desc.draw.numInstances = numInstances;
+        pDC->FeWork.type                    = DRAW;
+        pDC->FeWork.pfnWork                 = GetProcessDrawFunc(false, // IsIndexed
+                                                 false, // bEnableCutIndex
+                                                 pState->tsState.tsEnable,
+                                                 pState->gsState.gsEnable,
+                                                 pState->soState.soEnable,
+                                                 pDC->pState->pfnProcessPrims != nullptr);
+        pDC->FeWork.desc.draw.numVerts      = numVertsForDraw;
+        pDC->FeWork.desc.draw.startVertex   = startVertex;
+        pDC->FeWork.desc.draw.numInstances  = numInstances;
         pDC->FeWork.desc.draw.startInstance = startInstance;
-        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
+        pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
         pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
 
         pDC->cleanupState = (remainingVerts == numVertsForDraw);
 
-        //enqueue DC
+        // enqueue DC
         QueueDraw(pContext);
 
-        AR_API_EVENT(DrawInstancedEvent(pDC->drawId, topology, numVertsForDraw, startVertex, numInstances,
-            startInstance, pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pState->gsState.outputTopology, draw));
+        AR_API_EVENT(DrawInstancedEvent(pDC->drawId,
+                                        topology,
+                                        numVertsForDraw,
+                                        startVertex,
+                                        numInstances,
+                                        startInstance,
+                                        pState->tsState.tsEnable,
+                                        pState->gsState.gsEnable,
+                                        pState->soState.soEnable,
+                                        pState->gsState.outputTopology,
+                                        draw));
 
         remainingVerts -= numVertsForDraw;
         draw++;
     }
 
     // restore culling state
-    pDC = GetDrawContext(pContext);
+    pDC                                   = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
 
     RDTSC_END(APIDraw, numVertices * numInstances);
@@ -1253,11 +1251,10 @@
 /// @param topology - Specifies topology for draw.
 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
 /// @param primCount - Number of vertices.
-void SwrDraw(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t startVertex,
-    uint32_t numVertices)
+void SwrDraw(HANDLE             hContext,
+             PRIMITIVE_TOPOLOGY topology,
+             uint32_t           startVertex,
+             uint32_t           numVertices)
 {
     DrawInstanced(hContext, topology, numVertices, startVertex);
 }
@@ -1269,17 +1266,17 @@
 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
 /// @param numInstances - How many instances to render.
 /// @param startVertex - Specifies start vertex for draw. (vertex data)
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void SwrDrawInstanced(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numVertsPerInstance,
-    uint32_t numInstances,
-    uint32_t startVertex,
-    uint32_t startInstance
-    )
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+void SwrDrawInstanced(HANDLE             hContext,
+                      PRIMITIVE_TOPOLOGY topology,
+                      uint32_t           numVertsPerInstance,
+                      uint32_t           numInstances,
+                      uint32_t           startVertex,
+                      uint32_t           startInstance)
 {
-    DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
+    DrawInstanced(
+        hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1290,46 +1287,52 @@
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
 /// @param numInstances - Number of instances to render.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void DrawIndexedInstance(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numIndices,
-    uint32_t indexOffset,
-    int32_t baseVertex,
-    uint32_t numInstances = 1,
-    uint32_t startInstance = 0)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+void DrawIndexedInstance(HANDLE             hContext,
+                         PRIMITIVE_TOPOLOGY topology,
+                         uint32_t           numIndices,
+                         uint32_t           indexOffset,
+                         int32_t            baseVertex,
+                         uint32_t           numInstances  = 1,
+                         uint32_t           startInstance = 0)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-    API_STATE* pState = &pDC->pState->state;
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
+    API_STATE*    pState   = &pDC->pState->state;
 
     RDTSC_BEGIN(APIDrawIndexed, pDC->drawId);
 
     uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
-    uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
-    uint32_t remainingIndices = numIndices;
+    uint32_t primsPerDraw      = GetNumPrims(topology, maxIndicesPerDraw);
+    uint32_t remainingIndices  = numIndices;
 
     uint32_t indexSize = 0;
     switch (pState->indexBuffer.format)
     {
-    case R32_UINT: indexSize = sizeof(uint32_t); break;
-    case R16_UINT: indexSize = sizeof(uint16_t); break;
-    case R8_UINT: indexSize = sizeof(uint8_t); break;
+    case R32_UINT:
+        indexSize = sizeof(uint32_t);
+        break;
+    case R16_UINT:
+        indexSize = sizeof(uint16_t);
+        break;
+    case R8_UINT:
+        indexSize = sizeof(uint8_t);
+        break;
     default:
         SWR_INVALID("Invalid index buffer format: %d", pState->indexBuffer.format);
     }
 
-    int draw = 0;
+    int      draw = 0;
     gfxptr_t xpIB = pState->indexBuffer.xpIndices;
     xpIB += (uint64_t)indexOffset * (uint64_t)indexSize;
 
-    pState->topology = topology;
+    pState->topology   = topology;
     pState->forceFront = false;
 
     // disable culling for points/lines
@@ -1337,7 +1340,7 @@
     if (topology == TOP_POINT_LIST)
     {
         pState->rastState.cullMode = SWR_CULLMODE_NONE;
-        pState->forceFront = true;
+        pState->forceFront         = true;
     }
     else if (topology == TOP_RECT_LIST)
     {
@@ -1346,8 +1349,8 @@
 
     while (remainingIndices)
     {
-        uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ?
-        remainingIndices : maxIndicesPerDraw;
+        uint32_t numIndicesForDraw =
+            (remainingIndices < maxIndicesPerDraw) ? remainingIndices : maxIndicesPerDraw;
 
         // When breaking up draw, we need to obtain new draw context for each iteration.
         bool isSplitDraw = (draw > 0) ? true : false;
@@ -1355,31 +1358,40 @@
         pDC = GetDrawContext(pContext, isSplitDraw);
         InitDraw(pDC, isSplitDraw);
 
-        pDC->FeWork.type = DRAW;
-        pDC->FeWork.pfnWork = GetProcessDrawFunc(
-            true,   // IsIndexed
-            pState->frontendState.bEnableCutIndex,
-            pState->tsState.tsEnable,
-            pState->gsState.gsEnable,
-            pState->soState.soEnable,
-            pDC->pState->pfnProcessPrims != nullptr);
-        pDC->FeWork.desc.draw.pDC = pDC;
+        pDC->FeWork.type                 = DRAW;
+        pDC->FeWork.pfnWork              = GetProcessDrawFunc(true, // IsIndexed
+                                                 pState->frontendState.bEnableCutIndex,
+                                                 pState->tsState.tsEnable,
+                                                 pState->gsState.gsEnable,
+                                                 pState->soState.soEnable,
+                                                 pDC->pState->pfnProcessPrims != nullptr);
+        pDC->FeWork.desc.draw.pDC        = pDC;
         pDC->FeWork.desc.draw.numIndices = numIndicesForDraw;
-        pDC->FeWork.desc.draw.xpIB = xpIB;
-        pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format;
+        pDC->FeWork.desc.draw.xpIB       = xpIB;
+        pDC->FeWork.desc.draw.type       = pDC->pState->state.indexBuffer.format;
 
-        pDC->FeWork.desc.draw.numInstances = numInstances;
+        pDC->FeWork.desc.draw.numInstances  = numInstances;
         pDC->FeWork.desc.draw.startInstance = startInstance;
-        pDC->FeWork.desc.draw.baseVertex = baseVertex;
-        pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
+        pDC->FeWork.desc.draw.baseVertex    = baseVertex;
+        pDC->FeWork.desc.draw.startPrimID   = draw * primsPerDraw;
 
         pDC->cleanupState = (remainingIndices == numIndicesForDraw);
 
-        //enqueue DC
+        // enqueue DC
         QueueDraw(pContext);
 
-        AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId, topology, numIndicesForDraw, indexOffset, baseVertex,
-            numInstances, startInstance, pState->tsState.tsEnable, pState->gsState.gsEnable, pState->soState.soEnable, pState->gsState.outputTopology, draw));
+        AR_API_EVENT(DrawIndexedInstancedEvent(pDC->drawId,
+                                               topology,
+                                               numIndicesForDraw,
+                                               indexOffset,
+                                               baseVertex,
+                                               numInstances,
+                                               startInstance,
+                                               pState->tsState.tsEnable,
+                                               pState->gsState.gsEnable,
+                                               pState->soState.soEnable,
+                                               pState->gsState.outputTopology,
+                                               draw));
 
         xpIB += maxIndicesPerDraw * indexSize;
         remainingIndices -= numIndicesForDraw;
@@ -1387,13 +1399,12 @@
     }
 
     // Restore culling state
-    pDC = GetDrawContext(pContext);
+    pDC                                   = GetDrawContext(pContext);
     pDC->pState->state.rastState.cullMode = oldCullMode;
- 
+
     RDTSC_END(APIDrawIndexed, numIndices * numInstances);
 }
 
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief DrawIndexed
 /// @param hContext - Handle passed back from SwrCreateContext
@@ -1401,13 +1412,11 @@
 /// @param numIndices - Number of indices to read sequentially from index buffer.
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-void SwrDrawIndexed(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numIndices,
-    uint32_t indexOffset,
-    int32_t baseVertex
-    )
+void SwrDrawIndexed(HANDLE             hContext,
+                    PRIMITIVE_TOPOLOGY topology,
+                    uint32_t           numIndices,
+                    uint32_t           indexOffset,
+                    int32_t            baseVertex)
 {
     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex);
 }
@@ -1420,48 +1429,49 @@
 /// @param numInstances - Number of instances to render.
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-void SwrDrawIndexedInstanced(
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numIndices,
-    uint32_t numInstances,
-    uint32_t indexOffset,
-    int32_t baseVertex,
-    uint32_t startInstance)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+void SwrDrawIndexedInstanced(HANDLE             hContext,
+                             PRIMITIVE_TOPOLOGY topology,
+                             uint32_t           numIndices,
+                             uint32_t           numInstances,
+                             uint32_t           indexOffset,
+                             int32_t            baseVertex,
+                             uint32_t           startInstance)
 {
-    DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
+    DrawIndexedInstance(
+        hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrInvalidateTiles
 /// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
+/// invalidate.
 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
 ///                         be hottile size-aligned.
-void SWR_API SwrInvalidateTiles(
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    const SWR_RECT& invalidateRect)
+void SWR_API SwrInvalidateTiles(HANDLE          hContext,
+                                uint32_t        attachmentMask,
+                                const SWR_RECT& invalidateRect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
-    pDC->FeWork.type = DISCARDINVALIDATETILES;
-    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
-    pDC->FeWork.desc.discardInvalidateTiles.rect = invalidateRect;
+    pDC->FeWork.desc.discardInvalidateTiles.rect           = invalidateRect;
     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_INVALID;
     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
-    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = false;
 
-    //enqueue
+    // enqueue
     QueueDraw(pContext);
 
     AR_API_EVENT(SwrInvalidateTilesEvent(pDC->drawId));
@@ -1473,30 +1483,27 @@
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
 /// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
 ///               discarded.
-void SWR_API SwrDiscardRect(
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    const SWR_RECT& rect)
+void SWR_API SwrDiscardRect(HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     // Queue a load to the hottile
-    pDC->FeWork.type = DISCARDINVALIDATETILES;
-    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.type                                       = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork                                    = ProcessDiscardInvalidateTiles;
     pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
-    pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+    pDC->FeWork.desc.discardInvalidateTiles.rect           = rect;
     pDC->FeWork.desc.discardInvalidateTiles.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState   = SWR_TILE_RESOLVED;
     pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
-    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly  = true;
 
-    //enqueue
+    // enqueue
     QueueDraw(pContext);
 
     AR_API_EVENT(SwrDiscardRectEvent(pDC->drawId));
@@ -1508,23 +1515,23 @@
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-void SwrDispatch(
-    HANDLE hContext,
-    uint32_t threadGroupCountX,
-    uint32_t threadGroupCountY,
-    uint32_t threadGroupCountZ)
+void SwrDispatch(HANDLE   hContext,
+                 uint32_t threadGroupCountX,
+                 uint32_t threadGroupCountY,
+                 uint32_t threadGroupCountZ)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     RDTSC_BEGIN(APIDispatch, pDC->drawId);
-    AR_API_EVENT(DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
-    pDC->isCompute = true;      // This is a compute context.
+    AR_API_EVENT(
+        DispatchEvent(pDC->drawId, threadGroupCountX, threadGroupCountY, threadGroupCountZ));
+    pDC->isCompute = true; // This is a compute context.
 
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
 
@@ -1533,8 +1540,8 @@
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
-    uint32_t dcIndex = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
-    pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
+    uint32_t dcIndex           = pDC->drawId % pContext->MAX_DRAWS_IN_FLIGHT;
+    pDC->pDispatch             = &pContext->pDispatchQueueArray[dcIndex];
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData, &ProcessComputeBE);
 
     QueueDispatch(pContext);
@@ -1543,30 +1550,29 @@
 
 // Deswizzles, converts and stores current contents of the hot tiles to surface
 // described by pState
-void SWR_API SwrStoreTiles(
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    SWR_TILE_STATE postStoreTileState,
-    const SWR_RECT& storeRect)
+void SWR_API SwrStoreTiles(HANDLE          hContext,
+                           uint32_t        attachmentMask,
+                           SWR_TILE_STATE  postStoreTileState,
+                           const SWR_RECT& storeRect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     RDTSC_BEGIN(APIStoreTiles, pDC->drawId);
 
-    pDC->FeWork.type = STORETILES;
-    pDC->FeWork.pfnWork = ProcessStoreTiles;
-    pDC->FeWork.desc.storeTiles.attachmentMask = attachmentMask;
+    pDC->FeWork.type                               = STORETILES;
+    pDC->FeWork.pfnWork                            = ProcessStoreTiles;
+    pDC->FeWork.desc.storeTiles.attachmentMask     = attachmentMask;
     pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState;
-    pDC->FeWork.desc.storeTiles.rect = storeRect;
+    pDC->FeWork.desc.storeTiles.rect               = storeRect;
     pDC->FeWork.desc.storeTiles.rect &= g_MaxScissorRect;
 
-    //enqueue
+    // enqueue
     QueueDraw(pContext);
 
     AR_API_EVENT(SwrStoreTilesEvent(pDC->drawId));
@@ -1583,37 +1589,36 @@
 /// @param z - depth value use for clearing depth buffer
 /// @param stencil - stencil value used for clearing stencil buffer
 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-void SWR_API SwrClearRenderTarget(
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    uint32_t renderTargetArrayIndex,
-    const float clearColor[4],
-    float z,
-    uint8_t stencil,
-    const SWR_RECT& clearRect)
+void SWR_API SwrClearRenderTarget(HANDLE          hContext,
+                                  uint32_t        attachmentMask,
+                                  uint32_t        renderTargetArrayIndex,
+                                  const float     clearColor[4],
+                                  float           z,
+                                  uint8_t         stencil,
+                                  const SWR_RECT& clearRect)
 {
     if (KNOB_TOSS_DRAW)
     {
         return;
     }
 
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     RDTSC_BEGIN(APIClearRenderTarget, pDC->drawId);
 
-    pDC->FeWork.type = CLEAR;
-    pDC->FeWork.pfnWork = ProcessClear;
+    pDC->FeWork.type            = CLEAR;
+    pDC->FeWork.pfnWork         = ProcessClear;
     pDC->FeWork.desc.clear.rect = clearRect;
     pDC->FeWork.desc.clear.rect &= g_MaxScissorRect;
-    pDC->FeWork.desc.clear.attachmentMask = attachmentMask;
+    pDC->FeWork.desc.clear.attachmentMask         = attachmentMask;
     pDC->FeWork.desc.clear.renderTargetArrayIndex = renderTargetArrayIndex;
-    pDC->FeWork.desc.clear.clearDepth = z;
-    pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0];
-    pDC->FeWork.desc.clear.clearRTColor[1] = clearColor[1];
-    pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
-    pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
-    pDC->FeWork.desc.clear.clearStencil = stencil;
+    pDC->FeWork.desc.clear.clearDepth             = z;
+    pDC->FeWork.desc.clear.clearRTColor[0]        = clearColor[0];
+    pDC->FeWork.desc.clear.clearRTColor[1]        = clearColor[1];
+    pDC->FeWork.desc.clear.clearRTColor[2]        = clearColor[2];
+    pDC->FeWork.desc.clear.clearRTColor[3]        = clearColor[3];
+    pDC->FeWork.desc.clear.clearStencil           = stencil;
 
     // enqueue draw
     QueueDraw(pContext);
@@ -1627,16 +1632,16 @@
 ///        sampler.
 ///        SWR is responsible for the allocation of the private context state.
 /// @param hContext - Handle passed back from SwrCreateContext
-VOID* SwrGetPrivateContextState(
-    HANDLE hContext)
+VOID* SwrGetPrivateContextState(HANDLE hContext)
 {
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-    DRAW_STATE* pState = pDC->pState;
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
+    DRAW_STATE*   pState   = pDC->pState;
 
     if (pState->pPrivateState == nullptr)
     {
-        pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
+        pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize,
+                                                             KNOB_SIMD_WIDTH * sizeof(float));
     }
 
     return pState->pPrivateState;
@@ -1650,13 +1655,10 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param size - Size of allocation
 /// @param align - Alignment needed for allocation.
-VOID* SwrAllocDrawContextMemory(
-    HANDLE hContext,
-    uint32_t size,
-    uint32_t align)
+VOID* SwrAllocDrawContextMemory(HANDLE hContext, uint32_t size, uint32_t align)
 {
-    SWR_CONTEXT* pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     return pDC->pState->pArena->AllocAligned(size, align);
 }
@@ -1665,12 +1667,10 @@
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SwrEnableStatsFE(
-    HANDLE hContext,
-    bool enable)
+void SwrEnableStatsFE(HANDLE hContext, bool enable)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     pDC->pState->state.enableStatsFE = enable;
 }
@@ -1679,12 +1679,10 @@
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-void SwrEnableStatsBE(
-    HANDLE hContext,
-    bool enable)
+void SwrEnableStatsBE(HANDLE hContext, bool enable)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
 
     pDC->pState->state.enableStatsBE = enable;
 }
@@ -1692,11 +1690,10 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief Mark end of frame - used for performance profiling
 /// @param hContext - Handle passed back from SwrCreateContext
-void SWR_API SwrEndFrame(
-    HANDLE hContext)
+void SWR_API SwrEndFrame(HANDLE hContext)
 {
-    SWR_CONTEXT *pContext = GetContext(hContext);
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_CONTEXT*  pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC      = GetDrawContext(pContext);
     (void)pDC; // var used
 
     RDTSC_ENDFRAME();
@@ -1716,64 +1713,60 @@
 /// @brief Initialize swr backend and memory internal tables
 void SwrInit()
 {
-    InitSimLoadTilesTable();
-    InitSimStoreTilesTable();
-    InitSimClearTilesTable();
-
     InitClearTilesTable();
     InitBackendFuncTables();
     InitRasterizerFunctions();
 }
 
-void SwrGetInterface(SWR_INTERFACE &out_funcs)
+void SwrGetInterface(SWR_INTERFACE& out_funcs)
 {
-    out_funcs.pfnSwrCreateContext = SwrCreateContext;
-    out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
-    out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
-    out_funcs.pfnSwrSaveState = SwrSaveState;
-    out_funcs.pfnSwrRestoreState = SwrRestoreState;
-    out_funcs.pfnSwrSync = SwrSync;
-    out_funcs.pfnSwrStallBE = SwrStallBE;
-    out_funcs.pfnSwrWaitForIdle = SwrWaitForIdle;
-    out_funcs.pfnSwrWaitForIdleFE = SwrWaitForIdleFE;
-    out_funcs.pfnSwrSetVertexBuffers = SwrSetVertexBuffers;
-    out_funcs.pfnSwrSetIndexBuffer = SwrSetIndexBuffer;
-    out_funcs.pfnSwrSetFetchFunc = SwrSetFetchFunc;
-    out_funcs.pfnSwrSetSoFunc = SwrSetSoFunc;
-    out_funcs.pfnSwrSetSoState = SwrSetSoState;
-    out_funcs.pfnSwrSetSoBuffers = SwrSetSoBuffers;
-    out_funcs.pfnSwrSetVertexFunc = SwrSetVertexFunc;
-    out_funcs.pfnSwrSetFrontendState = SwrSetFrontendState;
-    out_funcs.pfnSwrSetGsState = SwrSetGsState;
-    out_funcs.pfnSwrSetGsFunc = SwrSetGsFunc;
-    out_funcs.pfnSwrSetCsFunc = SwrSetCsFunc;
-    out_funcs.pfnSwrSetTsState = SwrSetTsState;
-    out_funcs.pfnSwrSetHsFunc = SwrSetHsFunc;
-    out_funcs.pfnSwrSetDsFunc = SwrSetDsFunc;
-    out_funcs.pfnSwrSetDepthStencilState = SwrSetDepthStencilState;
-    out_funcs.pfnSwrSetBackendState = SwrSetBackendState;
-    out_funcs.pfnSwrSetDepthBoundsState = SwrSetDepthBoundsState;
-    out_funcs.pfnSwrSetPixelShaderState = SwrSetPixelShaderState;
-    out_funcs.pfnSwrSetBlendState = SwrSetBlendState;
-    out_funcs.pfnSwrSetBlendFunc = SwrSetBlendFunc;
-    out_funcs.pfnSwrDraw = SwrDraw;
-    out_funcs.pfnSwrDrawInstanced = SwrDrawInstanced;
-    out_funcs.pfnSwrDrawIndexed = SwrDrawIndexed;
-    out_funcs.pfnSwrDrawIndexedInstanced = SwrDrawIndexedInstanced;
-    out_funcs.pfnSwrInvalidateTiles = SwrInvalidateTiles;
-    out_funcs.pfnSwrDiscardRect = SwrDiscardRect;
-    out_funcs.pfnSwrDispatch = SwrDispatch;
-    out_funcs.pfnSwrStoreTiles = SwrStoreTiles;
-    out_funcs.pfnSwrClearRenderTarget = SwrClearRenderTarget;
-    out_funcs.pfnSwrSetRastState = SwrSetRastState;
-    out_funcs.pfnSwrSetViewports = SwrSetViewports;
-    out_funcs.pfnSwrSetScissorRects = SwrSetScissorRects;
+    out_funcs.pfnSwrCreateContext          = SwrCreateContext;
+    out_funcs.pfnSwrDestroyContext         = SwrDestroyContext;
+    out_funcs.pfnSwrBindApiThread          = SwrBindApiThread;
+    out_funcs.pfnSwrSaveState              = SwrSaveState;
+    out_funcs.pfnSwrRestoreState           = SwrRestoreState;
+    out_funcs.pfnSwrSync                   = SwrSync;
+    out_funcs.pfnSwrStallBE                = SwrStallBE;
+    out_funcs.pfnSwrWaitForIdle            = SwrWaitForIdle;
+    out_funcs.pfnSwrWaitForIdleFE          = SwrWaitForIdleFE;
+    out_funcs.pfnSwrSetVertexBuffers       = SwrSetVertexBuffers;
+    out_funcs.pfnSwrSetIndexBuffer         = SwrSetIndexBuffer;
+    out_funcs.pfnSwrSetFetchFunc           = SwrSetFetchFunc;
+    out_funcs.pfnSwrSetSoFunc              = SwrSetSoFunc;
+    out_funcs.pfnSwrSetSoState             = SwrSetSoState;
+    out_funcs.pfnSwrSetSoBuffers           = SwrSetSoBuffers;
+    out_funcs.pfnSwrSetVertexFunc          = SwrSetVertexFunc;
+    out_funcs.pfnSwrSetFrontendState       = SwrSetFrontendState;
+    out_funcs.pfnSwrSetGsState             = SwrSetGsState;
+    out_funcs.pfnSwrSetGsFunc              = SwrSetGsFunc;
+    out_funcs.pfnSwrSetCsFunc              = SwrSetCsFunc;
+    out_funcs.pfnSwrSetTsState             = SwrSetTsState;
+    out_funcs.pfnSwrSetHsFunc              = SwrSetHsFunc;
+    out_funcs.pfnSwrSetDsFunc              = SwrSetDsFunc;
+    out_funcs.pfnSwrSetDepthStencilState   = SwrSetDepthStencilState;
+    out_funcs.pfnSwrSetBackendState        = SwrSetBackendState;
+    out_funcs.pfnSwrSetDepthBoundsState    = SwrSetDepthBoundsState;
+    out_funcs.pfnSwrSetPixelShaderState    = SwrSetPixelShaderState;
+    out_funcs.pfnSwrSetBlendState          = SwrSetBlendState;
+    out_funcs.pfnSwrSetBlendFunc           = SwrSetBlendFunc;
+    out_funcs.pfnSwrDraw                   = SwrDraw;
+    out_funcs.pfnSwrDrawInstanced          = SwrDrawInstanced;
+    out_funcs.pfnSwrDrawIndexed            = SwrDrawIndexed;
+    out_funcs.pfnSwrDrawIndexedInstanced   = SwrDrawIndexedInstanced;
+    out_funcs.pfnSwrInvalidateTiles        = SwrInvalidateTiles;
+    out_funcs.pfnSwrDiscardRect            = SwrDiscardRect;
+    out_funcs.pfnSwrDispatch               = SwrDispatch;
+    out_funcs.pfnSwrStoreTiles             = SwrStoreTiles;
+    out_funcs.pfnSwrClearRenderTarget      = SwrClearRenderTarget;
+    out_funcs.pfnSwrSetRastState           = SwrSetRastState;
+    out_funcs.pfnSwrSetViewports           = SwrSetViewports;
+    out_funcs.pfnSwrSetScissorRects        = SwrSetScissorRects;
     out_funcs.pfnSwrGetPrivateContextState = SwrGetPrivateContextState;
     out_funcs.pfnSwrAllocDrawContextMemory = SwrAllocDrawContextMemory;
-    out_funcs.pfnSwrEnableStatsFE = SwrEnableStatsFE;
-    out_funcs.pfnSwrEnableStatsBE = SwrEnableStatsBE;
-    out_funcs.pfnSwrEndFrame = SwrEndFrame;
-    out_funcs.pfnSwrInit = SwrInit;
+    out_funcs.pfnSwrEnableStatsFE          = SwrEnableStatsFE;
+    out_funcs.pfnSwrEnableStatsBE          = SwrEnableStatsBE;
+    out_funcs.pfnSwrEndFrame               = SwrEndFrame;
+    out_funcs.pfnSwrInit                   = SwrInit;
     out_funcs.pfnSwrLoadHotTile = SwrLoadHotTile;
     out_funcs.pfnSwrStoreHotTileToSurface = SwrStoreHotTileToSurface;
     out_funcs.pfnSwrStoreHotTileClear = SwrStoreHotTileClear;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index b171188..9cc5292 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file api.h
-*
-* @brief API definitions
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file api.h
+ *
+ * @brief API definitions
+ *
+ ******************************************************************************/
 
 #ifndef __SWR_API_H__
 #define __SWR_API_H__
@@ -38,7 +38,7 @@
 #include "common/formats.h"
 #include "core/state.h"
 
-typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
+typedef void(SWR_API* PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Rectangle structure
@@ -47,20 +47,15 @@
     int32_t xmin; ///< inclusive
     int32_t ymin; ///< inclusive
     int32_t xmax; ///< exclusive
-    int32_t ymax; ///< exclusive 
+    int32_t ymax; ///< exclusive
 
-    bool operator == (const SWR_RECT& rhs)
+    bool operator==(const SWR_RECT& rhs)
     {
-        return (this->ymin == rhs.ymin &&
-            this->ymax == rhs.ymax &&
-            this->xmin == rhs.xmin &&
-            this->xmax == rhs.xmax);
+        return (this->ymin == rhs.ymin && this->ymax == rhs.ymax && this->xmin == rhs.xmin &&
+                this->xmax == rhs.xmax);
     }
 
-    bool operator != (const SWR_RECT& rhs)
-    {
-        return !(*this == rhs);
-    }
+    bool operator!=(const SWR_RECT& rhs) { return !(*this == rhs); }
 
     SWR_RECT& Intersect(const SWR_RECT& other)
     {
@@ -69,8 +64,7 @@
         this->xmax = std::min(this->xmax, other.xmax);
         this->ymax = std::min(this->ymax, other.ymax);
 
-        if (xmax - xmin < 0 ||
-            ymax - ymin < 0)
+        if (xmax - xmin < 0 || ymax - ymin < 0)
         {
             // Zero area
             ymin = ymax = xmin = xmax = 0;
@@ -78,10 +72,7 @@
 
         return *this;
     }
-    SWR_RECT& operator &= (const SWR_RECT& other)
-    {
-        return Intersect(other);
-    }
+    SWR_RECT& operator&=(const SWR_RECT& other) { return Intersect(other); }
 
     SWR_RECT& Union(const SWR_RECT& other)
     {
@@ -93,10 +84,7 @@
         return *this;
     }
 
-    SWR_RECT& operator |= (const SWR_RECT& other)
-    {
-        return Union(other);
-    }
+    SWR_RECT& operator|=(const SWR_RECT& other) { return Union(other); }
 
     void Translate(int32_t x, int32_t y)
     {
@@ -115,10 +103,14 @@
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pDstHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
-    SWR_FORMAT dstFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
+typedef void(SWR_API* PFN_LOAD_TILE)(HANDLE                      hPrivateContext,
+                                     HANDLE                      hWorkerPrivateData,
+                                     SWR_FORMAT                  dstFormat,
+                                     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+                                     uint32_t                    x,
+                                     uint32_t                    y,
+                                     uint32_t                    renderTargetArrayIndex,
+                                     uint8_t*                    pDstHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Function signature for store hot tiles
@@ -128,10 +120,14 @@
 /// @param x - destination x coordinate
 /// @param y - destination y coordinate
 /// @param pSrcHotTile - pointer to the hot tile surface
-typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
-    SWR_FORMAT srcFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
+typedef void(SWR_API* PFN_STORE_TILE)(HANDLE                      hPrivateContext,
+                                      HANDLE                      hWorkerPrivateData,
+                                      SWR_FORMAT                  srcFormat,
+                                      SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+                                      uint32_t                    x,
+                                      uint32_t                    y,
+                                      uint32_t                    renderTargetArrayIndex,
+                                      uint8_t*                    pSrcHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Function signature for clearing from the hot tiles clear value
@@ -141,9 +137,13 @@
 /// @param y - destination y coordinate
 /// @param renderTargetArrayIndex - render target array offset from arrayIndex
 /// @param pClearColor - pointer to the hot tile's clear value
-typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, HANDLE hWorkerPrivateData,
-    SWR_RENDERTARGET_ATTACHMENT rtIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, const float* pClearColor);
+typedef void(SWR_API* PFN_CLEAR_TILE)(HANDLE                      hPrivateContext,
+                                      HANDLE                      hWorkerPrivateData,
+                                      SWR_RENDERTARGET_ATTACHMENT rtIndex,
+                                      uint32_t                    x,
+                                      uint32_t                    y,
+                                      uint32_t                    renderTargetArrayIndex,
+                                      const float*                pClearColor);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Callback to allow driver to update their copy of streamout write offset.
@@ -152,15 +152,15 @@
 /// @param hPrivateContext - handle to private data
 /// @param soBufferSlot - buffer slot for write offset
 /// @param soWriteOffset - update value for so write offset.
-typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
-    uint32_t soBufferSlot, uint32_t soWriteOffset);
+typedef void(SWR_API* PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE   hPrivateContext,
+                                                  uint32_t soBufferSlot,
+                                                  uint32_t soWriteOffset);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Callback to allow driver to update their copy of stats.
 /// @param hPrivateContext - handle to private data
 /// @param pStats - pointer to draw stats
-typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext,
-    const SWR_STATS* pStats);
+typedef void(SWR_API* PFN_UPDATE_STATS)(HANDLE hPrivateContext, const SWR_STATS* pStats);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Callback to allow driver to update their copy of FE stats.
@@ -169,8 +169,7 @@
 ///       to sum up the stats across all of the workers.
 /// @param hPrivateContext - handle to private data
 /// @param pStats - pointer to draw stats
-typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext,
-    const SWR_STATS_FE* pStats);
+typedef void(SWR_API* PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext, const SWR_STATS_FE* pStats);
 
 //////////////////////////////////////////////////////////////////////////
 /// BucketManager
@@ -183,14 +182,14 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_THREADING_INFO
 {
-    uint32_t    BASE_NUMA_NODE;
-    uint32_t    BASE_CORE;
-    uint32_t    BASE_THREAD;
-    uint32_t    MAX_WORKER_THREADS;
-    uint32_t    MAX_NUMA_NODES;
-    uint32_t    MAX_CORES_PER_NUMA_NODE;
-    uint32_t    MAX_THREADS_PER_CORE;
-    bool        SINGLE_THREADED;
+    uint32_t BASE_NUMA_NODE;
+    uint32_t BASE_CORE;
+    uint32_t BASE_THREAD;
+    uint32_t MAX_WORKER_THREADS;
+    uint32_t MAX_NUMA_NODES;
+    uint32_t MAX_CORES_PER_NUMA_NODE;
+    uint32_t MAX_THREADS_PER_CORE;
+    bool     SINGLE_THREADED;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -206,8 +205,8 @@
     uint32_t bindAPIThread0;        // Default is true if numAPIReservedThreads is > 0,
                                     // binds thread used in SwrCreateContext to API Reserved
                                     // thread 0
-    uint32_t numAPIThreadsPerCore;  // 0 - means use all threads per core, else clamp to this number.
-                                    // Independent of KNOB_MAX_THREADS_PER_CORE.
+    uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
+                                   // Independent of KNOB_MAX_THREADS_PER_CORE.
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -217,13 +216,13 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_WORKER_PRIVATE_STATE
 {
-    typedef void (SWR_API *PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
+    typedef void(SWR_API* PFN_WORKER_DATA)(HANDLE hWorkerPrivateData, uint32_t iWorkerNum);
 
-    size_t              perWorkerPrivateStateSize;  ///< Amount of data to allocate per-worker
-    PFN_WORKER_DATA     pfnInitWorkerData;          ///< Init function for worker data.  If null
-                                                    ///< worker data will be initialized to 0.
-    PFN_WORKER_DATA     pfnFinishWorkerData;        ///< Finish / destroy function for worker data.
-                                                    ///< Can be null.
+    size_t          perWorkerPrivateStateSize; ///< Amount of data to allocate per-worker
+    PFN_WORKER_DATA pfnInitWorkerData;         ///< Init function for worker data.  If null
+                                               ///< worker data will be initialized to 0.
+    PFN_WORKER_DATA pfnFinishWorkerData;       ///< Finish / destroy function for worker data.
+                                               ///< Can be null.
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -233,198 +232,167 @@
 {
     // External functions (e.g. sampler) need per draw context state.
     // Use SwrGetPrivateContextState() to access private state.
-    size_t                      privateStateSize;
+    size_t privateStateSize;
 
     // Optional per-worker state, can be NULL for no worker-private data
-    SWR_WORKER_PRIVATE_STATE*   pWorkerPrivateState;
+    SWR_WORKER_PRIVATE_STATE* pWorkerPrivateState;
 
     // Callback functions
-    PFN_LOAD_TILE               pfnLoadTile;
-    PFN_STORE_TILE              pfnStoreTile;
-    PFN_CLEAR_TILE              pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS            pfnUpdateStats;
-    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
+    PFN_LOAD_TILE              pfnLoadTile;
+    PFN_STORE_TILE             pfnStoreTile;
+    PFN_CLEAR_TILE             pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS           pfnUpdateStats;
+    PFN_UPDATE_STATS_FE        pfnUpdateStatsFE;
 
 
     // Pointer to rdtsc buckets mgr returned to the caller.
     // Only populated when KNOB_ENABLE_RDTSC is set
-    BucketManager*              pBucketMgr;
+    BucketManager* pBucketMgr;
 
     // Output: size required memory passed to for SwrSaveState / SwrRestoreState
-    size_t                      contextSaveSize;
+    size_t contextSaveSize;
 
     // ArchRast event manager.
-    HANDLE                      hArEventManager;
+    HANDLE hArEventManager;
 
     // Input (optional): Threading info that overrides any set KNOB values.
-    SWR_THREADING_INFO*         pThreadInfo;
+    SWR_THREADING_INFO* pThreadInfo;
 
     // Input (optional): Info for reserving API threads
-    SWR_API_THREADING_INFO*     pApiThreadInfo;
+    SWR_API_THREADING_INFO* pApiThreadInfo;
 
     // Input: if set to non-zero value, overrides KNOB value for maximum
     // number of draws in flight
-    uint32_t                    MAX_DRAWS_IN_FLIGHT;
+    uint32_t MAX_DRAWS_IN_FLIGHT;
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
-SWR_FUNC(HANDLE, SwrCreateContext,
-    SWR_CREATECONTEXT_INFO* pCreateInfo);
+SWR_FUNC(HANDLE, SwrCreateContext, SWR_CREATECONTEXT_INFO* pCreateInfo);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrDestroyContext,
-    HANDLE hContext);
+SWR_FUNC(void, SwrDestroyContext, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Bind current thread to an API reserved HW thread
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param apiThreadId - index of reserved HW thread to bind to.
-SWR_FUNC(void, SwrBindApiThread,
-    HANDLE hContext,
-    uint32_t apiThreadId);
+SWR_FUNC(void, SwrBindApiThread, HANDLE hContext, uint32_t apiThreadId);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Saves API state associated with hContext
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pOutputStateBlock - Memory block to receive API state data
 /// @param memSize - Size of memory pointed to by pOutputStateBlock
-SWR_FUNC(void, SwrSaveState,
-    HANDLE hContext,
-    void* pOutputStateBlock,
-    size_t memSize);
+SWR_FUNC(void, SwrSaveState, HANDLE hContext, void* pOutputStateBlock, size_t memSize);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Restores API state to hContext previously saved with SwrSaveState
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pStateBlock - Memory block to read API state data from
 /// @param memSize - Size of memory pointed to by pStateBlock
-SWR_FUNC(void, SwrRestoreState,
-    HANDLE hContext,
-    const void* pStateBlock,
-    size_t memSize);
+SWR_FUNC(void, SwrRestoreState, HANDLE hContext, const void* pStateBlock, size_t memSize);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Sync cmd. Executes the callback func when all rendering up to this sync
 ///        has been completed
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - pointer to callback function,
-/// @param userData - user data to pass back 
-SWR_FUNC(void, SwrSync,
-    HANDLE hContext,
-    PFN_CALLBACK_FUNC pfnFunc,
-    uint64_t userData,
-    uint64_t userData2,
-    uint64_t userData3);
+/// @param userData - user data to pass back
+SWR_FUNC(void,
+         SwrSync,
+         HANDLE            hContext,
+         PFN_CALLBACK_FUNC pfnFunc,
+         uint64_t          userData,
+         uint64_t          userData2,
+         uint64_t          userData3);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Stall cmd. Stalls the backend until all previous work has been completed.
 ///        Frontend work can continue to make progress
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrStallBE,
-    HANDLE hContext);
+SWR_FUNC(void, SwrStallBE, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Blocks until all rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrWaitForIdle,
-    HANDLE hContext);
+SWR_FUNC(void, SwrWaitForIdle, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Blocks until all FE rendering has been completed.
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrWaitForIdleFE,
-    HANDLE hContext);
+SWR_FUNC(void, SwrWaitForIdleFE, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set vertex buffer state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param numBuffers - Number of vertex buffer state descriptors.
 /// @param pVertexBuffers - Array of vertex buffer state descriptors.
-SWR_FUNC(void, SwrSetVertexBuffers,
-    HANDLE hContext,
-    uint32_t numBuffers,
-    const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
+SWR_FUNC(void,
+         SwrSetVertexBuffers,
+         HANDLE                         hContext,
+         uint32_t                       numBuffers,
+         const SWR_VERTEX_BUFFER_STATE* pVertexBuffers);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set index buffer
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pIndexBuffer - Index buffer.
-SWR_FUNC(void, SwrSetIndexBuffer,
-    HANDLE hContext,
-    const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
+SWR_FUNC(void, SwrSetIndexBuffer, HANDLE hContext, const SWR_INDEX_BUFFER_STATE* pIndexBuffer);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set fetch shader pointer.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFetchFunc - Pointer to shader.
-SWR_FUNC(void, SwrSetFetchFunc,
-    HANDLE hContext,
-    PFN_FETCH_FUNC    pfnFetchFunc);
+SWR_FUNC(void, SwrSetFetchFunc, HANDLE hContext, PFN_FETCH_FUNC pfnFetchFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set streamout shader pointer.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnSoFunc - Pointer to shader.
 /// @param streamIndex - specifies stream
-SWR_FUNC(void, SwrSetSoFunc,
-    HANDLE hContext,
-    PFN_SO_FUNC    pfnSoFunc,
-    uint32_t streamIndex);
+SWR_FUNC(void, SwrSetSoFunc, HANDLE hContext, PFN_SO_FUNC pfnSoFunc, uint32_t streamIndex);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set streamout state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pSoState - Pointer to streamout state.
-SWR_FUNC(void, SwrSetSoState,
-    HANDLE hContext,
-    SWR_STREAMOUT_STATE* pSoState);
+SWR_FUNC(void, SwrSetSoState, HANDLE hContext, SWR_STREAMOUT_STATE* pSoState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set streamout buffer state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pSoBuffer - Pointer to streamout buffer.
 /// @param slot - Slot to bind SO buffer to.
-SWR_FUNC(void, SwrSetSoBuffers,
-    HANDLE hContext,
-    SWR_STREAMOUT_BUFFER* pSoBuffer,
-    uint32_t slot);
+SWR_FUNC(void, SwrSetSoBuffers, HANDLE hContext, SWR_STREAMOUT_BUFFER* pSoBuffer, uint32_t slot);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set vertex shader pointer.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnVertexFunc - Pointer to shader.
-SWR_FUNC(void, SwrSetVertexFunc,
-    HANDLE hContext,
-    PFN_VERTEX_FUNC pfnVertexFunc);
+SWR_FUNC(void, SwrSetVertexFunc, HANDLE hContext, PFN_VERTEX_FUNC pfnVertexFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set frontend state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetFrontendState,
-    HANDLE hContext,
-    SWR_FRONTEND_STATE *pState);
+SWR_FUNC(void, SwrSetFrontendState, HANDLE hContext, SWR_FRONTEND_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set geometry shader state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetGsState,
-    HANDLE hContext,
-    SWR_GS_STATE *pState);
+SWR_FUNC(void, SwrSetGsState, HANDLE hContext, SWR_GS_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set geometry shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to geometry shader function
-SWR_FUNC(void, SwrSetGsFunc,
-    HANDLE hContext,
-    PFN_GS_FUNC pfnGsFunc);
+SWR_FUNC(void, SwrSetGsFunc, HANDLE hContext, PFN_GS_FUNC pfnGsFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set compute shader
@@ -434,88 +402,70 @@
 /// @param totalSpillFillSize - size in bytes needed for spill/fill.
 /// @param scratchSpaceSizePerInstance - size of the scratch space needed per simd instance
 /// @param numInstances - number of simd instances that are run per execution of the shader
-SWR_FUNC(void, SwrSetCsFunc,
-    HANDLE hContext,
-    PFN_CS_FUNC pfnCsFunc,
-    uint32_t totalThreadsInGroup,
-    uint32_t totalSpillFillSize,
-    uint32_t scratchSpaceSizePerInstance,
-    uint32_t numInstances
-    );
+SWR_FUNC(void,
+         SwrSetCsFunc,
+         HANDLE      hContext,
+         PFN_CS_FUNC pfnCsFunc,
+         uint32_t    totalThreadsInGroup,
+         uint32_t    totalSpillFillSize,
+         uint32_t    scratchSpaceSizePerInstance,
+         uint32_t    numInstances);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set tessellation state.
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state
-SWR_FUNC(void, SwrSetTsState,
-    HANDLE hContext,
-    SWR_TS_STATE *pState);
+SWR_FUNC(void, SwrSetTsState, HANDLE hContext, SWR_TS_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set hull shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - Pointer to shader function
-SWR_FUNC(void, SwrSetHsFunc,
-    HANDLE hContext,
-    PFN_HS_FUNC pfnFunc);
+SWR_FUNC(void, SwrSetHsFunc, HANDLE hContext, PFN_HS_FUNC pfnFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set domain shader
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pfnFunc - Pointer to shader function
-SWR_FUNC(void, SwrSetDsFunc,
-    HANDLE hContext,
-    PFN_DS_FUNC pfnFunc);
+SWR_FUNC(void, SwrSetDsFunc, HANDLE hContext, PFN_DS_FUNC pfnFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set depth stencil state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetDepthStencilState,
-    HANDLE hContext,
-    SWR_DEPTH_STENCIL_STATE *pState);
+SWR_FUNC(void, SwrSetDepthStencilState, HANDLE hContext, SWR_DEPTH_STENCIL_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set backend state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetBackendState,
-    HANDLE hContext,
-    SWR_BACKEND_STATE *pState);
+SWR_FUNC(void, SwrSetBackendState, HANDLE hContext, SWR_BACKEND_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set depth bounds state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetDepthBoundsState,
-    HANDLE hContext,
-    SWR_DEPTH_BOUNDS_STATE *pState);
+SWR_FUNC(void, SwrSetDepthBoundsState, HANDLE hContext, SWR_DEPTH_BOUNDS_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set pixel shader state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetPixelShaderState,
-    HANDLE hContext,
-    SWR_PS_STATE *pState);
+SWR_FUNC(void, SwrSetPixelShaderState, HANDLE hContext, SWR_PS_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set blend state
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pState - Pointer to state.
-SWR_FUNC(void, SwrSetBlendState,
-    HANDLE hContext,
-    SWR_BLEND_STATE *pState);
+SWR_FUNC(void, SwrSetBlendState, HANDLE hContext, SWR_BLEND_STATE* pState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Set blend function
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param renderTarget - render target index
 /// @param pfnBlendFunc - function pointer
-SWR_FUNC(void, SwrSetBlendFunc,
-    HANDLE hContext,
-    uint32_t renderTarget,
-    PFN_BLEND_JIT_FUNC pfnBlendFunc);
+SWR_FUNC(
+    void, SwrSetBlendFunc, HANDLE hContext, uint32_t renderTarget, PFN_BLEND_JIT_FUNC pfnBlendFunc);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDraw
@@ -523,11 +473,12 @@
 /// @param topology - Specifies topology for draw.
 /// @param startVertex - Specifies start vertex in vertex buffer for draw.
 /// @param primCount - Number of vertices.
-SWR_FUNC(void, SwrDraw,
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t startVertex,
-    uint32_t primCount);
+SWR_FUNC(void,
+         SwrDraw,
+         HANDLE             hContext,
+         PRIMITIVE_TOPOLOGY topology,
+         uint32_t           startVertex,
+         uint32_t           primCount);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDrawInstanced
@@ -536,14 +487,16 @@
 /// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
 /// @param numInstances - How many instances to render.
 /// @param startVertex - Specifies start vertex for draw. (vertex data)
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-SWR_FUNC(void, SwrDrawInstanced,
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numVertsPerInstance,
-    uint32_t numInstances,
-    uint32_t startVertex,
-    uint32_t startInstance);
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+SWR_FUNC(void,
+         SwrDrawInstanced,
+         HANDLE             hContext,
+         PRIMITIVE_TOPOLOGY topology,
+         uint32_t           numVertsPerInstance,
+         uint32_t           numInstances,
+         uint32_t           startVertex,
+         uint32_t           startInstance);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief DrawIndexed
@@ -552,12 +505,13 @@
 /// @param numIndices - Number of indices to read sequentially from index buffer.
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-SWR_FUNC(void, SwrDrawIndexed,
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numIndices,
-    uint32_t indexOffset,
-    int32_t baseVertex);
+SWR_FUNC(void,
+         SwrDrawIndexed,
+         HANDLE             hContext,
+         PRIMITIVE_TOPOLOGY topology,
+         uint32_t           numIndices,
+         uint32_t           indexOffset,
+         int32_t            baseVertex);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDrawIndexedInstanced
@@ -567,26 +521,30 @@
 /// @param numInstances - Number of instances to render.
 /// @param indexOffset - Starting index into index buffer.
 /// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
-/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
-SWR_FUNC(void, SwrDrawIndexedInstanced,
-    HANDLE hContext,
-    PRIMITIVE_TOPOLOGY topology,
-    uint32_t numIndices,
-    uint32_t numInstances,
-    uint32_t indexOffset,
-    int32_t baseVertex,
-    uint32_t startInstance);
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer
+/// (instanced data)
+SWR_FUNC(void,
+         SwrDrawIndexedInstanced,
+         HANDLE             hContext,
+         PRIMITIVE_TOPOLOGY topology,
+         uint32_t           numIndices,
+         uint32_t           numInstances,
+         uint32_t           indexOffset,
+         int32_t            baseVertex,
+         uint32_t           startInstance);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrInvalidateTiles
 /// @param hContext - Handle passed back from SwrCreateContext
-/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to
+/// invalidate.
 /// @param invalidateRect - The pixel-coordinate rectangle to invalidate.  This will be expanded to
 ///                         be hottile size-aligned.
-SWR_FUNC(void, SwrInvalidateTiles,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    const SWR_RECT& invalidateRect);
+SWR_FUNC(void,
+         SwrInvalidateTiles,
+         HANDLE          hContext,
+         uint32_t        attachmentMask,
+         const SWR_RECT& invalidateRect);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDiscardRect
@@ -594,10 +552,7 @@
 /// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
 /// @param rect - The pixel-coordinate rectangle to discard.  Only fully-covered hottiles will be
 ///               discarded.
-SWR_FUNC(void, SwrDiscardRect,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    const SWR_RECT& rect);
+SWR_FUNC(void, SwrDiscardRect, HANDLE hContext, uint32_t attachmentMask, const SWR_RECT& rect);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDispatch
@@ -605,27 +560,29 @@
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
 /// @param threadGroupCountY - Number of thread groups dispatched in Y direction
 /// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
-SWR_FUNC(void, SwrDispatch,
-    HANDLE hContext,
-    uint32_t threadGroupCountX,
-    uint32_t threadGroupCountY,
-    uint32_t threadGroupCountZ);
-
+SWR_FUNC(void,
+         SwrDispatch,
+         HANDLE   hContext,
+         uint32_t threadGroupCountX,
+         uint32_t threadGroupCountY,
+         uint32_t threadGroupCountZ);
 
 enum SWR_TILE_STATE
 {
-    SWR_TILE_INVALID    = 0,    // tile is in unitialized state and should be loaded with surface contents before rendering
-    SWR_TILE_DIRTY      = 2,    // tile contains newer data than surface it represents
-    SWR_TILE_RESOLVED   = 3,    // is in sync with surface it represents
+    SWR_TILE_INVALID = 0, // tile is in uninitialized state and should be loaded with surface
+                          // contents before rendering
+    SWR_TILE_DIRTY    = 2, // tile contains newer data than surface it represents
+    SWR_TILE_RESOLVED = 3, // is in sync with surface it represents
 };
 
-/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
-SWR_FUNC(void, SwrStoreTiles,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    SWR_TILE_STATE postStoreTileState,
-    const SWR_RECT& storeRect);
-
+/// @todo Add a good description for what attachments are and when and why you would use the
+/// different SWR_TILE_STATEs.
+SWR_FUNC(void,
+         SwrStoreTiles,
+         HANDLE          hContext,
+         uint32_t        attachmentMask,
+         SWR_TILE_STATE  postStoreTileState,
+         const SWR_RECT& storeRect);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrClearRenderTarget - Clear attached render targets / depth / stencil
@@ -636,22 +593,21 @@
 /// @param z - depth value use for clearing depth buffer
 /// @param stencil - stencil value used for clearing stencil buffer
 /// @param clearRect - The pixel-coordinate rectangle to clear in all cleared buffers
-SWR_FUNC(void, SwrClearRenderTarget,
-    HANDLE hContext,
-    uint32_t attachmentMask,
-    uint32_t renderTargetArrayIndex,
-    const float clearColor[4],
-    float z,
-    uint8_t stencil,
-    const SWR_RECT& clearRect);
+SWR_FUNC(void,
+         SwrClearRenderTarget,
+         HANDLE          hContext,
+         uint32_t        attachmentMask,
+         uint32_t        renderTargetArrayIndex,
+         const float     clearColor[4],
+         float           z,
+         uint8_t         stencil,
+         const SWR_RECT& clearRect);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrSetRastState
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param pRastState - New SWR_RASTSTATE used for SwrDraw* commands
-SWR_FUNC(void, SwrSetRastState,
-    HANDLE hContext,
-    const SWR_RASTSTATE *pRastState);
+SWR_FUNC(void, SwrSetRastState, HANDLE hContext, const SWR_RASTSTATE* pRastState);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrSetViewports
@@ -659,21 +615,20 @@
 /// @param numViewports - number of viewports passed in
 /// @param pViewports - Specifies extents of viewport.
 /// @param pMatrices - If not specified then SWR computes a default one.
-SWR_FUNC(void, SwrSetViewports,
-    HANDLE hContext,
-    uint32_t numViewports,
-    const SWR_VIEWPORT* pViewports,
-    const SWR_VIEWPORT_MATRICES* pMatrices);
+SWR_FUNC(void,
+         SwrSetViewports,
+         HANDLE                       hContext,
+         uint32_t                     numViewports,
+         const SWR_VIEWPORT*          pViewports,
+         const SWR_VIEWPORT_MATRICES* pMatrices);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrSetScissorRects
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param numScissors - number of scissors passed in
 /// @param pScissors - array of scissors
-SWR_FUNC(void, SwrSetScissorRects,
-    HANDLE hContext,
-    uint32_t numScissors,
-    const SWR_RECT* pScissors);
+SWR_FUNC(
+    void, SwrSetScissorRects, HANDLE hContext, uint32_t numScissors, const SWR_RECT* pScissors);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Returns a pointer to the private context state for the current
@@ -683,8 +638,7 @@
 /// @note  Client needs to resend private state prior to each draw call.
 ///        Also, SWR is responsible for the private state memory.
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void*, SwrGetPrivateContextState,
-    HANDLE hContext);
+SWR_FUNC(void*, SwrGetPrivateContextState, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Clients can use this to allocate memory for draw/dispatch
@@ -694,32 +648,24 @@
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param size - Size of allocation
 /// @param align - Alignment needed for allocation.
-SWR_FUNC(void*, SwrAllocDrawContextMemory,
-    HANDLE hContext,
-    uint32_t size,
-    uint32_t align);
+SWR_FUNC(void*, SwrAllocDrawContextMemory, HANDLE hContext, uint32_t size, uint32_t align);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-SWR_FUNC(void, SwrEnableStatsFE,
-    HANDLE hContext,
-    bool enable);
+SWR_FUNC(void, SwrEnableStatsFE, HANDLE hContext, bool enable);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Enables stats counting
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param enable - If true then counts are incremented.
-SWR_FUNC(void, SwrEnableStatsBE,
-    HANDLE hContext,
-    bool enable);
+SWR_FUNC(void, SwrEnableStatsBE, HANDLE hContext, bool enable);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Mark end of frame - used for performance profiling
 /// @param hContext - Handle passed back from SwrCreateContext
-SWR_FUNC(void, SwrEndFrame,
-    HANDLE hContext);
+SWR_FUNC(void, SwrEndFrame, HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Initialize swr backend and memory internal tables
@@ -733,13 +679,16 @@
 /// @param renderTargetIndex - Index to src render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pDstHotTile - Pointer to Hot Tile
-SWR_FUNC(void, SwrLoadHotTile,
-    HANDLE hWorkerPrivateData,
-    const SWR_SURFACE_STATE *pSrcSurface,
-    SWR_FORMAT dstFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
-    uint8_t *pDstHotTile);
+SWR_FUNC(void,
+         SwrLoadHotTile,
+         HANDLE                      hWorkerPrivateData,
+         const SWR_SURFACE_STATE*    pSrcSurface,
+         SWR_FORMAT                  dstFormat,
+         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+         uint32_t                    x,
+         uint32_t                    y,
+         uint32_t                    renderTargetArrayIndex,
+         uint8_t*                    pDstHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Deswizzles and stores a full hottile to a render surface
@@ -748,13 +697,16 @@
 /// @param renderTargetIndex - Index to destination render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pSrcHotTile - Pointer to Hot Tile
-SWR_FUNC(void, SwrStoreHotTileToSurface,
-    HANDLE hWorkerPrivateData,
-    SWR_SURFACE_STATE *pDstSurface,
-    SWR_FORMAT srcFormat,
-    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex,
-    uint8_t *pSrcHotTile);
+SWR_FUNC(void,
+         SwrStoreHotTileToSurface,
+         HANDLE                      hWorkerPrivateData,
+         SWR_SURFACE_STATE*          pDstSurface,
+         SWR_FORMAT                  srcFormat,
+         SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+         uint32_t                    x,
+         uint32_t                    y,
+         uint32_t                    renderTargetArrayIndex,
+         uint8_t*                    pSrcHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Writes clear color to every pixel of a render surface
@@ -762,72 +714,73 @@
 /// @param renderTargetIndex - Index to destination render target
 /// @param x, y - Coordinates to raster tile.
 /// @param pClearColor - Pointer to clear color
-SWR_FUNC(void, SwrStoreHotTileClear,
-         HANDLE hWorkerPrivateData,
-         SWR_SURFACE_STATE *pDstSurface,
+SWR_FUNC(void,
+         SwrStoreHotTileClear,
+         HANDLE                      hWorkerPrivateData,
+         SWR_SURFACE_STATE*          pDstSurface,
          SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-         uint32_t x,
-         uint32_t y,
-         uint32_t renderTargetArrayIndex,
-         const float* pClearColor);
+         uint32_t                    x,
+         uint32_t                    y,
+         uint32_t                    renderTargetArrayIndex,
+         const float*                pClearColor);
 
 struct SWR_INTERFACE
 {
-    PFNSwrCreateContext pfnSwrCreateContext;
-    PFNSwrDestroyContext pfnSwrDestroyContext;
-    PFNSwrBindApiThread pfnSwrBindApiThread;
-    PFNSwrSaveState pfnSwrSaveState;
-    PFNSwrRestoreState pfnSwrRestoreState;
-    PFNSwrSync pfnSwrSync;
-    PFNSwrStallBE pfnSwrStallBE;
-    PFNSwrWaitForIdle pfnSwrWaitForIdle;
-    PFNSwrWaitForIdleFE pfnSwrWaitForIdleFE;
-    PFNSwrSetVertexBuffers pfnSwrSetVertexBuffers;
-    PFNSwrSetIndexBuffer pfnSwrSetIndexBuffer;
-    PFNSwrSetFetchFunc pfnSwrSetFetchFunc;
-    PFNSwrSetSoFunc pfnSwrSetSoFunc;
-    PFNSwrSetSoState pfnSwrSetSoState;
-    PFNSwrSetSoBuffers pfnSwrSetSoBuffers;
-    PFNSwrSetVertexFunc pfnSwrSetVertexFunc;
-    PFNSwrSetFrontendState pfnSwrSetFrontendState;
-    PFNSwrSetGsState pfnSwrSetGsState;
-    PFNSwrSetGsFunc pfnSwrSetGsFunc;
-    PFNSwrSetCsFunc pfnSwrSetCsFunc;
-    PFNSwrSetTsState pfnSwrSetTsState;
-    PFNSwrSetHsFunc pfnSwrSetHsFunc;
-    PFNSwrSetDsFunc pfnSwrSetDsFunc;
-    PFNSwrSetDepthStencilState pfnSwrSetDepthStencilState;
-    PFNSwrSetBackendState pfnSwrSetBackendState;
-    PFNSwrSetDepthBoundsState pfnSwrSetDepthBoundsState;
-    PFNSwrSetPixelShaderState pfnSwrSetPixelShaderState;
-    PFNSwrSetBlendState pfnSwrSetBlendState;
-    PFNSwrSetBlendFunc pfnSwrSetBlendFunc;
-    PFNSwrDraw pfnSwrDraw;
-    PFNSwrDrawInstanced pfnSwrDrawInstanced;
-    PFNSwrDrawIndexed pfnSwrDrawIndexed;
-    PFNSwrDrawIndexedInstanced pfnSwrDrawIndexedInstanced;
-    PFNSwrInvalidateTiles pfnSwrInvalidateTiles;
-    PFNSwrDiscardRect pfnSwrDiscardRect;
-    PFNSwrDispatch pfnSwrDispatch;
-    PFNSwrStoreTiles pfnSwrStoreTiles;
-    PFNSwrClearRenderTarget pfnSwrClearRenderTarget;
-    PFNSwrSetRastState pfnSwrSetRastState;
-    PFNSwrSetViewports pfnSwrSetViewports;
-    PFNSwrSetScissorRects pfnSwrSetScissorRects;
+    PFNSwrCreateContext          pfnSwrCreateContext;
+    PFNSwrDestroyContext         pfnSwrDestroyContext;
+    PFNSwrBindApiThread          pfnSwrBindApiThread;
+    PFNSwrSaveState              pfnSwrSaveState;
+    PFNSwrRestoreState           pfnSwrRestoreState;
+    PFNSwrSync                   pfnSwrSync;
+    PFNSwrStallBE                pfnSwrStallBE;
+    PFNSwrWaitForIdle            pfnSwrWaitForIdle;
+    PFNSwrWaitForIdleFE          pfnSwrWaitForIdleFE;
+    PFNSwrSetVertexBuffers       pfnSwrSetVertexBuffers;
+    PFNSwrSetIndexBuffer         pfnSwrSetIndexBuffer;
+    PFNSwrSetFetchFunc           pfnSwrSetFetchFunc;
+    PFNSwrSetSoFunc              pfnSwrSetSoFunc;
+    PFNSwrSetSoState             pfnSwrSetSoState;
+    PFNSwrSetSoBuffers           pfnSwrSetSoBuffers;
+    PFNSwrSetVertexFunc          pfnSwrSetVertexFunc;
+    PFNSwrSetFrontendState       pfnSwrSetFrontendState;
+    PFNSwrSetGsState             pfnSwrSetGsState;
+    PFNSwrSetGsFunc              pfnSwrSetGsFunc;
+    PFNSwrSetCsFunc              pfnSwrSetCsFunc;
+    PFNSwrSetTsState             pfnSwrSetTsState;
+    PFNSwrSetHsFunc              pfnSwrSetHsFunc;
+    PFNSwrSetDsFunc              pfnSwrSetDsFunc;
+    PFNSwrSetDepthStencilState   pfnSwrSetDepthStencilState;
+    PFNSwrSetBackendState        pfnSwrSetBackendState;
+    PFNSwrSetDepthBoundsState    pfnSwrSetDepthBoundsState;
+    PFNSwrSetPixelShaderState    pfnSwrSetPixelShaderState;
+    PFNSwrSetBlendState          pfnSwrSetBlendState;
+    PFNSwrSetBlendFunc           pfnSwrSetBlendFunc;
+    PFNSwrDraw                   pfnSwrDraw;
+    PFNSwrDrawInstanced          pfnSwrDrawInstanced;
+    PFNSwrDrawIndexed            pfnSwrDrawIndexed;
+    PFNSwrDrawIndexedInstanced   pfnSwrDrawIndexedInstanced;
+    PFNSwrInvalidateTiles        pfnSwrInvalidateTiles;
+    PFNSwrDiscardRect            pfnSwrDiscardRect;
+    PFNSwrDispatch               pfnSwrDispatch;
+    PFNSwrStoreTiles             pfnSwrStoreTiles;
+    PFNSwrClearRenderTarget      pfnSwrClearRenderTarget;
+    PFNSwrSetRastState           pfnSwrSetRastState;
+    PFNSwrSetViewports           pfnSwrSetViewports;
+    PFNSwrSetScissorRects        pfnSwrSetScissorRects;
     PFNSwrGetPrivateContextState pfnSwrGetPrivateContextState;
     PFNSwrAllocDrawContextMemory pfnSwrAllocDrawContextMemory;
-    PFNSwrEnableStatsFE pfnSwrEnableStatsFE;
-    PFNSwrEnableStatsBE pfnSwrEnableStatsBE;
-    PFNSwrEndFrame pfnSwrEndFrame;
-    PFNSwrInit pfnSwrInit;
-    PFNSwrLoadHotTile pfnSwrLoadHotTile;
+    PFNSwrEnableStatsFE          pfnSwrEnableStatsFE;
+    PFNSwrEnableStatsBE          pfnSwrEnableStatsBE;
+    PFNSwrEndFrame               pfnSwrEndFrame;
+    PFNSwrInit                   pfnSwrInit;
+    PFNSwrLoadHotTile           pfnSwrLoadHotTile;
     PFNSwrStoreHotTileToSurface pfnSwrStoreHotTileToSurface;
-    PFNSwrStoreHotTileClear pfnSwrStoreHotTileClear;
+    PFNSwrStoreHotTileClear     pfnSwrStoreHotTileClear;
 };
 
 extern "C" {
-typedef void (SWR_API * PFNSwrGetInterface)(SWR_INTERFACE &out_funcs);
-SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE &out_funcs);
+typedef void(SWR_API* PFNSwrGetInterface)(SWR_INTERFACE& out_funcs);
+SWR_VISIBLE void SWR_API SwrGetInterface(SWR_INTERFACE& out_funcs);
 }
 
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 1db0972..a3cfdb4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -1,35 +1,35 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.h
-*
-* @brief Arena memory manager
-*        The arena is convenient and fast for managing allocations for any of
-*        our allocations that are associated with operations and can all be freed
-*        once when their operation has completed. Allocations are cheap since
-*        most of the time its simply an increment of an offset. Also, no need to
-*        free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file arena.h
+ *
+ * @brief Arena memory manager
+ *        The arena is convenient and fast for managing allocations for any of
+ *        our allocations that are associated with operations and can all be freed
+ *        once when their operation has completed. Allocations are cheap since
+ *        most of the time it's simply an increment of an offset. Also, no need to
+ *        free individual allocations. All of the arena memory can be freed at once.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <mutex>
@@ -42,10 +42,9 @@
 struct ArenaBlock
 {
     size_t      blockSize = 0;
-    ArenaBlock* pNext = nullptr;
+    ArenaBlock* pNext     = nullptr;
 };
-static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
-    "Increase BLOCK_ALIGN size");
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
 
 class DefaultAllocator
 {
@@ -55,7 +54,7 @@
         SWR_ASSUME_ASSERT(size >= sizeof(ArenaBlock));
 
         ArenaBlock* p = new (AlignedMalloc(size, align)) ArenaBlock();
-        p->blockSize = size;
+        p->blockSize  = size;
         return p;
     }
 
@@ -70,7 +69,7 @@
 };
 
 // Caching Allocator for Arena
-template<uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
+template <uint32_t NumBucketsT = 8, uint32_t StartBucketBitT = 12>
 struct CachingAllocatorT : DefaultAllocator
 {
     ArenaBlock* AllocateAligned(size_t size, size_t align)
@@ -83,8 +82,8 @@
         {
             // search cached blocks
             std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
-            ArenaBlock* pBlock = SearchBlocks(pPrevBlock, size, align);
+            ArenaBlock*                 pPrevBlock = &m_cachedBlocks[bucket];
+            ArenaBlock*                 pBlock     = SearchBlocks(pPrevBlock, size, align);
 
             if (pBlock)
             {
@@ -97,7 +96,7 @@
             else
             {
                 pPrevBlock = &m_oldCachedBlocks[bucket];
-                pBlock = SearchBlocks(pPrevBlock, size, align);
+                pBlock     = SearchBlocks(pPrevBlock, size, align);
 
                 if (pBlock)
                 {
@@ -113,7 +112,7 @@
             {
                 SWR_ASSUME_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
                 pPrevBlock->pNext = pBlock->pNext;
-                pBlock->pNext = nullptr;
+                pBlock->pNext     = nullptr;
 
                 return pBlock;
             }
@@ -150,7 +149,10 @@
 
     void FreeOldBlocks()
     {
-        if (!m_cachedSize) { return; }
+        if (!m_cachedSize)
+        {
+            return;
+        }
         std::lock_guard<std::mutex> l(m_mutex);
 
         bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
@@ -169,7 +171,7 @@
                     pBlock = pNext;
                 }
                 m_oldCachedBlocks[i].pNext = nullptr;
-                m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
+                m_pOldLastCachedBlocks[i]  = &m_oldCachedBlocks[i];
             }
 
             if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
@@ -179,8 +181,8 @@
                     // We know that all blocks are the same size.
                     // Just move the list over.
                     m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
-                    m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
-                    m_cachedBlocks[i].pNext = nullptr;
+                    m_oldCachedBlocks[i].pNext    = m_cachedBlocks[i].pNext;
+                    m_cachedBlocks[i].pNext       = nullptr;
                     if (m_pOldLastCachedBlocks[i]->pNext)
                     {
                         m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
@@ -195,13 +197,13 @@
                     while (pBlock)
                     {
                         ArenaBlock* pNext = pBlock->pNext;
-                        pBlock->pNext = nullptr;
+                        pBlock->pNext     = nullptr;
                         m_cachedSize -= pBlock->blockSize;
                         InsertCachedBlock<true>(i, pBlock);
                         pBlock = pNext;
                     }
 
-                    m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+                    m_pLastCachedBlocks[i]  = &m_cachedBlocks[i];
                     m_cachedBlocks[i].pNext = nullptr;
                 }
             }
@@ -215,7 +217,7 @@
     {
         for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
         {
-            m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+            m_pLastCachedBlocks[i]    = &m_cachedBlocks[i];
             m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
         }
     }
@@ -260,7 +262,8 @@
     {
         SWR_ASSUME_ASSERT(bucketId < CACHE_NUM_BUCKETS);
 
-        ArenaBlock* pPrevBlock = OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
+        ArenaBlock* pPrevBlock =
+            OldBlockT ? &m_oldCachedBlocks[bucketId] : &m_cachedBlocks[bucketId];
         ArenaBlock* pBlock = pPrevBlock->pNext;
 
         while (pBlock)
@@ -271,13 +274,13 @@
                 break;
             }
             pPrevBlock = pBlock;
-            pBlock = pBlock->pNext;
+            pBlock     = pBlock->pNext;
         }
 
         // Insert into list
         SWR_ASSUME_ASSERT(pPrevBlock);
         pPrevBlock->pNext = pNewBlock;
-        pNewBlock->pNext = pBlock;
+        pNewBlock->pNext  = pBlock;
 
         if (OldBlockT)
         {
@@ -301,9 +304,9 @@
 
     static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
     {
-        ArenaBlock* pBlock = pPrevBlock->pNext;
+        ArenaBlock* pBlock          = pPrevBlock->pNext;
         ArenaBlock* pPotentialBlock = nullptr;
-        ArenaBlock* pPotentialPrev = nullptr;
+        ArenaBlock* pPotentialPrev  = nullptr;
 
         while (pBlock)
         {
@@ -320,26 +323,26 @@
                     // We could use this as it is larger than we wanted, but
                     // continue to search for a better match
                     pPotentialBlock = pBlock;
-                    pPotentialPrev = pPrevBlock;
+                    pPotentialPrev  = pPrevBlock;
                 }
             }
             else
             {
                 // Blocks are sorted by size (biggest first)
-                // So, if we get here, there are no blocks 
+                // So, if we get here, there are no blocks
                 // large enough, fall through to allocation.
                 pBlock = nullptr;
                 break;
             }
 
             pPrevBlock = pBlock;
-            pBlock = pBlock->pNext;
+            pBlock     = pBlock->pNext;
         }
 
         if (!pBlock)
         {
             // Couldn't find an exact match, use next biggest size
-            pBlock = pPotentialBlock;
+            pBlock     = pPotentialBlock;
             pPrevBlock = pPotentialPrev;
         }
 
@@ -347,35 +350,32 @@
     }
 
     // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
-    static const uint32_t   CACHE_NUM_BUCKETS       = NumBucketsT;
-    static const uint32_t   CACHE_START_BUCKET_BIT  = StartBucketBitT;
-    static const size_t     MAX_UNUSED_SIZE         = sizeof(MEGABYTE);
+    static const uint32_t CACHE_NUM_BUCKETS      = NumBucketsT;
+    static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
+    static const size_t   MAX_UNUSED_SIZE        = sizeof(MEGABYTE);
 
-    ArenaBlock              m_cachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock*             m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock              m_oldCachedBlocks[CACHE_NUM_BUCKETS];
-    ArenaBlock*             m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
-    std::mutex              m_mutex;
+    ArenaBlock  m_cachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock  m_oldCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
+    std::mutex  m_mutex;
 
-    size_t                  m_totalAllocated = 0;
+    size_t m_totalAllocated = 0;
 
-    size_t                  m_cachedSize = 0;
-    size_t                  m_oldCachedSize = 0;
+    size_t m_cachedSize    = 0;
+    size_t m_oldCachedSize = 0;
 };
 typedef CachingAllocatorT<> CachingAllocator;
 
-template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
+template <typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
 class TArena
 {
 public:
-    TArena(T& in_allocator)  : m_allocator(in_allocator) {}
-    TArena()                 : m_allocator(m_defAllocator) {}
-    ~TArena()
-    {
-        Reset(true);
-    }
+    TArena(T& in_allocator) : m_allocator(in_allocator) {}
+    TArena() : m_allocator(m_defAllocator) {}
+    ~TArena() { Reset(true); }
 
-    void* AllocAligned(size_t size, size_t  align)
+    void* AllocAligned(size_t size, size_t align)
     {
         if (0 == size)
         {
@@ -387,12 +387,12 @@
         if (m_pCurBlock)
         {
             ArenaBlock* pCurBlock = m_pCurBlock;
-            size_t offset = AlignUp(m_offset, align);
+            size_t      offset    = AlignUp(m_offset, align);
 
             if ((offset + size) <= pCurBlock->blockSize)
             {
                 void* pMem = PtrAdd(pCurBlock, offset);
-                m_offset = offset + size;
+                m_offset   = offset + size;
                 return pMem;
             }
 
@@ -401,17 +401,18 @@
         }
 
         static const size_t ArenaBlockSize = BlockSizeT;
-        size_t blockSize = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
+        size_t              blockSize      = std::max(size + ARENA_BLOCK_ALIGN, ArenaBlockSize);
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
 
-        ArenaBlock* pNewBlock = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        ArenaBlock* pNewBlock = m_allocator.AllocateAligned(
+            blockSize, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
         SWR_ASSERT(pNewBlock != nullptr);
 
         if (pNewBlock != nullptr)
         {
-            m_offset = ARENA_BLOCK_ALIGN;
+            m_offset         = ARENA_BLOCK_ALIGN;
             pNewBlock->pNext = m_pCurBlock;
 
             m_pCurBlock = pNewBlock;
@@ -420,10 +421,7 @@
         return AllocAligned(size, align);
     }
 
-    void* Alloc(size_t  size)
-    {
-        return AllocAligned(size, 1);
-    }
+    void* Alloc(size_t size) { return AllocAligned(size, 1); }
 
     void* AllocAlignedSync(size_t size, size_t align)
     {
@@ -453,12 +451,12 @@
 
         if (m_pCurBlock)
         {
-            ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
-            m_pCurBlock->pNext = nullptr;
+            ArenaBlock* pUsedBlocks = m_pCurBlock->pNext;
+            m_pCurBlock->pNext      = nullptr;
             while (pUsedBlocks)
             {
                 ArenaBlock* pBlock = pUsedBlocks;
-                pUsedBlocks = pBlock->pNext;
+                pUsedBlocks        = pBlock->pNext;
 
                 m_allocator.Free(pBlock);
             }
@@ -473,20 +471,20 @@
 
     bool IsEmpty()
     {
-        return (m_pCurBlock == nullptr) || (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
+        return (m_pCurBlock == nullptr) ||
+               (m_offset == ARENA_BLOCK_ALIGN && m_pCurBlock->pNext == nullptr);
     }
 
 private:
-
-    ArenaBlock*         m_pCurBlock = nullptr;
-    size_t              m_offset    = ARENA_BLOCK_ALIGN;
+    ArenaBlock* m_pCurBlock = nullptr;
+    size_t      m_offset    = ARENA_BLOCK_ALIGN;
 
     /// @note Mutex is only used by sync allocation functions.
-    std::mutex          m_mutex;
+    std::mutex m_mutex;
 
-    DefaultAllocator    m_defAllocator;
-    T&                  m_allocator;
+    DefaultAllocator m_defAllocator;
+    T&               m_allocator;
 };
 
-using StdArena      = TArena<DefaultAllocator>;
-using CachingArena  = TArena<CachingAllocator>;
+using StdArena     = TArena<DefaultAllocator>;
+using CachingArena = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 5ac9ceb..8f8dbcf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.cpp
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend.cpp
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 
 #include <smmintrin.h>
 
@@ -44,9 +44,13 @@
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
+void ProcessComputeBE(DRAW_CONTEXT* pDC,
+                      uint32_t      workerId,
+                      uint32_t      threadGroupId,
+                      void*&        pSpillFillBuffer,
+                      void*&        pScratchSpace)
 {
-    SWR_CONTEXT *pContext = pDC->pContext;
+    SWR_CONTEXT* pContext = pDC->pContext;
 
     RDTSC_BEGIN(BEDispatch, pDC->drawId);
 
@@ -59,8 +63,9 @@
     {
         pSpillFillBuffer = pDC->pArena->AllocAlignedSync(spillFillSize, KNOB_SIMD_BYTES);
     }
-    
-    size_t scratchSpaceSize = pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
+
+    size_t scratchSpaceSize =
+        pDC->pState->state.scratchSpaceSize * pDC->pState->state.scratchSpaceNumInstances;
     if (scratchSpaceSize && pScratchSpace == nullptr)
     {
         pScratchSpace = pDC->pArena->AllocAlignedSync(scratchSpaceSize, KNOB_SIMD_BYTES);
@@ -68,17 +73,19 @@
 
     const API_STATE& state = GetApiState(pDC);
 
-    SWR_CS_CONTEXT csContext{ 0 };
-    csContext.tileCounter = threadGroupId;
-    csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
-    csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
-    csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
-    csContext.pTGSM = pContext->ppScratch[workerId];
-    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
-    csContext.pScratchSpace = (uint8_t*)pScratchSpace;
+    SWR_CS_CONTEXT csContext{0};
+    csContext.tileCounter         = threadGroupId;
+    csContext.dispatchDims[0]     = pTaskData->threadGroupCountX;
+    csContext.dispatchDims[1]     = pTaskData->threadGroupCountY;
+    csContext.dispatchDims[2]     = pTaskData->threadGroupCountZ;
+    csContext.pTGSM               = pContext->ppScratch[workerId];
+    csContext.pSpillFillBuffer    = (uint8_t*)pSpillFillBuffer;
+    csContext.pScratchSpace       = (uint8_t*)pScratchSpace;
     csContext.scratchSpacePerSimd = pDC->pState->state.scratchSpaceSize;
 
-    state.pfnCsFunc(GetPrivateState(pDC), pContext->threadPool.pThreadData[workerId].pWorkerPrivateData, &csContext);
+    state.pfnCsFunc(GetPrivateState(pDC),
+                    pContext->threadPool.pThreadData[workerId].pWorkerPrivateData,
+                    &csContext);
 
     UPDATE_STAT_BE(CsInvocations, state.totalThreadsInGroup);
     AR_EVENT(CSStats(csContext.stats.numInstExecuted));
@@ -91,23 +98,26 @@
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
 {
     // Dummy function
 }
 
-void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroTile, x, y);
     SWR_ASSERT(x == 0 && y == 0);
 }
 
-void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, STORE_TILES_DESC* pDesc, 
-    SWR_RENDERTARGET_ATTACHMENT attachment)
+void ProcessStoreTileBE(DRAW_CONTEXT*               pDC,
+                        uint32_t                    workerId,
+                        uint32_t                    macroTile,
+                        STORE_TILES_DESC*           pDesc,
+                        SWR_RENDERTARGET_ATTACHMENT attachment)
 {
-    SWR_CONTEXT *pContext = pDC->pContext;
-    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+    SWR_CONTEXT* pContext           = pDC->pContext;
+    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     RDTSC_BEGIN(BEStoreTiles, pDC->drawId);
 
@@ -121,17 +131,27 @@
     case SWR_ATTACHMENT_COLOR4:
     case SWR_ATTACHMENT_COLOR5:
     case SWR_ATTACHMENT_COLOR6:
-    case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
-    case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break;
-    case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break;
-    default: SWR_INVALID("Unknown attachment: %d", attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break;
+    case SWR_ATTACHMENT_COLOR7:
+        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
+        break;
+    case SWR_ATTACHMENT_DEPTH:
+        srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT;
+        break;
+    case SWR_ATTACHMENT_STENCIL:
+        srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT;
+        break;
+    default:
+        SWR_INVALID("Unknown attachment: %d", attachment);
+        srcFormat = KNOB_COLOR_HOT_TILE_FORMAT;
+        break;
     }
 
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroTile, x, y);
 
     // Only need to store the hottile if it's been rendered to...
-    HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
+    HOTTILE* pHotTile =
+        pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, attachment, false);
     if (pHotTile)
     {
         // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
@@ -140,22 +160,35 @@
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[srcFormat];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
-            pfnClearTiles(pDC, hWorkerPrivateData, attachment, macroTile, pHotTile->renderTargetArrayIndex, pHotTile->clearData, pDesc->rect);
+            pfnClearTiles(pDC,
+                          hWorkerPrivateData,
+                          attachment,
+                          macroTile,
+                          pHotTile->renderTargetArrayIndex,
+                          pHotTile->clearData,
+                          pDesc->rect);
         }
 
-        if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
+        if (pHotTile->state == HOTTILE_DIRTY ||
+            pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY)
         {
             int32_t destX = KNOB_MACROTILE_X_DIM * x;
             int32_t destY = KNOB_MACROTILE_Y_DIM * y;
 
-            pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, srcFormat,
-                attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnStoreTile(GetPrivateState(pDC),
+                                   hWorkerPrivateData,
+                                   srcFormat,
+                                   attachment,
+                                   destX,
+                                   destY,
+                                   pHotTile->renderTargetArrayIndex,
+                                   pHotTile->pBuffer);
         }
-        
 
         if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED)
         {
-            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY && pHotTile->state == HOTTILE_RESOLVED))
+            if (!(pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY &&
+                  pHotTile->state == HOTTILE_RESOLVED))
             {
                 pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState;
             }
@@ -164,12 +197,12 @@
     RDTSC_END(BEStoreTiles, 1);
 }
 
-void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
-    STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData;
+    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pData;
 
-    unsigned long rt = 0;
-    uint32_t mask = pDesc->attachmentMask;
+    unsigned long rt   = 0;
+    uint32_t      mask = pDesc->attachmentMask;
     while (_BitScanForward(&rt, mask))
     {
         mask &= ~(1 << rt);
@@ -177,10 +210,13 @@
     }
 }
 
-void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
+                                     uint32_t      workerId,
+                                     uint32_t      macroTile,
+                                     void*         pData)
 {
-    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
-    SWR_CONTEXT *pContext = pDC->pContext;
+    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pData;
+    SWR_CONTEXT*                   pContext = pDC->pContext;
 
     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
 
@@ -188,8 +224,13 @@
     {
         if (pDesc->attachmentMask & (1 << i))
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
-                pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
+            HOTTILE* pHotTile =
+                pContext->pHotTileMgr->GetHotTileNoLoad(pContext,
+                                                        pDC,
+                                                        macroTile,
+                                                        (SWR_RENDERTARGET_ATTACHMENT)i,
+                                                        pDesc->createNewTiles,
+                                                        numSamples);
             if (pHotTile)
             {
                 pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
@@ -198,14 +239,19 @@
     }
 }
 
-template<uint32_t sampleCountT>
-void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+template <uint32_t sampleCountT>
+void BackendNullPS(DRAW_CONTEXT*        pDC,
+                   uint32_t             workerId,
+                   uint32_t             x,
+                   uint32_t             y,
+                   SWR_TRIANGLE_DESC&   work,
+                   RenderOutputBuffers& renderBuffers)
 {
     RDTSC_BEGIN(BENullBackend, pDC->drawId);
     ///@todo: handle center multisample pattern
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE &state = GetApiState(pDC);
+    const API_STATE& state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
@@ -220,7 +266,7 @@
 
     simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
-    const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
+    const simdscalar           dy        = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
@@ -231,8 +277,8 @@
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
             // iterate over active samples
-            unsigned long sample = 0;
-            uint32_t sampleMask = state.blendState.sampleMask;
+            unsigned long sample     = 0;
+            uint32_t      sampleMask = state.blendState.sampleMask;
             while (_BitScanForward(&sample, sampleMask))
             {
                 sampleMask &= ~(1 << sample);
@@ -242,14 +288,16 @@
                 if (coverageMask)
                 {
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 
                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                     {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
+                                      "Unsupported depth hot tile format");
 
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+                        const simdscalar z =
+                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
 
                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
@@ -266,7 +314,11 @@
                     CalcSampleBarycentrics(coeffs, psContext);
 
                     // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = vplaneps(coeffs.vZa,
+                                            coeffs.vZb,
+                                            coeffs.vZc,
+                                            psContext.vI.sample,
+                                            psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_END(BEBarycentric, 0);
@@ -274,21 +326,39 @@
                     // interpolate user clip distance if available
                     if (state.backendState.clipDistanceMask)
                     {
-                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
+                                                             work.pUserClipBuffer,
+                                                             psContext.vI.sample,
+                                                             psContext.vJ.sample);
                     }
 
-                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
+                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
                     simdscalar stencilPassMask = vCoverageMask;
 
                     RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                        psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                    simdscalar depthPassMask = DepthStencilTest(&state,
+                                                                work.triFlags.frontFacing,
+                                                                work.triFlags.viewportIndex,
+                                                                psContext.vZ,
+                                                                pDepthSample,
+                                                                vCoverageMask,
+                                                                pStencilSample,
+                                                                &stencilPassMask);
+                    AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask),
+                                                         _simd_movemask_ps(stencilPassMask),
+                                                         _simd_movemask_ps(vCoverageMask)));
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                      &state.depthStencilState,
+                                      work.triFlags.frontFacing,
+                                      psContext.vZ,
+                                      pDepthSample,
+                                      depthPassMask,
+                                      vCoverageMask,
+                                      pStencilSample,
+                                      stencilPassMask);
                     RDTSC_END(BEEarlyDepthTest, 0);
 
-                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                     uint32_t statCount = _mm_popcnt_u32(statMask);
                     UPDATE_STAT_BE(DepthPassCount, statCount);
                 }
@@ -299,7 +369,8 @@
             }
 
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer +=
+                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
             vXSamplePosUL = _simd_add_ps(vXSamplePosUL, dx);
         }
@@ -310,34 +381,30 @@
     RDTSC_END(BENullBackend, 0);
 }
 
-PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS] = {};
+PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS] = {};
 PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
-                                     [2] // centroid
-                                     [2] // canEarlyZ
-                                     = {};
-PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                       [2] // isCenterPattern
-                                       [SWR_INPUT_COVERAGE_COUNT]
-                                       [2] // centroid
-                                       [2] // forcedSampleCount
-                                       [2] // canEarlyZ
-                                       = {};
-PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                        [SWR_INPUT_COVERAGE_COUNT]
+PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2] // centroid
+                                     [2]                           // canEarlyZ
+    = {};
+PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
+                                       [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
+                                       [2]                             // forcedSampleCount
+                                       [2]                             // canEarlyZ
+    = {};
+PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT]
                                         [2] // centroid
                                         [2] // canEarlyZ
-                                        = {};
+    = {};
 
 void InitBackendFuncTables()
-{    
+{
     InitBackendPixelRate();
     InitBackendSingleFuncTable(gBackendSingleSample);
     InitBackendSampleFuncTable(gBackendSampleRateTable);
 
-    gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS < SWR_MULTISAMPLE_1X > ;
-    gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS < SWR_MULTISAMPLE_2X > ;
-    gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS < SWR_MULTISAMPLE_4X > ;
-    gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS < SWR_MULTISAMPLE_8X > ;
-    gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS < SWR_MULTISAMPLE_16X > ;
+    gBackendNullPs[SWR_MULTISAMPLE_1X]  = &BackendNullPS<SWR_MULTISAMPLE_1X>;
+    gBackendNullPs[SWR_MULTISAMPLE_2X]  = &BackendNullPS<SWR_MULTISAMPLE_2X>;
+    gBackendNullPs[SWR_MULTISAMPLE_4X]  = &BackendNullPS<SWR_MULTISAMPLE_4X>;
+    gBackendNullPs[SWR_MULTISAMPLE_8X]  = &BackendNullPS<SWR_MULTISAMPLE_8X>;
+    gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 7a842fe..79d9007 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.h
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend.h
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/os.h"
@@ -34,29 +34,37 @@
 #include "depthstencil.h"
 #include "rdtsc_core.h"
 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace);
-void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
-void ProcessStoreTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessShutdownBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
+void ProcessComputeBE(DRAW_CONTEXT* pDC,
+                      uint32_t      workerId,
+                      uint32_t      threadGroupId,
+                      void*&        pSpillFillBuffer,
+                      void*&        pScratchSpace);
+void ProcessSyncBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
+void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
+void ProcessStoreTilesBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT* pDC,
+                                     uint32_t      workerId,
+                                     uint32_t      macroTile,
+                                     void*         pData);
+void ProcessShutdownBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData);
 
-typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, HANDLE hWorkerData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect);
+typedef void (*PFN_CLEAR_TILES)(DRAW_CONTEXT*,
+                                HANDLE                      hWorkerData,
+                                SWR_RENDERTARGET_ATTACHMENT rt,
+                                uint32_t,
+                                uint32_t,
+                                DWORD[4],
+                                const SWR_RECT& rect);
 
-extern PFN_CLEAR_TILES gClearTilesTable[NUM_SWR_FORMATS];
+extern PFN_CLEAR_TILES  gClearTilesTable[NUM_SWR_FORMATS];
 extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT];
-extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT]
-                                     [2]  // centroid
-                                     [2]; // canEarlyZ
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                       [2] // isCenterPattern
-                                       [SWR_INPUT_COVERAGE_COUNT]
-                                       [2] // centroid
-                                       [2] // forcedSampleCount
-                                       [2] // canEarlyZ
-                                       ;
+extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2]     // centroid
+                                            [2];                              // canEarlyZ
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2] // isCenterPattern
+                                              [SWR_INPUT_COVERAGE_COUNT][2]   // centroid
+                                              [2]                             // forcedSampleCount
+                                              [2]                             // canEarlyZ
+    ;
 extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT]
-                                        [SWR_INPUT_COVERAGE_COUNT]
-                                        [2]  // centroid
-                                        [2]; // canEarlyZ
-
+                                               [SWR_INPUT_COVERAGE_COUNT][2] // centroid
+                                               [2];                          // canEarlyZ
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
index af031f9..0b14ca0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_clear.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.cpp
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend_clear.cpp
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 
 #include <smmintrin.h>
 
@@ -37,17 +37,17 @@
 
 #include <algorithm>
 
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
+template <SWR_FORMAT format>
+void ClearRasterTile(uint8_t* pTileBuffer, simdvector& value)
 {
-    auto lambda = [&](int32_t comp)
-    {
+    auto lambda = [&](int32_t comp) {
         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
 
         pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
     };
 
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
+    const uint32_t numIter =
+        (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM);
 
     for (uint32_t i = 0; i < numIter; ++i)
     {
@@ -56,17 +56,17 @@
 }
 
 #if USE_8x2_TILE_BACKEND
-template<SWR_FORMAT format>
-void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
+template <SWR_FORMAT format>
+void ClearRasterTile(uint8_t* pTileBuffer, simd16vector& value)
 {
-    auto lambda = [&](int32_t comp)
-    {
+    auto lambda = [&](int32_t comp) {
         FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);
 
         pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
     };
 
-    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
+    const uint32_t numIter =
+        (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);
 
     for (uint32_t i = 0; i < numIter; ++i)
     {
@@ -75,8 +75,14 @@
 }
 
 #endif
-template<SWR_FORMAT format>
-INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, HANDLE hWorkerPrivateData, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, uint32_t renderTargetArrayIndex, DWORD clear[4], const SWR_RECT& rect)
+template <SWR_FORMAT format>
+INLINE void ClearMacroTile(DRAW_CONTEXT*               pDC,
+                           HANDLE                      hWorkerPrivateData,
+                           SWR_RENDERTARGET_ATTACHMENT rt,
+                           uint32_t                    macroTile,
+                           uint32_t                    renderTargetArrayIndex,
+                           DWORD                       clear[4],
+                           const SWR_RECT&             rect)
 {
     // convert clear color to hottile format
     // clear color is in RGBA float/uint32
@@ -91,7 +97,7 @@
             vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<format>::fromFloat(comp)));
             vComp = _simd16_castsi_ps(_simd16_cvtps_epi32(vComp));
         }
-        vComp = FormatTraits<format>::pack(comp, vComp);
+        vComp                                         = FormatTraits<format>::pack(comp, vComp);
         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
     }
 
@@ -106,7 +112,7 @@
             vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp)));
             vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
         }
-        vComp = FormatTraits<format>::pack(comp, vComp);
+        vComp                                         = FormatTraits<format>::pack(comp, vComp);
         vClear.v[FormatTraits<format>::swizzle(comp)] = vComp;
     }
 
@@ -115,8 +121,7 @@
     MacroTileMgr::getTileIndices(macroTile, tileX, tileY);
 
     // Init to full macrotile
-    SWR_RECT clearTile =
-    {
+    SWR_RECT clearTile = {
         KNOB_MACROTILE_X_DIM * int32_t(tileX),
         KNOB_MACROTILE_Y_DIM * int32_t(tileY),
         KNOB_MACROTILE_X_DIM * int32_t(tileX + 1),
@@ -127,7 +132,8 @@
     clearTile &= rect;
 
     // translate to local hottile origin
-    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM, -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
+    clearTile.Translate(-int32_t(tileX) * KNOB_MACROTILE_X_DIM,
+                        -int32_t(tileY) * KNOB_MACROTILE_Y_DIM);
 
     // Make maximums inclusive (needed for convert to raster tiles)
     clearTile.xmax -= 1;
@@ -141,14 +147,29 @@
 
     const int32_t numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
     // compute steps between raster tile samples / raster tiles / macro tile rows
-    const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
-    const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
+    const uint32_t rasterTileSampleStep =
+        KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8;
+    const uint32_t rasterTileStep =
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples;
     const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep;
-    const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
+    const uint32_t pitch            = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8);
 
-    HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, hWorkerPrivateData, macroTile, rt, true, numSamples, renderTargetArrayIndex);
-    uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, clearTile.xmin, clearTile.ymin)) * numSamples;
-    uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
+    HOTTILE* pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext,
+                                                               pDC,
+                                                               hWorkerPrivateData,
+                                                               macroTile,
+                                                               rt,
+                                                               true,
+                                                               numSamples,
+                                                               renderTargetArrayIndex);
+    uint32_t rasterTileStartOffset =
+        (ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp>>(
+            pitch, clearTile.xmin, clearTile.ymin)) *
+        numSamples;
+    uint8_t* pRasterTileRow =
+        pHotTile->pBuffer +
+        rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ,
+                               // FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
 
     // loop over all raster tiles in the current hot tile
     for (int32_t y = clearTile.ymin; y <= clearTile.ymax; ++y)
@@ -156,7 +177,7 @@
         uint8_t* pRasterTile = pRasterTileRow;
         for (int32_t x = clearTile.xmin; x <= clearTile.xmax; ++x)
         {
-            for( int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
+            for (int32_t sampleNum = 0; sampleNum < numSamples; sampleNum++)
             {
                 ClearRasterTile<format>(pRasterTile, vClear);
                 pRasterTile += rasterTileSampleStep;
@@ -168,17 +189,16 @@
     pHotTile->state = HOTTILE_DIRTY;
 }
 
-
-void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData)
+void ProcessClearBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pUserData)
 {
-    SWR_CONTEXT *pContext = pDC->pContext;
-    HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+    SWR_CONTEXT* pContext           = pDC->pContext;
+    HANDLE       hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     if (KNOB_FAST_CLEAR)
     {
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        CLEAR_DESC*           pClear      = (CLEAR_DESC*)pUserData;
         SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount;
-        uint32_t numSamples = GetNumSamples(sampleCount);
+        uint32_t              numSamples  = GetNumSamples(sampleCount);
 
         SWR_ASSERT(pClear->attachmentMask != 0); // shouldn't be here without a reason.
 
@@ -186,36 +206,58 @@
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
         {
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            unsigned long rt   = 0;
+            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
             while (_BitScanForward(&rt, mask))
             {
                 mask &= ~(1 << rt);
 
-                HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, (SWR_RENDERTARGET_ATTACHMENT)rt, true, numSamples, pClear->renderTargetArrayIndex);
+                HOTTILE* pHotTile =
+                    pContext->pHotTileMgr->GetHotTile(pContext,
+                                                      pDC,
+                                                      hWorkerPrivateData,
+                                                      macroTile,
+                                                      (SWR_RENDERTARGET_ATTACHMENT)rt,
+                                                      true,
+                                                      numSamples,
+                                                      pClear->renderTargetArrayIndex);
 
                 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
                 pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]);
                 pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]);
                 pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]);
                 pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]);
-                pHotTile->state = HOTTILE_CLEAR;
+                pHotTile->state        = HOTTILE_CLEAR;
             }
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples, pClear->renderTargetArrayIndex);
+            HOTTILE* pHotTile      = pContext->pHotTileMgr->GetHotTile(pContext,
+                                                                  pDC,
+                                                                  hWorkerPrivateData,
+                                                                  macroTile,
+                                                                  SWR_ATTACHMENT_DEPTH,
+                                                                  true,
+                                                                  numSamples,
+                                                                  pClear->renderTargetArrayIndex);
             pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth;
-            pHotTile->state = HOTTILE_CLEAR;
+            pHotTile->state        = HOTTILE_CLEAR;
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples, pClear->renderTargetArrayIndex);
+            HOTTILE* pHotTile = pContext->pHotTileMgr->GetHotTile(pContext,
+                                                                  pDC,
+                                                                  hWorkerPrivateData,
+                                                                  macroTile,
+                                                                  SWR_ATTACHMENT_STENCIL,
+                                                                  true,
+                                                                  numSamples,
+                                                                  pClear->renderTargetArrayIndex);
 
             pHotTile->clearData[0] = pClear->clearStencil;
-            pHotTile->state = HOTTILE_CLEAR;
+            pHotTile->state        = HOTTILE_CLEAR;
         }
 
         RDTSC_END(BEClear, 1);
@@ -223,7 +265,7 @@
     else
     {
         // Legacy clear
-        CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+        CLEAR_DESC* pClear = (CLEAR_DESC*)pUserData;
         RDTSC_BEGIN(BEClear, pDC->drawId);
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR)
@@ -237,33 +279,51 @@
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
-            unsigned long rt = 0;
-            uint32_t mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
+            unsigned long rt   = 0;
+            uint32_t      mask = pClear->attachmentMask & SWR_ATTACHMENT_MASK_COLOR;
             while (_BitScanForward(&rt, mask))
             {
                 mask &= ~(1 << rt);
 
-                pfnClearTiles(pDC, hWorkerPrivateData, (SWR_RENDERTARGET_ATTACHMENT)rt, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+                pfnClearTiles(pDC,
+                              hWorkerPrivateData,
+                              (SWR_RENDERTARGET_ATTACHMENT)rt,
+                              macroTile,
+                              pClear->renderTargetArrayIndex,
+                              clearData,
+                              pClear->rect);
             }
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_DEPTH_BIT)
         {
             DWORD clearData[4];
-            clearData[0] = *(DWORD*)&pClear->clearDepth;
+            clearData[0]                  = *(DWORD*)&pClear->clearDepth;
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT];
             SWR_ASSERT(pfnClearTiles != nullptr);
 
-            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_DEPTH, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            pfnClearTiles(pDC,
+                          hWorkerPrivateData,
+                          SWR_ATTACHMENT_DEPTH,
+                          macroTile,
+                          pClear->renderTargetArrayIndex,
+                          clearData,
+                          pClear->rect);
         }
 
         if (pClear->attachmentMask & SWR_ATTACHMENT_STENCIL_BIT)
         {
             DWORD clearData[4];
-            clearData[0] = pClear->clearStencil;
+            clearData[0]                  = pClear->clearStencil;
             PFN_CLEAR_TILES pfnClearTiles = gClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT];
 
-            pfnClearTiles(pDC, hWorkerPrivateData, SWR_ATTACHMENT_STENCIL, macroTile, pClear->renderTargetArrayIndex, clearData, pClear->rect);
+            pfnClearTiles(pDC,
+                          hWorkerPrivateData,
+                          SWR_ATTACHMENT_STENCIL,
+                          macroTile,
+                          pClear->renderTargetArrayIndex,
+                          clearData,
+                          pClear->rect);
         }
 
         RDTSC_END(BEClear, 1);
@@ -274,9 +334,9 @@
 {
     memset(gClearTilesTable, 0, sizeof(gClearTilesTable));
 
-    gClearTilesTable[R8G8B8A8_UNORM]        = ClearMacroTile<R8G8B8A8_UNORM>;
-    gClearTilesTable[B8G8R8A8_UNORM]        = ClearMacroTile<B8G8R8A8_UNORM>;
-    gClearTilesTable[R32_FLOAT]             = ClearMacroTile<R32_FLOAT>;
-    gClearTilesTable[R32G32B32A32_FLOAT]    = ClearMacroTile<R32G32B32A32_FLOAT>;
-    gClearTilesTable[R8_UINT]               = ClearMacroTile<R8_UINT>;
+    gClearTilesTable[R8G8B8A8_UNORM]     = ClearMacroTile<R8G8B8A8_UNORM>;
+    gClearTilesTable[B8G8R8A8_UNORM]     = ClearMacroTile<B8G8R8A8_UNORM>;
+    gClearTilesTable[R32_FLOAT]          = ClearMacroTile<R32_FLOAT>;
+    gClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>;
+    gClearTilesTable[R8_UINT]            = ClearMacroTile<R8_UINT>;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index 05234c2..1798dad 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -1,37 +1,39 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.h
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend_impl.h
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 #pragma once
 
-void InitBackendSingleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
+void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]);
+void InitBackendSampleFuncTable(
+    PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]);
 
-static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext);
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
+                                          SWR_PS_CONTEXT&          psContext);
 
 
 enum SWR_BACKEND_FUNCS
@@ -45,15 +47,18 @@
 #if KNOB_SIMD_WIDTH == 8
 static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
 static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+static const __m256 vULOffsetsX     = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+static const __m256 vULOffsetsY     = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #define MASK 0xff
 #endif
 
-static INLINE simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar const &vI, simdscalar const &vJ)
+static INLINE simdmask ComputeUserClipMask(uint8_t           clipMask,
+                                           float*            pUserClipBuffer,
+                                           simdscalar const& vI,
+                                           simdscalar const& vJ)
 {
-    simdscalar vClipMask = _simd_setzero_ps();
-    uint32_t numClipDistance = _mm_popcnt_u32(clipMask);
+    simdscalar vClipMask       = _simd_setzero_ps();
+    uint32_t   numClipDistance = _mm_popcnt_u32(clipMask);
 
     for (uint32_t i = 0; i < numClipDistance; ++i)
     {
@@ -76,23 +81,29 @@
 
 INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum)
 {
-    static const uint32_t RasterTileColorOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    static const uint32_t RasterTileColorOffsets[16]{
+        0,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8),
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            10,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            11,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            12,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            13,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            14,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) *
+            15,
     };
     assert(sampleNum < 16);
     return RasterTileColorOffsets[sampleNum];
@@ -100,23 +111,29 @@
 
 INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
 {
-    static const uint32_t RasterTileDepthOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    static const uint32_t RasterTileDepthOffsets[16]{
+        0,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            10,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            11,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            12,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            13,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            14,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) *
+            15,
     };
     assert(sampleNum < 16);
     return RasterTileDepthOffsets[sampleNum];
@@ -124,60 +141,78 @@
 
 INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
 {
-    static const uint32_t RasterTileStencilOffsets[16]
-    { 0,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14,
-      (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15,
+    static const uint32_t RasterTileStencilOffsets[16]{
+        0,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            2,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            3,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            4,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            5,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            6,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            7,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            8,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            9,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            10,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            11,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            12,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            13,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            14,
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) *
+            15,
     };
     assert(sampleNum < 16);
     return RasterTileStencilOffsets[sampleNum];
 }
 
-template<typename T, uint32_t InputCoverage>
+template <typename T, uint32_t InputCoverage>
 struct generateInputCoverage
 {
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t* const coverageMask,
+                                 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
+                                 const uint32_t sampleMask)
     {
         // will need to update for avx512
         assert(KNOB_SIMD_WIDTH == 8);
 
         simdscalari mask[2];
         simdscalari sampleCoverage[2];
-        
-        if(T::bIsCenterPattern)
+
+        if (T::bIsCenterPattern)
         {
             // center coverage is the same for all samples; just broadcast to the sample slots
             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-            if(T::MultisampleT::numSamples == 1)
+            if (T::MultisampleT::numSamples == 1)
             {
                 sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
             }
-            else if(T::MultisampleT::numSamples == 2)
+            else if (T::MultisampleT::numSamples == 2)
             {
-                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+                sampleCoverage[0] =
+                    _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
             }
-            else if(T::MultisampleT::numSamples == 4)
+            else if (T::MultisampleT::numSamples == 4)
             {
-                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+                sampleCoverage[0] = _simd_set_epi32(
+                    0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
             }
-            else if(T::MultisampleT::numSamples == 8)
+            else if (T::MultisampleT::numSamples == 8)
             {
                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
             }
-            else if(T::MultisampleT::numSamples == 16)
+            else if (T::MultisampleT::numSamples == 16)
             {
                 sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
                 sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
@@ -185,80 +220,127 @@
         }
         else
         {
-            simdscalari src = _simd_set1_epi32(0);
+            simdscalari src    = _simd_set1_epi32(0);
             simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
 
-            if(T::MultisampleT::numSamples == 1)
+            if (T::MultisampleT::numSamples == 1)
             {
                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
             }
-            else if(T::MultisampleT::numSamples == 2)
+            else if (T::MultisampleT::numSamples == 2)
             {
                 mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
             }
-            else if(T::MultisampleT::numSamples == 4)
+            else if (T::MultisampleT::numSamples == 4)
             {
                 mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
             }
-            else if(T::MultisampleT::numSamples == 8)
+            else if (T::MultisampleT::numSamples == 8)
             {
                 mask[0] = _simd_set1_epi32(-1);
             }
-            else if(T::MultisampleT::numSamples == 16)
+            else if (T::MultisampleT::numSamples == 16)
             {
                 mask[0] = _simd_set1_epi32(-1);
                 mask[1] = _simd_set1_epi32(-1);
-                index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+                index1  = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
             }
 
             // gather coverage for samples 0-7
-            sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-            if(T::MultisampleT::numSamples > 8)
+            sampleCoverage[0] =
+                _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
+                                                            (const float*)coverageMask,
+                                                            index0,
+                                                            _mm256_castsi256_ps(mask[0]),
+                                                            8));
+            if (T::MultisampleT::numSamples > 8)
             {
                 // gather coverage for samples 8-15
-                sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+                sampleCoverage[1] =
+                    _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src),
+                                                                (const float*)coverageMask,
+                                                                index1,
+                                                                _mm256_castsi256_ps(mask[1]),
+                                                                8));
             }
         }
 
-        mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                                  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+        mask[0] = _mm256_set_epi8(-1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  0xC,
+                                  0x8,
+                                  0x4,
+                                  0x0,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  -1,
+                                  0xC,
+                                  0x8,
+                                  0x4,
+                                  0x0);
         // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
         simdscalari packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
 
         simdscalari packedCoverage1;
-        if(T::MultisampleT::numSamples > 8)
+        if (T::MultisampleT::numSamples > 8)
         {
-            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+            // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit
+            // lane
             packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
         }
 
-    #if (KNOB_ARCH == KNOB_ARCH_AVX)
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
         simdscalari hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-        simdscalar shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+        simdscalar  shufRes = _mm256_shuffle_ps(
+            _mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        packedCoverage0 = _mm256_castps_si256(
+            _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
 
         simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
+        if (T::MultisampleT::numSamples > 8)
         {
             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-            hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-            shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-            shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+            hiToLow         = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+            shufRes         = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow),
+                                        _mm256_castsi256_ps(hiToLow),
+                                        _MM_SHUFFLE(1, 1, 0, 1));
+            shufRes         = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+            packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(
+                _mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+            packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(
+                _mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
         }
         else
         {
             packedSampleCoverage = packedCoverage0;
         }
-    #else
+#else
         simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+        // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
 
         simdscalari packedSampleCoverage;
-        if(T::MultisampleT::numSamples > 8)
+        if (T::MultisampleT::numSamples > 8)
         {
             permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
@@ -271,14 +353,15 @@
         {
             packedSampleCoverage = packedCoverage0;
         }
-    #endif
+#endif
 
-        for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+        for (int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
         {
-            // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+            // convert packed sample coverage masks into single coverage masks for all samples for
+            // each pixel in the 4x2
             inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
 
-            if(!T::bForcedSampleCount)
+            if (!T::bForcedSampleCount)
             {
                 // input coverage has to be anded with sample mask if MSAA isn't forced on
                 inputMask[i] &= sampleMask;
@@ -289,35 +372,47 @@
         }
     }
 
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t* const coverageMask,
+                                 simdscalar&           inputCoverage,
+                                 const uint32_t        sampleMask)
     {
         uint32_t inputMask[KNOB_SIMD_WIDTH];
         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-        inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+        inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7],
+                                                        inputMask[6],
+                                                        inputMask[5],
+                                                        inputMask[4],
+                                                        inputMask[3],
+                                                        inputMask[2],
+                                                        inputMask[1],
+                                                        inputMask[0]));
     }
-
 };
 
-template<typename T>
+template <typename T>
 struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
 {
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, simdscalar &inputCoverage, const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t* const coverageMask,
+                                 simdscalar&           inputCoverage,
+                                 const uint32_t        sampleMask)
     {
         // will need to update for avx512
         assert(KNOB_SIMD_WIDTH == 8);
-        simdscalari vec = _simd_set1_epi32(coverageMask[0]);
+        simdscalari       vec = _simd_set1_epi32(coverageMask[0]);
         const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-        vec = _simd_and_si(vec, bit);
-        vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
-        vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
-        inputCoverage = _simd_castsi_ps(vec);
+        vec                   = _simd_and_si(vec, bit);
+        vec                   = _simd_cmplt_epi32(_simd_setzero_si(), vec);
+        vec                   = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
+        inputCoverage         = _simd_castsi_ps(vec);
     }
 
-    INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+    INLINE generateInputCoverage(const uint64_t* const coverageMask,
+                                 uint32_t (&inputMask)[KNOB_SIMD_WIDTH],
+                                 const uint32_t sampleMask)
     {
-        uint32_t simdCoverage = (coverageMask[0] & MASK);
+        uint32_t              simdCoverage     = (coverageMask[0] & MASK);
         static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1;
-        for(int i = 0; i < KNOB_SIMD_WIDTH; i++)
+        for (int i = 0; i < KNOB_SIMD_WIDTH; i++)
         {
             // set all samples to covered if conservative coverage mask is set for that pixel
             inputMask[i] = (((1 << i) & simdCoverage) > 0) ? FullCoverageMask : 0;
@@ -327,18 +422,25 @@
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // Centroid behaves exactly as follows :
-// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to 
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center
+// (even if the sample pattern does not happen to
 //     have a sample location there).
-// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the 
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample
+// index, where sample coverage is after ANDing the
 //     coverage with the SampleMask Rasterizer State.
-// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is 
-//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the pixel, then the first sample covered by the 
-//     SampleMask Rasterizer State is the evaluation point.Otherwise (full SampleMask), the pixel center is the evaluation point.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to
+// fill out 2x2 pixel stamps, the attribute is
+//     evaluated as follows : If the SampleMask Rasterizer state is a subset of the samples in the
+//     pixel, then the first sample covered by the SampleMask Rasterizer State is the evaluation
+//     point. Otherwise (full SampleMask), the pixel center is the evaluation point.
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-template<typename T>
-INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                            const uint64_t *const coverageMask, const uint32_t sampleMask,
-                            simdscalar const &vXSamplePosUL, simdscalar const &vYSamplePosUL)
+template <typename T>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT&            psContext,
+                            const SWR_MULTISAMPLE_POS& samplePos,
+                            const uint64_t* const      coverageMask,
+                            const uint32_t             sampleMask,
+                            simdscalar const&          vXSamplePosUL,
+                            simdscalar const&          vYSamplePosUL)
 {
     uint32_t inputMask[KNOB_SIMD_WIDTH];
     generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
@@ -356,50 +458,60 @@
     (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
 
-    // look up and set the sample offsets from UL pixel corner for first covered sample 
+    // look up and set the sample offsets from UL pixel corner for first covered sample
     simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
-                                    samplePos.X(sampleNum[6]),
-                                    samplePos.X(sampleNum[5]),
-                                    samplePos.X(sampleNum[4]),
-                                    samplePos.X(sampleNum[3]),
-                                    samplePos.X(sampleNum[2]),
-                                    samplePos.X(sampleNum[1]),
-                                    samplePos.X(sampleNum[0]));
+                                       samplePos.X(sampleNum[6]),
+                                       samplePos.X(sampleNum[5]),
+                                       samplePos.X(sampleNum[4]),
+                                       samplePos.X(sampleNum[3]),
+                                       samplePos.X(sampleNum[2]),
+                                       samplePos.X(sampleNum[1]),
+                                       samplePos.X(sampleNum[0]));
 
     simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
-                                    samplePos.Y(sampleNum[6]),
-                                    samplePos.Y(sampleNum[5]),
-                                    samplePos.Y(sampleNum[4]),
-                                    samplePos.Y(sampleNum[3]),
-                                    samplePos.Y(sampleNum[2]),
-                                    samplePos.Y(sampleNum[1]),
-                                    samplePos.Y(sampleNum[0]));
+                                       samplePos.Y(sampleNum[6]),
+                                       samplePos.Y(sampleNum[5]),
+                                       samplePos.Y(sampleNum[4]),
+                                       samplePos.Y(sampleNum[3]),
+                                       samplePos.Y(sampleNum[2]),
+                                       samplePos.Y(sampleNum[1]),
+                                       samplePos.Y(sampleNum[0]));
     // add sample offset to UL pixel corner
     vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
     vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
 
     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    simdscalari vInputCoveragei =  _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    simdscalari              vInputCoveragei   = _simd_set_epi32(inputMask[7],
+                                                  inputMask[6],
+                                                  inputMask[5],
+                                                  inputMask[4],
+                                                  inputMask[3],
+                                                  inputMask[2],
+                                                  inputMask[1],
+                                                  inputMask[0]);
     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
 
     static const simdscalari vZero = _simd_setzero_si();
-    const simdscalari vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
-    simdscalari vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
-    simdscalari vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
-    simdscalari vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+    const simdscalari vSampleMask  = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    simdscalari       vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    simdscalari       vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    simdscalari       vCase3b           = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
 
     simdscalari vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
 
     // set the centroid position based on results from above
-    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
-    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vX.centroid =
+        _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid =
+        _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
 
     // Case (3a) No samples covered and partial sample mask
     simdscalari vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
     // sample mask should never be all 0's for this case, but handle it anyways
     unsigned long firstCoveredSampleMaskSample = 0;
-    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0);
+    (sampleMask > 0) ? (_BitScanForward(&firstCoveredSampleMaskSample, sampleMask))
+                     : (firstCoveredSampleMaskSample = 0);
 
     simdscalari vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples);
 
@@ -407,24 +519,34 @@
     vYSample = _simd_set1_ps(samplePos.Y(firstCoveredSampleMaskSample));
 
     // blend in case 3a pixel locations
-    psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
-    psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
+    psContext.vX.centroid =
+        _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a));
+    psContext.vY.centroid =
+        _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a));
 }
 
-INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext,
-                                     const simdscalar &vXSamplePosUL, const simdscalar &vYSamplePosUL)
+INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs,
+                                     SWR_PS_CONTEXT&          psContext,
+                                     const simdscalar&        vXSamplePosUL,
+                                     const simdscalar&        vYSamplePosUL)
 {
     // evaluate I,J
-    psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
-    psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vI.centroid =
+        vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid);
+    psContext.vJ.centroid =
+        vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid);
     psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet);
     psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet);
 
     // interpolate 1/w
-    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid);
+    psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW,
+                                            coeffs.vBOneOverW,
+                                            coeffs.vCOneOverW,
+                                            psContext.vI.centroid,
+                                            psContext.vJ.centroid);
 }
 
-INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const &z, float minz, float maxz)
+INLINE simdmask CalcDepthBoundsAcceptMask(simdscalar const& z, float minz, float maxz)
 {
     const simdscalar minzMask = _simd_cmpge_ps(z, _simd_set1_ps(minz));
     const simdscalar maxzMask = _simd_cmple_ps(z, _simd_set1_ps(maxz));
@@ -432,16 +554,17 @@
     return _simd_movemask_ps(_simd_and_ps(minzMask, maxzMask));
 }
 
-template<typename T>
+template <typename T>
 INLINE uint32_t GetNumOMSamples(SWR_MULTISAMPLE_COUNT blendSampleCount)
 {
     // RT has to be single sample if we're in forcedMSAA mode
-    if(T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
+    if (T::bForcedSampleCount && (T::MultisampleT::sampleCount > SWR_MULTISAMPLE_1X))
     {
         return 1;
     }
-    // unless we're forced to single sample, in which case we run the OM at the sample count of the RT
-    else if(T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
+    // unless we're forced to single sample, in which case we run the OM at the sample count of the
+    // RT
+    else if (T::bForcedSampleCount && (T::MultisampleT::sampleCount == SWR_MULTISAMPLE_1X))
     {
         return GetNumSamples(blendSampleCount);
     }
@@ -452,7 +575,7 @@
     }
 }
 
-inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE_DESC &work)
+inline void SetupBarycentricCoeffs(BarycentricCoeffs* coeffs, const SWR_TRIANGLE_DESC& work)
 {
     // broadcast scalars
 
@@ -475,9 +598,12 @@
     coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
 }
 
-inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers)
+inline void SetupRenderBuffers(uint8_t*             pColorBuffer[SWR_NUM_RENDERTARGETS],
+                               uint8_t**            pDepthBuffer,
+                               uint8_t**            pStencilBuffer,
+                               uint32_t             colorHotTileMask,
+                               RenderOutputBuffers& renderBuffers)
 {
-    
     DWORD index;
     while (_BitScanForward(&index, colorHotTileMask))
     {
@@ -493,41 +619,51 @@
 
     if (pStencilBuffer)
     {
-        *pStencilBuffer = renderBuffers.pStencil;;
+        *pStencilBuffer = renderBuffers.pStencil;
+        ;
     }
 }
 
-template<typename T>
-void SetupPixelShaderContext(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos, SWR_TRIANGLE_DESC &work)
+template <typename T>
+void SetupPixelShaderContext(SWR_PS_CONTEXT*            psContext,
+                             const SWR_MULTISAMPLE_POS& samplePos,
+                             SWR_TRIANGLE_DESC&         work)
 {
-    psContext->pAttribs = work.pAttribs;
-    psContext->pPerspAttribs = work.pPerspAttribs;
-    psContext->frontFace = work.triFlags.frontFacing;
+    psContext->pAttribs               = work.pAttribs;
+    psContext->pPerspAttribs          = work.pPerspAttribs;
+    psContext->frontFace              = work.triFlags.frontFacing;
     psContext->renderTargetArrayIndex = work.triFlags.renderTargetArrayIndex;
 
-    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull
+    // attribs
     psContext->I = work.I;
     psContext->J = work.J;
 
     psContext->recipDet = work.recipDet;
-    psContext->pRecipW = work.pRecipW;
-    psContext->pSamplePosX = samplePos.X();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
-    psContext->pSamplePosY = samplePos.Y();//reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
+    psContext->pRecipW  = work.pRecipW;
+    psContext->pSamplePosX =
+        samplePos.X(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosX);
+    psContext->pSamplePosY =
+        samplePos.Y(); // reinterpret_cast<const float *>(&T::MultisampleT::samplePosY);
     psContext->rasterizerSampleCount = T::MultisampleT::numSamples;
-    psContext->sampleIndex = 0;
+    psContext->sampleIndex           = 0;
 }
 
-template<typename T, bool IsSingleSample>
-void CalcCentroid(SWR_PS_CONTEXT *psContext, const SWR_MULTISAMPLE_POS& samplePos,
-                  const BarycentricCoeffs &coeffs, const uint64_t * const coverageMask, uint32_t sampleMask)
+template <typename T, bool IsSingleSample>
+void CalcCentroid(SWR_PS_CONTEXT*            psContext,
+                  const SWR_MULTISAMPLE_POS& samplePos,
+                  const BarycentricCoeffs&   coeffs,
+                  const uint64_t* const      coverageMask,
+                  uint32_t                   sampleMask)
 {
-    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid positions are still different
+    if (IsSingleSample) // if (T::MultisampleT::numSamples == 1) // doesn't cut it, the centroid
+                        // positions are still different
     {
         // for 1x case, centroid is pixel center
-        psContext->vX.centroid = psContext->vX.center;
-        psContext->vY.centroid = psContext->vY.center;
-        psContext->vI.centroid = psContext->vI.center;
-        psContext->vJ.centroid = psContext->vJ.center;
+        psContext->vX.centroid        = psContext->vX.center;
+        psContext->vY.centroid        = psContext->vY.center;
+        psContext->vI.centroid        = psContext->vI.center;
+        psContext->vJ.centroid        = psContext->vJ.center;
         psContext->vOneOverW.centroid = psContext->vOneOverW.center;
     }
     else
@@ -542,8 +678,14 @@
             }
             else
             {
-                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate coverage 2X'..
-                CalcCentroidPos<T>(*psContext, samplePos, coverageMask, sampleMask, psContext->vX.UL, psContext->vY.UL);
+                // add param: const uint32_t inputMask[KNOB_SIMD_WIDTH] to eliminate 'generate
+                // coverage 2X'..
+                CalcCentroidPos<T>(*psContext,
+                                   samplePos,
+                                   coverageMask,
+                                   sampleMask,
+                                   psContext->vX.UL,
+                                   psContext->vY.UL);
             }
 
             CalcCentroidBarycentrics(coeffs, *psContext, psContext->vX.UL, psContext->vY.UL);
@@ -556,47 +698,61 @@
     }
 }
 
-template<typename T>
+template <typename T>
 struct PixelRateZTestLoop
 {
-    PixelRateZTestLoop(DRAW_CONTEXT *DC, uint32_t _workerId, const SWR_TRIANGLE_DESC &Work, const BarycentricCoeffs& Coeffs, const API_STATE& apiState,
-                       uint8_t*& depthBuffer, uint8_t*& stencilBuffer, const uint8_t ClipDistanceMask) :
-                       pDC(DC), workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
-                       samplePos(state.rastState.samplePositions),
-                       clipDistanceMask(ClipDistanceMask), pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
+    PixelRateZTestLoop(DRAW_CONTEXT*            DC,
+                       uint32_t                 _workerId,
+                       const SWR_TRIANGLE_DESC& Work,
+                       const BarycentricCoeffs& Coeffs,
+                       const API_STATE&         apiState,
+                       uint8_t*&                depthBuffer,
+                       uint8_t*&                stencilBuffer,
+                       const uint8_t            ClipDistanceMask) :
+        pDC(DC),
+        workerId(_workerId), work(Work), coeffs(Coeffs), state(apiState), psState(apiState.psState),
+        samplePos(state.rastState.samplePositions), clipDistanceMask(ClipDistanceMask),
+        pDepthBuffer(depthBuffer), pStencilBuffer(stencilBuffer){};
 
     INLINE
-    uint32_t operator()(simdscalar& activeLanes, SWR_PS_CONTEXT& psContext, 
-                        const CORE_BUCKETS BEDepthBucket, uint32_t currentSimdIn8x8 = 0)
+    uint32_t operator()(simdscalar&        activeLanes,
+                        SWR_PS_CONTEXT&    psContext,
+                        const CORE_BUCKETS BEDepthBucket,
+                        uint32_t           currentSimdIn8x8 = 0)
     {
 
-        uint32_t statCount = 0;
+        uint32_t   statCount            = 0;
         simdscalar anyDepthSamplePassed = _simd_setzero_ps();
-        for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+        for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
         {
-            const uint8_t *pCoverageMask = (uint8_t*)&work.coverageMask[sample];
-            vCoverageMask[sample] = _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
+            const uint8_t* pCoverageMask = (uint8_t*)&work.coverageMask[sample];
+            vCoverageMask[sample] =
+                _simd_and_ps(activeLanes, _simd_vmask_ps(pCoverageMask[currentSimdIn8x8] & MASK));
 
-            if(!_simd_movemask_ps(vCoverageMask[sample]))
+            if (!_simd_movemask_ps(vCoverageMask[sample]))
             {
-                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] =
+                    _simd_setzero_ps();
                 continue;
             }
 
             // offset depth/stencil buffers current sample
-            uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-            uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+            uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
+            uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 
             if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
             {
-                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+                static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
+                              "Unsupported depth hot tile format");
 
-                const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+                const simdscalar z = _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
 
                 const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                 const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
 
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
+                vCoverageMask[sample] =
+                    _simd_and_ps(vCoverageMask[sample],
+                                 _simd_vmask_ps(CalcDepthBoundsAcceptMask(z, minz, maxz)));
             }
 
             RDTSC_BEGIN(BEBarycentric, pDC->drawId);
@@ -608,7 +764,7 @@
             // calc I & J per sample
             CalcSampleBarycentrics(coeffs, psContext);
 
-            if(psState.writesODepth)
+            if (psState.writesODepth)
             {
                 {
                     // broadcast and test oDepth(psContext.vZ) written from the PS for each sample
@@ -617,7 +773,8 @@
             }
             else
             {
-                vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                vZ[sample] = vplaneps(
+                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
                 vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
             }
 
@@ -625,36 +782,52 @@
 
             ///@todo: perspective correct vs non-perspective correct clipping?
             // if clip distances are enabled, we need to interpolate for each sample
-            if(clipDistanceMask)
+            if (clipDistanceMask)
             {
-                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+                uint8_t clipMask = ComputeUserClipMask(clipDistanceMask,
+                                                       work.pUserClipBuffer,
+                                                       psContext.vI.sample,
+                                                       psContext.vJ.sample);
 
-                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
+                vCoverageMask[sample] =
+                    _simd_and_ps(vCoverageMask[sample], _simd_vmask_ps(~clipMask));
             }
 
             // ZTest for this sample
             ///@todo Need to uncomment out this bucket.
-            //RDTSC_BEGIN(BEDepthBucket, pDC->drawId);
-            depthPassMask[sample] = vCoverageMask[sample];
+            // RDTSC_BEGIN(BEDepthBucket, pDC->drawId);
+            depthPassMask[sample]   = vCoverageMask[sample];
             stencilPassMask[sample] = vCoverageMask[sample];
-            depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     vZ[sample], pDepthSample, vCoverageMask[sample], 
-                                                     pStencilSample, &stencilPassMask[sample]);
-            //RDTSC_END(BEDepthBucket, 0);
+            depthPassMask[sample]   = DepthStencilTest(&state,
+                                                     work.triFlags.frontFacing,
+                                                     work.triFlags.viewportIndex,
+                                                     vZ[sample],
+                                                     pDepthSample,
+                                                     vCoverageMask[sample],
+                                                     pStencilSample,
+                                                     &stencilPassMask[sample]);
+            // RDTSC_END(BEDepthBucket, 0);
 
             // early-exit if no pixels passed depth or earlyZ is forced on
-            if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
+            if (psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
             {
-                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
-                                  pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);
+                DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                  &state.depthStencilState,
+                                  work.triFlags.frontFacing,
+                                  vZ[sample],
+                                  pDepthSample,
+                                  depthPassMask[sample],
+                                  vCoverageMask[sample],
+                                  pStencilSample,
+                                  stencilPassMask[sample]);
 
-                if(!_simd_movemask_ps(depthPassMask[sample]))
+                if (!_simd_movemask_ps(depthPassMask[sample]))
                 {
                     continue;
                 }
             }
             anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
-            uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+            uint32_t statMask    = _simd_movemask_ps(depthPassMask[sample]);
             statCount += _mm_popcnt_u32(statMask);
         }
 
@@ -672,106 +845,129 @@
 private:
     // functor inputs
     DRAW_CONTEXT* pDC;
-    uint32_t workerId;
+    uint32_t      workerId;
 
-    const SWR_TRIANGLE_DESC& work;
-    const BarycentricCoeffs& coeffs;
-    const API_STATE& state;
-    const SWR_PS_STATE& psState;
+    const SWR_TRIANGLE_DESC&   work;
+    const BarycentricCoeffs&   coeffs;
+    const API_STATE&           state;
+    const SWR_PS_STATE&        psState;
     const SWR_MULTISAMPLE_POS& samplePos;
-    const uint8_t clipDistanceMask;
-    uint8_t*& pDepthBuffer;
-    uint8_t*& pStencilBuffer;
+    const uint8_t              clipDistanceMask;
+    uint8_t*&                  pDepthBuffer;
+    uint8_t*&                  pStencilBuffer;
 };
 
-INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT& psContext) // fills psContext.vI/vJ/vOneOverW .center
 {
     // evaluate I,J
-    psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
-    psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
+    psContext.vI.center = // from vX/vY.center; multiplied by coeffs.vRecipDet below
+        vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center);
+    psContext.vJ.center = // same inputs as I, using the vJa/vJb/vJc coefficients
+        vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center);
     psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet);
     psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet);
 
     // interpolate 1/w
-    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center);
+    psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW,
+                                          coeffs.vBOneOverW,
+                                          coeffs.vCOneOverW,
+                                          psContext.vI.center, // I after the vRecipDet multiply above
+                                          psContext.vJ.center); // J after the vRecipDet multiply above
 }
 
-static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
+static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs,
+                                          SWR_PS_CONTEXT&          psContext) // in/out: reads vX/vY.sample, writes vI/vJ/vOneOverW.sample
 {
     // evaluate I,J
-    psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
-    psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vI.sample = // from vX/vY.sample; multiplied by coeffs.vRecipDet below
+        vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample);
+    psContext.vJ.sample = // same inputs as I, using the vJa/vJb/vJc coefficients
+        vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample);
     psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet);
     psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet);
 
     // interpolate 1/w
-    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample);
+    psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW,
+                                          coeffs.vBOneOverW,
+                                          coeffs.vCOneOverW,
+                                          psContext.vI.sample, // I after the vRecipDet multiply above
+                                          psContext.vJ.sample); // J after the vRecipDet multiply above
 }
 
 // Merge Output to 4x2 SIMD Tile Format
-INLINE void OutputMerger4x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, uint32_t workerId)
+INLINE void OutputMerger4x2(DRAW_CONTEXT*   pDC, // blends and mask-stores PS output for each RT set in renderTargetMask
+                            SWR_PS_CONTEXT& psContext,
+                            uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
+                            uint32_t               sample, // picks the color offset; forwarded as blendContext.sampleNum
+                            const SWR_BLEND_STATE* pBlendState,
+                            const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
+                            simdscalar&       coverageMask, // non-const: handed to the blend func via blendContext.pMask
+                            simdscalar const& depthPassMask, // ANDed with coverageMask to form the store mask
+                            uint32_t          renderTargetMask, // by-value copy; one bit is cleared per loop iteration
+                            uint32_t          workerId)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
-    simdvector blendOut;
+    simdvector     blendOut; // re-seeded from psContext.shaded[rt] for every render target
 
     DWORD rt = 0;
     while (_BitScanForward(&rt, renderTargetMask))
     {
         renderTargetMask &= ~(1 << rt);
-        uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset;
+        uint8_t* pColorSample = pColorBase[rt] + rasterTileColorOffset;
 
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+        const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
 
-        SWR_BLEND_CONTEXT blendContext = { 0 };
+        SWR_BLEND_CONTEXT blendContext = {0};
         {
             // pfnBlendFunc may not update all channels.  Initialize with PS output.
             /// TODO: move this into the blend JIT.
             blendOut = psContext.shaded[rt];
 
             blendContext.pBlendState = pBlendState;
-            blendContext.src = &psContext.shaded[rt];
-            blendContext.src1 = &psContext.shaded[1];
-            blendContext.src0alpha = reinterpret_cast<simdvector *>(&psContext.shaded[0].w);
-            blendContext.sampleNum = sample;
-            blendContext.pDst = (simdvector *) &pColorSample;
-            blendContext.result = &blendOut;
-            blendContext.oMask = &psContext.oMask;
-            blendContext.pMask = reinterpret_cast<simdscalari *>(&coverageMask);
+            blendContext.src         = &psContext.shaded[rt];
+            blendContext.src1        = &psContext.shaded[1]; // always index 1, independent of rt
+            blendContext.src0alpha   = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
+            blendContext.sampleNum   = sample;
+            blendContext.pDst        = (simdvector*)&pColorSample; // NOTE(review): address of the local pointer, not of the color data (8x2 passes &blendSrc) - confirm
+            blendContext.result      = &blendOut;
+            blendContext.oMask       = &psContext.oMask;
+            blendContext.pMask       = reinterpret_cast<simdscalari*>(&coverageMask);
 
             // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
+            if (pfnBlendFunc[rt] != nullptr) // no blend func: blendOut stays psContext.shaded[rt]
             {
                 pfnBlendFunc[rt](&blendContext);
             }
         }
 
         // Track alpha events
-        AR_EVENT(AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
+        AR_EVENT(
+            AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
 
-        // final write mask 
+        // final write mask
         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
 
         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
+                      "Unsupported hot tile format");
 
         const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float);
 
         // store with color mask
-        if(!pRTBlend->writeDisableRed)
+        if (!pRTBlend->writeDisableRed) // channel k (x,y,z,w) is stored at pColorSample + k * simd bytes
         {
             _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
         }
-        if(!pRTBlend->writeDisableGreen)
+        if (!pRTBlend->writeDisableGreen)
         {
             _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
         }
-        if(!pRTBlend->writeDisableBlue)
+        if (!pRTBlend->writeDisableBlue)
         {
             _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
         }
-        if(!pRTBlend->writeDisableAlpha)
+        if (!pRTBlend->writeDisableAlpha)
         {
             _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
         }
@@ -780,8 +976,17 @@
 
 #if USE_8x2_TILE_BACKEND
 // Merge Output to 8x2 SIMD16 Tile Format
-INLINE void OutputMerger8x2(DRAW_CONTEXT *pDC, SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState,
-    const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar const &depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset, uint32_t workerId)
+INLINE void OutputMerger8x2(DRAW_CONTEXT*   pDC,
+                            SWR_PS_CONTEXT& psContext,
+                            uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS],
+                            uint32_t               sample,
+                            const SWR_BLEND_STATE* pBlendState,
+                            const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS],
+                            simdscalar&       coverageMask,
+                            simdscalar const& depthPassMask,
+                            uint32_t          renderTargetMask,
+                            bool              useAlternateOffset,
+                            uint32_t          workerId)
 {
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     uint32_t rasterTileColorOffset = RasterTileColorOffset(sample);
@@ -799,150 +1004,180 @@
     {
         renderTargetMask &= ~(1 << rt);
 
-        const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+        const SWR_RENDER_TARGET_BLEND_STATE* pRTBlend = &pBlendState->renderTarget[rt];
 
         simdscalar* pColorSample;
-        bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
+        bool        hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed ||
+                             !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue;
         if (hotTileEnable)
         {
-            pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset);
-            blendSrc[0] = pColorSample[0];
-            blendSrc[1] = pColorSample[2];
-            blendSrc[2] = pColorSample[4];
-            blendSrc[3] = pColorSample[6];
+            pColorSample = reinterpret_cast<simdscalar*>(pColorBase[rt] + rasterTileColorOffset);
+            blendSrc[0]  = pColorSample[0];
+            blendSrc[1]  = pColorSample[2];
+            blendSrc[2]  = pColorSample[4];
+            blendSrc[3]  = pColorSample[6];
         }
         else
         {
             pColorSample = nullptr;
         }
 
-        SWR_BLEND_CONTEXT blendContext = { 0 };
+        SWR_BLEND_CONTEXT blendContext = {0};
         {
             // pfnBlendFunc may not update all channels.  Initialize with PS output.
             /// TODO: move this into the blend JIT.
             blendOut = psContext.shaded[rt];
 
-            blendContext.pBlendState    = pBlendState;
-            blendContext.src            = &psContext.shaded[rt];
-            blendContext.src1           = &psContext.shaded[1];
-            blendContext.src0alpha      = reinterpret_cast<simdvector *>(&psContext.shaded[0].w);
-            blendContext.sampleNum      = sample;
-            blendContext.pDst           = &blendSrc;
-            blendContext.result         = &blendOut;
-            blendContext.oMask          = &psContext.oMask;
-            blendContext.pMask          = reinterpret_cast<simdscalari *>(&coverageMask);
+            blendContext.pBlendState = pBlendState;
+            blendContext.src         = &psContext.shaded[rt];
+            blendContext.src1        = &psContext.shaded[1];
+            blendContext.src0alpha   = reinterpret_cast<simdvector*>(&psContext.shaded[0].w);
+            blendContext.sampleNum   = sample;
+            blendContext.pDst        = &blendSrc;
+            blendContext.result      = &blendOut;
+            blendContext.oMask       = &psContext.oMask;
+            blendContext.pMask       = reinterpret_cast<simdscalari*>(&coverageMask);
 
             // Blend outputs and update coverage mask for alpha test
-            if(pfnBlendFunc[rt] != nullptr)
+            if (pfnBlendFunc[rt] != nullptr)
             {
                 pfnBlendFunc[rt](&blendContext);
             }
         }
 
         // Track alpha events
-        AR_EVENT(AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
+        AR_EVENT(
+            AlphaInfoEvent(pDC->drawId, blendContext.isAlphaTested, blendContext.isAlphaBlended));
 
-        // final write mask 
+        // final write mask
         simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask));
 
         ///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT.
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
+                      "Unsupported hot tile format");
 
         // store with color mask
         if (!pRTBlend->writeDisableRed)
         {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[0]), outputMask, blendOut.x);
+            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[0]), outputMask, blendOut.x);
         }
         if (!pRTBlend->writeDisableGreen)
         {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[2]), outputMask, blendOut.y);
+            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[2]), outputMask, blendOut.y);
         }
         if (!pRTBlend->writeDisableBlue)
         {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[4]), outputMask, blendOut.z);
+            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[4]), outputMask, blendOut.z);
         }
         if (!pRTBlend->writeDisableAlpha)
         {
-            _simd_maskstore_ps(reinterpret_cast<float *>(&pColorSample[6]), outputMask, blendOut.w);
+            _simd_maskstore_ps(reinterpret_cast<float*>(&pColorSample[6]), outputMask, blendOut.w);
         }
     }
 }
 
 #endif
 
-template<typename T>
-void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+template <typename T>
+void BackendPixelRate(DRAW_CONTEXT*        pDC,
+                      uint32_t             workerId,
+                      uint32_t             x,
+                      uint32_t             y,
+                      SWR_TRIANGLE_DESC&   work,
+                      RenderOutputBuffers& renderBuffers)
 {
-    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend
+    ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the
+    /// backend
 
 
     RDTSC_BEGIN(BEPixelRateBackend, pDC->drawId);
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
-    const API_STATE &state = GetApiState(pDC);
+    const API_STATE& state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
 
-    SWR_CONTEXT *pContext = pDC->pContext;
-    void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+    SWR_CONTEXT* pContext    = pDC->pContext;
+    void*        pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
-    SWR_PS_CONTEXT psContext;
+    SWR_PS_CONTEXT             psContext;
     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer,
+                       &pDepthBuffer,
+                       &pStencilBuffer,
+                       state.colorHottileEnable,
+                       renderBuffers);
 
     RDTSC_END(BESetup, 0);
 
-    PixelRateZTestLoop<T> PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.backendState.clipDistanceMask);
+    PixelRateZTestLoop<T> PixelRateZTest(pDC,
+                                         workerId,
+                                         work,
+                                         coeffs,
+                                         state,
+                                         pDepthBuffer,
+                                         pStencilBuffer,
+                                         state.backendState.clipDistanceMask);
 
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
-    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 
         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
 
-        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
 #if USE_8x2_TILE_BACKEND
             const bool useAlternateOffset = ((xx & SIMD_TILE_X_DIM) != 0);
 #endif
             simdscalar activeLanes;
-            if(!(work.anyCoveredSamples & MASK)) {goto Endtile;};
+            if (!(work.anyCoveredSamples & MASK))
+            {
+                goto Endtile;
+            };
             activeLanes = _simd_vmask_ps(work.anyCoveredSamples & MASK);
 
             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+                const uint64_t* pCoverageMask =
+                    (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+                        ? &work.innerCoverageMask
+                        : &work.coverageMask[0];
 
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+                generateInputCoverage<T, T::InputCoverage>(
+                    pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
             }
 
             RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
             CalcPixelBarycentrics(coeffs, psContext);
 
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+            CalcCentroid<T, false>(
+                &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 
             RDTSC_END(BEBarycentric, 0);
 
-            if(T::bForcedSampleCount)
+            if (T::bForcedSampleCount)
             {
-                // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
-                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
-                activeLanes = _simd_and_ps(activeLanes, vSampleMask);
+                // candidate pixels (that passed coverage) will cause shader invocation if any bits
+                // in the samplemask are set
+                const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(
+                    _simd_set1_epi32(state.blendState.sampleMask), _simd_setzero_si()));
+                activeLanes                  = _simd_and_ps(activeLanes, vSampleMask);
             }
 
             // Early-Z?
-            if(T::bCanEarlyZ && !T::bForcedSampleCount)
+            if (T::bCanEarlyZ && !T::bForcedSampleCount)
             {
                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BEEarlyDepthTest);
                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
@@ -950,20 +1185,24 @@
             }
 
             // if we have no covered samples that passed depth at this point, go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+            if (!_simd_movemask_ps(activeLanes))
+            {
+                goto Endtile;
+            };
 
-            if(state.psState.usesSourceDepth)
+            if (state.psState.usesSourceDepth)
             {
                 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
                 // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = vplaneps(
+                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_END(BEBarycentric, 0);
             }
 
             // pixels that are currently active
             psContext.activeMask = _simd_castps_si(activeLanes);
-            psContext.oMask = T::MultisampleT::FullSampleMask();
+            psContext.oMask      = T::MultisampleT::FullSampleMask();
 
             // execute pixel shader
             RDTSC_BEGIN(BEPixelShader, pDC->drawId);
@@ -976,29 +1215,39 @@
             AR_EVENT(PSStats(psContext.stats.numInstExecuted));
 
             // update active lanes to remove any discarded or oMask'd pixels
-            activeLanes = _simd_castsi_ps(_simd_and_si(psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+            activeLanes = _simd_castsi_ps(_simd_and_si(
+                psContext.activeMask, _simd_cmpgt_epi32(psContext.oMask, _simd_setzero_si())));
+            if (!_simd_movemask_ps(activeLanes))
+            {
+                goto Endtile;
+            };
 
             // late-Z
-            if(!T::bCanEarlyZ && !T::bForcedSampleCount)
+            if (!T::bCanEarlyZ && !T::bForcedSampleCount)
             {
                 uint32_t depthPassCount = PixelRateZTest(activeLanes, psContext, BELateDepthTest);
                 UPDATE_STAT_BE(DepthPassCount, depthPassCount);
                 AR_EVENT(LateDepthInfoPixelRate(depthPassCount, _simd_movemask_ps(activeLanes)));
             }
 
-            // if we have no covered samples that passed depth at this point, skip OM and go to next tile
-            if(!_simd_movemask_ps(activeLanes)) { goto Endtile; };
+            // if we have no covered samples that passed depth at this point, skip OM and go to next
+            // tile
+            if (!_simd_movemask_ps(activeLanes))
+            {
+                goto Endtile;
+            };
 
             // output merger
             // loop over all samples, broadcasting the results of the PS to all passing pixels
-            for(uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount); sample++)
+            for (uint32_t sample = 0; sample < GetNumOMSamples<T>(state.blendState.sampleCount);
+                 sample++)
             {
                 RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
-                // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
-                uint32_t coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
+                // center pattern does a single coverage/depth/stencil test, standard pattern tests
+                // all samples
+                uint32_t   coverageSampleNum = (T::bIsCenterPattern) ? 0 : sample;
                 simdscalar coverageMask, depthMask;
-                if(T::bForcedSampleCount)
+                if (T::bForcedSampleCount)
                 {
                     coverageMask = depthMask = activeLanes;
                 }
@@ -1006,40 +1255,66 @@
                 {
                     coverageMask = PixelRateZTest.vCoverageMask[coverageSampleNum];
                     depthMask = PixelRateZTest.depthPassMask[coverageSampleNum];
-                    if(!_simd_movemask_ps(depthMask))
+                    if (!_simd_movemask_ps(depthMask))
                     {
                         // stencil should already have been written in early/lateZ tests
                         RDTSC_END(BEOutputMerger, 0);
                         continue;
                     }
                 }
-                
+
                 // broadcast the results of the PS to all passing pixels
 #if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset, workerId);
-#else // USE_8x2_TILE_BACKEND
-                OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, workerId);
+                OutputMerger8x2(pDC,
+                                psContext,
+                                psContext.pColorBuffer,
+                                sample,
+                                &state.blendState,
+                                state.pfnBlendFunc,
+                                coverageMask,
+                                depthMask,
+                                state.psState.renderTargetMask,
+                                useAlternateOffset,
+                                workerId);
+#else  // USE_8x2_TILE_BACKEND
+                OutputMerger4x2(pDC,
+                                psContext,
+                                psContext.pColorBuffer,
+                                sample,
+                                &state.blendState,
+                                state.pfnBlendFunc,
+                                coverageMask,
+                                depthMask,
+                                state.psState.renderTargetMask,
+                                workerId);
 #endif // USE_8x2_TILE_BACKEND
 
-                if(!state.psState.forceEarlyZ && !T::bForcedSampleCount)
+                if (!state.psState.forceEarlyZ && !T::bForcedSampleCount)
                 {
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t * pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
-                                      pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                      &state.depthStencilState,
+                                      work.triFlags.frontFacing,
+                                      PixelRateZTest.vZ[coverageSampleNum],
+                                      pDepthSample,
+                                      depthMask,
+                                      coverageMask,
+                                      pStencilSample,
+                                      PixelRateZTest.stencilPassMask[coverageSampleNum]);
                 }
                 RDTSC_END(BEOutputMerger, 0);
             }
-Endtile:
+        Endtile:
             RDTSC_BEGIN(BEEndTile, pDC->drawId);
 
-            for(uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
+            for (uint32_t sample = 0; sample < T::MultisampleT::numCoverageSamples; sample++)
             {
                 work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
 
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -1048,48 +1323,55 @@
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                DWORD rt;
+                DWORD    rt;
                 uint32_t rtMask = state.colorHottileEnable;
                 while (_BitScanForward(&rt, rtMask))
                 {
                     rtMask &= ~(1 << rt);
-                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    psContext.pColorBuffer[rt] +=
+                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
-            DWORD rt;
+            DWORD    rt;
             uint32_t rtMask = state.colorHottileEnable;
             while (_BitScanForward(&rt, rtMask))
             {
                 rtMask &= ~(1 << rt);
-                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                psContext.pColorBuffer[rt] +=
+                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer +=
+                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
             RDTSC_END(BEEndTile, 0);
 
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
 
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
     RDTSC_END(BEPixelRateBackend, 0);
 }
 
-template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t isCenter = 0,
-         uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0
-    >
+template <uint32_t sampleCountT = SWR_MULTISAMPLE_1X, // cast to SWR_MULTISAMPLE_COUNT for MultisampleT
+          uint32_t isCenter     = 0, // the four flag parameters count as "on" only when equal to 1
+          uint32_t coverage     = 0, // exposed unchanged as InputCoverage
+          uint32_t centroid     = 0,
+          uint32_t forced       = 0,
+          uint32_t canEarlyZ    = 0
+          >
 struct SwrBackendTraits
 {
-    static const bool bIsCenterPattern = (isCenter == 1);
-    static const uint32_t InputCoverage = coverage;
-    static const bool bCentroidPos = (centroid == 1);
-    static const bool bForcedSampleCount = (forced == 1);
-    static const bool bCanEarlyZ = (canEarlyZ == 1);
+    static const bool     bIsCenterPattern   = (isCenter == 1); // also the second MultisampleTraits argument below
+    static const uint32_t InputCoverage      = coverage; // kept as the raw value, not reduced to a bool
+    static const bool     bCentroidPos       = (centroid == 1);
+    static const bool     bForcedSampleCount = (forced == 1);
+    static const bool     bCanEarlyZ         = (canEarlyZ == 1);
     typedef MultisampleTraits<(SWR_MULTISAMPLE_COUNT)sampleCountT, bIsCenterPattern> MultisampleT;
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
index 5940aa7..a1a1185 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_sample.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.cpp
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend_sample.cpp
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 
 #include <smmintrin.h>
 
@@ -37,35 +37,44 @@
 
 #include <algorithm>
 
-template<typename T>
-void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+template <typename T>
+void BackendSampleRate(DRAW_CONTEXT*        pDC,
+                       uint32_t             workerId,
+                       uint32_t             x,
+                       uint32_t             y,
+                       SWR_TRIANGLE_DESC&   work,
+                       RenderOutputBuffers& renderBuffers)
 {
     RDTSC_BEGIN(BESampleRateBackend, pDC->drawId);
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
-    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
-    const API_STATE &state = GetApiState(pDC);
+    void* pWorkerData      = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
+    const API_STATE& state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
 
-    SWR_PS_CONTEXT psContext;
+    SWR_PS_CONTEXT             psContext;
     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer,
+                       &pDepthBuffer,
+                       &pStencilBuffer,
+                       state.colorHottileEnable,
+                       renderBuffers);
 
     RDTSC_END(BESetup, 0);
 
-    psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 
         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
@@ -77,16 +86,21 @@
 #endif
             if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
             {
-                const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+                const uint64_t* pCoverageMask =
+                    (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+                        ? &work.innerCoverageMask
+                        : &work.coverageMask[0];
 
-                generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+                generateInputCoverage<T, T::InputCoverage>(
+                    pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
             }
 
             RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
             CalcPixelBarycentrics(coeffs, psContext);
 
-            CalcCentroid<T, false>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+            CalcCentroid<T, false>(
+                &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 
             RDTSC_END(BEBarycentric, 0);
 
@@ -97,14 +111,16 @@
                 if (coverageMask)
                 {
                     // offset depth/stencil buffers current sample
-                    uint8_t *pDepthSample = pDepthBuffer + RasterTileDepthOffset(sample);
-                    uint8_t *pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
+                    uint8_t* pDepthSample   = pDepthBuffer + RasterTileDepthOffset(sample);
+                    uint8_t* pStencilSample = pStencilBuffer + RasterTileStencilOffset(sample);
 
                     if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                     {
-                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+                        static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
+                                      "Unsupported depth hot tile format");
 
-                        const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthSample));
+                        const simdscalar z =
+                            _simd_load_ps(reinterpret_cast<const float*>(pDepthSample));
 
                         const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                         const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
@@ -121,7 +137,11 @@
                     CalcSampleBarycentrics(coeffs, psContext);
 
                     // interpolate and quantize z
-                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = vplaneps(coeffs.vZa,
+                                            coeffs.vZb,
+                                            coeffs.vZc,
+                                            psContext.vI.sample,
+                                            psContext.vJ.sample);
                     psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_END(BEBarycentric, 0);
@@ -129,27 +149,45 @@
                     // interpolate user clip distance if available
                     if (state.backendState.clipDistanceMask)
                     {
-                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.sample, psContext.vJ.sample);
+                        coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
+                                                             work.pUserClipBuffer,
+                                                             psContext.vI.sample,
+                                                             psContext.vJ.sample);
                     }
 
-                    simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
-                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
+                    simdscalar depthPassMask   = vCoverageMask;
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // Early-Z?
                     if (T::bCanEarlyZ)
                     {
                         RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        depthPassMask = DepthStencilTest(&state,
+                                                         work.triFlags.frontFacing,
+                                                         work.triFlags.viewportIndex,
+                                                         psContext.vZ,
+                                                         pDepthSample,
+                                                         vCoverageMask,
+                                                         pStencilSample,
+                                                         &stencilPassMask);
+                        AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
+                                                                 _simd_movemask_ps(stencilPassMask),
+                                                                 _simd_movemask_ps(vCoverageMask)));
                         RDTSC_END(BEEarlyDepthTest, 0);
 
                         // early-exit if no samples passed depth or earlyZ is forced on.
                         if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                         {
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                              &state.depthStencilState,
+                                              work.triFlags.frontFacing,
+                                              psContext.vZ,
+                                              pDepthSample,
+                                              depthPassMask,
+                                              vCoverageMask,
+                                              pStencilSample,
+                                              stencilPassMask);
 
                             if (!_simd_movemask_ps(depthPassMask))
                             {
@@ -160,7 +198,7 @@
                     }
 
                     psContext.sampleIndex = sample;
-                    psContext.activeMask = _simd_castps_si(vCoverageMask);
+                    psContext.activeMask  = _simd_castps_si(vCoverageMask);
 
                     // execute pixel shader
                     RDTSC_BEGIN(BEPixelShader, pDC->drawId);
@@ -177,39 +215,80 @@
                     if (!T::bCanEarlyZ)
                     {
                         RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
-                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                            psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
-                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                        depthPassMask = DepthStencilTest(&state,
+                                                         work.triFlags.frontFacing,
+                                                         work.triFlags.viewportIndex,
+                                                         psContext.vZ,
+                                                         pDepthSample,
+                                                         vCoverageMask,
+                                                         pStencilSample,
+                                                         &stencilPassMask);
+                        AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask),
+                                                                _simd_movemask_ps(stencilPassMask),
+                                                                _simd_movemask_ps(vCoverageMask)));
                         RDTSC_END(BELateDepthTest, 0);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
                             // need to call depth/stencil write for stencil write
-                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                                pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                            DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                              &state.depthStencilState,
+                                              work.triFlags.frontFacing,
+                                              psContext.vZ,
+                                              pDepthSample,
+                                              depthPassMask,
+                                              vCoverageMask,
+                                              pStencilSample,
+                                              stencilPassMask);
 
                             work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
                             continue;
                         }
                     }
 
-                    uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                    uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                     uint32_t statCount = _mm_popcnt_u32(statMask);
                     UPDATE_STAT_BE(DepthPassCount, statCount);
 
                     // output merger
                     RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                    OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId);
+                    OutputMerger8x2(pDC,
+                                    psContext,
+                                    psContext.pColorBuffer,
+                                    sample,
+                                    &state.blendState,
+                                    state.pfnBlendFunc,
+                                    vCoverageMask,
+                                    depthPassMask,
+                                    state.psState.renderTargetMask,
+                                    useAlternateOffset,
+                                    workerId);
 #else
-                    OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId);
+                    OutputMerger4x2(pDC,
+                                    psContext,
+                                    psContext.pColorBuffer,
+                                    sample,
+                                    &state.blendState,
+                                    state.pfnBlendFunc,
+                                    vCoverageMask,
+                                    depthPassMask,
+                                    state.psState.renderTargetMask,
+                                    workerId);
 #endif
 
                     // do final depth write after all pixel kills
                     if (!state.psState.forceEarlyZ)
                     {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                          &state.depthStencilState,
+                                          work.triFlags.frontFacing,
+                                          psContext.vZ,
+                                          pDepthSample,
+                                          depthPassMask,
+                                          vCoverageMask,
+                                          pStencilSample,
+                                          stencilPassMask);
                     }
                     RDTSC_END(BEOutputMerger, 0);
                 }
@@ -229,12 +308,13 @@
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                DWORD rt;
+                DWORD    rt;
                 uint32_t rtMask = state.colorHottileEnable;
                 while (_BitScanForward(&rt, rtMask))
                 {
                     rtMask &= ~(1 << rt);
-                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    psContext.pColorBuffer[rt] +=
+                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
@@ -243,19 +323,21 @@
             while (_BitScanForward(&rt, rtMask))
             {
                 rtMask &= ~(1 << rt);
-                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                psContext.pColorBuffer[rt] +=
+                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer +=
+                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
             RDTSC_END(BEEndTile, 0);
 
-            psContext.vX.UL = _simd_add_ps(psContext.vX.UL, dx);
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
 
-        psContext.vY.UL = _simd_add_ps(psContext.vY.UL, dy);
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
@@ -272,7 +354,9 @@
     {
         switch (tArg)
         {
-        case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_MSAA_SAMPLE_RATE:
+            return BackendSampleRate<SwrBackendTraits<ArgsT...>>;
+            break;
         case SWR_BACKEND_SINGLE_SAMPLE:
         case SWR_BACKEND_MSAA_PIXEL_RATE:
             SWR_ASSERT(0 && "Invalid backend func\n");
@@ -291,12 +375,22 @@
     {
         switch (tArg)
         {
-        case SWR_INPUT_COVERAGE_NONE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NONE:
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
+                remainingArgs...);
+            break;
+        case SWR_INPUT_COVERAGE_NORMAL:
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
+                remainingArgs...);
+            break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
+                remainingArgs...);
+            break;
         default:
             SWR_ASSERT(0 && "Invalid sample pattern\n");
-            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
+            return BEChooserSampleRate<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
+                remainingArgs...);
             break;
         }
     }
@@ -307,11 +401,21 @@
     {
         switch (tArg)
         {
-        case SWR_MULTISAMPLE_1X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_1X:
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_2X:
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_4X:
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_8X:
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_16X:
+            return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
+            break;
         default:
             SWR_ASSERT(0 && "Invalid sample count\n");
             return BEChooserSampleRate<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
@@ -332,9 +436,11 @@
     }
 };
 
-void InitBackendSampleFuncTable(PFN_BACKEND_FUNC(&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
+void InitBackendSampleFuncTable(
+    PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2])
 {
-    for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++)
+    for (uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT;
+         sampleCount++)
     {
         for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
         {
@@ -343,8 +449,14 @@
                 for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
                 {
                     table[sampleCount][inputCoverage][centroid][canEarlyZ] =
-                        BEChooserSampleRate<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, false, (SWR_INPUT_COVERAGE)inputCoverage,
-                        (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
+                        BEChooserSampleRate<>::GetFunc(
+                            (SWR_MULTISAMPLE_COUNT)sampleCount,
+                            false,
+                            (SWR_INPUT_COVERAGE)inputCoverage,
+                            (centroid > 0),
+                            false,
+                            (canEarlyZ > 0),
+                            (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE);
                 }
             }
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
index aaaba63..2efb01f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_singlesample.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file backend.cpp
-*
-* @brief Backend handles rasterization, pixel shading and output merger
-*        operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file backend_singlesample.cpp
+ *
+ * @brief Backend handles rasterization, pixel shading and output merger
+ *        operations.
+ *
+ ******************************************************************************/
 
 #include <smmintrin.h>
 
@@ -37,36 +37,45 @@
 
 #include <algorithm>
 
-template<typename T>
-void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers)
+template <typename T>
+void BackendSingleSample(DRAW_CONTEXT*        pDC,
+                         uint32_t             workerId,
+                         uint32_t             x,
+                         uint32_t             y,
+                         SWR_TRIANGLE_DESC&   work,
+                         RenderOutputBuffers& renderBuffers)
 {
     RDTSC_BEGIN(BESingleSampleBackend, pDC->drawId);
     RDTSC_BEGIN(BESetup, pDC->drawId);
 
     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
-    const API_STATE &state = GetApiState(pDC);
+    const API_STATE& state = GetApiState(pDC);
 
     BarycentricCoeffs coeffs;
     SetupBarycentricCoeffs(&coeffs, work);
 
-    SWR_PS_CONTEXT psContext;
+    SWR_PS_CONTEXT             psContext;
     const SWR_MULTISAMPLE_POS& samplePos = state.rastState.samplePositions;
     SetupPixelShaderContext<T>(&psContext, samplePos, work);
 
     uint8_t *pDepthBuffer, *pStencilBuffer;
-    SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers);
+    SetupRenderBuffers(psContext.pColorBuffer,
+                       &pDepthBuffer,
+                       &pStencilBuffer,
+                       state.colorHottileEnable,
+                       renderBuffers);
 
     RDTSC_END(BESetup, 1);
 
-    psContext.vY.UL     = _simd_add_ps(vULOffsetsY,     _simd_set1_ps(static_cast<float>(y)));
+    psContext.vY.UL     = _simd_add_ps(vULOffsetsY, _simd_set1_ps(static_cast<float>(y)));
     psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps(static_cast<float>(y)));
 
     const simdscalar dy = _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM));
 
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vX.UL     = _simd_add_ps(vULOffsetsX,     _simd_set1_ps(static_cast<float>(x)));
+        psContext.vX.UL     = _simd_add_ps(vULOffsetsX, _simd_set1_ps(static_cast<float>(x)));
         psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps(static_cast<float>(x)));
 
         const simdscalar dx = _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM));
@@ -82,9 +91,11 @@
             {
                 if (state.depthHottileEnable && state.depthBoundsState.depthBoundsTestEnable)
                 {
-                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
+                    static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT,
+                                  "Unsupported depth hot tile format");
 
-                    const simdscalar z = _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer));
+                    const simdscalar z =
+                        _simd_load_ps(reinterpret_cast<const float*>(pDepthBuffer));
 
                     const float minz = state.depthBoundsState.depthBoundsTestMinValue;
                     const float maxz = state.depthBoundsState.depthBoundsTestMaxValue;
@@ -94,19 +105,25 @@
 
                 if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE)
                 {
-                    const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : &work.coverageMask[0];
+                    const uint64_t* pCoverageMask =
+                        (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+                            ? &work.innerCoverageMask
+                            : &work.coverageMask[0];
 
-                    generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
+                    generateInputCoverage<T, T::InputCoverage>(
+                        pCoverageMask, psContext.inputMask, state.blendState.sampleMask);
                 }
 
                 RDTSC_BEGIN(BEBarycentric, pDC->drawId);
 
                 CalcPixelBarycentrics(coeffs, psContext);
 
-                CalcCentroid<T, true>(&psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
+                CalcCentroid<T, true>(
+                    &psContext, samplePos, coeffs, work.coverageMask, state.blendState.sampleMask);
 
                 // interpolate and quantize z
-                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = vplaneps(
+                    coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
                 psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                 RDTSC_END(BEBarycentric, 1);
@@ -114,27 +131,45 @@
                 // interpolate user clip distance if available
                 if (state.backendState.clipDistanceMask)
                 {
-                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask, work.pUserClipBuffer, psContext.vI.center, psContext.vJ.center);
+                    coverageMask &= ~ComputeUserClipMask(state.backendState.clipDistanceMask,
+                                                         work.pUserClipBuffer,
+                                                         psContext.vI.center,
+                                                         psContext.vJ.center);
                 }
 
-                simdscalar vCoverageMask = _simd_vmask_ps(coverageMask);
-                simdscalar depthPassMask = vCoverageMask;
+                simdscalar vCoverageMask   = _simd_vmask_ps(coverageMask);
+                simdscalar depthPassMask   = vCoverageMask;
                 simdscalar stencilPassMask = vCoverageMask;
 
                 // Early-Z?
                 if (T::bCanEarlyZ)
                 {
                     RDTSC_BEGIN(BEEarlyDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                     psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    depthPassMask = DepthStencilTest(&state,
+                                                     work.triFlags.frontFacing,
+                                                     work.triFlags.viewportIndex,
+                                                     psContext.vZ,
+                                                     pDepthBuffer,
+                                                     vCoverageMask,
+                                                     pStencilBuffer,
+                                                     &stencilPassMask);
+                    AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
+                                                               _simd_movemask_ps(stencilPassMask),
+                                                               _simd_movemask_ps(vCoverageMask)));
                     RDTSC_END(BEEarlyDepthTest, 0);
 
                     // early-exit if no pixels passed depth or earlyZ is forced on
                     if (state.psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask))
                     {
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                          &state.depthStencilState,
+                                          work.triFlags.frontFacing,
+                                          psContext.vZ,
+                                          pDepthBuffer,
+                                          depthPassMask,
+                                          vCoverageMask,
+                                          pStencilBuffer,
+                                          stencilPassMask);
 
                         if (!_simd_movemask_ps(depthPassMask))
                         {
@@ -144,7 +179,7 @@
                 }
 
                 psContext.sampleIndex = 0;
-                psContext.activeMask = _simd_castps_si(vCoverageMask);
+                psContext.activeMask  = _simd_castps_si(vCoverageMask);
 
                 // execute pixel shader
                 RDTSC_BEGIN(BEPixelShader, pDC->drawId);
@@ -161,50 +196,94 @@
                 if (!T::bCanEarlyZ)
                 {
                     RDTSC_BEGIN(BELateDepthTest, pDC->drawId);
-                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
-                                                        psContext.vZ, pDepthBuffer, vCoverageMask, pStencilBuffer, &stencilPassMask);
-                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask), _simd_movemask_ps(stencilPassMask), _simd_movemask_ps(vCoverageMask)));
+                    depthPassMask = DepthStencilTest(&state,
+                                                     work.triFlags.frontFacing,
+                                                     work.triFlags.viewportIndex,
+                                                     psContext.vZ,
+                                                     pDepthBuffer,
+                                                     vCoverageMask,
+                                                     pStencilBuffer,
+                                                     &stencilPassMask);
+                    AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask),
+                                                              _simd_movemask_ps(stencilPassMask),
+                                                              _simd_movemask_ps(vCoverageMask)));
                     RDTSC_END(BELateDepthTest, 0);
 
                     if (!_simd_movemask_ps(depthPassMask))
                     {
                         // need to call depth/stencil write for stencil write
-                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                            pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                        DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                          &state.depthStencilState,
+                                          work.triFlags.frontFacing,
+                                          psContext.vZ,
+                                          pDepthBuffer,
+                                          depthPassMask,
+                                          vCoverageMask,
+                                          pStencilBuffer,
+                                          stencilPassMask);
                         goto Endtile;
                     }
-                } else {
+                }
+                else
+                {
                     // for early z, consolidate discards from shader
                     // into depthPassMask
                     depthPassMask = _simd_and_ps(depthPassMask, vCoverageMask);
                 }
 
-                uint32_t statMask = _simd_movemask_ps(depthPassMask);
+                uint32_t statMask  = _simd_movemask_ps(depthPassMask);
                 uint32_t statCount = _mm_popcnt_u32(statMask);
                 UPDATE_STAT_BE(DepthPassCount, statCount);
 
                 // output merger
                 RDTSC_BEGIN(BEOutputMerger, pDC->drawId);
 #if USE_8x2_TILE_BACKEND
-                OutputMerger8x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset, workerId);
+                OutputMerger8x2(pDC,
+                                psContext,
+                                psContext.pColorBuffer,
+                                0,
+                                &state.blendState,
+                                state.pfnBlendFunc,
+                                vCoverageMask,
+                                depthPassMask,
+                                state.psState.renderTargetMask,
+                                useAlternateOffset,
+                                workerId);
 #else
-                OutputMerger4x2(pDC, psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, workerId, workerId);
+                OutputMerger4x2(pDC,
+                                psContext,
+                                psContext.pColorBuffer,
+                                0,
+                                &state.blendState,
+                                state.pfnBlendFunc,
+                                vCoverageMask,
+                                depthPassMask,
+                                state.psState.renderTargetMask,
+                                workerId,
+                                workerId);
 #endif
 
                 // do final depth write after all pixel kills
                 if (!state.psState.forceEarlyZ)
                 {
-                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
-                        pDepthBuffer, depthPassMask, vCoverageMask, pStencilBuffer, stencilPassMask);
+                    DepthStencilWrite(&state.vp[work.triFlags.viewportIndex],
+                                      &state.depthStencilState,
+                                      work.triFlags.frontFacing,
+                                      psContext.vZ,
+                                      pDepthBuffer,
+                                      depthPassMask,
+                                      vCoverageMask,
+                                      pStencilBuffer,
+                                      stencilPassMask);
                 }
                 RDTSC_END(BEOutputMerger, 0);
             }
 
-Endtile:
+        Endtile:
             RDTSC_BEGIN(BEEndTile, pDC->drawId);
 
             work.coverageMask[0] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
-            if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
+            if (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE)
             {
                 work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
             }
@@ -212,12 +291,13 @@
 #if USE_8x2_TILE_BACKEND
             if (useAlternateOffset)
             {
-                DWORD rt;
+                DWORD    rt;
                 uint32_t rtMask = state.colorHottileEnable;
-                while(_BitScanForward(&rt, rtMask))
+                while (_BitScanForward(&rt, rtMask))
                 {
                     rtMask &= ~(1 << rt);
-                    psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                    psContext.pColorBuffer[rt] +=
+                        (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
                 }
             }
 #else
@@ -226,19 +306,21 @@
             while (_BitScanForward(&rt, rtMask))
             {
                 rtMask &= ~(1 << rt);
-                psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
+                psContext.pColorBuffer[rt] +=
+                    (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8;
             }
 #endif
             pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8;
-            pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
+            pStencilBuffer +=
+                (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8;
 
             RDTSC_END(BEEndTile, 0);
 
-            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL,     dx);
+            psContext.vX.UL     = _simd_add_ps(psContext.vX.UL, dx);
             psContext.vX.center = _simd_add_ps(psContext.vX.center, dx);
         }
 
-        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL,     dy);
+        psContext.vY.UL     = _simd_add_ps(psContext.vY.UL, dy);
         psContext.vY.center = _simd_add_ps(psContext.vY.center, dy);
     }
 
@@ -253,9 +335,11 @@
     // Last Arg Terminator
     static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg)
     {
-        switch(tArg)
+        switch (tArg)
         {
-        case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<SwrBackendTraits<ArgsT...>>; break;
+        case SWR_BACKEND_SINGLE_SAMPLE:
+            return BackendSingleSample<SwrBackendTraits<ArgsT...>>;
+            break;
         case SWR_BACKEND_MSAA_PIXEL_RATE:
         case SWR_BACKEND_MSAA_SAMPLE_RATE:
         default:
@@ -269,15 +353,25 @@
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs)
     {
-        switch(tArg)
+        switch (tArg)
         {
-        case SWR_INPUT_COVERAGE_NONE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_NORMAL: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break;
-        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break;
+        case SWR_INPUT_COVERAGE_NONE:
+            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
+                remainingArgs...);
+            break;
+        case SWR_INPUT_COVERAGE_NORMAL:
+            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(
+                remainingArgs...);
+            break;
+        case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE:
+            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(
+                remainingArgs...);
+            break;
         default:
-        SWR_ASSERT(0 && "Invalid sample pattern\n");
-        return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...);
-        break;
+            SWR_ASSERT(0 && "Invalid sample pattern\n");
+            return BEChooserSingleSample<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(
+                remainingArgs...);
+            break;
         }
     }
 
@@ -285,17 +379,27 @@
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs)
     {
-        switch(tArg)
+        switch (tArg)
         {
-        case SWR_MULTISAMPLE_1X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_2X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_4X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_8X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break;
-        case SWR_MULTISAMPLE_16X: return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break;
+        case SWR_MULTISAMPLE_1X:
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_2X:
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_4X:
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_8X:
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...);
+            break;
+        case SWR_MULTISAMPLE_16X:
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...);
+            break;
         default:
-        SWR_ASSERT(0 && "Invalid sample count\n");
-        return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
-        break;
+            SWR_ASSERT(0 && "Invalid sample count\n");
+            return BEChooserSingleSample<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...);
+            break;
         }
     }
 
@@ -303,7 +407,7 @@
     template <typename... TArgsT>
     static PFN_BACKEND_FUNC GetFunc(bool tArg, TArgsT... remainingArgs)
     {
-        if(tArg == true)
+        if (tArg == true)
         {
             return BEChooserSingleSample<ArgsT..., 1>::GetFunc(remainingArgs...);
         }
@@ -314,15 +418,20 @@
 
 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2])
 {
-    for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
+    for (uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++)
     {
-        for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
+        for (uint32_t isCentroid = 0; isCentroid < 2; isCentroid++)
         {
-            for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
+            for (uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++)
             {
                 table[inputCoverage][isCentroid][canEarlyZ] =
-                    BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X, false, (SWR_INPUT_COVERAGE)inputCoverage,
-                                         (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE);
+                    BEChooserSingleSample<>::GetFunc(SWR_MULTISAMPLE_1X,
+                                                     false,
+                                                     (SWR_INPUT_COVERAGE)inputCoverage,
+                                                     (isCentroid > 0),
+                                                     false,
+                                                     (canEarlyZ > 0),
+                                                     SWR_BACKEND_SINGLE_SAMPLE);
             }
         }
     }
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 9f8dc88..6d9680b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file binner.cpp
-*
-* @brief Implementation for the macrotile binner
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file binner.cpp
+ *
+ * @brief Implementation for the macrotile binner
+ *
+ ******************************************************************************/
 
 #include "binner.h"
 #include "context.h"
@@ -37,27 +37,25 @@
 
 // Function Prototype
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupLinesImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[],
-    Float<SIMD_T> recipW[],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx);
+void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
+                           PA_STATE&              pa,
+                           uint32_t               workerId,
+                           Vec4<SIMD_T>           prim[],
+                           Float<SIMD_T>          recipW[],
+                           uint32_t               primMask,
+                           Integer<SIMD_T> const& primID,
+                           Integer<SIMD_T> const& viewportIdx,
+                           Integer<SIMD_T> const& rtIdx);
 
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupPointsImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx);
+void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
+                            PA_STATE&              pa,
+                            uint32_t               workerId,
+                            Vec4<SIMD_T>           prim[],
+                            uint32_t               primMask,
+                            Integer<SIMD_T> const& primID,
+                            Integer<SIMD_T> const& viewportIdx,
+                            Integer<SIMD_T> const& rtIdx);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Processes attributes for the backend based on linkage mask and
@@ -68,26 +66,23 @@
 /// @param pLinkageMap - maps VS attribute slot to PS slot
 /// @param triIndex - Triangle to process attributes for
 /// @param pBuffer - Output result
-template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
+template <typename NumVertsT,
+          typename IsSwizzledT,
+          typename HasConstantInterpT,
+          typename IsDegenerate>
 INLINE void ProcessAttributes(
-    DRAW_CONTEXT *pDC,
-    PA_STATE&pa,
-    uint32_t triIndex,
-    uint32_t primId,
-    float *pBuffer)
+    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t triIndex, uint32_t primId, float* pBuffer)
 {
     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
-    uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
+    uint32_t constantInterpMask =
+        IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
-    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
+    const PRIMITIVE_TOPOLOGY topo  = pa.binTopology;
 
     static const float constTable[3][4] = {
-        { 0.0f, 0.0f, 0.0f, 0.0f },
-        { 0.0f, 0.0f, 0.0f, 1.0f },
-        { 1.0f, 1.0f, 1.0f, 1.0f }
-    };
+        {0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 1.0f}, {1.0f, 1.0f, 1.0f, 1.0f}};
 
     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
     {
@@ -96,46 +91,45 @@
         {
             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
-
         }
         else
         {
             inputSlot = backendState.vertexAttribOffset + i;
         }
 
-        simd4scalar attrib[3];    // triangle attribs (always 4 wide)
-        float* pAttribStart = pBuffer;
+        simd4scalar attrib[3]; // triangle attribs (always 4 wide)
+        float*      pAttribStart = pBuffer;
 
         if (HasConstantInterpT::value || IsDegenerate::value)
         {
             if (CheckBit(constantInterpMask, i))
             {
-                uint32_t vid;
-                uint32_t adjustedTriIndex;
-                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
-                static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
-                static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
-                static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
-                static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
+                uint32_t              vid;
+                uint32_t              adjustedTriIndex;
+                static const uint32_t tristripProvokingVertex[]   = {0, 2, 1};
+                static const int32_t  quadProvokingTri[2][4]      = {{0, 0, 0, 1}, {0, -1, 0, 0}};
+                static const uint32_t quadProvokingVertex[2][4]   = {{0, 1, 2, 2}, {0, 1, 1, 2}};
+                static const int32_t  qstripProvokingTri[2][4]    = {{0, 0, 0, 1}, {-1, 0, 0, 0}};
+                static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};
 
-                switch (topo) {
+                switch (topo)
+                {
                 case TOP_QUAD_LIST:
                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
-                    vid = quadProvokingVertex[triIndex & 1][provokingVertex];
+                    vid              = quadProvokingVertex[triIndex & 1][provokingVertex];
                     break;
                 case TOP_QUAD_STRIP:
                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
-                    vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
+                    vid              = qstripProvokingVertex[triIndex & 1][provokingVertex];
                     break;
                 case TOP_TRIANGLE_STRIP:
                     adjustedTriIndex = triIndex;
-                    vid = (triIndex & 1)
-                        ? tristripProvokingVertex[provokingVertex]
-                        : provokingVertex;
+                    vid =
+                        (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex;
                     break;
                 default:
                     adjustedTriIndex = triIndex;
-                    vid = provokingVertex;
+                    vid              = provokingVertex;
                     break;
                 }
 
@@ -214,7 +208,7 @@
     }
 }
 
-typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
+typedef void (*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
 
 struct ProcessAttributesChooser
 {
@@ -227,9 +221,13 @@
     }
 };
 
-PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
+PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts,
+                                                bool     IsSwizzled,
+                                                bool     HasConstantInterp,
+                                                bool     IsDegenerate = false)
 {
-    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
+    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(
+        IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -240,18 +238,22 @@
 /// @param primIndex - primitive index to process
 /// @param clipDistMask - mask of enabled clip distances
 /// @param pUserClipBuffer - buffer to store results
-template<uint32_t NumVerts>
-void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
+template <uint32_t NumVerts>
+void ProcessUserClipDist(const SWR_BACKEND_STATE& state,
+                         PA_STATE&                pa,
+                         uint32_t                 primIndex,
+                         float*                   pRecipW,
+                         float*                   pUserClipBuffer)
 {
-    DWORD clipDist;
+    DWORD    clipDist;
     uint32_t clipDistMask = state.clipDistanceMask;
     while (_BitScanForward(&clipDist, clipDistMask))
     {
         clipDistMask &= ~(1 << clipDist);
         uint32_t clipSlot = clipDist >> 2;
         uint32_t clipComp = clipDist & 0x3;
-        uint32_t clipAttribSlot = clipSlot == 0 ?
-            state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
+        uint32_t clipAttribSlot =
+            clipSlot == 0 ? state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 
         simd4scalar primClipDist[3];
         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
@@ -281,30 +283,35 @@
 }
 
 INLINE
-void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
+void TransposeVertices(simd4scalar (&dst)[8],
+                       const simdscalar& src0,
+                       const simdscalar& src1,
+                       const simdscalar& src2)
 {
     vTranspose3x8(dst, src0, src1, src2);
 }
 
 INLINE
-void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
+void TransposeVertices(simd4scalar (&dst)[16],
+                       const simd16scalar& src0,
+                       const simd16scalar& src1,
+                       const simd16scalar& src2)
 {
-    vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
+    vTranspose4x16(
+        reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
 }
 
-
 #if KNOB_ENABLE_EARLY_RAST
 
 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
 
-
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct EarlyRastHelper
 {
 };
 
-template<>
+template <>
 struct EarlyRastHelper<SIMD256>
 {
     static SIMD256::Integer InitShiftCntrl()
@@ -314,7 +321,7 @@
 };
 
 #if USE_SIMD16_FRONTEND
-template<>
+template <>
 struct EarlyRastHelper<SIMD512>
 {
     static SIMD512::Integer InitShiftCntrl()
@@ -340,21 +347,22 @@
 /// @param oneTileMask - defines triangles for ER to work on
 ///                      (tris that fit into ER tile)
 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
-uint32_t SIMDCALL EarlyRasterizer(
-        SIMDBBOX_T<SIMD_T> &er_bbox,
-        Integer<SIMD_T> (&vAi)[3],
-        Integer<SIMD_T> (&vBi)[3],
-        Integer<SIMD_T> (&vXi)[3],
-        Integer<SIMD_T> (&vYi)[3],
-        uint32_t cwTrisMask,
-        uint32_t triMask,
-        uint32_t oneTileMask)
+uint32_t SIMDCALL EarlyRasterizer(SIMDBBOX_T<SIMD_T>& er_bbox,
+                                  Integer<SIMD_T> (&vAi)[3],
+                                  Integer<SIMD_T> (&vBi)[3],
+                                  Integer<SIMD_T> (&vXi)[3],
+                                  Integer<SIMD_T> (&vYi)[3],
+                                  uint32_t cwTrisMask,
+                                  uint32_t triMask,
+                                  uint32_t oneTileMask)
 {
     // step to pixel center of top-left pixel of the triangle bbox
-    Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
+    Integer<SIMD_T> vTopLeftX =
+        SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
     vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
 
-    Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
+    Integer<SIMD_T> vTopLeftY =
+        SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
     vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
 
     // negate A and B for CW tris
@@ -367,16 +375,22 @@
 
     RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
 
-    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
-    Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
-    Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
+    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
+    Integer<SIMD_T> vCwTris     = SIMD_T::set1_epi32(cwTrisMask);
+    Integer<SIMD_T> vMask       = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
 
-    vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
-    vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
-    vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
-    vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
-    vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
-    vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
+    vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
+    vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
+    vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
+    vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
+    vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
+    vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
 
     // evaluate edge equations at top-left pixel
     Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
@@ -409,9 +423,12 @@
     Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
 
     // vA < 0
-    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
-    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
-    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
+    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
+    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
+    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
 
     // vA == 0 && vB < 0
     Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
@@ -422,75 +439,77 @@
     vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
     vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
 
-    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
-    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
-    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
-
+    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
+    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
+    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(
+        SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
 
 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
     // Go down
     // coverage pixel 0
     Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
-    vMask0 = SIMD_T::and_si(vMask0, vEdge2);
+    vMask0                 = SIMD_T::and_si(vMask0, vEdge2);
 
     // coverage pixel 1
     Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
     Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
     Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
-    Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
+    Integer<SIMD_T> vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask1                  = SIMD_T::and_si(vMask1, vEdge2N);
 
     // coverage pixel 2
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
+    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
+    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
+    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
     Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
+    vMask2                 = SIMD_T::and_si(vMask2, vEdge2N);
 
     // coverage pixel 3
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
+    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vBi[0]);
+    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vBi[1]);
+    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vBi[2]);
     Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
+    vMask3                 = SIMD_T::and_si(vMask3, vEdge2N);
 
     // One step to the right and then up
 
     // coverage pixel 4
-    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
-    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
-    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
+    vEdge0N                = SIMD_T::add_epi32(vEdge0N, vAi[0]);
+    vEdge1N                = SIMD_T::add_epi32(vEdge1N, vAi[1]);
+    vEdge2N                = SIMD_T::add_epi32(vEdge2N, vAi[2]);
     Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
+    vMask4                 = SIMD_T::and_si(vMask4, vEdge2N);
 
     // coverage pixel 5
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
+    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
+    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
+    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
     Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
+    vMask5                 = SIMD_T::and_si(vMask5, vEdge2N);
 
     // coverage pixel 6
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
+    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
+    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
+    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
     Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
+    vMask6                 = SIMD_T::and_si(vMask6, vEdge2N);
 
     // coverage pixel 7
-    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
-    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
-    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
+    vEdge0N                = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
+    vEdge1N                = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
+    vEdge2N                = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
     Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
+    vMask7                 = SIMD_T::and_si(vMask7, vEdge2N);
 
     Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
-    vLit1 = SIMD_T::or_si(vLit1, vMask2);
-    vLit1 = SIMD_T::or_si(vLit1, vMask3);
-    vLit1 = SIMD_T::or_si(vLit1, vMask4);
-    vLit1 = SIMD_T::or_si(vLit1, vMask5);
-    vLit1 = SIMD_T::or_si(vLit1, vMask6);
-    vLit1 = SIMD_T::or_si(vLit1, vMask7);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask2);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask3);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask4);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask5);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask6);
+    vLit1                 = SIMD_T::or_si(vLit1, vMask7);
 
     // Step to the right and go down again
 
@@ -498,29 +517,29 @@
     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
+    vMask0  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask0  = SIMD_T::and_si(vMask0, vEdge2N);
 
     // coverage pixel 1
     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
+    vMask1  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask1  = SIMD_T::and_si(vMask1, vEdge2N);
 
     // coverage pixel 2
     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
+    vMask2  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask2  = SIMD_T::and_si(vMask2, vEdge2N);
 
     // coverage pixel 3
     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
+    vMask3  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask3  = SIMD_T::and_si(vMask3, vEdge2N);
 
     // And for the last time - to the right and up
 
@@ -528,37 +547,37 @@
     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
+    vMask4  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask4  = SIMD_T::and_si(vMask4, vEdge2N);
 
     // coverage pixel 5
     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
+    vMask5  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask5  = SIMD_T::and_si(vMask5, vEdge2N);
 
     // coverage pixel 6
     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
+    vMask6  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask6  = SIMD_T::and_si(vMask6, vEdge2N);
 
     // coverage pixel 7
     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
-    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
+    vMask7  = SIMD_T::and_si(vEdge0N, vEdge1N);
+    vMask7  = SIMD_T::and_si(vMask7, vEdge2N);
 
     Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
-    vLit2 = SIMD_T::or_si(vLit2, vMask2);
-    vLit2 = SIMD_T::or_si(vLit2, vMask3);
-    vLit2 = SIMD_T::or_si(vLit2, vMask4);
-    vLit2 = SIMD_T::or_si(vLit2, vMask5);
-    vLit2 = SIMD_T::or_si(vLit2, vMask6);
-    vLit2 = SIMD_T::or_si(vLit2, vMask7);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask2);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask3);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask4);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask5);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask6);
+    vLit2                 = SIMD_T::or_si(vLit2, vMask7);
 
     Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
 
@@ -612,7 +631,7 @@
 
 #endif
     // Check which triangles has any pixel lit
-    uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
+    uint32_t maskLit   = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
     uint32_t maskUnlit = ~maskLit & oneTileMask;
 
     uint32_t oldTriMask = triMask;
@@ -638,25 +657,24 @@
 /// @param viewportIdx - viewport array index for each triangle.
 /// @tparam CT - ConservativeRastFETraits
 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
-void SIMDCALL BinTrianglesImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> tri[3],
-    uint32_t triMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx)
+void SIMDCALL BinTrianglesImpl(DRAW_CONTEXT*          pDC,
+                               PA_STATE&              pa,
+                               uint32_t               workerId,
+                               Vec4<SIMD_T>           tri[3],
+                               uint32_t               triMask,
+                               Integer<SIMD_T> const& primID,
+                               Integer<SIMD_T> const& viewportIdx,
+                               Integer<SIMD_T> const& rtIdx)
 {
-    const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
+    const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
 
     RDTSC_BEGIN(FEBinTriangles, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
+    const API_STATE&          state     = GetApiState(pDC);
+    const SWR_RASTSTATE&      rastState = state.rastState;
+    const SWR_FRONTEND_STATE& feState   = state.frontendState;
 
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
 
     Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
     Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
@@ -724,8 +742,10 @@
     calcDeterminantIntVertical(vAi, vBi, vDet);
 
     // cull zero area
-    uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
-    uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
+    uint32_t maskLo =
+        SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
+    uint32_t maskHi =
+        SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
 
     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
 
@@ -744,13 +764,17 @@
     uint32_t frontWindingTris;
     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
     {
-        maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
-        maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
+        maskLo = SIMD_T::movemask_pd(
+            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
+        maskHi = SIMD_T::movemask_pd(
+            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
     }
     else
     {
-        maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
-        maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
+        maskLo = SIMD_T::movemask_pd(
+            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
+        maskHi = SIMD_T::movemask_pd(
+            SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
     }
     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
 
@@ -758,12 +782,24 @@
     uint32_t cullTris;
     switch ((SWR_CULLMODE)rastState.cullMode)
     {
-    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
-    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
-    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
-        // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
-    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
-    default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
+    case SWR_CULLMODE_BOTH:
+        cullTris = 0xffffffff;
+        break;
+    case SWR_CULLMODE_NONE:
+        cullTris = 0x0;
+        break;
+    case SWR_CULLMODE_FRONT:
+        cullTris = frontWindingTris;
+        break;
+        // 0 area triangles are marked as backfacing, which is required behavior for conservative
+        // rast
+    case SWR_CULLMODE_BACK:
+        cullTris = ~frontWindingTris;
+        break;
+    default:
+        SWR_INVALID("Invalid cull mode: %d", rastState.cullMode);
+        cullTris = 0x0;
+        break;
     }
 
     triMask &= ~cullTris;
@@ -777,12 +813,12 @@
 
     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
     // compute per tri backface
-    uint32_t frontFaceMask = frontWindingTris;
-    uint32_t *pPrimID = (uint32_t *)&primID;
-    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
-    DWORD triIndex = 0;
+    uint32_t        frontFaceMask  = frontWindingTris;
+    uint32_t*       pPrimID        = (uint32_t*)&primID;
+    const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
+    DWORD           triIndex       = 0;
 
-    uint32_t edgeEnable;
+    uint32_t      edgeEnable;
     PFN_WORK_FUNC pfnWork;
     if (CT::IsConservativeT::value)
     {
@@ -794,13 +830,15 @@
             const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
             const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
 
-            uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
+            uint32_t e0Mask =
+                SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
 
             // e1 = v2-v1
             const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
             const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
 
-            uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
+            uint32_t e1Mask =
+                SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
 
             // e2 = v0-v2
             // if v0 == v1 & v1 == v2, v0 == v2
@@ -827,8 +865,12 @@
     else
     {
         // degenerate triangles won't be sent to rasterizer; just enable all edges
-        pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
-            (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
+        pfnWork = GetRasterizerFunc(rastState.sampleCount,
+                                    rastState.bIsCenterPattern,
+                                    (rastState.conservativeRast > 0),
+                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
+                                    EdgeValToEdgeState(ALL_EDGES_VALID),
+                                    (state.scissorsTileAligned == false));
     }
 
     SIMDBBOX_T<SIMD_T> bbox;
@@ -854,20 +896,20 @@
 
         {
             Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
-            xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
+            xmin                 = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
             Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
-            xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
+            xmax                 = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
 
             Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
 
             Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
-            ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
+            ymin                 = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
             Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
-            ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
+            ymax                 = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
 
             Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
 
-            vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
+            vMaskV         = SIMD_T::or_si(vMaskH, vMaskV);
             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
         }
 
@@ -879,15 +921,20 @@
         }
     }
 
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    // Gather the AOS effective scissor rects based on the per-prim VP index.
+    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
+    // exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
     {
         Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
         if (pa.viewportArrayActive)
 
         {
-            GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
+            GatherScissors(&state.scissorsInFixedPoint[0],
+                           pViewportIndex,
+                           scisXmin,
+                           scisYmin,
+                           scisXmax,
+                           scisYmax);
         }
         else // broadcast fast path for non-VPAI case.
         {
@@ -909,23 +956,26 @@
 
     if (CT::IsConservativeT::value)
     {
-        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
-        // some area. Bump the xmax/ymax edges out 
+        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the
+        // primitive bbox has some area. Bump the xmax/ymax edges out
 
         Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
-        bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
+        bbox.ymax                       = SIMD_T::blendv_epi32(
+            bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
 
         Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
-        bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
+        bbox.xmax                       = SIMD_T::blendv_epi32(
+            bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
     }
 
     // Cull tris completely outside scissor
     {
         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorXY =
+            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        triMask = triMask & ~maskOutsideScissor;
+        triMask                     = triMask & ~maskOutsideScissor;
     }
 
 #if KNOB_ENABLE_EARLY_RAST
@@ -936,26 +986,34 @@
         // convert to ER tiles
         SIMDBBOX_T<SIMD_T> er_bbox;
 
-        er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
-        er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
-        er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
-        er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
+        er_bbox.xmin =
+            SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
+        er_bbox.xmax =
+            SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
+        er_bbox.ymin =
+            SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
+        er_bbox.ymax =
+            SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
 
         Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
         Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
 
         // Take only triangles that fit into ER tile
-        uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
+        uint32_t oneTileMask =
+            triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
 
         if (oneTileMask)
         {
             // determine CW tris (det > 0)
-            uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
-            uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
+            uint32_t maskCwLo = SIMD_T::movemask_pd(
+                SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
+            uint32_t maskCwHi = SIMD_T::movemask_pd(
+                SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
             uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
 
             // Try early rasterization
-            triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
+            triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(
+                er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
 
             if (!triMask)
             {
@@ -963,7 +1021,6 @@
                 return;
             }
         }
-
     }
 #endif
 
@@ -975,29 +1032,32 @@
     {
         // Simple non-conformant wireframe mode, useful for debugging
         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
-        Vec4<SIMD_T> line[2];
+        Vec4<SIMD_T>  line[2];
         Float<SIMD_T> recipW[2];
 
-        line[0] = tri[0];
-        line[1] = tri[1];
+        line[0]   = tri[0];
+        line[1]   = tri[1];
         recipW[0] = vRecipW0;
         recipW[1] = vRecipW1;
 
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 
-        line[0] = tri[1];
-        line[1] = tri[2];
+        line[0]   = tri[1];
+        line[1]   = tri[2];
         recipW[0] = vRecipW1;
         recipW[1] = vRecipW2;
 
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 
-        line[0] = tri[2];
-        line[1] = tri[0];
+        line[0]   = tri[2];
+        line[1]   = tri[0];
         recipW[0] = vRecipW2;
         recipW[1] = vRecipW0;
 
-        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 
         RDTSC_END(FEBinTriangles, 1);
         return;
@@ -1005,9 +1065,12 @@
     else if (rastState.fillMode == SWR_FILLMODE_POINT)
     {
         // Bin 3 points
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
-        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
+        BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
+            pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
 
         RDTSC_END(FEBinTriangles, 1);
         return;
@@ -1019,12 +1082,13 @@
     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 
-    OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
+    OSALIGNSIMD16(uint32_t)
+    aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
@@ -1041,7 +1105,7 @@
     // scan remaining valid triangles and bin each separately
     while (_BitScanForward(&triIndex, triMask))
     {
-        uint32_t linkageCount = state.backendState.numAttributes;
+        uint32_t linkageCount     = state.backendState.numAttributes;
         uint32_t numScalarAttribs = linkageCount * 4;
 
         BE_WORK work;
@@ -1052,8 +1116,13 @@
         {
             // only rasterize valid edges if we have a degenerate primitive
             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
-            work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
-                (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
+            work.pfnWork =
+                GetRasterizerFunc(rastState.sampleCount,
+                                  rastState.bIsCenterPattern,
+                                  (rastState.conservativeRast > 0),
+                                  (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage,
+                                  EdgeValToEdgeState(triEdgeEnable),
+                                  (state.scissorsTileAligned == false));
 
             // Degenerate triangles are required to be constant interpolated
             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
@@ -1065,30 +1134,33 @@
         }
 
         // Select attribute processor
-        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
-            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
+        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs =
+            GetProcessAttributesFunc(3,
+                                     state.backendState.swizzleEnable,
+                                     state.backendState.constantInterpolationMask,
+                                     isDegenerate);
 
-        TRIANGLE_WORK_DESC &desc = work.desc.tri;
+        TRIANGLE_WORK_DESC& desc = work.desc.tri;
 
         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
-        desc.triFlags.viewportIndex = pViewportIndex[triIndex];
+        desc.triFlags.viewportIndex          = pViewportIndex[triIndex];
 
         auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
-        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
-        desc.pAttribs = pAttribs;
+        float* pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+        desc.pAttribs   = pAttribs;
         desc.numAttribs = linkageCount;
         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
 
         // store triangle vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 
-        SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
-        SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
-        SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 
         // store user clip distances
@@ -1096,7 +1168,8 @@
         {
             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
-            ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
+            ProcessUserClipDist<3>(
+                state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
         }
 
         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
@@ -1112,39 +1185,39 @@
             }
         }
 
-                     triMask &= ~(1 << triIndex);
+        triMask &= ~(1 << triIndex);
     }
 
     RDTSC_END(FEBinTriangles, 1);
 }
 
 template <typename CT>
-void BinTriangles(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simdvector tri[3],
-    uint32_t triMask,
-    simdscalari const &primID,
-    simdscalari const &viewportIdx,
-    simdscalari const &rtIdx)
+void BinTriangles(DRAW_CONTEXT*      pDC,
+                  PA_STATE&          pa,
+                  uint32_t           workerId,
+                  simdvector         tri[3],
+                  uint32_t           triMask,
+                  simdscalari const& primID,
+                  simdscalari const& viewportIdx,
+                  simdscalari const& rtIdx)
 {
-    BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
+    BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(
+        pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
 }
 
 #if USE_SIMD16_FRONTEND
 template <typename CT>
-void SIMDCALL BinTriangles_simd16(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simd16vector tri[3],
-    uint32_t triMask,
-    simd16scalari const &primID,
-    simd16scalari const &viewportIdx,
-    simd16scalari const &rtIdx)
+void SIMDCALL BinTriangles_simd16(DRAW_CONTEXT*        pDC,
+                                  PA_STATE&            pa,
+                                  uint32_t             workerId,
+                                  simd16vector         tri[3],
+                                  uint32_t             triMask,
+                                  simd16scalari const& primID,
+                                  simd16scalari const& viewportIdx,
+                                  simd16scalari const& rtIdx)
 {
-    BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
+    BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(
+        pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
 }
 
 #endif
@@ -1186,27 +1259,26 @@
 #endif
 
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupPointsImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx)
+void BinPostSetupPointsImpl(DRAW_CONTEXT*          pDC,
+                            PA_STATE&              pa,
+                            uint32_t               workerId,
+                            Vec4<SIMD_T>           prim[],
+                            uint32_t               primMask,
+                            Integer<SIMD_T> const& primID,
+                            Integer<SIMD_T> const& viewportIdx,
+                            Integer<SIMD_T> const& rtIdx)
 {
     RDTSC_BEGIN(FEBinPoints, pDC->drawId);
 
-    Vec4<SIMD_T> &primVerts = prim[0];
+    Vec4<SIMD_T>& primVerts = prim[0];
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
+    const API_STATE&     state          = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState      = state.rastState;
+    const uint32_t*      pViewportIndex = (uint32_t*)&viewportIdx;
 
     // Select attribute processor
-    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
-        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
+    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
+        1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 
     // convert to fixed point
     Integer<SIMD_T> vXi, vYi;
@@ -1224,64 +1296,68 @@
         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
 
-        // compute macro tile coordinates 
+        // compute macro tile coordinates
         Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
         Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
 
         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
 
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroX), macroX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroY), macroY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroX), macroX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMacroY), macroY);
 
         // compute raster tile coordinates
-        Integer<SIMD_T> rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
-        Integer<SIMD_T> rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
+        Integer<SIMD_T> rasterX =
+            SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
+        Integer<SIMD_T> rasterY =
+            SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
 
         // compute raster tile relative x,y for coverage mask
         Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
         Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
 
-        Integer<SIMD_T> tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
-        Integer<SIMD_T> tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
+        Integer<SIMD_T> tileRelativeX =
+            SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
+        Integer<SIMD_T> tileRelativeY =
+            SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
 
         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
 
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeX), tileRelativeX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeY), tileRelativeY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeX), tileRelativeX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileRelativeY), tileRelativeY);
 
         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
 
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedX), tileAlignedX);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedY), tileAlignedY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedX), tileAlignedX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aTileAlignedY), tileAlignedY);
 
         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
-        SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
+        SIMD_T::store_ps(reinterpret_cast<float*>(aZ), primVerts.z);
 
         // store render target array index
-        const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
-        
-        uint32_t *pPrimID = (uint32_t *)&primID;
-        DWORD primIndex = 0;
+        const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
+
+        uint32_t* pPrimID   = (uint32_t*)&primID;
+        DWORD     primIndex = 0;
 
         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
 
         // scan remaining valid triangles and bin each separately
         while (_BitScanForward(&primIndex, primMask))
         {
-            uint32_t linkageCount = backendState.numAttributes;
+            uint32_t linkageCount     = backendState.numAttributes;
             uint32_t numScalarAttribs = linkageCount * 4;
 
             BE_WORK work;
             work.type = DRAW;
 
-            TRIANGLE_WORK_DESC &desc = work.desc.tri;
+            TRIANGLE_WORK_DESC& desc = work.desc.tri;
 
             // points are always front facing
-            desc.triFlags.frontFacing = 1;
+            desc.triFlags.frontFacing            = 1;
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-            desc.triFlags.viewportIndex = pViewportIndex[primIndex];
+            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
 
             work.pfnWork = RasterizeSimplePoint;
 
@@ -1289,18 +1365,19 @@
             SWR_ASSERT(pArena != nullptr);
 
             // store attributes
-            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
-            desc.pAttribs = pAttribs;
+            float* pAttribs =
+                (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
+            desc.pAttribs   = pAttribs;
             desc.numAttribs = linkageCount;
 
             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
 
             // store raster tile aligned x, y, perspective correct z
-            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer = pTriBuffer;
+            float* pTriBuffer        = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
+            desc.pTriBuffer          = pTriBuffer;
             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
-            *pTriBuffer = aZ[primIndex];
+            *pTriBuffer              = aZ[primIndex];
 
             uint32_t tX = aTileRelativeX[primIndex];
             uint32_t tY = aTileRelativeY[primIndex];
@@ -1310,7 +1387,7 @@
             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
 
             // bin it
-            MacroTileMgr *pTileMgr = pDC->pTileMgr;
+            MacroTileMgr* pTileMgr = pDC->pTileMgr;
 #if KNOB_ENABLE_TOSS_POINTS
             if (!KNOB_TOSS_SETUP_TRIS)
 #endif
@@ -1343,7 +1420,7 @@
         bbox.xmin = bbox.xmax = vXi;
         bbox.ymin = bbox.ymax = vYi;
 
-        Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
+        Float<SIMD_T>   vHalfWidth  = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
         Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
 
         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
@@ -1351,15 +1428,20 @@
         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
 
-        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-        // Gather the AOS effective scissor rects based on the per-prim VP index.
+        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge
+        // is exclusive. Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
         {
             Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
 
             if (pa.viewportArrayActive)
             {
-                GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
+                GatherScissors(&state.scissorsInFixedPoint[0],
+                               pViewportIndex,
+                               scisXmin,
+                               scisYmin,
+                               scisXmax,
+                               scisYmax);
             }
             else // broadcast fast path for non-VPAI case.
             {
@@ -1371,16 +1453,19 @@
 
             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
-            bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
-            bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
+            bbox.xmax =
+                SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
+            bbox.ymax =
+                SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
         }
 
         // Cull bloated points completely outside scissor
         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorXY =
+            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        primMask = primMask & ~maskOutsideScissor;
+        primMask                    = primMask & ~maskOutsideScissor;
 
         // Convert bbox to macrotile units.
         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
@@ -1388,46 +1473,47 @@
         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 
-        OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
+        OSALIGNSIMD16(uint32_t)
+        aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
-        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
 
         // store render target array index
-        const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
+        const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
 
         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
-        SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
+        SIMD_T::store_ps(reinterpret_cast<float*>(aPointSize), vPointSize);
 
-        uint32_t *pPrimID = (uint32_t *)&primID;
+        uint32_t* pPrimID = (uint32_t*)&primID;
 
         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
 
-        SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
-        SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
-        SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
+        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsX), primVerts.x);
+        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsY), primVerts.y);
+        SIMD_T::store_ps(reinterpret_cast<float*>(aPrimVertsZ), primVerts.z);
 
         // scan remaining valid prims and bin each separately
         const SWR_BACKEND_STATE& backendState = state.backendState;
-        DWORD primIndex;
+        DWORD                    primIndex;
         while (_BitScanForward(&primIndex, primMask))
         {
-            uint32_t linkageCount = backendState.numAttributes;
+            uint32_t linkageCount     = backendState.numAttributes;
             uint32_t numScalarAttribs = linkageCount * 4;
 
             BE_WORK work;
             work.type = DRAW;
 
-            TRIANGLE_WORK_DESC &desc = work.desc.tri;
+            TRIANGLE_WORK_DESC& desc = work.desc.tri;
 
-            desc.triFlags.frontFacing = 1;
-            desc.triFlags.pointSize = aPointSize[primIndex];
+            desc.triFlags.frontFacing            = 1;
+            desc.triFlags.pointSize              = aPointSize[primIndex];
             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-            desc.triFlags.viewportIndex = pViewportIndex[primIndex];
+            desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
 
             work.pfnWork = RasterizeTriPoint;
 
@@ -1440,11 +1526,11 @@
             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
 
             // store point vertex data
-            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
-            desc.pTriBuffer = pTriBuffer;
-            *pTriBuffer++ = aPrimVertsX[primIndex];
-            *pTriBuffer++ = aPrimVertsY[primIndex];
-            *pTriBuffer = aPrimVertsZ[primIndex];
+            float* pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
+            desc.pTriBuffer   = pTriBuffer;
+            *pTriBuffer++     = aPrimVertsX[primIndex];
+            *pTriBuffer++     = aPrimVertsY[primIndex];
+            *pTriBuffer       = aPrimVertsZ[primIndex];
 
             // store user clip distances
             if (backendState.clipDistanceMask)
@@ -1454,14 +1540,15 @@
                 float dists[8];
                 float one = 1.0f;
                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
-                for (uint32_t i = 0; i < numClipDist; i++) {
+                for (uint32_t i = 0; i < numClipDist; i++)
+                {
                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
                 }
             }
 
-            MacroTileMgr *pTileMgr = pDC->pTileMgr;
+            MacroTileMgr* pTileMgr = pDC->pTileMgr;
             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
             {
                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
@@ -1490,19 +1577,18 @@
 /// @param tri - Contains point position data for SIMDs worth of points.
 /// @param primID - Primitive ID for each point.
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPointsImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[3],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx)
+void BinPointsImpl(DRAW_CONTEXT*          pDC,
+                   PA_STATE&              pa,
+                   uint32_t               workerId,
+                   Vec4<SIMD_T>           prim[3],
+                   uint32_t               primMask,
+                   Integer<SIMD_T> const& primID,
+                   Integer<SIMD_T> const& viewportIdx,
+                   Integer<SIMD_T> const& rtIdx)
 {
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
-    const SWR_RASTSTATE& rastState = state.rastState;
+    const API_STATE&          state     = GetApiState(pDC);
+    const SWR_FRONTEND_STATE& feState   = state.frontendState;
+    const SWR_RASTSTATE&      rastState = state.rastState;
 
     if (!feState.vpTransformDisable)
     {
@@ -1530,57 +1616,34 @@
     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
 
     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
-        pDC,
-        pa,
-        workerId,
-        prim,
-        primMask,
-        primID,
-        viewportIdx,
-        rtIdx);
+        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
 }
 
-void BinPoints(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simdvector prim[3],
-    uint32_t primMask,
-    simdscalari const &primID,
-    simdscalari const &viewportIdx,
-    simdscalari const &rtIdx)
+void BinPoints(DRAW_CONTEXT*      pDC,
+               PA_STATE&          pa,
+               uint32_t           workerId,
+               simdvector         prim[3],
+               uint32_t           primMask,
+               simdscalari const& primID,
+               simdscalari const& viewportIdx,
+               simdscalari const& rtIdx)
 {
     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
-        pDC,
-        pa,
-        workerId,
-        prim,
-        primMask,
-        primID,
-        viewportIdx,
-        rtIdx);
+        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDCALL BinPoints_simd16(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simd16vector prim[3],
-    uint32_t primMask,
-    simd16scalari const &primID,
-    simd16scalari const &viewportIdx,
-    simd16scalari const & rtIdx)
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
+                               PA_STATE&            pa,
+                               uint32_t             workerId,
+                               simd16vector         prim[3],
+                               uint32_t             primMask,
+                               simd16scalari const& primID,
+                               simd16scalari const& viewportIdx,
+                               simd16scalari const& rtIdx)
 {
     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
-        pDC,
-        pa,
-        workerId,
-        prim,
-        primMask,
-        primID,
-        viewportIdx,
-        rtIdx);
+        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
 }
 
 #endif
@@ -1593,30 +1656,29 @@
 /// @param primID - Primitive ID for each line.
 /// @param viewportIdx - Viewport Array Index for each line.
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void BinPostSetupLinesImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[],
-    Float<SIMD_T> recipW[],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const &rtIdx)
+void BinPostSetupLinesImpl(DRAW_CONTEXT*          pDC,
+                           PA_STATE&              pa,
+                           uint32_t               workerId,
+                           Vec4<SIMD_T>           prim[],
+                           Float<SIMD_T>          recipW[],
+                           uint32_t               primMask,
+                           Integer<SIMD_T> const& primID,
+                           Integer<SIMD_T> const& viewportIdx,
+                           Integer<SIMD_T> const& rtIdx)
 {
-    const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
+    const uint32_t* aRTAI = reinterpret_cast<const uint32_t*>(&rtIdx);
 
     RDTSC_BEGIN(FEBinLines, pDC->drawId);
 
-    const API_STATE &state = GetApiState(pDC);
-    const SWR_RASTSTATE &rastState = state.rastState;
+    const API_STATE&     state     = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState = state.rastState;
 
     // Select attribute processor
-    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
-        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
+    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(
+        2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 
-    Float<SIMD_T> &vRecipW0 = recipW[0];
-    Float<SIMD_T> &vRecipW1 = recipW[1];
+    Float<SIMD_T>& vRecipW0 = recipW[0];
+    Float<SIMD_T>& vRecipW1 = recipW[1];
 
     // convert to fixed point
     Integer<SIMD_T> vXi[2], vYi[2];
@@ -1627,19 +1689,20 @@
     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
 
     // compute x-major vs y-major mask
-    Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
-    Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
-    Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
-    uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
+    Integer<SIMD_T> xLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
+    Integer<SIMD_T> yLength     = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
+    Float<SIMD_T>   vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
+    uint32_t        yMajorMask  = SIMD_T::movemask_ps(vYmajorMask);
 
     // cull zero-length lines
     Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
-    vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
+    vZeroLengthMask =
+        SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
 
     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
 
-    uint32_t *pPrimID = (uint32_t *)&primID;
-    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
+    uint32_t*       pPrimID        = (uint32_t*)&primID;
+    const uint32_t* pViewportIndex = (uint32_t*)&viewportIdx;
 
     // Calc bounding box of lines
     SIMDBBOX_T<SIMD_T> bbox;
@@ -1649,7 +1712,7 @@
     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
 
     // bloat bbox by line width along minor axis
-    Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
+    Float<SIMD_T>   vHalfWidth  = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
     Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
 
     SIMDBBOX_T<SIMD_T> bloatBox;
@@ -1664,13 +1727,19 @@
     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
-    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
+    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is
+    // exclusive.
     {
         Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
 
         if (pa.viewportArrayActive)
         {
-            GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
+            GatherScissors(&state.scissorsInFixedPoint[0],
+                           pViewportIndex,
+                           scisXmin,
+                           scisYmin,
+                           scisXmax,
+                           scisYmax);
         }
         else // broadcast fast path for non-VPAI case.
         {
@@ -1682,17 +1751,20 @@
 
         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
-        bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
+        bbox.xmax =
+            SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
+        bbox.ymax =
+            SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
     }
 
     // Cull prims completely outside scissor
     {
         Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
         Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorXY =
+            SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
-        primMask = primMask & ~maskOutsideScissor;
+        primMask                    = primMask & ~maskOutsideScissor;
     }
 
     // transpose verts needed for backend
@@ -1713,34 +1785,35 @@
     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 
-    OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
+    OSALIGNSIMD16(uint32_t)
+    aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTLeft), bbox.xmin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTRight), bbox.xmax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTTop), bbox.ymin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T>*>(aMTBottom), bbox.ymax);
 
     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
-    TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
+    TransposeVertices(vHorizW, vRecipW0, vRecipW1, SIMD_T::setzero_ps());
 
     // scan remaining valid prims and bin each separately
     DWORD primIndex;
     while (_BitScanForward(&primIndex, primMask))
     {
-        uint32_t linkageCount = state.backendState.numAttributes;
+        uint32_t linkageCount     = state.backendState.numAttributes;
         uint32_t numScalarAttribs = linkageCount * 4;
 
         BE_WORK work;
         work.type = DRAW;
 
-        TRIANGLE_WORK_DESC &desc = work.desc.tri;
+        TRIANGLE_WORK_DESC& desc = work.desc.tri;
 
-        desc.triFlags.frontFacing = 1;
-        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
+        desc.triFlags.frontFacing            = 1;
+        desc.triFlags.yMajor                 = (yMajorMask >> primIndex) & 1;
         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
-        desc.triFlags.viewportIndex = pViewportIndex[primIndex];
+        desc.triFlags.viewportIndex          = pViewportIndex[primIndex];
 
         work.pfnWork = RasterizeLine;
 
@@ -1748,16 +1821,16 @@
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
-        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+        desc.pAttribs   = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
         desc.numAttribs = linkageCount;
         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
 
         // store line vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 
-        _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
+        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
 
         // store user clip distances
@@ -1765,10 +1838,11 @@
         {
             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
-            ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
+            ProcessUserClipDist<2>(
+                state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
         }
 
-        MacroTileMgr *pTileMgr = pDC->pTileMgr;
+        MacroTileMgr* pTileMgr = pDC->pTileMgr;
         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
         {
             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
@@ -1799,21 +1873,20 @@
 /// @param primID - Primitive ID for each line.
 /// @param viewportIdx - Viewport Array Index for each line.
 template <typename SIMD_T, uint32_t SIMD_WIDTH>
-void SIMDCALL BinLinesImpl(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    Vec4<SIMD_T> prim[3],
-    uint32_t primMask,
-    Integer<SIMD_T> const &primID,
-    Integer<SIMD_T> const &viewportIdx,
-    Integer<SIMD_T> const & rtIdx)
+void SIMDCALL BinLinesImpl(DRAW_CONTEXT*          pDC,
+                           PA_STATE&              pa,
+                           uint32_t               workerId,
+                           Vec4<SIMD_T>           prim[3],
+                           uint32_t               primMask,
+                           Integer<SIMD_T> const& primID,
+                           Integer<SIMD_T> const& viewportIdx,
+                           Integer<SIMD_T> const& rtIdx)
 {
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_RASTSTATE& rastState = state.rastState;
-    const SWR_FRONTEND_STATE& feState = state.frontendState;
+    const API_STATE&          state     = GetApiState(pDC);
+    const SWR_RASTSTATE&      rastState = state.rastState;
+    const SWR_FRONTEND_STATE& feState   = state.frontendState;
 
-    Float<SIMD_T> vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
+    Float<SIMD_T> vRecipW[2] = {SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f)};
 
     if (!feState.vpTransformDisable)
     {
@@ -1851,42 +1924,34 @@
     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
 
     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
-        pDC,
-        pa,
-        workerId,
-        prim,
-        vRecipW,
-        primMask,
-        primID,
-        viewportIdx,
-        rtIdx);
+        pDC, pa, workerId, prim, vRecipW, primMask, primID, viewportIdx, rtIdx);
 }
 
-void BinLines(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simdvector prim[],
-    uint32_t primMask,
-    simdscalari const &primID,
-    simdscalari const &viewportIdx,
-    simdscalari const &rtIdx)
+void BinLines(DRAW_CONTEXT*      pDC,
+              PA_STATE&          pa,
+              uint32_t           workerId,
+              simdvector         prim[],
+              uint32_t           primMask,
+              simdscalari const& primID,
+              simdscalari const& viewportIdx,
+              simdscalari const& rtIdx)
 {
-    BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
+    BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(
+        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDCALL BinLines_simd16(
-    DRAW_CONTEXT *pDC,
-    PA_STATE &pa,
-    uint32_t workerId,
-    simd16vector prim[3],
-    uint32_t primMask,
-    simd16scalari const &primID,
-    simd16scalari const &viewportIdx,
-    simd16scalari const &rtIdx)
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
+                              PA_STATE&            pa,
+                              uint32_t             workerId,
+                              simd16vector         prim[3],
+                              uint32_t             primMask,
+                              simd16scalari const& primID,
+                              simd16scalari const& viewportIdx,
+                              simd16scalari const& rtIdx)
 {
-    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
+    BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(
+        pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
 }
 
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h
index 443dac5..f5f6d80 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file binner.h
-*
-* @brief Declaration for the macrotile binner
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file binner.h
+ *
+ * @brief Declaration for the macrotile binner
+ *
+ ******************************************************************************/
 #include "state.h"
 #include "conservativeRast.h"
 #include "utils.h"
@@ -47,22 +47,23 @@
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Convert the X,Y coords of a triangle to the requested Fixed 
+/// @brief Convert the X,Y coords of a triangle to the requested Fixed
 /// Point precision from FP32.
 template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T> &vIn)
+INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T>& vIn)
 {
     return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Helper function to set the X,Y coords of a triangle to the 
+/// @brief Helper function to set the X,Y coords of a triangle to the
 /// requested Fixed Point precision from FP32.
 /// @param tri: simdvector[3] of FP triangle verts
 /// @param vXi: fixed point X coords of tri verts
 /// @param vYi: fixed point Y coords of tri verts
 template <typename SIMD_T>
-INLINE static void FPToFixedPoint(const Vec4<SIMD_T> *const tri, Integer<SIMD_T>(&vXi)[3], Integer<SIMD_T>(&vYi)[3])
+INLINE static void
+FPToFixedPoint(const Vec4<SIMD_T>* const tri, Integer<SIMD_T> (&vXi)[3], Integer<SIMD_T> (&vYi)[3])
 {
     vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
     vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
@@ -78,10 +79,12 @@
 /// @param vX: fixed point X position for triangle verts
 /// @param vY: fixed point Y position for triangle verts
 /// @param bbox: fixed point bbox
-/// *Note*: expects vX, vY to be in the correct precision for the type 
+/// *Note*: expects vX, vY to be in the correct precision for the type
 /// of rasterization. This avoids unnecessary FP->fixed conversions.
 template <typename SIMD_T, typename CT>
-INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T>(&vX)[3], const Integer<SIMD_T>(&vY)[3], SIMDBBOX_T<SIMD_T> &bbox)
+INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T> (&vX)[3],
+                                       const Integer<SIMD_T> (&vY)[3],
+                                       SIMDBBOX_T<SIMD_T>& bbox)
 {
     Integer<SIMD_T> vMinX = vX[0];
 
@@ -105,8 +108,9 @@
 
     if (CT::BoundingBoxOffsetT::value != 0)
     {
-        /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
-        /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
+        /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative
+        /// rasterization expand bbox by 1/256; coverage will be correctly handled in the
+        /// rasterizer.
 
         const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
 
@@ -132,119 +136,119 @@
 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
 //
 /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
-    simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
+static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
+                           const uint32_t* pViewportIndex,
+                           simdscalari&    scisXmin,
+                           simdscalari&    scisYmin,
+                           simdscalari&    scisXmax,
+                           simdscalari&    scisYmax)
 {
-    scisXmin = _simd_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[0]].xmin);
-    scisYmin = _simd_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[0]].ymin);
-    scisXmax = _simd_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[0]].xmax);
-    scisYmax = _simd_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[0]].ymax);
+    scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+                               pScissorsInFixedPoint[pViewportIndex[0]].xmin);
+    scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+                               pScissorsInFixedPoint[pViewportIndex[0]].ymin);
+    scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+                               pScissorsInFixedPoint[pViewportIndex[0]].xmax);
+    scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[7]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+                               pScissorsInFixedPoint[pViewportIndex[0]].ymax);
 }
 
-static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
-    simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
+static void GatherScissors(const SWR_RECT* pScissorsInFixedPoint,
+                           const uint32_t* pViewportIndex,
+                           simd16scalari&  scisXmin,
+                           simd16scalari&  scisYmin,
+                           simd16scalari&  scisXmax,
+                           simd16scalari&  scisYmax)
 {
-    scisXmin = _simd16_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[15]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[14]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[13]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[12]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[11]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[10]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[9]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[8]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[7]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[6]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[5]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[4]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[3]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[2]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[1]].xmin,
-        pScissorsInFixedPoint[pViewportIndex[0]].xmin);
+    scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[14]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[13]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[12]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[11]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[10]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[9]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[8]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[7]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[6]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[5]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[4]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[3]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[2]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[1]].xmin,
+                                 pScissorsInFixedPoint[pViewportIndex[0]].xmin);
 
-    scisYmin = _simd16_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[15]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[14]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[13]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[12]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[11]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[10]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[9]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[8]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[7]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[6]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[5]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[4]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[3]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[2]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[1]].ymin,
-        pScissorsInFixedPoint[pViewportIndex[0]].ymin);
+    scisYmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[14]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[13]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[12]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[11]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[10]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[9]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[8]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[7]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[6]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[5]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[4]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[3]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[2]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[1]].ymin,
+                                 pScissorsInFixedPoint[pViewportIndex[0]].ymin);
 
-    scisXmax = _simd16_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[15]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[14]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[13]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[12]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[11]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[10]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[9]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[8]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[7]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[6]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[5]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[4]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[3]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[2]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[1]].xmax,
-        pScissorsInFixedPoint[pViewportIndex[0]].xmax);
+    scisXmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[14]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[13]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[12]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[11]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[10]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[9]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[8]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[7]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[6]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[5]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[4]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[3]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[2]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[1]].xmax,
+                                 pScissorsInFixedPoint[pViewportIndex[0]].xmax);
 
-    scisYmax = _simd16_set_epi32(
-        pScissorsInFixedPoint[pViewportIndex[15]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[14]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[13]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[12]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[11]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[10]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[9]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[8]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[7]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[6]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[5]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[4]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[3]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[2]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[1]].ymax,
-        pScissorsInFixedPoint[pViewportIndex[0]].ymax);
+    scisYmax = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[15]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[14]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[13]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[12]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[11]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[10]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[9]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[8]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[7]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[6]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[5]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[4]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[3]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[2]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[1]].ymax,
+                                 pScissorsInFixedPoint[pViewportIndex[0]].ymax);
 }
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h
index c89c476..7b2f779 100644
--- a/src/gallium/drivers/swr/rasterizer/core/blend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/blend.h
@@ -1,77 +1,82 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file blend.cpp
-*
-* @brief Implementation for blending operations.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file blend.h
+ *
+ * @brief Implementation for blending operations.
+ *
+ ******************************************************************************/
 #include "state.h"
 
-template<bool Color, bool Alpha>
-INLINE
-void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out)
+template <bool Color, bool Alpha>
+INLINE void GenerateBlendFactor(SWR_BLEND_FACTOR func,
+                                simdvector&      constantColor,
+                                simdvector&      src,
+                                simdvector&      src1,
+                                simdvector&      dst,
+                                simdvector&      out)
 {
     simdvector result;
 
     switch (func)
     {
-    case BLENDFACTOR_ZERO: 
+    case BLENDFACTOR_ZERO:
         result.x = _simd_setzero_ps();
         result.y = _simd_setzero_ps();
         result.z = _simd_setzero_ps();
         result.w = _simd_setzero_ps();
         break;
 
-    case BLENDFACTOR_ONE: 
+    case BLENDFACTOR_ONE:
         result.x = _simd_set1_ps(1.0);
         result.y = _simd_set1_ps(1.0);
         result.z = _simd_set1_ps(1.0);
         result.w = _simd_set1_ps(1.0);
         break;
 
-    case BLENDFACTOR_SRC_COLOR: 
+    case BLENDFACTOR_SRC_COLOR:
         result = src;
         break;
 
-    case BLENDFACTOR_DST_COLOR: 
+    case BLENDFACTOR_DST_COLOR:
         result = dst;
         break;
 
-    case BLENDFACTOR_INV_SRC_COLOR: 
+    case BLENDFACTOR_INV_SRC_COLOR:
         result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x);
         result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y);
         result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z);
         result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
         break;
 
-    case BLENDFACTOR_INV_DST_COLOR: 
+    case BLENDFACTOR_INV_DST_COLOR:
         result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x);
         result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y);
         result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z);
         result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
         break;
 
-    case BLENDFACTOR_SRC_ALPHA: result.x = src.w;
+    case BLENDFACTOR_SRC_ALPHA:
+        result.x = src.w;
         result.y = src.w;
         result.z = src.w;
         result.w = src.w;
@@ -80,14 +85,15 @@
     case BLENDFACTOR_INV_SRC_ALPHA:
     {
         simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w);
-        result.x = oneMinusSrcA;
-        result.y = oneMinusSrcA;
-        result.z = oneMinusSrcA;
-        result.w = oneMinusSrcA;
+        result.x                = oneMinusSrcA;
+        result.y                = oneMinusSrcA;
+        result.z                = oneMinusSrcA;
+        result.w                = oneMinusSrcA;
         break;
     }
 
-    case BLENDFACTOR_DST_ALPHA: result.x = dst.w;
+    case BLENDFACTOR_DST_ALPHA:
+        result.x = dst.w;
         result.y = dst.w;
         result.z = dst.w;
         result.w = dst.w;
@@ -96,20 +102,20 @@
     case BLENDFACTOR_INV_DST_ALPHA:
     {
         simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w);
-        result.x = oneMinusDstA;
-        result.y = oneMinusDstA;
-        result.z = oneMinusDstA;
-        result.w = oneMinusDstA;
+        result.x                = oneMinusDstA;
+        result.y                = oneMinusDstA;
+        result.z                = oneMinusDstA;
+        result.w                = oneMinusDstA;
         break;
     }
 
     case BLENDFACTOR_SRC_ALPHA_SATURATE:
     {
         simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w));
-        result.x = sat;
-        result.y = sat;
-        result.z = sat;
-        result.w = _simd_set1_ps(1.0);
+        result.x       = sat;
+        result.y       = sat;
+        result.z       = sat;
+        result.w       = _simd_set1_ps(1.0);
         break;
     }
 
@@ -135,7 +141,8 @@
 
     case BLENDFACTOR_INV_CONST_ALPHA:
     {
-        result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
+        result.x = result.y = result.z = result.w =
+            _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]);
         break;
     }
 
@@ -161,7 +168,8 @@
         result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w);
         break;
 
-    default: SWR_INVALID("Unimplemented blend factor: %d", func);
+    default:
+        SWR_INVALID("Unimplemented blend factor: %d", func);
     }
 
     if (Color)
@@ -174,11 +182,15 @@
     {
         out.w = result.w;
     }
-
 }
 
-template<bool Color, bool Alpha>
-INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out)
+template <bool Color, bool Alpha>
+INLINE void BlendFunc(SWR_BLEND_OP blendOp,
+                      simdvector&  src,
+                      simdvector&  srcFactor,
+                      simdvector&  dst,
+                      simdvector&  dstFactor,
+                      simdvector&  out)
 {
     simdvector result;
 
@@ -204,21 +216,21 @@
         result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z));
         result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w));
         break;
-        
+
     case BLENDOP_MIN:
         result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
         result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
         result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
         result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
         break;
-        
+
     case BLENDOP_MAX:
         result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x));
         result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y));
         result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z));
         result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w));
         break;
-        
+
     default:
         SWR_INVALID("Unimplemented blend function: %d", blendOp);
     }
@@ -235,8 +247,8 @@
     }
 }
 
-template<SWR_TYPE type>
-INLINE void Clamp(simdvector &src)
+template <SWR_TYPE type>
+INLINE void Clamp(simdvector& src)
 {
     switch (type)
     {
@@ -277,8 +289,13 @@
     }
 }
 
-template<SWR_TYPE type>
-void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, uint8_t *pDst, simdvector &result)
+template <SWR_TYPE type>
+void Blend(const SWR_BLEND_STATE*               pBlendState,
+           const SWR_RENDER_TARGET_BLEND_STATE* pState,
+           simdvector&                          src,
+           simdvector&                          src1,
+           uint8_t*                             pDst,
+           simdvector&                          result)
 {
     // load render target
     simdvector dst;
@@ -299,20 +316,33 @@
     simdvector srcFactor, dstFactor;
     if (pBlendState->independentAlphaBlendEnable)
     {
-        GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
-        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor);
+        GenerateBlendFactor<true, false>(
+            (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
+        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor,
+                                         constColor,
+                                         src,
+                                         src1,
+                                         dst,
+                                         srcFactor);
 
-        GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
-        GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
+        GenerateBlendFactor<true, false>(
+            (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
+        GenerateBlendFactor<false, true>(
+            (SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor);
 
-        BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-        BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
+        BlendFunc<true, false>(
+            (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+        BlendFunc<false, true>(
+            (SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
     }
     else
     {
-        GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
-        GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
+        GenerateBlendFactor<true, true>(
+            (SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor);
+        GenerateBlendFactor<true, true>(
+            (SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor);
 
-        BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+        BlendFunc<true, true>(
+            (SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result);
     }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 780ca15..8c53fca 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file clip.cpp
-*
-* @brief Implementation for clipping
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file clip.cpp
+ *
+ * @brief Implementation for clipping
+ *
+ ******************************************************************************/
 
 #include <assert.h>
 
@@ -42,115 +42,137 @@
     return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
 }
 
-template<SWR_CLIPCODES ClippingPlane>
+template <SWR_CLIPCODES ClippingPlane>
 inline void intersect(
-    int s,                       // index to first edge vertex v0 in pInPts.
-    int p,                       // index to second edge vertex v1 in pInPts.
-    const float *pInPts,         // array of all the input positions.
-    const float *pInAttribs,     // array of all attributes for all vertex. All the attributes for each vertex is contiguous.
-    int numInAttribs,            // number of attributes per vertex.
-    int i,                       // output index.
-    float *pOutPts,              // array of output positions. We'll write our new intersection point at i*4.
-    float *pOutAttribs)          // array of output attributes. We'll write our new attributes at i*numInAttribs.
+    int          s,          // index to first edge vertex v0 in pInPts.
+    int          p,          // index to second edge vertex v1 in pInPts.
+    const float* pInPts,     // array of all the input positions.
+    const float* pInAttribs, // array of all attributes for all vertices. Attributes for each
+                             // vertex are contiguous.
+    int    numInAttribs,     // number of attributes per vertex.
+    int    i,                // output index.
+    float* pOutPts,     // array of output positions. We'll write our new intersection point at i*4.
+    float* pOutAttribs) // array of output attributes. We'll write our new attributes at
+                        // i*numInAttribs.
 {
     float t;
 
     // Find the parameter of the intersection.
     //        t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc.
-    const float *v1 = &pInPts[s*4];
-    const float *v2 = &pInPts[p*4];
+    const float* v1 = &pInPts[s * 4];
+    const float* v2 = &pInPts[p * 4];
 
     switch (ClippingPlane)
     {
-    case FRUSTUM_LEFT:      t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break;
-    case FRUSTUM_RIGHT:     t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break;
-    case FRUSTUM_TOP:       t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break;
-    case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break;
-    case FRUSTUM_NEAR:      t = ComputeInterpFactor(v1[2], v2[2]); break;
-    case FRUSTUM_FAR:       t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break;
-    default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
+    case FRUSTUM_LEFT:
+        t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]);
+        break;
+    case FRUSTUM_RIGHT:
+        t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]);
+        break;
+    case FRUSTUM_TOP:
+        t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]);
+        break;
+    case FRUSTUM_BOTTOM:
+        t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]);
+        break;
+    case FRUSTUM_NEAR:
+        t = ComputeInterpFactor(v1[2], v2[2]);
+        break;
+    case FRUSTUM_FAR:
+        t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]);
+        break;
+    default:
+        SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
     };
 
+    const float* a1 = &pInAttribs[s * numInAttribs];
+    const float* a2 = &pInAttribs[p * numInAttribs];
 
-    const float *a1 = &pInAttribs[s*numInAttribs];
-    const float *a2 = &pInAttribs[p*numInAttribs];
-
-    float *pOutP    = &pOutPts[i*4];
-    float *pOutA    = &pOutAttribs[i*numInAttribs];
+    float* pOutP = &pOutPts[i * 4];
+    float* pOutA = &pOutAttribs[i * numInAttribs];
 
     // Interpolate new position.
-    for(int j = 0; j < 4; ++j)
+    for (int j = 0; j < 4; ++j)
     {
-        pOutP[j] = v1[j] + (v2[j]-v1[j])*t;
+        pOutP[j] = v1[j] + (v2[j] - v1[j]) * t;
     }
 
     // Interpolate Attributes
-    for(int attr = 0; attr < numInAttribs; ++attr)
+    for (int attr = 0; attr < numInAttribs; ++attr)
     {
-        pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t;
+        pOutA[attr] = a1[attr] + (a2[attr] - a1[attr]) * t;
     }
 }
 
-
 // Checks whether vertex v lies inside clipping plane
 // in homogenous coords check -w < {x,y,z} < w;
 //
-template<SWR_CLIPCODES ClippingPlane>
+template <SWR_CLIPCODES ClippingPlane>
 inline int inside(const float v[4])
 {
     switch (ClippingPlane)
     {
-    case FRUSTUM_LEFT   : return (v[0]>=-v[3]);
-    case FRUSTUM_RIGHT  : return (v[0]<= v[3]);
-    case FRUSTUM_TOP    : return (v[1]>=-v[3]);
-    case FRUSTUM_BOTTOM : return (v[1]<= v[3]);
-    case FRUSTUM_NEAR   : return (v[2]>=0.0f);
-    case FRUSTUM_FAR    : return (v[2]<= v[3]);
+    case FRUSTUM_LEFT:
+        return (v[0] >= -v[3]);
+    case FRUSTUM_RIGHT:
+        return (v[0] <= v[3]);
+    case FRUSTUM_TOP:
+        return (v[1] >= -v[3]);
+    case FRUSTUM_BOTTOM:
+        return (v[1] <= v[3]);
+    case FRUSTUM_NEAR:
+        return (v[2] >= 0.0f);
+    case FRUSTUM_FAR:
+        return (v[2] <= v[3]);
     default:
         SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
         return 0;
     }
 }
 
-
 // Clips a polygon in homogenous coordinates to a particular clipping plane.
 // Takes in vertices of the polygon (InPts) and the clipping plane
 // Puts the vertices of the clipped polygon in OutPts
 // Returns number of points in clipped polygon
 //
-template<SWR_CLIPCODES ClippingPlane>
-int ClipTriToPlane( const float *pInPts, int numInPts,
-                    const float *pInAttribs, int numInAttribs,
-                    float *pOutPts, float *pOutAttribs)
+template <SWR_CLIPCODES ClippingPlane>
+int ClipTriToPlane(const float* pInPts,
+                   int          numInPts,
+                   const float* pInAttribs,
+                   int          numInAttribs,
+                   float*       pOutPts,
+                   float*       pOutAttribs)
 {
-    int i=0; // index number of OutPts, # of vertices in OutPts = i div 4;
+    int i = 0; // index of the next output vertex; equals the number of vertices written so far
 
     for (int j = 0; j < numInPts; ++j)
     {
         int s = j;
         int p = (j + 1) % numInPts;
 
-        int s_in = inside<ClippingPlane>(&pInPts[s*4]);
-        int p_in = inside<ClippingPlane>(&pInPts[p*4]);
+        int s_in = inside<ClippingPlane>(&pInPts[s * 4]);
+        int p_in = inside<ClippingPlane>(&pInPts[p * 4]);
 
         // test if vertex is to be added to output vertices
-        if (s_in != p_in)  // edge crosses clipping plane
+        if (s_in != p_in) // edge crosses clipping plane
         {
             // find point of intersection
-            intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
+            intersect<ClippingPlane>(
+                s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs);
             i++;
         }
         if (p_in) // 2nd vertex is inside clipping volume, add it to output
         {
             // Copy 2nd vertex position of edge over to output.
-            for(int k = 0; k < 4; ++k)
+            for (int k = 0; k < 4; ++k)
             {
-                pOutPts[i*4 + k] = pInPts[p*4 + k];
+                pOutPts[i * 4 + k] = pInPts[p * 4 + k];
             }
             // Copy 2nd vertex attributes of edge over to output.
-            for(int attr = 0; attr < numInAttribs; ++attr)
+            for (int attr = 0; attr < numInAttribs; ++attr)
             {
-                pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr];
+                pOutAttribs[i * numInAttribs + attr] = pInAttribs[p * numInAttribs + attr];
             }
             i++;
         }
@@ -160,8 +182,29 @@
     return i;
 }
 
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask,
-                   simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
+void ClipRectangles(DRAW_CONTEXT*      pDC,
+                    PA_STATE&          pa,
+                    uint32_t           workerId,
+                    simdvector         prims[],
+                    uint32_t           primMask,
+                    simdscalari const& primId,
+                    simdscalari const& viewportIdx,
+                    simdscalari const& rtIdx)
+{
+    RDTSC_BEGIN(FEClipRectangles, pDC->drawId);
+    Clipper<SIMD256, 3> clipper(workerId, pDC);
+    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
+    RDTSC_END(FEClipRectangles, 1);
+}
+
+void ClipTriangles(DRAW_CONTEXT*      pDC,
+                   PA_STATE&          pa,
+                   uint32_t           workerId,
+                   simdvector         prims[],
+                   uint32_t           primMask,
+                   simdscalari const& primId,
+                   simdscalari const& viewportIdx,
+                   simdscalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipTriangles, pDC->drawId);
     Clipper<SIMD256, 3> clipper(workerId, pDC);
@@ -169,8 +212,14 @@
     RDTSC_END(FEClipTriangles, 1);
 }
 
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask,
-               simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
+void ClipLines(DRAW_CONTEXT*      pDC,
+               PA_STATE&          pa,
+               uint32_t           workerId,
+               simdvector         prims[],
+               uint32_t           primMask,
+               simdscalari const& primId,
+               simdscalari const& viewportIdx,
+               simdscalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipLines, pDC->drawId);
     Clipper<SIMD256, 2> clipper(workerId, pDC);
@@ -178,8 +227,14 @@
     RDTSC_END(FEClipLines, 1);
 }
 
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask,
-                simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx)
+void ClipPoints(DRAW_CONTEXT*      pDC,
+                PA_STATE&          pa,
+                uint32_t           workerId,
+                simdvector         prims[],
+                uint32_t           primMask,
+                simdscalari const& primId,
+                simdscalari const& viewportIdx,
+                simdscalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipPoints, pDC->drawId);
     Clipper<SIMD256, 1> clipper(workerId, pDC);
@@ -188,12 +243,45 @@
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask,
-                                   simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
+void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
+                                    PA_STATE&            pa,
+                                    uint32_t             workerId,
+                                    simd16vector         prims[],
+                                    uint32_t             primMask,
+                                    simd16scalari const& primId,
+                                    simd16scalari const& viewportIdx,
+                                    simd16scalari const& rtIdx)
+{
+    RDTSC_BEGIN(FEClipRectangles, pDC->drawId);
+
+    enum
+    {
+        VERTS_PER_PRIM = 3
+    };
+
+    Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
+
+    pa.useAlternateOffset = false;
+    clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx, rtIdx);
+
+    RDTSC_END(FEClipRectangles, 1);
+}
+
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
+                                   PA_STATE&            pa,
+                                   uint32_t             workerId,
+                                   simd16vector         prims[],
+                                   uint32_t             primMask,
+                                   simd16scalari const& primId,
+                                   simd16scalari const& viewportIdx,
+                                   simd16scalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipTriangles, pDC->drawId);
 
-    enum { VERTS_PER_PRIM = 3 };
+    enum
+    {
+        VERTS_PER_PRIM = 3
+    };
 
     Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
 
@@ -203,12 +291,21 @@
     RDTSC_END(FEClipTriangles, 1);
 }
 
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask,
-                               simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
+                               PA_STATE&            pa,
+                               uint32_t             workerId,
+                               simd16vector         prims[],
+                               uint32_t             primMask,
+                               simd16scalari const& primId,
+                               simd16scalari const& viewportIdx,
+                               simd16scalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipLines, pDC->drawId);
 
-    enum { VERTS_PER_PRIM = 2 };
+    enum
+    {
+        VERTS_PER_PRIM = 2
+    };
 
     Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
 
@@ -218,12 +315,21 @@
     RDTSC_END(FEClipLines, 1);
 }
 
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask,
-                                simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
+                                PA_STATE&            pa,
+                                uint32_t             workerId,
+                                simd16vector         prims[],
+                                uint32_t             primMask,
+                                simd16scalari const& primId,
+                                simd16scalari const& viewportIdx,
+                                simd16scalari const& rtIdx)
 {
     RDTSC_BEGIN(FEClipPoints, pDC->drawId);
 
-    enum { VERTS_PER_PRIM = 1 };
+    enum
+    {
+        VERTS_PER_PRIM = 1
+    };
 
     Clipper<SIMD512, VERTS_PER_PRIM> clipper(workerId, pDC);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 0f8399c..7b4ed58 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file clip.h
-*
-* @brief Definitions for clipping
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file clip.h
+ *
+ * @brief Definitions for clipping
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/simdintrin.h"
@@ -40,18 +40,19 @@
 
 enum SWR_CLIPCODES
 {
-    // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
-    // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes.
+// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
+// Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union,
+// rather than intersection, of clipcodes.
 #define CLIPCODE_SHIFT 23
-    FRUSTUM_LEFT    = (0x01 << CLIPCODE_SHIFT),
-    FRUSTUM_TOP     = (0x02 << CLIPCODE_SHIFT),
-    FRUSTUM_RIGHT   = (0x04 << CLIPCODE_SHIFT),
-    FRUSTUM_BOTTOM  = (0x08 << CLIPCODE_SHIFT),
+    FRUSTUM_LEFT   = (0x01 << CLIPCODE_SHIFT),
+    FRUSTUM_TOP    = (0x02 << CLIPCODE_SHIFT),
+    FRUSTUM_RIGHT  = (0x04 << CLIPCODE_SHIFT),
+    FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT),
 
-    FRUSTUM_NEAR    = (0x10 << CLIPCODE_SHIFT),
-    FRUSTUM_FAR     = (0x20 << CLIPCODE_SHIFT),
+    FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT),
+    FRUSTUM_FAR  = (0x20 << CLIPCODE_SHIFT),
 
-    NEGW            = (0x40 << CLIPCODE_SHIFT),
+    NEGW = (0x40 << CLIPCODE_SHIFT),
 
     GUARDBAND_LEFT   = (0x80 << CLIPCODE_SHIFT | 0x1),
     GUARDBAND_TOP    = (0x80 << CLIPCODE_SHIFT | 0x2),
@@ -59,32 +60,41 @@
     GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8)
 };
 
-#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
-#define FRUSTUM_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|FRUSTUM_LEFT|FRUSTUM_RIGHT|FRUSTUM_TOP|FRUSTUM_BOTTOM)
+#define GUARDBAND_CLIP_MASK                                                          \
+    (FRUSTUM_NEAR | FRUSTUM_FAR | GUARDBAND_LEFT | GUARDBAND_TOP | GUARDBAND_RIGHT | \
+     GUARDBAND_BOTTOM | NEGW)
+#define FRUSTUM_CLIP_MASK \
+    (FRUSTUM_NEAR | FRUSTUM_FAR | FRUSTUM_LEFT | FRUSTUM_RIGHT | FRUSTUM_TOP | FRUSTUM_BOTTOM)
 
-template<typename SIMD_T>
-void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes)
+template <typename SIMD_T>
+void ComputeClipCodes(const API_STATE&       state,
+                      const Vec4<SIMD_T>&    vertex,
+                      Float<SIMD_T>&         clipCodes,
+                      Integer<SIMD_T> const& viewportIndexes)
 {
     clipCodes = SIMD_T::setzero_ps();
 
     // -w
-    Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
+    Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w, SIMD_T::set1_ps(-1.0f));
 
     // FRUSTUM_LEFT
     Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
-    clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
+    clipCodes          = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
 
     // FRUSTUM_TOP
-    vRes = SIMD_T::cmplt_ps(vertex.y, vNegW);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
+    vRes      = SIMD_T::cmplt_ps(vertex.y, vNegW);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_TOP))));
 
     // FRUSTUM_RIGHT
-    vRes = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
+    vRes      = SIMD_T::cmpgt_ps(vertex.x, vertex.w);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_RIGHT))));
 
     // FRUSTUM_BOTTOM
-    vRes = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
+    vRes      = SIMD_T::cmpgt_ps(vertex.y, vertex.w);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_BOTTOM))));
 
     if (state.rastState.depthClipEnable)
     {
@@ -98,50 +108,66 @@
         {
             vRes = SIMD_T::cmplt_ps(vertex.z, vNegW);
         }
-        clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
+        clipCodes = SIMD_T::or_ps(
+            clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_NEAR))));
 
         // FRUSTUM_FAR
-        vRes = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
-        clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
+        vRes      = SIMD_T::cmpgt_ps(vertex.z, vertex.w);
+        clipCodes = SIMD_T::or_ps(
+            clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_FAR))));
     }
 
     // NEGW
     vRes = SIMD_T::cmple_ps(vertex.w, SIMD_T::setzero_ps());
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
+    clipCodes =
+        SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
 
     // GUARDBAND_LEFT
-    Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes));
-    vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
+    Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW,
+                                          SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+                                              &state.gbState.left[0], viewportIndexes));
+    vRes                 = SIMD_T::cmplt_ps(vertex.x, gbMult);
+    clipCodes            = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
 
     // GUARDBAND_TOP
-    gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes));
-    vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
+    gbMult    = SIMD_T::mul_ps(vNegW,
+                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+                                &state.gbState.top[0], viewportIndexes));
+    vRes      = SIMD_T::cmplt_ps(vertex.y, gbMult);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
 
     // GUARDBAND_RIGHT
-    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes));
-    vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
+    gbMult    = SIMD_T::mul_ps(vertex.w,
+                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+                                &state.gbState.right[0], viewportIndexes));
+    vRes      = SIMD_T::cmpgt_ps(vertex.x, gbMult);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
 
     // GUARDBAND_BOTTOM
-    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes));
-    vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
-    clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
+    gbMult    = SIMD_T::mul_ps(vertex.w,
+                            SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(
+                                &state.gbState.bottom[0], viewportIndexes));
+    vRes      = SIMD_T::cmpgt_ps(vertex.y, gbMult);
+    clipCodes = SIMD_T::or_ps(
+        clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
 }
 
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct BinnerChooser
 {
 };
 
-template<>
+template <>
 struct BinnerChooser<SIMD256>
 {
     PFN_PROCESS_PRIMS pfnBinFunc;
 
     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
-        :pfnBinFunc(nullptr)
+        :
+        pfnBinFunc(nullptr)
     {
         if (numVertsPerPrim == 3)
         {
@@ -159,7 +185,8 @@
     }
 
     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
-        :pfnBinFunc(nullptr)
+        :
+        pfnBinFunc(nullptr)
     {
         switch (topology)
         {
@@ -179,7 +206,14 @@
         };
     }
 
-    void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD256::Vec4 prims[], uint32_t primMask, SIMD256::Integer const &primID, SIMD256::Integer &viewportIdx, SIMD256::Integer &rtIdx)
+    void BinFunc(DRAW_CONTEXT*           pDC,
+                 PA_STATE&               pa,
+                 uint32_t                workerId,
+                 SIMD256::Vec4           prims[],
+                 uint32_t                primMask,
+                 SIMD256::Integer const& primID,
+                 SIMD256::Integer&       viewportIdx,
+                 SIMD256::Integer&       rtIdx)
     {
         SWR_ASSERT(pfnBinFunc != nullptr);
 
@@ -188,13 +222,14 @@
 };
 
 #if USE_SIMD16_FRONTEND
-template<>
+template <>
 struct BinnerChooser<SIMD512>
 {
     PFN_PROCESS_PRIMS_SIMD16 pfnBinFunc;
 
     BinnerChooser(uint32_t numVertsPerPrim, uint32_t conservativeRast)
-        :pfnBinFunc(nullptr)
+        :
+        pfnBinFunc(nullptr)
     {
         if (numVertsPerPrim == 3)
         {
@@ -212,7 +247,8 @@
     }
 
     BinnerChooser(PRIMITIVE_TOPOLOGY topology, uint32_t conservativeRast)
-        :pfnBinFunc(nullptr)
+        :
+        pfnBinFunc(nullptr)
     {
         switch (topology)
         {
@@ -232,7 +268,14 @@
         };
     }
 
-    void BinFunc(DRAW_CONTEXT *pDC, PA_STATE &pa, uint32_t workerId, SIMD512::Vec4 prims[], uint32_t primMask, SIMD512::Integer const &primID, SIMD512::Integer &viewportIdx, SIMD512::Integer &rtIdx)
+    void BinFunc(DRAW_CONTEXT*           pDC,
+                 PA_STATE&               pa,
+                 uint32_t                workerId,
+                 SIMD512::Vec4           prims[],
+                 uint32_t                primMask,
+                 SIMD512::Integer const& primID,
+                 SIMD512::Integer&       viewportIdx,
+                 SIMD512::Integer&       rtIdx)
     {
         SWR_ASSERT(pfnBinFunc != nullptr);
 
@@ -241,18 +284,15 @@
 };
 
 #endif
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct SimdHelper
 {
 };
 
-template<>
+template <>
 struct SimdHelper<SIMD256>
 {
-    static SIMD256::Float insert_lo_ps(SIMD256::Float a)
-    {
-        return a;
-    }
+    static SIMD256::Float insert_lo_ps(SIMD256::Float a) { return a; }
 
     static SIMD256::Mask cmpeq_ps_mask(SIMD256::Float a, SIMD256::Float b)
     {
@@ -261,7 +301,7 @@
 };
 
 #if USE_SIMD16_FRONTEND
-template<>
+template <>
 struct SimdHelper<SIMD512>
 {
     static SIMD512::Float insert_lo_ps(SIMD256::Float a)
@@ -277,32 +317,26 @@
 
 #endif
 // Temp storage used by the clipper
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct ClipHelper
 {
 };
 
-template<>
+template <>
 struct ClipHelper<SIMD256>
 {
-    static SIMDVERTEX_T<SIMD256> *GetTempVertices()
-    {
-        return tlsTempVertices;
-    }
+    static SIMDVERTEX_T<SIMD256>* GetTempVertices() { return tlsTempVertices; }
 };
 
 #if USE_SIMD16_FRONTEND
-template<>
+template <>
 struct ClipHelper<SIMD512>
 {
-    static SIMDVERTEX_T<SIMD512> *GetTempVertices()
-    {
-        return tlsTempVertices_simd16;
-    }
+    static SIMDVERTEX_T<SIMD512>* GetTempVertices() { return tlsTempVertices_simd16; }
 };
 
 #endif
-template<typename SIMD_T, uint32_t NumVertsPerPrim>
+template <typename SIMD_T, uint32_t NumVertsPerPrim>
 class Clipper
 {
 public:
@@ -312,7 +346,7 @@
         static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
     }
 
-    void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes)
+    void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T>& viewportIndexes)
     {
         for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
         {
@@ -348,7 +382,8 @@
     {
         Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
 
-        clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
+        clipUnion =
+            SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
 
         return SIMD_T::movemask_ps(SIMD_T::cmpneq_ps(clipUnion, SIMD_T::setzero_ps()));
     }
@@ -360,19 +395,21 @@
 
         for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
         {
-            Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
+            Float<SIMD_T> vNan01 =
+                SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
             vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
 
-            Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
+            Float<SIMD_T> vNan23 =
+                SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
             vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
         }
 
         return SIMD_T::movemask_ps(vNanMask);
     }
 
-    int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[])
+    int ComputeUserClipCullMask(PA_STATE& pa, Vec4<SIMD_T> prim[])
     {
-        uint8_t cullMask = state.backendState.cullDistanceMask;
+        uint8_t  cullMask             = state.backendState.cullDistanceMask;
         uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
 
         Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
@@ -387,7 +424,7 @@
         while (_BitScanForward(&index, cullMask))
         {
             cullMask &= ~(1 << index);
-            uint32_t slot = index >> 2;
+            uint32_t slot      = index >> 2;
             uint32_t component = index & 0x3;
 
             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
@@ -404,7 +441,8 @@
                 }
 
                 // cull if cull distance < 0 || NAN
-                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
+                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
+                    SIMD_T::setzero_ps(), vCullComp);
                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
             }
             vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
@@ -415,7 +453,7 @@
         while (_BitScanForward(&index, clipMask))
         {
             clipMask &= ~(1 << index);
-            uint32_t slot = index >> 2;
+            uint32_t slot      = index >> 2;
             uint32_t component = index & 0x3;
 
             Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
@@ -431,8 +469,10 @@
                     vClipComp = vClipCullDistHi[e][component];
                 }
 
-                Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
-                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
+                Float<SIMD_T> vClip =
+                    SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
+                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(
+                    SIMD_T::setzero_ps(), vClipComp);
                 vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
                 vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
             }
@@ -442,14 +482,19 @@
         return SIMD_T::movemask_ps(vClipCullMask);
     }
 
-    void ClipSimd(const Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa,
-                  const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx)
+    void ClipSimd(const Vec4<SIMD_T>     prim[],
+                  const Float<SIMD_T>&   vPrimMask,
+                  const Float<SIMD_T>&   vClipMask,
+                  PA_STATE&              pa,
+                  const Integer<SIMD_T>& vPrimId,
+                  const Integer<SIMD_T>& vViewportIdx,
+                  const Integer<SIMD_T>& vRtIdx)
     {
         // input/output vertex store for clipper
         SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
 
         uint32_t constantInterpMask = state.backendState.constantInterpolationMask;
-        uint32_t provokingVertex = 0;
+        uint32_t provokingVertex    = 0;
         if (pa.binTopology == TOP_TRIANGLE_FAN)
         {
             provokingVertex = state.frontendState.provokingVertex.triFan;
@@ -470,8 +515,9 @@
         for (uint32_t slot = 0; slot < backendState.numAttributes; ++slot)
         {
             // Compute absolute attrib slot in vertex array
-            uint32_t mapSlot = backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
-            maxSlot = std::max<int32_t>(maxSlot, mapSlot);
+            uint32_t mapSlot =
+                backendState.swizzleEnable ? backendState.swizzleMap[slot].sourceAttrib : slot;
+            maxSlot            = std::max<int32_t>(maxSlot, mapSlot);
             uint32_t inputSlot = backendState.vertexAttribOffset + mapSlot;
 
             pa.Assemble(inputSlot, tmpVector);
@@ -516,9 +562,11 @@
 
         uint32_t numAttribs = maxSlot + 1;
 
-        Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+        Integer<SIMD_T> vNumClippedVerts =
+            ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
 
-        BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
+        BinnerChooser<SIMD_T> binner(NumVertsPerPrim,
+                                     pa.pDC->pState->state.rastState.conservativeRast);
 
         // set up new PA for binning clipped primitives
         PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN;
@@ -531,6 +579,10 @@
             {
                 clipTopology = TOP_POINT_LIST;
             }
+            else if (pa.binTopology == TOP_RECT_LIST)
+            {
+                clipTopology = TOP_RECT_LIST;
+            }
         }
         else if (NumVertsPerPrim == 2)
         {
@@ -541,20 +593,20 @@
             SWR_ASSERT(0 && "Unexpected points in clipper.");
         }
 
-        const uint32_t *pVertexCount = reinterpret_cast<const uint32_t *>(&vNumClippedVerts);
-        const uint32_t *pPrimitiveId = reinterpret_cast<const uint32_t *>(&vPrimId);
-        const uint32_t *pViewportIdx = reinterpret_cast<const uint32_t *>(&vViewportIdx);
-        const uint32_t *pRtIdx = reinterpret_cast<const uint32_t *>(&vRtIdx);
+        const uint32_t* pVertexCount = reinterpret_cast<const uint32_t*>(&vNumClippedVerts);
+        const uint32_t* pPrimitiveId = reinterpret_cast<const uint32_t*>(&vPrimId);
+        const uint32_t* pViewportIdx = reinterpret_cast<const uint32_t*>(&vViewportIdx);
+        const uint32_t* pRtIdx       = reinterpret_cast<const uint32_t*>(&vRtIdx);
 
-        const SIMD256::Integer vOffsets = SIMD256::set_epi32(
-            0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
-            6 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            5 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            4 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            3 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            2 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            1 * sizeof(SIMDVERTEX_T<SIMD_T>),
-            0 * sizeof(SIMDVERTEX_T<SIMD_T>));
+        const SIMD256::Integer vOffsets =
+            SIMD256::set_epi32(0 * sizeof(SIMDVERTEX_T<SIMD_T>), // unused lane
+                               6 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               5 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               4 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               3 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               2 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               1 * sizeof(SIMDVERTEX_T<SIMD_T>),
+                               0 * sizeof(SIMDVERTEX_T<SIMD_T>));
 
         // only need to gather 7 verts
         // @todo dynamic mask based on actual # of verts generated per lane
@@ -567,14 +619,16 @@
         // for triangle fan
 
 #if defined(_DEBUG)
-        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack overflow in debug builds
-        SIMDVERTEX_T<SIMD_T> *transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T> *>(AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
+        // TODO: need to increase stack size, allocating SIMD16-widened transposedPrims causes stack
+        // overflow in debug builds
+        SIMDVERTEX_T<SIMD_T>* transposedPrims = reinterpret_cast<SIMDVERTEX_T<SIMD_T>*>(
+            AlignedMalloc(sizeof(SIMDVERTEX_T<SIMD_T>) * 2, 64));
 
 #else
-        SIMDVERTEX_T<SIMD_T> transposedPrims[2];
+        SIMDVERTEX_T<SIMD_T>  transposedPrims[2];
 
 #endif
-        uint32_t numInputPrims = pa.NumPrims();
+        uint32_t              numInputPrims = pa.NumPrims();
         for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
         {
             uint32_t numEmittedVerts = pVertexCount[inputPrim];
@@ -594,7 +648,8 @@
             // for triangle fan
 
             // transpose pos
-            uint8_t *pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim;
+            uint8_t* pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[VERTEX_POSITION_SLOT]) +
+                             sizeof(float) * inputPrim;
 
 #if 0
             // TEMPORARY WORKAROUND for bizarre VS2015 code-gen bug
@@ -603,13 +658,17 @@
 #endif
             for (uint32_t c = 0; c < 4; ++c)
             {
-                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
-                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                    SIMD256::setzero_ps(), reinterpret_cast<const float*>(pBase), vOffsets, vMask);
+                transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] =
+                    SimdHelper<SIMD_T>::insert_lo_ps(temp);
                 pBase += sizeof(Float<SIMD_T>);
             }
 
             // transpose attribs
-            pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[backendState.vertexAttribOffset]) + sizeof(float) * inputPrim;
+            pBase =
+                reinterpret_cast<uint8_t*>(&vertices[0].attrib[backendState.vertexAttribOffset]) +
+                sizeof(float) * inputPrim;
 
             for (uint32_t attrib = 0; attrib < numAttribs; ++attrib)
             {
@@ -617,8 +676,14 @@
 
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
-                    transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+                    SIMD256::Float temp =
+                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                            SIMD256::setzero_ps(),
+                            reinterpret_cast<const float*>(pBase),
+                            vOffsets,
+                            vMask);
+                    transposedPrims[0].attrib[attribSlot][c] =
+                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
                     pBase += sizeof(Float<SIMD_T>);
                 }
             }
@@ -627,40 +692,60 @@
             uint32_t vertexClipCullSlot = backendState.vertexClipCullOffset;
             if (state.backendState.clipDistanceMask & 0x0f)
             {
-                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot]) + sizeof(float) * inputPrim;
+                pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot]) +
+                        sizeof(float) * inputPrim;
 
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
-                    transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+                    SIMD256::Float temp =
+                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                            SIMD256::setzero_ps(),
+                            reinterpret_cast<const float*>(pBase),
+                            vOffsets,
+                            vMask);
+                    transposedPrims[0].attrib[vertexClipCullSlot][c] =
+                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
                     pBase += sizeof(Float<SIMD_T>);
                 }
             }
 
             if (state.backendState.clipDistanceMask & 0xf0)
             {
-                pBase = reinterpret_cast<uint8_t *>(&vertices[0].attrib[vertexClipCullSlot + 1]) + sizeof(float) * inputPrim;
+                pBase = reinterpret_cast<uint8_t*>(&vertices[0].attrib[vertexClipCullSlot + 1]) +
+                        sizeof(float) * inputPrim;
 
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
-                    transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
+                    SIMD256::Float temp =
+                        SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                            SIMD256::setzero_ps(),
+                            reinterpret_cast<const float*>(pBase),
+                            vOffsets,
+                            vMask);
+                    transposedPrims[0].attrib[vertexClipCullSlot + 1][c] =
+                        SimdHelper<SIMD_T>::insert_lo_ps(temp);
                     pBase += sizeof(Float<SIMD_T>);
                 }
             }
 
-            PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
+            PA_STATE_OPT clipPA(pDC,
+                                numEmittedPrims,
+                                reinterpret_cast<uint8_t*>(&transposedPrims[0]),
+                                numEmittedVerts,
+                                SWR_VTX_NUM_SLOTS,
+                                true,
+                                NumVertsPerPrim,
+                                clipTopology);
             clipPA.viewportArrayActive = pa.viewportArrayActive;
-            clipPA.rtArrayActive = pa.rtArrayActive;
+            clipPA.rtArrayActive       = pa.rtArrayActive;
 
-            static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
+            static const uint32_t primMaskMap[] = {0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f};
 
             const uint32_t primMask = primMaskMap[numEmittedPrims];
 
-            const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
+            const Integer<SIMD_T> primID      = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
             const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
-            const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
-
+            const Integer<SIMD_T> rtIdx       = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
 
             while (clipPA.GetNextStreamOutput())
             {
@@ -672,7 +757,8 @@
 
                     if (assemble)
                     {
-                        binner.pfnBinFunc(pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
+                        binner.pfnBinFunc(
+                            pDC, clipPA, workerId, attrib, primMask, primID, viewportIdx, rtIdx);
                     }
 
                 } while (clipPA.NextPrim());
@@ -687,12 +773,17 @@
         UPDATE_STAT_FE(CPrimitives, numClippedPrims);
     }
 
-    void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask,
-                      Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx)
+    void ExecuteStage(PA_STATE&              pa,
+                      Vec4<SIMD_T>           prim[],
+                      uint32_t               primMask,
+                      Integer<SIMD_T> const& primId,
+                      Integer<SIMD_T> const& viewportIdx,
+                      Integer<SIMD_T> const& rtIdx)
     {
         SWR_ASSERT(pa.pDC != nullptr);
 
-        BinnerChooser<SIMD_T> binner(pa.binTopology, pa.pDC->pState->state.rastState.conservativeRast);
+        BinnerChooser<SIMD_T> binner(pa.binTopology,
+                                     pa.pDC->pState->state.rastState.conservativeRast);
 
         // update clipper invocations pipeline stat
         uint32_t numInvoc = _mm_popcnt_u32(primMask);
@@ -703,7 +794,7 @@
         // cull prims with NAN coords
         primMask &= ~ComputeNaNMask(prim);
 
-        // user cull distance cull 
+        // user cull distance cull
         if (state.backendState.cullDistanceMask | state.backendState.clipDistanceMask)
         {
             primMask &= ~ComputeUserClipCullMask(pa, prim);
@@ -711,10 +802,12 @@
 
         Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
         // Mask out non-frustum codes
-        clipIntersection = SIMD_T::and_ps(clipIntersection, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
+        clipIntersection = SIMD_T::and_ps(clipIntersection,
+                                          SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_CLIP_MASK)));
 
         // cull prims outside view frustum
-        int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
+        int validMask =
+            primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
 
         // skip clipping for points
         uint32_t clipMask = 0;
@@ -730,7 +823,13 @@
             RDTSC_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(prim, SIMD_T::vmask_ps(validMask), SIMD_T::vmask_ps(clipMask), pa, primId, viewportIdx, rtIdx);
+            ClipSimd(prim,
+                     SIMD_T::vmask_ps(validMask),
+                     SIMD_T::vmask_ps(clipMask),
+                     pa,
+                     primId,
+                     viewportIdx,
+                     rtIdx);
             RDTSC_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -739,24 +838,26 @@
             UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
 
             // forward valid prims directly to binner
-            binner.pfnBinFunc(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
+            binner.pfnBinFunc(
+                this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx, rtIdx);
         }
     }
 
 private:
-    Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1)
+    Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const& boundaryCoord0,
+                                      Float<SIMD_T> const& boundaryCoord1)
     {
         return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
     }
 
-    Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component)
+    Integer<SIMD_T>
+    ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const& vIndices, uint32_t component)
     {
         const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
         const uint32_t componentStride  = sizeof(Float<SIMD_T>);
         const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
 
-        static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
-        {
+        static const OSALIGNSIMD16(uint32_t) elemOffset[16] = {
             0 * sizeof(float),
             1 * sizeof(float),
             2 * sizeof(float),
@@ -775,15 +876,19 @@
             15 * sizeof(float),
         };
 
-        static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
+        static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset),
+                      "Clipper::ComputeOffsets, Increase number of element offsets.");
 
-        Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset));
+        Integer<SIMD_T> vElemOffset =
+            SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T>*>(elemOffset));
 
         // step to the simdvertex
-        Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
+        Integer<SIMD_T> vOffsets =
+            SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
 
         // step to the attribute and component
-        vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
+        vOffsets = SIMD_T::add_epi32(
+            vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
 
         // step to the lane
         vOffsets = SIMD_T::add_epi32(vOffsets, vElemOffset);
@@ -791,53 +896,71 @@
         return vOffsets;
     }
 
-    Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component)
+    Float<SIMD_T> GatherComponent(const float*           pBuffer,
+                                  uint32_t               attrib,
+                                  Float<SIMD_T> const&   vMask,
+                                  Integer<SIMD_T> const& vIndices,
+                                  uint32_t               component)
     {
         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
-        Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
+        Float<SIMD_T>   vSrc     = SIMD_T::setzero_ps();
 
-        return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask);
+        return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+            vSrc, pBuffer, vOffsets, vMask);
     }
 
-    void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc)
+    void ScatterComponent(const float*           pBuffer,
+                          uint32_t               attrib,
+                          Float<SIMD_T> const&   vMask,
+                          Integer<SIMD_T> const& vIndices,
+                          uint32_t               component,
+                          Float<SIMD_T> const&   vSrc)
     {
         Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
 
-        const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
-        const float *pSrc = reinterpret_cast<const float *>(&vSrc);
-        uint32_t mask = SIMD_T::movemask_ps(vMask);
-        DWORD lane;
+        const uint32_t* pOffsets = reinterpret_cast<const uint32_t*>(&vOffsets);
+        const float*    pSrc     = reinterpret_cast<const float*>(&vSrc);
+        uint32_t        mask     = SIMD_T::movemask_ps(vMask);
+        DWORD           lane;
         while (_BitScanForward(&lane, mask))
         {
             mask &= ~(1 << lane);
-            const uint8_t *pBuf = reinterpret_cast<const uint8_t *>(pBuffer) + pOffsets[lane];
-            *(float *)pBuf = pSrc[lane];
+            const uint8_t* pBuf = reinterpret_cast<const uint8_t*>(pBuffer) + pOffsets[lane];
+            *(float*)pBuf       = pSrc[lane];
         }
     }
 
-    template<SWR_CLIPCODES ClippingPlane>
-    void intersect(
-        const Float<SIMD_T> &vActiveMask,  // active lanes to operate on
-        const Integer<SIMD_T> &s,          // index to first edge vertex v0 in pInPts.
-        const Integer<SIMD_T> &p,          // index to second edge vertex v1 in pInPts.
-        const Vec4<SIMD_T> &v1,            // vertex 0 position
-        const Vec4<SIMD_T> &v2,            // vertex 1 position
-        Integer<SIMD_T> &outIndex,         // output index.
-        const float *pInVerts,                      // array of all the input positions.
-        uint32_t numInAttribs,                      // number of attributes per vertex.
-        float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
+    template <SWR_CLIPCODES ClippingPlane>
+    void intersect(const Float<SIMD_T>&   vActiveMask,  // active lanes to operate on
+                   const Integer<SIMD_T>& s,            // index to first edge vertex v0 in pInPts.
+                   const Integer<SIMD_T>& p,            // index to second edge vertex v1 in pInPts.
+                   const Vec4<SIMD_T>&    v1,           // vertex 0 position
+                   const Vec4<SIMD_T>&    v2,           // vertex 1 position
+                   Integer<SIMD_T>&       outIndex,     // output index.
+                   const float*           pInVerts,     // array of all the input positions.
+                   uint32_t               numInAttribs, // number of attributes per vertex.
+                   float* pOutVerts) // array of output positions. We'll write our new intersection
+                                     // point at i*4.
     {
-        uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
+        uint32_t vertexAttribOffset   = this->state.backendState.vertexAttribOffset;
         uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
 
         // compute interpolation factor
         Float<SIMD_T> t;
         switch (ClippingPlane)
         {
-        case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
-        case FRUSTUM_RIGHT:     t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0])); break;
-        case FRUSTUM_TOP:       t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1])); break;
-        case FRUSTUM_BOTTOM:    t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1])); break;
+        case FRUSTUM_LEFT:
+            t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0]));
+            break;
+        case FRUSTUM_RIGHT:
+            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[0]), SIMD_T::sub_ps(v2[3], v2[0]));
+            break;
+        case FRUSTUM_TOP:
+            t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[1]), SIMD_T::add_ps(v2[3], v2[1]));
+            break;
+        case FRUSTUM_BOTTOM:
+            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[1]), SIMD_T::sub_ps(v2[3], v2[1]));
+            break;
         case FRUSTUM_NEAR:
             // DX Znear plane is 0, GL is -w
             if (this->state.rastState.clipHalfZ)
@@ -849,8 +972,11 @@
                 t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[2]), SIMD_T::add_ps(v2[3], v2[2]));
             }
             break;
-        case FRUSTUM_FAR:       t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2])); break;
-        default: SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
+        case FRUSTUM_FAR:
+            t = ComputeInterpFactor(SIMD_T::sub_ps(v1[3], v1[2]), SIMD_T::sub_ps(v2[3], v2[2]));
+            break;
+        default:
+            SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
         };
 
         // interpolate position and store
@@ -868,7 +994,8 @@
             {
                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vOutAttrib =
+                    SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
@@ -881,7 +1008,8 @@
             {
                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vOutAttrib =
+                    SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
@@ -893,44 +1021,58 @@
             {
                 Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
                 Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vOutAttrib =
+                    SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
     }
 
-    template<SWR_CLIPCODES ClippingPlane>
-    Float<SIMD_T> inside(const Vec4<SIMD_T> &v)
+    template <SWR_CLIPCODES ClippingPlane>
+    Float<SIMD_T> inside(const Vec4<SIMD_T>& v)
     {
         switch (ClippingPlane)
         {
-        case FRUSTUM_LEFT:      return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_RIGHT:     return SIMD_T::cmple_ps(v[0], v[3]);
-        case FRUSTUM_TOP:       return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_BOTTOM:    return SIMD_T::cmple_ps(v[1], v[3]);
-        case FRUSTUM_NEAR:      return SIMD_T::cmpge_ps(v[2], this->state.rastState.clipHalfZ ? SIMD_T::setzero_ps() : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
-        case FRUSTUM_FAR:       return SIMD_T::cmple_ps(v[2], v[3]);
+        case FRUSTUM_LEFT:
+            return SIMD_T::cmpge_ps(v[0], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+        case FRUSTUM_RIGHT:
+            return SIMD_T::cmple_ps(v[0], v[3]);
+        case FRUSTUM_TOP:
+            return SIMD_T::cmpge_ps(v[1], SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+        case FRUSTUM_BOTTOM:
+            return SIMD_T::cmple_ps(v[1], v[3]);
+        case FRUSTUM_NEAR:
+            return SIMD_T::cmpge_ps(v[2],
+                                    this->state.rastState.clipHalfZ
+                                        ? SIMD_T::setzero_ps()
+                                        : SIMD_T::mul_ps(v[3], SIMD_T::set1_ps(-1.0f)));
+        case FRUSTUM_FAR:
+            return SIMD_T::cmple_ps(v[2], v[3]);
         default:
             SWR_INVALID("invalid clipping plane: %d", ClippingPlane);
             return SIMD_T::setzero_ps();
         }
     }
 
-    template<SWR_CLIPCODES ClippingPlane>
-    Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+    template <SWR_CLIPCODES ClippingPlane>
+    Integer<SIMD_T> ClipTriToPlane(const float*           pInVerts,
+                                   const Integer<SIMD_T>& vNumInPts,
+                                   uint32_t               numInAttribs,
+                                   float*                 pOutVerts)
     {
         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 
-        Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
-        Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
-        Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+        Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
+        Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
+        Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 
         while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
         {
-            Integer<SIMD_T> s = vCurIndex;
-            Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+            Integer<SIMD_T> s             = vCurIndex;
+            Integer<SIMD_T> p             = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
             Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
-            p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
+            p                             = SIMD_T::castps_si(SIMD_T::blendv_ps(
+                SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
 
             // gather position
             Vec4<SIMD_T> vInPos0, vInPos1;
@@ -946,7 +1088,7 @@
 
             // compute intersection mask (s_in != p_in)
             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
-            intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
+            intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
 
             // store s if inside
             s_in = SIMD_T::and_ps(s_in, vActiveMask);
@@ -955,7 +1097,8 @@
                 // store position
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+                    ScatterComponent(
+                        pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                 }
 
                 // store attribs
@@ -992,34 +1135,47 @@
                 }
 
                 // increment outIndex
-                vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
+                vOutIndex = SIMD_T::blendv_epi32(
+                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
             }
 
             // compute and store intersection
             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
             {
-                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+                intersect<ClippingPlane>(intersectMask,
+                                         s,
+                                         p,
+                                         vInPos0,
+                                         vInPos1,
+                                         vOutIndex,
+                                         pInVerts,
+                                         numInAttribs,
+                                         pOutVerts);
 
                 // increment outIndex for active lanes
-                vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
+                vOutIndex = SIMD_T::blendv_epi32(
+                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
             }
 
             // increment loop index and update active mask
-            vCurIndex = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
+            vCurIndex   = SIMD_T::add_epi32(vCurIndex, SIMD_T::set1_epi32(1));
             vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
         }
 
         return vOutIndex;
     }
 
-    template<SWR_CLIPCODES ClippingPlane>
-    Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+    template <SWR_CLIPCODES ClippingPlane>
+    Integer<SIMD_T> ClipLineToPlane(const float*           pInVerts,
+                                    const Integer<SIMD_T>& vNumInPts,
+                                    uint32_t               numInAttribs,
+                                    float*                 pOutVerts)
     {
         uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
 
-        Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
-        Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
-        Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+        Integer<SIMD_T> vCurIndex   = SIMD_T::setzero_si();
+        Integer<SIMD_T> vOutIndex   = SIMD_T::setzero_si();
+        Float<SIMD_T>   vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
 
         if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
         {
@@ -1040,7 +1196,7 @@
 
             // compute intersection mask (s_in != p_in)
             Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
-            intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
+            intersectMask               = SIMD_T::and_ps(intersectMask, vActiveMask);
 
             // store s if inside
             s_in = SIMD_T::and_ps(s_in, vActiveMask);
@@ -1048,7 +1204,8 @@
             {
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
+                    ScatterComponent(
+                        pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]);
                 }
 
                 // interpolate attributes and store
@@ -1063,16 +1220,26 @@
                 }
 
                 // increment outIndex
-                vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
+                vOutIndex = SIMD_T::blendv_epi32(
+                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), s_in);
             }
 
             // compute and store intersection
             if (!SIMD_T::testz_ps(intersectMask, intersectMask))
             {
-                intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts);
+                intersect<ClippingPlane>(intersectMask,
+                                         s,
+                                         p,
+                                         vInPos0,
+                                         vInPos1,
+                                         vOutIndex,
+                                         pInVerts,
+                                         numInAttribs,
+                                         pOutVerts);
 
                 // increment outIndex for active lanes
-                vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
+                vOutIndex = SIMD_T::blendv_epi32(
+                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), intersectMask);
             }
 
             // store p if inside
@@ -1081,7 +1248,8 @@
             {
                 for (uint32_t c = 0; c < 4; ++c)
                 {
-                    ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
+                    ScatterComponent(
+                        pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]);
                 }
 
                 // interpolate attributes and store
@@ -1096,17 +1264,21 @@
                 }
 
                 // increment outIndex
-                vOutIndex = SIMD_T::blendv_epi32(vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
+                vOutIndex = SIMD_T::blendv_epi32(
+                    vOutIndex, SIMD_T::add_epi32(vOutIndex, SIMD_T::set1_epi32(1)), p_in);
             }
         }
 
         return vOutIndex;
     }
 
-    Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs)
+    Integer<SIMD_T> ClipPrims(float*               pVertices,
+                              const Float<SIMD_T>& vPrimMask,
+                              const Float<SIMD_T>& vClipMask,
+                              int                  numAttribs)
     {
         // temp storage
-        float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
+        float* pTempVerts = reinterpret_cast<float*>(ClipHelper<SIMD_T>::GetTempVertices());
 
         // zero out num input verts for non-active lanes
         Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
@@ -1118,43 +1290,109 @@
         {
             vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
             vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+            vNumOutPts =
+                ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+            vNumOutPts =
+                ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+            vNumOutPts =
+                ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
             vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
         }
         else
         {
             SWR_ASSERT(NumVertsPerPrim == 2);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
-            vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts);
+            vNumOutPts =
+                ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices);
         }
 
         // restore num verts for non-clipped, active lanes
         Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
-        vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
+        vNumOutPts =
+            SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
 
         return vNumOutPts;
     }
 
-    const uint32_t workerId{ 0 };
-    DRAW_CONTEXT *pDC{ nullptr };
-    const API_STATE &state;
-    Float<SIMD_T> clipCodes[NumVertsPerPrim];
+    const uint32_t   workerId{0};
+    DRAW_CONTEXT*    pDC{nullptr};
+    const API_STATE& state;
+    Float<SIMD_T>    clipCodes[NumVertsPerPrim];
 };
 
-
 // pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari const &primId, simdscalari const &viewportIdx, simdscalari const &rtIdx);
+void ClipRectangles(DRAW_CONTEXT*      pDC,
+                    PA_STATE&          pa,
+                    uint32_t           workerId,
+                    simdvector         prims[],
+                    uint32_t           primMask,
+                    simdscalari const& primId,
+                    simdscalari const& viewportIdx,
+                    simdscalari const& rtIdx);
+void ClipTriangles(DRAW_CONTEXT*      pDC,
+                   PA_STATE&          pa,
+                   uint32_t           workerId,
+                   simdvector         prims[],
+                   uint32_t           primMask,
+                   simdscalari const& primId,
+                   simdscalari const& viewportIdx,
+                   simdscalari const& rtIdx);
+void ClipLines(DRAW_CONTEXT*      pDC,
+               PA_STATE&          pa,
+               uint32_t           workerId,
+               simdvector         prims[],
+               uint32_t           primMask,
+               simdscalari const& primId,
+               simdscalari const& viewportIdx,
+               simdscalari const& rtIdx);
+void ClipPoints(DRAW_CONTEXT*      pDC,
+                PA_STATE&          pa,
+                uint32_t           workerId,
+                simdvector         prims[],
+                uint32_t           primMask,
+                simdscalari const& primId,
+                simdscalari const& viewportIdx,
+                simdscalari const& rtIdx);
 #if USE_SIMD16_FRONTEND
-void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
-void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
-void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari const &primId, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
+void SIMDCALL ClipRectangles_simd16(DRAW_CONTEXT*        pDC,
+                                    PA_STATE&            pa,
+                                    uint32_t             workerId,
+                                    simd16vector         prims[],
+                                    uint32_t             primMask,
+                                    simd16scalari const& primId,
+                                    simd16scalari const& viewportIdx,
+                                    simd16scalari const& rtIdx);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT*        pDC,
+                                   PA_STATE&            pa,
+                                   uint32_t             workerId,
+                                   simd16vector         prims[],
+                                   uint32_t             primMask,
+                                   simd16scalari const& primId,
+                                   simd16scalari const& viewportIdx,
+                                   simd16scalari const& rtIdx);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT*        pDC,
+                               PA_STATE&            pa,
+                               uint32_t             workerId,
+                               simd16vector         prims[],
+                               uint32_t             primMask,
+                               simd16scalari const& primId,
+                               simd16scalari const& viewportIdx,
+                               simd16scalari const& rtIdx);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT*        pDC,
+                                PA_STATE&            pa,
+                                uint32_t             workerId,
+                                simd16vector         prims[],
+                                uint32_t             primMask,
+                                simd16scalari const& primId,
+                                simd16scalari const& viewportIdx,
+                                simd16scalari const& rtIdx);
 #endif
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
index 00c3a87..9e7f96c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
+++ b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h
@@ -1,28 +1,28 @@
 /****************************************************************************
-* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file conservativerast.h
-*
-******************************************************************************/
+ * Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file conservativerast.h
+ *
+ ******************************************************************************/
 #pragma once
 #include <type_traits>
 #include "common/simdintrin.h"
@@ -38,77 +38,82 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief convenience typedefs for supported Fixed Point precisions
 typedef std::integral_constant<uint32_t, FP_UNINIT> Fixed_Uninit;
-typedef std::integral_constant<uint32_t, _16_8> Fixed_16_8;
-typedef std::integral_constant<uint32_t, _16_9> Fixed_16_9;
-typedef std::integral_constant<uint32_t, _X_16> Fixed_X_16;
+typedef std::integral_constant<uint32_t, _16_8>     Fixed_16_8;
+typedef std::integral_constant<uint32_t, _16_9>     Fixed_16_9;
+typedef std::integral_constant<uint32_t, _X_16>     Fixed_X_16;
 
 //////////////////////////////////////////////////////////////////////////
 /// @struct FixedPointTraits
-/// @brief holds constants relating to converting between FP and Fixed point 
+/// @brief holds constants relating to converting between FP and Fixed point
 /// @tparam FT: fixed precision type
-template<typename FT>
-struct FixedPointTraits{};
+template <typename FT>
+struct FixedPointTraits
+{
+};
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Fixed_16_8 specialization of FixedPointTraits
-template<>
+template <>
 struct FixedPointTraits<Fixed_16_8>
 {
     /// multiplier to go from FP32 to Fixed Point 16.8
     typedef std::integral_constant<uint32_t, 256> ScaleT;
     /// number of bits to shift to go from 16.8 fixed => int32
     typedef std::integral_constant<uint32_t, 8> BitsT;
-    typedef Fixed_16_8 TypeT;
+    typedef Fixed_16_8                          TypeT;
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Fixed_16_9 specialization of FixedPointTraits
-template<>
+template <>
 struct FixedPointTraits<Fixed_16_9>
 {
     /// multiplier to go from FP32 to Fixed Point 16.9
     typedef std::integral_constant<uint32_t, 512> ScaleT;
     /// number of bits to shift to go from 16.9 fixed => int32
     typedef std::integral_constant<uint32_t, 9> BitsT;
-    typedef Fixed_16_9 TypeT;
+    typedef Fixed_16_9                          TypeT;
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Fixed_16_9 specialization of FixedPointTraits
-template<>
+template <>
 struct FixedPointTraits<Fixed_X_16>
 {
     /// multiplier to go from FP32 to Fixed Point X.16
     typedef std::integral_constant<uint32_t, 65536> ScaleT;
     /// number of bits to shift to go from X.16 fixed => int32
     typedef std::integral_constant<uint32_t, 16> BitsT;
-    typedef Fixed_X_16 TypeT;
+    typedef Fixed_X_16                           TypeT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for conservative rasterization modes  
+/// @brief convenience typedefs for conservative rasterization modes
 typedef std::false_type StandardRastT;
-typedef std::true_type ConservativeRastT;
+typedef std::true_type  ConservativeRastT;
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for Input Coverage rasterization modes  
-typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE> NoInputCoverageT;
+/// @brief convenience typedefs for Input Coverage rasterization modes
+typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NONE>   NoInputCoverageT;
 typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_NORMAL> OuterConservativeCoverageT;
-typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> InnerConservativeCoverageT;
+typedef std::integral_constant<uint32_t, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
+    InnerConservativeCoverageT;
 
 //////////////////////////////////////////////////////////////////////////
 /// @struct ConservativeRastTraits
 /// @brief primary ConservativeRastTraits template. Shouldn't be instantiated
 /// @tparam ConservativeT: type of conservative rasterization
 template <typename ConservativeT>
-struct ConservativeRastFETraits {};
+struct ConservativeRastFETraits
+{
+};
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief StandardRast specialization of ConservativeRastTraits
 template <>
 struct ConservativeRastFETraits<StandardRastT>
 {
-    typedef std::false_type IsConservativeT;
+    typedef std::false_type                     IsConservativeT;
     typedef std::integral_constant<uint32_t, 0> BoundingBoxOffsetT;
 };
 
@@ -117,13 +122,13 @@
 template <>
 struct ConservativeRastFETraits<ConservativeRastT>
 {
-    typedef std::true_type IsConservativeT;
+    typedef std::true_type                      IsConservativeT;
     typedef std::integral_constant<uint32_t, 1> BoundingBoxOffsetT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief convenience typedefs for ConservativeRastFETraits 
-typedef ConservativeRastFETraits<StandardRastT> FEStandardRastT;
+/// @brief convenience typedefs for ConservativeRastFETraits
+typedef ConservativeRastFETraits<StandardRastT>     FEStandardRastT;
 typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT;
 
 //////////////////////////////////////////////////////////////////////////
@@ -133,10 +138,11 @@
 /// @tparam ConservativeT: type of conservative rasterization
 /// @tparam InputCoverageT: type of input coverage requested, if any
 template <typename ConservativeT, typename _InputCoverageT>
-struct ConservativeRastBETraits {
-    typedef std::false_type IsConservativeT;
-    typedef _InputCoverageT InputCoverageT;
-    typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
+struct ConservativeRastBETraits
+{
+    typedef std::false_type                    IsConservativeT;
+    typedef _InputCoverageT                    InputCoverageT;
+    typedef FixedPointTraits<Fixed_16_8>       ConservativePrecisionT;
     typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
 };
@@ -146,9 +152,9 @@
 template <typename _InputCoverageT>
 struct ConservativeRastBETraits<StandardRastT, _InputCoverageT>
 {
-    typedef std::false_type IsConservativeT;
-    typedef _InputCoverageT InputCoverageT;
-    typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT;
+    typedef std::false_type                    IsConservativeT;
+    typedef _InputCoverageT                    InputCoverageT;
+    typedef FixedPointTraits<Fixed_16_8>       ConservativePrecisionT;
     typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
 };
@@ -159,16 +165,17 @@
 template <>
 struct ConservativeRastBETraits<ConservativeRastT, NoInputCoverageT>
 {
-    typedef std::true_type IsConservativeT;
+    typedef std::true_type   IsConservativeT;
     typedef NoInputCoverageT InputCoverageT;
 
     typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
 
     /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of 
-    /// of having to compare individual edges to pixel corners to check if any part of the triangle 
-    /// intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT;
+    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
+    /// of having to compare individual edges to pixel corners to check if any part of the
+    /// triangle intersects a pixel
+    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
+                                               ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
 };
 
@@ -178,18 +185,18 @@
 template <>
 struct ConservativeRastBETraits<ConservativeRastT, OuterConservativeCoverageT>
 {
-    typedef std::true_type IsConservativeT;
+    typedef std::true_type             IsConservativeT;
     typedef OuterConservativeCoverageT InputCoverageT;
 
     typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
 
     /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of 
-    /// of having to compare individual edges to pixel corners to check if any part of the triangle 
-    /// intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT;
+    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
+    /// of having to compare individual edges to pixel corners to check if any part of the
+    /// triangle intersects a pixel
+    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
+                                               ConservativeEdgeOffsetT;
     typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT;
-
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -198,19 +205,25 @@
 template <>
 struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT>
 {
-    typedef std::true_type IsConservativeT;
+    typedef std::true_type             IsConservativeT;
     typedef InnerConservativeCoverageT InputCoverageT;
 
     typedef FixedPointTraits<Fixed_16_9> ConservativePrecisionT;
 
     /// offset edge away from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of 
-    /// of having to compare individual edges to pixel corners to check if any part of the triangle 
-    /// intersects a pixel
-    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT;
+    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead
+    /// of having to compare individual edges to pixel corners to check if any part of the
+    /// triangle intersects a pixel
+    typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value / 2) + 1>
+        ConservativeEdgeOffsetT;
 
-    /// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision
-    /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of 
-    /// of having to compare individual edges to pixel corners to check if a pixel is fully covered by a triangle
-    typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1) - ConservativeEdgeOffsetT::value)> InnerConservativeEdgeOffsetT;
+    /// undo the outer conservative offset and offset edge towards pixel center by 1/2 pixel +
+    /// 1/512, in Fixed 16.9 precision. This allows the rasterizer to do the 3 edge coverage tests
+    /// against a single point, instead of having to compare individual edges to pixel corners to
+    /// check if a pixel is fully covered by a triangle
+    typedef std::integral_constant<int32_t,
+                                   static_cast<int32_t>(
+                                       -((ConservativePrecisionT::ScaleT::value / 2) + 1) -
+                                       ConservativeEdgeOffsetT::value)>
+        InnerConservativeEdgeOffsetT;
 };
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index af8f4b8..6d378ed 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -1,34 +1,34 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file context.h
-*
-* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
-*        The SWR_CONTEXT is our global context and contains the DC ring,
-*        thread state, etc.
-*
-*        The DRAW_CONTEXT contains all state associated with a draw operation.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file context.h
+ *
+ * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT
+ *        The SWR_CONTEXT is our global context and contains the DC ring,
+ *        thread state, etc.
+ *
+ *        The DRAW_CONTEXT contains all state associated with a draw operation.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <condition_variable>
@@ -59,9 +59,9 @@
 {
     uint32_t frontFacing : 1;
     uint32_t yMajor : 1;
-    uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
+    uint32_t coverageMask : (SIMD_TILE_X_DIM* SIMD_TILE_Y_DIM);
     uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM);
-    float pointSize;
+    float    pointSize;
     uint32_t renderTargetArrayIndex;
     uint32_t viewportIndex;
 };
@@ -77,14 +77,15 @@
     float OneOverW[3];
     float recipDet;
 
-    float *pRecipW;
-    float *pAttribs;
-    float *pPerspAttribs;
-    float *pSamplePos;
-    float *pUserClipBuffer;
+    float* pRecipW;
+    float* pAttribs;
+    float* pPerspAttribs;
+    float* pSamplePos;
+    float* pUserClipBuffer;
 
     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
-    uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered
+    uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if
+                                // entire pixel is covered
     uint64_t anyCoveredSamples;
 
     TRI_FLAGS triFlags;
@@ -92,10 +93,10 @@
 
 struct TRIANGLE_WORK_DESC
 {
-    float *pTriBuffer;
-    float *pAttribs;
-    float *pUserClipBuffer;
-    uint32_t numAttribs;
+    float* pTriBuffer;
+    float* pAttribs;
+    float* pUserClipBuffer;
+    uint32_t  numAttribs;
     TRI_FLAGS triFlags;
 };
 
@@ -104,33 +105,33 @@
     SWR_RECT rect;
     uint32_t attachmentMask;
     uint32_t renderTargetArrayIndex;
-    float clearRTColor[4];  // RGBA_32F
-    float clearDepth;   // [0..1]
-    uint8_t clearStencil;
+    float    clearRTColor[4]; // RGBA_32F
+    float    clearDepth;      // [0..1]
+    uint8_t  clearStencil;
 };
 
 struct DISCARD_INVALIDATE_TILES_DESC
 {
-    uint32_t attachmentMask;
-    SWR_RECT rect;
+    uint32_t       attachmentMask;
+    SWR_RECT       rect;
     SWR_TILE_STATE newTileState;
-    bool createNewTiles;
-    bool fullTilesOnly;
+    bool           createNewTiles;
+    bool           fullTilesOnly;
 };
 
 struct SYNC_DESC
 {
     PFN_CALLBACK_FUNC pfnCallbackFunc;
-    uint64_t userData;
-    uint64_t userData2;
-    uint64_t userData3;
+    uint64_t          userData;
+    uint64_t          userData2;
+    uint64_t          userData3;
 };
 
 struct STORE_TILES_DESC
 {
-    uint32_t attachmentMask;
+    uint32_t       attachmentMask;
     SWR_TILE_STATE postStoreTileState;
-    SWR_RECT rect;
+    SWR_RECT       rect;
 };
 
 struct COMPUTE_DESC
@@ -140,7 +141,10 @@
     uint32_t threadGroupCountZ;
 };
 
-typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc);
+typedef void (*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC,
+                              uint32_t      workerId,
+                              uint32_t      macroTile,
+                              void*         pDesc);
 
 enum WORK_TYPE
 {
@@ -154,51 +158,55 @@
 
 OSALIGNSIMD(struct) BE_WORK
 {
-    WORK_TYPE type;
+    WORK_TYPE     type;
     PFN_WORK_FUNC pfnWork;
     union
     {
-        SYNC_DESC sync;
-        TRIANGLE_WORK_DESC tri;
-        CLEAR_DESC clear;
+        SYNC_DESC                     sync;
+        TRIANGLE_WORK_DESC            tri;
+        CLEAR_DESC                    clear;
         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
-        STORE_TILES_DESC storeTiles;
+        STORE_TILES_DESC              storeTiles;
     } desc;
 };
 
 struct DRAW_WORK
 {
-    DRAW_CONTEXT*   pDC;
+    DRAW_CONTEXT* pDC;
     union
     {
-        uint32_t   numIndices;      // DrawIndexed: Number of indices for draw.
-        uint32_t   numVerts;        // Draw: Number of verts (triangles, lines, etc)
+        uint32_t numIndices; // DrawIndexed: Number of indices for draw.
+        uint32_t numVerts;   // Draw: Number of verts (triangles, lines, etc)
     };
     union
     {
-        gfxptr_t   xpIB;              // DrawIndexed: App supplied int32 indices 
-        uint32_t   startVertex;    // Draw: Starting vertex in VB to render from.
+        gfxptr_t xpIB;        // DrawIndexed: App supplied int32 indices
+        uint32_t startVertex; // Draw: Starting vertex in VB to render from.
     };
-    int32_t    baseVertex;
-    uint32_t   numInstances;        // Number of instances
-    uint32_t   startInstance;       // Instance offset
-    uint32_t   startPrimID;         // starting primitiveID for this draw batch
-    uint32_t   startVertexID;       // starting VertexID for this draw batch (only needed for non-indexed draws)
-    SWR_FORMAT type;                // index buffer type
+    int32_t  baseVertex;
+    uint32_t numInstances;  // Number of instances
+    uint32_t startInstance; // Instance offset
+    uint32_t startPrimID;   // starting primitiveID for this draw batch
+    uint32_t
+               startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws)
+    SWR_FORMAT type;          // index buffer type
 };
 
-typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc);
+typedef void (*PFN_FE_WORK_FUNC)(SWR_CONTEXT*  pContext,
+                                 DRAW_CONTEXT* pDC,
+                                 uint32_t      workerId,
+                                 void*         pDesc);
 struct FE_WORK
 {
-    WORK_TYPE type;
+    WORK_TYPE        type;
     PFN_FE_WORK_FUNC pfnWork;
     union
     {
-        SYNC_DESC sync;
-        DRAW_WORK draw;
-        CLEAR_DESC clear;
+        SYNC_DESC                     sync;
+        DRAW_WORK                     draw;
+        CLEAR_DESC                    clear;
         DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
-        STORE_TILES_DESC storeTiles;
+        STORE_TILES_DESC              storeTiles;
     } desc;
 };
 
@@ -213,13 +221,25 @@
 struct PA_STATE;
 
 // function signature for pipeline stages that execute after primitive assembly
-typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], 
-    uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
+typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT*      pDC,
+                                  PA_STATE&          pa,
+                                  uint32_t           workerId,
+                                  simdvector         prims[],
+                                  uint32_t           primMask,
+                                  simdscalari const& primID,
+                                  simdscalari const& viewportIdx,
+                                  simdscalari const& rtIdx);
 
 #if ENABLE_AVX512_SIMD16
 // function signature for pipeline stages that execute after primitive assembly
-typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
-    uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
+typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT*        pDC,
+                                                 PA_STATE&            pa,
+                                                 uint32_t             workerId,
+                                                 simd16vector         prims[],
+                                                 uint32_t             primMask,
+                                                 simd16scalari const& primID,
+                                                 simd16scalari const& viewportIdx,
+                                                 simd16scalari const& rtIdx);
 
 #endif
 OSALIGNLINE(struct) API_STATE
@@ -228,86 +248,85 @@
     SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS];
 
     // GS - Geometry Shader State
-    SWR_GS_STATE            gsState;
-    PFN_GS_FUNC             pfnGsFunc;
+    SWR_GS_STATE gsState;
+    PFN_GS_FUNC  pfnGsFunc;
 
     // FS - Fetch Shader State
-    PFN_FETCH_FUNC          pfnFetchFunc;
+    PFN_FETCH_FUNC pfnFetchFunc;
 
     // VS - Vertex Shader State
-    PFN_VERTEX_FUNC         pfnVertexFunc;
+    PFN_VERTEX_FUNC pfnVertexFunc;
 
     // Index Buffer
-    SWR_INDEX_BUFFER_STATE  indexBuffer;
+    SWR_INDEX_BUFFER_STATE indexBuffer;
 
     // CS - Compute Shader
-    PFN_CS_FUNC             pfnCsFunc;
-    uint32_t                totalThreadsInGroup;
-    uint32_t                totalSpillFillSize;
-    uint32_t                scratchSpaceSize;
-    uint32_t                scratchSpaceNumInstances;
+    PFN_CS_FUNC pfnCsFunc;
+    uint32_t    totalThreadsInGroup;
+    uint32_t    totalSpillFillSize;
+    uint32_t    scratchSpaceSize;
+    uint32_t    scratchSpaceNumInstances;
 
     // FE - Frontend State
-    SWR_FRONTEND_STATE      frontendState;
+    SWR_FRONTEND_STATE frontendState;
 
     // SOS - Streamout Shader State
-    PFN_SO_FUNC             pfnSoFunc[MAX_SO_STREAMS];
+    PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS];
 
     // Streamout state
-    SWR_STREAMOUT_STATE     soState;
+    SWR_STREAMOUT_STATE          soState;
     mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS];
 
     // Tessellation State
-    PFN_HS_FUNC             pfnHsFunc;
-    PFN_DS_FUNC             pfnDsFunc;
-    SWR_TS_STATE            tsState;
+    PFN_HS_FUNC  pfnHsFunc;
+    PFN_DS_FUNC  pfnDsFunc;
+    SWR_TS_STATE tsState;
 
     // Number of attributes used by the frontend (vs, so, gs)
-    uint32_t                feNumAttributes;
-
+    uint32_t feNumAttributes;
 
     // RS - Rasterizer State
-    SWR_RASTSTATE           rastState;
+    SWR_RASTSTATE rastState;
     // floating point multisample offsets
     float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
 
-    GUARDBANDS               gbState;
+    GUARDBANDS gbState;
 
-    SWR_VIEWPORT            vp[KNOB_NUM_VIEWPORTS_SCISSORS];
-    SWR_VIEWPORT_MATRICES   vpMatrices;
+    SWR_VIEWPORT          vp[KNOB_NUM_VIEWPORTS_SCISSORS];
+    SWR_VIEWPORT_MATRICES vpMatrices;
 
-    SWR_RECT                scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
-    SWR_RECT                scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
-    bool                    scissorsTileAligned;
+    SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
+    SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
+    bool     scissorsTileAligned;
 
-    bool                    forceFront;
-    PRIMITIVE_TOPOLOGY      topology;
+    bool               forceFront;
+    PRIMITIVE_TOPOLOGY topology;
 
 
     // Backend state
     OSALIGNLINE(SWR_BACKEND_STATE) backendState;
 
-    SWR_DEPTH_BOUNDS_STATE  depthBoundsState;
+    SWR_DEPTH_BOUNDS_STATE depthBoundsState;
 
     // PS - Pixel shader state
-    SWR_PS_STATE            psState;
+    SWR_PS_STATE psState;
 
     SWR_DEPTH_STENCIL_STATE depthStencilState;
 
     // OM - Output Merger State
-    SWR_BLEND_STATE         blendState;
-    PFN_BLEND_JIT_FUNC      pfnBlendFunc[SWR_NUM_RENDERTARGETS];
+    SWR_BLEND_STATE    blendState;
+    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
 
     struct
     {
-        uint32_t enableStatsFE : 1;             // Enable frontend pipeline stats
-        uint32_t enableStatsBE : 1;             // Enable backend pipeline stats
-        uint32_t colorHottileEnable : 8;        // Bitmask of enabled color hottiles
-        uint32_t depthHottileEnable: 1;         // Enable depth buffer hottile
-        uint32_t stencilHottileEnable : 1;      // Enable stencil buffer hottile
+        uint32_t enableStatsFE : 1;        // Enable frontend pipeline stats
+        uint32_t enableStatsBE : 1;        // Enable backend pipeline stats
+        uint32_t colorHottileEnable : 8;   // Bitmask of enabled color hottiles
+        uint32_t depthHottileEnable : 1;   // Enable depth buffer hottile
+        uint32_t stencilHottileEnable : 1; // Enable stencil buffer hottile
     };
 
-    PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
+    PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
 };
 
 class MacroTileMgr;
@@ -343,13 +362,23 @@
 };
 
 // pipeline function pointer types
-typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
-typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
-                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar const &);
-typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
-typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
-typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
-                                              simdscalar const &, simdscalar const &);
+typedef void (*PFN_BACKEND_FUNC)(
+    DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
+typedef void (*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT&,
+                                  uint8_t* (&)[SWR_NUM_RENDERTARGETS],
+                                  uint32_t,
+                                  const SWR_BLEND_STATE*,
+                                  const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS],
+                                  simdscalar&,
+                                  simdscalar const&);
+typedef void (*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
+typedef void (*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
+typedef void (*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&,
+                                               SWR_PS_CONTEXT&,
+                                               const uint64_t* const,
+                                               const uint32_t,
+                                               simdscalar const&,
+                                               simdscalar const&);
 
 struct BACKEND_FUNCS
 {
@@ -361,16 +390,16 @@
 {
     API_STATE state;
 
-    void* pPrivateState;  // Its required the driver sets this up for each draw.
+    void* pPrivateState; // It's required that the driver sets this up for each draw.
 
     // pipeline function pointers, filled in by API thread when setting up the draw
-    BACKEND_FUNCS backendFuncs;
+    BACKEND_FUNCS     backendFuncs;
     PFN_PROCESS_PRIMS pfnProcessPrims;
 #if USE_SIMD16_FRONTEND
     PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
 #endif
 
-    CachingArena* pArena;     // This should only be used by API thread.
+    CachingArena* pArena; // This should only be used by API thread.
 };
 
 struct DRAW_DYNAMIC_STATE
@@ -386,7 +415,7 @@
     uint32_t SoWriteOffset[4];
     bool     SoWriteOffsetDirty[4];
 
-    SWR_STATS_FE statsFE;   // Only one FE thread per DC.
+    SWR_STATS_FE statsFE; // Only one FE thread per DC.
     SWR_STATS*   pStats;
 };
 
@@ -395,30 +424,30 @@
 //    This draw context maintains all of the state needed for the draw operation.
 struct DRAW_CONTEXT
 {
-    SWR_CONTEXT*    pContext;
+    SWR_CONTEXT* pContext;
     union
     {
-        MacroTileMgr*   pTileMgr;
-        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+        MacroTileMgr*  pTileMgr;
+        DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
     };
-    DRAW_STATE*     pState;             // Read-only state. Core should not update this outside of API thread.
-    CachingArena*   pArena;
+    DRAW_STATE*   pState; // Read-only state. Core should not update this outside of API thread.
+    CachingArena* pArena;
 
-    uint32_t        drawId;
-    bool            dependentFE;    // Frontend work is dependent on all previous FE
-    bool            dependent;      // Backend work is dependent on all previous BE
-    bool            isCompute;      // Is this DC a compute context?
-    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
+    uint32_t drawId;
+    bool     dependentFE;  // Frontend work is dependent on all previous FE
+    bool     dependent;    // Backend work is dependent on all previous BE
+    bool     isCompute;    // Is this DC a compute context?
+    bool     cleanupState; // True if this is the last draw using an entry in the state ring.
 
-    FE_WORK         FeWork;
+    FE_WORK FeWork;
 
-    SYNC_DESC       retireCallback; // Call this func when this DC is retired.
+    SYNC_DESC retireCallback; // Call this func when this DC is retired.
 
     DRAW_DYNAMIC_STATE dynState;
 
-    volatile OSALIGNLINE(bool)       doneFE;         // Is FE work done for this draw?
-    volatile OSALIGNLINE(uint32_t)   FeLock;
-    volatile OSALIGNLINE(uint32_t)   threadsDone;
+    volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
+    volatile OSALIGNLINE(uint32_t) FeLock;
+    volatile OSALIGNLINE(uint32_t) threadsDone;
 };
 
 static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
@@ -444,14 +473,14 @@
 struct SWR_CONTEXT
 {
     // Draw Context Ring
-    //  Each draw needs its own state in order to support mulitple draws in flight across multiple threads.
-    //  We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
-    //  of draws that can be in flight at any given time.
+    //  Each draw needs its own state in order to support multiple draws in flight across multiple
+    //  threads. We maintain N draw contexts configured as a ring. The size of the ring limits the
+    //  maximum number of draws that can be in flight at any given time.
     //
     //  Description:
     //  1. State - When an application first sets state we'll request a new draw context to use.
-    //     a. If there are no available draw contexts then we'll have to wait until one becomes free.
-    //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
+    //     a. If no draw contexts are available then we'll have to wait until one becomes free.
+    //     b. If one is available then set pCurDrawContext to point to it and mark it in use.
     //     c. All state calls set state on pCurDrawContext.
     //  2. Draw - Creates submits a work item that is associated with current draw context.
     //     a. Set pPrevDrawContext = pCurDrawContext
@@ -461,10 +490,11 @@
     //     b. State is copied from prev draw context to current.
     RingBuffer<DRAW_CONTEXT> dcRing;
 
-    DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
-    DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
+    DRAW_CONTEXT* pCurDrawContext;  // This points to DC entry in ring for an unsubmitted draw.
+    DRAW_CONTEXT* pPrevDrawContext; // This points to DC entry for the previous context submitted
+                                    // that we can copy state from.
 
-    MacroTileMgr* pMacroTileManagerArray;
+    MacroTileMgr*  pMacroTileManagerArray;
     DispatchQueue* pDispatchQueueArray;
 
     // Draw State Ring
@@ -474,33 +504,33 @@
     //  to reference a single entry in the DS ring.
     RingBuffer<DRAW_STATE> dsRing;
 
-    uint32_t curStateId;               // Current index to the next available entry in the DS ring.
+    uint32_t curStateId; // Current index to the next available entry in the DS ring.
 
     uint32_t NumWorkerThreads;
     uint32_t NumFEThreads;
     uint32_t NumBEThreads;
 
-    THREAD_POOL threadPool; // Thread pool associated with this context
-    SWR_THREADING_INFO threadInfo;
-    SWR_API_THREADING_INFO apiThreadInfo;
+    THREAD_POOL              threadPool; // Thread pool associated with this context
+    SWR_THREADING_INFO       threadInfo;
+    SWR_API_THREADING_INFO   apiThreadInfo;
     SWR_WORKER_PRIVATE_STATE workerPrivateState;
 
     uint32_t MAX_DRAWS_IN_FLIGHT;
 
     std::condition_variable FifosNotEmpty;
-    std::mutex WaitLock;
+    std::mutex              WaitLock;
 
     uint32_t privateStateSize;
 
-    HotTileMgr *pHotTileMgr;
+    HotTileMgr* pHotTileMgr;
 
     // Callback functions, passed in at create context time
-    PFN_LOAD_TILE               pfnLoadTile;
-    PFN_STORE_TILE              pfnStoreTile;
-    PFN_CLEAR_TILE              pfnClearTile;
-    PFN_UPDATE_SO_WRITE_OFFSET  pfnUpdateSoWriteOffset;
-    PFN_UPDATE_STATS            pfnUpdateStats;
-    PFN_UPDATE_STATS_FE         pfnUpdateStatsFE;
+    PFN_LOAD_TILE              pfnLoadTile;
+    PFN_STORE_TILE             pfnStoreTile;
+    PFN_CLEAR_TILE             pfnClearTile;
+    PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
+    PFN_UPDATE_STATS           pfnUpdateStats;
+    PFN_UPDATE_STATS_FE        pfnUpdateStatsFE;
 
 
     // Global Stats
@@ -509,40 +539,48 @@
     // Scratch space for workers.
     uint8_t** ppScratch;
 
-    volatile OSALIGNLINE(uint32_t)  drawsOutstandingFE;
+    volatile OSALIGNLINE(uint32_t) drawsOutstandingFE;
 
     OSALIGNLINE(CachingAllocator) cachingArenaAllocator;
     uint32_t frameCount;
 
     uint32_t lastFrameChecked;
     uint64_t lastDrawChecked;
-    TileSet singleThreadLockedTiles;
+    TileSet* pSingleThreadLockedTiles;
 
     // ArchRast thread contexts.
     HANDLE* pArContext;
 };
 
-#define UPDATE_STAT_BE(name, count) if (GetApiState(pDC).enableStatsBE) { pDC->dynState.pStats[workerId].name += count; }
-#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStatsFE) { pDC->dynState.statsFE.name += count; }
+#define UPDATE_STAT_BE(name, count)                   \
+    if (GetApiState(pDC).enableStatsBE)               \
+    {                                                 \
+        pDC->dynState.pStats[workerId].name += count; \
+    }
+#define UPDATE_STAT_FE(name, count)          \
+    if (GetApiState(pDC).enableStatsFE)      \
+    {                                        \
+        pDC->dynState.statsFE.name += count; \
+    }
 
 // ArchRast instrumentation framework
-#define AR_WORKER_CTX  pDC->pContext->pArContext[workerId]
-#define AR_API_CTX     pDC->pContext->pArContext[pContext->NumWorkerThreads]
+#define AR_WORKER_CTX pDC->pContext->pArContext[workerId]
+#define AR_API_CTX pDC->pContext->pArContext[pContext->NumWorkerThreads]
 
 #ifdef KNOB_ENABLE_RDTSC
 #define RDTSC_BEGIN(type, drawid) RDTSC_START(type)
-#define RDTSC_END(type, count)   RDTSC_STOP(type, count, 0)
+#define RDTSC_END(type, count) RDTSC_STOP(type, count, 0)
 #else
 #define RDTSC_BEGIN(type, count)
 #define RDTSC_END(type, count)
 #endif
 
 #ifdef KNOB_ENABLE_AR
-    #define _AR_EVENT(ctx, event)       ArchRast::Dispatch(ctx, ArchRast::event)
-    #define _AR_FLUSH(ctx, id)          ArchRast::FlushDraw(ctx, id)
+#define _AR_EVENT(ctx, event) ArchRast::Dispatch(ctx, ArchRast::event)
+#define _AR_FLUSH(ctx, id) ArchRast::FlushDraw(ctx, id)
 #else
-    #define _AR_EVENT(ctx, event)
-    #define _AR_FLUSH(ctx, id)
+#define _AR_EVENT(ctx, event)
+#define _AR_FLUSH(ctx, id)
 #endif
 
 // Use these macros for api thread.
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index fafc36d..54a3489 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -1,36 +1,39 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file depthstencil.h
-*
-* @brief Implements depth/stencil functionality
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file depthstencil.h
+ *
+ * @brief Implements depth/stencil functionality
+ *
+ ******************************************************************************/
 #pragma once
 #include "common/os.h"
 #include "format_conversion.h"
 
 INLINE
-void StencilOp(SWR_STENCILOP op, simdscalar const &mask, simdscalar const &stencilRefps, simdscalar &stencilps)
+void StencilOp(SWR_STENCILOP     op,
+               simdscalar const& mask,
+               simdscalar const& stencilRefps,
+               simdscalar&       stencilps)
 {
     simdscalari stencil = _simd_castps_si(stencilps);
 
@@ -47,30 +50,31 @@
     case STENCILOP_INCRSAT:
     {
         simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1));
-        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
+        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
         break;
     }
     case STENCILOP_DECRSAT:
     {
         simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1));
-        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
+        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
         break;
     }
     case STENCILOP_INCR:
     {
         simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1));
-        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
+        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask);
         break;
     }
     case STENCILOP_DECR:
     {
         simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff));
-        stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
+        stencilps               = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask);
         break;
     }
     case STENCILOP_INVERT:
     {
-        simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
+        simdscalar stencilinvert =
+            _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps()));
         stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask);
         break;
     }
@@ -79,12 +83,11 @@
     }
 }
 
-
-template<SWR_FORMAT depthFormatT>
-simdscalar QuantizeDepth(simdscalar const &depth)
+template <SWR_FORMAT depthFormatT>
+simdscalar QuantizeDepth(simdscalar const& depth)
 {
     SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
-    uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
+    uint32_t depthBpc  = FormatTraits<depthFormatT>::GetBPC(0);
 
     if (depthType == SWR_TYPE_FLOAT)
     {
@@ -98,11 +101,11 @@
     // should be unorm depth if not float
     SWR_ASSERT(depthType == SWR_TYPE_UNORM);
 
-    float quantize = (float)((1 << depthBpc) - 1);
-    simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
-    result = _simd_add_ps(result, _simd_set1_ps(0.5f));
-    result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
-    
+    float      quantize = (float)((1 << depthBpc) - 1);
+    simdscalar result   = _simd_mul_ps(depth, _simd_set1_ps(quantize));
+    result              = _simd_add_ps(result, _simd_set1_ps(0.5f));
+    result              = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
+
     if (depthBpc > 16)
     {
         result = _simd_div_ps(result, _simd_set1_ps(quantize));
@@ -116,42 +119,62 @@
 }
 
 INLINE
-simdscalar DepthStencilTest(const API_STATE* pState,
-                 bool frontFacing, uint32_t viewportIndex, simdscalar const &iZ, uint8_t* pDepthBase, simdscalar const &coverageMask,
-                 uint8_t *pStencilBase, simdscalar* pStencilMask)
+simdscalar DepthStencilTest(const API_STATE*  pState,
+                            bool              frontFacing,
+                            uint32_t          viewportIndex,
+                            simdscalar const& iZ,
+                            uint8_t*          pDepthBase,
+                            simdscalar const& coverageMask,
+                            uint8_t*          pStencilBase,
+                            simdscalar*       pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
     static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
 
-    const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
-    const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];
+    const SWR_DEPTH_STENCIL_STATE* pDSState  = &pState->depthStencilState;
+    const SWR_VIEWPORT*            pViewport = &pState->vp[viewportIndex];
 
     simdscalar depthResult = _simd_set1_ps(-1.0f);
     simdscalar zbuf;
 
     // clamp Z to viewport [minZ..maxZ]
-    simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
-    simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
+    simdscalar vMinZ   = _simd_broadcast_ss(&pViewport->minZ);
+    simdscalar vMaxZ   = _simd_broadcast_ss(&pViewport->maxZ);
     simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
-    
+
     if (pDSState->depthTestEnable)
     {
         switch (pDSState->depthTestFunc)
         {
-        case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break;
-        case ZFUNC_ALWAYS: break;
+        case ZFUNC_NEVER:
+            depthResult = _simd_setzero_ps();
+            break;
+        case ZFUNC_ALWAYS:
+            break;
         default:
             zbuf = _simd_load_ps((const float*)pDepthBase);
         }
 
         switch (pDSState->depthTestFunc)
         {
-        case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break;
-        case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break;
-        case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break;
-        case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break;
-        case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break;
-        case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break;
+        case ZFUNC_LE:
+            depthResult = _simd_cmple_ps(interpZ, zbuf);
+            break;
+        case ZFUNC_LT:
+            depthResult = _simd_cmplt_ps(interpZ, zbuf);
+            break;
+        case ZFUNC_GT:
+            depthResult = _simd_cmpgt_ps(interpZ, zbuf);
+            break;
+        case ZFUNC_GE:
+            depthResult = _simd_cmpge_ps(interpZ, zbuf);
+            break;
+        case ZFUNC_EQ:
+            depthResult = _simd_cmpeq_ps(interpZ, zbuf);
+            break;
+        case ZFUNC_NE:
+            depthResult = _simd_cmpneq_ps(interpZ, zbuf);
+            break;
         }
     }
 
@@ -159,9 +182,9 @@
 
     if (pDSState->stencilTestEnable)
     {
-        uint8_t stencilRefValue;
+        uint8_t  stencilRefValue;
         uint32_t stencilTestFunc;
-        uint8_t stencilTestMask;
+        uint8_t  stencilTestMask;
         if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
         {
             stencilRefValue = pDSState->stencilRefValue;
@@ -178,15 +201,19 @@
         simdvector sbuf;
         simdscalar stencilWithMask;
         simdscalar stencilRef;
-        switch(stencilTestFunc)
+        switch (stencilTestFunc)
         {
-        case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break;
-        case ZFUNC_ALWAYS: break;
+        case ZFUNC_NEVER:
+            stencilMask = _simd_setzero_ps();
+            break;
+        case ZFUNC_ALWAYS:
+            break;
         default:
             LoadSOA<R8_UINT>(pStencilBase, sbuf);
-            
+
             // apply stencil read mask
-            stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
+            stencilWithMask = _simd_castsi_ps(
+                _simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask)));
 
             // do stencil compare in float to avoid simd integer emulation in AVX1
             stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask));
@@ -195,34 +222,52 @@
             break;
         }
 
-        switch(stencilTestFunc)
+        switch (stencilTestFunc)
         {
-        case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break;
-        case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break;
-        case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break;
-        case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break;
-        case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break;
-        case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break;
+        case ZFUNC_LE:
+            stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask);
+            break;
+        case ZFUNC_LT:
+            stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask);
+            break;
+        case ZFUNC_GT:
+            stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask);
+            break;
+        case ZFUNC_GE:
+            stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask);
+            break;
+        case ZFUNC_EQ:
+            stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask);
+            break;
+        case ZFUNC_NE:
+            stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask);
+            break;
         }
     }
 
     simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask);
-    depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask);
+    depthWriteMask            = _simd_and_ps(depthWriteMask, coverageMask);
 
     *pStencilMask = stencilMask;
     return depthWriteMask;
 }
 
 INLINE
-void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-        bool frontFacing, simdscalar const &iZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
-        uint8_t *pStencilBase, const simdscalar& stencilMask)
+void DepthStencilWrite(const SWR_VIEWPORT*            pViewport,
+                       const SWR_DEPTH_STENCIL_STATE* pDSState,
+                       bool                           frontFacing,
+                       simdscalar const&              iZ,
+                       uint8_t*                       pDepthBase,
+                       const simdscalar&              depthMask,
+                       const simdscalar&              coverageMask,
+                       uint8_t*                       pStencilBase,
+                       const simdscalar&              stencilMask)
 {
     if (pDSState->depthWriteEnable)
     {
         // clamp Z to viewport [minZ..maxZ]
-        simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ);
-        simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ);
+        simdscalar vMinZ   = _simd_broadcast_ss(&pViewport->minZ);
+        simdscalar vMaxZ   = _simd_broadcast_ss(&pViewport->maxZ);
         simdscalar interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, iZ));
 
         simdscalar vMask = _simd_and_ps(depthMask, coverageMask);
@@ -235,49 +280,56 @@
         LoadSOA<R8_UINT>(pStencilBase, sbuf);
         simdscalar stencilbuf = sbuf.v[0];
 
-        uint8_t stencilRefValue;
+        uint8_t  stencilRefValue;
         uint32_t stencilFailOp;
         uint32_t stencilPassDepthPassOp;
         uint32_t stencilPassDepthFailOp;
-        uint8_t stencilWriteMask;
+        uint8_t  stencilWriteMask;
         if (frontFacing || !pDSState->doubleSidedStencilTestEnable)
         {
-            stencilRefValue = pDSState->stencilRefValue;
-            stencilFailOp = pDSState->stencilFailOp;
+            stencilRefValue        = pDSState->stencilRefValue;
+            stencilFailOp          = pDSState->stencilFailOp;
             stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp;
             stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp;
-            stencilWriteMask = pDSState->stencilWriteMask;
+            stencilWriteMask       = pDSState->stencilWriteMask;
         }
         else
         {
-            stencilRefValue = pDSState->backfaceStencilRefValue;
-            stencilFailOp = pDSState->backfaceStencilFailOp;
+            stencilRefValue        = pDSState->backfaceStencilRefValue;
+            stencilFailOp          = pDSState->backfaceStencilFailOp;
             stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp;
             stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp;
-            stencilWriteMask = pDSState->backfaceStencilWriteMask;
+            stencilWriteMask       = pDSState->backfaceStencilWriteMask;
         }
 
-        simdscalar stencilps = stencilbuf;
+        simdscalar stencilps    = stencilbuf;
         simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue));
 
-        simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask);
+        simdscalar stencilFailMask          = _simd_andnot_ps(stencilMask, coverageMask);
         simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask);
-        simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
+        simdscalar stencilPassDepthFailMask =
+            _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1)));
 
         simdscalar origStencil = stencilps;
 
         StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps);
-        StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps);
-        StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps);
+        StencilOp((SWR_STENCILOP)stencilPassDepthFailOp,
+                  stencilPassDepthFailMask,
+                  stencilRefps,
+                  stencilps);
+        StencilOp((SWR_STENCILOP)stencilPassDepthPassOp,
+                  stencilPassDepthPassMask,
+                  stencilRefps,
+                  stencilps);
 
         // apply stencil write mask
         simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask);
-        stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
-        stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
+        stencilps              = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask));
+        stencilps =
+            _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps);
 
         simdvector stencilResult;
         stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask);
         StoreSOA<R8_UINT>(stencilResult, pStencilBase);
     }
-
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 43d3a83..9a9cc26 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -1,53 +1,52 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file fifo.hpp
-*
-* @brief Definitions for our fifos used for thread communication.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file fifo.hpp
+ *
+ * @brief Definitions for our fifos used for thread communication.
+ *
+ ******************************************************************************/
 #pragma once
 
-
 #include "common/os.h"
 #include "arena.h"
 
 #include <vector>
 #include <cassert>
 
-template<class T>
+template <class T>
 struct QUEUE
 {
-    OSALIGNLINE(volatile uint32_t) mLock{ 0 };
-    OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 };
+    OSALIGNLINE(volatile uint32_t) mLock{0};
+    OSALIGNLINE(volatile uint32_t) mNumEntries{0};
     std::vector<T*> mBlocks;
-    T* mCurBlock{ nullptr };
-    uint32_t mHead{ 0 };
-    uint32_t mTail{ 0 };
-    uint32_t mCurBlockIdx{ 0 };
+    T*              mCurBlock{nullptr};
+    uint32_t        mHead{0};
+    uint32_t        mTail{0};
+    uint32_t        mCurBlockIdx{0};
 
     // power of 2
     static const uint32_t mBlockSizeShift = 6;
-    static const uint32_t mBlockSize = 1 << mBlockSizeShift;
+    static const uint32_t mBlockSize      = 1 << mBlockSizeShift;
 
     template <typename ArenaT>
     void clear(ArenaT& arena)
@@ -55,18 +54,15 @@
         mHead = 0;
         mTail = 0;
         mBlocks.clear();
-        T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
+        T* pNewBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
         mBlocks.push_back(pNewBlock);
-        mCurBlock = pNewBlock;
+        mCurBlock    = pNewBlock;
         mCurBlockIdx = 0;
-        mNumEntries = 0;
-        mLock = 0;
+        mNumEntries  = 0;
+        mLock        = 0;
     }
 
-    uint32_t getNumQueued()
-    {
-        return mNumEntries;
-    }
+    uint32_t getNumQueued() { return mNumEntries; }
 
     bool tryLock()
     {
@@ -80,10 +76,7 @@
         return (initial == 0);
     }
 
-    void unlock()
-    {
-        mLock = 0;
-    }
+    void unlock() { mLock = 0; }
 
     T* peek()
     {
@@ -92,34 +85,33 @@
             return nullptr;
         }
         uint32_t block = mHead >> mBlockSizeShift;
-        return &mBlocks[block][mHead & (mBlockSize-1)];
+        return &mBlocks[block][mHead & (mBlockSize - 1)];
     }
 
     void dequeue_noinc()
     {
-        mHead ++;
-        mNumEntries --;
+        mHead++;
+        mNumEntries--;
     }
 
     template <typename ArenaT>
     bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
         const float* pSrc = (const float*)entry;
-        float* pDst = (float*)&mCurBlock[mTail];
+        float*       pDst = (float*)&mCurBlock[mTail];
 
-        auto lambda = [&](int32_t i)
-        {
-            __m256 vSrc = _mm256_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
-            _mm256_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+        auto lambda = [&](int32_t i) {
+            __m256 vSrc = _mm256_load_ps(pSrc + i * KNOB_SIMD_WIDTH);
+            _mm256_stream_ps(pDst + i * KNOB_SIMD_WIDTH, vSrc);
         };
 
-        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
+        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH * 4);
         static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
-            "FIFO element size should be multiple of SIMD width.");
+                      "FIFO element size should be multiple of SIMD width.");
 
         UnrollerL<0, numSimdLines, 1>::step(lambda);
 
-        mTail ++;
+        mTail++;
         if (mTail == mBlockSize)
         {
             if (++mCurBlockIdx < mBlocks.size())
@@ -128,7 +120,7 @@
             }
             else
             {
-                T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
+                T* newBlock = (T*)arena.AllocAligned(sizeof(T) * mBlockSize, KNOB_SIMD_WIDTH * 4);
                 SWR_ASSERT(newBlock);
 
                 mBlocks.push_back(newBlock);
@@ -138,12 +130,9 @@
             mTail = 0;
         }
 
-        mNumEntries ++;
+        mNumEntries++;
         return true;
     }
 
-    void destroy()
-    {
-    }
-
+    void destroy() {}
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 72843f5..90bf118 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file format_conversion.h
-*
-* @brief API implementation
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file format_conversion.h
+ *
+ * @brief API implementation
+ *
+ ******************************************************************************/
 #include "format_types.h"
 #include "format_traits.h"
 
@@ -33,15 +33,15 @@
 ///        SOA RGBA32_FLOAT format.
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
-template<SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
+template <SWR_FORMAT SrcFormat>
+INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
 {
     // fast path for float32
-    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
+    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
+        (FormatTraits<SrcFormat>::GetBPC(0) == 32))
     {
-        auto lambda = [&](int comp)
-        {
-            simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar)));
+        auto lambda = [&](int comp) {
+            simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar)));
 
             dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
         };
@@ -50,8 +50,7 @@
         return;
     }
 
-    auto lambda = [&](int comp)
-    {
+    auto lambda = [&](int comp) {
         // load SIMD components
         simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
 
@@ -74,12 +73,12 @@
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Clamps the given component based on the requirements on the 
+/// @brief Clamps the given component based on the requirements on the
 ///        Format template arg
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
-template<SWR_FORMAT Format>
-INLINE simdscalar Clamp(simdscalar const &vC, uint32_t Component)
+template <SWR_FORMAT Format>
+INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component)
 {
     simdscalar vComp = vC;
     if (FormatTraits<Format>::isNormalized(Component))
@@ -99,21 +98,21 @@
     {
         if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
         {
-            int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
-            int iMin = 0;
+            int         iMax   = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
+            int         iMin   = 0;
             simdscalari vCompi = _simd_castps_si(vComp);
-            vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
-            vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
-            vComp = _simd_castsi_ps(vCompi);
+            vCompi             = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
+            vCompi             = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
+            vComp              = _simd_castsi_ps(vCompi);
         }
         else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
         {
-            int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
-            int iMin = -1 - iMax;
+            int         iMax   = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
+            int         iMin   = -1 - iMax;
             simdscalari vCompi = _simd_castps_si(vComp);
-            vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
-            vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
-            vComp = _simd_castsi_ps(vCompi);
+            vCompi             = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
+            vCompi             = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
+            vComp              = _simd_castsi_ps(vCompi);
         }
     }
 
@@ -125,8 +124,8 @@
 ///        Format template arg
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
-template<SWR_FORMAT Format>
-INLINE simdscalar Normalize(simdscalar const &vC, uint32_t Component)
+template <SWR_FORMAT Format>
+INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component)
 {
     simdscalar vComp = vC;
     if (FormatTraits<Format>::isNormalized(Component))
@@ -142,11 +141,12 @@
 ///        RGBA32_FLOAT to SOA format
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
-template<SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
+template <SWR_FORMAT DstFormat>
+INLINE void StoreSOA(const simdvector& src, uint8_t* pDst)
 {
     // fast path for float32
-    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
+    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
+        (FormatTraits<DstFormat>::GetBPC(0) == 32))
     {
         for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {
@@ -155,25 +155,24 @@
             // Gamma-correct
             if (FormatTraits<DstFormat>::isSRGB)
             {
-                if (comp < 3)  // Input format is always RGBA32_FLOAT.
+                if (comp < 3) // Input format is always RGBA32_FLOAT.
                 {
                     vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
                 }
             }
 
-            _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp);
+            _simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp);
         }
         return;
     }
 
-    auto lambda = [&](int comp)
-    {
+    auto lambda = [&](int comp) {
         simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
 
         // Gamma-correct
         if (FormatTraits<DstFormat>::isSRGB)
         {
-            if (comp < 3)  // Input format is always RGBA32_FLOAT.
+            if (comp < 3) // Input format is always RGBA32_FLOAT.
             {
                 vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
             }
@@ -203,15 +202,16 @@
 ///        SOA RGBA32_FLOAT format.
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
-template<SWR_FORMAT SrcFormat>
-INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
+template <SWR_FORMAT SrcFormat>
+INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
 {
     // fast path for float32
-    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
+    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
+        (FormatTraits<SrcFormat>::GetBPC(0) == 32))
     {
-        auto lambda = [&](int comp)
-        {
-            simd16scalar vComp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc + comp * sizeof(simd16scalar)));
+        auto lambda = [&](int comp) {
+            simd16scalar vComp =
+                _simd16_load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(simd16scalar)));
 
             dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
         };
@@ -220,8 +220,7 @@
         return;
     }
 
-    auto lambda = [&](int comp)
-    {
+    auto lambda = [&](int comp) {
         // load SIMD components
         simd16scalar vComp = FormatTraits<SrcFormat>::loadSOA_16(comp, pSrc);
 
@@ -244,12 +243,12 @@
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Clamps the given component based on the requirements on the 
+/// @brief Clamps the given component based on the requirements on the
 ///        Format template arg
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
-template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Clamp(simd16scalar const &v, uint32_t Component)
+template <SWR_FORMAT Format>
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
 {
     simd16scalar vComp = v;
     if (FormatTraits<Format>::isNormalized(Component))
@@ -269,21 +268,21 @@
     {
         if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
         {
-            int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
-            int iMin = 0;
+            int           iMax   = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
+            int           iMin   = 0;
             simd16scalari vCompi = _simd16_castps_si(vComp);
-            vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin));
-            vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax));
-            vComp = _simd16_castsi_ps(vCompi);
+            vCompi               = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin));
+            vCompi               = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax));
+            vComp                = _simd16_castsi_ps(vCompi);
         }
         else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
         {
-            int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
-            int iMin = -1 - iMax;
+            int           iMax   = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
+            int           iMin   = -1 - iMax;
             simd16scalari vCompi = _simd16_castps_si(vComp);
-            vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin));
-            vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax));
-            vComp = _simd16_castsi_ps(vCompi);
+            vCompi               = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin));
+            vCompi               = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax));
+            vComp                = _simd16_castsi_ps(vCompi);
         }
     }
 
@@ -295,8 +294,8 @@
 ///        Format template arg
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
-template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Normalize(simd16scalar const &vComp, uint32_t Component)
+template <SWR_FORMAT Format>
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
 {
     simd16scalar r = vComp;
     if (FormatTraits<Format>::isNormalized(Component))
@@ -312,11 +311,12 @@
 ///        RGBA32_FLOAT to SOA format
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
-template<SWR_FORMAT DstFormat>
-INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
+template <SWR_FORMAT DstFormat>
+INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
 {
     // fast path for float32
-    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
+    if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
+        (FormatTraits<DstFormat>::GetBPC(0) == 32))
     {
         for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {
@@ -325,25 +325,24 @@
             // Gamma-correct
             if (FormatTraits<DstFormat>::isSRGB)
             {
-                if (comp < 3)  // Input format is always RGBA32_FLOAT.
+                if (comp < 3) // Input format is always RGBA32_FLOAT.
                 {
                     vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
                 }
             }
 
-            _simd16_store_ps(reinterpret_cast<float *>(pDst + comp * sizeof(simd16scalar)), vComp);
+            _simd16_store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
         }
         return;
     }
 
-    auto lambda = [&](int comp)
-    {
+    auto lambda = [&](int comp) {
         simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
 
         // Gamma-correct
         if (FormatTraits<DstFormat>::isSRGB)
         {
-            if (comp < 3)  // Input format is always RGBA32_FLOAT.
+            if (comp < 3) // Input format is always RGBA32_FLOAT.
             {
                 vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
             }
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
index bc585dd..97e7d56 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_traits.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file format_traits.h
-*
-* @brief Format Traits.  auto-generated file
-*
-* DO NOT EDIT
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file format_traits.h
+ *
+ * @brief Format Traits.  auto-generated file
+ *
+ * DO NOT EDIT
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "format_types.h"
@@ -35,13 +35,13 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatSwizzle - Component swizzle selects
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0>
+template <uint32_t comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0>
 struct FormatSwizzle
 {
     // Return swizzle select for component.
     INLINE static uint32_t swizzle(uint32_t c)
     {
-        static const uint32_t s[4] = { comp0, comp1, comp2, comp3 };
+        static const uint32_t s[4] = {comp0, comp1, comp2, comp3};
         return s[c];
     }
 };
@@ -49,41 +49,45 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits - Format traits
 //////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT format>
-struct FormatTraits :
-    ComponentTraits<SWR_TYPE_UNKNOWN, 0>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0>
+template <SWR_FORMAT format>
+struct FormatTraits : ComponentTraits<SWR_TYPE_UNKNOWN, 0>, FormatSwizzle<0>, Defaults<0, 0, 0, 0>
 {
-    static const uint32_t bpp{ 0 };
-    static const uint32_t numComps{ 0 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
+    static const uint32_t bpp{0};
+    static const uint32_t numComps{0};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
 
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_FLOAT,
+                                                          32>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -92,20 +96,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32B32A32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -114,20 +119,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32B32A32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -136,20 +142,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R64G64_FLOAT> - Format traits specialization for R64G64_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R64G64_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R64G64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose64_64  TransposeT;
     typedef Format2<64, 64> FormatT;
@@ -158,20 +164,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32X32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_FLOAT,
+                                                          32,
+                                                          SWR_TYPE_UNUSED,
+                                                          32>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -180,20 +193,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32A32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                            32,
+                                                            SWR_TYPE_SSCALED,
+                                                            32,
+                                                            SWR_TYPE_SSCALED,
+                                                            32,
+                                                            SWR_TYPE_SSCALED,
+                                                            32>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -202,20 +222,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32A32_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                            32,
+                                                            SWR_TYPE_USCALED,
+                                                            32,
+                                                            SWR_TYPE_USCALED,
+                                                            32,
+                                                            SWR_TYPE_USCALED,
+                                                            32>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -224,20 +251,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32A32_SFIXED> - Format traits specialization for R32G32B32A32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32A32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32A32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED,
+                                                           32,
+                                                           SWR_TYPE_SFIXED,
+                                                           32,
+                                                           SWR_TYPE_SFIXED,
+                                                           32,
+                                                           SWR_TYPE_SFIXED,
+                                                           32>,
+                                           FormatSwizzle<0, 1, 2, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32_32    TransposeT;
     typedef Format4<32, 32, 32, 32> FormatT;
@@ -246,20 +280,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -268,20 +303,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32B32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -290,20 +326,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32B32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -312,20 +349,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -334,20 +372,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -356,20 +395,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32B32_SFIXED> - Format traits specialization for R32G32B32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32B32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32B32_SFIXED>
+    : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 96 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{96};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32_32   TransposeT;
     typedef Format3<32, 32, 32> FormatT;
@@ -378,20 +418,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization for R16G16B16A16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -400,20 +447,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16A16_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16,
+                                                          SWR_TYPE_SNORM,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -422,20 +476,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16B16A16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -444,20 +499,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16B16A16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -466,20 +522,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -488,20 +551,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -510,20 +573,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32_SINT> : ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -532,42 +595,44 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_UINT> - Format traits specialization for R32G32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32G32_UINT> : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for
+/// R32_FLOAT_X8X24_TYPELESS
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_FLOAT_X8X24_TYPELESS>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+      FormatSwizzle<0, 1>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -576,20 +641,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<X32_TYPELESS_G8X24_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
+      FormatSwizzle<0, 1>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -598,20 +664,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32A32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L32A32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -620,20 +686,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R64_FLOAT> - Format traits specialization for R64_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R64_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 64>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R64_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 64>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<64> TransposeT;
     typedef Format1<64>                  FormatT;
@@ -642,20 +707,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16X16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNUSED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16X16_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNORM,
+                                                          16,
+                                                          SWR_TYPE_UNUSED,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -664,20 +736,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16X16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_UNUSED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16X16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_FLOAT,
+                                                          16,
+                                                          SWR_TYPE_UNUSED,
+                                                          16>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -686,20 +765,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32X32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -708,20 +787,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I32X32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I32X32_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -730,20 +809,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16A16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16,
+                                                            SWR_TYPE_SSCALED,
+                                                            16>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -752,20 +838,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16A16_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16A16_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16,
+                                                            SWR_TYPE_USCALED,
+                                                            16>,
+                                            FormatSwizzle<0, 1, 2, 3>,
+                                            Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16_16    TransposeT;
     typedef Format4<16, 16, 16, 16> FormatT;
@@ -774,20 +867,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -796,20 +889,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -818,20 +911,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32G32_SFIXED> - Format traits specialization for R32G32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32G32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32G32_SFIXED> : ComponentTraits<SWR_TYPE_SFIXED, 32, SWR_TYPE_SFIXED, 32>,
+                                     FormatSwizzle<0, 1>,
+                                     Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose32_32  TransposeT;
     typedef Format2<32, 32> FormatT;
@@ -840,20 +933,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B8G8R8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -862,20 +956,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B8G8R8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -884,20 +979,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         2>,
+                                         FormatSwizzle<0, 1, 2, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -906,20 +1008,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              2>,
+                                              FormatSwizzle<0, 1, 2, 3>,
+                                              Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -928,20 +1037,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R10G10B10A2_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -950,20 +1060,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -972,20 +1083,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8A8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -994,20 +1106,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_SNORM> - Format traits specialization for R8G8B8A8_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8A8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1016,20 +1129,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8B8A8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1038,20 +1152,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8B8A8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1060,20 +1175,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1082,20 +1197,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1104,20 +1219,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16_SINT> : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1126,20 +1241,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16_UINT> : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+                                   FormatSwizzle<0, 1>,
+                                   Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1148,20 +1263,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1170,20 +1285,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10A2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         2>,
+                                         FormatSwizzle<2, 1, 0, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1192,20 +1314,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10A2_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              10,
+                                                              SWR_TYPE_UNORM,
+                                                              2>,
+                                              FormatSwizzle<2, 1, 0, 3>,
+                                              Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1214,42 +1343,51 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R11G11B10_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R11G11B10_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose11_11_10   TransposeT;
     typedef Format3<11, 11, 10> FormatT;
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for R10G10B10_FLOAT_A2_UNORM
+/// FormatTraits<R10G10B10_FLOAT_A2_UNORM> - Format traits specialization for
+/// R10G10B10_FLOAT_A2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> :
-    ComponentTraits<SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_FLOAT, 10, SWR_TYPE_UNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10_FLOAT_A2_UNORM> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_FLOAT,
+                                                                10,
+                                                                SWR_TYPE_UNORM,
+                                                                2>,
+                                                FormatSwizzle<0, 1, 2, 3>,
+                                                Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1258,20 +1396,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1280,20 +1417,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R32_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1302,20 +1438,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1324,20 +1459,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R24_UNORM_X8_TYPELESS> :
-    ComponentTraits<SWR_TYPE_UNORM, 24>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R24_UNORM_X8_TYPELESS>
+    : ComponentTraits<SWR_TYPE_UNORM, 24>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<24>                  FormatT;
@@ -1346,20 +1480,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<X24_TYPELESS_G8_UINT> - Format traits specialization for X24_TYPELESS_G8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<X24_TYPELESS_G8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 32>,
-    FormatSwizzle<1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<X24_TYPELESS_G8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 32>, FormatSwizzle<1>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1368,20 +1501,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32_UNORM> - Format traits specialization for L32_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L32_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1390,20 +1522,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L16A16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L16A16_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1412,20 +1544,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I24X8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
+                                   FormatSwizzle<0, 3>,
+                                   Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose24_8  TransposeT;
     typedef Format2<24, 8> FormatT;
@@ -1434,20 +1566,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L24X8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L24X8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
+                                   FormatSwizzle<0, 3>,
+                                   Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose24_8  TransposeT;
     typedef Format2<24, 8> FormatT;
@@ -1456,20 +1588,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1478,20 +1609,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1500,20 +1630,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A32_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 32>,
-    FormatSwizzle<3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A32_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 32>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1522,20 +1651,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8X8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B8G8R8X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1544,20 +1674,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B8G8R8X8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B8G8R8X8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1566,20 +1697,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8X8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8X8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1588,20 +1720,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8X8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8X8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1610,20 +1743,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R9G9B9E5_SHAREDEXP> :
-    ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R9G9B9E5_SHAREDEXP>
+    : ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose9_9_9_5    TransposeT;
     typedef Format4<9, 9, 9, 5> FormatT;
@@ -1632,20 +1766,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10X2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNUSED, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10X2_UNORM> : ComponentTraits<SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNORM,
+                                                         10,
+                                                         SWR_TYPE_UNUSED,
+                                                         2>,
+                                         FormatSwizzle<2, 1, 0, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1654,20 +1795,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L16A16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L16A16_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+                                    FormatSwizzle<0, 3>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1676,20 +1817,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10X2_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_UNUSED, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10X2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_UNUSED,
+                                                           2>,
+                                           FormatSwizzle<0, 1, 2, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -1698,20 +1846,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8A8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                        8,
+                                                        SWR_TYPE_SSCALED,
+                                                        8,
+                                                        SWR_TYPE_SSCALED,
+                                                        8,
+                                                        SWR_TYPE_SSCALED,
+                                                        8>,
+                                        FormatSwizzle<0, 1, 2, 3>,
+                                        Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1720,20 +1875,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8A8_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8A8_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                        8,
+                                                        SWR_TYPE_USCALED,
+                                                        8,
+                                                        SWR_TYPE_USCALED,
+                                                        8,
+                                                        SWR_TYPE_USCALED,
+                                                        8>,
+                                        FormatSwizzle<0, 1, 2, 3>,
+                                        Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -1742,20 +1904,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1764,20 +1926,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+                                      FormatSwizzle<0, 1>,
+                                      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16  TransposeT;
     typedef Format2<16, 16> FormatT;
@@ -1786,20 +1948,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1808,20 +1969,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -1830,20 +1990,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G6R5_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
-    FormatSwizzle<2, 1, 0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G6R5_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
+      FormatSwizzle<2, 1, 0>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_6_5   TransposeT;
     typedef Format3<5, 6, 5> FormatT;
@@ -1852,20 +2013,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G6R5_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
-    FormatSwizzle<2, 1, 0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G6R5_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>,
+      FormatSwizzle<2, 1, 0>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_6_5   TransposeT;
     typedef Format3<5, 6, 5> FormatT;
@@ -1874,20 +2036,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G5R5A1_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G5R5A1_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_5_5_1    TransposeT;
     typedef Format4<5, 5, 5, 1> FormatT;
@@ -1896,20 +2059,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G5R5A1_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G5R5A1_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_5_5_1    TransposeT;
     typedef Format4<5, 5, 5, 1> FormatT;
@@ -1918,20 +2082,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B4G4R4A4_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B4G4R4A4_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose4_4_4_4    TransposeT;
     typedef Format4<4, 4, 4, 4> FormatT;
@@ -1940,20 +2105,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for B4G4R4A4_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B4G4R4A4_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B4G4R4A4_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose4_4_4_4    TransposeT;
     typedef Format4<4, 4, 4, 4> FormatT;
@@ -1962,20 +2128,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+                                  FormatSwizzle<0, 1>,
+                                  Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -1984,20 +2150,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8_SNORM> : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+                                  FormatSwizzle<0, 1>,
+                                  Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2006,20 +2172,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+                                 FormatSwizzle<0, 1>,
+                                 Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2028,20 +2194,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+                                 FormatSwizzle<0, 1>,
+                                 Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2050,20 +2216,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_UNORM> - Format traits specialization for R16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2072,20 +2237,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2094,20 +2258,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2116,20 +2279,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2138,20 +2300,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2160,20 +2321,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2182,20 +2342,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2204,20 +2363,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2226,20 +2384,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L8A8_UNORM> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+                                  FormatSwizzle<0, 3>,
+                                  Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2248,20 +2406,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2270,20 +2427,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2292,20 +2448,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2314,20 +2469,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8A8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L8A8_UNORM_SRGB> : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+                                       FormatSwizzle<0, 3>,
+                                       Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2336,20 +2491,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G5R5X1_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G5R5X1_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_5_5_1    TransposeT;
     typedef Format4<5, 5, 5, 1> FormatT;
@@ -2358,20 +2514,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B5G5R5X1_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B5G5R5X1_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose5_5_5_1    TransposeT;
     typedef Format4<5, 5, 5, 1> FormatT;
@@ -2380,20 +2537,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2402,20 +2559,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
-    FormatSwizzle<0, 1>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8_USCALED> : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+                                    FormatSwizzle<0, 1>,
+                                    Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2424,20 +2581,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2446,20 +2602,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 16>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 16>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<16> TransposeT;
     typedef Format1<16>                  FormatT;
@@ -2468,20 +2623,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A1B5G5R5_UNORM> - Format traits specialization for A1B5G5R5_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A1B5G5R5_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>,
-    FormatSwizzle<3, 2, 1, 0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A1B5G5R5_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 1, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5>,
+      FormatSwizzle<3, 2, 1, 0>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose1_5_5_5    TransposeT;
     typedef Format4<1, 5, 5, 5> FormatT;
@@ -2490,20 +2646,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A4B4G4R4_UNORM> - Format traits specialization for A4B4G4R4_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A4B4G4R4_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
-    FormatSwizzle<3, 2, 1, 0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A4B4G4R4_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>,
+      FormatSwizzle<3, 2, 1, 0>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose4_4_4_4    TransposeT;
     typedef Format4<4, 4, 4, 4> FormatT;
@@ -2512,20 +2669,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8A8_UINT> - Format traits specialization for L8A8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8A8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<L8A8_UINT> : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+                                 FormatSwizzle<0, 3>,
+                                 Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2534,20 +2691,20 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8A8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<L8A8_SINT> : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+                                 FormatSwizzle<0, 3>,
+                                 Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 16 };
-    static const uint32_t numComps{ 2 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 1 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{16};
+    static const uint32_t numComps{2};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{1};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8  TransposeT;
     typedef Format2<8, 8> FormatT;
@@ -2556,20 +2713,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2578,20 +2734,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2600,20 +2755,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_SINT> - Format traits specialization for R8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2622,20 +2776,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2644,20 +2797,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<A8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<A8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<3>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2666,20 +2818,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<I8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2688,20 +2839,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2710,20 +2860,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2732,20 +2881,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2754,20 +2902,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<L8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2776,20 +2923,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<L8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2798,20 +2944,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<L8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<L8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2820,20 +2965,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<I8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2842,20 +2986,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<I8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<I8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2864,20 +3007,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<DXT1_RGB_SRGB> - Format traits specialization for DXT1_RGB_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<DXT1_RGB_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<DXT1_RGB_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2886,20 +3028,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<YCRCB_SWAPUVY> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<YCRCB_SWAPUVY>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ true };
-    static const uint32_t bcWidth{ 2 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{true};
+    static const uint32_t bcWidth{2};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -2908,20 +3051,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC1_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC1_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2930,20 +3072,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC2_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC2_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2952,20 +3093,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC3_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC3_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2974,20 +3114,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC4_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC4_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -2996,20 +3135,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC5_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC5_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3018,20 +3156,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC1_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC1_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3040,20 +3177,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC2_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC2_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3062,20 +3198,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC3_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC3_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3084,20 +3219,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<YCRCB_SWAPUV> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<YCRCB_SWAPUV>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ true };
-    static const uint32_t bcWidth{ 2 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{true};
+    static const uint32_t bcWidth{2};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8_8    TransposeT;
     typedef Format4<8, 8, 8, 8> FormatT;
@@ -3106,20 +3242,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<DXT1_RGB> - Format traits specialization for DXT1_RGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<DXT1_RGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<DXT1_RGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3128,20 +3263,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3150,20 +3286,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3172,20 +3309,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3194,20 +3332,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3216,20 +3355,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R64G64B64A64_FLOAT> - Format traits specialization for R64G64B64A64_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R64G64B64A64_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R64G64B64A64_FLOAT> : ComponentTraits<SWR_TYPE_FLOAT,
+                                                          64,
+                                                          SWR_TYPE_FLOAT,
+                                                          64,
+                                                          SWR_TYPE_FLOAT,
+                                                          64,
+                                                          SWR_TYPE_FLOAT,
+                                                          64>,
+                                          FormatSwizzle<0, 1, 2, 3>,
+                                          Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 256 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{256};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose64_64_64_64    TransposeT;
     typedef Format4<64, 64, 64, 64> FormatT;
@@ -3238,20 +3384,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R64G64B64_FLOAT> - Format traits specialization for R64G64B64_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R64G64B64_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R64G64B64_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64, SWR_TYPE_FLOAT, 64>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 192 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{192};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose64_64_64   TransposeT;
     typedef Format3<64, 64, 64> FormatT;
@@ -3260,20 +3407,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC4_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC4_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 64 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{64};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3282,20 +3428,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC5_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC5_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3304,20 +3449,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_FLOAT> :
-    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16_FLOAT>
+    : ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3326,20 +3472,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3348,20 +3495,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16_SNORM>
+    : ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3370,20 +3518,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16_SSCALED>
+    : ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3392,20 +3541,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R16G16B16_USCALED>
+    : ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3414,20 +3564,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC6H_SF16> :
-    ComponentTraits<SWR_TYPE_SNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC6H_SF16>
+    : ComponentTraits<SWR_TYPE_SNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3436,20 +3585,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC7_UNORM> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC7_UNORM>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3458,20 +3606,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC7_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC7_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{true};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3480,20 +3627,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<BC6H_UF16> :
-    ComponentTraits<SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<BC6H_UF16>
+    : ComponentTraits<SWR_TYPE_UNORM, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 128 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ true };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 4 };
-    static const uint32_t bcHeight{ 4 };
+    static const uint32_t bpp{128};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{true};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{4};
+    static const uint32_t bcHeight{4};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
@@ -3502,20 +3648,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_UNORM_SRGB> :
-    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R8G8B8_UNORM_SRGB>
+    : ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ true };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{true};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3524,20 +3671,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16B16_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3546,20 +3694,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R16G16B16_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R16G16B16_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 48 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{48};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose16_16_16   TransposeT;
     typedef Format3<16, 16, 16> FormatT;
@@ -3568,20 +3717,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R32_SFIXED> - Format traits specialization for R32_SFIXED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R32_SFIXED> :
-    ComponentTraits<SWR_TYPE_SFIXED, 32>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R32_SFIXED>
+    : ComponentTraits<SWR_TYPE_SFIXED, 32>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<32> TransposeT;
     typedef Format1<32>                  FormatT;
@@ -3590,20 +3738,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         2>,
+                                         FormatSwizzle<0, 1, 2, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3612,20 +3767,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           2>,
+                                           FormatSwizzle<0, 1, 2, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3634,20 +3796,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<R10G10B10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           2>,
+                                           FormatSwizzle<0, 1, 2, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3656,20 +3825,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R10G10B10A2_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
-    FormatSwizzle<0, 1, 2, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R10G10B10A2_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+      FormatSwizzle<0, 1, 2, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3678,20 +3848,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_SNORM> :
-    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10A2_SNORM> : ComponentTraits<SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         10,
+                                                         SWR_TYPE_SNORM,
+                                                         2>,
+                                         FormatSwizzle<2, 1, 0, 3>,
+                                         Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3700,20 +3877,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_USCALED> :
-    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10A2_USCALED> : ComponentTraits<SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           10,
+                                                           SWR_TYPE_USCALED,
+                                                           2>,
+                                           FormatSwizzle<2, 1, 0, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3722,20 +3906,27 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_SSCALED> :
-    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x3f800000>
+template <>
+struct FormatTraits<B10G10R10A2_SSCALED> : ComponentTraits<SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           10,
+                                                           SWR_TYPE_SSCALED,
+                                                           2>,
+                                           FormatSwizzle<2, 1, 0, 3>,
+                                           Defaults<0, 0, 0, 0x3f800000>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3744,20 +3935,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_UINT> - Format traits specialization for B10G10R10A2_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<B10G10R10A2_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3766,20 +3958,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<B10G10R10A2_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
-    FormatSwizzle<2, 1, 0, 3>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<B10G10R10A2_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+      FormatSwizzle<2, 1, 0, 3>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 32 };
-    static const uint32_t numComps{ 4 };
-    static const bool hasAlpha{ true };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{32};
+    static const uint32_t numComps{4};
+    static const bool     hasAlpha{true};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose10_10_10_2    TransposeT;
     typedef Format4<10, 10, 10, 2> FormatT;
@@ -3788,20 +3981,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_UINT> :
-    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8B8_UINT>
+    : ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3810,20 +4004,21 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<R8G8B8_SINT> :
-    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
-    FormatSwizzle<0, 1, 2>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<R8G8B8_SINT>
+    : ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+      FormatSwizzle<0, 1, 2>,
+      Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 24 };
-    static const uint32_t numComps{ 3 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 0 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{24};
+    static const uint32_t numComps{3};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{0};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef Transpose8_8_8   TransposeT;
     typedef Format3<8, 8, 8> FormatT;
@@ -3832,20 +4027,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// FormatTraits<RAW> - Format traits specialization for RAW
 //////////////////////////////////////////////////////////////////////////
-template<> struct FormatTraits<RAW> :
-    ComponentTraits<SWR_TYPE_UINT, 8>,
-    FormatSwizzle<0>,
-    Defaults<0, 0, 0, 0x1>
+template <>
+struct FormatTraits<RAW>
+    : ComponentTraits<SWR_TYPE_UINT, 8>, FormatSwizzle<0>, Defaults<0, 0, 0, 0x1>
 {
-    static const uint32_t bpp{ 8 };
-    static const uint32_t numComps{ 1 };
-    static const bool hasAlpha{ false };
-    static const uint32_t alphaComp{ 3 };
-    static const bool isSRGB{ false };
-    static const bool isBC{ false };
-    static const bool isSubsampled{ false };
-    static const uint32_t bcWidth{ 1 };
-    static const uint32_t bcHeight{ 1 };
+    static const uint32_t bpp{8};
+    static const uint32_t numComps{1};
+    static const bool     hasAlpha{false};
+    static const uint32_t alphaComp{3};
+    static const bool     isSRGB{false};
+    static const bool     isBC{false};
+    static const bool     isSubsampled{false};
+    static const uint32_t bcWidth{1};
+    static const uint32_t bcHeight{1};
 
     typedef TransposeSingleComponent<8> TransposeT;
     typedef Format1<8>                  FormatT;
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index c3327c1..518da82 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file formats.h
-*
-* @brief Definitions for SWR_FORMAT functions.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file format_types.h
+ *
+ * @brief Definitions for SWR_FORMAT functions.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "utils.h"
@@ -36,16 +36,16 @@
 template <uint32_t NumBits, bool Signed = false>
 struct PackTraits
 {
-    static const uint32_t MyNumBits = NumBits;
-    static simdscalar loadSOA(const uint8_t *pSrc) = delete;
-    static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
-    static simdscalar unpack(simdscalar &in) = delete;
-    static simdscalar pack(simdscalar &in) = delete;
+    static const uint32_t MyNumBits                                      = NumBits;
+    static simdscalar     loadSOA(const uint8_t* pSrc)                   = delete;
+    static void           storeSOA(uint8_t* pDst, simdscalar const& src) = delete;
+    static simdscalar     unpack(simdscalar& in)                         = delete;
+    static simdscalar     pack(simdscalar& in)                           = delete;
 #if ENABLE_AVX512_SIMD16
-    static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
-    static simd16scalar unpack(simd16scalar &in) = delete;
-    static simd16scalar pack(simd16scalar &in) = delete;
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)                   = delete;
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete;
+    static simd16scalar  unpack(simd16scalar& in)                         = delete;
+    static simd16scalar  pack(simd16scalar& in)                           = delete;
 #endif
 };
 
@@ -57,15 +57,15 @@
 {
     static const uint32_t MyNumBits = 0;
 
-    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
-    static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
-    static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
-    static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
+    static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_setzero_ps(); }
+    static void       storeSOA(uint8_t* pDst, simdscalar const& src) { return; }
+    static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); }
+    static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); }
 #if ENABLE_AVX512_SIMD16
-    static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
-    static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
-    static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
+    static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; }
+    static simd16scalar  unpack(simd16scalar& in) { return _simd16_setzero_ps(); }
+    static simd16scalar  pack(simd16scalar& in) { return _simd16_setzero_ps(); }
 #endif
 };
 
@@ -77,18 +77,18 @@
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const uint8_t *pSrc)
+    static simdscalar loadSOA(const uint8_t* pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
-        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
         return _mm256_insertf128_ps(result, vLo, 0);
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar const &src)
+    static void storeSOA(uint8_t* pDst, simdscalar const& src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -98,31 +98,33 @@
 #endif
     }
 
-    static simdscalar unpack(simdscalar &in)
+    static simdscalar unpack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
         __m128i resLo = _mm_cvtepu8_epi32(src);
-        __m128i resHi = _mm_shuffle_epi8(src,
-            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+        __m128i resHi =
+            _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
 
         __m256i result = _mm256_castsi128_si256(resLo);
-        result = _mm256_insertf128_si256(result, resHi, 1);
-        return simdscalar{ _mm256_castsi256_ps(result) };
+        result         = _mm256_insertf128_si256(result, resHi, 1);
+        return simdscalar{_mm256_castsi256_ps(result)};
 #else
-        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+        return _mm256_castsi256_ps(
+            _mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static simdscalar pack(simdscalar &in)
+    static simdscalar pack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_castps_si(in);
-        __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i     res16 =
+            _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
 #else
@@ -131,51 +133,64 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar loadSOA_16(const uint8_t *pSrc)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
     {
-        simd16scalar result = _simd16_setzero_ps();
-        simdscalar resultlo = _simd_setzero_ps();
+        simd16scalar result   = _simd16_setzero_ps();
+        simdscalar   resultlo = _simd_setzero_ps();
 
-        const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
+        const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
 
         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
-        result = _simd16_insert_ps(result, resultlo, 0);
+        result   = _simd16_insert_ps(result, resultlo, 0);
 
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
     {
         // store simd16 bytes
-        _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
+        _mm_store_ps(reinterpret_cast<float*>(pDst),
+                     _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
     }
 
-    static simd16scalar unpack(simd16scalar &in)
+    static simd16scalar unpack(simd16scalar& in)
     {
-        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
 
-    static simd16scalar pack(simd16scalar &in)
+    static simd16scalar pack(simd16scalar& in)
     {
         simd16scalari result = _simd16_setzero_si();
 
-        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
-        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
+        simdscalari inlo =
+            _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
+        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
 
-        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
-        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
+        simdscalari permlo =
+            _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
+        simdscalari permhi =
+            _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
 
-        simdscalari pack = _simd_packus_epi32(permlo, permhi);                  // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
+        simdscalari pack = _simd_packus_epi32(
+            permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
 
         const simdscalari zero = _simd_setzero_si();
 
-        permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
-        permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
+        permlo = _simd_permute2f128_si(
+            pack,
+            zero,
+            0x20); // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
+        permhi = _simd_permute2f128_si(
+            pack,
+            zero,
+            0x31); // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
 
-        pack = _simd_packus_epi16(permlo, permhi);                              // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
+        pack = _simd_packus_epi16(permlo,
+                                  permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00
+                                           // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
 
         result = _simd16_insert_si(result, pack, 0);
 
@@ -192,18 +207,18 @@
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const uint8_t *pSrc)
+    static simdscalar loadSOA(const uint8_t* pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
-        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        __m128 vLo    = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
         return _mm256_insertf128_ps(result, vLo, 0);
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar const &src)
+    static void storeSOA(uint8_t* pDst, simdscalar const& src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -213,32 +228,34 @@
 #endif
     }
 
-    static simdscalar unpack(simdscalar &in)
+    static simdscalar unpack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
         SWR_INVALID("I think this may be incorrect.");
-        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
         __m128i resLo = _mm_cvtepi8_epi32(src);
-        __m128i resHi = _mm_shuffle_epi8(src,
-            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+        __m128i resHi =
+            _mm_shuffle_epi8(src, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
 
         __m256i result = _mm256_castsi128_si256(resLo);
-        result = _mm256_insertf128_si256(result, resHi, 1);
+        result         = _mm256_insertf128_si256(result, resHi, 1);
         return _mm256_castsi256_ps(result);
 #else
-        return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+        return _mm256_castsi256_ps(
+            _mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static simdscalar pack(simdscalar &in)
+    static simdscalar pack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_castps_si(in);
-        __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i     res16 =
+            _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
 #else
@@ -247,51 +264,64 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar loadSOA_16(const uint8_t *pSrc)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
     {
-        simd16scalar result = _simd16_setzero_ps();
-        simdscalar resultlo = _simd_setzero_ps();
+        simd16scalar result   = _simd16_setzero_ps();
+        simdscalar   resultlo = _simd_setzero_ps();
 
-        const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
+        const __m128 src = _mm_load_ps(reinterpret_cast<const float*>(pSrc));
 
         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
-        result = _simd16_insert_ps(result, resultlo, 0);
+        result   = _simd16_insert_ps(result, resultlo, 0);
 
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
     {
         // store simd16 bytes
-        _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
+        _mm_store_ps(reinterpret_cast<float*>(pDst),
+                     _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
     }
 
-    static simd16scalar unpack(simd16scalar &in)
+    static simd16scalar unpack(simd16scalar& in)
     {
-        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd4scalari  tmp    = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
 
-    static simd16scalar pack(simd16scalar &in)
+    static simd16scalar pack(simd16scalar& in)
     {
         simd16scalari result = _simd16_setzero_si();
 
-        simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
-        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
+        simdscalari inlo =
+            _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
+        simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
 
-        simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
-        simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
+        simdscalari permlo =
+            _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
+        simdscalari permhi =
+            _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
 
-        simdscalari pack = _simd_packs_epi32(permlo, permhi);                   // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
+        simdscalari pack = _simd_packs_epi32(
+            permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
 
         const simdscalari zero = _simd_setzero_si();
 
-        permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
-        permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
+        permlo = _simd_permute2f128_si(
+            pack,
+            zero,
+            0x20); // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
+        permhi = _simd_permute2f128_si(
+            pack,
+            zero,
+            0x31); // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
 
-        pack = _simd_packs_epi16(permlo, permhi);                               // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
+        pack =
+            _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00
+                                               // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
 
         result = _simd16_insert_si(result, pack, 0);
 
@@ -308,18 +338,18 @@
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const uint8_t *pSrc)
+    static simdscalar loadSOA(const uint8_t* pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
-        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        __m128 vLo    = _mm_load_ps((const float*)pSrc);
         return _mm256_insertf128_ps(result, vLo, 0);
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar const &src)
+    static void storeSOA(uint8_t* pDst, simdscalar const& src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -329,31 +359,33 @@
 #endif
     }
 
-    static simdscalar unpack(simdscalar &in)
+    static simdscalar unpack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
         __m128i resLo = _mm_cvtepu16_epi32(src);
-        __m128i resHi = _mm_shuffle_epi8(src,
-            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+        __m128i resHi =
+            _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
 
         __m256i result = _mm256_castsi128_si256(resLo);
-        result = _mm256_insertf128_si256(result, resHi, 1);
+        result         = _mm256_insertf128_si256(result, resHi, 1);
         return _mm256_castsi256_ps(result);
 #else
-        return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+        return _mm256_castsi256_ps(
+            _mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static simdscalar pack(simdscalar &in)
+    static simdscalar pack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_castps_si(in);
-        __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        __m256i     res = _mm256_castsi128_si256(
+            _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
         return _mm256_castsi256_ps(res);
 #else
 #error Unsupported vector width
@@ -361,37 +393,45 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar loadSOA_16(const uint8_t *pSrc)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
     {
         simd16scalar result = _simd16_setzero_ps();
 
-        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
+        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
 
         result = _simd16_insert_ps(result, resultlo, 0);
 
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
     {
-        _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
+        _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
     }
 
-    static simd16scalar unpack(simd16scalar &in)
+    static simd16scalar unpack(simd16scalar& in)
     {
         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
 
         return _simd16_castsi_ps(result);
     }
 
-    static simd16scalar pack(simd16scalar &in)
+    static simd16scalar pack(simd16scalar& in)
     {
         const simd16scalari zero = _simd16_setzero_si();
 
-        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
-        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
+        simd16scalari permlo = _simd16_permute2f128_si(
+            _simd16_castps_si(in),
+            zero,
+            0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
+        simd16scalari permhi = _simd16_permute2f128_si(
+            _simd16_castps_si(in),
+            zero,
+            0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
 
-        simd16scalari result = _simd16_packus_epi32(permlo, permhi);    // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
+        simd16scalari result = _simd16_packus_epi32(
+            permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00
+                             // 00 00 00 00 00 00 00 00 00 (16b)
 
         return _simd16_castsi_ps(result);
     }
@@ -406,18 +446,18 @@
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const uint8_t *pSrc)
+    static simdscalar loadSOA(const uint8_t* pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
-        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        __m128 vLo    = _mm_load_ps((const float*)pSrc);
         return _mm256_insertf128_ps(result, vLo, 0);
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static void storeSOA(uint8_t *pDst, simdscalar const &src)
+    static void storeSOA(uint8_t* pDst, simdscalar const& src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -427,32 +467,34 @@
 #endif
     }
 
-    static simdscalar unpack(simdscalar &in)
+    static simdscalar unpack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
         SWR_INVALID("I think this may be incorrect.");
-        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i src   = _mm_castps_si128(_mm256_castps256_ps128(in));
         __m128i resLo = _mm_cvtepi16_epi32(src);
-        __m128i resHi = _mm_shuffle_epi8(src,
-            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+        __m128i resHi =
+            _mm_shuffle_epi8(src, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
 
         __m256i result = _mm256_castsi128_si256(resLo);
-        result = _mm256_insertf128_si256(result, resHi, 1);
+        result         = _mm256_insertf128_si256(result, resHi, 1);
         return _mm256_castsi256_ps(result);
 #else
-        return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+        return _mm256_castsi256_ps(
+            _mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
 #else
 #error Unsupported vector width
 #endif
     }
 
-    static simdscalar pack(simdscalar &in)
+    static simdscalar pack(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_castps_si(in);
-        __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        __m256i     res = _mm256_castsi128_si256(
+            _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
         return _mm256_castsi256_ps(res);
 #else
 #error Unsupported vector width
@@ -460,37 +502,45 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar loadSOA_16(const uint8_t *pSrc)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
     {
         simd16scalar result = _simd16_setzero_ps();
 
-        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
+        simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float*>(pSrc));
 
         result = _simd16_insert_ps(result, resultlo, 0);
 
         return result;
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
     {
-        _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
+        _simd_store_ps(reinterpret_cast<float*>(pDst), _simd16_extract_ps(src, 0));
     }
 
-    static simd16scalar unpack(simd16scalar &in)
+    static simd16scalar unpack(simd16scalar& in)
     {
         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
 
         return _simd16_castsi_ps(result);
     }
 
-    static simd16scalar pack(simd16scalar &in)
+    static simd16scalar pack(simd16scalar& in)
     {
         const simd16scalari zero = _simd16_setzero_si();
 
-        simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
-        simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
+        simd16scalari permlo = _simd16_permute2f128_si(
+            _simd16_castps_si(in),
+            zero,
+            0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
+        simd16scalari permhi = _simd16_permute2f128_si(
+            _simd16_castps_si(in),
+            zero,
+            0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
 
-        simd16scalari result = _simd16_packs_epi32(permlo, permhi);     // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
+        simd16scalari result = _simd16_packs_epi32(
+            permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00
+                             // 00 00 00 00 00 00 00 00 00 (16b)
 
         return _simd16_castsi_ps(result);
     }
@@ -505,188 +555,281 @@
 {
     static const uint32_t MyNumBits = 32;
 
-    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
-    static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
-    static simdscalar unpack(simdscalar &in) { return in; }
-    static simdscalar pack(simdscalar &in) { return in; }
+    static simdscalar loadSOA(const uint8_t* pSrc) { return _simd_load_ps((const float*)pSrc); }
+    static void       storeSOA(uint8_t* pDst, simdscalar const& src)
+    {
+        _simd_store_ps((float*)pDst, src);
+    }
+    static simdscalar unpack(simdscalar& in) { return in; }
+    static simdscalar pack(simdscalar& in) { return in; }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar loadSOA_16(const uint8_t *pSrc)
+    static simd16scalar loadSOA_16(const uint8_t* pSrc)
     {
-        return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
+        return _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
     }
 
-    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
+    static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src)
     {
-        _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst), src);
     }
 
-    static simd16scalar unpack(simd16scalar &in)
-    {
-        return in;
-    }
+    static simd16scalar unpack(simd16scalar& in) { return in; }
 
-    static simd16scalar pack(simd16scalar &in)
-    {
-        return in;
-    }
+    static simd16scalar pack(simd16scalar& in) { return in; }
 #endif
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits.
 //////////////////////////////////////////////////////////////////////////
-template<SWR_TYPE type, uint32_t NumBits>
+template <SWR_TYPE type, uint32_t NumBits>
 struct TypeTraits : PackTraits<NumBits>
 {
     static const SWR_TYPE MyType = type;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UINT8
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UINT8
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
 {
     static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UINT16
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for SINT16
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
 {
     static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UINT32
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UINT32
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
 {
     static const SWR_TYPE MyType = SWR_TYPE_SINT;
-    static float toFloat() { return 0.0; }
-    static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 0.0; }
+    static float          fromFloat()
+    {
+        SWR_NOT_IMPL;
+        return 0.0;
+    }
+    static simdscalar convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM5
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 31.0f; }
-    static float fromFloat() { return 31.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 31.0f; }
+    static float          fromFloat() { return 31.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM6
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 63.0f; }
-    static float fromFloat() { return 63.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 63.0f; }
+    static float          fromFloat() { return 63.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM8
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 255.0f; }
-    static float fromFloat() { return 255.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 255.0f; }
+    static float          fromFloat() { return 255.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM8
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
+template <>
+struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
 {
     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
-    static float toFloat() { return 1.0f / 127.0f; }
-    static float fromFloat() { return 127.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 127.0f; }
+    static float          fromFloat() { return 127.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM16
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 65535.0f; }
-    static float fromFloat() { return 65535.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 65535.0f; }
+    static float          fromFloat() { return 65535.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for SNORM16
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
+template <>
+struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 32767.0f; }
-    static float fromFloat() { return 32767.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 32767.0f; }
+    static float          fromFloat() { return 32767.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for UNORM24
 //////////////////////////////////////////////////////////////////////////
-template<>
-struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_UNORM, 24> : PackTraits<32>
 {
     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
-    static float toFloat() { return 1.0f / 16777215.0f; }
-    static float fromFloat() { return 16777215.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f / 16777215.0f; }
+    static float          fromFloat() { return 16777215.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -697,44 +840,47 @@
 
 #include "math.h"
 
-template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static __m128 fastpow(__m128 arg) {
+template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
+inline static __m128 fastpow(__m128 arg)
+{
     __m128 ret = arg;
 
-    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
-        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
+    static const __m128 factor =
+        _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f) *
+                    powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
 
     // Apply a constant pre-correction factor.
     ret = _mm_mul_ps(ret, factor);
 
     // Reinterpret arg as integer to obtain logarithm.
-    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
+    // asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
     ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
 
     // Multiply logarithm by power.
     ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
 
     // Convert back to "integer" to exponentiate.
-    //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
+    // asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
     ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
 
     return ret;
 }
 
-inline static __m128 pow512_4(__m128 arg) {
+inline static __m128 pow512_4(__m128 arg)
+{
     // 5/12 is too small, so compute the 4th root of 20/12 instead.
     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
-    __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
+    __m128 xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
     __m128 xover = _mm_mul_ps(arg, xf);
 
-    __m128 xfm1 = _mm_rsqrt_ps(xf);
-    __m128 x2 = _mm_mul_ps(arg, arg);
+    __m128 xfm1   = _mm_rsqrt_ps(xf);
+    __m128 x2     = _mm_mul_ps(arg, arg);
     __m128 xunder = _mm_mul_ps(x2, xfm1);
 
     // sqrt2 * over + 2 * sqrt2 * under
     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
-        _mm_add_ps(xover, xunder));
+                             _mm_add_ps(xover, xunder));
 
     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
@@ -743,17 +889,15 @@
 
 inline static __m128 powf_wrapper(__m128 Base, float Exp)
 {
-    float *f = (float *)(&Base);
+    float* f = (float*)(&Base);
 
-    return _mm_set_ps(powf(f[3], Exp),
-                      powf(f[2], Exp),
-                      powf(f[1], Exp),
-                      powf(f[0], Exp));
+    return _mm_set_ps(powf(f[3], Exp), powf(f[2], Exp), powf(f[1], Exp), powf(f[0], Exp));
 }
 
 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
 {
-    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
+    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float
+    // value
     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
 
     // squeeze the mask down to 16 bits (4 bits per DWORD)
@@ -779,7 +923,7 @@
 #else
         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
 #endif
-        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
+        f      = _mm_mul_ps(f, _mm_set1_ps(1.055f));
         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
     }
     else
@@ -800,11 +944,12 @@
         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
 
         // Clear the alpha (is garbage after the sub)
-        __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
+        __m128i i = _mm_and_si128(TO_M128i(f),
+                                  _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
 
         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
-        __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
+        __m128i CombinedParts    = _mm_or_si128(LessThanPart, GreaterEqualPart);
 
         Result = TO_M128(CombinedParts);
     }
@@ -813,43 +958,45 @@
 }
 
 #if ENABLE_AVX512_SIMD16
-template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
+template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
+inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value)
 {
-    static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
-        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
+    static const float factor1 = exp2(127.0f * expden / expnum - 127.0f) *
+                                 powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
 
     // Apply a constant pre-correction factor.
     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
 
     // Reinterpret arg as integer to obtain logarithm.
-    //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
+    // asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
     result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
 
     // Multiply logarithm by power.
     result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
 
     // Convert back to "integer" to exponentiate.
-    //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
+    // asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
     result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
 
     return result;
 }
 
-inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar const& arg)
 {
     // 5/12 is too small, so compute the 4th root of 20/12 instead.
     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
-    simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
+    simd16scalar xf    = fastpow<2, 3, int(0.629960524947437 * 1e9), int(1e9)>(arg);
     simd16scalar xover = _simd16_mul_ps(arg, xf);
 
-    simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
-    simd16scalar x2 = _simd16_mul_ps(arg, arg);
+    simd16scalar xfm1   = _simd16_rsqrt_ps(xf);
+    simd16scalar x2     = _simd16_mul_ps(arg, arg);
     simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
 
     // sqrt2 * over + 2 * sqrt2 * under
-    simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
+    simd16scalar xavg =
+        _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+                       _simd16_add_ps(xover, xunder));
 
     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
@@ -857,28 +1004,26 @@
     return xavg;
 }
 
-inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar& base, float exp)
 {
-    const float *f = reinterpret_cast<const float *>(&base);
+    const float* f = reinterpret_cast<const float*>(&base);
 
-    return _simd16_set_ps(
-        powf(f[15], exp),
-        powf(f[14], exp),
-        powf(f[13], exp),
-        powf(f[12], exp),
-        powf(f[11], exp),
-        powf(f[10], exp),
-        powf(f[ 9], exp),
-        powf(f[ 8], exp),
-        powf(f[ 7], exp),
-        powf(f[ 6], exp),
-        powf(f[ 5], exp),
-        powf(f[ 4], exp),
-        powf(f[ 3], exp),
-        powf(f[ 2], exp),
-        powf(f[ 1], exp),
-        powf(f[ 0], exp)
-    );
+    return _simd16_set_ps(powf(f[15], exp),
+                          powf(f[14], exp),
+                          powf(f[13], exp),
+                          powf(f[12], exp),
+                          powf(f[11], exp),
+                          powf(f[10], exp),
+                          powf(f[9], exp),
+                          powf(f[8], exp),
+                          powf(f[7], exp),
+                          powf(f[6], exp),
+                          powf(f[5], exp),
+                          powf(f[4], exp),
+                          powf(f[3], exp),
+                          powf(f[2], exp),
+                          powf(f[1], exp),
+                          powf(f[0], exp));
 }
 
 // float to SRGB conversion formula
@@ -888,7 +1033,7 @@
 // else
 //     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
 //
-static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
+static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value)
 {
     // create a mask where the source is < the minimal SRGB float value
     const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
@@ -913,7 +1058,8 @@
         // only native AVX512 can directly use the computed mask for the blend operation
         result = _mm512_mask_blend_ps(mask, result2, result);
 #else
-        result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
+        result               = _simd16_blendv_ps(
+            result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
 #endif
     }
 
@@ -924,88 +1070,100 @@
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for FLOAT16
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
+template <>
+struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
 {
     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
-    static float toFloat() { return 1.0f; }
-    static float fromFloat() { return 1.0f; }
-    static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
+    static float          toFloat() { return 1.0f; }
+    static float          fromFloat() { return 1.0f; }
+    static simdscalar     convertSrgb(simdscalar& in)
+    {
+        SWR_NOT_IMPL;
+        return _simd_setzero_ps();
+    }
 
-    static simdscalar pack(const simdscalar &in)
+    static simdscalar pack(const simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
         // input is 8 packed float32, output is 8 packed float16
         simdscalari src = _simd_castps_si(in);
 
-        static const uint32_t FLOAT_EXP_BITS = 8;
+        static const uint32_t FLOAT_EXP_BITS      = 8;
         static const uint32_t FLOAT_MANTISSA_BITS = 23;
         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
 
-        static const uint32_t HALF_EXP_BITS = 5;
+        static const uint32_t HALF_EXP_BITS      = 5;
         static const uint32_t HALF_MANTISSA_BITS = 10;
         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
 
         // minimum exponent required, exponents below this are flushed to 0.
-        static const int32_t HALF_EXP_MIN = -14;
+        static const int32_t HALF_EXP_MIN   = -14;
         static const int32_t FLOAT_EXP_BIAS = 127;
-        static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
-        static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
+        static const int32_t FLOAT_EXP_MIN  = HALF_EXP_MIN + FLOAT_EXP_BIAS;
+        static const int32_t FLOAT_EXP_MIN_FTZ =
+            FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
 
         // maximum exponent required, exponents above this are set to infinity
-        static const int32_t HALF_EXP_MAX = 15;
+        static const int32_t HALF_EXP_MAX  = 15;
         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
 
-        const simdscalari vSignMask     = _simd_set1_epi32(0x80000000);
-        const simdscalari vExpMask      = _simd_set1_epi32(FLOAT_EXP_MASK);
-        const simdscalari vManMask      = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
-        const simdscalari vExpMin       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
-        const simdscalari vExpMinFtz    = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
-        const simdscalari vExpMax       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
+        const simdscalari vSignMask = _simd_set1_epi32(0x80000000);
+        const simdscalari vExpMask  = _simd_set1_epi32(FLOAT_EXP_MASK);
+        const simdscalari vManMask  = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
+        const simdscalari vExpMin =
+            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
+        const simdscalari vExpMinFtz =
+            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
+        const simdscalari vExpMax =
+            _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
 
-        simdscalari vSign       = _simd_and_si(src, vSignMask);
-        simdscalari vExp        = _simd_and_si(src, vExpMask);
-        simdscalari vMan        = _simd_and_si(src, vManMask);
+        simdscalari vSign = _simd_and_si(src, vSignMask);
+        simdscalari vExp  = _simd_and_si(src, vExpMask);
+        simdscalari vMan  = _simd_and_si(src, vManMask);
 
         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
 
-        simdscalari vHalfExp    = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
+        simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin),
+                                               _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
 
         // pack output 16-bits into the lower 16-bits of each 32-bit channel
-        simdscalari vDst        = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
-        vDst   = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
+        simdscalari vDst =
+            _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
+        vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
 
         // Flush To Zero
-        vDst   = _simd_andnot_si(vFTZMask, vDst);
+        vDst = _simd_andnot_si(vFTZMask, vDst);
         // Apply Infinites / NaN
-        vDst   = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
+        vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
 
         // Apply clamps
         vDst = _simd_andnot_si(vClampMask, vDst);
-        vDst = _simd_or_si(vDst,
-                _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
+        vDst = _simd_or_si(vDst, _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
 
         // Compute Denormals (subnormals)
         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
         {
-            uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
-            uint32_t *pExp = (uint32_t*)&vExp;
-            uint32_t *pMan = (uint32_t*)&vMan;
-            uint32_t *pDst = (uint32_t*)&vDst;
+            uint32_t* pDenormMask = (uint32_t*)&vDenormMask;
+            uint32_t* pExp        = (uint32_t*)&vExp;
+            uint32_t* pMan        = (uint32_t*)&vMan;
+            uint32_t* pDst        = (uint32_t*)&vDst;
             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
             {
                 if (pDenormMask[i])
                 {
                     // Need to compute subnormal value
                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
-                    uint32_t mantissa = pMan[i] |
-                                        (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.  Make it explicit
+                    uint32_t mantissa =
+                        pMan[i] | (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.
+                                                               // Make it explicit
 
-                    pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
+                    pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) +
+                                           (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
                 }
             }
         }
@@ -1014,7 +1172,8 @@
         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
 
         // Pack to lower 128-bits
-        vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
+        vDst = _mm256_castsi128_si256(
+            _mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
 
 #if 0
 #if !defined(NDEBUG)
@@ -1037,7 +1196,7 @@
 #endif
     }
 
-    static simdscalar unpack(const simdscalar &in)
+    static simdscalar unpack(const simdscalar& in)
     {
         // input is 8 packed float16, output is 8 packed float32
         SWR_NOT_IMPL; // @todo
@@ -1045,10 +1204,10 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static simd16scalar pack(const simd16scalar &in)
+    static simd16scalar pack(const simd16scalar& in)
     {
-        simd16scalari result = _simd16_setzero_si();
-        simdscalari resultlo = _simd_setzero_si();
+        simd16scalari result   = _simd16_setzero_si();
+        simdscalari   resultlo = _simd_setzero_si();
 
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
         simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
@@ -1070,7 +1229,7 @@
         return _simd16_castsi_ps(result);
     }
 
-    static simd16scalar unpack(const simd16scalar &in)
+    static simd16scalar unpack(const simd16scalar& in)
     {
         // input is 16 packed float16, output is 16 packed float32
         SWR_NOT_IMPL; //  @todo
@@ -1082,12 +1241,13 @@
 //////////////////////////////////////////////////////////////////////////
 /// TypeTraits - Format type traits specialization for FLOAT32
 //////////////////////////////////////////////////////////////////////////
-template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
+template <>
+struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
 {
-    static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
-    static float toFloat() { return 1.0f; }
-    static float fromFloat() { return 1.0f; }
-    static inline simdscalar convertSrgb(simdscalar &in)
+    static const SWR_TYPE    MyType = SWR_TYPE_FLOAT;
+    static float             toFloat() { return 1.0f; }
+    static float             fromFloat() { return 1.0f; }
+    static inline simdscalar convertSrgb(simdscalar& in)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m128 srcLo = _mm256_extractf128_ps(in, 0);
@@ -1105,10 +1265,7 @@
     }
 #if ENABLE_AVX512_SIMD16
 
-    static inline simd16scalar convertSrgb(simd16scalar &in)
-    {
-        return ConvertFloatToSRGB2(in);
-    }
+    static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); }
 #endif
 };
 
@@ -1139,7 +1296,7 @@
 //////////////////////////////////////////////////////////////////////////
 /// Format1 - Bitfield for single component formats.
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t x>
+template <uint32_t x>
 union Format1
 {
     typedef typename FormatIntType<x>::TYPE TYPE;
@@ -1153,11 +1310,11 @@
     {
         TYPE g : x;
     };
-    struct 
+    struct
     {
         TYPE b : x;
     };
-    struct  
+    struct
     {
         TYPE a : x;
     };
@@ -1166,7 +1323,7 @@
 //////////////////////////////////////////////////////////////////////////
 /// Format2 - Bitfield for 2 component formats.
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t x, uint32_t y>
+template <uint32_t x, uint32_t y>
 union Format2
 {
     typedef typename FormatIntType<x + y>::TYPE TYPE;
@@ -1187,7 +1344,7 @@
 //////////////////////////////////////////////////////////////////////////
 /// Format3 - Bitfield for 3 component formats.
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t x, uint32_t y, uint32_t z>
+template <uint32_t x, uint32_t y, uint32_t z>
 union Format3
 {
     typedef typename FormatIntType<x + y + z>::TYPE TYPE;
@@ -1198,13 +1355,13 @@
         TYPE g : y;
         TYPE b : z;
     };
-    TYPE a;  ///@note This is here to provide full template needed in Formats.
+    TYPE a; ///@note This is here to provide full template needed in Formats.
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// Format4 - Bitfield for 4 component formats.
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
 struct Format4
 {
     typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
@@ -1218,12 +1375,12 @@
 //////////////////////////////////////////////////////////////////////////
 /// ComponentTraits - Default components
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+template <uint32_t x, uint32_t y, uint32_t z, uint32_t w>
 struct Defaults
 {
     INLINE static uint32_t GetDefault(uint32_t comp)
     {
-        static const uint32_t defaults[4]{ x, y, z, w };
+        static const uint32_t defaults[4]{x, y, z, w};
         return defaults[comp];
     }
 };
@@ -1231,25 +1388,31 @@
 //////////////////////////////////////////////////////////////////////////
 /// ComponentTraits - Component type traits.
 //////////////////////////////////////////////////////////////////////////
-template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
+template <SWR_TYPE X,
+          uint32_t NumBitsX,
+          SWR_TYPE Y        = SWR_TYPE_UNKNOWN,
+          uint32_t NumBitsY = 0,
+          SWR_TYPE Z        = SWR_TYPE_UNKNOWN,
+          uint32_t NumBitsZ = 0,
+          SWR_TYPE W        = SWR_TYPE_UNKNOWN,
+          uint32_t NumBitsW = 0>
 struct ComponentTraits
 {
     INLINE static SWR_TYPE GetType(uint32_t comp)
     {
-        static const SWR_TYPE CompType[4]{ X, Y, Z, W };
+        static const SWR_TYPE CompType[4]{X, Y, Z, W};
         return CompType[comp];
     }
 
     INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
     {
-        return (comp == 3) ? NumBitsW :
-            ((comp == 2) ? NumBitsZ :
-                ((comp == 1) ? NumBitsY : NumBitsX) );
+        return (comp == 3) ? NumBitsW
+                           : ((comp == 2) ? NumBitsZ : ((comp == 1) ? NumBitsY : NumBitsX));
     }
 
     INLINE static uint32_t GetBPC(uint32_t comp)
     {
-        static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
+        static const uint32_t MyBpc[4]{NumBitsX, NumBitsY, NumBitsZ, NumBitsW};
         return MyBpc[comp];
     }
 
@@ -1285,7 +1448,6 @@
         }
         SWR_INVALID("Invalid component: %d", comp);
         return TypeTraits<X, NumBitsX>::toFloat();
-
     }
 
     INLINE static float fromFloat(uint32_t comp)
@@ -1322,7 +1484,7 @@
         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
     }
 
-    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
+    INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src)
     {
         switch (comp)
         {
@@ -1342,19 +1504,23 @@
         SWR_INVALID("Invalid component: %d", comp);
     }
 
-    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
+    INLINE static simdscalar unpack(uint32_t comp, simdscalar& in)
     {
         simdscalar out;
         switch (comp)
         {
         case 0:
-            out = TypeTraits<X, NumBitsX>::unpack(in); break;
+            out = TypeTraits<X, NumBitsX>::unpack(in);
+            break;
         case 1:
-            out = TypeTraits<Y, NumBitsY>::unpack(in); break;
+            out = TypeTraits<Y, NumBitsY>::unpack(in);
+            break;
         case 2:
-            out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
+            out = TypeTraits<Z, NumBitsZ>::unpack(in);
+            break;
         case 3:
-            out = TypeTraits<W, NumBitsW>::unpack(in); break;
+            out = TypeTraits<W, NumBitsW>::unpack(in);
+            break;
         default:
             SWR_INVALID("Invalid component: %d", comp);
             out = in;
@@ -1363,19 +1529,23 @@
         return out;
     }
 
-    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
+    INLINE static simdscalar pack(uint32_t comp, simdscalar& in)
     {
         simdscalar out;
         switch (comp)
         {
         case 0:
-            out = TypeTraits<X, NumBitsX>::pack(in); break;
+            out = TypeTraits<X, NumBitsX>::pack(in);
+            break;
         case 1:
-            out = TypeTraits<Y, NumBitsY>::pack(in); break;
+            out = TypeTraits<Y, NumBitsY>::pack(in);
+            break;
         case 2:
-            out = TypeTraits<Z, NumBitsZ>::pack(in); break;
+            out = TypeTraits<Z, NumBitsZ>::pack(in);
+            break;
         case 3:
-            out = TypeTraits<W, NumBitsW>::pack(in); break;
+            out = TypeTraits<W, NumBitsW>::pack(in);
+            break;
         default:
             SWR_INVALID("Invalid component: %d", comp);
             out = in;
@@ -1384,7 +1554,7 @@
         return out;
     }
 
-    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
+    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar& in)
     {
         switch (comp)
         {
@@ -1419,7 +1589,7 @@
         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
     }
 
-    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
+    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src)
     {
         switch (comp)
         {
@@ -1440,7 +1610,7 @@
         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
     }
 
-    INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
+    INLINE static simd16scalar unpack(uint32_t comp, simd16scalar& in)
     {
         switch (comp)
         {
@@ -1457,7 +1627,7 @@
         return TypeTraits<X, NumBitsX>::unpack(in);
     }
 
-    INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
+    INLINE static simd16scalar pack(uint32_t comp, simd16scalar& in)
     {
         switch (comp)
         {
@@ -1474,7 +1644,7 @@
         return TypeTraits<X, NumBitsX>::pack(in);
     }
 
-    INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
+    INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar& in)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
index 576f14b..b51755d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
@@ -1,37 +1,37 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file utils.h
-*
-* @brief Utilities used by SWR core related to pixel formats.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file format_utils.h
+ *
+ * @brief Utilities used by SWR core related to pixel formats.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "core/utils.h"
 #include "common/simdintrin.h"
 
 INLINE
-void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
+void vTranspose(simd4scalar& row0, simd4scalar& row1, simd4scalar& row2, simd4scalar& row3)
 {
     simd4scalari row0i = SIMD128::castps_si(row0);
     simd4scalari row1i = SIMD128::castps_si(row1);
@@ -39,8 +39,8 @@
     simd4scalari row3i = SIMD128::castps_si(row3);
 
     simd4scalari vTemp = row2i;
-    row2i = SIMD128::unpacklo_epi32(row2i, row3i);
-    vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
+    row2i              = SIMD128::unpacklo_epi32(row2i, row3i);
+    vTemp              = SIMD128::unpackhi_epi32(vTemp, row3i);
 
     row3i = row0i;
     row0i = SIMD128::unpacklo_epi32(row0i, row1i);
@@ -61,11 +61,11 @@
 }
 
 INLINE
-void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
+void vTranspose(simd4scalari& row0, simd4scalari& row1, simd4scalari& row2, simd4scalari& row3)
 {
     simd4scalari vTemp = row2;
-    row2 = SIMD128::unpacklo_epi32(row2, row3);
-    vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
+    row2               = SIMD128::unpacklo_epi32(row2, row3);
+    vTemp              = SIMD128::unpackhi_epi32(vTemp, row3);
 
     row3 = row0;
     row0 = SIMD128::unpacklo_epi32(row0, row1);
@@ -82,17 +82,20 @@
 
 #if KNOB_SIMD_WIDTH == 8
 INLINE
-void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
+void vTranspose3x8(simd4scalar (&vDst)[8],
+                   const simdscalar& vSrc0,
+                   const simdscalar& vSrc1,
+                   const simdscalar& vSrc2)
 {
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2);              // x0z0x1z1 x4z4x5z5
+    simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); // y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);                // x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);                // x1y1z1w1 x5y5z5w5
 
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
+    r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2);              // x2z2x3z3 x6z6x7z7
+    r1rx                  = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); // y2w2y3w3 y6w6y7w7
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);                // x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);                // x3y3z3w3 x7y7z7w7
 
     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
@@ -106,17 +109,21 @@
 }
 
 INLINE
-void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
+void vTranspose4x8(simd4scalar (&vDst)[8],
+                   const simdscalar& vSrc0,
+                   const simdscalar& vSrc1,
+                   const simdscalar& vSrc2,
+                   const simdscalar& vSrc3)
 {
-    simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
-    simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
-    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
-    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
+    simdscalar r0r2       = _simd_unpacklo_ps(vSrc0, vSrc2); // x0z0x1z1 x4z4x5z5
+    simdscalar r1rx       = _simd_unpacklo_ps(vSrc1, vSrc3); // y0w0y1w1 y4w4y5w5
+    simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);   // x0y0z0w0 x4y4z4w4
+    simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);   // x1y1z1w1 x5y5z5w5
 
-    r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
-    r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6yw77
-    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
-    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
+    r0r2                  = _simd_unpackhi_ps(vSrc0, vSrc2); // x2z2x3z3 x6z6x7z7
+    r1rx                  = _simd_unpackhi_ps(vSrc1, vSrc3); // y2w2y3w3 y6w6y7w7
+    simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);   // x2y2z2w2 x6y6z6w6
+    simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);   // x3y3z3w3 x7y7z7w7
 
     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
@@ -131,9 +138,29 @@
 
 #if ENABLE_AVX512_SIMD16
 INLINE
-void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
+void vTranspose4x16(simd16scalar (&dst)[4],
+                    const simd16scalar& src0,
+                    const simd16scalar& src1,
+                    const simd16scalar& src2,
+                    const simd16scalar& src3)
 {
-    const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
+    const simd16scalari perm =
+        _simd16_set_epi32(15,
+                          11,
+                          7,
+                          3,
+                          14,
+                          10,
+                          6,
+                          2,
+                          13,
+                          9,
+                          5,
+                          1,
+                          12,
+                          8,
+                          4,
+                          0); // pre-permute input to set up the right order after all the unpacking
 
     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
@@ -153,46 +180,69 @@
 
 #endif
 INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8],
+                   const simdscalar& vMask0,
+                   const simdscalar& vMask1,
+                   const simdscalar& vMask2,
+                   const simdscalar& vMask3,
+                   const simdscalar& vMask4,
+                   const simdscalar& vMask5,
+                   const simdscalar& vMask6,
+                   const simdscalar& vMask7)
 {
-    simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
-    simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
-    simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
-    simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
-    simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
-    simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
-    simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
-    simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
-    simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
-    simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
-    simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
-    vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
-    vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
-    vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
-    vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
-    vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
-    vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
-    vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
-    vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
+    simdscalar __t0  = _simd_unpacklo_ps(vMask0, vMask1);
+    simdscalar __t1  = _simd_unpackhi_ps(vMask0, vMask1);
+    simdscalar __t2  = _simd_unpacklo_ps(vMask2, vMask3);
+    simdscalar __t3  = _simd_unpackhi_ps(vMask2, vMask3);
+    simdscalar __t4  = _simd_unpacklo_ps(vMask4, vMask5);
+    simdscalar __t5  = _simd_unpackhi_ps(vMask4, vMask5);
+    simdscalar __t6  = _simd_unpacklo_ps(vMask6, vMask7);
+    simdscalar __t7  = _simd_unpackhi_ps(vMask6, vMask7);
+    simdscalar __tt0 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt1 = _simd_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt2 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt3 = _simd_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt4 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt5 = _simd_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2));
+    simdscalar __tt6 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0));
+    simdscalar __tt7 = _simd_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2));
+    vDst[0]          = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
+    vDst[1]          = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2]          = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3]          = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4]          = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
+    vDst[5]          = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6]          = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7]          = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
 }
 
 INLINE
-void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
+void vTranspose8x8(simdscalar (&vDst)[8],
+                   const simdscalari& vMask0,
+                   const simdscalari& vMask1,
+                   const simdscalari& vMask2,
+                   const simdscalari& vMask3,
+                   const simdscalari& vMask4,
+                   const simdscalari& vMask5,
+                   const simdscalari& vMask6,
+                   const simdscalari& vMask7)
 {
-    vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), 
-        _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
+    vTranspose8x8(vDst,
+                  _simd_castsi_ps(vMask0),
+                  _simd_castsi_ps(vMask1),
+                  _simd_castsi_ps(vMask2),
+                  _simd_castsi_ps(vMask3),
+                  _simd_castsi_ps(vMask4),
+                  _simd_castsi_ps(vMask5),
+                  _simd_castsi_ps(vMask6),
+                  _simd_castsi_ps(vMask7));
 }
 #endif
 
 //////////////////////////////////////////////////////////////////////////
 /// TranposeSingleComponent
 //////////////////////////////////////////////////////////////////////////
-template<uint32_t bpp>
+template <uint32_t bpp>
 struct TransposeSingleComponent
 {
     //////////////////////////////////////////////////////////////////////////
@@ -227,23 +277,38 @@
 
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
-        simd4scalari c0c1 = src.v4[0];                                                          // rrrrrrrrgggggggg
-        simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1));  // bbbbbbbbaaaaaaaa
-        simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
-        simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
-        simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
-        simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
-        simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
-        simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
+        simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg
+        simd4scalari c2c3 =
+            SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa
+        simd4scalari c0c2    = SIMD128::unpacklo_epi64(c0c1, c2c3);            // rrrrrrrrbbbbbbbb
+        simd4scalari c1c3    = SIMD128::unpackhi_epi64(c0c1, c2c3);            // ggggggggaaaaaaaa
+        simd4scalari c01     = SIMD128::unpacklo_epi8(c0c2, c1c3);             // rgrgrgrgrgrgrgrg
+        simd4scalari c23     = SIMD128::unpackhi_epi8(c0c2, c1c3);             // babababababababa
+        simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);              // rgbargbargbargba
+        simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);              // rgbargbargbargba
         SIMD128::store_si((simd4scalari*)pDst, c0123lo);
         SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
 #else
         simdscalari dst01 = _simd_shuffle_epi8(src,
-            _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+                                               _simd_set_epi32(0x0f078080,
+                                                               0x0e068080,
+                                                               0x0d058080,
+                                                               0x0c048080,
+                                                               0x80800b03,
+                                                               0x80800a02,
+                                                               0x80800901,
+                                                               0x80800800));
         simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
-        dst23 = _simd_shuffle_epi8(dst23,
-            _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
-        simdscalari dst = _simd_or_si(dst01, dst23);
+        dst23             = _simd_shuffle_epi8(dst23,
+                                   _simd_set_epi32(0x80800f07,
+                                                   0x80800e06,
+                                                   0x80800d05,
+                                                   0x80800c04,
+                                                   0x0b038080,
+                                                   0x0a028080,
+                                                   0x09018080,
+                                                   0x08008080));
+        simdscalari dst   = _simd_or_si(dst01, dst23);
         _simd_store_si((simdscalari*)pDst, dst);
 #endif
 #else
@@ -254,23 +319,28 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
-        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
-        simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
-        simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+        simd4scalari src0 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simd4scalari src1 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
+        simd4scalari src2 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+        simd4scalari src3 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
 
         simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
         simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
         simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
         simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
 
-        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
+        simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
         simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
         simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
 
         simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
 
-        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
+        _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst),
+                         dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
     }
 #endif
 };
@@ -305,9 +375,9 @@
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 
-        simd4scalari rg = src.v4[0];           // rrrrrrrr gggggggg
-        simd4scalari g = SIMD128::unpackhi_epi64(rg, rg);             // gggggggg gggggggg
-        rg = SIMD128::unpacklo_epi8(rg, g);
+        simd4scalari rg = src.v4[0];                       // rrrrrrrr gggggggg
+        simd4scalari g  = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg
+        rg              = SIMD128::unpacklo_epi8(rg, g);
         SIMD128::store_si((simd4scalari*)pDst, rg);
 #else
 #error Unsupported vector width
@@ -317,8 +387,10 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
-        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
+        simd4scalari src0 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simd4scalari src1 =
+            SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
 
         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@@ -327,7 +399,8 @@
 
         simdscalari dst = _simd_or_si(cvt0, shl1);
 
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst),
+                       dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
     }
 #endif
 };
@@ -352,13 +425,13 @@
         simd4scalar vDst[8];
         vTranspose4x8(vDst, src0, src1, src2, src3);
         SIMD128::store_ps((float*)pDst, vDst[0]);
-        SIMD128::store_ps((float*)pDst+4, vDst[1]);
-        SIMD128::store_ps((float*)pDst+8, vDst[2]);
-        SIMD128::store_ps((float*)pDst+12, vDst[3]);
-        SIMD128::store_ps((float*)pDst+16, vDst[4]);
-        SIMD128::store_ps((float*)pDst+20, vDst[5]);
-        SIMD128::store_ps((float*)pDst+24, vDst[6]);
-        SIMD128::store_ps((float*)pDst+28, vDst[7]);
+        SIMD128::store_ps((float*)pDst + 4, vDst[1]);
+        SIMD128::store_ps((float*)pDst + 8, vDst[2]);
+        SIMD128::store_ps((float*)pDst + 12, vDst[3]);
+        SIMD128::store_ps((float*)pDst + 16, vDst[4]);
+        SIMD128::store_ps((float*)pDst + 20, vDst[5]);
+        SIMD128::store_ps((float*)pDst + 24, vDst[6]);
+        SIMD128::store_ps((float*)pDst + 28, vDst[7]);
 #else
 #error Unsupported vector width
 #endif
@@ -367,19 +440,19 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
-        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
+        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
+        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
+        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
+        simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 48);
 
         simd16scalar dst[4];
 
         vTranspose4x16(dst, src0, src1, src2, src3);
 
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
     }
 #endif
 };
@@ -418,19 +491,19 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
-        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
+        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
+        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
+        simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
         simd16scalar src3 = _simd16_setzero_ps();
 
         simd16scalar dst[4];
 
         vTranspose4x16(dst, src0, src1, src2, src3);
 
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
     }
 #endif
 };
@@ -447,11 +520,11 @@
     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
-        const float* pfSrc = (const float*)pSrc;
-        simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
-        simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
-        simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
-        simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
+        const float* pfSrc  = (const float*)pSrc;
+        simd4scalar  src_r0 = SIMD128::load_ps(pfSrc + 0);
+        simd4scalar  src_r1 = SIMD128::load_ps(pfSrc + 4);
+        simd4scalar  src_g0 = SIMD128::load_ps(pfSrc + 8);
+        simd4scalar  src_g1 = SIMD128::load_ps(pfSrc + 12);
 
         simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
         simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
@@ -471,20 +544,36 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
-        simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
+        simd16scalar src0 =
+            _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simd16scalar src1 =
+            _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
 
-        simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
-        simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
+        simd16scalar tmp0 =
+            _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
+        simd16scalar tmp1 =
+            _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
 
-        simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
-        simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
+        simd16scalar per0 = _simd16_permute2f128_ps(
+            tmp0,
+            tmp1,
+            0x44); // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
+        simd16scalar per1 = _simd16_permute2f128_ps(
+            tmp0,
+            tmp1,
+            0xEE); // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
 
-        simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
-        simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
+        simd16scalar dst0 = _simd16_permute2f128_ps(
+            per0,
+            per0,
+            0xD8); // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
+        simd16scalar dst1 = _simd16_permute2f128_ps(
+            per1,
+            per1,
+            0xD8); // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
 
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
-        _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0);  // rgrgrgrgrgrgrgrg
+        _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
     }
 #endif
 };
@@ -531,30 +620,38 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
+        simdscalari src0 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simdscalari src1 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
+        simdscalari src2 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+        simdscalari src3 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
 
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
+        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
+        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
 
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
+        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
+        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
+        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
+        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
 
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
+        simdscalari dst0 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
+        simdscalari dst1 = _simd_permute2f128_si(
+            tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
+        simdscalari dst2 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
+        simdscalari dst3 = _simd_permute2f128_si(
+            tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF
 
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
     }
 #endif
 };
@@ -600,30 +697,37 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
-        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
-        simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
+        simdscalari src0 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simdscalari src1 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
+        simdscalari src2 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+        simdscalari src3 = _simd_setzero_si();                             // aaaaaaaaaaaaaaaa
 
-        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
-        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
-        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
+        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
+        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
 
-        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
-        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
-        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
-        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
+        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rgba0 rgba1 rgba8 rgba9
+        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rgba2 rgba3 rgbaA rgbaB
+        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rgba4 rgba5 rgbaC rgbaD
+        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rgba6 rgba7 rgbaE rgbaF
 
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
-        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
-        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
-        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
+        simdscalari dst0 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
+        simdscalari dst1 = _simd_permute2f128_si(
+            tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
+        simdscalari dst2 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
+        simdscalari dst3 = _simd_permute2f128_si(
+            tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF
 
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
     }
 #endif
 };
@@ -661,17 +765,21 @@
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
-        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
+        simdscalari src0 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+        simdscalari src1 =
+            _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
 
-        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
-        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+        simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+        simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
 
-        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
-        simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
+        simdscalari dst0 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x20); // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
+        simdscalari dst1 = _simd_permute2f128_si(
+            tmp0, tmp1, 0x31); // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
 
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
-        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
+        _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
     }
 #endif
 };
@@ -879,4 +987,3 @@
     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
 #endif
 };
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 9630afa..b0d9f05 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file frontend.cpp
-*
-* @brief Implementation for Frontend which handles vertex processing,
-*        primitive assembly, clipping, binning, etc.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file frontend.cpp
+ *
+ * @brief Implementation for Frontend which handles vertex processing,
+ *        primitive assembly, clipping, binning, etc.
+ *
+ ******************************************************************************/
 
 #include "api.h"
 #include "frontend.h"
@@ -45,7 +45,8 @@
 /// @brief Helper macro to generate a bitmask
 static INLINE uint32_t GenMask(uint32_t numBits)
 {
-    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
+    SWR_ASSERT(
+        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
     return ((1U << numBits) - 1);
 }
 
@@ -56,17 +57,13 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to sync callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessSync(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
 {
     BE_WORK work;
-    work.type = SYNC;
+    work.type    = SYNC;
     work.pfnWork = ProcessSyncBE;
 
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
     pTileMgr->enqueue(0, 0, &work);
 }
 
@@ -76,17 +73,13 @@
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to sync callback.
-void ProcessShutdown(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
 {
     BE_WORK work;
-    work.type = SHUTDOWN;
+    work.type    = SHUTDOWN;
     work.pfnWork = ProcessShutdownBE;
 
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
     // Enqueue at least 1 work item for each worker thread
     // account for number of numa nodes
     uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
@@ -107,14 +100,10 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to clear callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessClear(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
 {
-    CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
+    MacroTileMgr* pTileMgr = pDC->pTileMgr;
 
     // queue a clear to each macro tile
     // compute macro tile bounds for the specified rect
@@ -124,8 +113,8 @@
     uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
 
     BE_WORK work;
-    work.type = CLEAR;
-    work.pfnWork = ProcessClearBE;
+    work.type       = CLEAR;
+    work.pfnWork    = ProcessClearBE;
     work.desc.clear = *pDesc;
 
     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
@@ -144,15 +133,11 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessStoreTiles(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
 {
     RDTSC_BEGIN(FEProcessStoreTiles, pDC->drawId);
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
-    STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
+    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
+    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;
 
     // queue a store to each macro tile
     // compute macro tile bounds for the specified rect
@@ -163,8 +148,8 @@
 
     // store tiles
     BE_WORK work;
-    work.type = STORETILES;
-    work.pfnWork = ProcessStoreTilesBE;
+    work.type            = STORETILES;
+    work.pfnWork         = ProcessStoreTilesBE;
     work.desc.storeTiles = *pDesc;
 
     for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
@@ -185,15 +170,14 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessDiscardInvalidateTiles(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
+                                   DRAW_CONTEXT* pDC,
+                                   uint32_t      workerId,
+                                   void*         pUserData)
 {
     RDTSC_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
-    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
-    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
+    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;
 
     // compute macro tile bounds for the specified rect
     uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
@@ -218,8 +202,8 @@
 
     // load tiles
     BE_WORK work;
-    work.type = DISCARDINVALIDATETILES;
-    work.pfnWork = ProcessDiscardInvalidateTilesBE;
+    work.type                        = DISCARDINVALIDATETILES;
+    work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
     work.desc.discardInvalidateTiles = *pDesc;
 
     for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
@@ -238,27 +222,40 @@
 /// @param mode - primitive topology for draw operation.
 /// @param numPrims - number of vertices or indices for draw.
 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
-uint32_t GetNumPrims(
-    PRIMITIVE_TOPOLOGY mode,
-    uint32_t numPrims)
+uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
 {
     switch (mode)
     {
-    case TOP_POINT_LIST: return numPrims;
-    case TOP_TRIANGLE_LIST: return numPrims / 3;
-    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
-    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
-    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
-    case TOP_QUAD_LIST: return numPrims / 4;
-    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
-    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
-    case TOP_LINE_LIST: return numPrims / 2;
-    case TOP_LINE_LOOP: return numPrims;
-    case TOP_RECT_LIST: return numPrims / 3;
-    case TOP_LINE_LIST_ADJ: return numPrims / 4;
-    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
-    case TOP_TRI_LIST_ADJ: return numPrims / 6;
-    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
+    case TOP_POINT_LIST:
+        return numPrims;
+    case TOP_TRIANGLE_LIST:
+        return numPrims / 3;
+    case TOP_TRIANGLE_STRIP:
+        return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_FAN:
+        return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_DISC:
+        return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_QUAD_LIST:
+        return numPrims / 4;
+    case TOP_QUAD_STRIP:
+        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
+    case TOP_LINE_STRIP:
+        return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_LINE_LIST:
+        return numPrims / 2;
+    case TOP_LINE_LOOP:
+        return numPrims;
+    case TOP_RECT_LIST:
+        return numPrims / 3;
+    case TOP_LINE_LIST_ADJ:
+        return numPrims / 4;
+    case TOP_LISTSTRIP_ADJ:
+        return numPrims < 3 ? 0 : numPrims - 3;
+    case TOP_TRI_LIST_ADJ:
+        return numPrims / 6;
+    case TOP_TRI_STRIP_ADJ:
+        return numPrims < 4 ? 0 : (numPrims / 2) - 2;
 
     case TOP_PATCHLIST_1:
     case TOP_PATCHLIST_2:
@@ -314,27 +311,40 @@
 /// @brief Computes the number of verts given the number of primitives.
 /// @param mode - primitive topology for draw operation.
 /// @param numPrims - number of primitives for draw.
-uint32_t GetNumVerts(
-    PRIMITIVE_TOPOLOGY mode,
-    uint32_t numPrims)
+uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
 {
     switch (mode)
     {
-    case TOP_POINT_LIST: return numPrims;
-    case TOP_TRIANGLE_LIST: return numPrims * 3;
-    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
-    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
-    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
-    case TOP_QUAD_LIST: return numPrims * 4;
-    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
-    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
-    case TOP_LINE_LIST: return numPrims * 2;
-    case TOP_LINE_LOOP: return numPrims;
-    case TOP_RECT_LIST: return numPrims * 3;
-    case TOP_LINE_LIST_ADJ: return numPrims * 4;
-    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
-    case TOP_TRI_LIST_ADJ: return numPrims * 6;
-    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
+    case TOP_POINT_LIST:
+        return numPrims;
+    case TOP_TRIANGLE_LIST:
+        return numPrims * 3;
+    case TOP_TRIANGLE_STRIP:
+        return numPrims ? numPrims + 2 : 0;
+    case TOP_TRIANGLE_FAN:
+        return numPrims ? numPrims + 2 : 0;
+    case TOP_TRIANGLE_DISC:
+        return numPrims ? numPrims + 1 : 0;
+    case TOP_QUAD_LIST:
+        return numPrims * 4;
+    case TOP_QUAD_STRIP:
+        return numPrims ? numPrims * 2 + 2 : 0;
+    case TOP_LINE_STRIP:
+        return numPrims ? numPrims + 1 : 0;
+    case TOP_LINE_LIST:
+        return numPrims * 2;
+    case TOP_LINE_LOOP:
+        return numPrims;
+    case TOP_RECT_LIST:
+        return numPrims * 3;
+    case TOP_LINE_LIST_ADJ:
+        return numPrims * 4;
+    case TOP_LISTSTRIP_ADJ:
+        return numPrims ? numPrims + 3 : 0;
+    case TOP_TRI_LIST_ADJ:
+        return numPrims * 6;
+    case TOP_TRI_STRIP_ADJ:
+        return numPrims ? (numPrims + 2) * 2 : 0;
 
     case TOP_PATCHLIST_1:
     case TOP_PATCHLIST_2:
@@ -465,10 +475,15 @@
         switch (topology)
         {
         case TOP_LISTSTRIP_ADJ:
-        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
+        case TOP_LINE_LIST_ADJ:
+            numVerts = 4;
+            break;
         case TOP_TRI_STRIP_ADJ:
-        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
-        default: break;
+        case TOP_TRI_LIST_ADJ:
+            numVerts = 6;
+            break;
+        default:
+            break;
         }
     }
 
@@ -480,14 +495,16 @@
 /// @param numWorkItems - Number of items being worked on by a SIMD.
 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
 {
-    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+    uint32_t numActive =
+        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
     return _simd_castps_si(_simd_vmask_ps(mask));
 }
 
 static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
 {
-    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
+    uint32_t numActive =
+        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
     return _simd16_castps_si(_simd16_vmask_ps(mask));
 }
@@ -499,23 +516,20 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
 static void StreamOut(
-    DRAW_CONTEXT* pDC,
-    PA_STATE& pa,
-    uint32_t workerId,
-    uint32_t* pPrimData,
-    uint32_t streamIndex)
+    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
 {
     RDTSC_BEGIN(FEStreamout, pDC->drawId);
 
-    const API_STATE& state = GetApiState(pDC);
-    const SWR_STREAMOUT_STATE &soState = state.soState;
+    const API_STATE&           state   = GetApiState(pDC);
+    const SWR_STREAMOUT_STATE& soState = state.soState;
 
     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
 
-    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
+    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
+    // vertex.
     uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);
 
-    SWR_STREAMOUT_CONTEXT soContext = { 0 };
+    SWR_STREAMOUT_CONTEXT soContext = {0};
 
     // Setup buffer state pointers.
     for (uint32_t i = 0; i < 4; ++i)
@@ -527,14 +541,14 @@
 
     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
     {
-        DWORD slot = 0;
-        uint32_t soMask = soState.streamMasks[streamIndex];
+        DWORD    slot   = 0;
+        uint64_t soMask = soState.streamMasks[streamIndex];
 
         // Write all entries into primitive data buffer for SOS.
-        while (_BitScanForward(&slot, soMask))
+        while (_BitScanForward64(&slot, soMask))
         {
-            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
-            uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
+            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
+            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
             pa.AssembleSingle(paSlot, primIndex, attrib);
 
             // Attribute offset is relative offset from start of vertex.
@@ -546,19 +560,21 @@
             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
             {
-                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+                uint32_t* pPrimDataAttrib =
+                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
 
                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
             }
 
-            soMask &= ~(1 << slot);
+            soMask &= ~(uint64_t(1) << slot);
         }
 
-        // Update pPrimData pointer 
+        // Update pPrimData pointer
         soContext.pPrimData = pPrimData;
 
         // Call SOS
-        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
+        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
+                   "Trying to execute uninitialized streamout jit function.");
         state.pfnSoFunc[streamIndex](soContext);
     }
 
@@ -620,7 +636,10 @@
 ///
 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
 ///
-void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex *vertex_simd16, const simdvertex *vertex, uint32_t vertexCount, uint32_t attribCount)
+void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex*     vertex_simd16,
+                                           const simdvertex* vertex,
+                                           uint32_t          vertexCount,
+                                           uint32_t          attribCount)
 {
     SWR_ASSERT(vertex);
     SWR_ASSERT(vertex_simd16);
@@ -634,11 +653,13 @@
         {
             for (uint32_t k = 0; k < 4; k += 1)
             {
-                temp.attrib[j][k] = _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
+                temp.attrib[j][k] =
+                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);
 
                 if ((i + 1) < vertexCount)
                 {
-                    temp.attrib[j][k] = _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
+                    temp.attrib[j][k] =
+                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                 }
             }
         }
@@ -658,9 +679,7 @@
 ///        then return the remaining amount of work.
 /// @param curIndex - The start index for the SIMD.
 /// @param maxIndex - The last index for all work items.
-static INLINE uint32_t GetNumInvocations(
-    uint32_t curIndex,
-    uint32_t maxIndex)
+static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
 {
     uint32_t remainder = (maxIndex - curIndex);
 #if USE_SIMD16_FRONTEND
@@ -680,17 +699,20 @@
 /// @param pStreamIdBase - pointer to the stream ID buffer
 /// @param numEmittedVerts - Number of total verts emitted by the GS
 /// @param pCutBuffer - output buffer to write cuts to
-void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+void ProcessStreamIdBuffer(uint32_t stream,
+                           uint8_t* pStreamIdBase,
+                           uint32_t numEmittedVerts,
+                           uint8_t* pCutBuffer)
 {
     SWR_ASSERT(stream < MAX_SO_STREAMS);
 
-    uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
+    uint32_t numInputBytes  = (numEmittedVerts * 2 + 7) / 8;
     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
 
     for (uint32_t b = 0; b < numOutputBytes; ++b)
     {
-        uint8_t curInputByte = pStreamIdBase[2*b];
-        uint8_t outByte = 0;
+        uint8_t curInputByte = pStreamIdBase[2 * b];
+        uint8_t outByte      = 0;
         for (uint32_t i = 0; i < 4; ++i)
         {
             if ((curInputByte & 0x3) != stream)
@@ -720,16 +742,17 @@
     uint8_t* pGsIn;
     uint8_t* pGsOut[KNOB_SIMD_WIDTH];
     uint8_t* pGsTransposed;
-    void* pStreamCutBuffer;
+    void*    pStreamCutBuffer;
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
-/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
+/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
+/// assembler
 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
 /// @param numVerts - Number of vertices outputted by the GS
 /// @param numAttribs - Number of attributes per vertex
-template<typename SIMD_T, uint32_t SimdWidth>
+template <typename SIMD_T, uint32_t SimdWidth>
 void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
 {
     uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
@@ -743,7 +766,7 @@
     }
     auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
 
-    uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
+    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
     uint32_t remainingVerts = numVerts;
 
     for (uint32_t s = 0; s < numSimd; ++s)
@@ -753,21 +776,36 @@
 
         // Compute mask to prevent src overflow
         uint32_t mask = std::min(remainingVerts, SimdWidth);
-        mask = GenMask(mask);
-        auto vMask = SIMD_T::vmask_ps(mask);
-        auto viMask = SIMD_T::castps_si(vMask);
+        mask          = GenMask(mask);
+        auto vMask    = SIMD_T::vmask_ps(mask);
+        auto viMask   = SIMD_T::castps_si(vMask);
 
         for (uint32_t a = 0; a < numAttribs; ++a)
         {
-            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
-            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
-            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
-            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
+            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                SIMD_T::setzero_ps(),
+                (const float*)(pSrcBase + sizeof(float)),
+                vGatherOffsets,
+                vMask);
+            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                SIMD_T::setzero_ps(),
+                (const float*)(pSrcBase + sizeof(float) * 2),
+                vGatherOffsets,
+                vMask);
+            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(
+                SIMD_T::setzero_ps(),
+                (const float*)(pSrcBase + sizeof(float) * 3),
+                vGatherOffsets,
+                vMask);
 
             SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
             SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
+            SIMD_T::maskstore_ps(
+                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
+            SIMD_T::maskstore_ps(
+                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
 
             pSrcBase += sizeof(float) * 4;
             pDstBase += sizeof(Float<SIMD_T>) * 4;
@@ -783,38 +821,35 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pa - The primitive assembly object.
 /// @param pGsOut - output stream for GS
-template <
-    typename HasStreamOutT,
-    typename HasRastT>
-static void GeometryShaderStage(
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    PA_STATE& pa,
-    GsBuffers* pGsBuffers,
-    uint32_t* pSoPrimData,
+template <typename HasStreamOutT, typename HasRastT>
+static void GeometryShaderStage(DRAW_CONTEXT* pDC,
+                                uint32_t      workerId,
+                                PA_STATE&     pa,
+                                GsBuffers*    pGsBuffers,
+                                uint32_t*     pSoPrimData,
 #if USE_SIMD16_FRONTEND
-    uint32_t numPrims_simd8,
+                                uint32_t numPrims_simd8,
 #endif
-    simdscalari const &primID)
+                                simdscalari const& primID)
 {
     RDTSC_BEGIN(FEGeometryShader, pDC->drawId);
 
     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE&    state  = GetApiState(pDC);
     const SWR_GS_STATE* pState = &state.gsState;
-    SWR_GS_CONTEXT gsContext;
+    SWR_GS_CONTEXT      gsContext;
 
-    static uint8_t sNullBuffer[128] = { 0 };
+    static uint8_t sNullBuffer[128] = {0};
 
     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
     {
         gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
     }
-    gsContext.pVerts = (simdvector*)pGsBuffers->pGsIn;
+    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
     gsContext.PrimitiveID = primID;
 
-    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
+    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
     simdvector attrib[MAX_NUM_VERTS_PER_PRIM];
 
     // assemble all attributes for the input primitive
@@ -822,7 +857,7 @@
     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
     {
         uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
-        uint32_t attribSlot = pState->vertexAttribOffset + slot;
+        uint32_t attribSlot    = pState->vertexAttribOffset + slot;
         pa.Assemble(srcAttribSlot, attrib);
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
@@ -843,13 +878,13 @@
 #if USE_SIMD16_FRONTEND
     uint32_t numInputPrims = numPrims_simd8;
 #else
-    uint32_t numInputPrims = pa.NumPrims();
+    uint32_t          numInputPrims = pa.NumPrims();
 #endif
 
     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
     {
         gsContext.InstanceID = instance;
-        gsContext.mask = GenerateMask(numInputPrims);
+        gsContext.mask       = GenerateMask(numInputPrims);
 
         // execute the geometry shader
         state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
@@ -868,23 +903,43 @@
     {
         switch (pState->outputTopology)
         {
-        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
-        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
-        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
-        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        case TOP_RECT_LIST:
+            pfnClipFunc = ClipRectangles_simd16;
+            break;
+        case TOP_TRIANGLE_STRIP:
+            pfnClipFunc = ClipTriangles_simd16;
+            break;
+        case TOP_LINE_STRIP:
+            pfnClipFunc = ClipLines_simd16;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints_simd16;
+            break;
+        default:
+            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
         }
     }
 
 #else
-    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
+    PFN_PROCESS_PRIMS pfnClipFunc   = nullptr;
     if (HasRastT::value)
     {
         switch (pState->outputTopology)
         {
-        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
-        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
-        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
-        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        case TOP_RECT_LIST:
+            pfnClipFunc = ClipRectangles;
+            break;
+        case TOP_TRIANGLE_STRIP:
+            pfnClipFunc = ClipTriangles;
+            break;
+        case TOP_LINE_STRIP:
+            pfnClipFunc = ClipLines;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints;
+            break;
+        default:
+            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
         }
     }
 
@@ -920,29 +975,37 @@
             }
 
             uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
-            uint8_t* pCutBase = pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
+            uint8_t* pCutBase =
+                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
             uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;
 
 #if USE_SIMD16_FRONTEND
-            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+                                                          pVertexBaseAOS,
+                                                          vertexCount,
+                                                          pState->outputVertexSize);
 #else
-            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed, pVertexBaseAOS, vertexCount, pState->outputVertexSize);
+            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
+                                                        pVertexBaseAOS,
+                                                        vertexCount,
+                                                        pState->outputVertexSize);
 #endif
 
             uint32_t numAttribs = state.feNumAttributes;
 
             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
             {
-                bool processCutVerts = false;
-                uint8_t* pCutBuffer = pCutBase;
+                bool     processCutVerts = false;
+                uint8_t* pCutBuffer      = pCutBase;
 
                 // assign default stream ID, only relevant when GS is outputting a single stream
                 uint32_t streamID = 0;
                 if (pState->isSingleStream)
                 {
                     processCutVerts = true;
-                    streamID = pState->singleStreamID;
-                    if (streamID != stream) continue;
+                    streamID        = pState->singleStreamID;
+                    if (streamID != stream)
+                        continue;
                 }
                 else
                 {
@@ -953,16 +1016,35 @@
                     }
 
                     // multi-stream output, need to translate StreamID buffer to a cut buffer
-                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
-                    pCutBuffer = (uint8_t*)pGsBuffers->pStreamCutBuffer;
+                    ProcessStreamIdBuffer(
+                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
+                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                     processCutVerts = false;
                 }
 
 #if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
+                PA_STATE_CUT gsPa(pDC,
+                                  (uint8_t*)pGsBuffers->pGsTransposed,
+                                  numEmittedVerts,
+                                  pState->outputVertexSize,
+                                  reinterpret_cast<simd16mask*>(pCutBuffer),
+                                  numEmittedVerts,
+                                  numAttribs,
+                                  pState->outputTopology,
+                                  processCutVerts,
+                                  pa.numVertsPerPrim);
 
 #else
-                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
+                PA_STATE_CUT gsPa(pDC,
+                                  (uint8_t*)pGsBuffers->pGsTransposed,
+                                  numEmittedVerts,
+                                  pState->outputVertexSize,
+                                  pCutBuffer,
+                                  numEmittedVerts,
+                                  numAttribs,
+                                  pState->outputTopology,
+                                  processCutVerts,
+                                  pa.numVertsPerPrim);
 
 #endif
                 while (gsPa.GetNextStreamOutput())
@@ -997,18 +1079,19 @@
 
                                 // Gather data from the SVG if provided.
                                 simd16scalari vViewportIdx = SIMD16::setzero_si();
-                                simd16scalari vRtIdx = SIMD16::setzero_si();
-                                SIMD16::Vec4 svgAttrib[4];
+                                simd16scalari vRtIdx       = SIMD16::setzero_si();
+                                SIMD16::Vec4  svgAttrib[4];
 
-                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                                if (state.backendState.readViewportArrayIndex ||
+                                    state.backendState.readRenderTargetArrayIndex)
                                 {
                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                 }
 
-
                                 if (state.backendState.readViewportArrayIndex)
                                 {
-                                    vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                                    vViewportIdx =
+                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                     gsPa.viewportArrayActive = true;
                                 }
                                 if (state.backendState.readRenderTargetArrayIndex)
@@ -1019,36 +1102,50 @@
 
                                 {
                                     // OOB VPAI indices => forced to zero.
-                                    vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
-                                    simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
+                                    vViewportIdx =
+                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
+                                    simd16scalari vNumViewports =
+                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simd16scalari vClearMask =
+                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                     vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
 
                                     gsPa.useAlternateOffset = false;
-                                    pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
+                                    pfnClipFunc(pDC,
+                                                gsPa,
+                                                workerId,
+                                                attrib_simd16,
+                                                GenMask(gsPa.NumPrims()),
+                                                vPrimId,
+                                                vViewportIdx,
+                                                vRtIdx);
                                 }
 #else
                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
 
                                 // Gather data from the SVG if provided.
                                 simdscalari vViewportIdx = SIMD::setzero_si();
-                                simdscalari vRtIdx = SIMD::setzero_si();
-                                SIMD::Vec4 svgAttrib[4];
+                                simdscalari vRtIdx       = SIMD::setzero_si();
+                                SIMD::Vec4  svgAttrib[4];
 
-                                if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                                if (state.backendState.readViewportArrayIndex ||
+                                    state.backendState.readRenderTargetArrayIndex)
                                 {
                                     gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                 }
 
-
                                 if (state.backendState.readViewportArrayIndex)
                                 {
-                                    vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                                    vViewportIdx =
+                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
 
                                     // OOB VPAI indices => forced to zero.
-                                    vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                                    simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                    simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                                    vViewportIdx =
+                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+                                    simdscalari vNumViewports =
+                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                    simdscalari vClearMask =
+                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                     vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                     gsPa.viewportArrayActive = true;
                                 }
@@ -1058,7 +1155,14 @@
                                     gsPa.rtArrayActive = true;
                                 }
 
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewportIdx, vRtIdx);
+                                pfnClipFunc(pDC,
+                                            gsPa,
+                                            workerId,
+                                            attrib,
+                                            GenMask(gsPa.NumPrims()),
+                                            vPrimId,
+                                            vViewportIdx,
+                                            vRtIdx);
 #endif
                             }
                         }
@@ -1071,7 +1175,7 @@
     // update GS pipeline stats
     UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
     UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
-    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim*numInputPrims));
+    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
     RDTSC_END(FEGeometryShader, 1);
 }
 
@@ -1081,8 +1185,11 @@
 /// @param state - API state
 /// @param ppGsOut - pointer to GS output buffer allocation
 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
-template<typename SIMD_T, uint32_t SIMD_WIDTH>
-static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, uint32_t vertsPerPrim, GsBuffers* pGsBuffers)
+template <typename SIMD_T, uint32_t SIMD_WIDTH>
+static INLINE void AllocateGsBuffers(DRAW_CONTEXT*    pDC,
+                                     const API_STATE& state,
+                                     uint32_t         vertsPerPrim,
+                                     GsBuffers*       pGsBuffers)
 {
     auto pArena = pDC->pArena;
     SWR_ASSERT(pArena != nullptr);
@@ -1092,7 +1199,7 @@
 
     // Allocate storage for vertex inputs
     uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
-    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
+    pGsBuffers->pGsIn           = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);
 
     // Allocate arena space to hold GS output verts
     const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;
@@ -1104,7 +1211,8 @@
 
     // Allocate storage for transposed GS output
     uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
-    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
+    uint32_t transposedBufferSize =
+        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
     pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
 
     // Allocate storage to hold temporary stream->cut buffer, if necessary
@@ -1114,7 +1222,8 @@
     }
     else
     {
-        pGsBuffers->pStreamCutBuffer = (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
+        pGsBuffers->pStreamCutBuffer =
+            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
     }
 }
 
@@ -1124,12 +1233,12 @@
 struct TessellationThreadLocalData
 {
     SWR_HS_CONTEXT hsContext;
-    ScalarPatch patchData[KNOB_SIMD_WIDTH];
-    void* pTxCtx;
-    size_t tsCtxSize;
+    ScalarPatch    patchData[KNOB_SIMD_WIDTH];
+    void*          pTxCtx;
+    size_t         tsCtxSize;
 
     simdscalar* pDSOutput;
-    size_t dsOutputAllocSize;
+    size_t      dsOutputAllocSize;
 };
 
 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
@@ -1142,8 +1251,8 @@
     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
     if (gt_pTessellationThreadData == nullptr)
     {
-        gt_pTessellationThreadData = (TessellationThreadLocalData*)
-            AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
+        gt_pTessellationThreadData =
+            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
         memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
     }
 }
@@ -1154,42 +1263,37 @@
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pa - The primitive assembly object.
 /// @param pGsOut - output stream for GS
-template <
-    typename HasGeometryShaderT,
-    typename HasStreamOutT,
-    typename HasRastT>
-static void TessellationStages(
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    PA_STATE& pa,
-    GsBuffers* pGsBuffers,
-    uint32_t* pSoPrimData,
+template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
+static void TessellationStages(DRAW_CONTEXT* pDC,
+                               uint32_t      workerId,
+                               PA_STATE&     pa,
+                               GsBuffers*    pGsBuffers,
+                               uint32_t*     pSoPrimData,
 #if USE_SIMD16_FRONTEND
-    uint32_t numPrims_simd8,
+                               uint32_t numPrims_simd8,
 #endif
-    simdscalari const &primID)
+                               simdscalari const& primID)
 {
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE&    state   = GetApiState(pDC);
     const SWR_TS_STATE& tsState = state.tsState;
     void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     SWR_ASSERT(gt_pTessellationThreadData);
 
-    HANDLE tsCtx = TSInitCtx(
-        tsState.domain,
-        tsState.partitioning,
-        tsState.tsOutputTopology,
-        gt_pTessellationThreadData->pTxCtx,
-        gt_pTessellationThreadData->tsCtxSize);
+    HANDLE tsCtx = TSInitCtx(tsState.domain,
+                             tsState.partitioning,
+                             tsState.tsOutputTopology,
+                             gt_pTessellationThreadData->pTxCtx,
+                             gt_pTessellationThreadData->tsCtxSize);
     if (tsCtx == nullptr)
     {
-        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
-        tsCtx = TSInitCtx(
-            tsState.domain,
-            tsState.partitioning,
-            tsState.tsOutputTopology,
-            gt_pTessellationThreadData->pTxCtx,
-            gt_pTessellationThreadData->tsCtxSize);
+        gt_pTessellationThreadData->pTxCtx =
+            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
+        tsCtx = TSInitCtx(tsState.domain,
+                          tsState.partitioning,
+                          tsState.tsOutputTopology,
+                          gt_pTessellationThreadData->pTxCtx,
+                          gt_pTessellationThreadData->tsCtxSize);
     }
     SWR_ASSERT(tsCtx);
 
@@ -1199,10 +1303,17 @@
     {
         switch (tsState.postDSTopology)
         {
-        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
-        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
-        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
-        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        case TOP_TRIANGLE_LIST:
+            pfnClipFunc = ClipTriangles_simd16;
+            break;
+        case TOP_LINE_LIST:
+            pfnClipFunc = ClipLines_simd16;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints_simd16;
+            break;
+        default:
+            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
         }
     }
 
@@ -1212,17 +1323,24 @@
     {
         switch (tsState.postDSTopology)
         {
-        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
-        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
-        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
-        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        case TOP_TRIANGLE_LIST:
+            pfnClipFunc = ClipTriangles;
+            break;
+        case TOP_LINE_LIST:
+            pfnClipFunc = ClipLines;
+            break;
+        case TOP_POINT_LIST:
+            pfnClipFunc = ClipPoints;
+            break;
+        default:
+            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
         }
     }
 
 #endif
     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
-    hsContext.pCPout = gt_pTessellationThreadData->patchData;
-    hsContext.PrimitiveID = primID;
+    hsContext.pCPout          = gt_pTessellationThreadData->patchData;
+    hsContext.PrimitiveID     = primID;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
     // Max storage for one attribute for an entire simdprimitive
@@ -1264,7 +1382,7 @@
     for (uint32_t p = 0; p < numPrims; ++p)
     {
         // Run Tessellator
-        SWR_TS_TESSELLATED_DATA tsData = { 0 };
+        SWR_TS_TESSELLATED_DATA tsData = {0};
         RDTSC_BEGIN(FETessellation, pDC->drawId);
         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
         AR_EVENT(TessPrimCount(1));
@@ -1277,17 +1395,20 @@
         SWR_ASSERT(tsData.NumDomainPoints);
 
         // Allocate DS Output memory
-        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
+        uint32_t requiredDSVectorInvocations =
+            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
 #if USE_SIMD16_FRONTEND
-        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.dsAllocationSize;      // simd8 -> simd16, padding
+        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
+                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
 #else
         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
-        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
+        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
 #endif
         if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
         {
             AlignedFree(gt_pTessellationThreadData->pDSOutput);
-            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
+            gt_pTessellationThreadData->pDSOutput =
+                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
             gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
         }
         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
@@ -1299,21 +1420,22 @@
 
         // Run Domain Shader
         SWR_DS_CONTEXT dsContext;
-        dsContext.PrimitiveID = pPrimId[p];
-        dsContext.pCpIn = &hsContext.pCPout[p];
-        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
-        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
-        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
+        dsContext.PrimitiveID           = pPrimId[p];
+        dsContext.pCpIn                 = &hsContext.pCPout[p];
+        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
+        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
+        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
         dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
 #if USE_SIMD16_FRONTEND
-        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);      // simd8 -> simd16
+        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
 #else
-        dsContext.vectorStride = requiredDSVectorInvocations;
+        dsContext.vectorStride         = requiredDSVectorInvocations;
 #endif
 
         uint32_t dsInvocations = 0;
 
-        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
+        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
+             ++dsContext.vectorOffset)
         {
             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
 
@@ -1328,14 +1450,14 @@
         UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
 
 #if USE_SIMD16_FRONTEND
-        SWR_ASSERT(IsEven(dsContext.vectorStride));                             // simd8 -> simd16
+        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16
 
 #endif
         PA_TESS tessPa(
             pDC,
 #if USE_SIMD16_FRONTEND
-            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),      // simd8 -> simd16
-            dsContext.vectorStride / 2,                                         // simd8 -> simd16
+            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
+            dsContext.vectorStride / 2,                                   // simd8 -> simd16
 #else
             dsContext.pOutputData,
             dsContext.vectorStride,
@@ -1350,29 +1472,37 @@
         while (tessPa.HasWork())
         {
 #if USE_SIMD16_FRONTEND
-            const uint32_t numPrims = tessPa.NumPrims();
+            const uint32_t numPrims    = tessPa.NumPrims();
             const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+            const uint32_t numPrims_hi =
+                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
 
-            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
-            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
-            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
+            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
+            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
 
 #endif
             if (HasGeometryShaderT::value)
             {
 #if USE_SIMD16_FRONTEND
                 tessPa.useAlternateOffset = false;
-                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
+                GeometryShaderStage<HasStreamOutT, HasRastT>(
+                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);
 
                 if (numPrims_hi)
                 {
                     tessPa.useAlternateOffset = true;
-                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
+                    GeometryShaderStage<HasStreamOutT, HasRastT>(
+                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                 }
 #else
                 GeometryShaderStage<HasStreamOutT, HasRastT>(
-                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, _simd_set1_epi32(dsContext.PrimitiveID));
+                    pDC,
+                    workerId,
+                    tessPa,
+                    pGsBuffers,
+                    pSoPrimData,
+                    _simd_set1_epi32(dsContext.PrimitiveID));
 #endif
             }
             else
@@ -1388,9 +1518,9 @@
                 if (HasRastT::value)
                 {
 #if USE_SIMD16_FRONTEND
-                    simd16vector    prim_simd16[3]; // Only deal with triangles, lines, or points
+                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
 #else
-                    simdvector      prim[3];        // Only deal with triangles, lines, or points
+                    simdvector prim[3]; // Only deal with triangles, lines, or points
 #endif
                     RDTSC_BEGIN(FEPAAssemble, pDC->drawId);
                     bool assemble =
@@ -1406,15 +1536,15 @@
 #if USE_SIMD16_FRONTEND
                     // Gather data from the SVG if provided.
                     simd16scalari vViewportIdx = SIMD16::setzero_si();
-                    simd16scalari vRtIdx = SIMD16::setzero_si();
-                    SIMD16::Vec4 svgAttrib[4];
+                    simd16scalari vRtIdx       = SIMD16::setzero_si();
+                    SIMD16::Vec4  svgAttrib[4];
 
-                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                    if (state.backendState.readViewportArrayIndex ||
+                        state.backendState.readRenderTargetArrayIndex)
                     {
                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                     }
 
-
                     if (state.backendState.readViewportArrayIndex)
                     {
                         vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
@@ -1430,20 +1560,29 @@
                     {
                         // OOB VPAI indices => forced to zero.
                         vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
-                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                        simd16scalari vNumViewports =
+                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                         simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
-                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);
+                        vViewportIdx             = SIMD16::and_si(vClearMask, vViewportIdx);
 
                         tessPa.useAlternateOffset = false;
-                        pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, vViewportIdx, vRtIdx);
+                        pfnClipFunc(pDC,
+                                    tessPa,
+                                    workerId,
+                                    prim_simd16,
+                                    GenMask(numPrims),
+                                    primID,
+                                    vViewportIdx,
+                                    vRtIdx);
                     }
 #else
                     // Gather data from the SGV if provided.
                     simdscalari vViewportIdx = SIMD::setzero_si();
-                    simdscalari vRtIdx = SIMD::setzero_si();
-                    SIMD::Vec4 svgAttrib[4];
+                    simdscalari vRtIdx       = SIMD::setzero_si();
+                    SIMD::Vec4  svgAttrib[4];
 
-                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                    if (state.backendState.readViewportArrayIndex ||
+                        state.backendState.readRenderTargetArrayIndex)
                     {
                         tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                     }
@@ -1454,18 +1593,24 @@
 
                         // OOB VPAI indices => forced to zero.
                         vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
-                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
+                        simdscalari vNumViewports  = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                        simdscalari vClearMask     = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                        vViewportIdx               = SIMD::and_si(vClearMask, vViewportIdx);
                         tessPa.viewportArrayActive = true;
                     }
                     if (state.backendState.readRenderTargetArrayIndex)
                     {
-                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                        vRtIdx               = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                         tessPa.rtArrayActive = true;
                     }
-                    pfnClipFunc(pDC, tessPa, workerId, prim,
-                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), vViewportIdx, vRtIdx);
+                    pfnClipFunc(pDC,
+                                tessPa,
+                                workerId,
+                                prim,
+                                GenMask(tessPa.NumPrims()),
+                                _simd_set1_epi32(dsContext.PrimitiveID),
+                                vViewportIdx,
+                                vRtIdx);
 #endif
                 }
             }
@@ -1473,7 +1618,7 @@
             tessPa.NextPrim();
 
         } // while (tessPa.HasWork())
-    } // for (uint32_t p = 0; p < numPrims; ++p)
+    }     // for (uint32_t p = 0; p < numPrims; ++p)
 
 #if USE_SIMD16_FRONTEND
     if (gt_pTessellationThreadData->pDSOutput != nullptr)
@@ -1487,8 +1632,8 @@
     TSDestroyCtx(tsCtx);
 }
 
-THREAD PA_STATE::SIMDVERTEX *gpVertexStore = nullptr;
-THREAD uint32_t gVertexStoreSize = 0;
+THREAD PA_STATE::SIMDVERTEX* gpVertexStore = nullptr;
+THREAD uint32_t gVertexStoreSize           = 0;
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrDraw.
@@ -1501,20 +1646,14 @@
 /// @param pDC - pointer to draw context.
 /// @param workerId - thread's worker id.
 /// @param pUserData - Pointer to DRAW_WORK
-template <
-    typename IsIndexedT,
-    typename IsCutIndexEnabledT,
-    typename HasTessellationT,
-    typename HasGeometryShaderT,
-    typename HasStreamOutT,
-    typename HasRastT>
-void ProcessDraw(
-    SWR_CONTEXT *pContext,
-    DRAW_CONTEXT *pDC,
-    uint32_t workerId,
-    void *pUserData)
+template <typename IsIndexedT,
+          typename IsCutIndexEnabledT,
+          typename HasTessellationT,
+          typename HasGeometryShaderT,
+          typename HasStreamOutT,
+          typename HasRastT>
+void ProcessDraw(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
 {
-
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_QUEUE_FE)
     {
@@ -1526,8 +1665,8 @@
 
     void* pWorkerData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
-    DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
-    const API_STATE&    state = GetApiState(pDC);
+    DRAW_WORK&       work  = *(DRAW_WORK*)pUserData;
+    const API_STATE& state = GetApiState(pDC);
 
     uint32_t indexSize = 0;
     uint32_t endVertex = work.numVerts;
@@ -1565,9 +1704,11 @@
     if (HasGeometryShaderT::value)
     {
 #if USE_SIMD16_FRONTEND
-        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
+        AllocateGsBuffers<SIMD512, KNOB_SIMD16_WIDTH>(
+            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
 #else
-        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
+        AllocateGsBuffers<SIMD256, KNOB_SIMD_WIDTH>(
+            pDC, state, NumVertsPerPrim(state.topology, true), &gsBuffers);
 #endif
     }
 
@@ -1597,14 +1738,14 @@
 #if USE_SIMD16_FRONTEND
     uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simd16vector);
 #else
-    uint32_t simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
+    uint32_t          simdVertexSizeBytes = state.frontendState.vsVertexSize * sizeof(simdvector);
 #endif
 
     SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);
 
     // Compute storage requirements for vertex store
     // TODO: allocation needs to be rethought for better cut support
-    uint32_t numVerts = vertexCount + 2; // Need extra space for PA state machine
+    uint32_t numVerts        = vertexCount + 2; // Need extra space for PA state machine
     uint32_t vertexStoreSize = numVerts * simdVertexSizeBytes;
 
     // grow the vertex store for the PA as necessary
@@ -1618,30 +1759,36 @@
 
         SWR_ASSERT(gpVertexStore == nullptr);
 
-        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(vertexStoreSize, 64));
+        gpVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX*>(AlignedMalloc(vertexStoreSize, 64));
         gVertexStoreSize = vertexStoreSize;
 
         SWR_ASSERT(gpVertexStore != nullptr);
     }
 
     // choose primitive assembler
-    
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
-    PA_STATE& pa = paFactory.GetPA();
+
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC,
+                                                         state.topology,
+                                                         work.numVerts,
+                                                         gpVertexStore,
+                                                         numVerts,
+                                                         state.frontendState.vsVertexSize,
+                                                         GetNumVerts(state.topology, 1));
+    PA_STATE&                                  pa = paFactory.GetPA();
 
 #if USE_SIMD16_FRONTEND
 #if USE_SIMD16_SHADERS
-    simd16vertex        vin;
+    simd16vertex vin;
 #else
-    simdvertex          vin_lo;
-    simdvertex          vin_hi;
+    simdvertex vin_lo;
+    simdvertex vin_hi;
 #endif
-    SWR_VS_CONTEXT      vsContext_lo;
-    SWR_VS_CONTEXT      vsContext_hi;
+    SWR_VS_CONTEXT vsContext_lo;
+    SWR_VS_CONTEXT vsContext_hi;
 
 #if USE_SIMD16_SHADERS
-    vsContext_lo.pVin = reinterpret_cast<simdvertex *>(&vin);
-    vsContext_hi.pVin = reinterpret_cast<simdvertex *>(&vin);
+    vsContext_lo.pVin = reinterpret_cast<simdvertex*>(&vin);
+    vsContext_hi.pVin = reinterpret_cast<simdvertex*>(&vin);
 #else
     vsContext_lo.pVin = &vin_lo;
     vsContext_hi.pVin = &vin_hi;
@@ -1649,11 +1796,11 @@
     vsContext_lo.AlternateOffset = 0;
     vsContext_hi.AlternateOffset = 1;
 
-    SWR_FETCH_CONTEXT   fetchInfo_lo = { 0 };
+    SWR_FETCH_CONTEXT fetchInfo_lo = {0};
 
-    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
+    fetchInfo_lo.pStreams      = &state.vertexBuffers[0];
     fetchInfo_lo.StartInstance = work.startInstance;
-    fetchInfo_lo.StartVertex = 0;
+    fetchInfo_lo.StartVertex   = 0;
 
     if (IsIndexedT::value)
     {
@@ -1672,27 +1819,30 @@
         fetchInfo_lo.StartVertex = work.startVertex;
     }
 
-    SWR_FETCH_CONTEXT   fetchInfo_hi = fetchInfo_lo;
+    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
 
-    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    const simd16scalari vScale =
+        _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 
     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
     {
-        uint32_t  i = 0;
+        uint32_t i = 0;
 
         simd16scalari vIndex;
 
         if (IsIndexedT::value)
         {
             fetchInfo_lo.xpIndices = work.xpIB;
-            fetchInfo_hi.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize;    // 1/2 of KNOB_SIMD16_WIDTH
+            fetchInfo_hi.xpIndices =
+                fetchInfo_lo.xpIndices + KNOB_SIMD_WIDTH * indexSize; // 1/2 of KNOB_SIMD16_WIDTH
         }
         else
         {
             vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
 
             fetchInfo_lo.xpIndices = (gfxptr_t)&vIndex;
-            fetchInfo_hi.xpIndices = (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH
+            fetchInfo_hi.xpIndices =
+                (gfxptr_t)&vIndex + KNOB_SIMD_WIDTH * sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH
         }
 
         fetchInfo_lo.CurInstance = instanceNum;
@@ -1703,24 +1853,24 @@
 
         while (pa.HasWork())
         {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
-            // So we need to keep this outside of (i < endVertex) check.
+            // GetNextVsOutput currently has the side effect of updating some PA state machine
+            // state. So we need to keep this outside of (i < endVertex) check.
 
-            simdmask *pvCutIndices_lo = nullptr;
-            simdmask *pvCutIndices_hi = nullptr;
+            simdmask* pvCutIndices_lo = nullptr;
+            simdmask* pvCutIndices_hi = nullptr;
 
             if (IsIndexedT::value)
             {
                 // simd16mask <=> simdmask[2]
 
-                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
-                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
+                pvCutIndices_lo = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[0];
+                pvCutIndices_hi = &reinterpret_cast<simdmask*>(&pa.GetNextVsIndices())[1];
             }
 
-            simd16vertex &vout = pa.GetNextVsOutput();
+            simd16vertex& vout = pa.GetNextVsOutput();
 
-            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
-            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);
+            vsContext_lo.pVout = reinterpret_cast<simdvertex*>(&vout);
+            vsContext_hi.pVout = reinterpret_cast<simdvertex*>(&vout);
 
             if (i < endVertex)
             {
@@ -1728,16 +1878,17 @@
                 {
                     fetchInfo_lo.xpLastIndex = fetchInfo_lo.xpIndices;
                     uint32_t offset;
-                    offset = std::min(endVertex-i, (uint32_t) KNOB_SIMD16_WIDTH);
-#if USE_SIMD16_SHADERS
+                    offset = std::min(endVertex - i, (uint32_t)KNOB_SIMD16_WIDTH);
                     offset *= 4; // convert from index to address
+#if USE_SIMD16_SHADERS
                     fetchInfo_lo.xpLastIndex += offset;
 #else
-                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t) KNOB_SIMD_WIDTH) * 4; // * 4 for converting index to address
-                    uint32_t offset2 = std::min(offset, (uint32_t) KNOB_SIMD16_WIDTH)-KNOB_SIMD_WIDTH;
+                    fetchInfo_lo.xpLastIndex += std::min(offset, (uint32_t)KNOB_SIMD_WIDTH);
+                    uint32_t offset2 =
+                        std::min(offset, (uint32_t)KNOB_SIMD16_WIDTH) - KNOB_SIMD_WIDTH;
                     assert(offset >= 0);
                     fetchInfo_hi.xpLastIndex = fetchInfo_hi.xpIndices;
-                    fetchInfo_hi.xpLastIndex += offset2 * 4; // * 4 for converting index to address
+                    fetchInfo_hi.xpLastIndex += offset2;
 #endif
                 }
                 // 1. Execute FS/VS for a single SIMD.
@@ -1747,7 +1898,7 @@
 #else
                 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_lo, vin_lo);
 
-                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                 {
                     state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo_hi, vin_hi);
                 }
@@ -1757,10 +1908,10 @@
                 // forward fetch generated vertex IDs to the vertex shader
 #if USE_SIMD16_SHADERS
 #if USE_SIMD16_VS
-                vsContext_lo.VertexID16 = _simd16_insert_si(
-                    vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
-                vsContext_lo.VertexID16 = _simd16_insert_si(
-                    vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
+                vsContext_lo.VertexID16 =
+                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
+                vsContext_lo.VertexID16 =
+                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
 #else
                 vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                 vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
@@ -1774,8 +1925,8 @@
 #if USE_SIMD16_VS
                 vsContext_lo.mask16 = GenerateMask16(endVertex - i);
 #else
-                vsContext_lo.mask = GenerateMask(endVertex - i);
-                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
+                vsContext_lo.mask     = GenerateMask(endVertex - i);
+                vsContext_hi.mask     = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
 #endif
 
                 // forward cut mask to the PA
@@ -1804,7 +1955,7 @@
                     state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                     AR_EVENT(VSStats(vsContext_lo.stats.numInstExecuted));
 
-                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
+                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                     {
                         state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                         AR_EVENT(VSStats(vsContext_hi.stats.numInstExecuted));
@@ -1838,33 +1989,61 @@
                             UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
 
                             const uint32_t numPrims = pa.NumPrims();
-                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
-                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
+                            const uint32_t numPrims_lo =
+                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
+                            const uint32_t numPrims_hi =
+                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;
 
-                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
-                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
-                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);
+                            const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
+                            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
+                            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
 
                             if (HasTessellationT::value)
                             {
                                 pa.useAlternateOffset = false;
-                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
+                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    numPrims_lo,
+                                    primID_lo);
 
                                 if (numPrims_hi)
                                 {
                                     pa.useAlternateOffset = true;
-                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
+                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
+                                        pDC,
+                                        workerId,
+                                        pa,
+                                        &gsBuffers,
+                                        pSoPrimData,
+                                        numPrims_hi,
+                                        primID_hi);
                                 }
                             }
                             else if (HasGeometryShaderT::value)
                             {
                                 pa.useAlternateOffset = false;
-                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);
+                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+                                                                             workerId,
+                                                                             pa,
+                                                                             &gsBuffers,
+                                                                             pSoPrimData,
+                                                                             numPrims_lo,
+                                                                             primID_lo);
 
                                 if (numPrims_hi)
                                 {
                                     pa.useAlternateOffset = true;
-                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
+                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
+                                                                                 workerId,
+                                                                                 pa,
+                                                                                 &gsBuffers,
+                                                                                 pSoPrimData,
+                                                                                 numPrims_hi,
+                                                                                 primID_hi);
                                 }
                             }
                             else
@@ -1882,14 +2061,14 @@
                                     // Gather data from the SVG if provided.
                                     simd16scalari vpai = SIMD16::setzero_si();
                                     simd16scalari rtai = SIMD16::setzero_si();
-                                    SIMD16::Vec4 svgAttrib[4];
+                                    SIMD16::Vec4  svgAttrib[4];
 
-                                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                                    if (state.backendState.readViewportArrayIndex ||
+                                        state.backendState.readRenderTargetArrayIndex)
                                     {
                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                     }
 
-
                                     if (state.backendState.readViewportArrayIndex)
                                     {
                                         vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
@@ -1897,19 +2076,29 @@
                                     }
                                     if (state.backendState.readRenderTargetArrayIndex)
                                     {
-                                        rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                        rtai =
+                                            SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                         pa.rtArrayActive = true;
                                     }
 
                                     {
                                         // OOB VPAI indices => forced to zero.
                                         vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
-                                        simd16scalari vNumViewports = SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vpai, vNumViewports);
+                                        simd16scalari vNumViewports =
+                                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                        simd16scalari vClearMask =
+                                            SIMD16::cmplt_epi32(vpai, vNumViewports);
                                         vpai = SIMD16::and_si(vClearMask, vpai);
 
                                         pa.useAlternateOffset = false;
-                                        pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, vpai, rtai);
+                                        pDC->pState->pfnProcessPrims_simd16(pDC,
+                                                                            pa,
+                                                                            workerId,
+                                                                            prim_simd16,
+                                                                            GenMask(numPrims),
+                                                                            primID,
+                                                                            vpai,
+                                                                            rtai);
                                     }
                                 }
                             }
@@ -1935,12 +2124,12 @@
     }
 
 #else
-    SWR_VS_CONTEXT      vsContext;
-    SWR_FETCH_CONTEXT   fetchInfo = { 0 };
+    SWR_VS_CONTEXT    vsContext;
+    SWR_FETCH_CONTEXT fetchInfo = {0};
 
-    fetchInfo.pStreams = &state.vertexBuffers[0];
+    fetchInfo.pStreams      = &state.vertexBuffers[0];
     fetchInfo.StartInstance = work.startInstance;
-    fetchInfo.StartVertex = 0;
+    fetchInfo.StartVertex   = 0;
 
     if (IsIndexedT::value)
     {
@@ -1948,7 +2137,8 @@
 
         // if the entire index buffer isn't being consumed, set the last index
         // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        fetchInfo.pLastIndex =
+            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
         if (xpLastRequestedIndex < fetchInfo.pLastIndex)
         {
             fetchInfo.pLastIndex = xpLastRequestedIndex;
@@ -1959,13 +2149,13 @@
         fetchInfo.StartVertex = work.startVertex;
     }
 
-    const simdscalari   vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 
     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
     {
         simdscalari vIndex;
-        uint32_t  i = 0;
+        uint32_t    i = 0;
 
         if (IsIndexedT::value)
         {
@@ -1973,17 +2163,17 @@
         }
         else
         {
-            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
+            vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
             fetchInfo.pIndices = (const int32_t*)&vIndex;
         }
 
         fetchInfo.CurInstance = instanceNum;
-        vsContext.InstanceID = instanceNum;
+        vsContext.InstanceID  = instanceNum;
 
         while (pa.HasWork())
         {
-            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
-            // So we need to keep this outside of (i < endVertex) check.
+            // GetNextVsOutput currently has the side effect of updating some PA state machine
+            // state. So we need to keep this outside of (i < endVertex) check.
             simdmask* pvCutIndices = nullptr;
             if (IsIndexedT::value)
             {
@@ -1991,12 +2181,11 @@
             }
 
             simdvertex& vout = pa.GetNextVsOutput();
-            vsContext.pVin = &vout;
-            vsContext.pVout = &vout;
+            vsContext.pVin   = &vout;
+            vsContext.pVout  = &vout;
 
             if (i < endVertex)
             {
-
                 // 1. Execute FS/VS for a single SIMD.
                 RDTSC_BEGIN(FEFetchShader, pDC->drawId);
                 state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
@@ -2053,12 +2242,22 @@
                             if (HasTessellationT::value)
                             {
                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    pa.GetPrimID(work.startPrimID));
                             }
                             else if (HasGeometryShaderT::value)
                             {
                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
-                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, pa.GetPrimID(work.startPrimID));
+                                    pDC,
+                                    workerId,
+                                    pa,
+                                    &gsBuffers,
+                                    pSoPrimData,
+                                    pa.GetPrimID(work.startPrimID));
                             }
                             else
                             {
@@ -2074,33 +2273,45 @@
 
                                     // Gather data from the SVG if provided.
                                     simdscalari vViewportIdx = SIMD::setzero_si();
-                                    simdscalari vRtIdx = SIMD::setzero_si();
-                                    SIMD::Vec4 svgAttrib[4];
+                                    simdscalari vRtIdx       = SIMD::setzero_si();
+                                    SIMD::Vec4  svgAttrib[4];
 
-                                    if (state.backendState.readViewportArrayIndex || state.backendState.readRenderTargetArrayIndex)
+                                    if (state.backendState.readViewportArrayIndex ||
+                                        state.backendState.readRenderTargetArrayIndex)
                                     {
                                         pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                     }
 
                                     if (state.backendState.readViewportArrayIndex)
                                     {
-                                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
+                                        vViewportIdx =
+                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
 
                                         // OOB VPAI indices => forced to zero.
-                                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
-                                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
-                                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
+                                        vViewportIdx =
+                                            SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
+                                        simdscalari vNumViewports =
+                                            SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+                                        simdscalari vClearMask =
+                                            SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                         vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                         pa.viewportArrayActive = true;
                                     }
                                     if (state.backendState.readRenderTargetArrayIndex)
                                     {
-                                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
+                                        vRtIdx =
+                                            SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                         pa.rtArrayActive = true;
                                     }
 
-                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
-                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), vViewportIdx, vRtIdx);
+                                    pDC->pState->pfnProcessPrims(pDC,
+                                                                 pa,
+                                                                 workerId,
+                                                                 prim,
+                                                                 GenMask(pa.NumPrims()),
+                                                                 pa.GetPrimID(work.startPrimID),
+                                                                 vViewportIdx,
+                                                                 vRtIdx);
                                 }
                             }
                         }
@@ -2110,7 +2321,8 @@
 
             if (IsIndexedT::value)
             {
-                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+                fetchInfo.pIndices =
+                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
             }
             else
             {
@@ -2138,15 +2350,18 @@
     }
 };
 
-
 // Selector for correct templated Draw front-end function
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
-    bool IsIndexed,
-    bool IsCutIndexEnabled,
-    bool HasTessellation,
-    bool HasGeometryShader,
-    bool HasStreamOut,
-    bool HasRasterization)
+PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
+                                    bool IsCutIndexEnabled,
+                                    bool HasTessellation,
+                                    bool HasGeometryShader,
+                                    bool HasStreamOut,
+                                    bool HasRasterization)
 {
-    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
+    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
+                                                       IsCutIndexEnabled,
+                                                       HasTessellation,
+                                                       HasGeometryShader,
+                                                       HasStreamOut,
+                                                       HasRasterization);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 6a2ec84..38fe77e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -1,38 +1,38 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file frontend.h
-*
-* @brief Definitions for Frontend which handles vertex processing,
-*        primitive assembly, clipping, binning, etc.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file frontend.h
+ *
+ * @brief Definitions for Frontend which handles vertex processing,
+ *        primitive assembly, clipping, binning, etc.
+ *
+ ******************************************************************************/
 #pragma once
 #include "context.h"
 #include "common/simdintrin.h"
 #include <type_traits>
 
 // Calculates the A and B coefficients for the 3 edges of the triangle
-// 
+//
 // maths for edge equations:
 //   standard form of a line in 2d
 //   Ax + By + C = 0
@@ -40,14 +40,14 @@
 //   B = x1 - x0
 //   C = x0y1 - x1y0
 INLINE
-void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB)
+void triangleSetupAB(const __m128 vX, const __m128 vY, __m128& vA, __m128& vB)
 {
     // vYsub = y1 y2 y0 dc
     __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1));
     // vY =    y0 y1 y2 dc
     vA = _mm_sub_ps(vY, vYsub);
 
-    // Result: 
+    // Result:
     // A[0] = y0 - y1
     // A[1] = y1 - y2
     // A[2] = y2 - y0
@@ -57,28 +57,31 @@
     // vX =    x0 x1 x2 dc
     vB = _mm_sub_ps(vXsub, vX);
 
-    // Result: 
+    // Result:
     // B[0] = x1 - x0
     // B[1] = x2 - x1
     // B[2] = x0 - x2
 }
 
 INLINE
-void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB)
+void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i& vA, __m128i& vB)
 {
     // generate edge equations
     // A = y0 - y1
     // B = x1 - x0
     // C = x0y1 - x1y0
     __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1));
-    vA = _mm_sub_epi32(vY, vYsub);
+    vA            = _mm_sub_epi32(vY, vYsub);
 
     __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1));
-    vB = _mm_sub_epi32(vXsub, vX);
+    vB            = _mm_sub_epi32(vXsub, vX);
 }
 
 INLINE
-void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3])
+void triangleSetupABIntVertical(const simdscalari vX[3],
+                                const simdscalari vY[3],
+                                simdscalari (&vA)[3],
+                                simdscalari (&vB)[3])
 {
     // A = y0 - y1
     // B = x1 - x0
@@ -93,7 +96,10 @@
 
 #if ENABLE_AVX512_SIMD16
 INLINE
-void triangleSetupABIntVertical(const simd16scalari vX[3], const simd16scalari vY[3], simd16scalari(&vA)[3], simd16scalari(&vB)[3])
+void triangleSetupABIntVertical(const simd16scalari vX[3],
+                                const simd16scalari vY[3],
+                                simd16scalari (&vA)[3],
+                                simd16scalari (&vB)[3])
 {
     // A = y0 - y1
     // B = x1 - x0
@@ -112,7 +118,7 @@
 // Px = x0-x2, Py = y0-y2
 // Qx = x1-x2, Qy = y1-y2
 //       |Px Qx|
-// det = |     | = PxQy - PyQx 
+// det = |     | = PxQy - PyQx
 //       |Py Qy|
 // simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2)
 //               try to reuse our A & B coef's already calculated. factor out a -1 from Py and Qx
@@ -127,37 +133,39 @@
     // vBShuf = [B2, B0, B1, B0]
     __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2));
     // vMul = [A1*B2, B1*A2]
-    __m128i vMul   = _mm_mul_epi32(vAShuf, vBShuf);
+    __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf);
 
     // shuffle upper to lower
     // vMul2 = [B1*A2, B1*A2]
     __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2));
-    //vMul = [A1*B2 - B1*A2]
+    // vMul = [A1*B2 - B1*A2]
     vMul = _mm_sub_epi64(vMul, vMul2);
 
     int64_t result;
     _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
 
     double dResult = (double)result;
-    dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
+    dResult        = dResult * (1.0 / FIXED_POINT16_SCALE);
 
     return (float)dResult;
 }
 
 INLINE
-void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet)
+void calcDeterminantIntVertical(const simdscalari vA[3],
+                                const simdscalari vB[3],
+                                simdscalari*      pvDet)
 {
     // refer to calcDeterminantInt comment for calculation explanation
 
     // A1*B2
-    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]);     // 0 0 1 1 4 4 5 5
-    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]);     // 2 2 3 3 6 6 7 7
+    simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5
+    simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7
 
     simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]);
     simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]);
 
-    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo);        // 0 1 4 5
-    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi);        // 2 3 6 7
+    simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5
+    simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7
 
     // B1*A2
     simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]);
@@ -185,19 +193,22 @@
 
 #if ENABLE_AVX512_SIMD16
 INLINE
-void calcDeterminantIntVertical(const simd16scalari vA[3], const simd16scalari vB[3], simd16scalari *pvDet)
+void calcDeterminantIntVertical(const simd16scalari vA[3],
+                                const simd16scalari vB[3],
+                                simd16scalari*      pvDet)
 {
     // refer to calcDeterminantInt comment for calculation explanation
 
     // A1*B2
-    simd16scalari vA1_lo = _simd16_unpacklo_epi32(vA[1], vA[1]);                // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
-    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]);                // X 2 X 3 X 6 X 7 X A X B X E X F
+    simd16scalari vA1_lo =
+        _simd16_unpacklo_epi32(vA[1], vA[1]); // X 0 X 1 X 4 X 5 X 8 X 9 X C X D (32b)
+    simd16scalari vA1_hi = _simd16_unpackhi_epi32(vA[1], vA[1]); // X 2 X 3 X 6 X 7 X A X B X E X F
 
     simd16scalari vB2_lo = _simd16_unpacklo_epi32(vB[2], vB[2]);
     simd16scalari vB2_hi = _simd16_unpackhi_epi32(vB[2], vB[2]);
 
-    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo);                 // 0 1 4 5 8 9 C D (64b)
-    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi);                 // 2 3 6 7 A B E F
+    simd16scalari vA1B2_lo = _simd16_mul_epi32(vA1_lo, vB2_lo); // 0 1 4 5 8 9 C D (64b)
+    simd16scalari vA1B2_hi = _simd16_mul_epi32(vA1_hi, vB2_hi); // 2 3 6 7 A B E F
 
     // B1*A2
     simd16scalari vA2_lo = _simd16_unpacklo_epi32(vA[2], vA[2]);
@@ -210,32 +221,31 @@
     simd16scalari vA2B1_hi = _simd16_mul_epi32(vA2_hi, vB1_hi);
 
     // A1*B2 - A2*B1
-    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo);               // 0 1 4 5 8 9 C D (64b)
-    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi);               // 2 3 6 7 A B E F
+    simd16scalari difflo = _simd16_sub_epi64(vA1B2_lo, vA2B1_lo); // 0 1 4 5 8 9 C D (64b)
+    simd16scalari diffhi = _simd16_sub_epi64(vA1B2_hi, vA2B1_hi); // 2 3 6 7 A B E F
 
     // (1, 0, 1, 0) = 01 00 01 00 = 0x44, (3, 2, 3, 2) = 11 10 11 10 = 0xEE
-    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44);       // 0 1 4 5 2 3 6 7 (64b)
-    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE);       // 8 9 C D A B E F
+    simd16scalari templo = _simd16_permute2f128_si(difflo, diffhi, 0x44); // 0 1 4 5 2 3 6 7 (64b)
+    simd16scalari temphi = _simd16_permute2f128_si(difflo, diffhi, 0xEE); // 8 9 C D A B E F
 
     // (3, 1, 2, 0) = 11 01 10 00 = 0xD8
-    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8);                   // 0 1 2 3 4 5 6 7 (64b)
-    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8);                   // 8 9 A B C D E F
+    pvDet[0] = _simd16_permute2f128_si(templo, templo, 0xD8); // 0 1 2 3 4 5 6 7 (64b)
+    pvDet[1] = _simd16_permute2f128_si(temphi, temphi, 0xD8); // 8 9 A B C D E F
 }
 
 #endif
 INLINE
-void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC)
+void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128& vB, __m128& vC)
 {
     // C = -Ax - By
-    vC  = _mm_mul_ps(vA, vX);
-    __m128 vCy = _mm_mul_ps(vB, vY);    
-    vC  = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
-    vC  = _mm_sub_ps(vC, vCy);
+    vC         = _mm_mul_ps(vA, vX);
+    __m128 vCy = _mm_mul_ps(vB, vY);
+    vC         = _mm_mul_ps(vC, _mm_set1_ps(-1.0f));
+    vC         = _mm_sub_ps(vC, vCy);
 }
 
-template<uint32_t NumVerts>
-INLINE
-void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
+template <uint32_t NumVerts>
+INLINE void viewportTransform(simdvector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
 {
     simdscalar m00 = _simd_load1_ps(&vpMatrices.m00[0]);
     simdscalar m30 = _simd_load1_ps(&vpMatrices.m30[0]);
@@ -253,9 +263,8 @@
 }
 
 #if USE_SIMD16_FRONTEND
-template<uint32_t NumVerts>
-INLINE
-void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices)
+template <uint32_t NumVerts>
+INLINE void viewportTransform(simd16vector* v, const SWR_VIEWPORT_MATRICES& vpMatrices)
 {
     const simd16scalar m00 = _simd16_broadcast_ss(&vpMatrices.m00[0]);
     const simd16scalar m30 = _simd16_broadcast_ss(&vpMatrices.m30[0]);
@@ -273,9 +282,10 @@
 }
 
 #endif
-template<uint32_t NumVerts>
-INLINE
-void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari const &vViewportIdx)
+template <uint32_t NumVerts>
+INLINE void viewportTransform(simdvector*                  v,
+                              const SWR_VIEWPORT_MATRICES& vpMatrices,
+                              simdscalari const&           vViewportIdx)
 {
     // perform a gather of each matrix element based on the viewport array indexes
     simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
@@ -294,9 +304,10 @@
 }
 
 #if USE_SIMD16_FRONTEND
-template<uint32_t NumVerts>
-INLINE
-void viewportTransform(simd16vector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simd16scalari const &vViewportIdx)
+template <uint32_t NumVerts>
+INLINE void viewportTransform(simd16vector*                v,
+                              const SWR_VIEWPORT_MATRICES& vpMatrices,
+                              simd16scalari const&         vViewportIdx)
 {
     // perform a gather of each matrix element based on the viewport array indexes
     const simd16scalar m00 = _simd16_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 4);
@@ -316,7 +327,7 @@
 
 #endif
 INLINE
-void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, SWR_RECT &bbox)
+void calcBoundingBoxInt(const __m128i& vX, const __m128i& vY, SWR_RECT& bbox)
 {
     // Need horizontal fp min here
     __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1));
@@ -325,18 +336,17 @@
     __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1));
     __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2));
 
-
     __m128i vMinX = _mm_min_epi32(vX, vX1);
-            vMinX = _mm_min_epi32(vMinX, vX2);
+    vMinX         = _mm_min_epi32(vMinX, vX2);
 
     __m128i vMaxX = _mm_max_epi32(vX, vX1);
-            vMaxX = _mm_max_epi32(vMaxX, vX2);
+    vMaxX         = _mm_max_epi32(vMaxX, vX2);
 
     __m128i vMinY = _mm_min_epi32(vY, vY1);
-            vMinY = _mm_min_epi32(vMinY, vY2);
+    vMinY         = _mm_min_epi32(vMinY, vY2);
 
     __m128i vMaxY = _mm_max_epi32(vY, vY1);
-            vMaxY = _mm_max_epi32(vMaxY, vY2);
+    vMaxY         = _mm_max_epi32(vMaxY, vY2);
 
     bbox.xmin = _mm_extract_epi32(vMinX, 0);
     bbox.xmax = _mm_extract_epi32(vMaxX, 0);
@@ -345,54 +355,84 @@
 }
 
 INLINE
-bool CanUseSimplePoints(DRAW_CONTEXT *pDC)
+bool CanUseSimplePoints(DRAW_CONTEXT* pDC)
 {
     const API_STATE& state = GetApiState(pDC);
 
     return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X &&
-            state.rastState.pointSize == 1.0f &&
-            !state.rastState.pointParam &&
-            !state.rastState.pointSpriteEnable &&
-            !state.backendState.clipDistanceMask);
+            state.rastState.pointSize == 1.0f && !state.rastState.pointParam &&
+            !state.rastState.pointSpriteEnable && !state.backendState.clipDistanceMask);
 }
 
 INLINE
 bool vHasNaN(const __m128& vec)
 {
-    const __m128 result = _mm_cmpunord_ps(vec, vec);
-    const int32_t mask = _mm_movemask_ps(result);
+    const __m128  result = _mm_cmpunord_ps(vec, vec);
+    const int32_t mask   = _mm_movemask_ps(result);
     return (mask != 0);
 }
 
 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements);
 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts);
 
-
 // ProcessDraw front-end function.  All combinations of parameter values are available
-PFN_FE_WORK_FUNC GetProcessDrawFunc(
-    bool IsIndexed,
-    bool IsCutIndexEnabled,
-    bool HasTessellation,
-    bool HasGeometryShader,
-    bool HasStreamOut,
-    bool HasRasterization);
+PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
+                                    bool IsCutIndexEnabled,
+                                    bool HasTessellation,
+                                    bool HasGeometryShader,
+                                    bool HasStreamOut,
+                                    bool HasRasterization);
 
-void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
+void ProcessStoreTiles(SWR_CONTEXT*  pContext,
+                       DRAW_CONTEXT* pDC,
+                       uint32_t      workerId,
+                       void*         pUserData);
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT*  pContext,
+                                   DRAW_CONTEXT* pDC,
+                                   uint32_t      workerId,
+                                   void*         pUserData);
+void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
+void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData);
 
 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
 #if USE_SIMD16_FRONTEND
 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
 #endif
 
-struct PA_STATE_BASE;  // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari const &primID, simdscalari const &viewportIdx, simdscalari const &rtIdx);
+struct PA_STATE_BASE; // forward decl
+void BinPoints(DRAW_CONTEXT*      pDC,
+               PA_STATE&          pa,
+               uint32_t           workerId,
+               simdvector         prims[3],
+               uint32_t           primMask,
+               simdscalari const& primID,
+               simdscalari const& viewportIdx,
+               simdscalari const& rtIdx);
+void BinLines(DRAW_CONTEXT*      pDC,
+              PA_STATE&          pa,
+              uint32_t           workerId,
+              simdvector         prims[3],
+              uint32_t           primMask,
+              simdscalari const& primID,
+              simdscalari const& viewportIdx,
+              simdscalari const& rtIdx);
 #if USE_SIMD16_FRONTEND
-void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
-void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari const &primID, simd16scalari const &viewportIdx, simd16scalari const &rtIdx);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT*        pDC,
+                               PA_STATE&            pa,
+                               uint32_t             workerId,
+                               simd16vector         prims[3],
+                               uint32_t             primMask,
+                               simd16scalari const& primID,
+                               simd16scalari const& viewportIdx,
+                               simd16scalari const& rtIdx);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT*        pDC,
+                              PA_STATE&            pa,
+                              uint32_t             workerId,
+                              simd16vector         prims[3],
+                              uint32_t             primMask,
+                              simd16scalari const& primID,
+                              simd16scalari const& viewportIdx,
+                              simd16scalari const& rtIdx);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index d88a3aa..b52accb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -1,48 +1,48 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file knobs.h
-*
-* @brief Static (Compile-Time) Knobs for Core.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file knobs.h
+ *
+ * @brief Static (Compile-Time) Knobs for Core.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <stdint.h>
 #include <gen_knobs.h>
 
-#define KNOB_ARCH_AVX    0
-#define KNOB_ARCH_AVX2   1
+#define KNOB_ARCH_AVX 0
+#define KNOB_ARCH_AVX2 1
 #define KNOB_ARCH_AVX512 2
 
 ///////////////////////////////////////////////////////////////////////////////
 // AVX512 Support
 ///////////////////////////////////////////////////////////////////////////////
 
-#define ENABLE_AVX512_SIMD16    1
-#define USE_8x2_TILE_BACKEND    1
-#define USE_SIMD16_FRONTEND     1
-#define USE_SIMD16_SHADERS      1   // requires USE_SIMD16_FRONTEND
-#define USE_SIMD16_VS           1   // requires USE_SIMD16_SHADERS
+#define ENABLE_AVX512_SIMD16 1
+#define USE_8x2_TILE_BACKEND 1
+#define USE_SIMD16_FRONTEND 1
+#define USE_SIMD16_SHADERS 1 // requires USE_SIMD16_FRONTEND
+#define USE_SIMD16_VS 1      // requires USE_SIMD16_SHADERS
 
 ///////////////////////////////////////////////////////////////////////////////
 // Architecture validation
@@ -89,49 +89,49 @@
 // Configuration knobs
 ///////////////////////////////////////////////////////////////////////////////
 // Maximum supported number of active vertex buffer streams
-#define KNOB_NUM_STREAMS                    32
+#define KNOB_NUM_STREAMS 32
 
 // Maximum supported active viewports and scissors
-#define KNOB_NUM_VIEWPORTS_SCISSORS         16
+#define KNOB_NUM_VIEWPORTS_SCISSORS 16
 
 // Guardband range used by the clipper
-#define KNOB_GUARDBAND_WIDTH                32768.0f
-#define KNOB_GUARDBAND_HEIGHT               32768.0f
+#define KNOB_GUARDBAND_WIDTH 32768.0f
+#define KNOB_GUARDBAND_HEIGHT 32768.0f
 
 ///////////////////////////////
 // Macro tile configuration
 ///////////////////////////////
 
 // raster tile dimensions
-#define KNOB_TILE_X_DIM                      8
-#define KNOB_TILE_X_DIM_SHIFT                3
-#define KNOB_TILE_Y_DIM                      8
-#define KNOB_TILE_Y_DIM_SHIFT                3
+#define KNOB_TILE_X_DIM 8
+#define KNOB_TILE_X_DIM_SHIFT 3
+#define KNOB_TILE_Y_DIM 8
+#define KNOB_TILE_Y_DIM_SHIFT 3
 
-// fixed macrotile pixel dimension for now, eventually will be 
+// fixed macrotile pixel dimension for now, eventually will be
 // dynamically set based on tile format and pixel size
-#define KNOB_MACROTILE_X_DIM                32
-#define KNOB_MACROTILE_Y_DIM                32
-#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT    13
-#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT    13
-#define KNOB_MACROTILE_X_DIM_FIXED          (KNOB_MACROTILE_X_DIM << 8)
-#define KNOB_MACROTILE_Y_DIM_FIXED          (KNOB_MACROTILE_Y_DIM << 8)
-#define KNOB_MACROTILE_X_DIM_IN_TILES       (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
-#define KNOB_MACROTILE_Y_DIM_IN_TILES       (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
+#define KNOB_MACROTILE_X_DIM 32
+#define KNOB_MACROTILE_Y_DIM 32
+#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 13
+#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 13
+#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8)
+#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8)
+#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT)
+#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT)
 
 // total # of hot tiles available. This should be enough to
 // fully render a 16kx16k 128bpp render target
-#define KNOB_NUM_HOT_TILES_X                 256
-#define KNOB_NUM_HOT_TILES_Y                 256
-#define KNOB_COLOR_HOT_TILE_FORMAT           R32G32B32A32_FLOAT
-#define KNOB_DEPTH_HOT_TILE_FORMAT           R32_FLOAT
-#define KNOB_STENCIL_HOT_TILE_FORMAT         R8_UINT
+#define KNOB_NUM_HOT_TILES_X 256
+#define KNOB_NUM_HOT_TILES_Y 256
+#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT
+#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT
+#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT
 
 // Max scissor rectangle
-#define KNOB_MAX_SCISSOR_X                  KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM
-#define KNOB_MAX_SCISSOR_Y                  KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM
+#define KNOB_MAX_SCISSOR_X (KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM) // parenthesized product
+#define KNOB_MAX_SCISSOR_Y (KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM) // parenthesized product
 
-#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4
+#if KNOB_SIMD_WIDTH == 8 && KNOB_TILE_X_DIM < 4
 #error "incompatible width/tile dimensions"
 #endif
 
@@ -160,14 +160,14 @@
 ///////////////////////////////////////////////////////////////////////////////
 // Optimization knobs
 ///////////////////////////////////////////////////////////////////////////////
-#define KNOB_USE_FAST_SRGB                     TRUE
+#define KNOB_USE_FAST_SRGB TRUE
 
 // enables cut-aware primitive assembler
-#define KNOB_ENABLE_CUT_AWARE_PA               TRUE
+#define KNOB_ENABLE_CUT_AWARE_PA TRUE
 
 // enables early rasterization (useful for small triangles)
 #if !defined(KNOB_ENABLE_EARLY_RAST)
-#define KNOB_ENABLE_EARLY_RAST                 1
+#define KNOB_ENABLE_EARLY_RAST 1
 #endif
 
 #if KNOB_ENABLE_EARLY_RAST
@@ -182,6 +182,5 @@
 
 // Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs.
 #if !defined(KNOB_ENABLE_TOSS_POINTS)
-#define KNOB_ENABLE_TOSS_POINTS                 0
+#define KNOB_ENABLE_TOSS_POINTS 0
 #endif
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index 12c2a30..f8797a8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file knobs_init.h
-*
-* @brief Dynamic Knobs Initialization for Core.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file knobs_init.h
+ *
+ * @brief Dynamic Knobs Initialization for Core.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <core/knobs.h>
@@ -37,9 +37,9 @@
 template <typename T>
 static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue)
 {
-    uint32_t value = 0;
-    char* pStopped = nullptr;
-    value = strtoul(pOverride, &pStopped, 0);
+    uint32_t value    = 0;
+    char*    pStopped = nullptr;
+    value             = strtoul(pOverride, &pStopped, 0);
     if (pStopped != pOverride)
     {
         knobValue = static_cast<T>(value);
@@ -65,9 +65,9 @@
     }
 
     // Try converting to a number and casting to bool
-    uint32_t value = 0;
-    char* pStopped = nullptr;
-    value = strtoul(pOverride, &pStopped, 0);
+    uint32_t value    = 0;
+    char*    pStopped = nullptr;
+    value             = strtoul(pOverride, &pStopped, 0);
     if (pStopped != pOverride)
     {
         knobValue = value != 0;
diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h
index 2ca8c1b..3b23974 100644
--- a/src/gallium/drivers/swr/rasterizer/core/multisample.h
+++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h
@@ -1,28 +1,28 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file multisample.h
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file multisample.h
+ *
+ ******************************************************************************/
 
 #pragma once
 
@@ -36,225 +36,387 @@
 INLINE
 SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples)
 {
-    switch(numSamples)
+    switch (numSamples) // translate a raw sample count into its SWR_MULTISAMPLE_COUNT value
     {
-    case 1: return SWR_MULTISAMPLE_1X;
-    case 2: return SWR_MULTISAMPLE_2X;
-    case 4: return SWR_MULTISAMPLE_4X;
-    case 8: return SWR_MULTISAMPLE_8X;
-    case 16: return SWR_MULTISAMPLE_16X;
-    default: assert(0); return SWR_MULTISAMPLE_1X;
+    case 1:
+        return SWR_MULTISAMPLE_1X;
+    case 2:
+        return SWR_MULTISAMPLE_2X;
+    case 4:
+        return SWR_MULTISAMPLE_4X;
+    case 8:
+        return SWR_MULTISAMPLE_8X;
+    case 16:
+        return SWR_MULTISAMPLE_16X;
+    default: // any count other than 1, 2, 4, 8 or 16
+        assert(0); // trips when assertions are enabled
+        return SWR_MULTISAMPLE_1X; // fallback value when assert() is compiled out
     }
 }
 
 // hardcoded offsets based on Direct3d standard multisample positions
 // 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner
 // coords are 0.8 fixed point offsets from (0, 0)
-template<SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
+template <SWR_MULTISAMPLE_COUNT sampleCount, bool isCenter = false>
 struct MultisampleTraits
 {
-    INLINE static float X(uint32_t sampleNum) = delete;
-    INLINE static float Y(uint32_t sampleNum) = delete;
-    INLINE static simdscalari FullSampleMask() = delete;
+    INLINE static float       X(uint32_t sampleNum) = delete;
+    INLINE static float       Y(uint32_t sampleNum) = delete;
+    INLINE static simdscalari FullSampleMask()      = delete;
 
     static const uint32_t numSamples = 0;
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_1X, false>
 {
-    INLINE static float X(uint32_t sampleNum) {return samplePosX[sampleNum];};
-    INLINE static float Y(uint32_t sampleNum) {return samplePosY[sampleNum];};
-    INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
+    INLINE static float       X(uint32_t sampleNum) { return samplePosX[sampleNum]; };
+    INLINE static float       Y(uint32_t sampleNum) { return samplePosY[sampleNum]; };
+    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
 
-    static const uint32_t numSamples = 1;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
-    static constexpr uint32_t samplePosXi[1] = { 0x80 };
-    static constexpr uint32_t samplePosYi[1] = { 0x80 };
-    static constexpr float samplePosX[1] = { 0.5f };
-    static constexpr float samplePosY[1] = { 0.5f };
+    static const uint32_t              numSamples         = 1;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
+    static constexpr uint32_t          samplePosXi[1]     = {0x80};
+    static constexpr uint32_t          samplePosYi[1]     = {0x80};
+    static constexpr float             samplePosX[1]      = {0.5f};
+    static constexpr float             samplePosY[1]      = {0.5f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_1X, true>
 {
-    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
-    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
-    INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);};
-    
-    static const uint32_t numSamples = 1;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_1X;
-    static constexpr uint32_t samplePosXi[1] = { 0x80 };
-    static constexpr uint32_t samplePosYi[1] = { 0x80 };
-    static constexpr float samplePosX[1] = { 0.5f };
-    static constexpr float samplePosY[1] = { 0.5f };
+    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
+    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
+    INLINE static simdscalari FullSampleMask() { return _simd_set1_epi32(0x1); };
+
+    static const uint32_t              numSamples         = 1;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_1X;
+    static constexpr uint32_t          samplePosXi[1]     = {0x80};
+    static constexpr uint32_t          samplePosYi[1]     = {0x80};
+    static constexpr float             samplePosX[1]      = {0.5f};
+    static constexpr float             samplePosY[1]      = {0.5f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_2X, false>
 {
-    INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
-    INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    };
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    };
     INLINE static simdscalari FullSampleMask()
     {
-         static const simdscalari mask =_simd_set1_epi32(0x3);
-         return mask;
+        static const simdscalari mask = _simd_set1_epi32(0x3);
+        return mask;
     }
 
-    static const uint32_t numSamples = 2;
-    static const uint32_t numCoverageSamples = 2;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
-    static constexpr uint32_t samplePosXi[2] = { 0xC0, 0x40 };
-    static constexpr uint32_t samplePosYi[2] = { 0xC0, 0x40 };
-    static constexpr float samplePosX[2] = {0.75f, 0.25f};
-    static constexpr float samplePosY[2] = {0.75f, 0.25f};
+    static const uint32_t              numSamples         = 2;
+    static const uint32_t              numCoverageSamples = 2;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
+    static constexpr uint32_t          samplePosXi[2]     = {0xC0, 0x40};
+    static constexpr uint32_t          samplePosYi[2]     = {0xC0, 0x40};
+    static constexpr float             samplePosX[2]      = {0.75f, 0.25f};
+    static constexpr float             samplePosY[2]      = {0.75f, 0.25f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_2X, true>
 {
-    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
-    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
+    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
     INLINE static simdscalari FullSampleMask()
     {
-         static const simdscalari mask =_simd_set1_epi32(0x3);
-         return mask;
+        static const simdscalari mask = _simd_set1_epi32(0x3);
+        return mask;
     }
-    static const uint32_t numSamples = 2;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_2X;
-    static constexpr uint32_t samplePosXi[2] = { 0x80 , 0x80 };
-    static constexpr uint32_t samplePosYi[2] = { 0x80 , 0x80 };
-    static constexpr float samplePosX[2] = { 0.5f, 0.5f };
-    static constexpr float samplePosY[2] = { 0.5f, 0.5f };
+    static const uint32_t              numSamples         = 2;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_2X;
+    static constexpr uint32_t          samplePosXi[2]     = {0x80, 0x80};
+    static constexpr uint32_t          samplePosYi[2]     = {0x80, 0x80};
+    static constexpr float             samplePosX[2]      = {0.5f, 0.5f};
+    static constexpr float             samplePosY[2]      = {0.5f, 0.5f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_4X, false>
 {
-    INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
-    INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    };
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xF);
         return mask;
     }
 
-    static const uint32_t numSamples = 4;
-    static const uint32_t numCoverageSamples = 4;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
-    static constexpr uint32_t samplePosXi[4] = { 0x60, 0xE0, 0x20, 0xA0 };
-    static constexpr uint32_t samplePosYi[4] = { 0x20, 0x60, 0xA0, 0xE0 };
-    static constexpr float samplePosX[4] = { 0.375f, 0.875f, 0.125f, 0.625f };
-    static constexpr float samplePosY[4] = { 0.125f, 0.375f, 0.625f, 0.875f };
+    static const uint32_t              numSamples         = 4;
+    static const uint32_t              numCoverageSamples = 4;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_4X;
+    static constexpr uint32_t          samplePosXi[4]     = {0x60, 0xE0, 0x20, 0xA0};
+    static constexpr uint32_t          samplePosYi[4]     = {0x20, 0x60, 0xA0, 0xE0};
+    static constexpr float             samplePosX[4]      = {0.375f, 0.875f, 0.125f, 0.625f};
+    static constexpr float             samplePosY[4]      = {0.125f, 0.375f, 0.625f, 0.875f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_4X, true>
 {
-    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
-    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
+    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xF);
         return mask;
     }
 
-    static const uint32_t numSamples = 4;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_4X;
-    static constexpr uint32_t samplePosXi[4] = { 0x80, 0x80, 0x80, 0x80 };
-    static constexpr uint32_t samplePosYi[4] = { 0x80, 0x80, 0x80, 0x80 };
-    static constexpr float samplePosX[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
-    static constexpr float samplePosY[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
+    static const uint32_t              numSamples         = 4;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_4X;
+    static constexpr uint32_t          samplePosXi[4]     = {0x80, 0x80, 0x80, 0x80};
+    static constexpr uint32_t          samplePosYi[4]     = {0x80, 0x80, 0x80, 0x80};
+    static constexpr float             samplePosX[4]      = {0.5f, 0.5f, 0.5f, 0.5f};
+    static constexpr float             samplePosY[4]      = {0.5f, 0.5f, 0.5f, 0.5f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_8X, false>
 {
-    INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
-    INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    };
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFF);
         return mask;
     }
 
-    static const uint32_t numSamples = 8;
-    static const uint32_t numCoverageSamples = 8;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
-    static constexpr uint32_t samplePosXi[8] = { 0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0 };
-    static constexpr uint32_t samplePosYi[8] = { 0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10 };
-    static constexpr float samplePosX[8] = { 0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f };
-    static constexpr float samplePosY[8] = { 0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f };
+    static const uint32_t              numSamples         = 8;
+    static const uint32_t              numCoverageSamples = 8;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_8X;
+    static constexpr uint32_t samplePosXi[8] = {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0};
+    static constexpr uint32_t samplePosYi[8] = {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10};
+    static constexpr float    samplePosX[8]  = {
+        0.5625f, 0.4375f, 0.8125f, 0.3125f, 0.1875f, 0.0625f, 0.6875f, 0.9375f};
+    static constexpr float samplePosY[8] = {
+        0.3125f, 0.6875f, 0.5625f, 0.1875f, 0.8125f, 0.4375f, 0.9375f, 0.0625f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_8X, true>
 {
-    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
-    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
+    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFF);
         return mask;
     }
-    static const uint32_t numSamples = 8;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_8X;
-    static constexpr uint32_t samplePosXi[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-    static constexpr uint32_t samplePosYi[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-    static constexpr float samplePosX[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
-    static constexpr float samplePosY[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
+    static const uint32_t              numSamples         = 8;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_8X;
+    static constexpr uint32_t samplePosXi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+    static constexpr uint32_t samplePosYi[8] = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
+    static constexpr float    samplePosX[8]  = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
+    static constexpr float    samplePosY[8]  = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_16X, false>
 {
-    INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; };
-    INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; };
+    INLINE static float X(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosX[sampleNum];
+    };
+    INLINE static float Y(uint32_t sampleNum)
+    {
+        SWR_ASSERT(sampleNum < numSamples);
+        return samplePosY[sampleNum];
+    };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFFFF);
         return mask;
     }
 
-    static const uint32_t numSamples = 16;
-    static const uint32_t numCoverageSamples = 16;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
-    static constexpr uint32_t samplePosXi[16] = { 0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10 };
-    static constexpr uint32_t samplePosYi[16] = { 0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00 };
-    static constexpr float samplePosX[16] = { 0.5625f, 0.4375f, 0.3125f, 0.7500f, 0.1875f, 0.6250f, 0.8125f, 0.6875f, 0.3750f, 0.5000f, 0.2500f, 0.1250f, 0.0000f, 0.9375f, 0.8750f, 0.0625f };
-    static constexpr float samplePosY[16] = { 0.5625f, 0.3125f, 0.6250f, 0.4375f, 0.3750f, 0.8125f, 0.6875f, 0.1875f, 0.8750f, 0.0625f, 0.1250f, 0.7500f, 0.5000f, 0.2500f, 0.9375f, 0.0000f };
+    static const uint32_t              numSamples         = 16;
+    static const uint32_t              numCoverageSamples = 16;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_16X;
+    static constexpr uint32_t          samplePosXi[16]    = {0x90,
+                                                 0x70,
+                                                 0x50,
+                                                 0xC0,
+                                                 0x30,
+                                                 0xA0,
+                                                 0xD0,
+                                                 0xB0,
+                                                 0x60,
+                                                 0x80,
+                                                 0x40,
+                                                 0x20,
+                                                 0x00,
+                                                 0xF0,
+                                                 0xE0,
+                                                 0x10};
+    static constexpr uint32_t          samplePosYi[16]    = {0x90,
+                                                 0x50,
+                                                 0xA0,
+                                                 0x70,
+                                                 0x60,
+                                                 0xD0,
+                                                 0xB0,
+                                                 0x30,
+                                                 0xE0,
+                                                 0x10,
+                                                 0x20,
+                                                 0xC0,
+                                                 0x80,
+                                                 0x40,
+                                                 0xF0,
+                                                 0x00};
+    static constexpr float             samplePosX[16]     = {0.5625f,
+                                             0.4375f,
+                                             0.3125f,
+                                             0.7500f,
+                                             0.1875f,
+                                             0.6250f,
+                                             0.8125f,
+                                             0.6875f,
+                                             0.3750f,
+                                             0.5000f,
+                                             0.2500f,
+                                             0.1250f,
+                                             0.0000f,
+                                             0.9375f,
+                                             0.8750f,
+                                             0.0625f};
+    static constexpr float             samplePosY[16]     = {0.5625f,
+                                             0.3125f,
+                                             0.6250f,
+                                             0.4375f,
+                                             0.3750f,
+                                             0.8125f,
+                                             0.6875f,
+                                             0.1875f,
+                                             0.8750f,
+                                             0.0625f,
+                                             0.1250f,
+                                             0.7500f,
+                                             0.5000f,
+                                             0.2500f,
+                                             0.9375f,
+                                             0.0000f};
 };
 
-template<>
+template <>
 struct MultisampleTraits<SWR_MULTISAMPLE_16X, true>
 {
-    INLINE static float X(uint32_t sampleNum) {return 0.5f;};
-    INLINE static float Y(uint32_t sampleNum) {return 0.5f;};
+    INLINE static float       X(uint32_t sampleNum) { return 0.5f; };
+    INLINE static float       Y(uint32_t sampleNum) { return 0.5f; };
     INLINE static simdscalari FullSampleMask()
     {
         static const simdscalari mask = _simd_set1_epi32(0xFFFF);
         return mask;
     }
-    static const uint32_t numSamples = 16;
-    static const uint32_t numCoverageSamples = 1;
-    static const SWR_MULTISAMPLE_COUNT sampleCount = SWR_MULTISAMPLE_16X;
-    static constexpr uint32_t samplePosXi[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-    static constexpr uint32_t samplePosYi[16] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-    static constexpr float samplePosX[16] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
-    static constexpr float samplePosY[16] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
+    static const uint32_t              numSamples         = 16;
+    static const uint32_t              numCoverageSamples = 1;
+    static const SWR_MULTISAMPLE_COUNT sampleCount        = SWR_MULTISAMPLE_16X;
+    static constexpr uint32_t          samplePosXi[16]    = {0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80};
+    static constexpr uint32_t          samplePosYi[16]    = {0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80,
+                                                 0x80};
+    static constexpr float             samplePosX[16]     = {0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f};
+    static constexpr float             samplePosY[16]     = {0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f,
+                                             0.5f};
 };
 
 INLINE
-bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount, const SWR_MULTISAMPLE_POS& samplePos)
+bool isNonStandardPattern(const SWR_MULTISAMPLE_COUNT sampleCount,
+                          const SWR_MULTISAMPLE_POS&  samplePos)
 {
     // detect if we're using standard or center sample patterns
     const uint32_t *standardPosX, *standardPosY;
-    switch(sampleCount)
+    switch (sampleCount)
     {
     case SWR_MULTISAMPLE_1X:
         standardPosX = MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosXi;
@@ -281,15 +443,15 @@
     }
 
     // scan sample pattern for standard or center
-    uint32_t numSamples = GetNumSamples(sampleCount);
-    bool bIsStandard = true;
-    if(numSamples > 1)
+    uint32_t numSamples  = GetNumSamples(sampleCount);
+    bool     bIsStandard = true;
+    if (numSamples > 1)
     {
-        for(uint32_t i = 0; i < numSamples; i++)
+        for (uint32_t i = 0; i < numSamples; i++)
         {
-            bIsStandard = (standardPosX[i] == samplePos.Xi(i)) ||
-                (standardPosY[i] == samplePos.Yi(i));
-            if(!bIsStandard)
+            bIsStandard =
+                (standardPosX[i] == samplePos.Xi(i)) || (standardPosY[i] == samplePos.Yi(i));
+            if (!bIsStandard)
                 break;
         }
     }
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index ed644c0..e19c8ea 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -1,33 +1,33 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file pa.h
-*
-* @brief Definitions for primitive assembly.
-*        N primitives are assembled at a time, where N is the SIMD width.
-*        A state machine, that is specific for a given topology, drives the
-*        assembly of vertices into triangles.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file pa.h
+ *
+ * @brief Definitions for primitive assembly.
+ *        N primitives are assembled at a time, where N is the SIMD width.
+ *        A state machine, that is specific for a given topology, drives the
+ *        assembly of vertices into triangles.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "frontend.h"
@@ -42,13 +42,13 @@
         SIMD_WIDTH_LOG2 = 4
     };
 
-    typedef         simd16mask          SIMDMASK;
+    typedef simd16mask SIMDMASK;
 
-    typedef         simd16scalar        SIMDSCALAR;
-    typedef         simd16vector        SIMDVECTOR;
-    typedef         simd16vertex        SIMDVERTEX;
+    typedef simd16scalar SIMDSCALAR;
+    typedef simd16vector SIMDVECTOR;
+    typedef simd16vertex SIMDVERTEX;
 
-    typedef         simd16scalari       SIMDSCALARI;
+    typedef simd16scalari SIMDSCALARI;
 
 #else
     enum
@@ -58,36 +58,45 @@
         SIMD_WIDTH_LOG2 = 3
     };
 
-    typedef         simdmask            SIMDMASK;
+    typedef simdmask SIMDMASK;
 
-    typedef         simdscalar          SIMDSCALAR;
-    typedef         simdvector          SIMDVECTOR;
-    typedef         simdvertex          SIMDVERTEX;
+    typedef simdscalar SIMDSCALAR;
+    typedef simdvector SIMDVECTOR;
+    typedef simdvertex SIMDVERTEX;
 
-    typedef         simdscalari         SIMDSCALARI;
+    typedef simdscalari SIMDSCALARI;
 
 #endif
-    DRAW_CONTEXT *pDC{ nullptr };       // draw context
-    uint8_t* pStreamBase{ nullptr };    // vertex stream
-    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
-    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units
+    DRAW_CONTEXT* pDC{nullptr};         // draw context
+    uint8_t*      pStreamBase{nullptr}; // vertex stream
+    uint32_t      streamSizeInVerts{0}; // total size of the input stream in verts
+    uint32_t      vertexStride{0};      // stride of a vertex in simdvector units
 
-    // The topology the binner will use. In some cases the FE changes the topology from the api state.
-    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
+    // The topology the binner will use. In some cases the FE changes the topology from the api
+    // state.
+    PRIMITIVE_TOPOLOGY binTopology{TOP_UNKNOWN};
 
 #if ENABLE_AVX512_SIMD16
-    bool useAlternateOffset{ false };
+    bool useAlternateOffset{false};
 #endif
 
-    bool viewportArrayActive{ false };
-    bool rtArrayActive { false };
-    uint32_t numVertsPerPrim{ 0 };
+    bool     viewportArrayActive{false};
+    bool     rtArrayActive{false};
+    uint32_t numVertsPerPrim{0};
 
-    PA_STATE(){}
-    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
-        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}
+    PA_STATE() {}
+    PA_STATE(DRAW_CONTEXT* in_pDC,
+             uint8_t*      in_pStreamBase,
+             uint32_t      in_streamSizeInVerts,
+             uint32_t      in_vertexStride,
+             uint32_t      in_numVertsPerPrim) :
+        pDC(in_pDC),
+        pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts),
+        vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim)
+    {
+    }
 
-    virtual bool HasWork() = 0;
+    virtual bool        HasWork()                                    = 0;
     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
 #if ENABLE_AVX512_SIMD16
     virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
@@ -96,14 +105,14 @@
 #if ENABLE_AVX512_SIMD16
     virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
 #endif
-    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
-    virtual bool NextPrim() = 0;
-    virtual SIMDVERTEX& GetNextVsOutput() = 0;
-    virtual bool GetNextStreamOutput() = 0;
-    virtual SIMDMASK& GetNextVsIndices() = 0;
-    virtual uint32_t NumPrims() = 0;
-    virtual void Reset() = 0;
-    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
+    virtual void        AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
+    virtual bool        NextPrim()                                                             = 0;
+    virtual SIMDVERTEX& GetNextVsOutput()                                                      = 0;
+    virtual bool        GetNextStreamOutput()                                                  = 0;
+    virtual SIMDMASK&   GetNextVsIndices()                                                     = 0;
+    virtual uint32_t    NumPrims()                                                             = 0;
+    virtual void        Reset()                                                                = 0;
+    virtual SIMDSCALARI GetPrimID(uint32_t startID)                                            = 0;
 };
 
 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
@@ -117,69 +126,77 @@
 //                1.    We call this the current and previous simd vertex.
 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
 //                    order to assemble the second triangle, for a triangle list, we'll need the
-//                    last vertex from the previous simd and the first 2 vertices from the current simd.
+//                    last vertex from the previous simd and the first 2 vertices from the current
+//                    simd.
 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
 //
 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
 // cuts
 struct PA_STATE_OPT : public PA_STATE
 {
-    uint32_t numPrims{ 0 };              // Total number of primitives for draw.
-    uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
+    uint32_t numPrims{0};         // Total number of primitives for draw.
+    uint32_t numPrimsComplete{0}; // Total number of complete primitives.
 
-    uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
+    uint32_t numSimdPrims{0}; // Number of prims in current simd.
 
-    uint32_t cur{ 0 };                   // index to current VS output.
-    uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
-    const uint32_t first{ 0 };           // index to first VS output. Used for tri fan and line loop.
+    uint32_t       cur{0};   // index to current VS output.
+    uint32_t       prev{0};  // index to prev VS output. Not really needed in the state.
+    const uint32_t first{0}; // index to first VS output. Used for tri fan and line loop.
 
-    uint32_t counter{ 0 };               // state counter
-    bool reset{ false };                 // reset state
+    uint32_t counter{0};   // state counter
+    bool     reset{false}; // reset state
 
-    uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
+    uint32_t    primIDIncr{0}; // how much to increment for each vector (typically vector / {1, 2})
     SIMDSCALARI primID;
 
-    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
+    typedef bool (*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 #if ENABLE_AVX512_SIMD16
-    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
+    typedef bool (*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
+    typedef void (*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa,
+                                       uint32_t      slot,
+                                       uint32_t      primIndex,
+                                       simd4scalar   verts[]);
 
-    PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
+    PFN_PA_FUNC pfnPaFunc{nullptr}; // PA state machine function for assembling 4 triangles.
 #if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
+    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{nullptr};
 #endif
-    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
-    PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
+    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{
+        nullptr}; // PA state machine function for assembling single triangle.
+    PFN_PA_FUNC pfnPaFuncReset{nullptr}; // initial state to set on reset
 #if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
+    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{nullptr};
 #endif
 
     // state used to advance the PA when Next is called
-    PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
+    PFN_PA_FUNC pfnPaNextFunc{nullptr};
 #if ENABLE_AVX512_SIMD16
-    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
+    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{nullptr};
 #endif
-    uint32_t           nextNumSimdPrims{ 0 };
-    uint32_t           nextNumPrimsIncrement{ 0 };
-    bool               nextReset{ false };
-    bool               isStreaming{ false };
+    uint32_t nextNumSimdPrims{0};
+    uint32_t nextNumPrimsIncrement{0};
+    bool     nextReset{false};
+    bool     isStreaming{false};
 
-    SIMDMASK           junkIndices  { 0 };          // temporary index store for unused virtual function
+    SIMDMASK junkIndices{0}; // temporary index store for unused virtual function
 
     PA_STATE_OPT() {}
-    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
-        uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+    PA_STATE_OPT(DRAW_CONTEXT*      pDC,
+                 uint32_t           numPrims,
+                 uint8_t*           pStream,
+                 uint32_t           streamSizeInVerts,
+                 uint32_t           vertexStride,
+                 bool               in_isStreaming,
+                 uint32_t           numVertsPerPrim,
+                 PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 
-    bool HasWork()
-    {
-        return (this->numPrimsComplete < this->numPrims) ? true : false;
-    }
+    bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; }
 
     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
     {
         SWR_ASSERT(slot < vertexStride);
-        uint32_t offset = index * vertexStride + slot;
+        uint32_t    offset     = index * vertexStride + slot;
         simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
         return vertexSlot;
     }
@@ -188,7 +205,7 @@
     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
     {
         SWR_ASSERT(slot < vertexStride);
-        uint32_t offset = index * vertexStride + slot;
+        uint32_t      offset     = index * vertexStride + slot;
         simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
         return vertexSlot;
     }
@@ -196,10 +213,7 @@
 #endif
     // Assembles 4 triangles. Each simdvector is a single vertex from 4
     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
-    bool Assemble(uint32_t slot, simdvector verts[])
-    {
-        return this->pfnPaFunc(*this, slot, verts);
-    }
+    bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); }
 
 #if ENABLE_AVX512_SIMD16
     bool Assemble(uint32_t slot, simd16vector verts[])
@@ -239,12 +253,12 @@
         else
         {
             this->counter = (this->reset) ? 0 : (this->counter + 1);
-            this->reset = false;
+            this->reset   = false;
         }
 
         if (!HasWork())
         {
-            morePrims = false;    // no more to do
+            morePrims = false; // no more to do
         }
 
         return morePrims;
@@ -259,15 +273,16 @@
         {
             // prev undefined for first state
             prev = cur;
-            cur = counter;
+            cur  = counter;
         }
         else
         {
-            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
+            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in
+            // the buffer
             uint32_t temp = prev;
 
             prev = cur;
-            cur = temp;
+            cur  = temp;
         }
 
         SWR_ASSERT(cur < numSimdVerts);
@@ -285,44 +300,46 @@
     bool GetNextStreamOutput()
     {
         this->prev = this->cur;
-        this->cur = this->counter;
+        this->cur  = this->counter;
 
         return HasWork();
     }
 
     uint32_t NumPrims()
     {
-        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
-            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
+        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims)
+                   ? (SIMD_WIDTH -
+                      (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims))
+                   : SIMD_WIDTH;
     }
 
-    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
-        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-        uint32_t numSimdPrims = 0,
-        uint32_t numPrimsIncrement = 0,
-        bool reset = false)
+    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
+                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+                      uint32_t                         numSimdPrims      = 0,
+                      uint32_t                         numPrimsIncrement = 0,
+                      bool                             reset             = false)
     {
-        this->pfnPaNextFunc = pfnPaNextFunc;
-        this->nextNumSimdPrims = numSimdPrims;
+        this->pfnPaNextFunc         = pfnPaNextFunc;
+        this->nextNumSimdPrims      = numSimdPrims;
         this->nextNumPrimsIncrement = numPrimsIncrement;
-        this->nextReset = reset;
+        this->nextReset             = reset;
 
         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
     }
 
 #if ENABLE_AVX512_SIMD16
     void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
-        PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
-        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-        uint32_t numSimdPrims = 0,
-        uint32_t numPrimsIncrement = 0,
-        bool reset = false)
+                             PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
+                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+                             uint32_t                         numSimdPrims      = 0,
+                             uint32_t                         numPrimsIncrement = 0,
+                             bool                             reset             = false)
     {
-        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
-        this->pfnPaNextFunc = pfnPaNextFunc;
-        this->nextNumSimdPrims = numSimdPrims;
+        this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
+        this->pfnPaNextFunc         = pfnPaNextFunc;
+        this->nextNumSimdPrims      = numSimdPrims;
         this->nextNumPrimsIncrement = numPrimsIncrement;
-        this->nextReset = reset;
+        this->nextReset             = reset;
 
         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
     }
@@ -339,44 +356,54 @@
         this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
 #endif
         this->numPrimsComplete = 0;
-        this->numSimdPrims = 0;
-        this->cur = 0;
-        this->prev = 0;
-        this->counter = 0;
-        this->reset = false;
+        this->numSimdPrims     = 0;
+        this->cur              = 0;
+        this->prev             = 0;
+        this->counter          = 0;
+        this->reset            = false;
     }
 
     SIMDSCALARI GetPrimID(uint32_t startID)
     {
 #if USE_SIMD16_FRONTEND
-        return _simd16_add_epi32(this->primID,
+        return _simd16_add_epi32(
+            this->primID,
             _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
 #else
-        return _simd_add_epi32(this->primID,
+        return _simd_add_epi32(
+            this->primID,
             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
 #endif
     }
 };
 
 // helper C wrappers to avoid having to rewrite all the PA topology state functions
-INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
-    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-    uint32_t numSimdPrims = 0,
-    uint32_t numPrimsIncrement = 0,
-    bool reset = false)
+INLINE void SetNextPaState(PA_STATE_OPT&                    pa,
+                           PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
+                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+                           uint32_t                         numSimdPrims      = 0,
+                           uint32_t                         numPrimsIncrement = 0,
+                           bool                             reset             = false)
 {
-    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
+    return pa.SetNextState(
+        pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
 }
 
 #if ENABLE_AVX512_SIMD16
-INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
-    PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
-    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
-    uint32_t numSimdPrims = 0,
-    uint32_t numPrimsIncrement = 0,
-    bool reset = false)
+INLINE void SetNextPaState_simd16(PA_STATE_OPT&                    pa,
+                                  PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
+                                  PA_STATE_OPT::PFN_PA_FUNC        pfnPaNextFunc,
+                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
+                                  uint32_t                         numSimdPrims      = 0,
+                                  uint32_t                         numPrimsIncrement = 0,
+                                  bool                             reset             = false)
 {
-    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
+    return pa.SetNextState_simd16(pfnPaNextFunc_simd16,
+                                  pfnPaNextFunc,
+                                  pfnPaNextSingleFunc,
+                                  numSimdPrims,
+                                  numPrimsIncrement,
+                                  reset);
 }
 
 #endif
@@ -395,59 +422,70 @@
 // Cut-aware primitive assembler.
 struct PA_STATE_CUT : public PA_STATE
 {
-    SIMDMASK* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
-    uint32_t numVerts{ 0 };              // number of vertices available in buffer store
-    uint32_t numAttribs{ 0 };            // number of attributes
-    int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
-    uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
+    SIMDMASK* pCutIndices{nullptr};  // cut indices buffer, 1 bit per vertex
+    uint32_t  numVerts{0};           // number of vertices available in buffer store
+    uint32_t  numAttribs{0};         // number of attributes
+    int32_t   numRemainingVerts{0};  // number of verts remaining to be assembled
+    uint32_t  numVertsToAssemble{0}; // total number of verts to assemble for the draw
 #if ENABLE_AVX512_SIMD16
-    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
+    OSALIGNSIMD16(uint32_t)
+    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
 #else
-    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
+    OSALIGNSIMD(uint32_t)
+    indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
 #endif
-    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
-    uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
-    uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
-    uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
-    uint32_t curVertex{ 0 };             // current unprocessed vertex
-    uint32_t startPrimId{ 0 };           // starting prim id
-    SIMDSCALARI vPrimId;                 // vector of prim ID
-    bool needOffsets{ false };           // need to compute gather offsets for current SIMD
-    uint32_t vertsPerPrim{ 0 };
-    bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
-                                         // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
-                                         // while the GS sends valid verts for every index
+    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
+    uint32_t    numPrimsAssembled{0};             // number of primitives that are fully assembled
+    uint32_t    headVertex{0};      // current unused vertex slot in vertex buffer store
+    uint32_t    tailVertex{0};      // beginning vertex currently assembling
+    uint32_t    curVertex{0};       // current unprocessed vertex
+    uint32_t    startPrimId{0};     // starting prim id
+    SIMDSCALARI vPrimId;            // vector of prim ID
+    bool        needOffsets{false}; // need to compute gather offsets for current SIMD
+    uint32_t    vertsPerPrim{0};
+    bool        processCutVerts{
+        false}; // vertex indices with cuts should be processed as normal, otherwise they
+                // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
+                // while the GS sends valid verts for every index
 
-    simdvector      junkVector;          // junk simdvector for unimplemented API
+    simdvector junkVector; // junk simdvector for unimplemented API
 #if ENABLE_AVX512_SIMD16
-    simd16vector    junkVector_simd16;   // junk simd16vector for unimplemented API
+    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
 #endif
 
     // Topology state tracking
     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
-    uint32_t curIndex{ 0 };
-    bool reverseWinding{ false };        // indicates reverse winding for strips
-    int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
+    uint32_t curIndex{0};
+    bool     reverseWinding{false}; // indicates reverse winding for strips
+    int32_t  adjExtraVert{0};       // extra vert uses for tristrip w/ adj
 
-    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
-    PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
+    typedef void (PA_STATE_CUT::*PFN_PA_FUNC)(uint32_t vert, bool finish);
+    PFN_PA_FUNC pfnPa{nullptr}; // per-topology function that processes a single vert
 
     PA_STATE_CUT() {}
-    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
-        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
-        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
+    PA_STATE_CUT(DRAW_CONTEXT*      pDC,
+                 uint8_t*           in_pStream,
+                 uint32_t           in_streamSizeInVerts,
+                 uint32_t           in_vertexStride,
+                 SIMDMASK*          in_pIndices,
+                 uint32_t           in_numVerts,
+                 uint32_t           in_numAttribs,
+                 PRIMITIVE_TOPOLOGY topo,
+                 bool               in_processCutVerts,
+                 uint32_t           in_numVertsPerPrim) :
+        PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
     {
-        numVerts = in_streamSizeInVerts;
-        numAttribs = in_numAttribs;
-        binTopology = topo;
-        needOffsets = false;
+        numVerts        = in_streamSizeInVerts;
+        numAttribs      = in_numAttribs;
+        binTopology     = topo;
+        needOffsets     = false;
         processCutVerts = in_processCutVerts;
 
         numVertsToAssemble = numRemainingVerts = in_numVerts;
-        numPrimsAssembled = 0;
+        numPrimsAssembled                      = 0;
         headVertex = tailVertex = curVertex = 0;
 
-        curIndex = 0;
+        curIndex    = 0;
         pCutIndices = in_pIndices;
         memset(indices, 0, sizeof(indices));
 #if USE_SIMD16_FRONTEND
@@ -456,48 +494,72 @@
         vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 #endif
         reverseWinding = false;
-        adjExtraVert = -1;
+        adjExtraVert   = -1;
 
         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
-        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
+        vertsPerPrim   = NumVertsPerPrim(topo, gsEnabled);
 
         switch (topo)
         {
-        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
-        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
-        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
-        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
-                                    {
-                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
-                                    }
-                                    else
-                                    {
-                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
-                                    }
-                                    break;
+        case TOP_TRIANGLE_LIST:
+            pfnPa = &PA_STATE_CUT::ProcessVertTriList;
+            break;
+        case TOP_TRI_LIST_ADJ:
+            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj
+                              : &PA_STATE_CUT::ProcessVertTriListAdjNoGs;
+            break;
+        case TOP_TRIANGLE_STRIP:
+            pfnPa = &PA_STATE_CUT::ProcessVertTriStrip;
+            break;
+        case TOP_TRI_STRIP_ADJ:
+            if (gsEnabled)
+            {
+                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
+            }
+            else
+            {
+                pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
+            }
+            break;
 
-        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
-        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
-        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
-        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
-        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
-        default: assert(0 && "Unimplemented topology");
+        case TOP_POINT_LIST:
+            pfnPa = &PA_STATE_CUT::ProcessVertPointList;
+            break;
+        case TOP_LINE_LIST:
+            pfnPa = &PA_STATE_CUT::ProcessVertLineList;
+            break;
+        case TOP_LINE_LIST_ADJ:
+            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj
+                              : &PA_STATE_CUT::ProcessVertLineListAdjNoGs;
+            break;
+        case TOP_LINE_STRIP:
+            pfnPa = &PA_STATE_CUT::ProcessVertLineStrip;
+            break;
+        case TOP_LISTSTRIP_ADJ:
+            pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj
+                              : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs;
+            break;
+        case TOP_RECT_LIST:
+            pfnPa = &PA_STATE_CUT::ProcessVertRectList;
+            break;
+        default:
+            assert(0 && "Unimplemented topology");
         }
     }
 
     SIMDVERTEX& GetNextVsOutput()
     {
         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
-        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
-        this->needOffsets = true;
-        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
+        this->headVertex     = (this->headVertex + SIMD_WIDTH) % this->numVerts;
+        this->needOffsets    = true;
+        SIMDVECTOR* pVertex  = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
 
         return *(SIMDVERTEX*)pVertex;
     }
 
     SIMDMASK& GetNextVsIndices()
     {
-        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
+        uint32_t  vertexIndex  = this->headVertex / SIMD_WIDTH;
         SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
         return *pCurCutIndex;
     }
@@ -542,12 +604,12 @@
 #endif
         this->numRemainingVerts = this->numVertsToAssemble;
         this->numPrimsAssembled = 0;
-        this->curIndex = 0;
-        this->curVertex = 0;
-        this->tailVertex = 0;
-        this->headVertex = 0;
-        this->reverseWinding = false;
-        this->adjExtraVert = -1;
+        this->curIndex          = 0;
+        this->curVertex         = 0;
+        this->tailVertex        = 0;
+        this->headVertex        = 0;
+        this->reverseWinding    = false;
+        this->adjExtraVert      = -1;
 #if USE_SIMD16_FRONTEND
         this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
 #else
@@ -555,10 +617,7 @@
 #endif
     }
 
-    bool HasWork()
-    {
-        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
-    }
+    bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; }
 
     bool IsVertexStoreFull()
     {
@@ -567,14 +626,14 @@
 
     void RestartTopology()
     {
-        this->curIndex = 0;
+        this->curIndex       = 0;
         this->reverseWinding = false;
-        this->adjExtraVert = -1;
+        this->adjExtraVert   = -1;
     }
 
     bool IsCutIndex(uint32_t vertex)
     {
-        uint32_t vertexIndex = vertex / SIMD_WIDTH;
+        uint32_t vertexIndex  = vertex / SIMD_WIDTH;
         uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
         return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
     }
@@ -583,9 +642,8 @@
     // have assembled SIMD prims
     void ProcessVerts()
     {
-        while (this->numPrimsAssembled != SIMD_WIDTH &&
-            this->numRemainingVerts > 0 &&
-            this->curVertex != this->headVertex)
+        while (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0 &&
+               this->curVertex != this->headVertex)
         {
             // if cut index, restart topology
             if (IsCutIndex(this->curVertex))
@@ -607,14 +665,16 @@
             }
 
             this->curVertex++;
-            if (this->curVertex >= this->numVerts) {
-               this->curVertex = 0;
+            if (this->curVertex >= this->numVerts)
+            {
+                this->curVertex = 0;
             }
             this->numRemainingVerts--;
         }
 
         // special case last primitive for tri strip w/ adj
-        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
+        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 &&
+            this->adjExtraVert != -1)
         {
             (this->*pfnPa)(this->curVertex, true);
         }
@@ -624,7 +684,7 @@
     {
         // done with current batch
         // advance tail to the current unsubmitted vertex
-        this->tailVertex = this->curVertex;
+        this->tailVertex        = this->curVertex;
         this->numPrimsAssembled = 0;
 #if USE_SIMD16_FRONTEND
         this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
@@ -647,32 +707,38 @@
     {
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
         {
-            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
-            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
+            uint32_t    vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
+            SIMDSCALARI vIndices          = *(SIMDSCALARI*)&this->indices[v][0];
 
             // step to simdvertex batch
             const uint32_t simdShift = SIMD_WIDTH_LOG2;
 #if USE_SIMD16_FRONTEND
             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
+            this->vOffsets[v] =
+                _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
 #else
             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
+            this->vOffsets[v] =
+                _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
 #endif
 
             // step to index
             const uint32_t simdMask = SIMD_WIDTH - 1;
 #if USE_SIMD16_FRONTEND
             SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
-            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
+            this->vOffsets[v]        = _simd16_add_epi32(
+                this->vOffsets[v],
+                _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
 #else
             SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
-            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
+            this->vOffsets[v] =
+                _simd_add_epi32(this->vOffsets[v],
+                                _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
 #endif
         }
     }
 
-    bool Assemble(uint32_t slot, simdvector *verts)
+    bool Assemble(uint32_t slot, simdvector* verts)
     {
         // process any outstanding verts
         ProcessVerts();
@@ -683,7 +749,8 @@
             return false;
         }
 
-        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
+        // cache off gather offsets given the current SIMD set of indices the first time we get an
+        // assemble
         if (this->needOffsets)
         {
             ComputeOffsets();
@@ -708,7 +775,8 @@
                 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
 
                 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
-                simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
+                simdscalar t =
+                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
                 verts[v].v[c] = t;
 #else
                 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
@@ -719,13 +787,27 @@
             }
         }
 
+        // compute the implied 4th vertex, v3
+        if (this->binTopology == TOP_RECT_LIST)
+        {
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                // second triangle is (v1, v3, v2), where v3 = v1 + v2 - v0
+                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
+                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
+                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
+                temp = _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
+                verts[1].v[c] = _simd16_extract_ps(temp, 0);
+            }
+        }
+
         return true;
     }
 
 #if ENABLE_AVX512_SIMD16
     bool Assemble(uint32_t slot, simd16vector verts[])
     {
-        // process any outstanding verts
+        // process any outstanding verts
         ProcessVerts();
 
         // return false if we don't have enough prims assembled
@@ -734,7 +816,8 @@
             return false;
         }
 
-        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
+        // cache off gather offsets given the current SIMD set of indices the first time we get an
+        // assemble
         if (this->needOffsets)
         {
             ComputeOffsets();
@@ -758,7 +841,8 @@
 #if USE_SIMD16_FRONTEND
                 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
 #else
-                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
+                verts[v].v[c] = _simd16_insert_ps(
+                    _simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
 #endif
 
                 // move base to next component
@@ -766,18 +850,33 @@
             }
         }
 
+        // compute the implied 4th vertex, v3
+        if (this->binTopology == TOP_RECT_LIST)
+        {
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                // second triangle is (v1, v3, v2), where v3 = v1 + v2 - v0
+                // v1 stored in verts[0], v0 stored in verts[1], v2 stored in verts[2]
+                simd16scalar temp = _simd16_add_ps(verts[0].v[c], verts[2].v[c]);
+                temp              = _simd16_sub_ps(temp, verts[1].v[c]);
+                verts[1].v[c] =
+                    _simd16_blend_ps(verts[1].v[c], temp, 0xAAAA); // 1010 1010 1010 1010
+            }
+        }
+
         return true;
     }
 
 #endif
     void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
     {
-        // move to slot
+        // move to slot
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
         {
             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
 #if USE_SIMD16_FRONTEND
-            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
+            uint32_t offset =
+                useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
 #else
             uint32_t offset = pOffset[triIndex];
 #endif
@@ -786,16 +885,28 @@
             for (uint32_t c = 0; c < 4; ++c)
             {
                 float* pComponent = (float*)(this->pStreamBase + offset);
-                pVert[c] = *pComponent;
+                pVert[c]          = *pComponent;
                 offset += SIMD_WIDTH * sizeof(float);
             }
         }
+
+        // compute the implied 4th vertex, v3
+        if ((this->binTopology == TOP_RECT_LIST) && (triIndex % 2 == 1))
+        {
+            // second triangle is (v1, v3, v2), where v3 = v1 + v2 - v0
+            // v1 stored in tri[0], v0 stored in tri[1], v2 stored in tri[2]
+            float* pVert0 = (float*)&tri[1];
+            float* pVert1 = (float*)&tri[0];
+            float* pVert2 = (float*)&tri[2];
+            float* pVert3 = (float*)&tri[1];
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                pVert3[c] = pVert1[c] + pVert2[c] - pVert0[c];
+            }
+        }
     }
 
-    uint32_t NumPrims()
-    {
-        return this->numPrimsAssembled;
-    }
+    uint32_t NumPrims() { return this->numPrimsAssembled; }
 
     // Per-topology functions
     void ProcessVertTriStrip(uint32_t index, bool finish)
@@ -821,14 +932,14 @@
             this->numPrimsAssembled++;
 
             // set up next prim state
-            this->vert[0] = this->vert[1];
-            this->vert[1] = this->vert[2];
+            this->vert[0]  = this->vert[1];
+            this->vert[1]  = this->vert[2];
             this->curIndex = 2;
             this->reverseWinding ^= 1;
         }
     }
 
-    template<bool gsEnabled>
+    template <bool gsEnabled>
     void AssembleTriStripAdj()
     {
         if (!gsEnabled)
@@ -855,8 +966,7 @@
         this->numPrimsAssembled++;
     }
 
-
-    template<bool gsEnabled>
+    template <bool gsEnabled>
     void ProcessVertTriStripAdj(uint32_t index, bool finish)
     {
         // handle last primitive of tristrip
@@ -1016,7 +1126,6 @@
         }
     }
 
-
     void ProcessVertLineList(uint32_t index, bool finish)
     {
         this->vert[this->curIndex] = index;
@@ -1045,7 +1154,7 @@
             this->numPrimsAssembled++;
 
             // set up next prim state
-            this->vert[0] = this->vert[1];
+            this->vert[0]  = this->vert[1];
             this->curIndex = 1;
         }
     }
@@ -1066,9 +1175,9 @@
             this->numPrimsAssembled++;
 
             // set up next prim state
-            this->vert[0] = this->vert[1];
-            this->vert[1] = this->vert[2];
-            this->vert[2] = this->vert[3];
+            this->vert[0]  = this->vert[1];
+            this->vert[1]  = this->vert[2];
+            this->vert[2]  = this->vert[3];
             this->curIndex = 3;
         }
     }
@@ -1087,9 +1196,9 @@
             this->numPrimsAssembled++;
 
             // set up next prim state
-            this->vert[0] = this->vert[1];
-            this->vert[1] = this->vert[2];
-            this->vert[2] = this->vert[3];
+            this->vert[0]  = this->vert[1];
+            this->vert[1]  = this->vert[2];
+            this->vert[2]  = this->vert[3];
             this->curIndex = 3;
         }
     }
@@ -1135,34 +1244,56 @@
             this->curIndex = 0;
         }
     }
+
+    void ProcessVertRectList(uint32_t index, bool finish)
+    {
+        this->vert[this->curIndex] = index;
+        this->curIndex++;
+        if (this->curIndex == 3)
+        {
+            // assembled enough verts for prim, add to gather indices
+            this->indices[0][this->numPrimsAssembled] = this->vert[0];
+            this->indices[1][this->numPrimsAssembled] = this->vert[1];
+            this->indices[2][this->numPrimsAssembled] = this->vert[2];
+
+            // second triangle in the rectangle
+            // i.e. (v1, v3, v2) where v3 = v1 + v2 - v0; v0 is stored in place of v3 for now
+            this->indices[0][this->numPrimsAssembled + 1] = this->vert[1];
+            this->indices[1][this->numPrimsAssembled + 1] = this->vert[0];
+            this->indices[2][this->numPrimsAssembled + 1] = this->vert[2];
+
+            // increment numPrimsAssembled
+            this->numPrimsAssembled += 2;
+
+            // set up next prim state
+            this->curIndex = 0;
+        }
+    }
 };
 
 // Primitive Assembly for data output from the DomainShader.
 struct PA_TESS : PA_STATE
 {
-    PA_TESS(
-        DRAW_CONTEXT *in_pDC,
-        const SIMDSCALAR* in_pVertData,
-        uint32_t in_attributeStrideInVectors,
-        uint32_t in_vertexStride,
-        uint32_t in_numAttributes,
-        uint32_t* (&in_ppIndices)[3],
-        uint32_t in_numPrims,
-        PRIMITIVE_TOPOLOGY in_binTopology,
-        uint32_t numVertsPerPrim) :
+    PA_TESS(DRAW_CONTEXT*     in_pDC,
+            const SIMDSCALAR* in_pVertData,
+            uint32_t          in_attributeStrideInVectors,
+            uint32_t          in_vertexStride,
+            uint32_t          in_numAttributes,
+            uint32_t* (&in_ppIndices)[3],
+            uint32_t           in_numPrims,
+            PRIMITIVE_TOPOLOGY in_binTopology,
+            uint32_t           numVertsPerPrim) :
 
         PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
-        m_pVertexData(in_pVertData),
-        m_attributeStrideInVectors(in_attributeStrideInVectors),
-        m_numAttributes(in_numAttributes),
-        m_numPrims(in_numPrims)
+        m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
+        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims)
     {
 #if USE_SIMD16_FRONTEND
         m_vPrimId = _simd16_setzero_si();
 #else
         m_vPrimId = _simd_setzero_si();
 #endif
-        binTopology = in_binTopology;
+        binTopology    = in_binTopology;
         m_ppIndices[0] = in_ppIndices[0];
         m_ppIndices[1] = in_ppIndices[1];
         m_ppIndices[2] = in_ppIndices[2];
@@ -1187,10 +1318,7 @@
         }
     }
 
-    bool HasWork()
-    {
-        return m_numPrims != 0;
-    }
+    bool HasWork() { return m_numPrims != 0; }
 
     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
     {
@@ -1210,19 +1338,14 @@
     {
         SWR_ASSERT(numPrims <= SIMD_WIDTH);
 #if USE_SIMD16_FRONTEND
-        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
-        {
+        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] = {
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
-        };
+            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0};
 
         return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
 #else
-        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
-        {
-            -1, -1, -1, -1, -1, -1, -1, -1,
-            0,  0,  0,  0,  0,  0,  0,  0
-        };
+        static const OSALIGNLINE(int32_t)
+            maskGen[SIMD_WIDTH * 2] = {-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0};
 
         return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
 #endif
@@ -1240,7 +1363,8 @@
 
         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
 
-        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pBaseAttrib =
+            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
@@ -1253,21 +1377,21 @@
             for (uint32_t c = 0; c < 4; ++c)
             {
 #if USE_SIMD16_FRONTEND
-                simd16scalar temp = _simd16_mask_i32gather_ps(
-                    _simd16_setzero_ps(),
-                    pBase,
-                    indices,
-                    _simd16_castsi_ps(mask),
-                    4 /* gcc doesn't like sizeof(float) */);
+                simd16scalar temp =
+                    _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
+                                              pBase,
+                                              indices,
+                                              _simd16_castsi_ps(mask),
+                                              4 /* gcc doesn't like sizeof(float) */);
 
-                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
+                verts[i].v[c] =
+                    useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
 #else
-                verts[i].v[c] = _simd_mask_i32gather_ps(
-                    _simd_setzero_ps(),
-                    pBase,
-                    indices,
-                    _simd_castsi_ps(mask),
-                    4); // gcc doesn't like sizeof(float)
+                verts[i].v[c] = _simd_mask_i32gather_ps(_simd_setzero_ps(),
+                                                        pBase,
+                                                        indices,
+                                                        _simd_castsi_ps(mask),
+                                                        4); // gcc doesn't like sizeof(float)
 #endif
                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
             }
@@ -1289,7 +1413,8 @@
 
         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
 
-        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pBaseAttrib =
+            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
@@ -1302,20 +1427,18 @@
             for (uint32_t c = 0; c < 4; ++c)
             {
 #if USE_SIMD16_FRONTEND
-                verts[i].v[c] = _simd16_mask_i32gather_ps(
-                    _simd16_setzero_ps(),
-                    pBase,
-                    indices,
-                    _simd16_castsi_ps(mask),
-                    4 /* gcc doesn't like sizeof(float) */);
+                verts[i].v[c] = _simd16_mask_i32gather_ps(_simd16_setzero_ps(),
+                                                          pBase,
+                                                          indices,
+                                                          _simd16_castsi_ps(mask),
+                                                          4 /* gcc doesn't like sizeof(float) */);
 #else
-                simdscalar temp = _simd_mask_i32gather_ps(
-                    _simd_setzero_ps(),
-                    pBase,
-                    indices,
-                    _simd_castsi_ps(mask),
-                    4 /* gcc doesn't like sizeof(float) */);
-                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
+                simdscalar temp = _simd_mask_i32gather_ps(_simd_setzero_ps(),
+                                                          pBase,
+                                                          indices,
+                                                          _simd_castsi_ps(mask),
+                                                          4 /* gcc doesn't like sizeof(float) */);
+                verts[i].v[c]   = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
 #endif
                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
             }
@@ -1328,19 +1451,22 @@
     void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
     {
         SWR_ASSERT(slot < m_numAttributes);
+
+
         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
 
-
-        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pVertDataBase =
+            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
-            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
+            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
+                                                : m_ppIndices[i][primIndex];
 #else
             uint32_t index = m_ppIndices[i][primIndex];
 #endif
             const float* pVertData = pVertDataBase;
-            float* pVert = (float*)&verts[i];
+            float*       pVert     = (float*)&verts[i];
 
             for (uint32_t c = 0; c < 4; ++c)
             {
@@ -1379,15 +1505,9 @@
         return junkIndices;
     }
 
-    uint32_t NumPrims()
-    {
-        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
-    }
+    uint32_t NumPrims() { return std::min<uint32_t>(m_numPrims, SIMD_WIDTH); }
 
-    void Reset()
-    {
-        SWR_NOT_IMPL;
-    }
+    void Reset() { SWR_NOT_IMPL; }
 
     SIMDSCALARI GetPrimID(uint32_t startID)
     {
@@ -1399,57 +1519,77 @@
     }
 
 private:
-    const SIMDSCALAR*   m_pVertexData = nullptr;
-    uint32_t            m_attributeStrideInVectors = 0;
-    uint32_t            m_numAttributes = 0;
-    uint32_t            m_numPrims = 0;
-    uint32_t*           m_ppIndices[3];
+    const SIMDSCALAR* m_pVertexData              = nullptr;
+    uint32_t          m_attributeStrideInVectors = 0;
+    uint32_t          m_numAttributes            = 0;
+    uint32_t          m_numPrims                 = 0;
+    uint32_t*         m_ppIndices[3];
 
-    uint32_t            m_numVertsPerPrim = 0;
+    uint32_t m_numVertsPerPrim = 0;
 
-    SIMDSCALARI         m_vPrimId;
+    SIMDSCALARI m_vPrimId;
 
-    simdvector          junkVector;         // junk simdvector for unimplemented API
+    simdvector junkVector; // junk simdvector for unimplemented API
 #if ENABLE_AVX512_SIMD16
-    simd16vector        junkVector_simd16;  // junk simd16vector for unimplemented API
+    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
 #endif
-    SIMDVERTEX          junkVertex;         // junk SIMDVERTEX for unimplemented API
-    SIMDMASK            junkIndices;        // temporary index store for unused virtual function
+    SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
+    SIMDMASK   junkIndices; // temporary index store for unused virtual function
 };
 
-// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
-// based on state.
+// Primitive Assembler factory class, responsible for creating and initializing the correct
+// assembler based on state.
 template <typename IsIndexedT, typename IsCutIndexEnabledT>
 struct PA_FACTORY
 {
-    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
+    PA_FACTORY(DRAW_CONTEXT*         pDC,
+               PRIMITIVE_TOPOLOGY    in_topo,
+               uint32_t              numVerts,
+               PA_STATE::SIMDVERTEX* pVertexStore,
+               uint32_t              vertexStoreSize,
+               uint32_t              vertexStride,
+               uint32_t              numVertsPerPrim) :
+        topo(in_topo)
     {
 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
         const API_STATE& state = GetApiState(pDC);
-        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
-            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
-            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
-            topo == TOP_TRIANGLE_LIST)) ||
+        if ((IsIndexedT::value && IsCutIndexEnabledT::value &&
+             (topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST ||
+              topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) ||
 
-            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
-            // for them in the optimized PA
-            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
+            // non-indexed draws with adjacency topologies must use cut-aware PA until we add
+            // support for them in the optimized PA
+            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
+             topo == TOP_TRI_STRIP_ADJ))
         {
             memset(&indexStore, 0, sizeof(indexStore));
             uint32_t numAttribs = state.feNumAttributes;
 
-            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
-                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
+            new (&this->paCut) PA_STATE_CUT(pDC,
+                                            reinterpret_cast<uint8_t*>(pVertexStore),
+                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
+                                            vertexStride,
+                                            &this->indexStore[0],
+                                            numVerts,
+                                            numAttribs,
+                                            state.topology,
+                                            false,
+                                            numVertsPerPrim);
             cutPA = true;
         }
         else
 #endif
         {
             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
-            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
+            new (&this->paOpt) PA_STATE_OPT(pDC,
+                                            numPrims,
+                                            reinterpret_cast<uint8_t*>(pVertexStore),
+                                            vertexStoreSize * PA_STATE::SIMD_WIDTH,
+                                            vertexStride,
+                                            false,
+                                            numVertsPerPrim);
             cutPA = false;
         }
-
     }
 
     PA_STATE& GetPA()
@@ -1469,9 +1609,9 @@
     PA_STATE_OPT paOpt;
     PA_STATE_CUT paCut;
 
-    bool cutPA{ false };
+    bool cutPA{false};
 
-    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
+    PRIMITIVE_TOPOLOGY topo{TOP_UNKNOWN};
 
-    PA_STATE::SIMDMASK      indexStore[MAX_NUM_VERTS_PER_PRIM];
+    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
index 64a90c7..25d7156 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -1,136 +1,160 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file pa_avx.cpp
-*
-* @brief AVX implementation for primitive assembly.
-*        N primitives are assembled at a time, where N is the SIMD width.
-*        A state machine, that is specific for a given topology, drives the
-*        assembly of vertices into triangles.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file pa_avx.cpp
+ *
+ * @brief AVX implementation for primitive assembly.
+ *        N primitives are assembled at a time, where N is the SIMD width.
+ *        A state machine, that is specific for a given topology, drives the
+ *        assembly of vertices into triangles.
+ *
+ ******************************************************************************/
 #include "context.h"
 #include "pa.h"
 #include "frontend.h"
 
 #if (KNOB_SIMD_WIDTH == 8)
 
-INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane0(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane1(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane2(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane3(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane4(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane5(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane6(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane7(const simdscalar& x,
+                                const simdscalar& y,
+                                const simdscalar& z,
+                                const simdscalar& w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE simd4scalar swizzleLane0(const simdvector &v)
+INLINE simd4scalar swizzleLane0(const simdvector& v)
 {
     return swizzleLane0(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane1(const simdvector &v)
+INLINE simd4scalar swizzleLane1(const simdvector& v)
 {
     return swizzleLane1(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane2(const simdvector &v)
+INLINE simd4scalar swizzleLane2(const simdvector& v)
 {
     return swizzleLane2(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane3(const simdvector &v)
+INLINE simd4scalar swizzleLane3(const simdvector& v)
 {
     return swizzleLane3(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane4(const simdvector &v)
+INLINE simd4scalar swizzleLane4(const simdvector& v)
 {
     return swizzleLane4(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane5(const simdvector &v)
+INLINE simd4scalar swizzleLane5(const simdvector& v)
 {
     return swizzleLane5(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane6(const simdvector &v)
+INLINE simd4scalar swizzleLane6(const simdvector& v)
 {
     return swizzleLane6(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLane7(const simdvector &v)
+INLINE simd4scalar swizzleLane7(const simdvector& v)
 {
     return swizzleLane7(v.x, v.y, v.z, v.w);
 }
 
-INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simdvector& v, int lane)
 {
     switch (lane)
     {
@@ -156,87 +180,135 @@
 }
 
 #if ENABLE_AVX512_SIMD16
-INLINE simd4scalar swizzleLane0(const simd16vector &v)
+INLINE simd4scalar swizzleLane0(const simd16vector& v)
 {
-    return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane0(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane1(const simd16vector &v)
+INLINE simd4scalar swizzleLane1(const simd16vector& v)
 {
-    return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane1(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane2(const simd16vector &v)
+INLINE simd4scalar swizzleLane2(const simd16vector& v)
 {
-    return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane2(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane3(const simd16vector &v)
+INLINE simd4scalar swizzleLane3(const simd16vector& v)
 {
-    return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane3(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane4(const simd16vector &v)
+INLINE simd4scalar swizzleLane4(const simd16vector& v)
 {
-    return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane4(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane5(const simd16vector &v)
+INLINE simd4scalar swizzleLane5(const simd16vector& v)
 {
-    return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane5(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane6(const simd16vector &v)
+INLINE simd4scalar swizzleLane6(const simd16vector& v)
 {
-    return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane6(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane7(const simd16vector &v)
+INLINE simd4scalar swizzleLane7(const simd16vector& v)
 {
-    return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
+    return swizzleLane7(_simd16_extract_ps(v.x, 0),
+                        _simd16_extract_ps(v.y, 0),
+                        _simd16_extract_ps(v.z, 0),
+                        _simd16_extract_ps(v.w, 0));
 }
 
-INLINE simd4scalar swizzleLane8(const simd16vector &v)
+INLINE simd4scalar swizzleLane8(const simd16vector& v)
 {
-    return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane0(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLane9(const simd16vector &v)
+INLINE simd4scalar swizzleLane9(const simd16vector& v)
 {
-    return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane1(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneA(const simd16vector &v)
+INLINE simd4scalar swizzleLaneA(const simd16vector& v)
 {
-    return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane2(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneB(const simd16vector &v)
+INLINE simd4scalar swizzleLaneB(const simd16vector& v)
 {
-    return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane3(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneC(const simd16vector &v)
+INLINE simd4scalar swizzleLaneC(const simd16vector& v)
 {
-    return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane4(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneD(const simd16vector &v)
+INLINE simd4scalar swizzleLaneD(const simd16vector& v)
 {
-    return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane5(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneE(const simd16vector &v)
+INLINE simd4scalar swizzleLaneE(const simd16vector& v)
 {
-    return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane6(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneF(const simd16vector &v)
+INLINE simd4scalar swizzleLaneF(const simd16vector& v)
 {
-    return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
+    return swizzleLane7(_simd16_extract_ps(v.x, 1),
+                        _simd16_extract_ps(v.y, 1),
+                        _simd16_extract_ps(v.z, 1),
+                        _simd16_extract_ps(v.w, 1));
 }
 
-INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simd16vector& v, int lane)
 {
     switch (lane)
     {
@@ -374,11 +446,11 @@
     {
         uint32_t input_cp = primIndex * TotalControlPoints + cp;
 #if USE_SIMD16_FRONTEND
-        uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+        uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
         uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
 
 #else
-        uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
+        uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
         uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
 
 #endif
@@ -386,7 +458,8 @@
         for (uint32_t i = 0; i < 4; ++i)
         {
 #if USE_SIMD16_FRONTEND
-            const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
+            const float* pInputVec =
+                (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
 #else
             const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
 #endif
@@ -395,18 +468,17 @@
     }
 }
 
-template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
+template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
 static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
-    SetNextPaState(
-        pa,
-        PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
-        PaPatchListSingle<TotalControlPoints>);
+    SetNextPaState(pa,
+                   PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
+                   PaPatchListSingle<TotalControlPoints>);
 
     return false;
 }
 
-template<uint32_t TotalControlPoints>
+template <uint32_t TotalControlPoints>
 static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
@@ -433,14 +505,15 @@
             for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane)
             {
 #if USE_SIMD16_FRONTEND
-                uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp;
-                uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+                uint32_t input_cp   = (lane + lane_offset) * TotalControlPoints + cp;
+                uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
 
-                const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
+                const float* pInputVec =
+                    (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]);
 #else
-                uint32_t input_cp = lane * TotalControlPoints + cp;
-                uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH;
+                uint32_t input_cp   = lane * TotalControlPoints + cp;
+                uint32_t input_vec  = input_cp / KNOB_SIMD_WIDTH;
                 uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH;
 
                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
@@ -451,31 +524,29 @@
         }
     }
 
-    SetNextPaState(
-        pa,
-        PaPatchList<TotalControlPoints>,
-        PaPatchListSingle<TotalControlPoints>,
-        0,
-        PA_STATE_OPT::SIMD_WIDTH,
-        true);
+    SetNextPaState(pa,
+                   PaPatchList<TotalControlPoints>,
+                   PaPatchListSingle<TotalControlPoints>,
+                   0,
+                   PA_STATE_OPT::SIMD_WIDTH,
+                   true);
 
     return true;
 }
 
 #if ENABLE_AVX512_SIMD16
-template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
+template <uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1>
 static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    SetNextPaState_simd16(
-        pa,
-        PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
-        PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
-        PaPatchListSingle<TotalControlPoints>);
+    SetNextPaState_simd16(pa,
+                          PaPatchList_simd16<TotalControlPoints, CurrentControlPoints + 1>,
+                          PaPatchList<TotalControlPoints, CurrentControlPoints + 1>,
+                          PaPatchListSingle<TotalControlPoints>);
 
     return false;
 }
 
-template<uint32_t TotalControlPoints>
+template <uint32_t TotalControlPoints>
 static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
@@ -492,33 +563,35 @@
             float vec[KNOB_SIMD16_WIDTH];
             for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane)
             {
-                uint32_t input_cp = lane * TotalControlPoints + cp;
-                uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH;
+                uint32_t input_cp   = lane * TotalControlPoints + cp;
+                uint32_t input_vec  = input_cp / KNOB_SIMD16_WIDTH;
                 uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH;
 
                 const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]);
-                vec[lane] = pInputVec[input_lane];
+                vec[lane]              = pInputVec[input_lane];
             }
             verts[cp][i] = _simd16_loadu_ps(vec);
         }
     }
 
-    SetNextPaState_simd16(
-        pa,
-        PaPatchList_simd16<TotalControlPoints>,
-        PaPatchList<TotalControlPoints>,
-        PaPatchListSingle<TotalControlPoints>,
-        0,
-        PA_STATE_OPT::SIMD_WIDTH,
-        true);
+    SetNextPaState_simd16(pa,
+                          PaPatchList_simd16<TotalControlPoints>,
+                          PaPatchList<TotalControlPoints>,
+                          PaPatchListSingle<TotalControlPoints>,
+                          0,
+                          PA_STATE_OPT::SIMD_WIDTH,
+                          true);
 
     return true;
 }
 
 #endif
-#define PA_PATCH_LIST_TERMINATOR(N) \
-    template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\
-                           { return PaPatchListTerm<N>(pa, slot, verts); }
+#define PA_PATCH_LIST_TERMINATOR(N)                                              \
+    template <>                                                                  \
+    bool PaPatchList<N, N>(PA_STATE_OPT & pa, uint32_t slot, simdvector verts[]) \
+    {                                                                            \
+        return PaPatchListTerm<N>(pa, slot, verts);                              \
+    }
 PA_PATCH_LIST_TERMINATOR(1)
 PA_PATCH_LIST_TERMINATOR(2)
 PA_PATCH_LIST_TERMINATOR(3)
@@ -554,9 +627,12 @@
 #undef PA_PATCH_LIST_TERMINATOR
 
 #if ENABLE_AVX512_SIMD16
-#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \
-    template<> bool PaPatchList_simd16<N, N>(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\
-                           { return PaPatchListTerm_simd16<N>(pa, slot, verts); }
+#define PA_PATCH_LIST_TERMINATOR_SIMD16(N)                                                \
+    template <>                                                                           \
+    bool PaPatchList_simd16<N, N>(PA_STATE_OPT & pa, uint32_t slot, simd16vector verts[]) \
+    {                                                                                     \
+        return PaPatchListTerm_simd16<N>(pa, slot, verts);                                \
+    }
 PA_PATCH_LIST_TERMINATOR_SIMD16(1)
 PA_PATCH_LIST_TERMINATOR_SIMD16(2)
 PA_PATCH_LIST_TERMINATOR_SIMD16(3)
@@ -595,13 +671,13 @@
 bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaTriList1, PaTriListSingle0);
-    return false;    // Not enough vertices to assemble 4 or 8 triangles.
+    return false; // Not enough vertices to assemble 4 or 8 triangles.
 }
 
 bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaTriList2, PaTriListSingle0);
-    return false;    // Not enough vertices to assemble 8 triangles.
+    return false; // Not enough vertices to assemble 8 triangles.
 }
 
 bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -614,8 +690,8 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -626,8 +702,8 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-        const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -638,9 +714,9 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, 0, slot);
-    simdvector &b = PaGetSimdVector(pa, 1, slot);
-    simdvector &c = PaGetSimdVector(pa, 2, slot);
+    simdvector& a = PaGetSimdVector(pa, 0, slot);
+    simdvector& b = PaGetSimdVector(pa, 1, slot);
+    simdvector& c = PaGetSimdVector(pa, 2, slot);
 
 #endif
     simdscalar s;
@@ -653,25 +729,25 @@
     for (int i = 0; i < 4; ++i)
     {
         simdvector& v0 = verts[0];
-        v0[i] = _simd_blend_ps(a[i], b[i], 0x92);
-        v0[i] = _simd_blend_ps(v0[i], c[i], 0x24);
-        v0[i] = _simd_permute_ps_i(v0[i], 0x6C);
-        s = _simd_permute2f128_ps(v0[i], v0[i], 0x21);
-        v0[i] = _simd_blend_ps(v0[i], s, 0x44);
+        v0[i]          = _simd_blend_ps(a[i], b[i], 0x92);
+        v0[i]          = _simd_blend_ps(v0[i], c[i], 0x24);
+        v0[i]          = _simd_permute_ps_i(v0[i], 0x6C);
+        s              = _simd_permute2f128_ps(v0[i], v0[i], 0x21);
+        v0[i]          = _simd_blend_ps(v0[i], s, 0x44);
 
         simdvector& v1 = verts[1];
-        v1[i] = _simd_blend_ps(a[i], b[i], 0x24);
-        v1[i] = _simd_blend_ps(v1[i], c[i], 0x49);
-        v1[i] = _simd_permute_ps_i(v1[i], 0xB1);
-        s = _simd_permute2f128_ps(v1[i], v1[i], 0x21);
-        v1[i] = _simd_blend_ps(v1[i], s, 0x66);
+        v1[i]          = _simd_blend_ps(a[i], b[i], 0x24);
+        v1[i]          = _simd_blend_ps(v1[i], c[i], 0x49);
+        v1[i]          = _simd_permute_ps_i(v1[i], 0xB1);
+        s              = _simd_permute2f128_ps(v1[i], v1[i], 0x21);
+        v1[i]          = _simd_blend_ps(v1[i], s, 0x66);
 
         simdvector& v2 = verts[2];
-        v2[i] = _simd_blend_ps(a[i], b[i], 0x49);
-        v2[i] = _simd_blend_ps(v2[i], c[i], 0x92);
-        v2[i] = _simd_permute_ps_i(v2[i], 0xC6);
-        s = _simd_permute2f128_ps(v2[i], v2[i], 0x21);
-        v2[i] = _simd_blend_ps(v2[i], s, 0x22);
+        v2[i]          = _simd_blend_ps(a[i], b[i], 0x49);
+        v2[i]          = _simd_blend_ps(v2[i], c[i], 0x92);
+        v2[i]          = _simd_permute_ps_i(v2[i], 0xC6);
+        s              = _simd_permute2f128_ps(v2[i], v2[i], 0x21);
+        v2[i]          = _simd_blend_ps(v2[i], s, 0x22);
     }
 
 #elif KNOB_ARCH >= KNOB_ARCH_AVX2
@@ -686,8 +762,8 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -698,8 +774,8 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
-        const simd16vector &c_16 = PaGetSimdVector_simd16(pa, 2, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& c_16 = PaGetSimdVector_simd16(pa, 2, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -710,18 +786,18 @@
     }
 
 #else
-    const simdvector &a = PaGetSimdVector(pa, 0, slot);
-    const simdvector &b = PaGetSimdVector(pa, 1, slot);
-    const simdvector &c = PaGetSimdVector(pa, 2, slot);
+    const simdvector& a = PaGetSimdVector(pa, 0, slot);
+    const simdvector& b = PaGetSimdVector(pa, 1, slot);
+    const simdvector& c = PaGetSimdVector(pa, 2, slot);
 
 #endif
     //  v0 -> a0 a3 a6 b1 b4 b7 c2 c5
     //  v1 -> a1 a4 a7 b2 b5 c0 c3 c6
     //  v2 -> a2 a5 b0 b3 b6 c1 c4 c7
 
-    simdvector &v0 = verts[0];
-    simdvector &v1 = verts[1];
-    simdvector &v2 = verts[2];
+    simdvector& v0 = verts[0];
+    simdvector& v1 = verts[1];
+    simdvector& v2 = verts[2];
 
     // for simd x, y, z, and w
     for (int i = 0; i < 4; ++i)
@@ -744,82 +820,97 @@
 bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriList1, PaTriListSingle0);
-    return false;    // Not enough vertices to assemble 16 triangles
+    return false; // Not enough vertices to assemble 16 triangles
 }
 
 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriList2, PaTriListSingle0);
-    return false;    // Not enough vertices to assemble 16 triangles
+    return false; // Not enough vertices to assemble 16 triangles
 }
 
 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-#if KNOB_ARCH == KNOB_ARCH_AVX
-    simd16scalar perm0 = _simd16_setzero_ps();
-    simd16scalar perm1 = _simd16_setzero_ps();
-    simd16scalar perm2 = _simd16_setzero_ps();
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
+    // clang-format off
+
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
     const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11,  8, 5, 2, 15, 12,  9, 6, 3, 0);
     const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12,  9, 6, 3,  0, 13, 10, 7, 4, 1);
     const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3,  0, 13, 10, 7, 4,  1, 14, 11, 8, 5, 2);
+#else // KNOB_ARCH == KNOB_ARCH_AVX
+    simd16scalar perm0 = _simd16_setzero_ps();
+    simd16scalar perm1 = _simd16_setzero_ps();
+    simd16scalar perm2 = _simd16_setzero_ps();
 #endif
 
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
-    const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
+    const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
-    simd16vector &v2 = verts[2];
+    const simd16mask mask0 = 0x4924;
+    const simd16mask mask1 = 0x2492;
+    const simd16mask mask2 = 0x9249;
 
     //  v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
     //  v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
     //  v2 -> a2 a5 a8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
 
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+    simd16vector& v2 = verts[2];
+
     // for simd16 x, y, z, and w
     for (int i = 0; i < 4; i += 1)
     {
-        simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
-        simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
-        simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
+        simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
 
-#if KNOB_ARCH == KNOB_ARCH_AVX
-        temp0 = _simd16_permute_ps_i(temp0, 0x6C);          // (0, 3, 2, 1) => 00 11 01 10 => 0x6C
-        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp0 = _simd16_blend_ps(temp0, perm0, 0x4444);     // 0010 0010 0010 0010
-        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838);     // 0001 1100 0001 1100
+        simd16scalar temp0 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask0), tempc, mask1);
+        simd16scalar temp1 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask2), tempc, mask0);
+        simd16scalar temp2 = _simd16_blend_ps(_simd16_blend_ps(tempa, tempb, mask1), tempc, mask2);
 
-        temp1 = _simd16_permute_ps_i(temp1, 0xB1);          // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp1 = _simd16_blend_ps(temp1, perm1, 0x6666);     // 0010 0010 0010 0010
-        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818);     // 0001 1000 0001 1000
-
-        temp2 = _simd16_permute_ps_i(temp2, 0xC6);          // (2, 1, 0, 3) => 01 10 00 11 => 0xC6
-        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1);// (1, 0, 3, 2) => 01 00 11 10 => 0xB1
-        temp2 = _simd16_blend_ps(temp2, perm2, 0x2222);     // 0100 0100 0100 0100
-        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E);// (2, 3, 0, 1) => 10 11 00 01 => 0x4E
-        v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C);     // 0011 1000 0011 1000
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
+#if KNOB_ARCH >= KNOB_ARCH_AVX2
         v0[i] = _simd16_permute_ps(temp0, perm0);
         v1[i] = _simd16_permute_ps(temp1, perm1);
         v2[i] = _simd16_permute_ps(temp2, perm2);
+#else // #if KNOB_ARCH == KNOB_ARCH_AVX
+
+        // the general permutes (above) are prohibitively slow to emulate on AVX (it's scalar code)
+
+        temp0 = _simd16_permute_ps_i(temp0, 0x6C);           // (0, 3, 2, 1) => 00 11 01 10 => 0x6C
+        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
+        temp0 = _simd16_blend_ps(temp0, perm0, 0x4444);      // 0010 0010 0010 0010
+        perm0 = _simd16_permute2f128_ps(temp0, temp0, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
+        v0[i] = _simd16_blend_ps(temp0, perm0, 0x3838);      // 0001 1100 0001 1100
+
+        temp1 = _simd16_permute_ps_i(temp1, 0xB1);           // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
+        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
+        temp1 = _simd16_blend_ps(temp1, perm1, 0x6666);      // 0110 0110 0110 0110
+        perm1 = _simd16_permute2f128_ps(temp1, temp1, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
+        v1[i] = _simd16_blend_ps(temp1, perm1, 0x1818);      // 0001 1000 0001 1000
+
+        temp2 = _simd16_permute_ps_i(temp2, 0xC6);           // (2, 1, 0, 3) => 01 10 00 11 => 0xC6
+        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0xB1); // (1, 0, 3, 2) => 01 00 11 10 => 0xB1
+        temp2 = _simd16_blend_ps(temp2, perm2, 0x2222);      // 0100 0100 0100 0100
+        perm2 = _simd16_permute2f128_ps(temp2, temp2, 0x4E); // (2, 3, 0, 1) => 10 11 00 01 => 0x4E
+        v2[i] = _simd16_blend_ps(temp2, perm2, 0x1C1C);      // 0011 1000 0011 1000
 #endif
     }
 
     SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriList0, PaTriListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
-    const simd16vector &c = PaGetSimdVector_simd16(pa, 2, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
+    const simd16vector& c = PaGetSimdVector_simd16(pa, 2, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -918,9 +1009,9 @@
     // hold at least 8 triangles worth of data. We want to assemble a single
     // triangle with data in horizontal form.
 
-    const simdvector &a = PaGetSimdVector(pa, 0, slot);
-    const simdvector &b = PaGetSimdVector(pa, 1, slot);
-    const simdvector &c = PaGetSimdVector(pa, 2, slot);
+    const simdvector& a = PaGetSimdVector(pa, 0, slot);
+    const simdvector& b = PaGetSimdVector(pa, 1, slot);
+    const simdvector& c = PaGetSimdVector(pa, 2, slot);
 
     // Convert from vertical to horizontal.
     // Tri Pattern - provoking vertex is always v0
@@ -977,7 +1068,7 @@
 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0);
-    return false;    // Not enough vertices to assemble 8 triangles.
+    return false; // Not enough vertices to assemble 8 triangles.
 }
 
 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -988,7 +1079,7 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -998,7 +1089,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1008,13 +1099,13 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
 
 #endif
     simdscalar s;
 
-    for(int i = 0; i < 4; ++i)
+    for (int i = 0; i < 4; ++i)
     {
         simdscalar a0 = a[i];
         simdscalar b0 = b[i];
@@ -1024,9 +1115,9 @@
         //  v1 -> 13355779
         //  v2 -> 22446688
         simdvector& v0 = verts[0];
-        v0[i] = a0;
+        v0[i]          = a0;
 
-        //  s -> 4567891011 
+        //  s -> 4567891011
         s = _simd_permute2f128_ps(a0, b0, 0x21);
         //  s -> 23456789
         s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
@@ -1044,50 +1135,59 @@
     return true;
 }
 
-#if  ENABLE_AVX512_SIMD16
+#if ENABLE_AVX512_SIMD16
 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0);
-    return false;    // Not enough vertices to assemble 16 triangles.
+    return false; // Not enough vertices to assemble 16 triangles.
 }
 
 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    // clang-format off
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
-    simd16vector &v2 = verts[2];
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+
+    const simd16mask mask0 = 0xF000;
 
     //  v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
     //  v1 -> a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
     //  v2 -> a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
 
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+    simd16vector& v2 = verts[2];
+
     // for simd16 x, y, z, and w
     for (int i = 0; i < 4; i += 1)
     {
-        simd16scalar perm0 = _simd16_permute2f128_ps(a[i], a[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
-        simd16scalar perm1 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
 
-        simd16scalar blend = _simd16_blend_ps(perm0, perm1, 0xF000);                                // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
-        simd16scalar shuff = _simd16_shuffle_ps(a[i], blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
+        simd16scalar perm0 = _simd16_permute2f128_ps(tempa, tempa, 0x39); // (0 3 2 1) = 00 11 10 01 // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF a0 a1 a2 a3
+        simd16scalar perm1 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
 
-        v0[i] = a[i];                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
-        v1[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
-        v2[i] = _simd16_shuffle_ps(a[i], shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
+        simd16scalar blend = _simd16_blend_ps(perm0, perm1, mask0);                                  // a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1 b2 b3
+        simd16scalar shuff = _simd16_shuffle_ps(tempa, blend, _MM_SHUFFLE(1, 0, 3, 2));              // a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0 b1
+
+        v0[i] = tempa;                                                                               // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
+        v1[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a3 a5 a5 a7 a7 a9 a9 aB aB aD aD aF aF b1
+        v2[i] = _simd16_shuffle_ps(tempa, shuff, _MM_SHUFFLE(2, 2, 2, 2));                           // a2 a2 a4 a4 a6 a6 a8 a8 aA aA aC aC aE aE b0 b0
     }
 
     SetNextPaState_simd16(pa, PaTriStrip1_simd16, PaTriStrip1, PaTriStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -1182,8 +1282,8 @@
         break;
     };
 #else
-    const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
 
     // Convert from vertical to horizontal.
     // Tri Pattern - provoking vertex is always v0
@@ -1240,7 +1340,7 @@
 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0);
-    return false;    // Not enough vertices to assemble 8 triangles.
+    return false; // Not enough vertices to assemble 8 triangles.
 }
 
 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -1250,11 +1350,11 @@
     simdvector a;
     simdvector b;
 
-    const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
+    const simd16vector& leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1266,7 +1366,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1278,15 +1378,15 @@
     }
 
 #else
-    const simdvector &leadVert = PaGetSimdVector(pa, pa.first, slot);
-    const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector& leadVert = PaGetSimdVector(pa, pa.first, slot);
+    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
 
 #endif
     simdscalar s;
 
     // need to fill vectors 1/2 with new verts, and v0 with anchor vert.
-    for(int i = 0; i < 4; ++i)
+    for (int i = 0; i < 4; ++i)
     {
         simdscalar a0 = a[i];
         simdscalar b0 = b[i];
@@ -1294,15 +1394,15 @@
         simdscalar comp = leadVert[i];
 
         simdvector& v0 = verts[0];
-        v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
-        v0[i] = _simd_permute2f128_ps(v0[i], comp, 0x00);
+        v0[i]          = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
+        v0[i]          = _simd_permute2f128_ps(v0[i], comp, 0x00);
 
         simdvector& v2 = verts[2];
-        s = _simd_permute2f128_ps(a0, b0, 0x21);
-        v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
+        s              = _simd_permute2f128_ps(a0, b0, 0x21);
+        v2[i]          = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
 
         simdvector& v1 = verts[1];
-        v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
+        v1[i]          = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
     }
 
     SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
@@ -1313,50 +1413,62 @@
 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0);
-    return false;    // Not enough vertices to assemble 16 triangles.
+    return false; // Not enough vertices to assemble 16 triangles.
 }
 
 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    // clang-format off
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
-    simd16vector &v2 = verts[2];
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
+
+    const simd16mask mask0 = 0xF000;
 
     //  v0 -> a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
     //  v1 -> b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
     //  v2 -> b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
 
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+    simd16vector& v2 = verts[2];
+
     // for simd16 x, y, z, and w
     for (uint32_t i = 0; i < 4; i += 1)
     {
-        simd16scalar shuff = _simd16_shuffle_ps(a[i], a[i], _MM_SHUFFLE(0, 0, 0, 0));               // a0 a0 a0 a0 a4 a4 a4 a4 a0 a0 a0 a0 a4 a4 a4 a4
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
+        simd16scalar tempc = _simd16_loadu_ps(reinterpret_cast<const float*>(&c[i]));
 
-        v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                        // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
+        simd16scalar shuff = _simd16_shuffle_ps(tempa, tempa, _MM_SHUFFLE(0, 0, 0, 0));              // a0 a0 a0 a0 a4 a4 a4 a4 a8 a8 a8 a8 aC aC aC aC
 
-        simd16scalar temp0 = _simd16_permute2f128_ps(b[i], b[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
-        simd16scalar temp1 = _simd16_permute2f128_ps(c[i], c[i], 0x39);  // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
+        v0[i] = _simd16_permute2f128_ps(shuff, shuff, 0x00);                                         // a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0 a0
 
-        simd16scalar blend = _simd16_blend_ps(temp0, temp1, 0xF000);                                // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
+        simd16scalar temp0 = _simd16_permute2f128_ps(tempb, tempb, 0x39); // (0 3 2 1) = 00 11 10 01 // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF b0 b1 b2 b3
+        simd16scalar temp1 = _simd16_permute2f128_ps(tempc, tempc, 0x39); // (0 3 2 1) = 00 11 10 01 // c4 c5 c6 c7 c8 c9 cA cB cC cD cE cF c0 c1 c2 c3
 
-        v2[i] = _simd16_shuffle_ps(b[i], blend, _MM_SHUFFLE(1, 0, 3, 2));                           // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
-        v1[i] = _simd16_shuffle_ps(b[i], v2[i], _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
+        simd16scalar blend = _simd16_blend_ps(temp0, temp1, mask0);                                  // b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1 c2 c3
+
+        simd16scalar temp2 = _simd16_shuffle_ps(tempb, blend, _MM_SHUFFLE(1, 0, 3, 2));              // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
+
+        v1[i] = _simd16_shuffle_ps(tempb, temp2, _MM_SHUFFLE(2, 1, 2, 1));                           // b1 b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0
+        v2[i] = temp2;                                                                               // b2 b3 b4 b5 b6 b7 b8 b9 bA bB bC bD bE bF c0 c1
     }
 
     SetNextPaState_simd16(pa, PaTriFan1_simd16, PaTriFan1, PaTriFanSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.first, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& c = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -1390,9 +1502,9 @@
         verts[2] = swizzleLaneN(c, primIndex - 14);
     }
 #else
-    const simdvector &a = PaGetSimdVector(pa, pa.first, slot);
-    const simdvector &b = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector &c = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector& a = PaGetSimdVector(pa, pa.first, slot);
+    const simdvector& b = PaGetSimdVector(pa, pa.prev, slot);
+    const simdvector& c = PaGetSimdVector(pa, pa.cur, slot);
 
     // vert 0 from leading vertex
     verts[0] = swizzleLane0(a);
@@ -1422,7 +1534,7 @@
 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
-    return false;    // Not enough vertices to assemble 8 triangles.
+    return false; // Not enough vertices to assemble 8 triangles.
 }
 
 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -1433,7 +1545,7 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1443,7 +1555,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1453,13 +1565,13 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, 0, slot);
-    simdvector &b = PaGetSimdVector(pa, 1, slot);
+    simdvector& a = PaGetSimdVector(pa, 0, slot);
+    simdvector& b = PaGetSimdVector(pa, 1, slot);
 
 #endif
     simdscalar s1, s2;
 
-    for(int i = 0; i < 4; ++i)
+    for (int i = 0; i < 4; ++i)
     {
         simdscalar a0 = a[i];
         simdscalar b0 = b[i];
@@ -1468,13 +1580,13 @@
         s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
 
         simdvector& v0 = verts[0];
-        v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
+        v0[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
 
         simdvector& v1 = verts[1];
-        v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
+        v1[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
 
         simdvector& v2 = verts[2];
-        v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
+        v2[i]          = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
     }
 
     SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
@@ -1485,43 +1597,50 @@
 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaQuadList1_simd16, PaQuadList1, PaQuadListSingle0);
-    return false;    // Not enough vertices to assemble 16 triangles.
+    return false; // Not enough vertices to assemble 16 triangles.
 }
 
 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
+    // clang-format off
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
-    simd16vector &v2 = verts[2];
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
 
     //  v0 -> a0 a0 a4 a4 a8 a8 aC aC b0 b0 b0 b0 b0 b0 bC bC
     //  v1 -> a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
     //  v2 -> a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
 
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+    simd16vector& v2 = verts[2];
+
     // for simd16 x, y, z, and w
     for (uint32_t i = 0; i < 4; i += 1)
     {
-        simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) = 10 00 10 00  // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
-        simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) = 11 01 11 01  // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
 
-        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                          // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
-        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                          // a1 a2 a5 a6 a9 aA aD aE b1 b2 b6 b6 b9 bA bD bE
-        v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                          // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
+        simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) = 10 00 10 00 // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
+        simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) = 11 01 11 01 // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
+
+        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(0, 0, 0, 0));                           // a0 a0 a4 a4 a8 a8 aC aC b0 b0 b4 b4 b8 b8 bC bC
+        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 1, 2, 1));                           // a1 a2 a5 a6 a9 aA aD aE b1 b2 b5 b6 b9 bA bD bE
+        v2[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 2, 3, 2));                           // a2 a3 a6 a7 aA aB aE aF b2 b3 b6 b7 bA bB bE bF
     }
 
     SetNextPaState_simd16(pa, PaQuadList0_simd16, PaQuadList0, PaQuadListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -1628,8 +1747,8 @@
         break;
     }
 #else
-    const simdvector &a = PaGetSimdVector(pa, 0, slot);
-    const simdvector &b = PaGetSimdVector(pa, 1, slot);
+    const simdvector& a = PaGetSimdVector(pa, 0, slot);
+    const simdvector& b = PaGetSimdVector(pa, 1, slot);
 
     switch (primIndex)
     {
@@ -1703,7 +1822,7 @@
 #if USE_SIMD16_FRONTEND
         simdvector first;
 
-        const simd16vector &first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
+        const simd16vector& first_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
 
         if (!pa.useAlternateOffset)
         {
@@ -1721,14 +1840,14 @@
         }
 
 #else
-        simdvector &first = PaGetSimdVector(pa, pa.first, slot);
+        simdvector& first = PaGetSimdVector(pa, pa.first, slot);
 
 #endif
         for (int i = 0; i < 4; i++)
         {
-            float *firstVtx = (float *)&(first[i]);
-            float *targetVtx = (float *)&(verts[1][i]);
-            targetVtx[lane] = firstVtx[0];
+            float* firstVtx  = (float*)&(first[i]);
+            float* targetVtx = (float*)&(verts[1][i]);
+            targetVtx[lane]  = firstVtx[0];
         }
     }
 
@@ -1752,17 +1871,18 @@
         // loop reconnect now
         const int lane = pa.numPrims - pa.numPrimsComplete - 1;
 
-        const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
+        const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
 
         for (int i = 0; i < 4; i++)
         {
-            float *firstVtx = (float *)&(first[i]);
-            float *targetVtx = (float *)&(verts[1][i]);
-            targetVtx[lane] = firstVtx[0];
+            float* firstVtx  = (float*)&(first[i]);
+            float* targetVtx = (float*)&(verts[1][i]);
+            targetVtx[lane]  = firstVtx[0];
         }
     }
 
-    SetNextPaState_simd16(pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
+    SetNextPaState_simd16(
+        pa, PaLineLoop1_simd16, PaLineLoop1, PaLineLoopSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
     return true;
 }
 
@@ -1774,11 +1894,11 @@
     if (pa.numPrimsComplete + primIndex == pa.numPrims - 1)
     {
 #if USE_SIMD16_FRONTEND
-        const simd16vector &first = PaGetSimdVector_simd16(pa, pa.first, slot);
+        const simd16vector& first = PaGetSimdVector_simd16(pa, pa.first, slot);
 
         verts[1] = swizzleLane0(first);
 #else
-        const simdvector &first = PaGetSimdVector(pa, pa.first, slot);
+        const simdvector& first = PaGetSimdVector(pa, pa.first, slot);
 
         verts[1] = swizzleLane0(first);
 #endif
@@ -1788,7 +1908,7 @@
 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaLineList1, PaLineListSingle0);
-    return false;    // Not enough vertices to assemble 8 lines
+    return false; // Not enough vertices to assemble 8 lines
 }
 
 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -1799,7 +1919,7 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1809,7 +1929,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -1819,8 +1939,8 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, 0, slot);
-    simdvector &b = PaGetSimdVector(pa, 1, slot);
+    simdvector& a = PaGetSimdVector(pa, 0, slot);
+    simdvector& b = PaGetSimdVector(pa, 1, slot);
 
 #endif
     /// @todo: verify provoking vertex is correct
@@ -1852,40 +1972,47 @@
 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaLineList1_simd16, PaLineList1, PaLineListSingle0);
-    return false;    // Not enough vertices to assemble 16 lines
+    return false; // Not enough vertices to assemble 16 lines
 }
 
 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
+    // clang-format off
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
 
     // v0 -> a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
     // v1 -> a1 a3 a5 a7 a9 aB aD aF b1 b3 b4 b7 b9 bB bD bF
 
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+
     // for simd16 x, y, z, and w
     for (int i = 0; i < 4; i += 1)
     {
-        simd16scalar temp0 = _simd16_permute2f128_ps(a[i], b[i], 0x88); // (2 0 2 0) 10 00 10 00    // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b9 b9 bA bB
-        simd16scalar temp1 = _simd16_permute2f128_ps(a[i], b[i], 0xDD); // (3 1 3 1) 11 01 11 01    // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
 
-        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                          // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
-        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                          // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
+        simd16scalar temp0 = _simd16_permute2f128_ps(tempa, tempb, 0x88); // (2 0 2 0) 10 00 10 00   // a0 a1 a2 a3 a8 a9 aA aB b0 b1 b2 b3 b8 b9 bA bB
+        simd16scalar temp1 = _simd16_permute2f128_ps(tempa, tempb, 0xDD); // (3 1 3 1) 11 01 11 01   // a4 a5 a6 a7 aC aD aE aF b4 b5 b6 b7 bC bD bE bF
+
+        v0[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));                           // a0 a2 a4 a6 a8 aA aC aE b0 b2 b4 b6 b8 bA bC bE
+        v1[i] = _simd16_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));                           // a1 a3 a5 a7 a9 aB aD aF b1 b3 b5 b7 b9 bB bD bF
     }
 
     SetNextPaState_simd16(pa, PaLineList0_simd16, PaLineList0, PaLineListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, 1, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, 1, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -1960,8 +2087,8 @@
         break;
     }
 #else
-    const simdvector &a = PaGetSimdVector(pa, 0, slot);
-    const simdvector &b = PaGetSimdVector(pa, 1, slot);
+    const simdvector& a = PaGetSimdVector(pa, 0, slot);
+    const simdvector& b = PaGetSimdVector(pa, 1, slot);
 
     switch (primIndex)
     {
@@ -2004,7 +2131,7 @@
 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0);
-    return false;    // Not enough vertices to assemble 8 lines
+    return false; // Not enough vertices to assemble 8 lines
 }
 
 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
@@ -2015,7 +2142,7 @@
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, pa.prev, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2025,7 +2152,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2035,8 +2162,8 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+    simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
 
 #endif
     /// @todo: verify provoking vertex is correct
@@ -2049,7 +2176,7 @@
 
     verts[0] = a;
 
-    for(uint32_t i = 0; i < 4; ++i)
+    for (uint32_t i = 0; i < 4; ++i)
     {
         // 1 2 3 x 5 6 7 x
         __m256 vPermA = _mm256_permute_ps(a.v[i], 0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
@@ -2070,42 +2197,51 @@
 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0);
-    return false;    // Not enough vertices to assemble 16 lines
+    return false; // Not enough vertices to assemble 16 lines
 }
 
 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
+    // clang-format off
+
     const simd16scalari perm = _simd16_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
 
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
-    simd16vector &v0 = verts[0];
-    simd16vector &v1 = verts[1];
+    const simd16mask mask0 = 0x0001;
 
     // v0 -> a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
     // v1 -> a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
 
-    v0 = a;                                                                                         // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
+    simd16vector& v0 = verts[0];
+    simd16vector& v1 = verts[1];
+
+    v0 = a; // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
 
     // for simd16 x, y, z, and w
     for (int i = 0; i < 4; i += 1)
     {
-        simd16scalar temp = _simd16_blend_ps(a[i], b[i], 0x0001);                                   // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
+        simd16scalar tempa = _simd16_loadu_ps(reinterpret_cast<const float*>(&a[i]));
+        simd16scalar tempb = _simd16_loadu_ps(reinterpret_cast<const float*>(&b[i]));
 
-        v1[i] = _simd16_permute_ps(temp, perm);                                                     // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
+        simd16scalar temp = _simd16_blend_ps(tempa, tempb, mask0); // b0 a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF
+
+        v1[i] = _simd16_permute_ps(temp, perm);                    // a1 a2 a3 a4 a5 a6 a7 a8 a9 aA aB aC aD aE aF b0
     }
 
     SetNextPaState_simd16(pa, PaLineStrip1_simd16, PaLineStrip1, PaLineStripSingle0, 0, PA_STATE_OPT::SIMD_WIDTH);
     return true;
+
+    // clang-format on
 }
 
 #endif
 void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
-    const simd16vector &b = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, pa.prev, slot);
+    const simd16vector& b = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -2180,8 +2316,8 @@
         break;
     }
 #else
-    const simdvector &a = PaGetSimdVector(pa, pa.prev, slot);
-    const simdvector &b = PaGetSimdVector(pa, pa.cur, slot);
+    const simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+    const simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
 
     switch (primIndex)
     {
@@ -2226,7 +2362,7 @@
 #if USE_SIMD16_FRONTEND
     simdvector a;
 
-    const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
     if (!pa.useAlternateOffset)
     {
@@ -2244,10 +2380,10 @@
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, 0, slot);
+    simdvector& a = PaGetSimdVector(pa, 0, slot);
 
 #endif
-    verts[0] = a;  // points only have 1 vertex.
+    verts[0] = a; // points only have 1 vertex.
 
     SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
@@ -2256,11 +2392,12 @@
 #if ENABLE_AVX512_SIMD16
 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
-    simd16vector &a = PaGetSimdVector_simd16(pa, pa.cur, slot);
+    simd16vector& a = PaGetSimdVector_simd16(pa, pa.cur, slot);
 
-    verts[0] = a;  // points only have 1 vertex.
+    verts[0] = a; // points only have 1 vertex.
 
-    SetNextPaState_simd16(pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
+    SetNextPaState_simd16(
+        pa, PaPoints0_simd16, PaPoints0, PaPointsSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
 }
 
@@ -2268,7 +2405,7 @@
 void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
-    const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
+    const simd16vector& a = PaGetSimdVector_simd16(pa, 0, slot);
 
     if (pa.useAlternateOffset)
     {
@@ -2277,7 +2414,7 @@
 
     verts[0] = swizzleLaneN(a, primIndex);
 #else
-    const simdvector &a = PaGetSimdVector(pa, 0, slot);
+    const simdvector& a = PaGetSimdVector(pa, 0, slot);
 
     verts[0] = swizzleLaneN(a, primIndex);
 #endif
@@ -2291,7 +2428,7 @@
     SetNextPaState(pa, PaRectList1, PaRectListSingle0);
     return false;
 }
- 
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief State 1 for RECT_LIST topology.
 ///   Rect lists has the following format.
@@ -2300,16 +2437,16 @@
 ///         | \ |      | \ |      | \ |       | \ |
 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
 ///            v0         v3         v6          v9
-/// 
+///
 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
-/// 
+///
 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
 ///   etc.
-/// 
+///
 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
 ///   where v0 contains all the first vertices for 8 triangles.
-/// 
+///
 ///     Result:
 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
@@ -2317,20 +2454,18 @@
 ///
 /// @param pa - State for PA state machine.
 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
-bool PaRectList1(
-    PA_STATE_OPT& pa,
-    uint32_t slot,
-    simdvector verts[])
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
+/// etc.
+bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
-    // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
+// SIMD vectors a and b are the last two vertical outputs from the vertex shader.
 #if USE_SIMD16_FRONTEND
     simdvector a;
     simdvector b;
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2340,54 +2475,60 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
             a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);;
+            b[i] = _simd16_extract_ps(b_16[i], 1);
+            ;
         }
     }
 
 #else
-    simdvector &a = PaGetSimdVector(pa, 0, slot);           // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
-    simdvector &b = PaGetSimdVector(pa, 1, slot);           // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
+    simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7 }
+    simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
 
 #endif
     __m256 tmp0, tmp1, tmp2;
 
     // Loop over each component in the simdvector.
-    for(int i = 0; i < 4; ++i)
+    for (int i = 0; i < 4; ++i)
     {
-        simdvector& v0 = verts[0];                          // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
-        tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
-        v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
-        tmp1  = _mm256_permute_ps(v0[i], 0xF0);             // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
-        v0[i] = _mm256_permute_ps(v0[i], 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
-        v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
+        simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+        tmp0           = _mm256_permute2f128_ps(
+            b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
+        v0[i] = _mm256_blend_ps(
+            a[i],
+            tmp0,
+            0x20); //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6, * } where * is don't care.
+        tmp1  = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *, * }
+        v0[i] = _mm256_permute_ps(v0[i], 0x5A); //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
+        v0[i] =
+            _mm256_blend_ps(tmp1, v0[i], 0xF0); //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
 
         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
         ///      AVX2 should make this much cheaper.
-        simdvector& v1 = verts[1];                          // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-        v1[i] = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
-        tmp1  = _mm256_permute_ps(a[i], 0x43);              // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
-        tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);         // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
-        tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);    // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
-        v1[i] = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
-        v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
-        v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+        simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+        v1[i]          = _mm256_permute_ps(a[i], 0x09);  //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
+        tmp1           = _mm256_permute_ps(a[i], 0x43);  // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
+        tmp2  = _mm256_blend_ps(v1[i], tmp1, 0xF0);      // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
+        tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7,  *, v4,  v5, *, *,  *,  * }
+        v1[i] = _mm256_permute_ps(tmp0, 0xE0);      //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
+        v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
+        v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
 
         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
-        simdvector& v2 = verts[2];                          // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
-        v2[i] = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
-        tmp1  = _mm256_permute_ps(tmp2, 0x31);              // tmp1 = { v2, *, v5, *, *, *, *, * }
-        v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
+        simdvector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
+        v2[i]          = _mm256_permute_ps(tmp0, 0x30); //   v2 = { *, *, *, *, v8, *, v11, * }
+        tmp1           = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
+        v2[i]          = _mm256_blend_ps(tmp1, v2[i], 0xF0);
 
         // Need to compute 4th implied vertex for the rectangle.
         tmp2  = _mm256_sub_ps(v0[i], v1[i]);
-        tmp2  = _mm256_add_ps(tmp2, v2[i]);                 // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
-        tmp2  = _mm256_permute_ps(tmp2, 0xA0);              // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
-        v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
+        tmp2  = _mm256_add_ps(tmp2, v2[i]);         // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
+        tmp2  = _mm256_permute_ps(tmp2, 0xA0);      // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
+        v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
     }
 
     SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
@@ -2399,11 +2540,9 @@
 ///        Not implemented unless there is a use case for more then 8 rects.
 /// @param pa - State for PA state machine.
 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
-bool PaRectList2(
-    PA_STATE_OPT& pa,
-    uint32_t slot,
-    simdvector verts[])
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
+/// etc.
+bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
 {
     SWR_INVALID("Is rect list used for anything other then clears?");
     SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
@@ -2428,16 +2567,16 @@
 ///         | \ |      | \ |      | \ |       | \ |
 ///      v1 o---o   v4 o---o   v7 o---o   v10 o---o
 ///            v0         v3         v6          v9
-/// 
+///
 ///   Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
-/// 
+///
 ///   tri0 = { v0, v1, v2 }  tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
 ///   tri2 = { v3, v4, v5 }  tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
 ///   etc.
-/// 
+///
 ///   PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
 ///   where v0 contains all the first vertices for 8 triangles.
-/// 
+///
 ///     Result:
 ///      verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
 ///      verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
@@ -2445,18 +2584,19 @@
 ///
 /// @param pa - State for PA state machine.
 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
-bool PaRectList1_simd16(
-    PA_STATE_OPT& pa,
-    uint32_t slot,
-    simd16vector verts[])
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
+/// etc.
+bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
+    // clang-format off
+
     simdvector a;
     simdvector b;
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7, v8, v9, v10, v11, v12, v13, v14, v15 }
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot); // a[] = { v0, v1,  v2,  v3,  v4,  v5,  v6,  v7,
+                                                                        //         v8, v9, v10, v11, v12, v13, v14, v15 }
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2466,7 +2606,7 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot); // b[] = { v16...but not used by this implementation.. }
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2475,45 +2615,45 @@
         }
     }
 
-    simd16vector &v0 = verts[0];                            // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
-    simd16vector &v1 = verts[1];                            // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-    simd16vector &v2 = verts[2];                            // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
+    simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6,  v9,  v9 }
+    simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+    simd16vector& v2 = verts[2]; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11,   z }
 
     // Loop over each component in the simdvector.
     for (int i = 0; i < 4; i += 1)
     {
-        simdscalar v0_lo;                                   // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
-        simdscalar v1_lo;                                   // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
-        simdscalar v2_lo;                                   // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
+        simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+        simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+        simdscalar v2_lo; // verts[2] needs to be { v2,  w, v5,  x, v8,  y, v11, z }
 
         __m256 tmp0, tmp1, tmp2;
 
-        tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01);    // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
-        v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);          //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,  * } where * is don't care.
-        tmp1 = _mm256_permute_ps(v0_lo, 0xF0);              // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,  *,  * }
-        v0_lo = _mm256_permute_ps(v0_lo, 0x5A);             //   v0 = {   *,   *,   *,   *,  v6, v6, v9, v9 }
-        v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);         //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9, v9 }
+        tmp0  = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
+        v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20);        //   v0 = {  v0,   *,   *,  v3,  *, v9,  v6,   * } where * is don't care.
+        tmp1  = _mm256_permute_ps(v0_lo, 0xF0);           // tmp1 = {  v0,  v0,  v3,  v3,  *,  *,   *,   * }
+        v0_lo = _mm256_permute_ps(v0_lo, 0x5A);           //   v0 = {   *,   *,   *,   *,  v6, v6, v9,  v9 }
+        v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0);       //   v0 = {  v0,  v0,  v3,  v3,  v6, v6, v9,  v9 }
 
         /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
         ///      AVX2 should make this much cheaper.
-        v1_lo = _mm256_permute_ps(a[i], 0x09);              //   v1 = { v1, v2,  *,  *,  *, *,  *, * }
-        tmp1 = _mm256_permute_ps(a[i], 0x43);               // tmp1 = {  *,  *,  *,  *, v7, *, v4, v5 }
-        tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0);          // tmp2 = { v1, v2,  *,  *, v7, *, v4, v5 }
-        tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);     // tmp1 = { v7,  *, v4,  v5, *  *,  *,  * }
-        v1_lo = _mm256_permute_ps(tmp0, 0xE0);              //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
-        v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);         //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
-        v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);         //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+        v1_lo = _mm256_permute_ps(a[i], 0x09);            //   v1 = { v1, v2,  *,  *,  *,  *,   *,   * }
+        tmp1  = _mm256_permute_ps(a[i], 0x43);            // tmp1 = {  *,  *,  *,  *, v7,  *,  v4,  v5 }
+        tmp2  = _mm256_blend_ps(v1_lo, tmp1, 0xF0);       // tmp2 = { v1, v2,  *,  *, v7,  *,  v4,  v5 }
+        tmp1  = _mm256_permute2f128_ps(tmp2, tmp2, 0x1);  // tmp1 = { v7,  *, v4,  v5, *,  *,   *,   * }
+        v1_lo = _mm256_permute_ps(tmp0, 0xE0);            //   v1 = {  *,  *,  *,  *,  *, v8, v10, v11 }
+        v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0);       //   v1 = { v1, v2,  *,  *, v7, v8, v10, v11 }
+        v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C);       //   v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
 
         // verts[2] = { v2,  w, v5,  x, v8,  y, v11, z }
-        v2_lo = _mm256_permute_ps(tmp0, 0x30);              //   v2 = { *, *, *, *, v8, *, v11, * }
-        tmp1 = _mm256_permute_ps(tmp2, 0x31);               // tmp1 = { v2, *, v5, *, *, *, *, * }
+        v2_lo = _mm256_permute_ps(tmp0, 0x30);            //   v2 = { *,  *,  *, *, v8, *, v11, * }
+        tmp1  = _mm256_permute_ps(tmp2, 0x31);            // tmp1 = { v2, *, v5, *,  *, *,   *, * }
         v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0);
 
         // Need to compute 4th implied vertex for the rectangle.
-        tmp2 = _mm256_sub_ps(v0_lo, v1_lo);
-        tmp2 = _mm256_add_ps(tmp2, v2_lo);                  // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
-        tmp2 = _mm256_permute_ps(tmp2, 0xA0);               // tmp2 = {  *,  w,  *, x, *,   y,  *,  z }
-        v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);         //   v2 = { v2,  w, v5, x, v8,  y, v11, z }
+        tmp2  = _mm256_sub_ps(v0_lo, v1_lo);
+        tmp2  = _mm256_add_ps(tmp2, v2_lo);               // tmp2 = {  w,  *,  x, *, y,  *,  z,  * }
+        tmp2  = _mm256_permute_ps(tmp2, 0xA0);            // tmp2 = {  *,  w,  *, x, *,  y,  *,  z }
+        v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA);       //   v2 = { v2,  w, v5, x, v8, y, v11, z }
 
         v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0);
         v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0);
@@ -2522,6 +2662,8 @@
 
     SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectList1, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
+
+    // clang-format on
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -2529,14 +2671,13 @@
 ///        Not implemented unless there is a use case for more then 8 rects.
 /// @param pa - State for PA state machine.
 /// @param slot - Index into VS output which is either a position (slot 0) or attribute.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
-bool PaRectList2_simd16(
-    PA_STATE_OPT& pa,
-    uint32_t slot,
-    simd16vector verts[])
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
+/// etc.
+bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
     SWR_INVALID("Is rect list used for anything other then clears?");
-    SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
+    SetNextPaState_simd16(
+        pa, PaRectList0_simd16, PaRectList0, PaRectListSingle0, 0, PA_STATE_OPT::SIMD_WIDTH, true);
     return true;
 }
 
@@ -2550,23 +2691,20 @@
 /// @param pa - State for PA state machine.
 /// @param slot - Index into VS output for a given attribute.
 /// @param primIndex - Binner processes each triangle individually.
-/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
-void PaRectListSingle0(
-    PA_STATE_OPT& pa,
-    uint32_t slot,
-    uint32_t primIndex,
-    simd4scalar verts[])
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1,
+/// etc.
+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
-    // We have 12 simdscalars contained within 3 simdvectors which
-    // hold at least 8 triangles worth of data. We want to assemble a single
-    // triangle with data in horizontal form.
+// We have 12 simdscalars contained within 3 simdvectors which
+// hold at least 8 triangles worth of data. We want to assemble a single
+// triangle with data in horizontal form.
 #if USE_SIMD16_FRONTEND
     simdvector a;
     simdvector b;
 
     if (!pa.useAlternateOffset)
     {
-        const simd16vector &a_16 = PaGetSimdVector_simd16(pa, 0, slot);
+        const simd16vector& a_16 = PaGetSimdVector_simd16(pa, 0, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
@@ -2576,12 +2714,13 @@
     }
     else
     {
-        const simd16vector &b_16 = PaGetSimdVector_simd16(pa, 1, slot);
+        const simd16vector& b_16 = PaGetSimdVector_simd16(pa, 1, slot);
 
         for (uint32_t i = 0; i < 4; i += 1)
         {
             a[i] = _simd16_extract_ps(b_16[i], 0);
-            b[i] = _simd16_extract_ps(b_16[i], 1);;
+            b[i] = _simd16_extract_ps(b_16[i], 1);
+            ;
         }
     }
 
@@ -2590,7 +2729,7 @@
 
 #endif
     // Convert from vertical to horizontal.
-    switch(primIndex)
+    switch (primIndex)
     {
     case 0:
         verts[0] = swizzleLane0(a);
@@ -2613,10 +2752,17 @@
     };
 }
 
-PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, 
-    uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) : 
-    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
-    cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
+PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT*      in_pDC,
+                           uint32_t           in_numPrims,
+                           uint8_t*           pStream,
+                           uint32_t           in_streamSizeInVerts,
+                           uint32_t           in_vertexStride,
+                           bool               in_isStreaming,
+                           uint32_t           numVertsPerPrim,
+                           PRIMITIVE_TOPOLOGY topo) :
+    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim),
+    numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0),
+    counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
 {
     const API_STATE& state = GetApiState(pDC);
 
@@ -2628,271 +2774,271 @@
 #endif
     switch (this->binTopology)
     {
-        case TOP_TRIANGLE_LIST:
-            this->pfnPaFunc = PaTriList0;
+    case TOP_TRIANGLE_LIST:
+        this->pfnPaFunc = PaTriList0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaTriList0_simd16;
+        this->pfnPaFunc_simd16 = PaTriList0_simd16;
 #endif
-            break;
-        case TOP_TRIANGLE_STRIP:
-            this->pfnPaFunc = PaTriStrip0;
+        break;
+    case TOP_TRIANGLE_STRIP:
+        this->pfnPaFunc = PaTriStrip0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
+        this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
 #endif
-            break;
-        case TOP_TRIANGLE_FAN:
-            this->pfnPaFunc = PaTriFan0;
+        break;
+    case TOP_TRIANGLE_FAN:
+        this->pfnPaFunc = PaTriFan0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaTriFan0_simd16;
+        this->pfnPaFunc_simd16 = PaTriFan0_simd16;
 #endif
-            break;
-        case TOP_QUAD_LIST:
-            this->pfnPaFunc = PaQuadList0;
+        break;
+    case TOP_QUAD_LIST:
+        this->pfnPaFunc = PaQuadList0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaQuadList0_simd16;
+        this->pfnPaFunc_simd16 = PaQuadList0_simd16;
 #endif
-            this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
-            break;
-        case TOP_QUAD_STRIP:
-            // quad strip pattern when decomposed into triangles is the same as verts strips
-            this->pfnPaFunc = PaTriStrip0;
+        this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
+        break;
+    case TOP_QUAD_STRIP:
+        // quad strip pattern when decomposed into triangles is the same as a triangle strip
+        this->pfnPaFunc = PaTriStrip0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
+        this->pfnPaFunc_simd16 = PaTriStrip0_simd16;
 #endif
-            this->numPrims = in_numPrims * 2;    // Convert quad primitives into triangles
-            break;
-        case TOP_LINE_LIST:
-            this->pfnPaFunc = PaLineList0;
+        this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles
+        break;
+    case TOP_LINE_LIST:
+        this->pfnPaFunc = PaLineList0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaLineList0_simd16;
+        this->pfnPaFunc_simd16 = PaLineList0_simd16;
 #endif
-            this->numPrims = in_numPrims;
-            break;
-        case TOP_LINE_STRIP:
-            this->pfnPaFunc = PaLineStrip0;
+        this->numPrims = in_numPrims;
+        break;
+    case TOP_LINE_STRIP:
+        this->pfnPaFunc = PaLineStrip0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
+        this->pfnPaFunc_simd16 = PaLineStrip0_simd16;
 #endif
-            this->numPrims = in_numPrims;
-            break;
-        case TOP_LINE_LOOP:
-            this->pfnPaFunc = PaLineLoop0;
+        this->numPrims = in_numPrims;
+        break;
+    case TOP_LINE_LOOP:
+        this->pfnPaFunc = PaLineLoop0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
+        this->pfnPaFunc_simd16 = PaLineLoop0_simd16;
 #endif
-            this->numPrims = in_numPrims;
-            break;
-        case TOP_POINT_LIST:
-            this->pfnPaFunc = PaPoints0;
+        this->numPrims = in_numPrims;
+        break;
+    case TOP_POINT_LIST:
+        this->pfnPaFunc = PaPoints0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPoints0_simd16;
+        this->pfnPaFunc_simd16 = PaPoints0_simd16;
 #endif
-            this->numPrims = in_numPrims;
-            break;
-        case TOP_RECT_LIST:
-            this->pfnPaFunc = PaRectList0;
+        this->numPrims = in_numPrims;
+        break;
+    case TOP_RECT_LIST:
+        this->pfnPaFunc = PaRectList0;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaRectList0_simd16;
+        this->pfnPaFunc_simd16 = PaRectList0_simd16;
 #endif
-            this->numPrims = in_numPrims * 2;
-            break;
+        this->numPrims = in_numPrims * 2;
+        break;
 
-        case TOP_PATCHLIST_1:
-            this->pfnPaFunc = PaPatchList<1>;
+    case TOP_PATCHLIST_1:
+        this->pfnPaFunc = PaPatchList<1>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<1>;
 #endif
-            break;
-        case TOP_PATCHLIST_2:
-            this->pfnPaFunc = PaPatchList<2>;
+        break;
+    case TOP_PATCHLIST_2:
+        this->pfnPaFunc = PaPatchList<2>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<2>;
 #endif
-            break;
-        case TOP_PATCHLIST_3:
-            this->pfnPaFunc = PaPatchList<3>;
+        break;
+    case TOP_PATCHLIST_3:
+        this->pfnPaFunc = PaPatchList<3>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<3>;
 #endif
-            break;
-        case TOP_PATCHLIST_4:
-            this->pfnPaFunc = PaPatchList<4>;
+        break;
+    case TOP_PATCHLIST_4:
+        this->pfnPaFunc = PaPatchList<4>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<4>;
 #endif
-            break;
-        case TOP_PATCHLIST_5:
-            this->pfnPaFunc = PaPatchList<5>;
+        break;
+    case TOP_PATCHLIST_5:
+        this->pfnPaFunc = PaPatchList<5>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<5>;
 #endif
-            break;
-        case TOP_PATCHLIST_6:
-            this->pfnPaFunc = PaPatchList<6>;
+        break;
+    case TOP_PATCHLIST_6:
+        this->pfnPaFunc = PaPatchList<6>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<6>;
 #endif
-            break;
-        case TOP_PATCHLIST_7:
-            this->pfnPaFunc = PaPatchList<7>;
+        break;
+    case TOP_PATCHLIST_7:
+        this->pfnPaFunc = PaPatchList<7>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<7>;
 #endif
-            break;
-        case TOP_PATCHLIST_8:
-            this->pfnPaFunc = PaPatchList<8>;
+        break;
+    case TOP_PATCHLIST_8:
+        this->pfnPaFunc = PaPatchList<8>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<8>;
 #endif
-            break;
-        case TOP_PATCHLIST_9:
-            this->pfnPaFunc = PaPatchList<9>;
+        break;
+    case TOP_PATCHLIST_9:
+        this->pfnPaFunc = PaPatchList<9>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<9>;
 #endif
-            break;
-        case TOP_PATCHLIST_10:
-            this->pfnPaFunc = PaPatchList<10>;
+        break;
+    case TOP_PATCHLIST_10:
+        this->pfnPaFunc = PaPatchList<10>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<10>;
 #endif
-            break;
-        case TOP_PATCHLIST_11:
-            this->pfnPaFunc = PaPatchList<11>;
+        break;
+    case TOP_PATCHLIST_11:
+        this->pfnPaFunc = PaPatchList<11>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<11>;
 #endif
-            break;
-        case TOP_PATCHLIST_12:
-            this->pfnPaFunc = PaPatchList<12>;
+        break;
+    case TOP_PATCHLIST_12:
+        this->pfnPaFunc = PaPatchList<12>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<12>;
 #endif
-            break;
-        case TOP_PATCHLIST_13:
-            this->pfnPaFunc = PaPatchList<13>;
+        break;
+    case TOP_PATCHLIST_13:
+        this->pfnPaFunc = PaPatchList<13>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<13>;
 #endif
-            break;
-        case TOP_PATCHLIST_14:
-            this->pfnPaFunc = PaPatchList<14>;
+        break;
+    case TOP_PATCHLIST_14:
+        this->pfnPaFunc = PaPatchList<14>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<14>;
 #endif
-            break;
-        case TOP_PATCHLIST_15:
-            this->pfnPaFunc = PaPatchList<15>;
+        break;
+    case TOP_PATCHLIST_15:
+        this->pfnPaFunc = PaPatchList<15>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<15>;
 #endif
-            break;
-        case TOP_PATCHLIST_16:
-            this->pfnPaFunc = PaPatchList<16>;
+        break;
+    case TOP_PATCHLIST_16:
+        this->pfnPaFunc = PaPatchList<16>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<16>;
 #endif
-            break;
-        case TOP_PATCHLIST_17:
-            this->pfnPaFunc = PaPatchList<17>;
+        break;
+    case TOP_PATCHLIST_17:
+        this->pfnPaFunc = PaPatchList<17>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<17>;
 #endif
-            break;
-        case TOP_PATCHLIST_18:
-            this->pfnPaFunc = PaPatchList<18>;
+        break;
+    case TOP_PATCHLIST_18:
+        this->pfnPaFunc = PaPatchList<18>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<18>;
 #endif
-            break;
-        case TOP_PATCHLIST_19:
-            this->pfnPaFunc = PaPatchList<19>;
+        break;
+    case TOP_PATCHLIST_19:
+        this->pfnPaFunc = PaPatchList<19>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<19>;
 #endif
-            break;
-        case TOP_PATCHLIST_20:
-            this->pfnPaFunc = PaPatchList<20>;
+        break;
+    case TOP_PATCHLIST_20:
+        this->pfnPaFunc = PaPatchList<20>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<20>;
 #endif
-            break;
-        case TOP_PATCHLIST_21:
-            this->pfnPaFunc = PaPatchList<21>;
+        break;
+    case TOP_PATCHLIST_21:
+        this->pfnPaFunc = PaPatchList<21>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<21>;
 #endif
-            break;
-        case TOP_PATCHLIST_22:
-            this->pfnPaFunc = PaPatchList<22>;
+        break;
+    case TOP_PATCHLIST_22:
+        this->pfnPaFunc = PaPatchList<22>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<22>;
 #endif
-            break;
-        case TOP_PATCHLIST_23:
-            this->pfnPaFunc = PaPatchList<23>;
+        break;
+    case TOP_PATCHLIST_23:
+        this->pfnPaFunc = PaPatchList<23>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<23>;
 #endif
-            break;
-        case TOP_PATCHLIST_24:
-            this->pfnPaFunc = PaPatchList<24>;
+        break;
+    case TOP_PATCHLIST_24:
+        this->pfnPaFunc = PaPatchList<24>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<24>;
 #endif
-            break;
-        case TOP_PATCHLIST_25:
-            this->pfnPaFunc = PaPatchList<25>;
+        break;
+    case TOP_PATCHLIST_25:
+        this->pfnPaFunc = PaPatchList<25>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<25>;
 #endif
-            break;
-        case TOP_PATCHLIST_26:
-            this->pfnPaFunc = PaPatchList<26>;
+        break;
+    case TOP_PATCHLIST_26:
+        this->pfnPaFunc = PaPatchList<26>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<26>;
 #endif
-            break;
-        case TOP_PATCHLIST_27:
-            this->pfnPaFunc = PaPatchList<27>;
+        break;
+    case TOP_PATCHLIST_27:
+        this->pfnPaFunc = PaPatchList<27>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<27>;
 #endif
-            break;
-        case TOP_PATCHLIST_28:
-            this->pfnPaFunc = PaPatchList<28>;
+        break;
+    case TOP_PATCHLIST_28:
+        this->pfnPaFunc = PaPatchList<28>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<28>;
 #endif
-            break;
-        case TOP_PATCHLIST_29:
-            this->pfnPaFunc = PaPatchList<29>;
+        break;
+    case TOP_PATCHLIST_29:
+        this->pfnPaFunc = PaPatchList<29>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<29>;
 #endif
-            break;
-        case TOP_PATCHLIST_30:
-            this->pfnPaFunc = PaPatchList<30>;
+        break;
+    case TOP_PATCHLIST_30:
+        this->pfnPaFunc = PaPatchList<30>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<30>;
 #endif
-            break;
-        case TOP_PATCHLIST_31:
-            this->pfnPaFunc = PaPatchList<31>;
+        break;
+    case TOP_PATCHLIST_31:
+        this->pfnPaFunc = PaPatchList<31>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<31>;
 #endif
-            break;
-        case TOP_PATCHLIST_32:
-            this->pfnPaFunc = PaPatchList<32>;
+        break;
+    case TOP_PATCHLIST_32:
+        this->pfnPaFunc = PaPatchList<32>;
 #if ENABLE_AVX512_SIMD16
-            this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
+        this->pfnPaFunc_simd16 = PaPatchList_simd16<32>;
 #endif
-            break;
+        break;
 
-        default:
-            SWR_INVALID("Invalid topology: %d", this->binTopology);
-            break;
+    default:
+        SWR_INVALID("Invalid topology: %d", this->binTopology);
+        break;
     };
 
     this->pfnPaFuncReset = this->pfnPaFunc;
@@ -2902,95 +3048,94 @@
 
 #if USE_SIMD16_FRONTEND
     simd16scalari id16 = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-    simd16scalari id82 = _simd16_set_epi32( 7,  7,  6,  6,  5,  5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
+    simd16scalari id82 = _simd16_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0);
 
 #else
     simdscalari id8 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
     simdscalari id4 = _simd_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);
 
 #endif
-    switch(this->binTopology)
+    switch (this->binTopology)
     {
-        case TOP_TRIANGLE_LIST:
-        case TOP_TRIANGLE_STRIP:
-        case TOP_TRIANGLE_FAN:
-        case TOP_LINE_STRIP:
-        case TOP_LINE_LIST:
-        case TOP_LINE_LOOP:
+    case TOP_TRIANGLE_LIST:
+    case TOP_TRIANGLE_STRIP:
+    case TOP_TRIANGLE_FAN:
+    case TOP_LINE_STRIP:
+    case TOP_LINE_LIST:
+    case TOP_LINE_LOOP:
 #if USE_SIMD16_FRONTEND
-            this->primIDIncr = 16;
-            this->primID = id16;
+        this->primIDIncr = 16;
+        this->primID     = id16;
 #else
-            this->primIDIncr = 8;
-            this->primID = id8;
+        this->primIDIncr = 8;
+        this->primID = id8;
 #endif
-            break;
-        case TOP_QUAD_LIST:
-        case TOP_QUAD_STRIP:
-        case TOP_RECT_LIST:
+        break;
+    case TOP_QUAD_LIST:
+    case TOP_QUAD_STRIP:
+    case TOP_RECT_LIST:
 #if USE_SIMD16_FRONTEND
-            this->primIDIncr = 8;
-            this->primID = id82;
+        this->primIDIncr = 8;
+        this->primID     = id82;
 #else
-            this->primIDIncr = 4;
-            this->primID = id4;
+        this->primIDIncr = 4;
+        this->primID = id4;
 #endif
-            break;
-        case TOP_POINT_LIST:
+        break;
+    case TOP_POINT_LIST:
 #if USE_SIMD16_FRONTEND
-            this->primIDIncr = 16;
-            this->primID = id16;
+        this->primIDIncr = 16;
+        this->primID     = id16;
 #else
-            this->primIDIncr = 8;
-            this->primID = id8;
+        this->primIDIncr = 8;
+        this->primID = id8;
 #endif
-            break;
-        case TOP_PATCHLIST_1:
-        case TOP_PATCHLIST_2:
-        case TOP_PATCHLIST_3:
-        case TOP_PATCHLIST_4:
-        case TOP_PATCHLIST_5:
-        case TOP_PATCHLIST_6:
-        case TOP_PATCHLIST_7:
-        case TOP_PATCHLIST_8:
-        case TOP_PATCHLIST_9:
-        case TOP_PATCHLIST_10:
-        case TOP_PATCHLIST_11:
-        case TOP_PATCHLIST_12:
-        case TOP_PATCHLIST_13:
-        case TOP_PATCHLIST_14:
-        case TOP_PATCHLIST_15:
-        case TOP_PATCHLIST_16:
-        case TOP_PATCHLIST_17:
-        case TOP_PATCHLIST_18:
-        case TOP_PATCHLIST_19:
-        case TOP_PATCHLIST_20:
-        case TOP_PATCHLIST_21:
-        case TOP_PATCHLIST_22:
-        case TOP_PATCHLIST_23:
-        case TOP_PATCHLIST_24:
-        case TOP_PATCHLIST_25:
-        case TOP_PATCHLIST_26:
-        case TOP_PATCHLIST_27:
-        case TOP_PATCHLIST_28:
-        case TOP_PATCHLIST_29:
-        case TOP_PATCHLIST_30:
-        case TOP_PATCHLIST_31:
-        case TOP_PATCHLIST_32:
-            // Always run KNOB_SIMD_WIDTH number of patches at a time.
+        break;
+    case TOP_PATCHLIST_1:
+    case TOP_PATCHLIST_2:
+    case TOP_PATCHLIST_3:
+    case TOP_PATCHLIST_4:
+    case TOP_PATCHLIST_5:
+    case TOP_PATCHLIST_6:
+    case TOP_PATCHLIST_7:
+    case TOP_PATCHLIST_8:
+    case TOP_PATCHLIST_9:
+    case TOP_PATCHLIST_10:
+    case TOP_PATCHLIST_11:
+    case TOP_PATCHLIST_12:
+    case TOP_PATCHLIST_13:
+    case TOP_PATCHLIST_14:
+    case TOP_PATCHLIST_15:
+    case TOP_PATCHLIST_16:
+    case TOP_PATCHLIST_17:
+    case TOP_PATCHLIST_18:
+    case TOP_PATCHLIST_19:
+    case TOP_PATCHLIST_20:
+    case TOP_PATCHLIST_21:
+    case TOP_PATCHLIST_22:
+    case TOP_PATCHLIST_23:
+    case TOP_PATCHLIST_24:
+    case TOP_PATCHLIST_25:
+    case TOP_PATCHLIST_26:
+    case TOP_PATCHLIST_27:
+    case TOP_PATCHLIST_28:
+    case TOP_PATCHLIST_29:
+    case TOP_PATCHLIST_30:
+    case TOP_PATCHLIST_31:
+    case TOP_PATCHLIST_32:
+        // Always run KNOB_SIMD_WIDTH number of patches at a time.
 #if USE_SIMD16_FRONTEND
-            this->primIDIncr = 16;
-            this->primID = id16;
+        this->primIDIncr = 16;
+        this->primID     = id16;
 #else
-            this->primIDIncr = 8;
-            this->primID = id8;
+        this->primIDIncr = 8;
+        this->primID = id8;
 #endif
-            break;
+        break;
 
-        default:
-            SWR_INVALID("Invalid topology: %d", this->binTopology);
-            break;
+    default:
+        SWR_INVALID("Invalid topology: %d", this->binTopology);
+        break;
     };
-
 }
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 67c28ad..a392035 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file rasterizer.cpp
-*
-* @brief Implementation for the rasterizer.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rasterizer.cpp
+ *
+ * @brief Implementation for the rasterizer.
+ *
+ ******************************************************************************/
 
 #include <vector>
 #include <algorithm>
@@ -39,11 +39,12 @@
 #include "memory/tilingtraits.h"
 #include "rasterizer_impl.h"
 
-PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
+PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
+                              [STATE_VALID_TRI_EDGE_COUNT][2];
 
-void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
-    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData);
+    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pData);
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
     {
@@ -54,23 +55,24 @@
     // bloat line to two tris and call the triangle rasterizer twice
     RDTSC_BEGIN(BERasterizeLine, pDC->drawId);
 
-    const API_STATE &state = GetApiState(pDC);
-    const SWR_RASTSTATE &rastState = state.rastState;
+    const API_STATE&     state     = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState = state.rastState;
 
     // macrotile dimensioning
     uint32_t macroX, macroY;
     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
-    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+    const SWR_RECT& scissorInFixedPoint =
+        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
 
     // create a copy of the triangle buffer to write our adjusted vertices to
     OSALIGNSIMD(float) newTriBuffer[4 * 4];
     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer = &newTriBuffer[0];
+    newWorkDesc.pTriBuffer         = &newTriBuffer[0];
 
     // create a copy of the attrib buffer to write our adjusted attribs to
     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
@@ -81,20 +83,20 @@
 
     __m128 vX, vY, vZ, vRecipW;
 
-    vX = _mm_load_ps(workDesc.pTriBuffer);
-    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vX      = _mm_load_ps(workDesc.pTriBuffer);
+    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
 
     // triangle 0
     // v0,v1 -> v0,v0,v1
-    __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
-    __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0));
+    __m128 vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0));
     __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0));
 
     __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth);
-    __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0);
+    __m128 vAdjust    = _mm_mul_ps(vLineWidth, vBloat0);
     if (workDesc.triFlags.yMajor)
     {
         vXa = _mm_add_ps(vAdjust, vXa);
@@ -123,7 +125,7 @@
     }
 
     // Store user clip distances for triangle 0
-    float newClipBuffer[3 * 8];
+    float    newClipBuffer[3 * 8];
     uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
     if (numClipDist)
     {
@@ -151,8 +153,12 @@
     // setup triangle rasterizer function
     PFN_WORK_FUNC pfnTriRast;
     // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
-        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
+                                   rastState.bIsCenterPattern,
+                                   false,
+                                   SWR_INPUT_COVERAGE_NONE,
+                                   EdgeValToEdgeState(ALL_EDGES_VALID),
+                                   (pDC->pState->state.scissorsTileAligned == false));
 
     // make sure this macrotile intersects the triangle
     __m128i vXai = fpToFixedPoint(vXa);
@@ -160,23 +166,20 @@
     OSALIGNSIMD(SWR_RECT) bboxA;
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
-    if (!(bboxA.xmin > macroBoxRight ||
-        bboxA.xmin > scissorInFixedPoint.xmax ||
-        bboxA.xmax - 1 < macroBoxLeft ||
-        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-        bboxA.ymin > macroBoxBottom ||
-        bboxA.ymin > scissorInFixedPoint.ymax ||
-        bboxA.ymax - 1 < macroBoxTop ||
-        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+    if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
+          bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+          bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
+          bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
+    {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
 
     // triangle 1
     // v0,v1 -> v1,v1,v0
-    vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
-    vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
-    vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
+    vXa      = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1));
+    vYa      = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1));
+    vZa      = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1));
     vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1));
 
     vAdjust = _mm_mul_ps(vLineWidth, vBloat1);
@@ -233,14 +236,11 @@
     vYai = fpToFixedPoint(vYa);
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
-    if (!(bboxA.xmin > macroBoxRight ||
-        bboxA.xmin > scissorInFixedPoint.xmax ||
-        bboxA.xmax - 1 < macroBoxLeft ||
-        bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
-        bboxA.ymin > macroBoxBottom ||
-        bboxA.ymin > scissorInFixedPoint.ymax ||
-        bboxA.ymax - 1 < macroBoxTop ||
-        bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
+    if (!(bboxA.xmin > macroBoxRight || bboxA.xmin > scissorInFixedPoint.xmax ||
+          bboxA.xmax - 1 < macroBoxLeft || bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
+          bboxA.ymin > macroBoxBottom || bboxA.ymin > scissorInFixedPoint.ymax ||
+          bboxA.ymax - 1 < macroBoxTop || bboxA.ymax - 1 < scissorInFixedPoint.ymin))
+    {
         // rasterize triangle
         pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
     }
@@ -248,7 +248,7 @@
     RDTSC_BEGIN(BERasterizeLine, 1);
 }
 
-void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
@@ -257,21 +257,19 @@
     }
 #endif
 
-    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
-    const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
+    const BACKEND_FUNCS&      backendFuncs = pDC->pState->backendFuncs;
 
-    // map x,y relative offsets from start of raster tile to bit position in 
+    // map x,y relative offsets from start of raster tile to bit position in
     // coverage mask for the point
-    static const uint32_t coverageMap[8][8] = {
-        { 0, 1, 4, 5, 8, 9, 12, 13 },
-        { 2, 3, 6, 7, 10, 11, 14, 15 },
-        { 16, 17, 20, 21, 24, 25, 28, 29 },
-        { 18, 19, 22, 23, 26, 27, 30, 31 },
-        { 32, 33, 36, 37, 40, 41, 44, 45 },
-        { 34, 35, 38, 39, 42, 43, 46, 47 },
-        { 48, 49, 52, 53, 56, 57, 60, 61 },
-        { 50, 51, 54, 55, 58, 59, 62, 63 }
-    };
+    static const uint32_t coverageMap[8][8] = {{0, 1, 4, 5, 8, 9, 12, 13},
+                                               {2, 3, 6, 7, 10, 11, 14, 15},
+                                               {16, 17, 20, 21, 24, 25, 28, 29},
+                                               {18, 19, 22, 23, 26, 27, 30, 31},
+                                               {32, 33, 36, 37, 40, 41, 44, 45},
+                                               {34, 35, 38, 39, 42, 43, 46, 47},
+                                               {48, 49, 52, 53, 56, 57, 60, 61},
+                                               {50, 51, 54, 55, 58, 59, 62, 63}};
 
     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
 
@@ -279,7 +277,7 @@
     // @todo use structs for readability
     uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer;
     uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1);
-    float z = *(workDesc.pTriBuffer + 2);
+    float    z            = *(workDesc.pTriBuffer + 2);
 
     // construct triangle descriptor for point
     // no interpolation, set up i,j for constant interpolation of z and attribs
@@ -294,27 +292,32 @@
 
     // no persp divide needed for points
     triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs;
-    triDesc.triFlags = workDesc.triFlags;
-    triDesc.recipDet = 1.0f;
+    triDesc.triFlags                         = workDesc.triFlags;
+    triDesc.recipDet                         = 1.0f;
     triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f;
     triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f;
     triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f;
     triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z;
 
     RenderOutputBuffers renderBuffers;
-    GetRenderHotTiles(pDC, workerId, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, 
-        renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    GetRenderHotTiles(pDC,
+                      workerId,
+                      macroTile,
+                      tileAlignedX >> KNOB_TILE_X_DIM_SHIFT,
+                      tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT,
+                      renderBuffers,
+                      triDesc.triFlags.renderTargetArrayIndex);
 
     RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
     backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers);
     RDTSC_END(BEPixelBackend, 0);
 }
 
-void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData)
+void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData)
 {
-    const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData;
-    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
-    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
+    const TRIANGLE_WORK_DESC& workDesc     = *(const TRIANGLE_WORK_DESC*)pData;
+    const SWR_RASTSTATE&      rastState    = pDC->pState->state.rastState;
+    const SWR_BACKEND_STATE&  backendState = pDC->pState->state.backendState;
 
     bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0;
 
@@ -326,28 +329,28 @@
     // create a copy of the triangle buffer to write our adjusted vertices to
     OSALIGNSIMD(float) newTriBuffer[4 * 4];
     TRIANGLE_WORK_DESC newWorkDesc = workDesc;
-    newWorkDesc.pTriBuffer = &newTriBuffer[0];
+    newWorkDesc.pTriBuffer         = &newTriBuffer[0];
 
     // create a copy of the attrib buffer to write our adjusted attribs to
     OSALIGNSIMD(float) newAttribBuffer[4 * 3 * SWR_VTX_NUM_SLOTS];
     newWorkDesc.pAttribs = &newAttribBuffer[0];
 
     newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
-    newWorkDesc.numAttribs = workDesc.numAttribs;
-    newWorkDesc.triFlags = workDesc.triFlags;
+    newWorkDesc.numAttribs      = workDesc.numAttribs;
+    newWorkDesc.triFlags        = workDesc.triFlags;
 
     // construct two tris by bloating point by point size
     float halfPointSize = workDesc.triFlags.pointSize * 0.5f;
-    float lowerX = x - halfPointSize;
-    float upperX = x + halfPointSize;
-    float lowerY = y - halfPointSize;
-    float upperY = y + halfPointSize;
+    float lowerX        = x - halfPointSize;
+    float upperX        = x + halfPointSize;
+    float lowerY        = y - halfPointSize;
+    float upperY        = y + halfPointSize;
 
     // tri 0
-    float *pBuf = &newTriBuffer[0];
-    *pBuf++ = lowerX;
-    *pBuf++ = lowerX;
-    *pBuf++ = upperX;
+    float* pBuf = &newTriBuffer[0];
+    *pBuf++     = lowerX;
+    *pBuf++     = lowerX;
+    *pBuf++     = upperX;
     pBuf++;
     *pBuf++ = lowerY;
     *pBuf++ = upperY;
@@ -359,8 +362,12 @@
     // setup triangle rasterizer function
     PFN_WORK_FUNC pfnTriRast;
     // conservative rast not supported for points/lines
-    pfnTriRast = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, false,
-        SWR_INPUT_COVERAGE_NONE, EdgeValToEdgeState(ALL_EDGES_VALID), (pDC->pState->state.scissorsTileAligned == false));
+    pfnTriRast = GetRasterizerFunc(rastState.sampleCount,
+                                   rastState.bIsCenterPattern,
+                                   false,
+                                   SWR_INPUT_COVERAGE_NONE,
+                                   EdgeValToEdgeState(ALL_EDGES_VALID),
+                                   (pDC->pState->state.scissorsTileAligned == false));
 
     // overwrite texcoords for point sprites
     if (isPointSpriteTexCoordEnabled)
@@ -370,8 +377,8 @@
         newWorkDesc.pAttribs = &newAttribBuffer[0];
 
         // overwrite texcoord for point sprites
-        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
-        DWORD texCoordAttrib = 0;
+        uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
+        DWORD    texCoordAttrib = 0;
 
         while (_BitScanForward(&texCoordAttrib, texCoordMask))
         {
@@ -400,7 +407,7 @@
     pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
 
     // tri 1
-    pBuf = &newTriBuffer[0];
+    pBuf    = &newTriBuffer[0];
     *pBuf++ = lowerX;
     *pBuf++ = upperX;
     *pBuf++ = upperX;
@@ -412,8 +419,8 @@
 
     if (isPointSpriteTexCoordEnabled)
     {
-        uint32_t texCoordMask = backendState.pointSpriteTexCoordMask;
-        DWORD texCoordAttrib = 0;
+        uint32_t texCoordMask   = backendState.pointSpriteTexCoordMask;
+        DWORD    texCoordAttrib = 0;
 
         while (_BitScanForward(&texCoordAttrib, texCoordMask))
         {
@@ -424,7 +431,6 @@
                 pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0);
                 pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1);
                 pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1);
-
             }
             else
             {
@@ -444,20 +450,19 @@
 }
 
 // Selector for correct templated RasterizeTriangle function
-PFN_WORK_FUNC GetRasterizerFunc(
-    SWR_MULTISAMPLE_COUNT numSamples,
-    bool IsCenter,
-    bool IsConservative,
-    SWR_INPUT_COVERAGE InputCoverage,
-    uint32_t EdgeEnable,
-    bool RasterizeScissorEdges
-)
+PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
+                                bool                  IsCenter,
+                                bool                  IsConservative,
+                                SWR_INPUT_COVERAGE    InputCoverage,
+                                uint32_t              EdgeEnable,
+                                bool                  RasterizeScissorEdges)
 {
     SWR_ASSERT(numSamples >= 0 && numSamples < SWR_MULTISAMPLE_TYPE_COUNT);
     SWR_ASSERT(InputCoverage >= 0 && InputCoverage < SWR_INPUT_COVERAGE_COUNT);
     SWR_ASSERT(EdgeEnable < STATE_VALID_TRI_EDGE_COUNT);
 
-    PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage][EdgeEnable][RasterizeScissorEdges];
+    PFN_WORK_FUNC func = gRasterizerFuncs[numSamples][IsCenter][IsConservative][InputCoverage]
+                                         [EdgeEnable][RasterizeScissorEdges];
     SWR_ASSERT(func);
 
     return func;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
index 414d0f0..f15cc19 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file rasterizer.h
-*
-* @brief Definitions for the rasterizer.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rasterizer.h
+ *
+ * @brief Definitions for the rasterizer.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "context.h"
@@ -32,9 +32,9 @@
 #include "conservativeRast.h"
 #include "multisample.h"
 
-void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void RasterizeLine(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
+void RasterizeSimplePoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
+void RasterizeTriPoint(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pData);
 void InitRasterizerFunctions();
 
 INLINE
@@ -56,43 +56,43 @@
 
 enum TriEdgesValues
 {
-    NO_VALID_EDGES = 0,
-    E0_E1_VALID = 0x3,
-    E0_E2_VALID = 0x5,
-    E1_E2_VALID = 0x6,
+    NO_VALID_EDGES  = 0,
+    E0_E1_VALID     = 0x3,
+    E0_E2_VALID     = 0x5,
+    E1_E2_VALID     = 0x6,
     ALL_EDGES_VALID = 0x7,
     VALID_TRI_EDGE_COUNT,
 };
 
 // Selector for correct templated RasterizeTriangle function
-PFN_WORK_FUNC GetRasterizerFunc(
-    SWR_MULTISAMPLE_COUNT numSamples,
-    bool IsCenter,
-    bool IsConservative,
-    SWR_INPUT_COVERAGE InputCoverage,
-    uint32_t EdgeEnable,
-    bool RasterizeScissorEdges);
+PFN_WORK_FUNC GetRasterizerFunc(SWR_MULTISAMPLE_COUNT numSamples,
+                                bool                  IsCenter,
+                                bool                  IsConservative,
+                                SWR_INPUT_COVERAGE    InputCoverage,
+                                uint32_t              EdgeEnable,
+                                bool                  RasterizeScissorEdges);
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief ValidTriEdges convenience typedefs used for templated function 
+/// @brief ValidTriEdges convenience typedefs used for templated function
 /// specialization supported Fixed Point precisions
 typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> AllEdgesValidT;
-typedef std::integral_constant<uint32_t, E0_E1_VALID> E0E1ValidT;
-typedef std::integral_constant<uint32_t, E0_E2_VALID> E0E2ValidT;
-typedef std::integral_constant<uint32_t, E1_E2_VALID> E1E2ValidT;
-typedef std::integral_constant<uint32_t, NO_VALID_EDGES> NoEdgesValidT;
+typedef std::integral_constant<uint32_t, E0_E1_VALID>     E0E1ValidT;
+typedef std::integral_constant<uint32_t, E0_E2_VALID>     E0E2ValidT;
+typedef std::integral_constant<uint32_t, E1_E2_VALID>     E1E2ValidT;
+typedef std::integral_constant<uint32_t, NO_VALID_EDGES>  NoEdgesValidT;
 
 typedef std::integral_constant<uint32_t, STATE_ALL_EDGES_VALID> StateAllEdgesValidT;
-typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID> StateE0E1ValidT;
-typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID> StateE0E2ValidT;
-typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID> StateE1E2ValidT;
-typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES> StateNoEdgesValidT;
+typedef std::integral_constant<uint32_t, STATE_E0_E1_VALID>     StateE0E1ValidT;
+typedef std::integral_constant<uint32_t, STATE_E0_E2_VALID>     StateE0E2ValidT;
+typedef std::integral_constant<uint32_t, STATE_E1_E2_VALID>     StateE1E2ValidT;
+typedef std::integral_constant<uint32_t, STATE_NO_VALID_EDGES>  StateNoEdgesValidT;
 
 // some specializations to convert from edge state to edge bitmask values
 template <typename EdgeMask>
 struct EdgeMaskVal
 {
-    static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID, "Primary EdgeMaskVal shouldn't be instantiated");
+    static_assert(EdgeMask::value > STATE_ALL_EDGES_VALID,
+                  "Primary EdgeMaskVal shouldn't be instantiated");
 };
 
 template <>
@@ -128,15 +128,15 @@
 INLINE uint32_t EdgeValToEdgeState(uint32_t val)
 {
     SWR_ASSERT(val < VALID_TRI_EDGE_COUNT, "Unexpected tri edge mask");
-    static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = { 0, 0, 0, 1, 0, 2, 3, 4 };
-    return  edgeValToEdgeState[val];
+    static const uint32_t edgeValToEdgeState[VALID_TRI_EDGE_COUNT] = {0, 0, 0, 1, 0, 2, 3, 4};
+    return edgeValToEdgeState[val];
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @struct RasterScissorEdgesT
-/// @brief Primary RasterScissorEdgesT templated struct that holds compile 
-/// time information about the number of edges needed to be rasterized, 
-/// If either the scissor rect or conservative rast is enabled, 
+/// @brief Primary RasterScissorEdgesT templated struct that holds compile
+/// time information about the number of edges needed to be rasterized,
+/// If either the scissor rect or conservative rast is enabled,
 /// the scissor test is enabled and the rasterizer will test
 /// 3 triangle edges + 4 scissor edges for coverage.
 /// @tparam RasterScissorEdgesT: number of multisamples
@@ -145,20 +145,20 @@
 template <typename RasterScissorEdgesT, typename ConservativeT, typename EdgeMaskT>
 struct RasterEdgeTraits
 {
-    typedef std::true_type RasterizeScissorEdgesT;
+    typedef std::true_type                      RasterizeScissorEdgesT;
     typedef std::integral_constant<uint32_t, 7> NumEdgesT;
-    //typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
+    // typedef std::integral_constant<uint32_t, EdgeMaskT::value> ValidEdgeMaskT;
     typedef typename EdgeMaskVal<EdgeMaskT>::T ValidEdgeMaskT;
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief specialization of RasterEdgeTraits. If neither scissor rect
-/// nor conservative rast is enabled, only test 3 triangle edges 
+/// nor conservative rast is enabled, only test 3 triangle edges
 /// for coverage
 template <typename EdgeMaskT>
 struct RasterEdgeTraits<std::false_type, std::false_type, EdgeMaskT>
 {
-    typedef std::false_type RasterizeScissorEdgesT;
+    typedef std::false_type                     RasterizeScissorEdgesT;
     typedef std::integral_constant<uint32_t, 3> NumEdgesT;
     // no need for degenerate edge masking in non-conservative case; rasterize all triangle edges
     typedef std::integral_constant<uint32_t, ALL_EDGES_VALID> ValidEdgeMaskT;
@@ -166,45 +166,72 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @struct RasterizerTraits
-/// @brief templated struct that holds compile time information used 
+/// @brief templated struct that holds compile time information used
 /// during rasterization. Inherits EdgeTraits and ConservativeRastBETraits.
 /// @tparam NumSamplesT: number of multisamples
 /// @tparam ConservativeT: is this a conservative rasterization
 /// @tparam InputCoverageT: what type of input coverage is the PS expecting?
 /// (only used with conservative rasterization)
 /// @tparam RasterScissorEdgesT: do we need to rasterize with a scissor?
-template <typename NumSamplesT, typename CenterPatternT, typename ConservativeT, typename InputCoverageT, typename EdgeEnableT, typename RasterScissorEdgesT>
+template <typename NumSamplesT,
+          typename CenterPatternT,
+          typename ConservativeT,
+          typename InputCoverageT,
+          typename EdgeEnableT,
+          typename RasterScissorEdgesT>
 struct _RasterizerTraits : public ConservativeRastBETraits<ConservativeT, InputCoverageT>,
-                                public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
+                           public RasterEdgeTraits<RasterScissorEdgesT, ConservativeT, EdgeEnableT>
 {
-    typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value), CenterPatternT::value> MT;
+    typedef MultisampleTraits<static_cast<SWR_MULTISAMPLE_COUNT>(NumSamplesT::value),
+                              CenterPatternT::value>
+        MT;
 
     /// Fixed point precision the rasterizer is using
     typedef FixedPointTraits<Fixed_16_8> PrecisionT;
     /// Fixed point precision of the edge tests used during rasterization
     typedef FixedPointTraits<Fixed_X_16> EdgePrecisionT;
 
-    // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage test, with the result copied to all samples
-    typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples> NumCoverageSamplesT;
+    // If conservative rast or MSAA center pattern is enabled, only need a single sample coverage
+    // test, with the result copied to all samples
+    typedef std::integral_constant<int, ConservativeT::value ? 1 : MT::numCoverageSamples>
+        NumCoverageSamplesT;
 
-    static_assert(EdgePrecisionT::BitsT::value >=  ConservativeRastBETraits<ConservativeT, InputCoverageT>::ConservativePrecisionT::BitsT::value,
-                  "Rasterizer edge fixed point precision < required conservative rast precision");
+    static_assert(
+        EdgePrecisionT::BitsT::value >=
+            ConservativeRastBETraits<ConservativeT,
+                                     InputCoverageT>::ConservativePrecisionT::BitsT::value,
+        "Rasterizer edge fixed point precision < required conservative rast precision");
 
     /// constants used to offset between different types of raster tiles
-    static const int colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples};
-    static const int depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples};
-    static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MT::numSamples};
-    static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep};
-    static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep};
-    static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep};
+    static const int colorRasterTileStep{
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) *
+        MT::numSamples};
+    static const int depthRasterTileStep{
+        (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) *
+        MT::numSamples};
+    static const int stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
+                                            (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) *
+                                           MT::numSamples};
+    static const int colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
+                                            colorRasterTileStep};
+    static const int depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
+                                            depthRasterTileStep};
+    static const int stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) *
+                                              stencilRasterTileStep};
 };
 
-template <uint32_t NumSamplesT, uint32_t CenterPatternT, uint32_t ConservativeT, uint32_t InputCoverageT, uint32_t EdgeEnableT, uint32_t RasterScissorEdgesT>
-struct RasterizerTraits final : public _RasterizerTraits <
-    std::integral_constant<uint32_t, NumSamplesT>,
-    std::integral_constant<bool, CenterPatternT != 0>,
-    std::integral_constant<bool, ConservativeT != 0>,
-    std::integral_constant<uint32_t, InputCoverageT>,
-    std::integral_constant<uint32_t, EdgeEnableT>,
-    std::integral_constant<bool, RasterScissorEdgesT != 0> >
-{};
+template <uint32_t NumSamplesT,
+          uint32_t CenterPatternT,
+          uint32_t ConservativeT,
+          uint32_t InputCoverageT,
+          uint32_t EdgeEnableT,
+          uint32_t RasterScissorEdgesT>
+struct RasterizerTraits final
+    : public _RasterizerTraits<std::integral_constant<uint32_t, NumSamplesT>,
+                               std::integral_constant<bool, CenterPatternT != 0>,
+                               std::integral_constant<bool, ConservativeT != 0>,
+                               std::integral_constant<uint32_t, InputCoverageT>,
+                               std::integral_constant<uint32_t, EdgeEnableT>,
+                               std::integral_constant<bool, RasterScissorEdgesT != 0>>
+{
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
index ca39d7c..20206ea 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer_impl.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file rasterizer.cpp
-*
-* @brief Implementation for the rasterizer.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file rasterizer.cpp
+ *
+ * @brief Implementation for the rasterizer.
+ *
+ ******************************************************************************/
 
 #include <vector>
 #include <algorithm>
@@ -37,18 +37,29 @@
 #include "tilemgr.h"
 #include "memory/tilingtraits.h"
 
-extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT][STATE_VALID_TRI_EDGE_COUNT][2];
+extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPUT_COVERAGE_COUNT]
+                                     [STATE_VALID_TRI_EDGE_COUNT][2];
 
 template <uint32_t numSamples = 1>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex);
+void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
+                       uint32_t             workerId,
+                       uint32_t             macroID,
+                       uint32_t             x,
+                       uint32_t             y,
+                       RenderOutputBuffers& renderBuffers,
+                       uint32_t             renderTargetArrayIndex);
 template <typename RT>
-void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers);
+void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers);
 template <typename RT>
-void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow);
+void StepRasterTileY(uint32_t             colorHotTileMask,
+                     RenderOutputBuffers& buffers,
+                     RenderOutputBuffers& startBufferRow);
 
-#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3}
-static const __m256d gMaskToVecpd[] =
-{
+#define MASKTOVEC(i3, i2, i1, i0) \
+    {                             \
+        -i0, -i1, -i2, -i3        \
+    }
+static const __m256d gMaskToVecpd[] = {
     MASKTOVEC(0, 0, 0, 0),
     MASKTOVEC(0, 0, 0, 1),
     MASKTOVEC(0, 0, 1, 0),
@@ -74,11 +85,11 @@
 
 struct EDGE
 {
-    double a, b;                // a, b edge coefficients in fix8
-    double stepQuadX;           // step to adjacent horizontal quad in fix16
-    double stepQuadY;           // step to adjacent vertical quad in fix16
-    double stepRasterTileX;     // step to adjacent horizontal raster tile in fix16
-    double stepRasterTileY;     // step to adjacent vertical raster tile in fix16
+    double a, b;            // a, b edge coefficients in fix8
+    double stepQuadX;       // step to adjacent horizontal quad in fix16
+    double stepQuadY;       // step to adjacent vertical quad in fix16
+    double stepRasterTileX; // step to adjacent horizontal raster tile in fix16
+    double stepRasterTileY; // step to adjacent vertical raster tile in fix16
 
     __m256d vQuadOffsets;       // offsets for 4 samples of a quad
     __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile
@@ -86,12 +97,15 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief rasterize a raster tile partially covered by the triangle
-/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster tile
+/// @param vEdge0-2 - edge equations evaluated at sample pos at each of the 4 corners of a raster
+/// tile
 /// @param vA, vB - A & B coefs for each edge of the triangle (Ax + Bx + C)
 /// @param vStepQuad0-2 - edge equations evaluated at the UL corners of the 2x2 pixel quad.
 ///        Used to step between quads when sweeping over the raster tile.
-template<uint32_t NumEdges, typename EdgeMaskT>
-INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges)
+template <uint32_t NumEdges, typename EdgeMaskT>
+INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT* pDC,
+                                     double        startEdges[NumEdges],
+                                     EDGE*         pRastEdges)
 {
     uint64_t coverageMask = 0;
 
@@ -111,50 +125,49 @@
 
     // fast unrolled version for 8x8 tile
 #if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8
-    int edgeMask[NumEdges];
+    int      edgeMask[NumEdges];
     uint64_t mask;
 
-    auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);};
-    auto update_lambda = [&](int e){mask &= edgeMask[e];};
-    auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);};
-    auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);};
-    auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);};
+    auto eval_lambda   = [&](int e) { edgeMask[e] = _mm256_movemask_pd(vEdges[e]); };
+    auto update_lambda = [&](int e) { mask &= edgeMask[e]; };
+    auto incx_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); };
+    auto incy_lambda   = [&](int e) { vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]); };
+    auto decx_lambda   = [&](int e) { vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]); };
 
 // evaluate which pixels in the quad are covered
-#define EVAL \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
+#define EVAL UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(eval_lambda);
 
     // update coverage mask
     // if edge 0 is degenerate and will be skipped; init the mask
-#define UPDATE_MASK(bit) \
-            if(std::is_same<EdgeMaskT, E1E2ValidT>::value || std::is_same<EdgeMaskT, NoEdgesValidT>::value){\
-                mask = 0xf;\
-            }\
-            else{\
-                mask = edgeMask[0]; \
-            }\
-            UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
-            coverageMask |= (mask << bit);
+#define UPDATE_MASK(bit)                                                  \
+    if (std::is_same<EdgeMaskT, E1E2ValidT>::value ||                     \
+        std::is_same<EdgeMaskT, NoEdgesValidT>::value)                    \
+    {                                                                     \
+        mask = 0xf;                                                       \
+    }                                                                     \
+    else                                                                  \
+    {                                                                     \
+        mask = edgeMask[0];                                               \
+    }                                                                     \
+    UnrollerLMask<1, NumEdges, 1, EdgeMaskT::value>::step(update_lambda); \
+    coverageMask |= (mask << bit);
 
-    // step in the +x direction to the next quad 
-#define INCX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
+    // step in the +x direction to the next quad
+#define INCX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incx_lambda);
 
-    // step in the +y direction to the next quad 
-#define INCY \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
+    // step in the +y direction to the next quad
+#define INCY UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(incy_lambda);
 
-    // step in the -x direction to the next quad 
-#define DECX \
-            UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
+    // step in the -x direction to the next quad
+#define DECX UnrollerLMask<0, NumEdges, 1, EdgeMaskT::value>::step(decx_lambda);
 
-    // sweep 2x2 quad back and forth through the raster tile, 
+    // sweep 2x2 quad back and forth through the raster tile,
     // computing coverage masks for the entire tile
 
     // raster tile
-    // 0  1  2  3  4  5  6  7 
+    // 0  1  2  3  4  5  6  7
     // x  x
-    // x  x ------------------>  
+    // x  x ------------------>
     //                   x  x  |
     // <-----------------x  x  V
     // ..
@@ -173,7 +186,7 @@
     UPDATE_MASK(12);
     INCY;
 
-    //row 1
+    // row 1
     EVAL;
     UPDATE_MASK(28);
     DECX;
@@ -215,7 +228,7 @@
     UPDATE_MASK(48);
 #else
     uint32_t bit = 0;
-    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y)
+    for (uint32_t y = 0; y < KNOB_TILE_Y_DIM / 2; ++y)
     {
         __m256d vStartOfRowEdge[NumEdges];
         for (uint32_t e = 0; e < NumEdges; ++e)
@@ -223,7 +236,7 @@
             vStartOfRowEdge[e] = vEdges[e];
         }
 
-        for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x)
+        for (uint32_t x = 0; x < KNOB_TILE_X_DIM / 2; ++x)
         {
             int edgeMask[NumEdges];
             for (uint32_t e = 0; e < NumEdges; ++e)
@@ -243,7 +256,7 @@
             {
                 vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);
             }
-            bit+=4;
+            bit += 4;
         }
 
         // step to the next row
@@ -254,20 +267,19 @@
     }
 #endif
     return coverageMask;
-
 }
 // Top left rule:
 // Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge
-// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge
-// Top left: a sample is in if it is a top or left edge.
-// Out: !(horizontal && above) = !horizontal && below
-// Out: !horizontal && left = !(!horizontal && left) = horizontal and right 
-INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d &vEdge) 
+// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it
+// is a 'left' edge Top left: a sample is in if it is a top or left edge. Out: !(horizontal &&
+// above) = !horizontal && below Out: !horizontal && left = !(!horizontal && left) = horizontal and
+// right
+INLINE void adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, __m256d& vEdge)
 {
     // if vA < 0, vC--
     // if vA == 0 && vB < 0, vC--
 
-    __m256d vEdgeOut = vEdge;
+    __m256d vEdgeOut    = vEdge;
     __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0));
 
     // if vA < 0 (line is not horizontal and below)
@@ -275,7 +287,7 @@
 
     // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri)
     __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128());
-    int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
+    int     msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp));
     msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB));
 
     // if either of these are true and we're on the line (edge == 0), bump it outside the line
@@ -285,17 +297,19 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief calculates difference in precision between the result of manh
 /// calculation and the edge precision, based on compile time trait values
-template<typename RT>
+template <typename RT>
 constexpr int64_t ManhToEdgePrecisionAdjust()
 {
-    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+    static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
+                      RT::EdgePrecisionT::BitsT::value,
                   "Inadequate precision of result of manh calculation ");
-    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) - RT::EdgePrecisionT::BitsT::value);
+    return ((RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value) -
+            RT::EdgePrecisionT::BitsT::value);
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @struct adjustEdgeConservative
-/// @brief Primary template definition used for partially specializing 
+/// @brief Primary template definition used for partially specializing
 /// the adjustEdgeConservative function. This struct should never
 /// be instantiated.
 /// @tparam RT: rasterizer traits
@@ -306,38 +320,42 @@
     //////////////////////////////////////////////////////////////////////////
     /// @brief Performs calculations to adjust each edge of a triangle away
     /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-    /// direction. 
+    /// direction.
     ///
     /// Uncertainty regions arise from fixed point rounding, which
     /// can snap a vertex +/- by min fixed point value.
     /// Adding 1/2 pixel in x/y bumps the edge equation tests out towards the pixel corners.
-    /// This allows the rasterizer to test for coverage only at the pixel center, 
+    /// This allows the rasterizer to test for coverage only at the pixel center,
     /// instead of having to test individual pixel corners for conservative coverage
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
     {
-        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge away 
-        // from the pixel center (in the direction of the edge normal A/B)
+        // Assumes CCW winding order. Subtracting from the evaluated edge equation moves the edge
+        // away from the pixel center (in the direction of the edge normal A/B)
 
         // edge = Ax + Bx + C - (manh/e)
         // manh = manhattan distance = abs(A) + abs(B)
         // e = absolute rounding error from snapping from float to fixed point precision
 
-        // 'fixed point' multiply (in double to be avx1 friendly) 
+        // 'fixed point' multiply (in double to be avx1 friendly)
         // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example
-        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
-        __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
-                                     _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
+        __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)),
+                vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi));
+        __m256d manh =
+            _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)),
+                          _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)));
 
-        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value,
+        static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >=
+                          RT::EdgePrecisionT::BitsT::value,
                       "Inadequate precision of result of manh calculation ");
 
-        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision
-        // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right
+        // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the
+        // same precision since we're doing fixed math in double format, multiply by multiples of
+        // 1/2 instead of a bit shift right
         manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5));
 
-        // move the edge away from the pixel center by the required conservative precision + 1/2 pixel
-        // this allows the rasterizer to do a single conservative coverage test to see if the primitive
-        // intersects the pixel at all
+        // move the edge away from the pixel center by the required conservative precision + 1/2
+        // pixel this allows the rasterizer to do a single conservative coverage test to see if the
+        // primitive intersects the pixel at all
         vEdge = _mm256_sub_pd(vEdge, manh);
     };
 };
@@ -347,43 +365,51 @@
 template <typename RT>
 struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>>
 {
-    INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {};
+    INLINE adjustEdgeConservative(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge){};
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief calculates the distance a degenerate BBox needs to be adjusted 
+/// @brief calculates the distance a degenerate BBox needs to be adjusted
 /// for conservative rast based on compile time trait values
-template<typename RT>
+template <typename RT>
 constexpr int64_t ConservativeScissorOffset()
 {
-    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0, "Rasterizer precision > conservative precision");
-    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox when calculating scissor edges
-    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1> DegenerateEdgeOffsetT;
+    static_assert(RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value >= 0,
+                  "Rasterizer precision > conservative precision");
+    // if we have a degenerate triangle, we need to compensate for adjusting the degenerate BBox
+    // when calculating scissor edges
+    typedef std::integral_constant<int32_t, (RT::ValidEdgeMaskT::value == ALL_EDGES_VALID) ? 0 : 1>
+        DegenerateEdgeOffsetT;
     // 1/2 pixel edge offset + conservative offset - degenerateTriangle
-    return RT::ConservativeEdgeOffsetT::value - (DegenerateEdgeOffsetT::value << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
+    return RT::ConservativeEdgeOffsetT::value -
+           (DegenerateEdgeOffsetT::value
+            << (RT::ConservativePrecisionT::BitsT::value - RT::PrecisionT::BitsT::value));
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Performs calculations to adjust each a vector of evaluated edges out
 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
+/// direction.
 template <typename RT>
-INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge)
+INLINE void adjustScissorEdge(const double a, const double b, __m256d& vEdge)
 {
     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >> ManhToEdgePrecisionAdjust<RT>();
+    int64_t manh =
+        ((aabs * ConservativeScissorOffset<RT>()) + (babs * ConservativeScissorOffset<RT>())) >>
+        ManhToEdgePrecisionAdjust<RT>();
     vEdge = _mm256_sub_pd(vEdge, _mm256_set1_pd(manh));
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Performs calculations to adjust each a scalar evaluated edge out
 /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y
-/// direction. 
+/// direction.
 template <typename RT, typename OffsetT>
 INLINE double adjustScalarEdge(const double a, const double b, const double Edge)
 {
     int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b));
-    int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
+    int64_t manh =
+        ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>();
     return (Edge - manh);
 };
 
@@ -392,12 +418,14 @@
 template <typename RT, typename EdgeOffsetT>
 struct adjustEdgesFix16
 {
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
     {
-        static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
-                      "Edge equation expected to be in x.16 fixed point");
+        static_assert(
+            std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value,
+            "Edge equation expected to be in x.16 fixed point");
 
-        static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled");
+        static_assert(RT::IsConservativeT::value,
+                      "Edge offset assumes conservative rasterization is enabled");
 
         // need to apply any edge offsets before applying the top-left rule
         adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge);
@@ -411,7 +439,7 @@
 template <typename RT>
 struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>>
 {
-    INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge)
+    INLINE adjustEdgesFix16(const __m128i& vAi, const __m128i& vBi, __m256d& vEdge)
     {
         adjustTopLeftRuleIntFix16(vAi, vBi, vEdge);
     }
@@ -449,7 +477,8 @@
     return std::max(dzdx, dzdy);
 }
 
-INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
+INLINE float
+ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z)
 {
     if (pState->depthFormat == R24_UNORM_X8_TYPELESS)
     {
@@ -464,7 +493,7 @@
         SWR_ASSERT(pState->depthFormat == R32_FLOAT);
 
         // for f32 depth, factor = 2^(exponent(max(abs(z) - 23)
-        float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
+        float    zMax    = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2])));
         uint32_t zMaxInt = *(uint32_t*)&zMax;
         zMaxInt &= 0x7f800000;
         zMax = *(float*)&zMaxInt;
@@ -473,7 +502,8 @@
     }
 }
 
-INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
+INLINE float
+ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z)
 {
     if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0)
     {
@@ -512,7 +542,8 @@
 
 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
 // try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
+static THREAD
+OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * SWR_VTX_NUM_SLOTS * componentsPerAttrib];
 
 INLINE
 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
@@ -534,11 +565,13 @@
 
     __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8);
     __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8);
-    edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
+    edge.vQuadOffsets       = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16);
 
     // compute raster tile offsets
-    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0);
-    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0);
+    const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd(
+        (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1) * FIXED_POINT_SCALE, 0);
+    const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd(
+        (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1) * FIXED_POINT_SCALE, 0, 0);
 
     __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8);
     __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8);
@@ -552,30 +585,33 @@
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Primary template definition used for partially specializing 
-/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel 
+/// @brief Primary template definition used for partially specializing
+/// the UpdateEdgeMasks function. Offset evaluated edges from UL pixel
 /// corner to sample position, and test for coverage
 /// @tparam sampleCount: multisample count
 template <typename NumSamplesT>
-INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                            int32_t &mask0, int32_t &mask1, int32_t &mask2)
+INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3],
+                            const __m256d* vEdgeFix16,
+                            int32_t&       mask0,
+                            int32_t&       mask1,
+                            int32_t&       mask2)
 {
     __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
     // evaluate edge equations at the tile multisample bounding box
     vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
     vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
     vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
-    mask0 = _mm256_movemask_pd(vSampleBboxTest0);
-    mask1 = _mm256_movemask_pd(vSampleBboxTest1);
-    mask2 = _mm256_movemask_pd(vSampleBboxTest2);
+    mask0            = _mm256_movemask_pd(vSampleBboxTest0);
+    mask1            = _mm256_movemask_pd(vSampleBboxTest1);
+    mask2            = _mm256_movemask_pd(vSampleBboxTest2);
 }
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated
 /// when only rasterizing a single coverage test point
 template <>
-INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16,
-                                           int32_t &mask0, int32_t &mask1, int32_t &mask2)
+INLINE void UpdateEdgeMasks<SingleSampleT>(
+    const __m256d (&)[3], const __m256d* vEdgeFix16, int32_t& mask0, int32_t& mask1, int32_t& mask2)
 {
     mask0 = _mm256_movemask_pd(vEdgeFix16[0]);
     mask1 = _mm256_movemask_pd(vEdgeFix16[1]);
@@ -585,7 +621,7 @@
 //////////////////////////////////////////////////////////////////////////
 /// @struct ComputeScissorEdges
 /// @brief Primary template definition. Allows the function to be generically
-/// called. When paired with below specializations, will result in an empty 
+/// called. When paired with below specializations, will result in an empty
 /// inlined function if scissor is not enabled
 /// @tparam RasterScissorEdgesT: is scissor enabled?
 /// @tparam IsConservativeT: is conservative rast enabled?
@@ -593,21 +629,29 @@
 template <typename RasterScissorEdgesT, typename IsConservativeT, typename RT>
 struct ComputeScissorEdges
 {
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y, 
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7]){};
+    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
+                               const SWR_RECT& scissorBBox,
+                               const int32_t   x,
+                               const int32_t   y,
+                               EDGE (&rastEdges)[RT::NumEdgesT::value],
+                               __m256d (&vEdgeFix16)[7]){};
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial 
+/// @brief ComputeScissorEdges<std::true_type, std::true_type, RT> partial
 /// specialization. Instantiated when conservative rast and scissor are enabled
 template <typename RT>
 struct ComputeScissorEdges<std::true_type, std::true_type, RT>
 {
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors, 
+    /// @brief Intersect tri bbox with scissor, compute scissor edge vectors,
     /// evaluate edge equations and offset them away from pixel center.
-    INLINE ComputeScissorEdges(const SWR_RECT &triBBox, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    INLINE ComputeScissorEdges(const SWR_RECT& triBBox,
+                               const SWR_RECT& scissorBBox,
+                               const int32_t   x,
+                               const int32_t   y,
+                               EDGE (&rastEdges)[RT::NumEdgesT::value],
+                               __m256d (&vEdgeFix16)[7])
     {
         // if conservative rasterizing, triangle bbox intersected with scissor bbox is used
         SWR_RECT scissor;
@@ -627,12 +671,17 @@
         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
 
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
+                                       (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
+                                       (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
+                                       (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
+                                       (rastEdges[6].b * (y - scissor.ymin)));
 
-        // if conservative rasterizing, need to bump the scissor edges out by the conservative uncertainty distance, else do nothing
+        // if conservative rasterizing, need to bump the scissor edges out by the conservative
+        // uncertainty distance, else do nothing
         adjustScissorEdge<RT>(rastEdges[3].a, rastEdges[3].b, vEdgeFix16[3]);
         adjustScissorEdge<RT>(rastEdges[4].a, rastEdges[4].b, vEdgeFix16[4]);
         adjustScissorEdge<RT>(rastEdges[5].a, rastEdges[5].b, vEdgeFix16[5]);
@@ -645,7 +694,7 @@
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial 
+/// @brief ComputeScissorEdges<std::true_type, std::false_type, RT> partial
 /// specialization. Instantiated when scissor is enabled and conservative rast
 /// is disabled.
 template <typename RT>
@@ -653,14 +702,18 @@
 {
     //////////////////////////////////////////////////////////////////////////
     /// @brief Compute scissor edge vectors and evaluate edge equations
-    INLINE ComputeScissorEdges(const SWR_RECT &, const SWR_RECT &scissorBBox, const int32_t x, const int32_t y,
-                              EDGE (&rastEdges)[RT::NumEdgesT::value], __m256d (&vEdgeFix16)[7])
+    INLINE ComputeScissorEdges(const SWR_RECT&,
+                               const SWR_RECT& scissorBBox,
+                               const int32_t   x,
+                               const int32_t   y,
+                               EDGE (&rastEdges)[RT::NumEdgesT::value],
+                               __m256d (&vEdgeFix16)[7])
     {
-        const SWR_RECT &scissor = scissorBBox;
-        POS topLeft{scissor.xmin, scissor.ymin};
-        POS bottomLeft{scissor.xmin, scissor.ymax};
-        POS topRight{scissor.xmax, scissor.ymin};
-        POS bottomRight{scissor.xmax, scissor.ymax};
+        const SWR_RECT& scissor = scissorBBox;
+        POS             topLeft{scissor.xmin, scissor.ymin};
+        POS             bottomLeft{scissor.xmin, scissor.ymax};
+        POS             topRight{scissor.xmax, scissor.ymin};
+        POS             bottomRight{scissor.xmax, scissor.ymax};
 
         // construct 4 scissor edges in ccw direction
         ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]);
@@ -668,10 +721,14 @@
         ComputeEdgeData(bottomRight, topRight, rastEdges[5]);
         ComputeEdgeData(topRight, topLeft, rastEdges[6]);
 
-        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) + (rastEdges[3].b * (y - scissor.ymin)));
-        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) + (rastEdges[4].b * (y - scissor.ymax)));
-        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) + (rastEdges[5].b * (y - scissor.ymax)));
-        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) + (rastEdges[6].b * (y - scissor.ymin)));
+        vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.xmin)) +
+                                       (rastEdges[3].b * (y - scissor.ymin)));
+        vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.xmin)) +
+                                       (rastEdges[4].b * (y - scissor.ymax)));
+        vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.xmax)) +
+                                       (rastEdges[5].b * (y - scissor.ymax)));
+        vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.xmax)) +
+                                       (rastEdges[6].b * (y - scissor.ymin)));
 
         // Upper left rule for scissor
         vEdgeFix16[3] = _mm256_sub_pd(vEdgeFix16[3], _mm256_set1_pd(1.0));
@@ -723,7 +780,8 @@
 template <>
 INLINE bool TrivialRejectTest<AllEdgesValidT>(const int mask0, const int mask1, const int mask2)
 {
-    return (!(mask0 && mask1 && mask2)) ? true : false;;
+    return (!(mask0 && mask1 && mask2)) ? true : false;
+    ;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -737,7 +795,7 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Primary function template for TrivialAcceptTest. Always returns
-/// false, since it will only be called for degenerate tris, and as such 
+/// false, since it will only be called for degenerate tris, and as such
 /// will never cover the entire raster tile
 template <typename ScissorEnableT>
 INLINE bool TrivialAcceptTest(const int, const int, const int)
@@ -760,27 +818,33 @@
 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
 struct GenerateSVInnerCoverage
 {
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*,  uint64_t &){};
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, uint32_t, EDGE*, double*, uint64_t&){};
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Specialization of GenerateSVInnerCoverage where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated 
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated
 /// edge values from OuterConservative to InnerConservative and rasterizes.
 template <typename RT>
 struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT>
 {
-    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, uint32_t workerId, EDGE* pRastEdges, double* pStartQuadEdges,  uint64_t &innerCoverageMask)
+    INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC,
+                                   uint32_t      workerId,
+                                   EDGE*         pRastEdges,
+                                   double*       pStartQuadEdges,
+                                   uint64_t&     innerCoverageMask)
     {
         double startQuadEdgesAdj[RT::NumEdgesT::value];
-        for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
+        for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
         {
-            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
+            startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(
+                pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]);
         }
 
         // not trivial accept or reject, must rasterize full tile
         RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
-        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges);
+        innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
+            pDC, startQuadEdgesAdj, pRastEdges);
         RDTSC_END(BERasterizePartial, 0);
     }
 };
@@ -791,43 +855,62 @@
 template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT>
 struct UpdateEdgeMasksInnerConservative
 {
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*,
-                                           const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){};
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
+                                            const __m256d*,
+                                            const __m128i,
+                                            const __m128i,
+                                            int32_t&,
+                                            int32_t&,
+                                            int32_t&){};
 };
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges
-/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges 
-/// evaluated at raster tile corners to inner conservative position and 
+/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges
+/// evaluated at raster tile corners to inner conservative position and
 /// updates edge masks
 template <typename RT>
 struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT>
 {
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16,
-                                           const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2)
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3],
+                                            const __m256d* vEdgeFix16,
+                                            const __m128i  vAi,
+                                            const __m128i  vBi,
+                                            int32_t&       mask0,
+                                            int32_t&       mask1,
+                                            int32_t&       mask2)
     {
         __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]};
 
-        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer 
+        // instead of keeping 2 copies of evaluated edges around, just compensate for the outer
         // conservative evaluated edge when adjusting the edge in for inner conservative tests
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]);
-        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
+            vAi, vBi, vTempEdge[0]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
+            vAi, vBi, vTempEdge[1]);
+        adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(
+            vAi, vBi, vTempEdge[2]);
 
-        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
+        UpdateEdgeMasks<typename RT::NumCoverageSamplesT>(
+            vEdgeTileBbox, vTempEdge, mask0, mask1, mask2);
     }
 };
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage 
-/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot 
+/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage
+/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot
 /// cover an entire raster tile, set mask0 to 0 to force it down the
 /// rastierizePartialTile path
 template <typename RT, typename ValidEdgeMaskT>
 struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT>
 {
-    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*,
-                                   const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &)
+    INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3],
+                                            const __m256d*,
+                                            const __m128i,
+                                            const __m128i,
+                                            int32_t& mask0,
+                                            int32_t&,
+                                            int32_t&)
     {
         // set one mask to zero to force the triangle down the rastierizePartialTile path
         mask0 = 0;
@@ -837,7 +920,7 @@
 template <typename RT>
 void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc)
 {
-    const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
+    const TRIANGLE_WORK_DESC& workDesc = *((TRIANGLE_WORK_DESC*)pDesc);
 #if KNOB_ENABLE_TOSS_POINTS
     if (KNOB_TOSS_BIN_TRIS)
     {
@@ -847,24 +930,25 @@
     RDTSC_BEGIN(BERasterizeTriangle, pDC->drawId);
     RDTSC_BEGIN(BETriangleSetup, pDC->drawId);
 
-    const API_STATE &state = GetApiState(pDC);
-    const SWR_RASTSTATE &rastState = state.rastState;
+    const API_STATE&     state        = GetApiState(pDC);
+    const SWR_RASTSTATE& rastState    = state.rastState;
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
     OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
 
     __m128 vX, vY, vZ, vRecipW;
-    
+
     // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care
     // eg: vX = [x0 x1 x2 dc]
-    vX = _mm_load_ps(workDesc.pTriBuffer);
-    vY = _mm_load_ps(workDesc.pTriBuffer + 4);
-    vZ = _mm_load_ps(workDesc.pTriBuffer + 8);
+    vX      = _mm_load_ps(workDesc.pTriBuffer);
+    vY      = _mm_load_ps(workDesc.pTriBuffer + 4);
+    vZ      = _mm_load_ps(workDesc.pTriBuffer + 8);
     vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12);
 
     // convert to fixed point
-    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Rasterizer expects 16.8 fixed point precision");
+    static_assert(std::is_same<typename RT::PrecisionT, FixedPointTraits<Fixed_16_8>>::value,
+                  "Rasterizer expects 16.8 fixed point precision");
     __m128i vXi = fpToFixedPoint(vX);
     __m128i vYi = fpToFixedPoint(vY);
 
@@ -879,12 +963,12 @@
 
     __m128i vAi, vBi;
     triangleSetupABInt(vXi, vYi, vAi, vBi);
-    
+
     // determinant
     float det = calcDeterminantInt(vAi, vBi);
 
     // Verts in Pixel Coordinate Space at this point
-    // Det > 0 = CW winding order 
+    // Det > 0 = CW winding order
     // Convert CW triangles to CCW
     if (det > 0.0)
     {
@@ -899,9 +983,9 @@
     // Finish triangle setup - C edge coef
     triangleSetupC(vX, vY, vA, vB, vC);
 
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
     {
-        // If we have degenerate edge(s) to rasterize, set I and J coefs 
+        // If we have degenerate edge(s) to rasterize, set I and J coefs
         // to 0 for constant interpolation of attributes
         triDesc.I[0] = 0.0f;
         triDesc.I[1] = 0.0f;
@@ -915,7 +999,7 @@
     }
     else
     {
-        // only extract coefs for 2 of the barycentrics; the 3rd can be 
+        // only extract coefs for 2 of the barycentrics; the 3rd can be
         // determined from the barycentric equation:
         // i + j + k = 1 <=> k = 1 - j - i
         _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1);
@@ -926,7 +1010,7 @@
         _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
 
         // compute recipDet, used to calculate barycentric i and j in the backend
-        triDesc.recipDet = 1.0f/det;
+        triDesc.recipDet = 1.0f / det;
     }
 
     OSALIGNSIMD(float) oneOverW[4];
@@ -935,31 +1019,31 @@
     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
     triDesc.OneOverW[2] = oneOverW[2];
 
-    // calculate perspective correct coefs per vertex attrib 
-    float* pPerspAttribs = perspAttribsTLS;
-    float* pAttribs = workDesc.pAttribs;
+    // calculate perspective correct coefs per vertex attrib
+    float* pPerspAttribs  = perspAttribsTLS;
+    float* pAttribs       = workDesc.pAttribs;
     triDesc.pPerspAttribs = pPerspAttribs;
-    triDesc.pAttribs = pAttribs;
-    float *pRecipW = workDesc.pTriBuffer + 12;
-    triDesc.pRecipW = pRecipW;
-    __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW);
-    __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1);
-    __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1);
-    for(uint32_t i = 0; i < workDesc.numAttribs; i++)
+    triDesc.pAttribs      = pAttribs;
+    float* pRecipW        = workDesc.pTriBuffer + 12;
+    triDesc.pRecipW       = pRecipW;
+    __m128 vOneOverWV0    = _mm_broadcast_ss(pRecipW);
+    __m128 vOneOverWV1    = _mm_broadcast_ss(pRecipW += 1);
+    __m128 vOneOverWV2    = _mm_broadcast_ss(pRecipW += 1);
+    for (uint32_t i = 0; i < workDesc.numAttribs; i++)
     {
         __m128 attribA = _mm_load_ps(pAttribs);
-        __m128 attribB = _mm_load_ps(pAttribs+=4);
-        __m128 attribC = _mm_load_ps(pAttribs+=4);
-        pAttribs+=4;
+        __m128 attribB = _mm_load_ps(pAttribs += 4);
+        __m128 attribC = _mm_load_ps(pAttribs += 4);
+        pAttribs += 4;
 
         attribA = _mm_mul_ps(attribA, vOneOverWV0);
         attribB = _mm_mul_ps(attribB, vOneOverWV1);
         attribC = _mm_mul_ps(attribC, vOneOverWV2);
 
         _mm_store_ps(pPerspAttribs, attribA);
-        _mm_store_ps(pPerspAttribs+=4, attribB);
-        _mm_store_ps(pPerspAttribs+=4, attribC);
-        pPerspAttribs+=4;
+        _mm_store_ps(pPerspAttribs += 4, attribB);
+        _mm_store_ps(pPerspAttribs += 4, attribC);
+        pPerspAttribs += 4;
     }
 
     // compute bary Z
@@ -969,7 +1053,7 @@
     triDesc.Z[0] = a[0] - a[2];
     triDesc.Z[1] = a[1] - a[2];
     triDesc.Z[2] = a[2];
-        
+
     // add depth bias
     triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8);
 
@@ -977,12 +1061,17 @@
     OSALIGNSIMD(SWR_RECT) bbox;
     calcBoundingBoxInt(vXi, vYi, bbox);
 
-    const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
+    const SWR_RECT& scissorInFixedPoint =
+        state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];
 
-    if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
+    if (RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
     {
-        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
-        bbox.xmin--;    bbox.xmax++;    bbox.ymin--;    bbox.ymax++;
+        // If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is
+        // valid
+        bbox.xmin--;
+        bbox.xmax++;
+        bbox.ymin--;
+        bbox.ymax++;
         SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
                    "Conservative rast degenerate handling requires a valid scissor rect");
     }
@@ -996,12 +1085,13 @@
 
     triDesc.triFlags = workDesc.triFlags;
 
-    // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox
+    // further constrain backend to intersecting bounding box of macro tile and scissored triangle
+    // bbox
     uint32_t macroX, macroY;
     MacroTileMgr::getTileIndices(macroTile, macroX, macroY);
-    int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED;
-    int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
-    int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
+    int32_t macroBoxLeft   = macroX * KNOB_MACROTILE_X_DIM_FIXED;
+    int32_t macroBoxRight  = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1;
+    int32_t macroBoxTop    = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
     intersect.xmin = std::max(intersect.xmin, macroBoxLeft);
@@ -1009,19 +1099,21 @@
     intersect.xmax = std::min(intersect.xmax, macroBoxRight);
     intersect.ymax = std::min(intersect.ymax, macroBoxBottom);
 
-    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax && intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 && intersect.ymax >= 0);
+    SWR_ASSERT(intersect.xmin <= intersect.xmax && intersect.ymin <= intersect.ymax &&
+               intersect.xmin >= 0 && intersect.xmax >= 0 && intersect.ymin >= 0 &&
+               intersect.ymax >= 0);
 
     RDTSC_END(BETriangleSetup, 0);
 
     // update triangle desc
-    uint32_t minTileX = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t minTileY = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileX = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
-    uint32_t maxTileY = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t minTileX  = intersect.xmin >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t minTileY  = intersect.ymin >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileX  = intersect.xmax >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
+    uint32_t maxTileY  = intersect.ymax >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
     uint32_t numTilesX = maxTileX - minTileX + 1;
     uint32_t numTilesY = maxTileY - minTileY + 1;
 
-    if (numTilesX == 0 || numTilesY == 0) 
+    if (numTilesX == 0 || numTilesY == 0)
     {
         RDTSC_EVENT(BEEmptyTriangle, 1, 0);
         RDTSC_END(BERasterizeTriangle, 1);
@@ -1040,7 +1132,7 @@
 
     // single sample rasterization evaluates edges at pixel center,
     // multisample evaluates edges UL pixel corner and steps to each sample position
-    if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+    if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
     {
         // Add 0.5, in fixed point, to offset to pixel center
         x += (FIXED_POINT_SCALE / 2);
@@ -1051,7 +1143,7 @@
     __m128i vTopLeftY = _mm_set1_epi32(y);
 
     // evaluate edge equations at top-left pixel using 64bit math
-    // 
+    //
     // line = Ax + By + C
     // solving for C:
     // C = -Ax - By
@@ -1061,21 +1153,21 @@
     // line = Ax - By - Ax0 - By0
     // line = A(x - x0) + B(y - y0)
     // dX = (x-x0), dY = (y-y0)
-    // so all this simplifies to 
+    // so all this simplifies to
     // edge = A(dX) + B(dY), our first test at the top left of the bbox we're rasterizing within
 
     __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi);
     __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi);
 
     // evaluate A(dx) and B(dY) for all points
-    __m256d vAipd = _mm256_cvtepi32_pd(vAi);
-    __m256d vBipd = _mm256_cvtepi32_pd(vBi);
+    __m256d vAipd     = _mm256_cvtepi32_pd(vAi);
+    __m256d vBipd     = _mm256_cvtepi32_pd(vBi);
     __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX);
     __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY);
 
     __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd);
     __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd);
-    __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
+    __m256d vEdge          = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16);
 
     // apply any edge adjustments(top-left, crast, etc)
     adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge);
@@ -1098,8 +1190,8 @@
     ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]);
 
     // Compute and store triangle edge data if scissor needs to rasterized
-    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
-                       (bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
+    ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>(
+        bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
 
     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
     // used to for testing if entire raster tile is inside a triangle
@@ -1117,9 +1209,9 @@
     __m256d vEdgeTileBbox[3];
     if (NumCoverageSamplesT::value > 1)
     {
-        const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-        const __m128i vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
-        const __m128i vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
+        const SWR_MULTISAMPLE_POS& samplePos         = rastState.samplePositions;
+        const __m128i              vTileSampleBBoxXh = samplePos.TileSampleOffsetsX();
+        const __m128i              vTileSampleBBoxYh = samplePos.TileSampleOffsetsY();
 
         __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh);
         __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh);
@@ -1128,24 +1220,33 @@
         // used to for testing if entire raster tile is inside a triangle
         for (uint32_t e = 0; e < 3; ++e)
         {
-            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
-            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            __m256d vResultAxFix16 =
+                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 =
+                _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
             vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
 
             // adjust for msaa tile bbox edges outward for conservative rast, if enabled
-            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]);
+            adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(
+                vAi, vBi, vEdgeTileBbox[e]);
         }
     }
 
     RDTSC_END(BEStepSetup, 0);
 
-    uint32_t tY = minTileY;
-    uint32_t tX = minTileX;
+    uint32_t tY   = minTileY;
+    uint32_t tX   = minTileX;
     uint32_t maxY = maxTileY;
     uint32_t maxX = maxTileX;
 
     RenderOutputBuffers renderBuffers, currentRenderBufferRow;
-    GetRenderHotTiles<RT::MT::numSamples>(pDC, workerId, macroTile, minTileX, minTileY, renderBuffers, triDesc.triFlags.renderTargetArrayIndex);
+    GetRenderHotTiles<RT::MT::numSamples>(pDC,
+                                          workerId,
+                                          macroTile,
+                                          minTileX,
+                                          minTileY,
+                                          renderBuffers,
+                                          triDesc.triFlags.renderTargetArrayIndex);
     currentRenderBufferRow = renderBuffers;
 
     // rasterize and generate coverage masks per sample
@@ -1168,26 +1269,31 @@
             for (uint32_t sampleNum = 0; sampleNum < NumCoverageSamplesT::value; sampleNum++)
             {
                 // trivial reject, at least one edge has all 4 corners of raster tile outside
-                bool trivialReject = TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
+                bool trivialReject =
+                    TrivialRejectTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2);
 
                 if (!trivialReject)
                 {
                     // trivial accept mask
                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
 
-                    // Update the raster tile edge masks based on inner conservative edge offsets, if enabled
-                    UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>
-                        (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
+                    // Update the raster tile edge masks based on inner conservative edge offsets,
+                    // if enabled
+                    UpdateEdgeMasksInnerConservative<RT,
+                                                     typename RT::ValidEdgeMaskT,
+                                                     typename RT::InputCoverageT>(
+                        vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2);
 
                     // @todo Make this a bit smarter to allow use of trivial accept when:
                     //   1) scissor/vp intersection rect is raster tile aligned
                     //   2) raster tile is entirely within scissor/vp intersection rect
                     if (TrivialAcceptTest<typename RT::RasterizeScissorEdgesT>(mask0, mask1, mask2))
                     {
-                        // trivial accept, all 4 corners of all 3 edges are negative 
+                        // trivial accept, all 4 corners of all 3 edges are negative
                         // i.e. raster tile completely inside triangle
                         triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
-                        if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value)
+                        if (std::is_same<typename RT::InputCoverageT,
+                                         InnerConservativeCoverageT>::value)
                         {
                             triDesc.innerCoverageMask = 0xffffffffffffffffULL;
                         }
@@ -1196,9 +1302,10 @@
                     else
                     {
                         __m256d vEdgeAtSample[RT::NumEdgesT::value];
-                        if(std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
+                        if (std::is_same<NumCoverageSamplesT, SingleSampleT>::value)
                         {
-                            // should get optimized out for single sample case (global value numbering or copy propagation)
+                            // should get optimized out for single sample case (global value
+                            // numbering or copy propagation)
                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                             {
                                 vEdgeAtSample[e] = vEdgeFix16[e];
@@ -1206,23 +1313,25 @@
                         }
                         else
                         {
-                            const SWR_MULTISAMPLE_POS &samplePos = rastState.samplePositions;
-                            __m128i vSampleOffsetXh = samplePos.vXi(sampleNum);
-                            __m128i vSampleOffsetYh = samplePos.vYi(sampleNum);
+                            const SWR_MULTISAMPLE_POS& samplePos       = rastState.samplePositions;
+                            __m128i                    vSampleOffsetXh = samplePos.vXi(sampleNum);
+                            __m128i                    vSampleOffsetYh = samplePos.vYi(sampleNum);
                             __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh);
                             __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh);
 
                             // step edge equation tests from UL tile corner to pixel sample position
                             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                             {
-                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
-                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                __m256d vResultAxFix16 =
+                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 =
+                                    _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
                                 vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
                                 vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
                             }
                         }
 
-                        double startQuadEdges[RT::NumEdgesT::value];
+                        double        startQuadEdges[RT::NumEdgesT::value];
                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
                         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
                         {
@@ -1231,19 +1340,25 @@
 
                         // not trivial accept or reject, must rasterize full tile
                         RDTSC_BEGIN(BERasterizePartial, pDC->drawId);
-                        triDesc.coverageMask[sampleNum] = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdges, rastEdges);
+                        triDesc.coverageMask[sampleNum] =
+                            rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(
+                                pDC, startQuadEdges, rastEdges);
                         RDTSC_END(BERasterizePartial, 0);
 
-                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
-                        
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+
                         // Output SV InnerCoverage, if needed
-                        GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
+                        GenerateSVInnerCoverage<RT,
+                                                typename RT::ValidEdgeMaskT,
+                                                typename RT::InputCoverageT>(
+                            pDC, workerId, rastEdges, startQuadEdges, triDesc.innerCoverageMask);
                     }
                 }
                 else
                 {
-                    // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything
-                    if(NumCoverageSamplesT::value > 1)
+                    // if we're calculating coverage per sample, need to store it off. otherwise no
+                    // covered samples, don't need to do anything
+                    if (NumCoverageSamplesT::value > 1)
                     {
                         triDesc.coverageMask[sampleNum] = 0;
                     }
@@ -1252,19 +1367,22 @@
             }
 
 #if KNOB_ENABLE_TOSS_POINTS
-            if(KNOB_TOSS_RS)
+            if (KNOB_TOSS_RS)
             {
                 gToss = triDesc.coverageMask[0];
             }
             else
 #endif
-            if(triDesc.anyCoveredSamples)
+                if (triDesc.anyCoveredSamples)
             {
-                // if conservative rast and MSAA are enabled, conservative coverage for a pixel means all samples in that pixel are covered
-                // copy conservative coverage result to all samples
-                if(RT::IsConservativeT::value)
+                // if conservative rast and MSAA are enabled, conservative coverage for a pixel
+                // means all samples in that pixel are covered; copy the conservative coverage
+                // result to all samples
+                if (RT::IsConservativeT::value)
                 {
-                    auto copyCoverage = [&](int sample){triDesc.coverageMask[sample] = triDesc.coverageMask[0]; };
+                    auto copyCoverage = [&](int sample) {
+                        triDesc.coverageMask[sample] = triDesc.coverageMask[0];
+                    };
                     UnrollerL<1, RT::MT::numSamples, 1>::step(copyCoverage);
                 }
 
@@ -1272,14 +1390,20 @@
                 AR_EVENT(RasterTileCount(pDC->drawId, 1));
 
                 RDTSC_BEGIN(BEPixelBackend, pDC->drawId);
-                backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
+                backendFuncs.pfnBackend(pDC,
+                                        workerId,
+                                        tileX << KNOB_TILE_X_DIM_SHIFT,
+                                        tileY << KNOB_TILE_Y_DIM_SHIFT,
+                                        triDesc,
+                                        renderBuffers);
                 RDTSC_END(BEPixelBackend, 0);
             }
 
             // step to the next tile in X
             for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
             {
-                vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
+                vEdgeFix16[e] =
+                    _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX));
             }
             StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers);
         }
@@ -1287,7 +1411,8 @@
         // step to the next tile in Y
         for (uint32_t e = 0; e < RT::NumEdgesT::value; ++e)
         {
-            vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
+            vEdgeFix16[e] =
+                _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY));
         }
         StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow);
     }
@@ -1297,10 +1422,16 @@
 
 // Get pointers to hot tile memory for color RT, depth, stencil
 template <uint32_t numSamples>
-void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex)
+void GetRenderHotTiles(DRAW_CONTEXT*        pDC,
+                       uint32_t             workerId,
+                       uint32_t             macroID,
+                       uint32_t             tileX,
+                       uint32_t             tileY,
+                       RenderOutputBuffers& renderBuffers,
+                       uint32_t             renderTargetArrayIndex)
 {
-    const API_STATE& state = GetApiState(pDC);
-    SWR_CONTEXT *pContext = pDC->pContext;
+    const API_STATE& state    = GetApiState(pDC);
+    SWR_CONTEXT*     pContext = pDC->pContext;
     HANDLE hWorkerPrivateData = pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     uint32_t mx, my;
@@ -1310,46 +1441,73 @@
 
     // compute tile offset for active hottile buffers
     const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
-    uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-    offset*=numSamples;
+    uint32_t       offset = ComputeTileOffset2D<
+        TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp>>(
+        pitch, tileX, tileY);
+    offset *= numSamples;
 
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
-    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
+    unsigned long rtSlot                 = 0;
+    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
+    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
     {
-        HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, 
-            numSamples, renderTargetArrayIndex);
-        pColor->state = HOTTILE_DIRTY;
+        HOTTILE* pColor = pContext->pHotTileMgr->GetHotTile(
+            pContext,
+            pDC,
+            hWorkerPrivateData,
+            macroID,
+            (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
+            true,
+            numSamples,
+            renderTargetArrayIndex);
+        pColor->state                = HOTTILE_DIRTY;
         renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset;
-        
+
         colorHottileEnableMask &= ~(1 << rtSlot);
     }
-    if(state.depthHottileEnable)
+    if (state.depthHottileEnable)
     {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true,
-            numSamples, renderTargetArrayIndex);
-        pDepth->state = HOTTILE_DIRTY;
+        const uint32_t pitch =
+            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<
+            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp>>(
+            pitch, tileX, tileY);
+        offset *= numSamples;
+        HOTTILE* pDepth = pContext->pHotTileMgr->GetHotTile(pContext,
+                                                            pDC,
+                                                            hWorkerPrivateData,
+                                                            macroID,
+                                                            SWR_ATTACHMENT_DEPTH,
+                                                            true,
+                                                            numSamples,
+                                                            renderTargetArrayIndex);
+        pDepth->state   = HOTTILE_DIRTY;
         SWR_ASSERT(pDepth->pBuffer != nullptr);
         renderBuffers.pDepth = pDepth->pBuffer + offset;
     }
-    if(state.stencilHottileEnable)
+    if (state.stencilHottileEnable)
     {
-        const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
-        uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY);
-        offset*=numSamples;
-        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true,
-            numSamples, renderTargetArrayIndex);
-        pStencil->state = HOTTILE_DIRTY;
+        const uint32_t pitch =
+            KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        uint32_t offset = ComputeTileOffset2D<
+            TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp>>(
+            pitch, tileX, tileY);
+        offset *= numSamples;
+        HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext,
+                                                              pDC,
+                                                              hWorkerPrivateData,
+                                                              macroID,
+                                                              SWR_ATTACHMENT_STENCIL,
+                                                              true,
+                                                              numSamples,
+                                                              renderTargetArrayIndex);
+        pStencil->state   = HOTTILE_DIRTY;
         SWR_ASSERT(pStencil->pBuffer != nullptr);
         renderBuffers.pStencil = pStencil->pBuffer + offset;
     }
 }
 
 template <typename RT>
-INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers)
+INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers& buffers)
 {
     DWORD rt = 0;
     while (_BitScanForward(&rt, colorHotTileMask))
@@ -1357,13 +1515,15 @@
         colorHotTileMask &= ~(1 << rt);
         buffers.pColor[rt] += RT::colorRasterTileStep;
     }
-    
+
     buffers.pDepth += RT::depthRasterTileStep;
     buffers.pStencil += RT::stencilRasterTileStep;
 }
 
 template <typename RT>
-INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow)
+INLINE void StepRasterTileY(uint32_t             colorHotTileMask,
+                            RenderOutputBuffers& buffers,
+                            RenderOutputBuffers& startBufferRow)
 {
     DWORD rt = 0;
     while (_BitScanForward(&rt, colorHotTileMask))
@@ -1378,4 +1538,3 @@
     startBufferRow.pStencil += RT::stencilRasterTileRowStep;
     buffers.pStencil = startBufferRow.pStencil;
 }
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
index f1355dd..e858a7d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp
@@ -1,97 +1,100 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #include "rdtsc_core.h"
 #include "common/rdtsc_buckets.h"
 
 // must match CORE_BUCKETS enum order
 BUCKET_DESC gCoreBuckets[] = {
-    { "APIClearRenderTarget", "", true, 0xff0b8bea },
-    { "APIDraw", "", true, 0xff000066 },
-    { "APIDrawWakeAllThreads", "", false, 0xffffffff },
-    { "APIDrawIndexed", "", true, 0xff000066 },
-    { "APIDispatch", "", true, 0xff660000 },
-    { "APIStoreTiles", "", true, 0xff00ffff },
-    { "APIGetDrawContext", "", false, 0xffffffff },
-    { "APISync", "", true, 0xff6666ff },
-    { "APIWaitForIdle", "", true, 0xff0000ff },
-    { "FEProcessDraw", "", true, 0xff009900 },
-    { "FEProcessDrawIndexed", "", true, 0xff009900 },
-    { "FEFetchShader", "", false, 0xffffffff },
-    { "FEVertexShader", "", false, 0xffffffff },
-    { "FEHullShader", "", false, 0xffffffff },
-    { "FETessellation", "", false, 0xffffffff },
-    { "FEDomainShader", "", false, 0xffffffff },
-    { "FEGeometryShader", "", false, 0xffffffff },
-    { "FEStreamout", "", false, 0xffffffff },
-    { "FEPAAssemble", "", false, 0xffffffff },
-    { "FEBinPoints", "", false, 0xff29b854 },
-    { "FEBinLines", "", false, 0xff29b854 },
-    { "FEBinTriangles", "", false, 0xff29b854 },
-    { "FETriangleSetup", "", false, 0xffffffff },
-    { "FEViewportCull", "", false, 0xffffffff },
-    { "FEGuardbandClip", "", false, 0xffffffff },
-    { "FEClipPoints", "", false, 0xffffffff },
-    { "FEClipLines", "", false, 0xffffffff },
-    { "FEClipTriangles", "", false, 0xffffffff },
-    { "FECullZeroAreaAndBackface", "", false, 0xffffffff },
-    { "FECullBetweenCenters", "", false, 0xffffffff },
-    { "FEEarlyRastEnter", "", false, 0xffffffff },
-    { "FEEarlyRastExit", "", false, 0xffffffff },
-    { "FEProcessStoreTiles", "", true, 0xff39c864 },
-    { "FEProcessInvalidateTiles", "", true, 0xffffffff },
-    { "WorkerWorkOnFifoBE", "", false, 0xff40261c },
-    { "WorkerFoundWork", "", false, 0xff573326 },
-    { "BELoadTiles", "", true, 0xffb0e2ff },
-    { "BEDispatch", "", true, 0xff00a2ff },
-    { "BEClear", "", true, 0xff00ccbb },
-    { "BERasterizeLine", "", true, 0xffb26a4e },
-    { "BERasterizeTriangle", "", true, 0xffb26a4e },
-    { "BETriangleSetup", "", false, 0xffffffff },
-    { "BEStepSetup", "", false, 0xffffffff },
-    { "BECullZeroArea", "", false, 0xffffffff },
-    { "BEEmptyTriangle", "", false, 0xffffffff },
-    { "BETrivialAccept", "", false, 0xffffffff },
-    { "BETrivialReject", "", false, 0xffffffff },
-    { "BERasterizePartial", "", false, 0xffffffff },
-    { "BEPixelBackend", "", false, 0xffffffff },
-    { "BESetup", "", false, 0xffffffff },
-    { "BEBarycentric", "", false, 0xffffffff },
-    { "BEEarlyDepthTest", "", false, 0xffffffff },
-    { "BEPixelShader", "", false, 0xffffffff },
-    { "BESingleSampleBackend", "", false, 0xffffffff },
-    { "BEPixelRateBackend", "", false, 0xffffffff },
-    { "BESampleRateBackend", "", false, 0xffffffff },
-    { "BENullBackend", "", false, 0xffffffff },
-    { "BELateDepthTest", "", false, 0xffffffff },
-    { "BEOutputMerger", "", false, 0xffffffff },
-    { "BEStoreTiles", "", true, 0xff00cccc },
-    { "BEEndTile", "", false, 0xffffffff },
+    {"APIClearRenderTarget", "", true, 0xff0b8bea},
+    {"APIDraw", "", true, 0xff000066},
+    {"APIDrawWakeAllThreads", "", false, 0xffffffff},
+    {"APIDrawIndexed", "", true, 0xff000066},
+    {"APIDispatch", "", true, 0xff660000},
+    {"APIStoreTiles", "", true, 0xff00ffff},
+    {"APIGetDrawContext", "", false, 0xffffffff},
+    {"APISync", "", true, 0xff6666ff},
+    {"APIWaitForIdle", "", true, 0xff0000ff},
+    {"FEProcessDraw", "", true, 0xff009900},
+    {"FEProcessDrawIndexed", "", true, 0xff009900},
+    {"FEFetchShader", "", false, 0xffffffff},
+    {"FEVertexShader", "", false, 0xffffffff},
+    {"FEHullShader", "", false, 0xffffffff},
+    {"FETessellation", "", false, 0xffffffff},
+    {"FEDomainShader", "", false, 0xffffffff},
+    {"FEGeometryShader", "", false, 0xffffffff},
+    {"FEStreamout", "", false, 0xffffffff},
+    {"FEPAAssemble", "", false, 0xffffffff},
+    {"FEBinPoints", "", false, 0xff29b854},
+    {"FEBinLines", "", false, 0xff29b854},
+    {"FEBinTriangles", "", false, 0xff29b854},
+    {"FETriangleSetup", "", false, 0xffffffff},
+    {"FEViewportCull", "", false, 0xffffffff},
+    {"FEGuardbandClip", "", false, 0xffffffff},
+    {"FEClipPoints", "", false, 0xffffffff},
+    {"FEClipLines", "", false, 0xffffffff},
+    {"FEClipTriangles", "", false, 0xffffffff},
+    {"FEClipRectangles", "", false, 0xffffffff},
+    {"FECullZeroAreaAndBackface", "", false, 0xffffffff},
+    {"FECullBetweenCenters", "", false, 0xffffffff},
+    {"FEEarlyRastEnter", "", false, 0xffffffff},
+    {"FEEarlyRastExit", "", false, 0xffffffff},
+    {"FEProcessStoreTiles", "", true, 0xff39c864},
+    {"FEProcessInvalidateTiles", "", true, 0xffffffff},
+    {"WorkerWorkOnFifoBE", "", false, 0xff40261c},
+    {"WorkerFoundWork", "", false, 0xff573326},
+    {"BELoadTiles", "", true, 0xffb0e2ff},
+    {"BEDispatch", "", true, 0xff00a2ff},
+    {"BEClear", "", true, 0xff00ccbb},
+    {"BERasterizeLine", "", true, 0xffb26a4e},
+    {"BERasterizeTriangle", "", true, 0xffb26a4e},
+    {"BETriangleSetup", "", false, 0xffffffff},
+    {"BEStepSetup", "", false, 0xffffffff},
+    {"BECullZeroArea", "", false, 0xffffffff},
+    {"BEEmptyTriangle", "", false, 0xffffffff},
+    {"BETrivialAccept", "", false, 0xffffffff},
+    {"BETrivialReject", "", false, 0xffffffff},
+    {"BERasterizePartial", "", false, 0xffffffff},
+    {"BEPixelBackend", "", false, 0xffffffff},
+    {"BESetup", "", false, 0xffffffff},
+    {"BEBarycentric", "", false, 0xffffffff},
+    {"BEEarlyDepthTest", "", false, 0xffffffff},
+    {"BEPixelShader", "", false, 0xffffffff},
+    {"BESingleSampleBackend", "", false, 0xffffffff},
+    {"BEPixelRateBackend", "", false, 0xffffffff},
+    {"BESampleRateBackend", "", false, 0xffffffff},
+    {"BENullBackend", "", false, 0xffffffff},
+    {"BELateDepthTest", "", false, 0xffffffff},
+    {"BEOutputMerger", "", false, 0xffffffff},
+    {"BEStoreTiles", "", true, 0xff00cccc},
+    {"BEEndTile", "", false, 0xffffffff},
 };
+static_assert(NumBuckets == (sizeof(gCoreBuckets) / sizeof(gCoreBuckets[0])),
+              "RDTSC Bucket enum and description table size mismatched.");
 
 /// @todo bucketmanager and mapping should probably be a part of the SWR context
 std::vector<uint32_t> gBucketMap;
-BucketManager gBucketMgr;
+BucketManager         gBucketMgr;
 
-uint32_t gCurrentFrame = 0;
-bool gBucketsInitialized = false;
+uint32_t gCurrentFrame       = 0;
+bool     gBucketsInitialized = false;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
index 5ee8dec..dc20e5b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
+++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #pragma once
 #include "knobs.h"
@@ -29,6 +29,9 @@
 
 #include <vector>
 
+///////////////////////////////////////////////////////////////////////////////
+// NOTE:  This enum MUST be kept in sync with gCoreBuckets in rdtsc_core.cpp
+///////////////////////////////////////////////////////////////////////////////
 enum CORE_BUCKETS
 {
     APIClearRenderTarget,
@@ -59,6 +62,7 @@
     FEClipPoints,
     FEClipLines,
     FEClipTriangles,
+    FEClipRectangles,
     FECullZeroAreaAndBackface,
     FECullBetweenCenters,
     FEEarlyRastEnter,
@@ -120,10 +124,10 @@
 #endif
 
 extern std::vector<uint32_t> gBucketMap;
-extern BucketManager gBucketMgr;
-extern BUCKET_DESC gCoreBuckets[];
-extern uint32_t gCurrentFrame;
-extern bool gBucketsInitialized;
+extern BucketManager         gBucketMgr;
+extern BUCKET_DESC           gCoreBuckets[];
+extern uint32_t              gCurrentFrame;
+extern bool                  gBucketsInitialized;
 
 INLINE void rdtscReset()
 {
@@ -170,12 +174,14 @@
 {
     gCurrentFrame++;
 
-    if (gCurrentFrame == KNOB_BUCKETS_START_FRAME && KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
+    if (gCurrentFrame == KNOB_BUCKETS_START_FRAME &&
+        KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
     {
         gBucketMgr.StartCapture();
     }
 
-    if (gCurrentFrame == KNOB_BUCKETS_END_FRAME && KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
+    if (gCurrentFrame == KNOB_BUCKETS_END_FRAME &&
+        KNOB_BUCKETS_START_FRAME < KNOB_BUCKETS_END_FRAME)
     {
         gBucketMgr.StopCapture();
         gBucketMgr.PrintReport("rdtsc.txt");
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
index f1bef21..133420e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -1,56 +1,52 @@
 /****************************************************************************
-* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.h
-*
-* @brief RingBuffer
-*        The RingBuffer class manages all aspects of the ring buffer including
-*        the head/tail indices, etc.
-*
-******************************************************************************/
+ * Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file ringbuffer.h
+ *
+ * @brief RingBuffer
+ *        The RingBuffer class manages all aspects of the ring buffer including
+ *        the head/tail indices, etc.
+ *
+ ******************************************************************************/
 #pragma once
 
-template<typename T>
+template <typename T>
 class RingBuffer
 {
 public:
-    RingBuffer()
-        : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
-    {
-    }
+    RingBuffer() : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) {}
 
-    ~RingBuffer()
-    {
-        Destroy();
-    }
+    ~RingBuffer() { Destroy(); }
 
     void Init(uint32_t numEntries)
     {
         SWR_ASSERT(numEntries > 0);
-        SWR_ASSERT(((1ULL << 32) % numEntries) == 0, "%d is not evenly divisible into 2 ^ 32.  Wrap errors will occur!", numEntries);
-        mNumEntries = numEntries;
-        mpRingBuffer = (T*)AlignedMalloc(sizeof(T)*numEntries, 64);
+        SWR_ASSERT(((1ULL << 32) % numEntries) == 0,
+                   "%d is not evenly divisible into 2 ^ 32.  Wrap errors will occur!",
+                   numEntries);
+        mNumEntries  = numEntries;
+        mpRingBuffer = (T*)AlignedMalloc(sizeof(T) * numEntries, 64);
         SWR_ASSERT(mpRingBuffer != nullptr);
-        memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+        memset(mpRingBuffer, 0, sizeof(T) * numEntries);
     }
 
     void Destroy()
@@ -77,10 +73,7 @@
         InterlockedIncrement(&mRingTail); // There are multiple consumers.
     }
 
-    INLINE bool IsEmpty()
-    {
-        return (GetHead() == GetTail());
-    }
+    INLINE bool IsEmpty() { return (GetHead() == GetTail()); }
 
     INLINE bool IsFull()
     {
@@ -94,9 +87,9 @@
     INLINE uint32_t GetHead() volatile { return mRingHead; }
 
 protected:
-    T* mpRingBuffer;
+    T*       mpRingBuffer;
     uint32_t mNumEntries;
 
-    OSALIGNLINE(volatile uint32_t) mRingHead;  // Consumer Counter
-    OSALIGNLINE(volatile uint32_t) mRingTail;  // Producer Counter
+    OSALIGNLINE(volatile uint32_t) mRingHead; // Consumer Counter
+    OSALIGNLINE(volatile uint32_t) mRingTail; // Producer Counter
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 217cf44..0b42a45 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -1,100 +1,103 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file state.h
-*
-* @brief Definitions for API state.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file state.h
+ *
+ * @brief Definitions for API state.
+ *
+ ******************************************************************************/
+// Skipping clang-format due to parsing by simplistic python scripts
+// clang-format off
 #pragma once
 
 #include "common/formats.h"
 #include "common/intrin.h"
-using gfxptr_t = unsigned long long;
 #include <functional>
 #include <algorithm>
 
+using gfxptr_t = unsigned long long;
+
 //////////////////////////////////////////////////////////////////////////
 /// PRIMITIVE_TOPOLOGY.
 //////////////////////////////////////////////////////////////////////////
 enum PRIMITIVE_TOPOLOGY
 {
-    TOP_UNKNOWN = 0x0,
-    TOP_POINT_LIST = 0x1,
-    TOP_LINE_LIST = 0x2,
-    TOP_LINE_STRIP = 0x3,
-    TOP_TRIANGLE_LIST = 0x4,
-    TOP_TRIANGLE_STRIP = 0x5,
-    TOP_TRIANGLE_FAN = 0x6,
-    TOP_QUAD_LIST = 0x7,
-    TOP_QUAD_STRIP = 0x8,
-    TOP_LINE_LIST_ADJ = 0x9,
-    TOP_LISTSTRIP_ADJ = 0xA,
-    TOP_TRI_LIST_ADJ = 0xB,
-    TOP_TRI_STRIP_ADJ = 0xC,
-    TOP_TRI_STRIP_REVERSE = 0xD,
-    TOP_POLYGON = 0xE,
-    TOP_RECT_LIST = 0xF,
-    TOP_LINE_LOOP = 0x10,
-    TOP_POINT_LIST_BF = 0x11,
-    TOP_LINE_STRIP_CONT = 0x12,
-    TOP_LINE_STRIP_BF = 0x13,
-    TOP_LINE_STRIP_CONT_BF = 0x14,
+    TOP_UNKNOWN                = 0x0,
+    TOP_POINT_LIST             = 0x1,
+    TOP_LINE_LIST              = 0x2,
+    TOP_LINE_STRIP             = 0x3,
+    TOP_TRIANGLE_LIST          = 0x4,
+    TOP_TRIANGLE_STRIP         = 0x5,
+    TOP_TRIANGLE_FAN           = 0x6,
+    TOP_QUAD_LIST              = 0x7,
+    TOP_QUAD_STRIP             = 0x8,
+    TOP_LINE_LIST_ADJ          = 0x9,
+    TOP_LISTSTRIP_ADJ          = 0xA,
+    TOP_TRI_LIST_ADJ           = 0xB,
+    TOP_TRI_STRIP_ADJ          = 0xC,
+    TOP_TRI_STRIP_REVERSE      = 0xD,
+    TOP_POLYGON                = 0xE,
+    TOP_RECT_LIST              = 0xF,
+    TOP_LINE_LOOP              = 0x10,
+    TOP_POINT_LIST_BF          = 0x11,
+    TOP_LINE_STRIP_CONT        = 0x12,
+    TOP_LINE_STRIP_BF          = 0x13,
+    TOP_LINE_STRIP_CONT_BF     = 0x14,
     TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16,
-    TOP_TRIANGLE_DISC = 0x17,   /// @todo What is this??
+    TOP_TRIANGLE_DISC          = 0x17, /// @todo What is this??
 
-    TOP_PATCHLIST_BASE = 0x1F,  // Invalid topology, used to calculate num verts for a patchlist.
-    TOP_PATCHLIST_1 = 0x20,     // List of 1-vertex patches
-    TOP_PATCHLIST_2 = 0x21,
-    TOP_PATCHLIST_3 = 0x22,
-    TOP_PATCHLIST_4 = 0x23,
-    TOP_PATCHLIST_5 = 0x24,
-    TOP_PATCHLIST_6 = 0x25,
-    TOP_PATCHLIST_7 = 0x26,
-    TOP_PATCHLIST_8 = 0x27,
-    TOP_PATCHLIST_9 = 0x28,
-    TOP_PATCHLIST_10 = 0x29,
-    TOP_PATCHLIST_11 = 0x2A,
-    TOP_PATCHLIST_12 = 0x2B,
-    TOP_PATCHLIST_13 = 0x2C,
-    TOP_PATCHLIST_14 = 0x2D,
-    TOP_PATCHLIST_15 = 0x2E,
-    TOP_PATCHLIST_16 = 0x2F,
-    TOP_PATCHLIST_17 = 0x30,
-    TOP_PATCHLIST_18 = 0x31,
-    TOP_PATCHLIST_19 = 0x32,
-    TOP_PATCHLIST_20 = 0x33,
-    TOP_PATCHLIST_21 = 0x34,
-    TOP_PATCHLIST_22 = 0x35,
-    TOP_PATCHLIST_23 = 0x36,
-    TOP_PATCHLIST_24 = 0x37,
-    TOP_PATCHLIST_25 = 0x38,
-    TOP_PATCHLIST_26 = 0x39,
-    TOP_PATCHLIST_27 = 0x3A,
-    TOP_PATCHLIST_28 = 0x3B,
-    TOP_PATCHLIST_29 = 0x3C,
-    TOP_PATCHLIST_30 = 0x3D,
-    TOP_PATCHLIST_31 = 0x3E,
-    TOP_PATCHLIST_32 = 0x3F,   // List of 32-vertex patches
+    TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist.
+    TOP_PATCHLIST_1    = 0x20, // List of 1-vertex patches
+    TOP_PATCHLIST_2    = 0x21,
+    TOP_PATCHLIST_3    = 0x22,
+    TOP_PATCHLIST_4    = 0x23,
+    TOP_PATCHLIST_5    = 0x24,
+    TOP_PATCHLIST_6    = 0x25,
+    TOP_PATCHLIST_7    = 0x26,
+    TOP_PATCHLIST_8    = 0x27,
+    TOP_PATCHLIST_9    = 0x28,
+    TOP_PATCHLIST_10   = 0x29,
+    TOP_PATCHLIST_11   = 0x2A,
+    TOP_PATCHLIST_12   = 0x2B,
+    TOP_PATCHLIST_13   = 0x2C,
+    TOP_PATCHLIST_14   = 0x2D,
+    TOP_PATCHLIST_15   = 0x2E,
+    TOP_PATCHLIST_16   = 0x2F,
+    TOP_PATCHLIST_17   = 0x30,
+    TOP_PATCHLIST_18   = 0x31,
+    TOP_PATCHLIST_19   = 0x32,
+    TOP_PATCHLIST_20   = 0x33,
+    TOP_PATCHLIST_21   = 0x34,
+    TOP_PATCHLIST_22   = 0x35,
+    TOP_PATCHLIST_23   = 0x36,
+    TOP_PATCHLIST_24   = 0x37,
+    TOP_PATCHLIST_25   = 0x38,
+    TOP_PATCHLIST_26   = 0x39,
+    TOP_PATCHLIST_27   = 0x3A,
+    TOP_PATCHLIST_28   = 0x3B,
+    TOP_PATCHLIST_29   = 0x3C,
+    TOP_PATCHLIST_30   = 0x3D,
+    TOP_PATCHLIST_31   = 0x3E,
+    TOP_PATCHLIST_32   = 0x3F, // List of 32-vertex patches
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -172,7 +175,6 @@
     SWR_NUM_OUTER_TESS_FACTORS,
 };
 
-
 /////////////////////////////////////////////////////////////////////////
 /// simdvertex
 /// @brief Defines a vertex element that holds all the data for SIMD vertices.
@@ -181,9 +183,9 @@
 enum SWR_VTX_SLOTS
 {
     VERTEX_SGV_SLOT                 = 0,
-        VERTEX_SGV_RTAI_COMP        = 0,
-        VERTEX_SGV_VAI_COMP         = 1,
-        VERTEX_SGV_POINT_SIZE_COMP  = 2,
+    VERTEX_SGV_RTAI_COMP            = 0,
+    VERTEX_SGV_VAI_COMP             = 1,
+    VERTEX_SGV_POINT_SIZE_COMP      = 2,
     VERTEX_POSITION_SLOT            = 1,
     VERTEX_POSITION_END_SLOT        = 1,
     VERTEX_CLIPCULL_DIST_LO_SLOT    = (1 + VERTEX_POSITION_END_SLOT), // VS writes lower 4 clip/cull dist
@@ -196,21 +198,21 @@
 // SoAoSoA
 struct simdvertex
 {
-    simdvector      attrib[SWR_VTX_NUM_SLOTS];
+    simdvector attrib[SWR_VTX_NUM_SLOTS];
 };
 
 #if ENABLE_AVX512_SIMD16
 struct simd16vertex
 {
-    simd16vector    attrib[SWR_VTX_NUM_SLOTS];
+    simd16vector attrib[SWR_VTX_NUM_SLOTS];
 };
 
 #endif
 
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct SIMDVERTEX_T
 {
-    typename SIMD_T::Vec4               attrib[SWR_VTX_NUM_SLOTS];
+    typename SIMD_T::Vec4 attrib[SWR_VTX_NUM_SLOTS];
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -228,19 +230,20 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_VS_CONTEXT
 {
-    simdvertex* pVin;           // IN: SIMD input vertex data store
-    simdvertex* pVout;          // OUT: SIMD output vertex data store
+    simdvertex* pVin;  // IN: SIMD input vertex data store
+    simdvertex* pVout; // OUT: SIMD output vertex data store
 
-    uint32_t InstanceID;        // IN: Instance ID, constant across all verts of the SIMD
-    simdscalari VertexID;       // IN: Vertex ID
-    simdscalari mask;           // IN: Active mask for shader
+    uint32_t    InstanceID; // IN: Instance ID, constant across all verts of the SIMD
+    simdscalari VertexID;   // IN: Vertex ID
+    simdscalari mask;       // IN: Active mask for shader
 
     // SIMD16 Frontend fields.
-    uint32_t AlternateOffset;   // IN: amount to offset for interleaving even/odd simd8 in simd16vertex output
-    simd16scalari mask16;       // IN: Active mask for shader (16-wide)
-    simd16scalari VertexID16;   // IN: Vertex ID (16-wide)
+    uint32_t AlternateOffset; // IN: amount to offset for interleaving even/odd simd8 in
+                              // simd16vertex output
+    simd16scalari mask16;     // IN: Active mask for shader (16-wide)
+    simd16scalari VertexID16; // IN: Vertex ID (16-wide)
 
-    SWR_SHADER_STATS stats;     // OUT: shader statistics used for archrast.
+    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 /////////////////////////////////////////////////////////////////////////
@@ -267,16 +270,16 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_TESSELLATION_FACTORS
 {
-    float  OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS];
-    float  InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS];
+    float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS];
+    float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS];
 };
 
 #define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches
 struct ScalarPatch
 {
     SWR_TESSELLATION_FACTORS tessFactors;
-    ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM];
-    ScalarCPoint patchData;
+    ScalarCPoint             cp[MAX_NUM_VERTS_PER_PRIM];
+    ScalarCPoint             patchData;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -285,12 +288,11 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_HS_CONTEXT
 {
-    simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data
-    simdscalari PrimitiveID;    // IN: (SIMD) primitive ID generated from the draw call
-    simdscalari mask;           // IN: Active mask for shader
-    ScalarPatch* pCPout;        // OUT: Output control point patch
-                                // SIMD-sized-array of SCALAR patches
-    SWR_SHADER_STATS stats;     // OUT: shader statistics used for archrast.
+    simdvertex       vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data
+    simdscalari      PrimitiveID;                  // IN: (SIMD) primitive ID generated from the draw call
+    simdscalari      mask;                         // IN: Active mask for shader
+    ScalarPatch*     pCPout;                       // OUT: Output control point patch; SIMD-sized array of SCALAR patches
+    SWR_SHADER_STATS stats;                        // OUT: shader statistics used for archrast.
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -317,13 +319,13 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_GS_CONTEXT
 {
-    simdvector* pVerts;                 // IN: input primitive data for SIMD prims
-    uint32_t inputVertStride;           // IN: input vertex stride, in attributes
-    simdscalari PrimitiveID;            // IN: input primitive ID generated from the draw call
-    uint32_t InstanceID;                // IN: input instance ID
-    simdscalari mask;                   // IN: Active mask for shader
-    uint8_t* pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)
-    SWR_SHADER_STATS stats;             // OUT: shader statistics used for archrast.
+    simdvector* pVerts;                    // IN: input primitive data for SIMD prims
+    uint32_t    inputVertStride;           // IN: input vertex stride, in attributes
+    simdscalari PrimitiveID;               // IN: input primitive ID generated from the draw call
+    uint32_t    InstanceID;                // IN: input instance ID
+    simdscalari mask;                      // IN: Active mask for shader
+    uint8_t*    pStreams[KNOB_SIMD_WIDTH]; // OUT: output stream (contains vertices for all output streams)
+    SWR_SHADER_STATS stats;                // OUT: shader statistics used for archrast.
 };
 
 struct PixelPositions
@@ -342,36 +344,35 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_PS_CONTEXT
 {
-    PixelPositions vX;          // IN: x location(s) of pixels
-    PixelPositions vY;          // IN: x location(s) of pixels
-    simdscalar vZ;              // INOUT: z location of pixels
-    simdscalari activeMask;     // OUT: mask for kill
-    simdscalar  inputMask;      // IN: input coverage mask for all samples
-    simdscalari oMask;          // OUT: mask for output coverage
+    PixelPositions vX;         // IN: x location(s) of pixels
+    PixelPositions vY;         // IN: y location(s) of pixels
+    simdscalar     vZ;         // INOUT: z location of pixels
+    simdscalari    activeMask; // OUT: mask for kill
+    simdscalar     inputMask;  // IN: input coverage mask for all samples
+    simdscalari    oMask;      // OUT: mask for output coverage
 
-    PixelPositions vI;          // barycentric coords evaluated at pixel center, sample position, centroid
+    PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid
     PixelPositions vJ;
-    PixelPositions vOneOverW;   // IN: 1/w
+    PixelPositions vOneOverW; // IN: 1/w
 
     const float* pAttribs;      // IN: pointer to attribute barycentric coefficients
     const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients
     const float* pRecipW;       // IN: pointer to 1/w coord for each vertex
-    const float *I;             // IN: Barycentric A, B, and C coefs used to compute I
-    const float *J;             // IN: Barycentric A, B, and C coefs used to compute J
-    float recipDet;             // IN: 1/Det, used when barycentric interpolating attributes
+    const float* I;             // IN: Barycentric A, B, and C coefs used to compute I
+    const float* J;             // IN: Barycentric A, B, and C coefs used to compute J
+    float        recipDet;      // IN: 1/Det, used when barycentric interpolating attributes
     const float* pSamplePosX;   // IN: array of sample positions
     const float* pSamplePosY;   // IN: array of sample positions
-    simdvector shaded[SWR_NUM_RENDERTARGETS];
-                                // OUT: result color per rendertarget
+    simdvector   shaded[SWR_NUM_RENDERTARGETS]; // OUT: result color per rendertarget
 
-    uint32_t frontFace;                 // IN: front- 1, back- 0
-    uint32_t sampleIndex;               // IN: sampleIndex
-    uint32_t renderTargetArrayIndex;    // IN: render target array index from GS
-    uint32_t rasterizerSampleCount;     // IN: sample count used by the rasterizer
+    uint32_t frontFace;              // IN: 1 = front, 0 = back
+    uint32_t sampleIndex;            // IN: sampleIndex
+    uint32_t renderTargetArrayIndex; // IN: render target array index from GS
+    uint32_t rasterizerSampleCount;  // IN: sample count used by the rasterizer
 
     uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; // IN: Pointers to render target hottiles
 
-    SWR_SHADER_STATS stats;             // OUT: shader statistics used for archrast.
+    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -400,41 +401,41 @@
     // count into the shader. When the count reaches 0 then all thread groups in the
     // dispatch call have been completed.
 
-    uint32_t tileCounter;  // The tile counter value for this thread group.
+    uint32_t tileCounter; // The tile counter value for this thread group.
 
     // Dispatch dimensions used by shader to compute system values from the tile counter.
     uint32_t dispatchDims[3];
 
     uint8_t* pTGSM;               // Thread Group Shared Memory pointer.
     uint8_t* pSpillFillBuffer;    // Spill/fill buffer for barrier support
-    uint8_t* pScratchSpace;       // Pointer to scratch space buffer used by the shader, shader is responsible
-                                  // for subdividing scratch space per instance/simd
+    uint8_t* pScratchSpace;       // Pointer to scratch space buffer used by the shader, shader is
+                                  // responsible for subdividing scratch space per instance/simd
     uint32_t scratchSpacePerSimd; // Scratch space per work item x SIMD_WIDTH
 
-    SWR_SHADER_STATS stats;       // OUT: shader statistics used for archrast.
+    SWR_SHADER_STATS stats; // OUT: shader statistics used for archrast.
 };
 
 // enums
 enum SWR_TILE_MODE
 {
-    SWR_TILE_NONE = 0x0,    // Linear mode (no tiling)
-    SWR_TILE_MODE_WMAJOR,   // W major tiling
-    SWR_TILE_MODE_XMAJOR,   // X major tiling
-    SWR_TILE_MODE_YMAJOR,   // Y major tiling
-    SWR_TILE_SWRZ,          // SWR-Z tiling
+    SWR_TILE_NONE = 0x0,   // Linear mode (no tiling)
+    SWR_TILE_MODE_WMAJOR,  // W major tiling
+    SWR_TILE_MODE_XMAJOR,  // X major tiling
+    SWR_TILE_MODE_YMAJOR,  // Y major tiling
+    SWR_TILE_SWRZ,         // SWR-Z tiling
 
     SWR_TILE_MODE_COUNT
 };
 
 enum SWR_SURFACE_TYPE
 {
-    SURFACE_1D        = 0,
-    SURFACE_2D        = 1,
-    SURFACE_3D        = 2,
-    SURFACE_CUBE      = 3,
-    SURFACE_BUFFER    = 4,
+    SURFACE_1D                = 0,
+    SURFACE_2D                = 1,
+    SURFACE_3D                = 2,
+    SURFACE_CUBE              = 3,
+    SURFACE_BUFFER            = 4,
     SURFACE_STRUCTURED_BUFFER = 5,
-    SURFACE_NULL       = 7
+    SURFACE_NULL              = 7
 };
 
 enum SWR_ZFUNCTION
@@ -536,34 +537,35 @@
 //////////////////////////////////////////////////////////////////////////
 struct SWR_SURFACE_STATE
 {
-    gfxptr_t xpBaseAddress;
-    SWR_SURFACE_TYPE type;  // @llvm_enum
-    SWR_FORMAT format;      // @llvm_enum
-    uint32_t width;
-    uint32_t height;
-    uint32_t depth;
-    uint32_t numSamples;
-    uint32_t samplePattern;
-    uint32_t pitch;
-    uint32_t qpitch;
-    uint32_t minLod;            // for sampled surfaces, the most detailed LOD that can be accessed by sampler
-    uint32_t maxLod;            // for sampled surfaces, the max LOD that can be accessed
-    float resourceMinLod;       // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler
-    uint32_t lod;               // for render targets, the lod being rendered to
-    uint32_t arrayIndex;        // for render targets, the array index being rendered to for arrayed surfaces
-    SWR_TILE_MODE tileMode;     // @llvm_enum
-    uint32_t halign;
-    uint32_t valign;
-    uint32_t xOffset;
-    uint32_t yOffset;
+    gfxptr_t         xpBaseAddress;
+    SWR_SURFACE_TYPE type;   // @llvm_enum
+    SWR_FORMAT       format; // @llvm_enum
+    uint32_t         width;
+    uint32_t         height;
+    uint32_t         depth;
+    uint32_t         numSamples;
+    uint32_t         samplePattern;
+    uint32_t         pitch;
+    uint32_t         qpitch;
+    uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler
+    uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed
+    float    resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be
+                             // accessed by sampler
+    uint32_t lod;            // for render targets, the lod being rendered to
+    uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces
+    SWR_TILE_MODE tileMode; // @llvm_enum
+    uint32_t      halign;
+    uint32_t      valign;
+    uint32_t      xOffset;
+    uint32_t      yOffset;
 
     uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces
 
-    gfxptr_t xpAuxBaseAddress;   // Used for compression, append/consume counter, etc.
-    SWR_AUX_MODE auxMode;      // @llvm_enum
+    gfxptr_t     xpAuxBaseAddress; // Used for compression, append/consume counter, etc.
+    SWR_AUX_MODE auxMode;          // @llvm_enum
 
 
-    bool bInterleavedSamples;   // are MSAA samples stored interleaved or planar
+    bool bInterleavedSamples; // are MSAA samples stored interleaved or planar
 };
 
 // vertex fetch state
@@ -575,9 +577,10 @@
     uint32_t index;
     uint32_t pitch;
     uint32_t size;
-    uint32_t minVertex;             // min vertex (for bounds checking)
-    uint32_t maxVertex;             // size / pitch.  precalculated value used by fetch shader for OOB checks
-    uint32_t partialInboundsSize;   // size % pitch.  precalculated value used by fetch shader for partially OOB vertices
+    uint32_t minVertex; // min vertex (for bounds checking)
+    uint32_t maxVertex; // size / pitch.  precalculated value used by fetch shader for OOB checks
+    uint32_t partialInboundsSize; // size % pitch.  precalculated value used by fetch shader for
+                                  // partially OOB vertices
 };
 
 struct SWR_INDEX_BUFFER_STATE
@@ -585,10 +588,9 @@
     gfxptr_t xpIndices;
     // Format type for indices (e.g. UINT16, UINT32, etc.)
     SWR_FORMAT format; // @llvm_enum
-    uint32_t size;
+    uint32_t   size;
 };
 
-
 //////////////////////////////////////////////////////////////////////////
 /// SWR_FETCH_CONTEXT
 /// @brief Input to fetch shader.
@@ -597,20 +599,21 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_FETCH_CONTEXT
 {
-    const SWR_VERTEX_BUFFER_STATE* pStreams;    // IN: array of bound vertex buffers
-    gfxptr_t xpIndices;                          // IN: pointer to int32 index buffer for indexed draws
-    gfxptr_t xpLastIndex;                        // IN: pointer to end of index buffer, used for bounds checking
-    uint32_t CurInstance;                       // IN: current instance
-    uint32_t BaseVertex;                        // IN: base vertex
-    uint32_t StartVertex;                       // IN: start vertex
-    uint32_t StartInstance;                     // IN: start instance
-    simdscalari VertexID;                       // OUT: vector of vertex IDs
-    simdscalari CutMask;                        // OUT: vector mask of indices which have the cut index value
+    const SWR_VERTEX_BUFFER_STATE* pStreams;  // IN: array of bound vertex buffers
+    gfxptr_t                       xpIndices; // IN: pointer to int32 index buffer for indexed draws
+    gfxptr_t    xpLastIndex;   // IN: pointer to end of index buffer, used for bounds checking
+    uint32_t    CurInstance;   // IN: current instance
+    uint32_t    BaseVertex;    // IN: base vertex
+    uint32_t    StartVertex;   // IN: start vertex
+    uint32_t    StartInstance; // IN: start instance
+    simdscalari VertexID;      // OUT: vector of vertex IDs
+    simdscalari CutMask;       // OUT: vector mask of indices which have the cut index value
 #if USE_SIMD16_SHADERS
-//    simd16scalari VertexID;                     // OUT: vector of vertex IDs
-//    simd16scalari CutMask;                      // OUT: vector mask of indices which have the cut index value
-    simdscalari VertexID2;                      // OUT: vector of vertex IDs
-    simdscalari CutMask2;                       // OUT: vector mask of indices which have the cut index value
+    //    simd16scalari VertexID; // OUT: vector of vertex IDs
+    //    simd16scalari CutMask;  // OUT: vector mask of indices which have the
+    //                            // cut index value
+    simdscalari VertexID2; // OUT: vector of vertex IDs
+    simdscalari CutMask2;  // OUT: vector mask of indices which have the cut index value
 #endif
 };
 
@@ -626,8 +629,8 @@
     uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
 
     // Pipeline Stats
-    uint64_t PsInvocations;  // Number of Pixel Shader invocations
-    uint64_t CsInvocations;  // Number of Compute Shader invocations
+    uint64_t PsInvocations; // Number of Pixel Shader invocations
+    uint64_t CsInvocations; // Number of Compute Shader invocations
 
 };
 
@@ -653,9 +656,9 @@
     uint64_t SoNumPrimsWritten[4];
 };
 
-//////////////////////////////////////////////////////////////////////////
-/// STREAMOUT_BUFFERS
-/////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////
+    /// STREAMOUT_BUFFERS
+    /////////////////////////////////////////////////////////////////////////
 
 #define MAX_SO_STREAMS 4
 #define MAX_SO_BUFFERS 4
@@ -702,7 +705,7 @@
     // The stream masks specify which attributes are sent to which streams.
     // These masks help the FE to setup the pPrimData buffer that is passed
     // the Stream Output Shader (SOS) function.
-    uint32_t streamMasks[MAX_SO_STREAMS];
+    uint64_t streamMasks[MAX_SO_STREAMS];
 
     // Number of attributes, including position, per vertex that are streamed out.
     // This should match number of bits in stream mask.
@@ -717,7 +720,7 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_STREAMOUT_CONTEXT
 {
-    uint32_t* pPrimData;
+    uint32_t*             pPrimData;
     SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS];
 
     // Num prims written for this stream
@@ -735,8 +738,8 @@
     bool gsEnable;
 
     // If true, geometry shader emits a single stream, with separate cut buffer.
-    // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer
-    // to map vertices to streams
+    // If false, geometry shader emits vertices for multiple streams to the stream buffer, with a
+    // separate StreamID buffer to map vertices to streams
     bool isSingleStream;
 
     // Number of input attributes per vertex. Used by the frontend to
@@ -746,8 +749,8 @@
     // Stride of incoming verts in attributes
     uint32_t inputVertStride;
 
-    // Output topology - can be point, tristrip, or linestrip
-    PRIMITIVE_TOPOLOGY outputTopology;      // @llvm_enum
+    // Output topology - can be point, tristrip, linestrip, or rectlist
+    PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum
 
     // Maximum number of verts that can be emitted by a single instance of the GS
     uint32_t maxNumVerts;
@@ -762,14 +765,16 @@
     // Total amount of memory to allocate for one instance of the shader output in bytes
     uint32_t allocationSize;
 
-    // Offset to the start of the attributes of the input vertices, in simdvector units, as read by the GS
+    // Offset to the start of the attributes of the input vertices, in simdvector units, as read by
+    // the GS
     uint32_t vertexAttribOffset;
 
     // Offset to the attributes as stored by the preceding shader stage.
     uint32_t srcVertexAttribOffset;
 
-    // Size of the control data section which contains cut or streamID data, in simdscalar units. Should be sized to handle
-    // the maximum number of verts output by the GS. Can be 0 if there are no cuts or streamID bits.
+    // Size of the control data section which contains cut or streamID data, in simdscalar units.
+    // Should be sized to handle the maximum number of verts output by the GS. Can be 0 if there are
+    // no cuts or streamID bits.
     uint32_t controlDataSize;
 
     // Offset to the control data section, in bytes
@@ -781,15 +786,14 @@
     // Offset to the start of the vertex section, in bytes
     uint32_t outputVertexOffset;
 
-    // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero, shader is
-    // expected to store the final vertex count in the first dword of the gs output stream.
+    // Set this to non-zero to indicate that the shader outputs a static number of verts. If zero,
+    // shader is expected to store the final vertex count in the first dword of the gs output
+    // stream.
     uint32_t staticVertexCount;
 
     uint32_t pad;
 };
-static_assert(sizeof(SWR_GS_STATE) == 64,
-    "Adjust padding to keep size (or remove this assert)");
-
+static_assert(sizeof(SWR_GS_STATE) == 64, "Adjust padding to keep size (or remove this assert)");
 
 //////////////////////////////////////////////////////////////////////////
 /// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS
@@ -833,22 +837,22 @@
 /////////////////////////////////////////////////////////////////////////
 struct SWR_TS_STATE
 {
-    bool                    tsEnable;
+    bool tsEnable;
 
-    SWR_TS_OUTPUT_TOPOLOGY  tsOutputTopology;   // @llvm_enum
-    SWR_TS_PARTITIONING     partitioning;       // @llvm_enum
-    SWR_TS_DOMAIN           domain;             // @llvm_enum
+    SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum
+    SWR_TS_PARTITIONING    partitioning;     // @llvm_enum
+    SWR_TS_DOMAIN          domain;           // @llvm_enum
 
-    PRIMITIVE_TOPOLOGY      postDSTopology;     // @llvm_enum
+    PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum
 
-    uint32_t                numHsInputAttribs;
-    uint32_t                numHsOutputAttribs;
-    uint32_t                numDsOutputAttribs;
-    uint32_t                dsAllocationSize;
-    uint32_t                dsOutVtxAttribOffset;
+    uint32_t numHsInputAttribs;
+    uint32_t numHsOutputAttribs;
+    uint32_t numDsOutputAttribs;
+    uint32_t dsAllocationSize;
+    uint32_t dsOutVtxAttribOffset;
 
     // Offset to the start of the attributes of the input vertices, in simdvector units
-    uint32_t                vertexAttribOffset;
+    uint32_t vertexAttribOffset;
 };
 
 // output merger state
@@ -859,7 +863,8 @@
     uint8_t writeDisableBlue : 1;
     uint8_t writeDisableAlpha : 1;
 };
-static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
+static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1,
+              "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
 
 enum SWR_MULTISAMPLE_COUNT
 {
@@ -886,7 +891,7 @@
     uint32_t sampleMask;
     // all RT's have the same sample count
     ///@todo move this to Output Merger state when we refactor
-    SWR_MULTISAMPLE_COUNT sampleCount;  // @llvm_enum
+    SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum
 
     SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS];
 };
@@ -894,17 +899,17 @@
 
 struct SWR_BLEND_CONTEXT
 {
-    const SWR_BLEND_STATE*  pBlendState;
-    simdvector*             src;
-    simdvector*             src1;
-    simdvector*             src0alpha;
-    uint32_t                sampleNum;
-    simdvector*             pDst;
-    simdvector*             result;
-    simdscalari*            oMask;
-    simdscalari*            pMask;
-    uint32_t                isAlphaTested;
-    uint32_t                isAlphaBlended;
+    const SWR_BLEND_STATE* pBlendState;
+    simdvector*            src;
+    simdvector*            src1;
+    simdvector*            src0alpha;
+    uint32_t               sampleNum;
+    simdvector*            pDst;
+    simdvector*            result;
+    simdscalari*           oMask;
+    simdscalari*           pMask;
+    uint32_t               isAlphaTested;
+    uint32_t               isAlphaBlended;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -921,13 +926,12 @@
 typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
 typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
-typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);
 typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar const &);
 
 
-
 //////////////////////////////////////////////////////////////////////////
 /// FRONTEND_STATE
 /////////////////////////////////////////////////////////////////////////
@@ -1028,44 +1032,44 @@
 struct SWR_MULTISAMPLE_POS
 {
 public:
-    INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; }; // @llvm_func
-    INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; }; // @llvm_func
-    INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; }; // @llvm_func
-    INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; }; // @llvm_func
-    INLINE void SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; }; // @llvm_func
-    INLINE void SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; }; // @llvm_func
-    INLINE float X(uint32_t sampleNum) const { return _x[sampleNum]; }; // @llvm_func
-    INLINE float Y(uint32_t sampleNum) const { return _y[sampleNum]; }; // @llvm_func
-    typedef const float(&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES]; //@llvm_typedef
-    INLINE sampleArrayT X() const { return _x; }; // @llvm_func
-    INLINE sampleArrayT Y() const { return _y; }; // @llvm_func
+    INLINE void SetXi(uint32_t sampleNum, uint32_t val) { _xi[sampleNum] = val; };   // @llvm_func
+    INLINE void SetYi(uint32_t sampleNum, uint32_t val) { _yi[sampleNum] = val; };   // @llvm_func
+    INLINE uint32_t Xi(uint32_t sampleNum) const { return _xi[sampleNum]; };         // @llvm_func
+    INLINE uint32_t Yi(uint32_t sampleNum) const { return _yi[sampleNum]; };         // @llvm_func
+    INLINE void     SetX(uint32_t sampleNum, float val) { _x[sampleNum] = val; };    // @llvm_func
+    INLINE void     SetY(uint32_t sampleNum, float val) { _y[sampleNum] = val; };    // @llvm_func
+    INLINE float    X(uint32_t sampleNum) const { return _x[sampleNum]; };           // @llvm_func
+    INLINE float    Y(uint32_t sampleNum) const { return _y[sampleNum]; };           // @llvm_func
+    typedef const float (&sampleArrayT)[SWR_MAX_NUM_MULTISAMPLES];                   //@llvm_typedef
+    INLINE sampleArrayT X() const { return _x; };                                    // @llvm_func
+    INLINE sampleArrayT Y() const { return _y; };                                    // @llvm_func
     INLINE const __m128i& vXi(uint32_t sampleNum) const { return _vXi[sampleNum]; }; // @llvm_func
     INLINE const __m128i& vYi(uint32_t sampleNum) const { return _vYi[sampleNum]; }; // @llvm_func
     INLINE const simdscalar& vX(uint32_t sampleNum) const { return _vX[sampleNum]; }; // @llvm_func
     INLINE const simdscalar& vY(uint32_t sampleNum) const { return _vY[sampleNum]; }; // @llvm_func
-    INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; }; // @llvm_func
-    INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; }; // @llvm_func
+    INLINE const __m128i& TileSampleOffsetsX() const { return tileSampleOffsetsX; };  // @llvm_func
+    INLINE const __m128i& TileSampleOffsetsY() const { return tileSampleOffsetsY; };  // @llvm_func
 
     INLINE void PrecalcSampleData(int numSamples); //@llvm_func
 
 private:
     template <typename MaskT>
     INLINE __m128i expandThenBlend4(uint32_t* min, uint32_t* max); // @llvm_func
-    INLINE void CalcTileSampleOffsets(int numSamples);   // @llvm_func
+    INLINE void    CalcTileSampleOffsets(int numSamples);          // @llvm_func
 
     // scalar sample values
     uint32_t _xi[SWR_MAX_NUM_MULTISAMPLES];
     uint32_t _yi[SWR_MAX_NUM_MULTISAMPLES];
-    float _x[SWR_MAX_NUM_MULTISAMPLES];
-    float _y[SWR_MAX_NUM_MULTISAMPLES];
+    float    _x[SWR_MAX_NUM_MULTISAMPLES];
+    float    _y[SWR_MAX_NUM_MULTISAMPLES];
 
     // precalc'd / vectorized samples
-    __m128i _vXi[SWR_MAX_NUM_MULTISAMPLES];
-    __m128i _vYi[SWR_MAX_NUM_MULTISAMPLES];
+    __m128i    _vXi[SWR_MAX_NUM_MULTISAMPLES];
+    __m128i    _vYi[SWR_MAX_NUM_MULTISAMPLES];
     simdscalar _vX[SWR_MAX_NUM_MULTISAMPLES];
     simdscalar _vY[SWR_MAX_NUM_MULTISAMPLES];
-    __m128i tileSampleOffsetsX;
-    __m128i tileSampleOffsetsY;
+    __m128i    tileSampleOffsetsX;
+    __m128i    tileSampleOffsetsY;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -1073,33 +1077,33 @@
 //////////////////////////////////////////////////////////////////////////
 struct SWR_RASTSTATE
 {
-    uint32_t cullMode               : 2;
-    uint32_t fillMode               : 2;
-    uint32_t frontWinding           : 1;
-    uint32_t scissorEnable          : 1;
-    uint32_t depthClipEnable        : 1;
-    uint32_t clipHalfZ              : 1;
-    uint32_t pointParam             : 1;
-    uint32_t pointSpriteEnable      : 1;
-    uint32_t pointSpriteTopOrigin   : 1;
-    uint32_t forcedSampleCount      : 1;
-    uint32_t pixelOffset            : 1;
-    uint32_t depthBiasPreAdjusted   : 1;    ///< depth bias constant is in float units, not per-format Z units
-    uint32_t conservativeRast       : 1;
+    uint32_t cullMode : 2;
+    uint32_t fillMode : 2;
+    uint32_t frontWinding : 1;
+    uint32_t scissorEnable : 1;
+    uint32_t depthClipEnable : 1;
+    uint32_t clipHalfZ : 1;
+    uint32_t pointParam : 1;
+    uint32_t pointSpriteEnable : 1;
+    uint32_t pointSpriteTopOrigin : 1;
+    uint32_t forcedSampleCount : 1;
+    uint32_t pixelOffset : 1;
+    uint32_t depthBiasPreAdjusted : 1; ///< depth bias constant is in float units, not per-format Z units
+    uint32_t conservativeRast : 1;
 
     float pointSize;
     float lineWidth;
 
-    float depthBias;
-    float slopeScaledDepthBias;
-    float depthBiasClamp;
-    SWR_FORMAT depthFormat;     // @llvm_enum
+    float      depthBias;
+    float      slopeScaledDepthBias;
+    float      depthBiasClamp;
+    SWR_FORMAT depthFormat; // @llvm_enum
 
     // sample count the rasterizer is running at
-    SWR_MULTISAMPLE_COUNT sampleCount;  // @llvm_enum
-    uint32_t pixelLocation;     // UL or Center
-    SWR_MULTISAMPLE_POS samplePositions;    // @llvm_struct
-    bool bIsCenterPattern;   // @llvm_enum
+    SWR_MULTISAMPLE_COUNT sampleCount;      // @llvm_enum
+    uint32_t              pixelLocation;    // UL or Center
+    SWR_MULTISAMPLE_POS   samplePositions;  // @llvm_struct
+    bool                  bIsCenterPattern; // @llvm_enum
 };
 
 
@@ -1121,17 +1125,21 @@
 // backend state
 struct SWR_BACKEND_STATE
 {
-    uint32_t constantInterpolationMask;     // bitmask indicating which attributes have constant interpolation
-    uint32_t pointSpriteTexCoordMask;       // bitmask indicating the attribute(s) which should be interpreted as tex coordinates
+    uint32_t constantInterpolationMask; // bitmask indicating which attributes have constant
+                                        // interpolation
+    uint32_t pointSpriteTexCoordMask;   // bitmask indicating the attribute(s) which should be
+                                        // interpreted as tex coordinates
 
-    bool swizzleEnable;                 // when enabled, core will parse the swizzle map when
-                                        // setting up attributes for the backend, otherwise
-                                        // all attributes up to numAttributes will be sent
-    uint8_t numAttributes;                  // total number of attributes to send to backend (up to 32)
-    uint8_t numComponents[32];              // number of components to setup per attribute, this reduces some calculations for unneeded components
+    bool swizzleEnable;        // when enabled, core will parse the swizzle map when
+                               // setting up attributes for the backend, otherwise
+                               // all attributes up to numAttributes will be sent
+    uint8_t numAttributes;     // total number of attributes to send to backend (up to 32)
+    uint8_t numComponents[32]; // number of components to setup per attribute, this reduces some
+                               // calculations for unneeded components
 
-    bool readRenderTargetArrayIndex;    // Forward render target array index from last FE stage to the backend
-    bool readViewportArrayIndex;        // Read viewport array index from last FE stage during binning
+    bool readRenderTargetArrayIndex; // Forward render target array index from last FE stage to the
+                                     // backend
+    bool readViewportArrayIndex;     // Read viewport array index from last FE stage during binning
 
     // User clip/cull distance enables
     uint8_t cullDistanceMask;
@@ -1141,7 +1149,7 @@
     // and that the next fields are dword aligned.
     uint8_t pad[10];
 
-        // Offset to the start of the attributes of the input vertices, in simdvector units
+    // Offset to the start of the attributes of the input vertices, in simdvector units
     uint32_t vertexAttribOffset;
 
     // Offset to clip/cull attrib section of the vertex, in simdvector units
@@ -1150,7 +1158,7 @@
     SWR_ATTRIB_SWIZZLE swizzleMap[32];
 };
 static_assert(sizeof(SWR_BACKEND_STATE) == 128,
-    "Adjust padding to keep size (or remove this assert)");
+              "Adjust padding to keep size (or remove this assert)");
 
 
 union SWR_DEPTH_STENCIL_STATE
@@ -1213,8 +1221,8 @@
 
 enum SWR_BARYCENTRICS_MASK
 {
-    SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1,
-    SWR_BARYCENTRIC_CENTROID_MASK = 0x2,
+    SWR_BARYCENTRIC_PER_PIXEL_MASK  = 0x1,
+    SWR_BARYCENTRIC_CENTROID_MASK   = 0x2,
     SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4,
 };
 
@@ -1222,27 +1230,28 @@
 struct SWR_PS_STATE
 {
     // dword 0-1
-    PFN_PIXEL_KERNEL pfnPixelShader;  // @llvm_pfn
+    PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn
 
     // dword 2
-    uint32_t killsPixel             : 1;    // pixel shader can kill pixels
-    uint32_t inputCoverage          : 2;    // ps uses input coverage
-    uint32_t writesODepth           : 1;    // pixel shader writes to depth
-    uint32_t usesSourceDepth        : 1;    // pixel shader reads depth
-    uint32_t shadingRate            : 2;    // shading per pixel / sample / coarse pixel
-    uint32_t posOffset              : 2;    // type of offset (none, sample, centroid) to add to pixel position
-    uint32_t barycentricsMask       : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
-    uint32_t usesUAV                : 1;    // pixel shader accesses UAV
-    uint32_t forceEarlyZ            : 1;    // force execution of early depth/stencil test
+    uint32_t killsPixel : 1;      // pixel shader can kill pixels
+    uint32_t inputCoverage : 2;   // ps uses input coverage
+    uint32_t writesODepth : 1;    // pixel shader writes to depth
+    uint32_t usesSourceDepth : 1; // pixel shader reads depth
+    uint32_t shadingRate : 2;     // shading per pixel / sample / coarse pixel
+    uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position
+    uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate
+                                   // attributes with
+    uint32_t usesUAV : 1;          // pixel shader accesses UAV
+    uint32_t forceEarlyZ : 1;      // force execution of early depth/stencil test
 
-    uint8_t renderTargetMask;               // Mask of render targets written
+    uint8_t renderTargetMask; // Mask of render targets written
 };
 
 // depth bounds state
 struct SWR_DEPTH_BOUNDS_STATE
 {
-    bool    depthBoundsTestEnable;
-    float   depthBoundsTestMinValue;
-    float   depthBoundsTestMaxValue;
+    bool  depthBoundsTestEnable;
+    float depthBoundsTestMinValue;
+    float depthBoundsTestMaxValue;
 };
-
+// clang-format on
diff --git a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
index eaf0094..99eac83 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state_funcs.h
@@ -1,36 +1,35 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file state.h
-*
-* @brief Definitions for API state - complex function implementation.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file state_funcs.h
+ *
+ * @brief Definitions for API state - complex function implementation.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "core/state.h"
 #include "common/simdintrin.h"
 
-
 template <typename MaskT>
 INLINE __m128i SWR_MULTISAMPLE_POS::expandThenBlend4(uint32_t* min, uint32_t* max)
 {
@@ -41,27 +40,27 @@
 
 INLINE void SWR_MULTISAMPLE_POS::PrecalcSampleData(int numSamples)
 {
-    for(int i = 0; i < numSamples; i++)
+    for (int i = 0; i < numSamples; i++)
     {
         _vXi[i] = _mm_set1_epi32(_xi[i]);
         _vYi[i] = _mm_set1_epi32(_yi[i]);
-        _vX[i] = _simd_set1_ps(_x[i]);
-        _vY[i] = _simd_set1_ps(_y[i]);
+        _vX[i]  = _simd_set1_ps(_x[i]);
+        _vY[i]  = _simd_set1_ps(_y[i]);
     }
     // precalculate the raster tile BB for the rasterizer.
-    CalcTileSampleOffsets(numSamples);                                 
+    CalcTileSampleOffsets(numSamples);
 }
 
 INLINE void SWR_MULTISAMPLE_POS::CalcTileSampleOffsets(int numSamples)
 {
-    auto minXi = std::min_element(std::begin(_xi), &_xi[numSamples]);
-    auto maxXi = std::max_element(std::begin(_xi), &_xi[numSamples]);
+    auto minXi  = std::min_element(std::begin(_xi), &_xi[numSamples]);
+    auto maxXi  = std::max_element(std::begin(_xi), &_xi[numSamples]);
     using xMask = std::integral_constant<int, 0xA>;
     // BR(max),    BL(min),    UR(max),    UL(min)
     tileSampleOffsetsX = expandThenBlend4<xMask>(minXi, maxXi);
 
-    auto minYi = std::min_element(std::begin(_yi), &_yi[numSamples]);
-    auto maxYi = std::max_element(std::begin(_yi), &_yi[numSamples]);
+    auto minYi  = std::min_element(std::begin(_yi), &_yi[numSamples]);
+    auto maxYi  = std::max_element(std::begin(_yi), &_yi[numSamples]);
     using yMask = std::integral_constant<int, 0xC>;
     // BR(max),    BL(min),    UR(max),    UL(min)
     tileSampleOffsetsY = expandThenBlend4<yMask>(minYi, maxYi);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h
index 316f66f..348170b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tessellator.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h
@@ -1,43 +1,42 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file tessellator.h
-*
-* @brief Tessellator fixed function unit interface definition
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file tessellator.h
+ *
+ * @brief Tessellator fixed function unit interface definition
+ *
+ ******************************************************************************/
 #pragma once
 
 /// Allocate and initialize a new tessellation context
-HANDLE SWR_API TSInitCtx(
-    SWR_TS_DOMAIN tsDomain,                     ///< [IN] Tessellation domain (isoline, quad, triangle)
-    SWR_TS_PARTITIONING tsPartitioning,         ///< [IN] Tessellation partitioning algorithm
-    SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology,    ///< [IN] Tessellation output topology
-    void* pContextMem,                          ///< [IN] Memory to use for the context
-    size_t& memSize);                           ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required
+HANDLE SWR_API
+       TSInitCtx(SWR_TS_DOMAIN          tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle)
+                 SWR_TS_PARTITIONING    tsPartitioning, ///< [IN] Tessellation partitioning algorithm
+                 SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology
+                 void*                  pContextMem,      ///< [IN] Memory to use for the context
+                 size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required
 
 /// Destroy & de-allocate tessellation context
-void SWR_API TSDestroyCtx(
-    HANDLE tsCtx);  ///< [IN] Tessellation context to be destroyed
+void SWR_API TSDestroyCtx(HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed
 
 struct SWR_TS_TESSELLATED_DATA
 {
@@ -45,43 +44,38 @@
     uint32_t NumDomainPoints;
 
     uint32_t* ppIndices[3];
-    float* pDomainPointsU;
-    float* pDomainPointsV;
+    float*    pDomainPointsU;
+    float*    pDomainPointsV;
     // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i]
 };
 
 /// Perform Tessellation
-void SWR_API TSTessellate(
-    HANDLE tsCtx,                                   ///< [IN] Tessellation Context
-    const SWR_TESSELLATION_FACTORS& tsTessFactors,  ///< [IN] Tessellation Factors
-    SWR_TS_TESSELLATED_DATA& tsTessellatedData);    ///< [OUT] Tessellated Data
-
+void SWR_API
+     TSTessellate(HANDLE                          tsCtx,         ///< [IN] Tessellation Context
+                  const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors
+                  SWR_TS_TESSELLATED_DATA&        tsTessellatedData);   ///< [OUT] Tessellated Data
 
 
 /// @TODO - Implement OSS tessellator
 
-INLINE HANDLE SWR_API TSInitCtx(
-    SWR_TS_DOMAIN tsDomain,
-    SWR_TS_PARTITIONING tsPartitioning,
-    SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology,
-    void* pContextMem,
-    size_t& memSize)
+INLINE HANDLE SWR_API TSInitCtx(SWR_TS_DOMAIN          tsDomain,
+                                SWR_TS_PARTITIONING    tsPartitioning,
+                                SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology,
+                                void*                  pContextMem,
+                                size_t&                memSize)
 {
     SWR_NOT_IMPL;
     return NULL;
 }
 
-
 INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx)
 {
     SWR_NOT_IMPL;
 }
 
-
-INLINE void SWR_API TSTessellate(
-    HANDLE tsCtx,
-    const SWR_TESSELLATION_FACTORS& tsTessFactors,
-    SWR_TS_TESSELLATED_DATA& tsTessellatedData)
+INLINE void SWR_API TSTessellate(HANDLE                          tsCtx,
+                                 const SWR_TESSELLATION_FACTORS& tsTessFactors,
+                                 SWR_TS_TESSELLATED_DATA&        tsTessellatedData)
 {
     SWR_NOT_IMPL;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 9e16246..4523616 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -1,25 +1,25 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-****************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ****************************************************************************/
 
 #include <stdio.h>
 #include <thread>
@@ -49,15 +49,14 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
-
-
+#include "tileset.h"
 
 
 // ThreadId
 struct Core
 {
-    uint32_t                procGroup = 0;
-    std::vector<uint32_t>   threadIds;
+    uint32_t              procGroup = 0;
+    std::vector<uint32_t> threadIds;
 };
 
 struct NumaNode
@@ -77,7 +76,7 @@
 
     std::vector<KAFFINITY> threadMaskPerProcGroup;
 
-    static std::mutex m;
+    static std::mutex           m;
     std::lock_guard<std::mutex> l(m);
 
     DWORD bufSize = 0;
@@ -85,13 +84,14 @@
     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
     SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
 
-    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
+    PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem =
+        (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
     SWR_ASSERT(pBufferMem);
 
     ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
     SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
 
-    uint32_t count = bufSize / pBufferMem->Size;
+    uint32_t                                 count   = bufSize / pBufferMem->Size;
     PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
 
     for (uint32_t i = 0; i < count; ++i)
@@ -99,8 +99,8 @@
         SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore);
         for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g)
         {
-            auto& gmask = pBuffer->Processor.GroupMask[g];
-            uint32_t threadId = 0;
+            auto&    gmask     = pBuffer->Processor.GroupMask[g];
+            uint32_t threadId  = 0;
             uint32_t procGroup = gmask.Group;
 
             Core* pCore = nullptr;
@@ -132,10 +132,10 @@
                 threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
 
                 // Find Numa Node
-                uint32_t numaId = 0;
+                uint32_t         numaId  = 0;
                 PROCESSOR_NUMBER procNum = {};
-                procNum.Group = WORD(procGroup);
-                procNum.Number = UCHAR(threadId);
+                procNum.Group            = WORD(procGroup);
+                procNum.Number           = UCHAR(threadId);
 
                 ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
                 SWR_ASSERT(ret);
@@ -145,7 +145,7 @@
                 {
                     out_nodes.resize(numaId + 1);
                 }
-                auto& numaNode = out_nodes[numaId];
+                auto& numaNode  = out_nodes[numaId];
                 numaNode.numaId = numaId;
 
                 uint32_t coreId = 0;
@@ -153,7 +153,7 @@
                 if (nullptr == pCore)
                 {
                     numaNode.cores.push_back(Core());
-                    pCore = &numaNode.cores.back();
+                    pCore            = &numaNode.cores.back();
                     pCore->procGroup = procGroup;
                 }
                 pCore->threadIds.push_back(threadId);
@@ -168,56 +168,55 @@
 
     free(pBufferMem);
 
-
-#elif defined(__linux__) || defined (__gnu_linux__)
+#elif defined(__linux__) || defined(__gnu_linux__)
 
     // Parse /proc/cpuinfo to get full topology
     std::ifstream input("/proc/cpuinfo");
-    std::string line;
-    char* c;
-    uint32_t procId = uint32_t(-1);
-    uint32_t coreId = uint32_t(-1);
-    uint32_t physId = uint32_t(-1);
+    std::string   line;
+    char*         c;
+    uint32_t      procId = uint32_t(-1);
+    uint32_t      coreId = uint32_t(-1);
+    uint32_t      physId = uint32_t(-1);
 
     while (std::getline(input, line))
     {
         if (line.find("processor") != std::string::npos)
         {
             auto data_start = line.find(": ") + 2;
-            procId = std::strtoul(&line.c_str()[data_start], &c, 10);
+            procId          = std::strtoul(&line.c_str()[data_start], &c, 10);
             continue;
         }
         if (line.find("core id") != std::string::npos)
         {
             auto data_start = line.find(": ") + 2;
-            coreId = std::strtoul(&line.c_str()[data_start], &c, 10);
+            coreId          = std::strtoul(&line.c_str()[data_start], &c, 10);
             continue;
         }
         if (line.find("physical id") != std::string::npos)
         {
             auto data_start = line.find(": ") + 2;
-            physId = std::strtoul(&line.c_str()[data_start], &c, 10);
+            physId          = std::strtoul(&line.c_str()[data_start], &c, 10);
             continue;
         }
         if (line.length() == 0)
         {
             if (physId + 1 > out_nodes.size())
                 out_nodes.resize(physId + 1);
-            auto& numaNode = out_nodes[physId];
+            auto& numaNode  = out_nodes[physId];
             numaNode.numaId = physId;
 
             if (coreId + 1 > numaNode.cores.size())
                 numaNode.cores.resize(coreId + 1);
-            auto& core = numaNode.cores[coreId];
+            auto& core     = numaNode.cores[coreId];
             core.procGroup = coreId;
             core.threadIds.push_back(procId);
         }
     }
 
     out_numThreadsPerProcGroup = 0;
-    for (auto &node : out_nodes)
+    for (auto& node : out_nodes)
     {
-        for (auto &core : node.cores)
+        for (auto& core : node.cores)
         {
             out_numThreadsPerProcGroup += core.threadIds.size();
         }
@@ -225,11 +224,11 @@
 
 #elif defined(__APPLE__)
 
-    auto numProcessors = 0;
-    auto numCores = 0;
+    auto numProcessors  = 0;
+    auto numCores       = 0;
     auto numPhysicalIds = 0;
 
-    int value;
+    int    value;
     size_t size = sizeof(value);
 
     int result = sysctlbyname("hw.packages", &value, &size, NULL, 0);
@@ -248,8 +247,8 @@
 
     for (auto physId = 0; physId < numPhysicalIds; ++physId)
     {
-        auto &numaNode = out_nodes[physId];
-        auto procId = 0;
+        auto& numaNode = out_nodes[physId];
+        auto  procId   = 0;
 
         numaNode.cores.resize(numCores);
 
@@ -257,7 +256,7 @@
         {
             for (auto coreId = 0; coreId < numaNode.cores.size(); ++coreId, ++procId)
             {
-                auto &core = numaNode.cores[coreId];
+                auto& core = numaNode.cores[coreId];
 
                 core.procGroup = coreId;
                 core.threadIds.push_back(procId);
@@ -267,9 +266,9 @@
 
     out_numThreadsPerProcGroup = 0;
 
-    for (auto &node : out_nodes)
+    for (auto& node : out_nodes)
     {
-        for (auto &core : node.cores)
+        for (auto& core : node.cores)
         {
             out_numThreadsPerProcGroup += core.threadIds.size();
         }
@@ -282,10 +281,10 @@
 #endif
 
     // Prune empty cores and numa nodes
-    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end(); )
+    for (auto node_it = out_nodes.begin(); node_it != out_nodes.end();)
     {
         // Erase empty cores (first)
-        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end(); )
+        for (auto core_it = node_it->cores.begin(); core_it != node_it->cores.end();)
         {
             if (core_it->threadIds.size() == 0)
             {
@@ -309,10 +308,14 @@
     }
 }
 
-void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false)
+void bindThread(SWR_CONTEXT* pContext,
+                uint32_t     threadId,
+                uint32_t     procGroupId   = 0,
+                bool         bindProcGroup = false)
 {
     // Only bind threads when MAX_WORKER_THREADS isn't set.
-    if (pContext->threadInfo.SINGLE_THREADED || (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
+    if (pContext->threadInfo.SINGLE_THREADED ||
+        (pContext->threadInfo.MAX_WORKER_THREADS && bindProcGroup == false))
     {
         return;
     }
@@ -320,7 +323,7 @@
 #if defined(_WIN32)
 
     GROUP_AFFINITY affinity = {};
-    affinity.Group = procGroupId;
+    affinity.Group          = procGroupId;
 
 #if !defined(_WIN64)
     if (threadId >= 32)
@@ -339,7 +342,7 @@
     {
         // If MAX_WORKER_THREADS is set, only bind to the proc group,
         // Not the individual HW thread.
-        if (!bindProcGroup  && !pContext->threadInfo.MAX_WORKER_THREADS)
+        if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
         {
             affinity.Mask = KAFFINITY(1) << threadId;
         }
@@ -371,15 +374,15 @@
 }
 
 INLINE
-uint32_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
+uint32_t GetEnqueuedDraw(SWR_CONTEXT* pContext)
 {
     return pContext->dcRing.GetHead();
 }
 
 INLINE
-DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint32_t drawId)
+DRAW_CONTEXT* GetDC(SWR_CONTEXT* pContext, uint32_t drawId)
 {
-    return &pContext->dcRing[(drawId-1) % pContext->MAX_DRAWS_IN_FLIGHT];
+    return &pContext->dcRing[(drawId - 1) % pContext->MAX_DRAWS_IN_FLIGHT];
 }
 
 INLINE
@@ -392,12 +395,12 @@
 
 // returns true if dependency not met
 INLINE
-bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
+bool CheckDependency(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 {
     return pDC->dependent && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 }
 
-bool CheckDependencyFE(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t lastRetiredDraw)
+bool CheckDependencyFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t lastRetiredDraw)
 {
     return pDC->dependentFE && IDComparesLess(lastRetiredDraw, pDC->drawId - 1);
 }
@@ -412,15 +415,15 @@
     }
 
     DRAW_DYNAMIC_STATE& dynState = pDC->dynState;
-    OSALIGNLINE(SWR_STATS) stats{ 0 };
+    OSALIGNLINE(SWR_STATS) stats{0};
 
     // Sum up stats across all workers before sending to client.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
         stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
 
-        stats.PsInvocations  += dynState.pStats[i].PsInvocations;
-        stats.CsInvocations  += dynState.pStats[i].CsInvocations;
+        stats.PsInvocations += dynState.pStats[i].PsInvocations;
+        stats.CsInvocations += dynState.pStats[i].CsInvocations;
     }
 
 
@@ -434,8 +437,8 @@
     if (pDC->retireCallback.pfnCallbackFunc)
     {
         pDC->retireCallback.pfnCallbackFunc(pDC->retireCallback.userData,
-            pDC->retireCallback.userData2,
-            pDC->retireCallback.userData3);
+                                            pDC->retireCallback.userData2,
+                                            pDC->retireCallback.userData3);
     }
 }
 
@@ -464,7 +467,7 @@
 
         _ReadWriteBarrier();
 
-        pContext->dcRing.Dequeue();  // Remove from tail
+        pContext->dcRing.Dequeue(); // Remove from tail
     }
 
     return result;
@@ -476,20 +479,23 @@
     return CompleteDrawContextInl(pContext, 0, pDC);
 }
 
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE, uint32_t& drawEnqueued)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext,
+                                    uint32_t     workerId,
+                                    uint32_t&    curDrawBE,
+                                    uint32_t&    drawEnqueued)
 {
     // increment our current draw id to the first incomplete draw
     drawEnqueued = GetEnqueuedDraw(pContext);
     while (IDComparesLess(curDrawBE, drawEnqueued))
     {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
+        DRAW_CONTEXT* pDC = &pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT];
 
         // If its not compute and FE is not done then break out of loop.
-        if (!pDC->doneFE && !pDC->isCompute) break;
+        if (!pDC->doneFE && !pDC->isCompute)
+            break;
 
-        bool isWorkComplete = pDC->isCompute ?
-            pDC->pDispatch->isWorkComplete() :
-            pDC->pTileMgr->isWorkComplete();
+        bool isWorkComplete =
+            pDC->isCompute ? pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
 
         if (isWorkComplete)
         {
@@ -510,24 +516,24 @@
 /// @brief If there is any BE work then go work on it.
 /// @param pContext - pointer to SWR context.
 /// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
-///                    has its own curDrawBE counter and this ensures that each worker processes all the
-///                    draws in order.
+/// @param curDrawBE - This tracks the draw contexts that this thread has
+///                    processed. Each worker thread has its own curDrawBE
+///                    counter and this ensures that each worker processes all
+///                    the draws in order.
 /// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
-///                      own set and each time it fails to lock a macrotile, because its already locked,
-///                      then it will add that tile to the lockedTiles set. As a worker begins to work
-///                      on future draws the lockedTiles ensure that it doesn't work on tiles that may
-///                      still have work pending in a previous draw. Additionally, the lockedTiles is
-///                      hueristic that can steer a worker back to the same macrotile that it had been
-///                      working on in a previous draw.
+///                      own set and each time it fails to lock a macrotile, because it's already
+///                      locked, then it will add that tile to the lockedTiles set. As a worker
+///                      begins to work on future draws the lockedTiles ensure that it doesn't work
+///                      on tiles that may still have work pending in a previous draw. Additionally,
+///                      the lockedTiles is a heuristic that can steer a worker back to the same
+///                      macrotile that it had been working on in a previous draw.
 /// @returns        true if worker thread should shutdown
-bool WorkOnFifoBE(
-    SWR_CONTEXT *pContext,
-    uint32_t workerId,
-    uint32_t &curDrawBE,
-    TileSet& lockedTiles,
-    uint32_t numaNode,
-    uint32_t numaMask)
+bool WorkOnFifoBE(SWR_CONTEXT* pContext,
+                  uint32_t     workerId,
+                  uint32_t&    curDrawBE,
+                  TileSet&     lockedTiles,
+                  uint32_t     numaNode,
+                  uint32_t     numaMask)
 {
     bool bShutdown = false;
 
@@ -539,27 +545,30 @@
         return false;
     }
 
-    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
+    uint32_t lastRetiredDraw =
+        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
     // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
     lockedTiles.clear();
 
     // Try to work on each draw in order of the available draws in flight.
     //   1. If we're on curDrawBE, we can work on any macrotile that is available.
-    //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
+    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
     //      working on those macrotiles that are known to be complete in the prior draw to
     //      maintain order. The locked tiles provides the history to ensures this.
     for (uint32_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
     {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
+        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
 
-        if (pDC->isCompute) return false; // We don't look at compute work.
+        if (pDC->isCompute)
+            return false; // We don't look at compute work.
 
         // First wait for FE to be finished with this draw. This keeps threading model simple
         // but if there are lots of bubbles between draws then serializing FE and BE may
         // need to be revisited.
-        if (!pDC->doneFE) return false;
-        
+        if (!pDC->doneFE)
+            return false;
+
         // If this draw is dependent on a previous draw then we need to bail.
         if (CheckDependency(pContext, pDC, lastRetiredDraw))
         {
@@ -567,7 +576,7 @@
         }
 
         // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
-        auto &macroTiles = pDC->pTileMgr->getDirtyTiles();
+        auto& macroTiles = pDC->pTileMgr->getDirtyTiles();
 
         for (auto tile : macroTiles)
         {
@@ -587,14 +596,14 @@
             }
 
             // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) != lockedTiles.end())
+            if (lockedTiles.get(tileID))
             {
                 continue;
             }
 
             if (tile->tryLock())
             {
-                BE_WORK *pWork;
+                BE_WORK* pWork;
 
                 RDTSC_BEGIN(WorkerFoundWork, pDC->drawId);
 
@@ -623,11 +632,13 @@
 
                 pDC->pTileMgr->markTileComplete(tileID);
 
-                // Optimization: If the draw is complete and we're the last one to have worked on it then
-                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+                // Optimization: If the draw is complete and we're the last one to have worked on it
+                // then we can reset the locked list as we know that all previous draws before the
+                // next are guaranteed to be complete.
                 if ((curDrawBE == i) && (bShutdown || pDC->pTileMgr->isWorkComplete()))
                 {
-                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
+                    // We can increment the current BE and safely move to next draw since we know
+                    // this draw is complete.
                     curDrawBE++;
                     CompleteDrawContextInl(pContext, workerId, pDC);
 
@@ -644,8 +655,9 @@
             }
             else
             {
-                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                lockedTiles.insert(tileID);
+                // This tile is already locked. So let's add it to our locked tiles set. This way we
+                // don't try locking this one again.
+                lockedTiles.set(tileID);
             }
         }
     }
@@ -662,12 +674,24 @@
         SWR_STATS_FE& stats = pDC->dynState.statsFE;
 
         AR_EVENT(FrontendStatsEvent(pDC->drawId,
-            stats.IaVertices, stats.IaPrimitives, stats.VsInvocations, stats.HsInvocations,
-            stats.DsInvocations, stats.GsInvocations, stats.GsPrimitives, stats.CInvocations, stats.CPrimitives,
-            stats.SoPrimStorageNeeded[0], stats.SoPrimStorageNeeded[1], stats.SoPrimStorageNeeded[2], stats.SoPrimStorageNeeded[3],
-            stats.SoNumPrimsWritten[0], stats.SoNumPrimsWritten[1], stats.SoNumPrimsWritten[2], stats.SoNumPrimsWritten[3]
-        ));
-		AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
+                                    stats.IaVertices,
+                                    stats.IaPrimitives,
+                                    stats.VsInvocations,
+                                    stats.HsInvocations,
+                                    stats.DsInvocations,
+                                    stats.GsInvocations,
+                                    stats.GsPrimitives,
+                                    stats.CInvocations,
+                                    stats.CPrimitives,
+                                    stats.SoPrimStorageNeeded[0],
+                                    stats.SoPrimStorageNeeded[1],
+                                    stats.SoPrimStorageNeeded[2],
+                                    stats.SoPrimStorageNeeded[3],
+                                    stats.SoNumPrimsWritten[0],
+                                    stats.SoNumPrimsWritten[1],
+                                    stats.SoNumPrimsWritten[2],
+                                    stats.SoNumPrimsWritten[3]));
+        AR_EVENT(FrontendDrawEndEvent(pDC->drawId));
 
         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &stats);
     }
@@ -679,7 +703,8 @@
             if ((pDC->dynState.SoWriteOffsetDirty[i]) &&
                 (pDC->pState->state.soBuffer[i].soWriteEnable))
             {
-                pContext->pfnUpdateSoWriteOffset(GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
+                pContext->pfnUpdateSoWriteOffset(
+                    GetPrivateState(pDC), i, pDC->dynState.SoWriteOffset[i]);
             }
         }
     }
@@ -691,14 +716,14 @@
     InterlockedDecrement(&pContext->drawsOutstandingFE);
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
+void WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE)
 {
     // Try to grab the next DC from the ring
     uint32_t drawEnqueued = GetEnqueuedDraw(pContext);
     while (IDComparesLess(curDrawFE, drawEnqueued))
     {
-        uint32_t dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
-        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+        uint32_t      dcSlot = curDrawFE % pContext->MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
         if (pDC->isCompute || pDC->doneFE)
         {
             CompleteDrawContextInl(pContext, workerId, pDC);
@@ -711,11 +736,11 @@
     }
 
     uint32_t lastRetiredFE = curDrawFE - 1;
-    uint32_t curDraw = curDrawFE;
+    uint32_t curDraw       = curDrawFE;
     while (IDComparesLess(curDraw, drawEnqueued))
     {
-        uint32_t dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
-        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+        uint32_t      dcSlot = curDraw % pContext->MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT* pDC    = &pContext->dcRing[dcSlot];
 
         if (!pDC->isCompute && !pDC->FeLock)
         {
@@ -741,13 +766,11 @@
 /// @brief If there is any compute work then go work on it.
 /// @param pContext - pointer to SWR context.
 /// @param workerId - The unique worker ID that is assigned to this thread.
-/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
-///                    has its own curDrawBE counter and this ensures that each worker processes all the
-///                    draws in order.
-void WorkOnCompute(
-    SWR_CONTEXT *pContext,
-    uint32_t workerId,
-    uint32_t& curDrawBE)
+/// @param curDrawBE - This tracks the draw contexts that this thread has
+///                    processed. Each worker thread has its own curDrawBE
+///                    counter and this ensures that each worker processes all
+///                    the draws in order.
+void WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE)
 {
     uint32_t drawEnqueued = 0;
     if (FindFirstIncompleteDraw(pContext, workerId, curDrawBE, drawEnqueued) == false)
@@ -755,12 +778,14 @@
         return;
     }
 
-    uint32_t lastRetiredDraw = pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
+    uint32_t lastRetiredDraw =
+        pContext->dcRing[curDrawBE % pContext->MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
     for (uint64_t i = curDrawBE; IDComparesLess(i, drawEnqueued); ++i)
     {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
-        if (pDC->isCompute == false) return;
+        DRAW_CONTEXT* pDC = &pContext->dcRing[i % pContext->MAX_DRAWS_IN_FLIGHT];
+        if (pDC->isCompute == false)
+            return;
 
         // check dependencies
         if (CheckDependency(pContext, pDC, lastRetiredDraw))
@@ -774,9 +799,9 @@
         // Is there any work remaining?
         if (queue.getNumQueued() > 0)
         {
-            void* pSpillFillBuffer = nullptr;
-            void* pScratchSpace = nullptr;
-            uint32_t threadGroupId = 0;
+            void*    pSpillFillBuffer = nullptr;
+            void*    pScratchSpace    = nullptr;
+            uint32_t threadGroupId    = 0;
             while (queue.getWork(threadGroupId))
             {
                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
@@ -789,7 +814,7 @@
     }
 }
 
-void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
+void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId)
 {
     if (nullptr == pContext)
     {
@@ -800,25 +825,26 @@
     {
         if (pContext->threadPool.numReservedThreads)
         {
-            const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
+            const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[0];
             // Just bind to the process group used for API thread 0
             bindThread(pContext, 0, threadData.procGroupId, true);
         }
         return;
     }
 
-    const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
+    const THREAD_DATA& threadData = pContext->threadPool.pApiThreadData[apiThreadId];
 
-    bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
+    bindThread(
+        pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
 }
 
-template<bool IsFEThread, bool IsBEThread>
+template <bool IsFEThread, bool IsBEThread>
 DWORD workerThreadMain(LPVOID pData)
 {
-    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
-    SWR_CONTEXT *pContext = pThreadData->pContext;
-    uint32_t threadId = pThreadData->threadId;
-    uint32_t workerId = pThreadData->workerId;
+    THREAD_DATA* pThreadData = (THREAD_DATA*)pData;
+    SWR_CONTEXT* pContext    = pThreadData->pContext;
+    uint32_t     threadId    = pThreadData->threadId;
+    uint32_t     workerId    = pThreadData->workerId;
 
     bindThread(pContext, threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
 
@@ -831,7 +857,10 @@
                   // linux pthread name limited to 16 chars (including \0)
                   "w%03d-n%d-c%03d-t%d",
 #endif
-            workerId, pThreadData->numaId, pThreadData->coreId, pThreadData->htId);
+                  workerId,
+                  pThreadData->numaId,
+                  pThreadData->coreId,
+                  pThreadData->htId);
         SetCurrentThreadName(threadName);
     }
 
@@ -850,7 +879,7 @@
 
     // each worker has the ability to work on any of the queued draws as long as certain
     // conditions are met. the data associated
-    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he 
+    // with a draw is guaranteed to be active as long as a worker hasn't signaled that he
     // has moved on to the next draw when he determines there is no more work to do. The api
     // thread will not increment the head of the dc ring until all workers have moved past the
     // current head.
@@ -905,7 +934,8 @@
         if (IsBEThread)
         {
             RDTSC_BEGIN(WorkerWorkOnFifoBE, 0);
-            bShutdown |= WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            bShutdown |=
+                WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
             RDTSC_END(WorkerWorkOnFifoBE, 0);
 
             WorkOnCompute(pContext, workerId, curDrawBE);
@@ -924,7 +954,8 @@
 
     return 0;
 }
-template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
+template <>
+DWORD workerThreadMain<false, false>(LPVOID) = delete;
 
 template <bool IsFEThread, bool IsBEThread>
 DWORD workerThreadInit(LPVOID pData)
@@ -937,7 +968,7 @@
     }
 
 #if defined(_WIN32)
-    __except(EXCEPTION_CONTINUE_SEARCH)
+    __except (EXCEPTION_CONTINUE_SEARCH)
     {
     }
 
@@ -945,14 +976,16 @@
 
     return 1;
 }
-template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
+template <>
+DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
 static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
 {
     // Initialize DRAW_CONTEXT's per-thread stats
     for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
     {
-        pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
+        pContext->dcRing[dc].dynState.pStats =
+            (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
         memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
     }
 }
@@ -964,15 +997,15 @@
 void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 {
     CPUNumaNodes nodes;
-    uint32_t numThreadsPerProcGroup = 0;
+    uint32_t     numThreadsPerProcGroup = 0;
     CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
 
     // Assumption, for asymmetric topologies, multi-threaded cores will appear
     // in the list before single-threaded cores.  This appears to be true for
     // Windows when the total HW threads is limited to 64.
-    uint32_t numHWNodes         = (uint32_t)nodes.size();
-    uint32_t numHWCoresPerNode  = (uint32_t)nodes[0].cores.size();
-    uint32_t numHWHyperThreads  = (uint32_t)nodes[0].cores[0].threadIds.size();
+    uint32_t numHWNodes        = (uint32_t)nodes.size();
+    uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
+    uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
 
 #if defined(_WIN32) && !defined(_WIN64)
     if (!pContext->threadInfo.MAX_WORKER_THREADS)
@@ -996,9 +1029,9 @@
         }
     }
 
-    uint32_t numNodes           = numHWNodes;
-    uint32_t numCoresPerNode    = numHWCoresPerNode;
-    uint32_t numHyperThreads    = numHWHyperThreads;
+    uint32_t numNodes        = numHWNodes;
+    uint32_t numCoresPerNode = numHWCoresPerNode;
+    uint32_t numHyperThreads = numHWHyperThreads;
 
     // Calc used threads per-core
     if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
@@ -1007,11 +1040,10 @@
     }
     else
     {
-        SWR_ASSERT(
-            false,
-            "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
-            pContext->threadInfo.BASE_THREAD,
-            numHyperThreads);
+        SWR_ASSERT(false,
+                   "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
+                   pContext->threadInfo.BASE_THREAD,
+                   numHyperThreads);
         pContext->threadInfo.BASE_THREAD = 0;
     }
 
@@ -1041,11 +1073,10 @@
     }
     else
     {
-        SWR_ASSERT(
-            false,
-            "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
-            pContext->threadInfo.BASE_CORE,
-            numCoresPerNode);
+        SWR_ASSERT(false,
+                   "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
+                   pContext->threadInfo.BASE_CORE,
+                   numCoresPerNode);
         pContext->threadInfo.BASE_CORE = 0;
     }
 
@@ -1079,25 +1110,25 @@
     SWR_REL_ASSERT(numThreads <= numHWThreads);
 
     uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
-    uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
-    uint32_t numRemovedThreads = 0;
+    uint32_t& numAPIThreadsPerCore  = pContext->apiThreadInfo.numAPIThreadsPerCore;
+    uint32_t  numRemovedThreads     = 0;
 
     if (pContext->threadInfo.SINGLE_THREADED)
     {
-        numAPIReservedThreads = 0;
-        numThreads = 1;
+        numAPIReservedThreads      = 0;
+        numThreads                 = 1;
         pContext->NumWorkerThreads = 1;
-        pContext->NumFEThreads = 1;
-        pContext->NumBEThreads = 1;
-        pPool->numThreads = 0;
+        pContext->NumFEThreads     = 1;
+        pContext->NumBEThreads     = 1;
+        pPool->numThreads          = 0;
     }
     else if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
         numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
         pContext->threadInfo.BASE_NUMA_NODE = 0;
-        pContext->threadInfo.BASE_CORE = 0;
-        pContext->threadInfo.BASE_THREAD = 0;
-        numAPIReservedThreads = 0;
+        pContext->threadInfo.BASE_CORE      = 0;
+        pContext->threadInfo.BASE_THREAD    = 0;
+        numAPIReservedThreads               = 0;
     }
     else
     {
@@ -1118,7 +1149,8 @@
             if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
             {
                 // Adjust removed threads to make logic below work
-                numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
+                numRemovedThreads =
+                    std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
             }
 
             numThreads -= numRemovedThreads;
@@ -1130,7 +1162,7 @@
     if (pContext->threadInfo.SINGLE_THREADED)
     {
         numAPIReservedThreads = 0;
-        numThreads = 1;
+        numThreads            = 1;
     }
 
     if (numAPIReservedThreads)
@@ -1148,7 +1180,7 @@
     }
     pPool->numReservedThreads = numAPIReservedThreads;
 
-    pPool->numThreads = numThreads;
+    pPool->numThreads          = numThreads;
     pContext->NumWorkerThreads = pPool->numThreads;
 
     pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
@@ -1160,7 +1192,8 @@
     pPool->pWorkerPrivateDataArray = nullptr;
     if (pContext->workerPrivateState.perWorkerPrivateStateSize)
     {
-        size_t perWorkerSize = AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
+        size_t perWorkerSize =
+            AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
         size_t totalSize = perWorkerSize * pPool->numThreads;
         if (totalSize)
         {
@@ -1190,19 +1223,19 @@
 
     if (pContext->threadInfo.MAX_WORKER_THREADS)
     {
-        bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
+        bool     bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
         uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup;
         // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads
         // But Windows will still require binding to specific process groups
         for (uint32_t workerId = 0; workerId < numThreads; ++workerId)
         {
-            pPool->pThreadData[workerId].workerId = workerId;
-            pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
-            pPool->pThreadData[workerId].threadId = 0;
-            pPool->pThreadData[workerId].numaId = 0;
-            pPool->pThreadData[workerId].coreId = 0;
-            pPool->pThreadData[workerId].htId = 0;
-            pPool->pThreadData[workerId].pContext = pContext;
+            pPool->pThreadData[workerId].workerId           = workerId;
+            pPool->pThreadData[workerId].procGroupId        = workerId % numProcGroups;
+            pPool->pThreadData[workerId].threadId           = 0;
+            pPool->pThreadData[workerId].numaId             = 0;
+            pPool->pThreadData[workerId].coreId             = 0;
+            pPool->pThreadData[workerId].htId               = 0;
+            pPool->pThreadData[workerId].pContext           = pContext;
             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
 
             pContext->NumBEThreads++;
@@ -1227,7 +1260,7 @@
             pPool->numaMask = 0;
         }
 
-        uint32_t workerId = 0;
+        uint32_t workerId           = 0;
         uint32_t numReservedThreads = numAPIReservedThreads;
         for (uint32_t n = 0; n < numNodes; ++n)
         {
@@ -1235,7 +1268,7 @@
             {
                 break;
             }
-            auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
+            auto&    node     = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
             uint32_t numCores = numCoresPerNode;
             for (uint32_t c = 0; c < numCores; ++c)
             {
@@ -1257,26 +1290,32 @@
                         --numRemovedThreads;
                         SWR_REL_ASSERT(numReservedThreads);
                         --numReservedThreads;
-                        pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+                        pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                         pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
-                        pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
-                        pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                        pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
-                        pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
-                        pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+                        pPool->pApiThreadData[numReservedThreads].threadId    = core.threadIds[t];
+                        pPool->pApiThreadData[numReservedThreads].numaId =
+                            useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                        pPool->pApiThreadData[numReservedThreads].coreId =
+                            c + pContext->threadInfo.BASE_CORE;
+                        pPool->pApiThreadData[numReservedThreads].htId =
+                            t + pContext->threadInfo.BASE_THREAD;
+                        pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                         pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
 
-
                         if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
                         {
                             --numReservedThreads;
-                            pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+                            pPool->pApiThreadData[numReservedThreads].workerId    = 0xFFFFFFFFU;
                             pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
-                            pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
-                            pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                            pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
-                            pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
-                            pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+                            pPool->pApiThreadData[numReservedThreads].threadId =
+                                core.threadIds[t + 1];
+                            pPool->pApiThreadData[numReservedThreads].numaId =
+                                useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                            pPool->pApiThreadData[numReservedThreads].coreId =
+                                c + pContext->threadInfo.BASE_CORE;
+                            pPool->pApiThreadData[numReservedThreads].htId =
+                                t + pContext->threadInfo.BASE_THREAD;
+                            pPool->pApiThreadData[numReservedThreads].pContext           = pContext;
                             pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
                         }
 
@@ -1285,12 +1324,14 @@
 
                     SWR_ASSERT(workerId < numThreads);
 
-                    pPool->pThreadData[workerId].workerId = workerId;
+                    pPool->pThreadData[workerId].workerId    = workerId;
                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
-                    pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
-                    pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
-                    pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
-                    pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
+                    pPool->pThreadData[workerId].threadId =
+                        core.threadIds[t + pContext->threadInfo.BASE_THREAD];
+                    pPool->pThreadData[workerId].numaId =
+                        useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+                    pPool->pThreadData[workerId].coreId   = c + pContext->threadInfo.BASE_CORE;
+                    pPool->pThreadData[workerId].htId     = t + pContext->threadInfo.BASE_THREAD;
                     pPool->pThreadData[workerId].pContext = pContext;
                     pPool->pThreadData[workerId].forceBindProcGroup = false;
 
@@ -1318,7 +1359,8 @@
 
     for (uint32_t workerId = 0; workerId < pContext->NumWorkerThreads; ++workerId)
     {
-        pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+        pPool->pThreads[workerId] =
+            new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
     }
 }
 
@@ -1326,7 +1368,7 @@
 /// @brief Destroys thread pool.
 /// @param pContext - pointer to context
 /// @param pPool - pointer to thread pool object.
-void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
+void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
 {
     // Wait for all threads to finish
     SwrWaitForIdle(pContext);
@@ -1339,12 +1381,13 @@
             // Detach from thread.  Cannot join() due to possibility (in Windows) of code
             // in some DLLMain(THREAD_DETATCH case) blocking the thread until after this returns.
             pPool->pThreads[t]->detach();
-            delete(pPool->pThreads[t]);
+            delete (pPool->pThreads[t]);
         }
 
         if (pContext->workerPrivateState.pfnFinishWorkerData)
         {
-            pContext->workerPrivateState.pfnFinishWorkerData(pPool->pThreadData[t].pWorkerPrivateData, t);
+            pContext->workerPrivateState.pfnFinishWorkerData(
+                pPool->pThreadData[t].pWorkerPrivateData, t);
         }
     }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index cb918dd..d0f4b30 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file threads.h
-*
-* @brief Definitions for SWR threading model.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file threads.h
+ *
+ * @brief Definitions for SWR threading model.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "knobs.h"
@@ -39,39 +39,43 @@
 
 struct THREAD_DATA
 {
-    void* pWorkerPrivateData;// Pointer to per-worker private data
-    uint32_t procGroupId;   // Will always be 0 for non-Windows OS
-    uint32_t threadId;      // within the procGroup for Windows
-    uint32_t numaId;        // NUMA node id
-    uint32_t coreId;        // Core id
-    uint32_t htId;          // Hyperthread id
-    uint32_t workerId;
-    SWR_CONTEXT *pContext;
-    bool forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
+    void*        pWorkerPrivateData; // Pointer to per-worker private data
+    uint32_t     procGroupId;        // Will always be 0 for non-Windows OS
+    uint32_t     threadId;           // within the procGroup for Windows
+    uint32_t     numaId;             // NUMA node id
+    uint32_t     coreId;             // Core id
+    uint32_t     htId;               // Hyperthread id
+    uint32_t     workerId;
+    SWR_CONTEXT* pContext;
+    bool         forceBindProcGroup; // Only useful when MAX_WORKER_THREADS is set.
 };
 
-
 struct THREAD_POOL
 {
-    THREAD_PTR* pThreads;
-    uint32_t numThreads;
-    uint32_t numaMask;
-    THREAD_DATA *pThreadData;
-    void* pWorkerPrivateDataArray; // All memory for worker private data
-    uint32_t numReservedThreads; // Number of threads reserved for API use
-    THREAD_DATA *pApiThreadData;
+    THREAD_PTR*  pThreads;
+    uint32_t     numThreads;
+    uint32_t     numaMask;
+    THREAD_DATA* pThreadData;
+    void*        pWorkerPrivateDataArray; // All memory for worker private data
+    uint32_t     numReservedThreads;      // Number of threads reserved for API use
+    THREAD_DATA* pApiThreadData;
 };
 
-typedef std::unordered_set<uint32_t> TileSet;
+struct TileSet;
 
-void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
-void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+void DestroyThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE);
-bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
-void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
+void    WorkOnFifoFE(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawFE);
+bool    WorkOnFifoBE(SWR_CONTEXT* pContext,
+                     uint32_t     workerId,
+                     uint32_t&    curDrawBE,
+                     TileSet&     usedTiles,
+                     uint32_t     numaNode,
+                     uint32_t     numaMask);
+void    WorkOnCompute(SWR_CONTEXT* pContext, uint32_t workerId, uint32_t& curDrawBE);
 int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
 
-void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);
+void BindApiThread(SWR_CONTEXT* pContext, uint32_t apiThreadId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 28fa787..87d5373 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -1,31 +1,31 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file tilemgr.cpp
-*
-* @brief Implementation for Macro Tile Manager which provides the facilities
-*        for threads to work on an macro tile.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file tilemgr.cpp
+ *
+ * @brief Implementation for Macro Tile Manager which provides the facilities
+ *        for threads to work on a macro tile.
+ *
+ ******************************************************************************/
 #include <unordered_map>
 
 #include "fifo.hpp"
@@ -33,44 +33,49 @@
 #include "core/multisample.h"
 #include "rdtsc_core.h"
 
-#define TILE_ID(x,y) ((x << 16 | y))
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) {}
 
-MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
-{
-}
-
-void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK* pWork)
 {
     // Should not enqueue more then what we have backing for in the hot tile manager.
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+    if ((x & ~(KNOB_NUM_HOT_TILES_X - 1)) | (y & ~(KNOB_NUM_HOT_TILES_Y - 1)))
     {
         return;
     }
 
-    uint32_t id = TILE_ID(x, y);
+    uint32_t id = getTileId(x, y);
 
-    MacroTileQueue &tile = mTiles[id];
-    tile.mWorkItemsFE++;
-    tile.mId = id;
-
-    if (tile.mWorkItemsFE == 1)
+    if (id >= mTiles.size())
     {
-        tile.clear(mArena);
-        mDirtyTiles.push_back(&tile);
+        mTiles.resize((16 + id) * 2);
+    }
+
+    MacroTileQueue* pTile = mTiles[id];
+    if (!pTile)
+    {
+        pTile = mTiles[id] = new MacroTileQueue();
+    }
+    pTile->mWorkItemsFE++;
+    pTile->mId = id;
+
+    if (pTile->mWorkItemsFE == 1)
+    {
+        pTile->clear(mArena);
+        mDirtyTiles.push_back(pTile);
     }
 
     mWorkItemsProduced++;
-    tile.enqueue_try_nosync(mArena, pWork);
+    pTile->enqueue_try_nosync(mArena, pWork);
 }
 
 void MacroTileMgr::markTileComplete(uint32_t id)
 {
-    SWR_ASSERT(mTiles.find(id) != mTiles.end());
-    MacroTileQueue &tile = mTiles[id];
-    uint32_t numTiles = tile.mWorkItemsFE;
+    SWR_ASSERT(mTiles.size() > id);
+    MacroTileQueue& tile     = *mTiles[id];
+    uint32_t        numTiles = tile.mWorkItemsFE;
     InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
 
     _ReadWriteBarrier();
@@ -83,8 +88,14 @@
     tile.mWorkItemsBE = 0;
 }
 
-HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerPrivateData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
-    uint32_t renderTargetArrayIndex)
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT*                pContext,
+                                DRAW_CONTEXT*               pDC,
+                                HANDLE                      hWorkerPrivateData,
+                                uint32_t                    macroID,
+                                SWR_RENDERTARGET_ATTACHMENT attachment,
+                                bool                        create,
+                                uint32_t                    numSamples,
+                                uint32_t                    renderTargetArrayIndex)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -92,17 +103,18 @@
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    HotTileSet &tile = mHotTiles[x][y];
-    HOTTILE& hotTile = tile.Attachment[attachment];
+    HotTileSet& tile    = mHotTiles[x][y];
+    HOTTILE&    hotTile = tile.Attachment[attachment];
     if (hotTile.pBuffer == NULL)
     {
         if (create)
         {
-            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t size     = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
-            hotTile.state = HOTTILE_INVALID;
-            hotTile.numSamples = numSamples;
+            hotTile.pBuffer =
+                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
+            hotTile.state                  = HOTTILE_INVALID;
+            hotTile.numSamples             = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
         }
         else
@@ -115,22 +127,22 @@
         // free the old tile and create a new one with enough space to hold all samples
         if (numSamples > hotTile.numSamples)
         {
-            // tile should be either uninitialized or resolved if we're deleting and switching to a 
+            // tile should be either uninitialized or resolved if we're deleting and switching to a
             // new sample count
-            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
-                (hotTile.state == HOTTILE_RESOLVED) ||
-                (hotTile.state == HOTTILE_CLEAR));
+            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || (hotTile.state == HOTTILE_RESOLVED) ||
+                       (hotTile.state == HOTTILE_CLEAR));
             FreeHotTileMem(hotTile.pBuffer);
 
-            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t size     = numSamples * mHotTileSize[attachment];
             uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
-            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
-            hotTile.state = HOTTILE_INVALID;
+            hotTile.pBuffer =
+                (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
+            hotTile.state      = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }
 
-        // if requested render target array index isn't currently loaded, need to store out the current hottile 
-        // and load the requested array slice
+        // if requested render target array index isn't currently loaded, need to store out the
+        // current hottile and load the requested array slice
         if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
         {
             SWR_FORMAT format;
@@ -143,10 +155,19 @@
             case SWR_ATTACHMENT_COLOR4:
             case SWR_ATTACHMENT_COLOR5:
             case SWR_ATTACHMENT_COLOR6:
-            case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-            case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
-            case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
-            default: SWR_INVALID("Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_COLOR7:
+                format = KNOB_COLOR_HOT_TILE_FORMAT;
+                break;
+            case SWR_ATTACHMENT_DEPTH:
+                format = KNOB_DEPTH_HOT_TILE_FORMAT;
+                break;
+            case SWR_ATTACHMENT_STENCIL:
+                format = KNOB_STENCIL_HOT_TILE_FORMAT;
+                break;
+            default:
+                SWR_INVALID("Unknown attachment: %d", attachment);
+                format = KNOB_COLOR_HOT_TILE_FORMAT;
+                break;
             }
 
             if (hotTile.state == HOTTILE_CLEAR)
@@ -163,23 +184,38 @@
 
             if (hotTile.state == HOTTILE_DIRTY)
             {
-                pContext->pfnStoreTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
-                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+                pContext->pfnStoreTile(GetPrivateState(pDC),
+                                       hWorkerPrivateData,
+                                       format,
+                                       attachment,
+                                       x * KNOB_MACROTILE_X_DIM,
+                                       y * KNOB_MACROTILE_Y_DIM,
+                                       hotTile.renderTargetArrayIndex,
+                                       hotTile.pBuffer);
             }
 
-            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, format, attachment,
-                x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  format,
+                                  attachment,
+                                  x * KNOB_MACROTILE_X_DIM,
+                                  y * KNOB_MACROTILE_Y_DIM,
+                                  renderTargetArrayIndex,
+                                  hotTile.pBuffer);
 
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-            hotTile.state = HOTTILE_DIRTY;
+            hotTile.state                  = HOTTILE_DIRTY;
         }
     }
     return &tile.Attachment[attachment];
 }
 
-HOTTILE* HotTileMgr::GetHotTileNoLoad(
-    SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
-    SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
+HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT*                pContext,
+                                      DRAW_CONTEXT*               pDC,
+                                      uint32_t                    macroID,
+                                      SWR_RENDERTARGET_ATTACHMENT attachment,
+                                      bool                        create,
+                                      uint32_t                    numSamples)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -187,16 +223,16 @@
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
-    HotTileSet &tile = mHotTiles[x][y];
-    HOTTILE& hotTile = tile.Attachment[attachment];
+    HotTileSet& tile    = mHotTiles[x][y];
+    HOTTILE&    hotTile = tile.Attachment[attachment];
     if (hotTile.pBuffer == NULL)
     {
         if (create)
         {
-            uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)AlignedMalloc(size, 64);
-            hotTile.state = HOTTILE_INVALID;
-            hotTile.numSamples = numSamples;
+            uint32_t size                  = numSamples * mHotTileSize[attachment];
+            hotTile.pBuffer                = (uint8_t*)AlignedMalloc(size, 64);
+            hotTile.state                  = HOTTILE_INVALID;
+            hotTile.numSamples             = numSamples;
             hotTile.renderTargetArrayIndex = 0;
         }
         else
@@ -209,23 +245,25 @@
 }
 
 #if USE_8x2_TILE_BACKEND
-void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearColorHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float *)(pHotTile->clearData);
-    simd16scalar valR = _simd16_broadcast_ss(&pClearData[0]);
-    simd16scalar valG = _simd16_broadcast_ss(&pClearData[1]);
-    simd16scalar valB = _simd16_broadcast_ss(&pClearData[2]);
-    simd16scalar valA = _simd16_broadcast_ss(&pClearData[3]);
+    float*       pClearData = (float*)(pHotTile->clearData);
+    simd16scalar valR       = _simd16_broadcast_ss(&pClearData[0]);
+    simd16scalar valG       = _simd16_broadcast_ss(&pClearData[1]);
+    simd16scalar valB       = _simd16_broadcast_ss(&pClearData[2]);
+    simd16scalar valA       = _simd16_broadcast_ss(&pClearData[3]);
 
-    float *pfBuf = (float *)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
             {
                 _simd16_store_ps(pfBuf, valR);
                 pfBuf += KNOB_SIMD16_WIDTH;
@@ -243,20 +281,22 @@
     }
 }
 
-void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearDepthHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from the depth value in clearData[0].
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float *)(pHotTile->clearData);
-    simd16scalar valZ = _simd16_broadcast_ss(&pClearData[0]);
+    float*       pClearData = (float*)(pHotTile->clearData);
+    simd16scalar valZ       = _simd16_broadcast_ss(&pClearData[0]);
 
-    float *pfBuf = (float *)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM)
             {
                 _simd16_store_ps(pfBuf, valZ);
                 pfBuf += KNOB_SIMD16_WIDTH;
@@ -269,18 +309,19 @@
 {
     // convert from F32 to U8.
     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
+    // broadcast 32x into __m256i...
     simd16scalari valS = _simd16_set1_epi8(clearVal);
 
-    simd16scalari *pBuf = (simd16scalari *)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
+    simd16scalari* pBuf       = (simd16scalari*)pHotTile->pBuffer;
+    uint32_t       numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD16_TILE_X_DIM * SIMD16_TILE_Y_DIM * 4)
             {
                 _simd16_store_si(pBuf, valS);
                 pBuf += 1;
@@ -290,23 +331,26 @@
 }
 
 #else
-void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearColorHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
-    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
-    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
-    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+    float*     pClearData = (float*)(pHotTile->clearData);
+    simdscalar valR       = _simd_broadcast_ss(&pClearData[0]);
+    simdscalar valG       = _simd_broadcast_ss(&pClearData[1]);
+    simdscalar valB       = _simd_broadcast_ss(&pClearData[2]);
+    simdscalar valA       = _simd_broadcast_ss(&pClearData[3]);
 
-    float *pfBuf = (float*)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si +=
+                 SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
             {
                 _simd_store_ps(pfBuf, valR);
                 pfBuf += KNOB_SIMD_WIDTH;
@@ -321,20 +365,22 @@
     }
 }
 
-void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+void HotTileMgr::ClearDepthHotTile(
+    const HOTTILE* pHotTile) // clear a macro tile from the depth value in clearData[0].
 {
     // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
+    float*     pClearData = (float*)(pHotTile->clearData);
+    simdscalar valZ       = _simd_broadcast_ss(&pClearData[0]);
 
-    float *pfBuf = (float*)pHotTile->pBuffer;
+    float*   pfBuf      = (float*)pHotTile->pBuffer;
     uint32_t numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
             {
                 _simd_store_ps(pfBuf, valZ);
                 pfBuf += KNOB_SIMD_WIDTH;
@@ -347,18 +393,19 @@
 {
     // convert from F32 to U8.
     uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
+    // broadcast 32x into __m256i...
     simdscalari valS = _simd_set1_epi8(clearVal);
 
-    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
+    simdscalari* pBuf       = (simdscalari*)pHotTile->pBuffer;
+    uint32_t     numSamples = pHotTile->numSamples;
 
     for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
     {
         for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
         {
             // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples);
+                 si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
             {
                 _simd_store_si(pBuf, valS);
                 pBuf += 1;
@@ -376,9 +423,12 @@
 /// to avoid unnecessary setup every triangle
 /// @todo support deferred clear
 /// @param pCreateInfo - pointer to creation info.
-void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID)
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT*  pContext,
+                                    DRAW_CONTEXT* pDC,
+                                    uint32_t      workerId,
+                                    uint32_t      macroID)
 {
-    const API_STATE& state = GetApiState(pDC);
+    const API_STATE& state    = GetApiState(pDC);
     HANDLE hWorkerPrivateData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;
 
     uint32_t x, y;
@@ -389,17 +439,31 @@
     uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
 
     // check RT if enabled
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    unsigned long rtSlot                 = 0;
+    uint32_t      colorHottileEnableMask = state.colorHottileEnable;
     while (_BitScanForward(&rtSlot, colorHottileEnableMask))
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+        HOTTILE* pHotTile =
+            GetHotTile(pContext,
+                       pDC,
+                       hWorkerPrivateData,
+                       macroID,
+                       (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
+                       true,
+                       numSamples);
 
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_COLOR_HOT_TILE_FORMAT,
+                                  (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot),
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
@@ -417,12 +481,20 @@
     // check depth if enabled
     if (state.depthHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(
+            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_DEPTH_HOT_TILE_FORMAT,
+                                  SWR_ATTACHMENT_DEPTH,
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
@@ -439,12 +511,20 @@
     // check stencil if enabled
     if (state.stencilHottileEnable)
     {
-        HOTTILE* pHotTile = GetHotTile(pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        HOTTILE* pHotTile = GetHotTile(
+            pContext, pDC, hWorkerPrivateData, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
         if (pHotTile->state == HOTTILE_INVALID)
         {
             RDTSC_BEGIN(BELoadTiles, pDC->drawId);
             // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), hWorkerPrivateData, KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pContext->pfnLoadTile(GetPrivateState(pDC),
+                                  hWorkerPrivateData,
+                                  KNOB_STENCIL_HOT_TILE_FORMAT,
+                                  SWR_ATTACHMENT_STENCIL,
+                                  x,
+                                  y,
+                                  pHotTile->renderTargetArrayIndex,
+                                  pHotTile->pBuffer);
             pHotTile->state = HOTTILE_DIRTY;
             RDTSC_END(BELoadTiles, 0);
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 2831010..7173b02 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -1,36 +1,37 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file tilemgr.h
-*
-* @brief Definitions for Macro Tile Manager which provides the facilities
-*        for threads to work on an macro tile.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file tilemgr.h
+ *
+ * @brief Definitions for Macro Tile Manager which provides the facilities
+ *        for threads to work on a macro tile.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <set>
 #include <unordered_map>
 #include "common/formats.h"
+#include "common/intrin.h"
 #include "fifo.hpp"
 #include "context.h"
 #include "format_traits.h"
@@ -40,22 +41,16 @@
 //////////////////////////////////////////////////////////////////////////
 struct MacroTileQueue
 {
-    MacroTileQueue() { }
-    ~MacroTileQueue() { }
+    MacroTileQueue() {}
+    ~MacroTileQueue() { destroy(); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Returns number of work items queued for this tile.
-    uint32_t getNumQueued()
-    {
-        return mFifo.getNumQueued();
-    }
+    uint32_t getNumQueued() { return mFifo.getNumQueued(); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Attempt to lock the work fifo. If already locked then return false.
-    bool tryLock()
-    {
-        return mFifo.tryLock();
-    }
+    bool tryLock() { return mFifo.tryLock(); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Clear fifo and unlock it.
@@ -67,10 +62,7 @@
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Peek at work sitting at the front of the fifo.
-    BE_WORK* peek()
-    {
-        return mFifo.peek();
-    }
+    BE_WORK* peek() { return mFifo.peek(); }
 
     template <typename ArenaT>
     bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
@@ -80,22 +72,16 @@
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Move to next work item
-    void dequeue()
-    {
-        mFifo.dequeue_noinc();
-    }
+    void dequeue() { mFifo.dequeue_noinc(); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Destroy fifo
-    void destroy()
-    {
-        mFifo.destroy();
-    }
+    void destroy() { mFifo.destroy(); }
 
     ///@todo This will all be private.
     uint32_t mWorkItemsFE = 0;
     uint32_t mWorkItemsBE = 0;
-    uint32_t mId = 0;
+    uint32_t mId          = 0;
 
 private:
     QUEUE<BE_WORK> mFifo;
@@ -110,9 +96,9 @@
     MacroTileMgr(CachingArena& arena);
     ~MacroTileMgr()
     {
-        for (auto &tile : mTiles)
+        for (auto* pTile : mTiles)
         {
-            tile.second.destroy();
+            delete pTile;
         }
     }
 
@@ -125,33 +111,41 @@
     }
 
     INLINE std::vector<MacroTileQueue*>& getDirtyTiles() { return mDirtyTiles; }
-    void markTileComplete(uint32_t id);
+    void                                 markTileComplete(uint32_t id);
 
-    INLINE bool isWorkComplete()
+    INLINE bool isWorkComplete() { return mWorkItemsProduced == mWorkItemsConsumed; }
+
+    void enqueue(uint32_t x, uint32_t y, BE_WORK* pWork);
+
+    static INLINE void getTileIndices(uint32_t tileID, uint32_t& x, uint32_t& y)
     {
-        return mWorkItemsProduced == mWorkItemsConsumed;
+        // Morton / Z order of tiles
+        x = pext_u32(tileID, 0x55555555);
+        y = pext_u32(tileID, 0xAAAAAAAA);
     }
 
-    void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork);
-
-    static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
+    static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
     {
-        y = tileID & 0xffff;
-        x = (tileID >> 16) & 0xffff;
+        // Morton / Z order of tiles
+        return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA);
     }
 
 private:
-    CachingArena& mArena;
-    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
+    CachingArena&                mArena;
+    std::vector<MacroTileQueue*> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.
     std::vector<MacroTileQueue*> mDirtyTiles;
 
-    OSALIGNLINE(long) mWorkItemsProduced { 0 };
-    OSALIGNLINE(volatile long) mWorkItemsConsumed { 0 };
+    OSALIGNLINE(long) mWorkItemsProduced{0};
+    OSALIGNLINE(volatile long) mWorkItemsConsumed{0};
 };
 
-typedef void(*PFN_DISPATCH)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace);
+typedef void (*PFN_DISPATCH)(DRAW_CONTEXT* pDC,
+                             uint32_t      workerId,
+                             uint32_t      threadGroupId,
+                             void*&        pSpillFillBuffer,
+                             void*&        pScratchSpace);
 
 //////////////////////////////////////////////////////////////////////////
 /// DispatchQueue - work queue for dispatch
@@ -167,23 +161,20 @@
     {
         // The available and outstanding counts start with total tasks.
         // At the start there are N tasks available and outstanding.
-        // When both the available and outstanding counts have reached 0 then all work has completed.
-        // When a worker starts on a threadgroup then it decrements the available count.
+        // When both the available and outstanding counts have reached 0 then all work has
+        // completed. When a worker starts on a threadgroup then it decrements the available count.
         // When a worker completes a threadgroup then it decrements the outstanding count.
 
-        mTasksAvailable = totalTasks;
+        mTasksAvailable   = totalTasks;
         mTasksOutstanding = totalTasks;
 
-        mpTaskData = pTaskData;
+        mpTaskData   = pTaskData;
         mPfnDispatch = pfnDispatch;
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Returns number of tasks available for this dispatch.
-    uint32_t getNumQueued()
-    {
-        return (mTasksAvailable > 0) ? mTasksAvailable : 0;
-    }
+    uint32_t getNumQueued() { return (mTasksAvailable > 0) ? mTasksAvailable : 0; }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Atomically decrement the work available count. If the result
@@ -216,50 +207,49 @@
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Work is complete once both the available/outstanding counts have reached 0.
-    bool isWorkComplete()
-    {
-        return ((mTasksAvailable <= 0) &&
-                (mTasksOutstanding <= 0));
-    }
+    bool isWorkComplete() { return ((mTasksAvailable <= 0) && (mTasksOutstanding <= 0)); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Return pointer to task data.
-    const void* GetTasksData()
-    {
-        return mpTaskData;
-    }
+    const void* GetTasksData() { return mpTaskData; }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Dispatches a unit of work
-    void dispatch(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer, void*& pScratchSpace)
+    void dispatch(DRAW_CONTEXT* pDC,
+                  uint32_t      workerId,
+                  uint32_t      threadGroupId,
+                  void*&        pSpillFillBuffer,
+                  void*&        pScratchSpace)
     {
         SWR_ASSERT(mPfnDispatch != nullptr);
         mPfnDispatch(pDC, workerId, threadGroupId, pSpillFillBuffer, pScratchSpace);
     }
 
-    void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
-    PFN_DISPATCH mPfnDispatch{ nullptr };      // Function to call per dispatch
+    void* mpTaskData{nullptr}; // The API thread will set this up and the callback task function
+                               // will interpret this.
+    PFN_DISPATCH mPfnDispatch{nullptr}; // Function to call per dispatch
 
-    OSALIGNLINE(volatile long) mTasksAvailable{ 0 };
-    OSALIGNLINE(volatile long) mTasksOutstanding{ 0 };
+    OSALIGNLINE(volatile long) mTasksAvailable{0};
+    OSALIGNLINE(volatile long) mTasksOutstanding{0};
 };
 
-
 enum HOTTILE_STATE
 {
-    HOTTILE_INVALID,        // tile is in unitialized state and should be loaded with surface contents before rendering
-    HOTTILE_CLEAR,          // tile should be cleared
-    HOTTILE_DIRTY,          // tile has been rendered to
-    HOTTILE_RESOLVED,       // tile has been stored to memory
+    HOTTILE_INVALID,  // tile is in uninitialized state and should be loaded with surface contents
+                      // before rendering
+    HOTTILE_CLEAR,    // tile should be cleared
+    HOTTILE_DIRTY,    // tile has been rendered to
+    HOTTILE_RESOLVED, // tile has been stored to memory
 };
 
 struct HOTTILE
 {
-    uint8_t *pBuffer;
+    uint8_t*      pBuffer;
     HOTTILE_STATE state;
-    DWORD clearData[4];                 // May need to change based on pfnClearTile implementation.  Reorder for alignment?
+    DWORD clearData[4]; // May need to change based on pfnClearTile implementation.  Reorder for
+                        // alignment?
     uint32_t numSamples;
-    uint32_t renderTargetArrayIndex;    // current render target array index loaded
+    uint32_t renderTargetArrayIndex; // current render target array index loaded
 };
 
 union HotTileSet
@@ -283,10 +273,13 @@
         // cache hottile size
         for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
         {
-            mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
+            mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
+                              FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8;
         }
-        mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
-        mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
+        mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
+                                             FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8;
+        mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM *
+                                               FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8;
     }
 
     ~HotTileMgr()
@@ -303,12 +296,26 @@
         }
     }
 
-    void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroID);
+    void InitializeHotTiles(SWR_CONTEXT*  pContext,
+                            DRAW_CONTEXT* pDC,
+                            uint32_t      workerId,
+                            uint32_t      macroID);
 
-    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, HANDLE hWorkerData, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
-        uint32_t renderTargetArrayIndex = 0);
+    HOTTILE* GetHotTile(SWR_CONTEXT*                pContext,
+                        DRAW_CONTEXT*               pDC,
+                        HANDLE                      hWorkerData,
+                        uint32_t                    macroID,
+                        SWR_RENDERTARGET_ATTACHMENT attachment,
+                        bool                        create,
+                        uint32_t                    numSamples             = 1,
+                        uint32_t                    renderTargetArrayIndex = 0);
 
-    HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
+    HOTTILE* GetHotTileNoLoad(SWR_CONTEXT*                pContext,
+                              DRAW_CONTEXT*               pDC,
+                              uint32_t                    macroID,
+                              SWR_RENDERTARGET_ATTACHMENT attachment,
+                              bool                        create,
+                              uint32_t                    numSamples = 1);
 
     static void ClearColorHotTile(const HOTTILE* pHotTile);
     static void ClearDepthHotTile(const HOTTILE* pHotTile);
@@ -316,14 +323,15 @@
 
 private:
     HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
-    uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+    uint32_t   mHotTileSize[SWR_NUM_ATTACHMENTS];
 
     void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
     {
         void* p = nullptr;
 #if defined(_WIN32)
         HANDLE hProcess = GetCurrentProcess();
-        p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+        p               = VirtualAllocExNuma(
+            hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
 #else
         p = AlignedMalloc(size, align);
 #endif
@@ -343,4 +351,3 @@
         }
     }
 };
-
diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h
new file mode 100644
index 0000000..e28c84d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+ * Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file tileset.h
+ *
+ * @brief Custom bitset class for managing locked tiles
+ *
+ ******************************************************************************/
+#pragma once
+
+struct TileSet // Growable bitset stored as an aligned, heap-allocated array of size_t words.
+{ // NOTE(review): owns m_bits but declares no copy ctor/assignment; a copy would double-free.
+    ~TileSet() // Releases the word array, if one was ever allocated.
+    {
+        if (m_bits)
+        {
+            AlignedFree(m_bits);
+        }
+    }
+    INLINE void set(size_t idx) // Sets bit idx, growing the storage first when idx is past it.
+    {
+        _grow(idx);
+        size_t& word = _get_word(idx);
+        word |= (size_t(1) << (idx & BITS_OFFSET)); // idx & BITS_OFFSET == bit position in word
+        m_maxSet = std::max(m_maxSet, idx + 1); // one past the highest bit set since clear()
+    }
+    INLINE bool get(size_t idx) // Returns whether bit idx is set; never allocates.
+    {
+        if (idx >= m_size) // beyond current capacity: such a bit was never set
+        {
+            return false;
+        }
+        size_t word = _get_word(idx);
+        return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
+    }
+
+    INLINE void clear() // Resets all bits; capacity (m_size / m_bits) is kept.
+    {
+        if (m_maxSet) // nothing was set since the last clear() -> nothing to zero
+        {
+            size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD; // ceil(m_maxSet / bits)
+            memset(m_bits, 0, sizeof(size_t) * num_words); // only words that can hold set bits
+            m_maxSet = 0;
+        }
+    }
+
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8; // bits held by one storage word
+    static const size_t BITS_OFFSET   = BITS_PER_WORD - 1;  // mask for the bit index in a word
+
+    size_t  m_size   = 0;       // capacity in bits; _grow keeps it a multiple of BITS_PER_WORD
+    size_t  m_maxSet = 0;       // one past the highest index passed to set() since clear()
+    size_t* m_bits   = nullptr; // word storage from AlignedMalloc; null until the first set()
+
+    INLINE size_t& _get_word(size_t idx) { return m_bits[idx / BITS_PER_WORD]; } // no bounds check
+
+    void _grow(size_t idx) // Ensures bit idx is addressable; existing bits are preserved.
+    {
+        if (idx < m_size) // already large enough
+        {
+            return;
+        }
+
+        size_t  new_size   = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET; // idx + 1 rounded up to a word
+        size_t  num_words  = new_size / BITS_PER_WORD;
+        size_t* newBits    = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64); // 64B aligned
+        size_t  copy_words = 0; // NOTE(review): newBits is used below without a null check
+
+        if (m_bits)
+        {
+            copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD; // words held by the old buffer
+            num_words -= copy_words; // from here on: number of newly added words only
+            memcpy(newBits, m_bits, copy_words * sizeof(size_t));
+
+            AlignedFree(m_bits);
+        }
+
+        m_bits = newBits;
+        m_size = new_size;
+
+        memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words); // zero only the new tail
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index d6cbf24..27c9c60 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -1,35 +1,36 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file utils.h
-*
-* @brief Utilities used by SWR core.
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file utils.h
+ *
+ * @brief Utilities used by SWR core.
+ *
+ ******************************************************************************/
 #pragma once
 
 #include <string.h>
 #include <type_traits>
 #include <algorithm>
+#include <array>
 #include "common/os.h"
 #include "common/intrin.h"
 #include "common/swr_assert.h"
@@ -53,38 +54,44 @@
 };
 #endif
 
-template<typename SIMD_T>
+template <typename SIMD_T>
 struct SIMDBBOX_T
 {
-    typename SIMD_T::Integer            ymin;
-    typename SIMD_T::Integer            ymax;
-    typename SIMD_T::Integer            xmin;
-    typename SIMD_T::Integer            xmax;
+    typename SIMD_T::Integer ymin;
+    typename SIMD_T::Integer ymax;
+    typename SIMD_T::Integer xmin;
+    typename SIMD_T::Integer xmax;
 };
 
 // helper function to unroll loops
-template<int Begin, int End, int Step = 1>
-struct UnrollerL {
-    template<typename Lambda>
-    INLINE static void step(Lambda& func) {
+template <int Begin, int End, int Step = 1>
+struct UnrollerL
+{
+    template <typename Lambda>
+    INLINE static void step(Lambda& func)
+    {
         func(Begin);
         UnrollerL<Begin + Step, End, Step>::step(func);
     }
 };
 
-template<int End, int Step>
-struct UnrollerL<End, End, Step> {
-    template<typename Lambda>
-    static void step(Lambda& func) {
+template <int End, int Step>
+struct UnrollerL<End, End, Step>
+{
+    template <typename Lambda>
+    static void step(Lambda& func)
+    {
     }
 };
 
 // helper function to unroll loops, with mask to skip specific iterations
-template<int Begin, int End, int Step = 1, int Mask = 0x7f>
-struct UnrollerLMask {
-    template<typename Lambda>
-    INLINE static void step(Lambda& func) {
-        if(Mask & (1 << Begin))
+template <int Begin, int End, int Step = 1, int Mask = 0x7f>
+struct UnrollerLMask
+{
+    template <typename Lambda>
+    INLINE static void step(Lambda& func)
+    {
+        if (Mask & (1 << Begin))
         {
             func(Begin);
         }
@@ -92,29 +99,31 @@
     }
 };
 
-template<int End, int Step, int Mask>
-struct UnrollerLMask<End, End, Step, Mask> {
-    template<typename Lambda>
-    static void step(Lambda& func) {
+template <int End, int Step, int Mask>
+struct UnrollerLMask<End, End, Step, Mask>
+{
+    template <typename Lambda>
+    static void step(Lambda& func)
+    {
     }
 };
 
 // general CRC compute
 INLINE
-uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
+uint32_t ComputeCRC(uint32_t crc, const void* pData, uint32_t size)
 {
 #if defined(_WIN64) || defined(__x86_64__)
-    uint32_t sizeInQwords = size / sizeof(uint64_t);
-    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
-    uint64_t* pDataWords = (uint64_t*)pData;
+    uint32_t  sizeInQwords       = size / sizeof(uint64_t);
+    uint32_t  sizeRemainderBytes = size % sizeof(uint64_t);
+    uint64_t* pDataWords         = (uint64_t*)pData;
     for (uint32_t i = 0; i < sizeInQwords; ++i)
     {
         crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
     }
 #else
-    uint32_t sizeInDwords = size / sizeof(uint32_t);
-    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
-    uint32_t* pDataWords = (uint32_t*)pData;
+    uint32_t  sizeInDwords       = size / sizeof(uint32_t);
+    uint32_t  sizeRemainderBytes = size % sizeof(uint32_t);
+    uint32_t* pDataWords         = (uint32_t*)pData;
     for (uint32_t i = 0; i < sizeInDwords; ++i)
     {
         crc = _mm_crc32_u32(crc, *pDataWords++);
@@ -134,8 +143,7 @@
 /// Check specified bit within a data word
 //////////////////////////////////////////////////////////////////////////
 template <typename T>
-INLINE
-static bool CheckBit(T word, uint32_t bit)
+INLINE static bool CheckBit(T word, uint32_t bit)
 {
     return 0 != (word & (T(1) << bit));
 }
@@ -144,8 +152,7 @@
 /// Add byte offset to any-type pointer
 //////////////////////////////////////////////////////////////////////////
 template <typename T>
-INLINE
-static T* PtrAdd(T* p, intptr_t offset)
+INLINE static T* PtrAdd(T* p, intptr_t offset)
 {
     intptr_t intp = reinterpret_cast<intptr_t>(p);
     return reinterpret_cast<T*>(intp + offset);
@@ -155,8 +162,7 @@
 /// Is a power-of-2?
 //////////////////////////////////////////////////////////////////////////
 template <typename T>
-INLINE
-static bool IsPow2(T value)
+INLINE static bool IsPow2(T value)
 {
     return value == (value & (T(0) - value));
 }
@@ -166,8 +172,7 @@
 /// Note: IsPow2(alignment) MUST be true
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1 AlignDownPow2(T1 value, T2 alignment)
+INLINE static T1 AlignDownPow2(T1 value, T2 alignment)
 {
     SWR_ASSERT(IsPow2(alignment));
     return value & ~T1(alignment - 1);
@@ -178,8 +183,7 @@
 /// Note: IsPow2(alignment) MUST be true
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1 AlignUpPow2(T1 value, T2 alignment)
+INLINE static T1 AlignUpPow2(T1 value, T2 alignment)
 {
     return AlignDownPow2(value + T1(alignment - 1), alignment);
 }
@@ -189,8 +193,7 @@
 /// Note: IsPow2(alignment) MUST be true
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1* AlignUpPow2(T1* value, T2 alignment)
+INLINE static T1* AlignUpPow2(T1* value, T2 alignment)
 {
     return reinterpret_cast<T1*>(
         AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
@@ -200,10 +203,12 @@
 /// Align down to specified alignment
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1 AlignDown(T1 value, T2 alignment)
+INLINE static T1 AlignDown(T1 value, T2 alignment)
 {
-    if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
+    if (IsPow2(alignment))
+    {
+        return AlignDownPow2(value, alignment);
+    }
     return value - T1(value % alignment);
 }
 
@@ -211,8 +216,7 @@
 /// Align down to specified alignment
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1* AlignDown(T1* value, T2 alignment)
+INLINE static T1* AlignDown(T1* value, T2 alignment)
 {
     return (T1*)AlignDown(uintptr_t(value), alignment);
 }
@@ -222,8 +226,7 @@
 /// Note: IsPow2(alignment) MUST be true
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1 AlignUp(T1 value, T2 alignment)
+INLINE static T1 AlignUp(T1 value, T2 alignment)
 {
     return AlignDown(value + T1(alignment - 1), alignment);
 }
@@ -233,33 +236,31 @@
 /// Note: IsPow2(alignment) MUST be true
 //////////////////////////////////////////////////////////////////////////
 template <typename T1, typename T2>
-INLINE
-static T1* AlignUp(T1* value, T2 alignment)
+INLINE static T1* AlignUp(T1* value, T2 alignment)
 {
     return AlignDown(PtrAdd(value, alignment - 1), alignment);
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// Helper structure used to access an array of elements that don't 
+/// Helper structure used to access an array of elements that don't
 /// correspond to a typical word size.
 //////////////////////////////////////////////////////////////////////////
-template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
+template <typename T, size_t BitsPerElementT, size_t ArrayLenT>
 class BitsArray
 {
 private:
-    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t BITS_PER_WORD     = sizeof(size_t) * 8;
     static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
-    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
-    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
+    static const size_t NUM_WORDS         = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
+    static const size_t ELEMENT_MASK      = (size_t(1) << BitsPerElementT) - 1;
 
     static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
-        "Element size must an integral fraction of pointer size");
+                  "Element size must an integral fraction of pointer size");
 
-    size_t              m_words[NUM_WORDS] = {};
+    size_t m_words[NUM_WORDS] = {};
 
 public:
-
-    T operator[] (size_t elementIndex) const
+    T operator[](size_t elementIndex) const
     {
         size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
         word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
@@ -323,9 +324,11 @@
         }
         if (TMax > TMin)
         {
-            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(RangedArg<T, TMin, (T)(int(TMax)-1)>{iArg.val});
+            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(
+                RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val});
         }
-        SWR_ASSUME(false); return nullptr;
+        SWR_ASSUME(false);
+        return nullptr;
     }
     template <typename T, T TVal>
     static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg)
@@ -340,19 +343,23 @@
     {
         if (iArg.val == TMax)
         {
-            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc(remainingArgs...);
+            return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TMax>>::GetFunc(
+                remainingArgs...);
         }
         if (TMax > TMin)
         {
-            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...);
+            return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(
+                RangedArg<T, TMin, (T)(int(TMax) - 1)>{iArg.val}, remainingArgs...);
         }
-        SWR_ASSUME(false); return nullptr;
+        SWR_ASSUME(false);
+        return nullptr;
     }
     template <typename T, T TVal, typename... TArgsT>
     static typename TermT::FuncType GetFunc(RangedArg<T, TVal, TVal> iArg, TArgsT... remainingArgs)
     {
         SWR_ASSERT(iArg.val == TVal);
-        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc(remainingArgs...);
+        return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<T, TVal>>::GetFunc(
+            remainingArgs...);
     }
 };
 
@@ -364,12 +371,13 @@
     std::string output;
 #if defined(_WIN32)
     DWORD valueSize = GetEnvironmentVariableA(variableName.c_str(), nullptr, 0);
-    if (!valueSize) return output;
+    if (!valueSize)
+        return output;
     output.resize(valueSize - 1); // valueSize includes null, output.resize() does not
     GetEnvironmentVariableA(variableName.c_str(), &output[0], valueSize);
 #else
-    char *env = getenv(variableName.c_str());
-    output = env ? env : "";
+    char* env = getenv(variableName.c_str());
+    output    = env ? env : "";
 #endif
 
     return output;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 284eb27..0312fc4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file JitManager.cpp
-*
-* @brief Implementation if the Jit Manager.
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file JitManager.cpp
+ *
+ * @brief Implementation of the Jit Manager.
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 
 #include "JitManager.h"
@@ -59,35 +59,77 @@
 //////////////////////////////////////////////////////////////////////////
 /// @brief Contructor for JitManager.
 /// @param simdWidth - SIMD width to be used in generated program.
-JitManager::JitManager(uint32_t simdWidth, const char *arch, const char* core)
-    : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
+JitManager::JitManager(uint32_t simdWidth, const char* arch, const char* core) :
+    mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth),
+    mArch(arch)
 {
     InitializeNativeTarget();
     InitializeNativeTargetAsmPrinter();
     InitializeNativeTargetDisassembler();
 
 
-    TargetOptions    tOpts;
+    TargetOptions tOpts;
     tOpts.AllowFPOpFusion = FPOpFusion::Fast;
-    tOpts.NoInfsFPMath = false;
-    tOpts.NoNaNsFPMath = false;
+    tOpts.NoInfsFPMath    = false;
+    tOpts.NoNaNsFPMath    = false;
     tOpts.UnsafeFPMath = false;
 
-    //tOpts.PrintMachineCode    = true;
+    // tOpts.PrintMachineCode    = true;
 
     std::unique_ptr<Module> newModule(new Module("", mContext));
     mpCurrentModule = newModule.get();
 
     StringRef hostCPUName;
 
-    hostCPUName = sys::getHostCPUName();
+    // force JIT to use the same CPU arch as the rest of swr
+    if (mArch.AVX512F())
+    {
+#if USE_SIMD16_SHADERS
+        if (mArch.AVX512ER())
+        {
+            hostCPUName = StringRef("knl");
+        }
+        else
+        {
+            hostCPUName = StringRef("skylake-avx512");
+        }
+        mUsingAVX512 = true;
+#else
+        hostCPUName = StringRef("core-avx2");
+#endif
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else if (mArch.AVX2())
+    {
+        hostCPUName = StringRef("core-avx2");
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else if (mArch.AVX())
+    {
+        if (mArch.F16C())
+        {
+            hostCPUName = StringRef("core-avx-i");
+        }
+        else
+        {
+            hostCPUName = StringRef("corei7-avx");
+        }
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else
+    {
+        SWR_INVALID("Jitting requires at least AVX ISA support");
+    }
 
-#if defined(_WIN32)
-    // Needed for MCJIT on windows
-    Triple hostTriple(sys::getProcessTriple());
-    hostTriple.setObjectFormat(Triple::COFF);
-    mpCurrentModule->setTargetTriple(hostTriple.getTriple());
-#endif // _WIN32
 
     auto optLevel = CodeGenOpt::Aggressive;
 
@@ -97,11 +139,12 @@
         optLevel = CodeGenOpt::Level(KNOB_JIT_OPTIMIZATION_LEVEL);
     }
 
+    mpCurrentModule->setTargetTriple(sys::getProcessTriple());
     mpExec = EngineBuilder(std::move(newModule))
-        .setTargetOptions(tOpts)
-        .setOptLevel(optLevel)
-        .setMCPU(hostCPUName)
-        .create();
+                 .setTargetOptions(tOpts)
+                 .setOptLevel(optLevel)
+                 .setMCPU(hostCPUName)
+                 .create();
 
     if (KNOB_JIT_ENABLE_CACHE)
     {
@@ -110,7 +153,7 @@
     }
 
 #if LLVM_USE_INTEL_JITEVENTS
-    JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
+    JITEventListener* vTune = JITEventListener::createIntelJITEventListener();
     mpExec->RegisterJITEventListener(vTune);
 #endif
 
@@ -163,30 +206,35 @@
 
     std::unique_ptr<Module> newModule(new Module("", mContext));
     mpCurrentModule = newModule.get();
-#if defined(_WIN32)
-    // Needed for MCJIT on windows
-    Triple hostTriple(sys::getProcessTriple());
-    hostTriple.setObjectFormat(Triple::COFF);
-    newModule->setTargetTriple(hostTriple.getTriple());
-#endif // _WIN32
-
+    mpCurrentModule->setTargetTriple(sys::getProcessTriple());
     mpExec->addModule(std::move(newModule));
     mIsModuleFinalized = false;
 }
 
 
-DIType* JitManager::CreateDebugStructType(StructType* pType, const std::string& name, DIFile* pFile, uint32_t lineNum,
-    const std::vector<std::pair<std::string, uint32_t>>& members)
+DIType*
+JitManager::CreateDebugStructType(StructType*                                          pType,
+                                  const std::string&                                   name,
+                                  DIFile*                                              pFile,
+                                  uint32_t                                             lineNum,
+                                  const std::vector<std::pair<std::string, uint32_t>>& members)
 {
-    DIBuilder builder(*mpCurrentModule);
+    DIBuilder                 builder(*mpCurrentModule);
     SmallVector<Metadata*, 8> ElemTypes;
-    DataLayout DL = DataLayout(mpCurrentModule);
-    uint32_t size = DL.getTypeAllocSizeInBits(pType);
-    uint32_t alignment = DL.getABITypeAlignment(pType);
-    DINode::DIFlags flags = DINode::DIFlags::FlagPublic;
+    DataLayout                DL        = DataLayout(mpCurrentModule);
+    uint32_t                  size      = DL.getTypeAllocSizeInBits(pType);
+    uint32_t                  alignment = DL.getABITypeAlignment(pType);
+    DINode::DIFlags           flags     = DINode::DIFlags::FlagPublic;
 
-    DICompositeType* pDIStructTy = builder.createStructType(pFile, name, pFile, lineNum, size, alignment,
-        flags, nullptr, builder.getOrCreateArray(ElemTypes));
+    DICompositeType* pDIStructTy = builder.createStructType(pFile,
+                                                            name,
+                                                            pFile,
+                                                            lineNum,
+                                                            size,
+                                                            alignment,
+                                                            flags,
+                                                            nullptr,
+                                                            builder.getOrCreateArray(ElemTypes));
 
     // Register mapping now to break loops (in case struct contains itself or pointers to itself)
     mDebugStructMap[pType] = pDIStructTy;
@@ -194,13 +242,14 @@
     uint32_t idx = 0;
     for (auto& elem : pType->elements())
     {
-        std::string name = members[idx].first;
-        uint32_t lineNum = members[idx].second;
-        size = DL.getTypeAllocSizeInBits(elem);
-        alignment = DL.getABITypeAlignment(elem);
-        uint32_t offset = DL.getStructLayout(pType)->getElementOffsetInBits(idx);
+        std::string name       = members[idx].first;
+        uint32_t    lineNum    = members[idx].second;
+        size                   = DL.getTypeAllocSizeInBits(elem);
+        alignment              = DL.getABITypeAlignment(elem);
+        uint32_t      offset   = DL.getStructLayout(pType)->getElementOffsetInBits(idx);
         llvm::DIType* pDebugTy = GetDebugType(elem);
-        ElemTypes.push_back(builder.createMemberType(pDIStructTy, name, pFile, lineNum, size, alignment, offset, flags, pDebugTy));
+        ElemTypes.push_back(builder.createMemberType(
+            pDIStructTy, name, pFile, lineNum, size, alignment, offset, flags, pDebugTy));
 
         idx++;
     }
@@ -211,36 +260,58 @@
 
 DIType* JitManager::GetDebugArrayType(Type* pTy)
 {
-    DIBuilder builder(*mpCurrentModule);
-    DataLayout DL = DataLayout(mpCurrentModule);
-    ArrayType* pArrayTy = cast<ArrayType>(pTy);
-    uint32_t size = DL.getTypeAllocSizeInBits(pArrayTy);
-    uint32_t alignment = DL.getABITypeAlignment(pArrayTy);
+    DIBuilder  builder(*mpCurrentModule);
+    DataLayout DL        = DataLayout(mpCurrentModule);
+    ArrayType* pArrayTy  = cast<ArrayType>(pTy);
+    uint32_t   size      = DL.getTypeAllocSizeInBits(pArrayTy);
+    uint32_t   alignment = DL.getABITypeAlignment(pArrayTy);
 
     SmallVector<Metadata*, 8> Elems;
     Elems.push_back(builder.getOrCreateSubrange(0, pArrayTy->getNumElements()));
-    return builder.createArrayType(size, alignment, GetDebugType(pArrayTy->getElementType()), builder.getOrCreateArray(Elems));
+    return builder.createArrayType(
+        size, alignment, GetDebugType(pArrayTy->getElementType()), builder.getOrCreateArray(Elems));
 }
 
 // Create a DIType from llvm Type
 DIType* JitManager::GetDebugType(Type* pTy)
 {
-    DIBuilder builder(*mpCurrentModule);
+    DIBuilder    builder(*mpCurrentModule);
     Type::TypeID id = pTy->getTypeID();
 
     switch (id)
     {
-    case Type::VoidTyID: return builder.createUnspecifiedType("void"); break;
-    case Type::HalfTyID: return builder.createBasicType("float16", 16, dwarf::DW_ATE_float); break;
-    case Type::FloatTyID: return builder.createBasicType("float", 32, dwarf::DW_ATE_float); break;
-    case Type::DoubleTyID: return builder.createBasicType("double", 64, dwarf::DW_ATE_float); break;
-    case Type::IntegerTyID: return GetDebugIntegerType(pTy); break;
-    case Type::StructTyID: return GetDebugStructType(pTy); break;
-    case Type::ArrayTyID: return GetDebugArrayType(pTy); break;
-    case Type::PointerTyID: return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64); break;
-    case Type::VectorTyID: return GetDebugVectorType(pTy); break;
-    case Type::FunctionTyID: return GetDebugFunctionType(pTy); break;
-    default: SWR_ASSERT(false, "Unimplemented llvm type");
+    case Type::VoidTyID:
+        return builder.createUnspecifiedType("void");
+        break;
+    case Type::HalfTyID:
+        return builder.createBasicType("float16", 16, dwarf::DW_ATE_float);
+        break;
+    case Type::FloatTyID:
+        return builder.createBasicType("float", 32, dwarf::DW_ATE_float);
+        break;
+    case Type::DoubleTyID:
+        return builder.createBasicType("double", 64, dwarf::DW_ATE_float);
+        break;
+    case Type::IntegerTyID:
+        return GetDebugIntegerType(pTy);
+        break;
+    case Type::StructTyID:
+        return GetDebugStructType(pTy);
+        break;
+    case Type::ArrayTyID:
+        return GetDebugArrayType(pTy);
+        break;
+    case Type::PointerTyID:
+        return builder.createPointerType(GetDebugType(pTy->getPointerElementType()), 64, 64);
+        break;
+    case Type::VectorTyID:
+        return GetDebugVectorType(pTy);
+        break;
+    case Type::FunctionTyID:
+        return GetDebugFunctionType(pTy);
+        break;
+    default:
+        SWR_ASSERT(false, "Unimplemented llvm type");
     }
     return nullptr;
 }
@@ -249,8 +320,8 @@
 DIType* JitManager::GetDebugFunctionType(Type* pTy)
 {
     SmallVector<Metadata*, 8> ElemTypes;
-    FunctionType* pFuncTy = cast<FunctionType>(pTy);
-    DIBuilder builder(*mpCurrentModule);
+    FunctionType*             pFuncTy = cast<FunctionType>(pTy);
+    DIBuilder                 builder(*mpCurrentModule);
 
     // Add result type
     ElemTypes.push_back(GetDebugType(pFuncTy->getReturnType()));
@@ -266,33 +337,48 @@
 
 DIType* JitManager::GetDebugIntegerType(Type* pTy)
 {
-    DIBuilder builder(*mpCurrentModule);
+    DIBuilder    builder(*mpCurrentModule);
     IntegerType* pIntTy = cast<IntegerType>(pTy);
     switch (pIntTy->getBitWidth())
     {
-    case 1: return builder.createBasicType("int1", 1, dwarf::DW_ATE_unsigned); break;
-    case 8: return builder.createBasicType("int8", 8, dwarf::DW_ATE_signed); break;
-    case 16: return builder.createBasicType("int16", 16, dwarf::DW_ATE_signed); break;
-    case 32: return builder.createBasicType("int", 32, dwarf::DW_ATE_signed); break;
-    case 64: return builder.createBasicType("int64", 64, dwarf::DW_ATE_signed); break;
-    case 128: return builder.createBasicType("int128", 128, dwarf::DW_ATE_signed); break;
-    default: SWR_ASSERT(false, "Unimplemented integer bit width");
+    case 1:
+        return builder.createBasicType("int1", 1, dwarf::DW_ATE_unsigned);
+        break;
+    case 8:
+        return builder.createBasicType("int8", 8, dwarf::DW_ATE_signed);
+        break;
+    case 16:
+        return builder.createBasicType("int16", 16, dwarf::DW_ATE_signed);
+        break;
+    case 32:
+        return builder.createBasicType("int", 32, dwarf::DW_ATE_signed);
+        break;
+    case 64:
+        return builder.createBasicType("int64", 64, dwarf::DW_ATE_signed);
+        break;
+    case 128:
+        return builder.createBasicType("int128", 128, dwarf::DW_ATE_signed);
+        break;
+    default:
+        SWR_ASSERT(false, "Unimplemented integer bit width");
     }
     return nullptr;
 }
 
 DIType* JitManager::GetDebugVectorType(Type* pTy)
 {
-    DIBuilder builder(*mpCurrentModule);
-    VectorType* pVecTy = cast<VectorType>(pTy);
-    DataLayout DL = DataLayout(mpCurrentModule);
-    uint32_t size = DL.getTypeAllocSizeInBits(pVecTy);
-    uint32_t alignment = DL.getABITypeAlignment(pVecTy);
+    DIBuilder                 builder(*mpCurrentModule);
+    VectorType*               pVecTy    = cast<VectorType>(pTy);
+    DataLayout                DL        = DataLayout(mpCurrentModule);
+    uint32_t                  size      = DL.getTypeAllocSizeInBits(pVecTy);
+    uint32_t                  alignment = DL.getABITypeAlignment(pVecTy);
     SmallVector<Metadata*, 1> Elems;
     Elems.push_back(builder.getOrCreateSubrange(0, pVecTy->getVectorNumElements()));
 
-    return builder.createVectorType(size, alignment, GetDebugType(pVecTy->getVectorElementType()), builder.getOrCreateArray(Elems));
-
+    return builder.createVectorType(size,
+                                    alignment,
+                                    GetDebugType(pVecTy->getVectorElementType()),
+                                    builder.getOrCreateArray(Elems));
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -303,21 +389,20 @@
 {
     if (KNOB_DUMP_SHADER_IR)
     {
-
 #if defined(_WIN32)
         DWORD pid = GetCurrentProcessId();
-        char procname[MAX_PATH];
+        char  procname[MAX_PATH];
         GetModuleFileNameA(NULL, procname, MAX_PATH);
-        const char* pBaseName = strrchr(procname, '\\');
+        const char*       pBaseName = strrchr(procname, '\\');
         std::stringstream outDir;
         outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends;
         CreateDirectoryPath(outDir.str().c_str());
 #endif
 
         std::error_code EC;
-        Module* pModule = pFunction->getParent();
-        const char *funcName = pFunction->getName().data();
-        char fName[256];
+        Module*         pModule  = pFunction->getParent();
+        const char*     funcName = pFunction->getName().data();
+        char            fName[256];
 #if defined(_WIN32)
         sprintf(fName, "%s\\%s.%s.asm", outDir.str().c_str(), funcName, fileName);
 #else
@@ -326,10 +411,15 @@
 
         raw_fd_ostream filestream(fName, EC, llvm::sys::fs::F_None);
 
-        legacy::PassManager* pMPasses = new legacy::PassManager();
-        auto* pTarget = mpExec->getTargetMachine();
+        legacy::PassManager* pMPasses         = new legacy::PassManager();
+        auto*                pTarget          = mpExec->getTargetMachine();
         pTarget->Options.MCOptions.AsmVerbose = true;
+#if LLVM_VERSION_MAJOR >= 7
+        pTarget->addPassesToEmitFile(
+            *pMPasses, filestream, nullptr, TargetMachine::CGFT_AssemblyFile);
+#else
         pTarget->addPassesToEmitFile(*pMPasses, filestream, TargetMachine::CGFT_AssemblyFile);
+#endif
         pMPasses->run(*pModule);
         delete pMPasses;
         pTarget->Options.MCOptions.AsmVerbose = false;
@@ -340,9 +430,9 @@
 {
 #if defined(_WIN32)
     DWORD pid = GetCurrentProcessId();
-    char procname[MAX_PATH];
+    char  procname[MAX_PATH];
     GetModuleFileNameA(NULL, procname, MAX_PATH);
-    const char* pBaseName = strrchr(procname, '\\');
+    const char*       pBaseName = strrchr(procname, '\\');
     std::stringstream outDir;
     outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid;
     CreateDirectoryPath(outDir.str().c_str());
@@ -353,15 +443,15 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Dump function to file.
-void JitManager::DumpToFile(Module *M, const char *fileName)
+void JitManager::DumpToFile(Module* M, const char* fileName)
 {
     if (KNOB_DUMP_SHADER_IR)
     {
         std::string outDir = GetOutputDir();
 
         std::error_code EC;
-        const char *funcName = M->getName().data();
-        char fName[256];
+        const char*     funcName = M->getName().data();
+        char            fName[256];
 #if defined(_WIN32)
         sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName);
 #else
@@ -375,15 +465,15 @@
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Dump function to file.
-void JitManager::DumpToFile(Function *f, const char *fileName)
+void JitManager::DumpToFile(Function* f, const char* fileName)
 {
     if (KNOB_DUMP_SHADER_IR)
     {
         std::string outDir = GetOutputDir();
 
         std::error_code EC;
-        const char *funcName = f->getName().data();
-        char fName[256];
+        const char*     funcName = f->getName().data();
+        char            fName[256];
 #if defined(_WIN32)
         sprintf(fName, "%s\\%s.%s.ll", outDir.c_str(), funcName, fileName);
 #else
@@ -406,27 +496,26 @@
     }
 }
 
-extern "C"
+extern "C" {
+bool g_DllActive = true;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Create JIT context.
+/// @param simdWidth - SIMD width to be used in generated program.
+HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core)
 {
-    bool g_DllActive = true;
+    return new JitManager(targetSimdWidth, arch, core);
+}
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Create JIT context.
-    /// @param simdWidth - SIMD width to be used in generated program.
-    HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch, const char* core)
+//////////////////////////////////////////////////////////////////////////
+/// @brief Destroy JIT context.
+void JITCALL JitDestroyContext(HANDLE hJitContext)
+{
+    if (g_DllActive)
     {
-        return new JitManager(targetSimdWidth, arch, core);
+        delete reinterpret_cast<JitManager*>(hJitContext);
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Destroy JIT context.
-    void JITCALL JitDestroyContext(HANDLE hJitContext)
-    {
-        if (g_DllActive)
-        {
-            delete reinterpret_cast<JitManager*>(hJitContext);
-        }
-    }
+}
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -438,31 +527,29 @@
 //////////////////////////////////////////////////////////////////////////
 struct JitCacheFileHeader
 {
-    void Init(
-        uint32_t llCRC,
-        uint32_t objCRC,
-        const std::string& moduleID,
-        const std::string& cpu,
-        uint32_t optLevel,
-        uint64_t objSize)
+    void Init(uint32_t           llCRC,
+              uint32_t           objCRC,
+              const std::string& moduleID,
+              const std::string& cpu,
+              uint32_t           optLevel,
+              uint64_t           objSize)
     {
         m_objSize = objSize;
-        m_llCRC = llCRC;
-        m_objCRC = objCRC;
+        m_llCRC   = llCRC;
+        m_objCRC  = objCRC;
         strncpy(m_ModuleID, moduleID.c_str(), JC_STR_MAX_LEN - 1);
         m_ModuleID[JC_STR_MAX_LEN - 1] = 0;
         strncpy(m_Cpu, cpu.c_str(), JC_STR_MAX_LEN - 1);
         m_Cpu[JC_STR_MAX_LEN - 1] = 0;
-        m_optLevel = optLevel;
+        m_optLevel                = optLevel;
     }
 
 
-    bool IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu, uint32_t optLevel)
+    bool
+    IsValid(uint32_t llCRC, const std::string& moduleID, const std::string& cpu, uint32_t optLevel)
     {
-        if ((m_MagicNumber != JC_MAGIC_NUMBER) ||
-            (m_llCRC != llCRC) ||
-            (m_platformKey != JC_PLATFORM_KEY) ||
-            (m_optLevel != optLevel))
+        if ((m_MagicNumber != JC_MAGIC_NUMBER) || (m_llCRC != llCRC) ||
+            (m_platformKey != JC_PLATFORM_KEY) || (m_optLevel != optLevel))
         {
             return false;
         }
@@ -486,31 +573,33 @@
     uint64_t GetObjectCRC() const { return m_objCRC; }
 
 private:
-    static const uint64_t   JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 3;
-    static const size_t     JC_STR_MAX_LEN = 32;
-    static const uint32_t   JC_PLATFORM_KEY =
-        (LLVM_VERSION_MAJOR << 24)  |
-        (LLVM_VERSION_MINOR << 16)  |
-        (LLVM_VERSION_PATCH << 8)   |
-        ((sizeof(void*) > sizeof(uint32_t)) ? 1 : 0);
+    static const uint64_t JC_MAGIC_NUMBER = 0xfedcba9876543211ULL + 4;
+    static const size_t   JC_STR_MAX_LEN  = 32;
+    static const uint32_t JC_PLATFORM_KEY = (LLVM_VERSION_MAJOR << 24) |
+                                            (LLVM_VERSION_MINOR << 16) | (LLVM_VERSION_PATCH << 8) |
+                                            ((sizeof(void*) > sizeof(uint32_t)) ? 1 : 0);
 
-    uint64_t m_MagicNumber = JC_MAGIC_NUMBER;
-    uint64_t m_objSize = 0;
-    uint32_t m_llCRC = 0;
-    uint32_t m_platformKey = JC_PLATFORM_KEY;
-    uint32_t m_objCRC = 0;
-    uint32_t m_optLevel = 0;
-    char m_ModuleID[JC_STR_MAX_LEN] = {};
-    char m_Cpu[JC_STR_MAX_LEN] = {};
+    uint64_t m_MagicNumber              = JC_MAGIC_NUMBER;
+    uint64_t m_objSize                  = 0;
+    uint32_t m_llCRC                    = 0;
+    uint32_t m_platformKey              = JC_PLATFORM_KEY;
+    uint32_t m_objCRC                   = 0;
+    uint32_t m_optLevel                 = 0;
+    char     m_ModuleID[JC_STR_MAX_LEN] = {};
+    char     m_Cpu[JC_STR_MAX_LEN]      = {};
 };
 
 static inline uint32_t ComputeModuleCRC(const llvm::Module* M)
 {
-    std::string bitcodeBuffer;
+    std::string        bitcodeBuffer;
     raw_string_ostream bitcodeStream(bitcodeBuffer);
 
+#if LLVM_VERSION_MAJOR >= 7
+    llvm::WriteBitcodeToFile(*M, bitcodeStream);
+#else
     llvm::WriteBitcodeToFile(M, bitcodeStream);
-    //M->print(bitcodeStream, nullptr, false);
+#endif
+    // M->print(bitcodeStream, nullptr, false);
 
     bitcodeStream.flush();
 
@@ -521,14 +610,17 @@
 JitCache::JitCache()
 {
 #if defined(__APPLE__) || defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
-    if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0) {
-        char *homedir;
-        if (!(homedir = getenv("HOME"))) {
+    if (strncmp(KNOB_JIT_CACHE_DIR.c_str(), "~/", 2) == 0)
+    {
+        char* homedir;
+        if (!(homedir = getenv("HOME")))
+        {
             homedir = getpwuid(getuid())->pw_dir;
         }
         mCacheDir = homedir;
         mCacheDir += (KNOB_JIT_CACHE_DIR.c_str() + 1);
-    } else
+    }
+    else
 #endif
     {
         mCacheDir = KNOB_JIT_CACHE_DIR;
@@ -537,14 +629,11 @@
 
 int ExecUnhookedProcess(const std::string& CmdLine, std::string* pStdOut, std::string* pStdErr)
 {
-    static const char *g_pEnv = "RASTY_DISABLE_HOOK=1\0";
-
-    return ExecCmd(CmdLine, g_pEnv, pStdOut, pStdErr);
+    return ExecCmd(CmdLine, "", pStdOut, pStdErr);
 }
 
-
 /// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
-void JitCache::notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj)
+void JitCache::notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj)
 {
     const std::string& moduleID = M->getModuleIdentifier();
     if (!moduleID.length())
@@ -568,7 +657,7 @@
     objPath += JIT_OBJ_EXT;
 
     {
-        std::error_code err;
+        std::error_code      err;
         llvm::raw_fd_ostream fileObj(objPath.c_str(), err, llvm::sys::fs::F_None);
         fileObj << Obj.getBuffer();
         fileObj.flush();
@@ -576,7 +665,7 @@
 
 
     {
-        std::error_code err;
+        std::error_code      err;
         llvm::raw_fd_ostream fileObj(filePath.c_str(), err, llvm::sys::fs::F_None);
 
         uint32_t objcrc = ComputeCRC(0, Obj.getBufferStart(), Obj.getBufferSize());
@@ -594,7 +683,7 @@
 std::unique_ptr<llvm::MemoryBuffer> JitCache::getObject(const llvm::Module* M)
 {
     const std::string& moduleID = M->getModuleIdentifier();
-    mCurrentModuleCRC = ComputeModuleCRC(M);
+    mCurrentModuleCRC           = ComputeModuleCRC(M);
 
     if (!moduleID.length())
     {
@@ -613,7 +702,7 @@
     objFilePath += JIT_OBJ_EXT;
 
     FILE* fpObjIn = nullptr;
-    FILE* fpIn = fopen(filePath.c_str(), "rb");
+    FILE* fpIn    = fopen(filePath.c_str(), "rb");
     if (!fpIn)
     {
         return nullptr;
@@ -657,8 +746,7 @@
             break;
         }
 
-    }
-    while (0);
+    } while (0);
 
     fclose(fpIn);
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c15e0d1..a5b6af9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file JitManager.h
-*
-* @brief JitManager contains the LLVM data structures used for JIT generation
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file JitManager.h
+ *
+ * @brief JitManager contains the LLVM data structures used for JIT generation
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "jit_pch.hpp"
@@ -37,7 +37,7 @@
 /// JitInstructionSet
 /// @brief Subclass of InstructionSet that allows users to override
 /// the reporting of support for certain ISA features.  This allows capping
-/// the jitted code to a certain feature level, e.g. jit AVX level code on 
+/// the jitted code to a certain feature level, e.g. jit AVX level code on
 /// a platform that supports AVX2.
 //////////////////////////////////////////////////////////////////////////
 class JitInstructionSet : public InstructionSet
@@ -47,44 +47,42 @@
     {
         std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower);
 
-        if(isaRequest == "avx")
+        if (isaRequest == "avx")
         {
-            bForceAVX = true;
-            bForceAVX2 = false;
+            bForceAVX    = true;
+            bForceAVX2   = false;
             bForceAVX512 = false;
         }
-        else if(isaRequest == "avx2")
+        else if (isaRequest == "avx2")
         {
-            bForceAVX = false;
-            bForceAVX2 = true;
+            bForceAVX    = false;
+            bForceAVX2   = true;
             bForceAVX512 = false;
         }
-        else if(isaRequest == "avx512")
+        else if (isaRequest == "avx512")
         {
-            bForceAVX = false;
-            bForceAVX2 = false;
+            bForceAVX    = false;
+            bForceAVX2   = false;
             bForceAVX512 = true;
         }
     };
 
     bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); }
     bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); }
+    bool AVX512ER(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512ER(); }
     bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); }
 
 private:
-    bool bForceAVX = false;
-    bool bForceAVX2 = false;
-    bool bForceAVX512 = false;
+    bool        bForceAVX    = false;
+    bool        bForceAVX2   = false;
+    bool        bForceAVX512 = false;
     std::string isaRequest;
 };
 
-
-
 struct JitLLVMContext : llvm::LLVMContext
 {
 };
 
-
 //////////////////////////////////////////////////////////////////////////
 /// JitCache
 //////////////////////////////////////////////////////////////////////////
@@ -96,30 +94,27 @@
     JitCache();
     virtual ~JitCache() {}
 
-    void Init(
-        JitManager* pJitMgr,
-        const llvm::StringRef& cpu,
-        llvm::CodeGenOpt::Level level)
+    void Init(JitManager* pJitMgr, const llvm::StringRef& cpu, llvm::CodeGenOpt::Level level)
     {
-        mCpu = cpu.str();
-        mpJitMgr = pJitMgr;
+        mCpu      = cpu.str();
+        mpJitMgr  = pJitMgr;
         mOptLevel = level;
     }
 
     /// notifyObjectCompiled - Provides a pointer to compiled code for Module M.
-    virtual void notifyObjectCompiled(const llvm::Module *M, llvm::MemoryBufferRef Obj);
+    void notifyObjectCompiled(const llvm::Module* M, llvm::MemoryBufferRef Obj) override;
 
     /// Returns a pointer to a newly allocated MemoryBuffer that contains the
     /// object which corresponds with Module M, or 0 if an object is not
     /// available.
-    virtual std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M);
+    std::unique_ptr<llvm::MemoryBuffer> getObject(const llvm::Module* M) override;
 
 private:
-    std::string mCpu;
+    std::string                 mCpu;
     llvm::SmallString<MAX_PATH> mCacheDir;
-    uint32_t mCurrentModuleCRC = 0;
-    JitManager* mpJitMgr = nullptr;
-    llvm::CodeGenOpt::Level mOptLevel = llvm::CodeGenOpt::None;
+    uint32_t                    mCurrentModuleCRC = 0;
+    JitManager*                 mpJitMgr          = nullptr;
+    llvm::CodeGenOpt::Level     mOptLevel         = llvm::CodeGenOpt::None;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -130,32 +125,33 @@
     JitManager(uint32_t w, const char* arch, const char* core);
     ~JitManager(){};
 
-    JitLLVMContext          mContext;   ///< LLVM compiler
-    llvm::IRBuilder<>       mBuilder;   ///< LLVM IR Builder
-    llvm::ExecutionEngine*  mpExec;
-    JitCache                mCache;
+    JitLLVMContext         mContext; ///< LLVM compiler
+    llvm::IRBuilder<>      mBuilder; ///< LLVM IR Builder
+    llvm::ExecutionEngine* mpExec;
+    JitCache               mCache;
 
     // Need to be rebuilt after a JIT and before building new IR
-    llvm::Module*           mpCurrentModule;
-    bool                    mIsModuleFinalized;
-    uint32_t                mJitNumber;
+    llvm::Module* mpCurrentModule;
+    bool          mIsModuleFinalized;
+    uint32_t      mJitNumber;
 
-    uint32_t                mVWidth;
+    uint32_t mVWidth;
 
+    bool mUsingAVX512 = false;
 
     // fetch shader types
-    llvm::FunctionType*     mFetchShaderTy;
+    llvm::FunctionType* mFetchShaderTy;
 
-    JitInstructionSet       mArch;
+    JitInstructionSet mArch;
 
     // Debugging support
     std::unordered_map<llvm::StructType*, llvm::DIType*> mDebugStructMap;
 
     void SetupNewModule();
 
-    void DumpAsm(llvm::Function* pFunction, const char* fileName);
-    static void DumpToFile(llvm::Function *f, const char *fileName);
-    static void DumpToFile(llvm::Module *M, const char *fileName);
+    void               DumpAsm(llvm::Function* pFunction, const char* fileName);
+    static void        DumpToFile(llvm::Function* f, const char* fileName);
+    static void        DumpToFile(llvm::Module* M, const char* fileName);
     static std::string GetOutputDir();
 
     // Debugging support methods
@@ -175,6 +171,10 @@
         return mDebugStructMap[pStructTy];
     }
 
-    llvm::DIType* CreateDebugStructType(llvm::StructType* pType, const std::string& name, llvm::DIFile* pFile, uint32_t lineNum,
-        const std::vector<std::pair<std::string, uint32_t>>& members);
+    llvm::DIType*
+    CreateDebugStructType(llvm::StructType*                                    pType,
+                          const std::string&                                   name,
+                          llvm::DIFile*                                        pFile,
+                          uint32_t                                             lineNum,
+                          const std::vector<std::pair<std::string, uint32_t>>& members);
 };
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 58fdb7f..d5328c8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file blend_jit.cpp
-*
-* @brief Implementation of the blend jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file blend_jit.cpp
+ *
+ * @brief Implementation of the blend jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder.h"
 #include "jit_api.h"
@@ -47,8 +47,13 @@
 {
     BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
 
-    template<bool Color, bool Alpha>
-    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
+    template <bool Color, bool Alpha>
+    void GenerateBlendFactor(SWR_BLEND_FACTOR factor,
+                             Value*           constColor[4],
+                             Value*           src[4],
+                             Value*           src1[4],
+                             Value*           dst[4],
+                             Value*           result[4])
     {
         Value* out[4];
 
@@ -77,7 +82,7 @@
             break;
         case BLENDFACTOR_SRC_ALPHA_SATURATE:
             out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
-            out[3] = VIMMED1(1.0f);
+            out[3]                   = VIMMED1(1.0f);
             break;
         case BLENDFACTOR_CONST_COLOR:
             out[0] = constColor[0];
@@ -158,7 +163,7 @@
     void Clamp(SWR_FORMAT format, Value* src[4])
     {
         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-        SWR_TYPE type = info.type[0];
+        SWR_TYPE               type = info.type[0];
 
         switch (type)
         {
@@ -179,7 +184,8 @@
             src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
             break;
 
-        case SWR_TYPE_UNKNOWN: SWR_INVALID("Unsupport format type: %d", type);
+        case SWR_TYPE_UNKNOWN:
+            SWR_INVALID("Unsupport format type: %d", type);
         }
     }
 
@@ -187,7 +193,7 @@
     {
         const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 
-        bool valid[] = { false, false, false, false };
+        bool valid[] = {false, false, false, false};
         for (uint32_t c = 0; c < info.numComps; ++c)
         {
             valid[info.swizzle[c]] = true;
@@ -210,7 +216,8 @@
         {
             if (info.type[c] == SWR_TYPE_UNUSED)
             {
-                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
+                src[info.swizzle[c]] =
+                    BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
             }
         }
     }
@@ -223,22 +230,28 @@
             if (info.bpc[c] <= QUANTIZE_THRESHOLD && info.type[c] != SWR_TYPE_UNUSED)
             {
                 uint32_t swizComp = info.swizzle[c];
-                float factor = (float)((1 << info.bpc[c]) - 1);
+                float    factor   = (float)((1 << info.bpc[c]) - 1);
                 switch (info.type[c])
                 {
                 case SWR_TYPE_UNORM:
                     src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
                     src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
-                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
+                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
                     break;
-                default: SWR_INVALID("Unsupported format type: %d", info.type[c]);
+                default:
+                    SWR_INVALID("Unsupported format type: %d", info.type[c]);
                 }
             }
         }
     }
 
-    template<bool Color, bool Alpha>
-    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
+    template <bool Color, bool Alpha>
+    void BlendFunc(SWR_BLEND_OP blendOp,
+                   Value*       src[4],
+                   Value*       srcFactor[4],
+                   Value*       dst[4],
+                   Value*       dstFactor[4],
+                   Value*       result[4])
     {
         Value* out[4];
         Value* srcBlend[4];
@@ -308,7 +321,7 @@
     void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
     {
         // Op: (s == PS output, d = RT contents)
-        switch(logicOp)
+        switch (logicOp)
         {
         case LOGICOP_CLEAR:
             result[0] = VIMMED1(0);
@@ -443,32 +456,49 @@
         }
     }
 
-    void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
+    void
+    AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
     {
         // load uint32_t reference
-        Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
-        
+        Value* pRef = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_alphaTestReference}));
+
         // load alpha
-        Value* pAlpha = LOAD(ppAlpha, { 0, 0 });
+        Value* pAlpha = LOAD(ppAlpha, {0, 0});
 
         Value* pTest = nullptr;
         if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
         {
             // convert float alpha to unorm8
             Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
-            pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
+            pAlphaU8        = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
 
             // compare
             switch (state.alphaTestFunction)
             {
-            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
-            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
-            case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
-            case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
-            case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
-            case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
-            case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
-            case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
+            case ZFUNC_ALWAYS:
+                pTest = VIMMED1(true);
+                break;
+            case ZFUNC_NEVER:
+                pTest = VIMMED1(false);
+                break;
+            case ZFUNC_LT:
+                pTest = ICMP_ULT(pAlphaU8, pRef);
+                break;
+            case ZFUNC_EQ:
+                pTest = ICMP_EQ(pAlphaU8, pRef);
+                break;
+            case ZFUNC_LE:
+                pTest = ICMP_ULE(pAlphaU8, pRef);
+                break;
+            case ZFUNC_GT:
+                pTest = ICMP_UGT(pAlphaU8, pRef);
+                break;
+            case ZFUNC_NE:
+                pTest = ICMP_NE(pAlphaU8, pRef);
+                break;
+            case ZFUNC_GE:
+                pTest = ICMP_UGE(pAlphaU8, pRef);
+                break;
             default:
                 SWR_INVALID("Invalid alpha test function");
                 break;
@@ -482,14 +512,30 @@
             // compare
             switch (state.alphaTestFunction)
             {
-            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
-            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
-            case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
-            case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
-            case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
-            case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
-            case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
-            case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
+            case ZFUNC_ALWAYS:
+                pTest = VIMMED1(true);
+                break;
+            case ZFUNC_NEVER:
+                pTest = VIMMED1(false);
+                break;
+            case ZFUNC_LT:
+                pTest = FCMP_OLT(pAlpha, pRef);
+                break;
+            case ZFUNC_EQ:
+                pTest = FCMP_OEQ(pAlpha, pRef);
+                break;
+            case ZFUNC_LE:
+                pTest = FCMP_OLE(pAlpha, pRef);
+                break;
+            case ZFUNC_GT:
+                pTest = FCMP_OGT(pAlpha, pRef);
+                break;
+            case ZFUNC_NE:
+                pTest = FCMP_ONE(pAlpha, pRef);
+                break;
+            case ZFUNC_GE:
+                pTest = FCMP_OGE(pAlpha, pRef);
+                break;
             default:
                 SWR_INVALID("Invalid alpha test function");
                 break;
@@ -514,22 +560,24 @@
 
     Function* Create(const BLEND_COMPILE_STATE& state)
     {
-        std::stringstream fnName("BLND_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+        std::stringstream fnName("BLND_",
+                                 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
         fnName << ComputeCRC(0, &state, sizeof(state));
 
         // blend function signature
-        //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
+        // typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_CONTEXT*);
 
         std::vector<Type*> args{
             PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0) // SWR_BLEND_CONTEXT*
         };
 
-        //std::vector<Type*> args{
+        // std::vector<Type*> args{
         //    PointerType::get(Gen_SWR_BLEND_CONTEXT(JM()), 0), // SWR_BLEND_CONTEXT*
         //};
 
-        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
-        Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+        FunctionType* fTy       = FunctionType::get(IRB()->getVoidTy(), args, false);
+        Function*     blendFunc = Function::Create(
+            fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
         blendFunc->getParent()->setModuleIdentifier(blendFunc->getName());
 
         BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
@@ -537,31 +585,30 @@
         IRB()->SetInsertPoint(entry);
 
         // arguments
-        auto argitr = blendFunc->arg_begin();
+        auto   argitr        = blendFunc->arg_begin();
         Value* pBlendContext = &*argitr++;
         pBlendContext->setName("pBlendContext");
-        Value* pBlendState = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pBlendState });
+        Value* pBlendState = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pBlendState});
         pBlendState->setName("pBlendState");
-        Value* pSrc = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src });
+        Value* pSrc = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src});
         pSrc->setName("src");
-        Value* pSrc1 = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src1 });
+        Value* pSrc1 = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src1});
         pSrc1->setName("src1");
-        Value* pSrc0Alpha = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_src0alpha });
+        Value* pSrc0Alpha = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_src0alpha});
         pSrc0Alpha->setName("src0alpha");
-        Value* sampleNum = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_sampleNum });
+        Value* sampleNum = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_sampleNum});
         sampleNum->setName("sampleNum");
-        Value* pDst = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pDst });
+        Value* pDst = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pDst});
         pDst->setName("pDst");
-        Value* pResult = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_result });
+        Value* pResult = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_result});
         pResult->setName("result");
-        Value* ppoMask = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_oMask });
+        Value* ppoMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_oMask});
         ppoMask->setName("ppoMask");
-        Value* ppMask = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_pMask });
+        Value* ppMask = LOAD(pBlendContext, {0, SWR_BLEND_CONTEXT_pMask});
         ppMask->setName("pMask");
-        Value* AlphaTest1 = LOAD(pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaBlended });
-        ppMask->setName("AlphaTest1");
 
-        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
+        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT,
+                      "Unsupported hot tile format");
         Value* dst[4];
         Value* constantColor[4];
         Value* src[4];
@@ -570,44 +617,44 @@
         for (uint32_t i = 0; i < 4; ++i)
         {
             // load hot tile
-            dst[i] = LOAD(pDst, { 0, i });
+            dst[i] = LOAD(pDst, {0, i});
 
             // load constant color
-            constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
-        
+            constantColor[i] = VBROADCAST(LOAD(pBlendState, {0, SWR_BLEND_STATE_constantColor, i}));
+
             // load src
-            src[i] = LOAD(pSrc, { 0, i });
+            src[i] = LOAD(pSrc, {0, i});
 
             // load src1
-            src1[i] = LOAD(pSrc1, { 0, i });
+            src1[i] = LOAD(pSrc1, {0, i});
         }
         Value* currentSampleMask = VIMMED1(-1);
         if (state.desc.alphaToCoverageEnable)
         {
-            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
-            uint32_t bits = (1 << state.desc.numSamples) - 1;
-            currentSampleMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
-            currentSampleMask = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
+            Value*   pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
+            uint32_t bits        = (1 << state.desc.numSamples) - 1;
+            currentSampleMask    = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+            currentSampleMask    = FP_TO_SI(FADD(currentSampleMask, VIMMED1(0.5f)), mSimdInt32Ty);
         }
 
         // alpha test
         if (state.desc.alphaTestEnable)
         {
             // Gather for archrast stats
-            STORE(C(1), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaTested });
+            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
             AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
         }
         else
         {
             // Gather for archrast stats
-            STORE(C(0), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaTested });
+            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaTested});
         }
 
         // color blend
         if (state.blendState.blendEnable)
         {
             // Gather for archrast stats
-            STORE(C(1), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaBlended });
+            STORE(C(1), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
 
             // clamp sources
             Clamp(state.format, src);
@@ -637,40 +684,57 @@
             Value* dstFactor[4];
             if (state.desc.independentAlphaBlendEnable)
             {
-                GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
-                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
+                GenerateBlendFactor<true, false>(
+                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
+                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor,
+                                                 constantColor,
+                                                 src,
+                                                 src1,
+                                                 dst,
+                                                 srcFactor);
 
-                GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
-                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
+                GenerateBlendFactor<true, false>(
+                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor,
+                                                 constantColor,
+                                                 src,
+                                                 src1,
+                                                 dst,
+                                                 dstFactor);
 
-                BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
-                BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
+                BlendFunc<true, false>(
+                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+                BlendFunc<false, true>(
+                    state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
             }
             else
             {
-                GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
-                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+                GenerateBlendFactor<true, true>(
+                    state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
+                GenerateBlendFactor<true, true>(
+                    state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
 
-                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+                BlendFunc<true, true>(
+                    state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
             }
 
             // store results out
             for (uint32_t i = 0; i < 4; ++i)
             {
-                STORE(result[i], pResult, { 0, i });
+                STORE(result[i], pResult, {0, i});
             }
         }
         else
         {
             // Gather for archrast stats
-            STORE(C(0), pBlendContext, { 0, SWR_BLEND_CONTEXT_isAlphaBlended });
+            STORE(C(0), pBlendContext, {0, SWR_BLEND_CONTEXT_isAlphaBlended});
         }
-        
-        if(state.blendState.logicOpEnable)
+
+        if (state.blendState.logicOpEnable)
         {
             const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
-            Value* vMask[4];
-            float scale[4];
+            Value*                 vMask[4];
+            float                  scale[4];
 
             if (!state.blendState.blendEnable)
             {
@@ -678,7 +742,7 @@
                 Clamp(state.format, dst);
             }
 
-            for(uint32_t i = 0; i < 4; i++)
+            for (uint32_t i = 0; i < 4; i++)
             {
                 if (info.type[i] == SWR_TYPE_UNUSED)
                 {
@@ -715,20 +779,12 @@
                     dst[i] = BITCAST(dst[i], mSimdInt32Ty);
                     break;
                 case SWR_TYPE_SNORM:
-                    src[i] = FP_TO_SI(
-                        FMUL(src[i], VIMMED1(scale[i])),
-                        mSimdInt32Ty);
-                    dst[i] = FP_TO_SI(
-                        FMUL(dst[i], VIMMED1(scale[i])),
-                        mSimdInt32Ty);
+                    src[i] = FP_TO_SI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
+                    dst[i] = FP_TO_SI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
                     break;
                 case SWR_TYPE_UNORM:
-                    src[i] = FP_TO_UI(
-                        FMUL(src[i], VIMMED1(scale[i])),
-                        mSimdInt32Ty);
-                    dst[i] = FP_TO_UI(
-                        FMUL(dst[i], VIMMED1(scale[i])),
-                        mSimdInt32Ty);
+                    src[i] = FP_TO_UI(FMUL(src[i], VIMMED1(scale[i])), mSimdInt32Ty);
+                    dst[i] = FP_TO_UI(FMUL(dst[i], VIMMED1(scale[i])), mSimdInt32Ty);
                     break;
                 }
             }
@@ -736,7 +792,7 @@
             LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
 
             // store results out
-            for(uint32_t i = 0; i < 4; ++i)
+            for (uint32_t i = 0; i < 4; ++i)
             {
                 if (info.type[i] == SWR_TYPE_UNUSED)
                 {
@@ -763,12 +819,10 @@
                 case SWR_TYPE_SNORM:
                     result[i] = SHL(result[i], C(32 - info.bpc[i]));
                     result[i] = ASHR(result[i], C(32 - info.bpc[i]));
-                    result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty),
-                                     VIMMED1(1.0f / scale[i]));
+                    result[i] = FMUL(SI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
                     break;
                 case SWR_TYPE_UNORM:
-                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
-                                     VIMMED1(1.0f / scale[i]));
+                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty), VIMMED1(1.0f / scale[i]));
                     break;
                 }
 
@@ -776,27 +830,27 @@
             }
         }
 
-        if(state.desc.oMaskEnable)
+        if (state.desc.oMaskEnable)
         {
             assert(!(state.desc.alphaToCoverageEnable));
             // load current mask
-            Value* oMask = LOAD(ppoMask);
+            Value* oMask      = LOAD(ppoMask);
             currentSampleMask = AND(oMask, currentSampleMask);
         }
 
-        if(state.desc.sampleMaskEnable)
+        if (state.desc.sampleMaskEnable)
         {
-            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
+            Value* sampleMask = LOAD(pBlendState, {0, SWR_BLEND_STATE_sampleMask});
             currentSampleMask = AND(VBROADCAST(sampleMask), currentSampleMask);
         }
 
-        if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
-           state.desc.oMaskEnable)
+        if (state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
+            state.desc.oMaskEnable)
         {
             // load coverage mask and mask off any lanes with no samples
-            Value* pMask = LOAD(ppMask);
+            Value* pMask        = LOAD(ppMask);
             Value* sampleMasked = SHL(C(1), sampleNum);
-            currentSampleMask = AND(currentSampleMask, VBROADCAST(sampleMasked));
+            currentSampleMask   = AND(currentSampleMask, VBROADCAST(sampleMasked));
             currentSampleMask = S_EXT(ICMP_UGT(currentSampleMask, VBROADCAST(C(0))), mSimdInt32Ty);
             Value* outputMask = AND(pMask, currentSampleMask);
             // store new mask
@@ -816,12 +870,11 @@
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
         passes.add(createInstructionCombiningPass());
-        passes.add(createInstructionSimplifierPass());
         passes.add(createConstantPropagationPass());
         passes.add(createSCCPPass());
         passes.add(createAggressiveDCEPass());
 
-        passes.add(createLowerX86Pass(JM(), this));
+        passes.add(createLowerX86Pass(this));
 
         passes.run(*blendFunc);
 
@@ -838,11 +891,12 @@
 /// @return PFN_FETCH_FUNC - pointer to fetch code
 PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
 {
-    const llvm::Function *func = (const llvm::Function*)hFunc;
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-    PFN_BLEND_JIT_FUNC pfnBlend;
+    const llvm::Function* func    = (const llvm::Function*)hFunc;
+    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+    PFN_BLEND_JIT_FUNC    pfnBlend;
     pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
+    // add new IR to the module
     pJitMgr->mIsModuleFinalized = true;
 
     return pfnBlend;
@@ -852,14 +906,15 @@
 /// @brief JIT compiles blend shader
 /// @param hJitMgr - JitManager handle
 /// @param state   - blend state to build function from
-extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
+extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE                     hJitMgr,
+                                                      const BLEND_COMPILE_STATE& state)
 {
     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 
     pJitMgr->SetupNewModule();
 
     BlendJit theJit(pJitMgr);
-    HANDLE hFunc = theJit.Create(state);
+    HANDLE   hFunc = theJit.Create(state);
 
     return JitBlendFunc(hJitMgr, hFunc);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
index ddb7374..3e78054 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file blend_jit.h
-*
-* @brief Definition of the blend jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file blend_jit.h
+ *
+ * @brief Definition of the blend jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/formats.h"
@@ -34,15 +34,15 @@
 
 struct RENDER_TARGET_BLEND_COMPILE_STATE
 {
-    bool blendEnable;
-    bool logicOpEnable;
+    bool             blendEnable;
+    bool             logicOpEnable;
     SWR_BLEND_FACTOR sourceAlphaBlendFactor;
     SWR_BLEND_FACTOR destAlphaBlendFactor;
     SWR_BLEND_FACTOR sourceBlendFactor;
     SWR_BLEND_FACTOR destBlendFactor;
-    SWR_BLEND_OP colorBlendFunc;
-    SWR_BLEND_OP alphaBlendFunc;
-    SWR_LOGIC_OP logicOpFunc;
+    SWR_BLEND_OP     colorBlendFunc;
+    SWR_BLEND_OP     alphaBlendFunc;
+    SWR_LOGIC_OP     logicOpFunc;
 };
 
 enum ALPHA_TEST_FORMAT
@@ -60,14 +60,14 @@
     {
         struct
         {
-            uint32_t            alphaTestEnable: 1;
-            uint32_t            independentAlphaBlendEnable: 1;
-            uint32_t            alphaToCoverageEnable: 1;
-            uint32_t            oMaskEnable:1;
-            uint32_t            inputCoverageEnable:1;
-            uint32_t            sampleMaskEnable:1;
-            uint32_t            numSamples:5;
-            uint32_t            _reserved : 21;
+            uint32_t alphaTestEnable : 1;
+            uint32_t independentAlphaBlendEnable : 1;
+            uint32_t alphaToCoverageEnable : 1;
+            uint32_t oMaskEnable : 1;
+            uint32_t inputCoverageEnable : 1;
+            uint32_t sampleMaskEnable : 1;
+            uint32_t numSamples : 5;
+            uint32_t _reserved : 21;
         };
         uint32_t bits;
     };
@@ -78,11 +78,11 @@
 //////////////////////////////////////////////////////////////////////////
 struct BLEND_COMPILE_STATE
 {
-    SWR_FORMAT format;          // format of render target being blended
+    SWR_FORMAT                        format; // format of render target being blended
     RENDER_TARGET_BLEND_COMPILE_STATE blendState;
-    BLEND_DESC desc;
+    BLEND_DESC                        desc;
 
-    SWR_ZFUNCTION alphaTestFunction;
+    SWR_ZFUNCTION     alphaTestFunction;
     ALPHA_TEST_FORMAT alphaTestFormat;
 
     bool operator==(const BLEND_COMPILE_STATE& other) const
@@ -95,18 +95,18 @@
     {
         if (!desc.alphaTestEnable)
         {
-            alphaTestFormat = (ALPHA_TEST_FORMAT)0;
+            alphaTestFormat   = (ALPHA_TEST_FORMAT)0;
             alphaTestFunction = (SWR_ZFUNCTION)0;
         }
 
         if (!blendState.blendEnable)
         {
             blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.sourceBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.destBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.colorBlendFunc = (SWR_BLEND_OP)0;
-            blendState.alphaBlendFunc = (SWR_BLEND_OP)0;
+            blendState.destAlphaBlendFactor   = (SWR_BLEND_FACTOR)0;
+            blendState.sourceBlendFactor      = (SWR_BLEND_FACTOR)0;
+            blendState.destBlendFactor        = (SWR_BLEND_FACTOR)0;
+            blendState.colorBlendFunc         = (SWR_BLEND_OP)0;
+            blendState.alphaBlendFunc         = (SWR_BLEND_OP)0;
         }
 
         if (!blendState.logicOpEnable)
@@ -122,8 +122,8 @@
         if (!desc.independentAlphaBlendEnable)
         {
             blendState.sourceAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.destAlphaBlendFactor = (SWR_BLEND_FACTOR)0;
-            blendState.alphaBlendFunc = (SWR_BLEND_OP)0;
+            blendState.destAlphaBlendFactor   = (SWR_BLEND_FACTOR)0;
+            blendState.alphaBlendFunc         = (SWR_BLEND_OP)0;
         }
     }
 };
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index bd81560..ef95e01 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder.h
-* 
-* @brief Includes all the builder related functionality
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder.cpp
+ *
+ * @brief Includes all the builder related functionality
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 
 #include "jit_pch.hpp"
 #include "builder.h"
@@ -38,11 +38,9 @@
     //////////////////////////////////////////////////////////////////////////
     /// @brief Contructor for Builder.
     /// @param pJitMgr - JitManager which contains modules, function passes, etc.
-    Builder::Builder(JitManager *pJitMgr)
-        : mpJitMgr(pJitMgr),
-          mpPrivateContext(nullptr)
+    Builder::Builder(JitManager* pJitMgr) : mpJitMgr(pJitMgr), mpPrivateContext(nullptr)
     {
-        mVWidth = pJitMgr->mVWidth;
+        mVWidth   = pJitMgr->mVWidth;
         mVWidth16 = 16;
 
         mpIRBuilder = &pJitMgr->mBuilder;
@@ -70,29 +68,29 @@
 
         // Built in types: simd16
 
-        mSimd16Int1Ty       = VectorType::get(mInt1Ty,  mVWidth16);
-        mSimd16Int16Ty      = VectorType::get(mInt16Ty, mVWidth16);
-        mSimd16Int32Ty      = VectorType::get(mInt32Ty, mVWidth16);
-        mSimd16Int64Ty      = VectorType::get(mInt64Ty, mVWidth16);
-        mSimd16FP16Ty       = VectorType::get(mFP16Ty,  mVWidth16);
-        mSimd16FP32Ty       = VectorType::get(mFP32Ty,  mVWidth16);
-        mSimd16VectorTy     = ArrayType::get(mSimd16FP32Ty, 4);
-        mSimd16VectorTRTy   = ArrayType::get(mSimd16FP32Ty, 5);
+        mSimd16Int1Ty     = VectorType::get(mInt1Ty, mVWidth16);
+        mSimd16Int16Ty    = VectorType::get(mInt16Ty, mVWidth16);
+        mSimd16Int32Ty    = VectorType::get(mInt32Ty, mVWidth16);
+        mSimd16Int64Ty    = VectorType::get(mInt64Ty, mVWidth16);
+        mSimd16FP16Ty     = VectorType::get(mFP16Ty, mVWidth16);
+        mSimd16FP32Ty     = VectorType::get(mFP32Ty, mVWidth16);
+        mSimd16VectorTy   = ArrayType::get(mSimd16FP32Ty, 4);
+        mSimd16VectorTRTy = ArrayType::get(mSimd16FP32Ty, 5);
 
-        mSimd32Int8Ty       = VectorType::get(mInt8Ty, 32);
+        mSimd32Int8Ty = VectorType::get(mInt8Ty, 32);
 
         if (sizeof(uint32_t*) == 4)
         {
-            mIntPtrTy = mInt32Ty;
-            mSimdIntPtrTy = mSimdInt32Ty;
+            mIntPtrTy       = mInt32Ty;
+            mSimdIntPtrTy   = mSimdInt32Ty;
             mSimd16IntPtrTy = mSimd16Int32Ty;
         }
         else
         {
             SWR_ASSERT(sizeof(uint32_t*) == 8);
 
-            mIntPtrTy = mInt64Ty;
-            mSimdIntPtrTy = mSimdInt64Ty;
+            mIntPtrTy       = mInt64Ty;
+            mSimdIntPtrTy   = mSimdInt64Ty;
             mSimd16IntPtrTy = mSimd16Int64Ty;
         }
     }
@@ -101,15 +99,15 @@
     {
         mVWidth = width;
 
-        mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth);
-        mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
-        mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
-        mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
-        mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
-        mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
-        mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
+        mSimdInt1Ty      = VectorType::get(mInt1Ty, mVWidth);
+        mSimdInt16Ty     = VectorType::get(mInt16Ty, mVWidth);
+        mSimdInt32Ty     = VectorType::get(mInt32Ty, mVWidth);
+        mSimdInt64Ty     = VectorType::get(mInt64Ty, mVWidth);
+        mSimdFP16Ty      = VectorType::get(mFP16Ty, mVWidth);
+        mSimdFP32Ty      = VectorType::get(mFP32Ty, mVWidth);
+        mSimdVectorTy    = ArrayType::get(mSimdFP32Ty, 4);
         mSimdVectorIntTy = ArrayType::get(mSimdInt32Ty, 4);
-        mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+        mSimdVectorTRTy  = ArrayType::get(mSimdFP32Ty, 5);
     }
 
     /// @brief Mark this alloca as temporary to avoid hoisting later on
@@ -128,4 +126,91 @@
 
         return (pAlloca->getMetadata("is_temp_alloca") != nullptr);
     }
-}
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Tags the first call instruction reachable from inst with named metadata.
+    ///        inst itself is checked first; otherwise its operands are searched depth-first.
+    /// @param inst - instruction at which the search starts
+    /// @param mdName - used both as the metadata kind name and as the MDString payload
+    /// @return true if a call instruction was found and marked, false otherwise
+    bool Builder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName)
+    {
+        if (CallInst* pCall = dyn_cast<CallInst>(inst))
+        {
+            MDNode* pNode = MDNode::get(JM()->mContext, MDString::get(JM()->mContext, mdName));
+            pCall->setMetadata(mdName, pNode);
+            return true;
+        }
+
+        // Not a call: follow the use-def chain back up through every operand that is
+        // itself an instruction, stopping at the first path that reaches a call.
+        for (Use& operand : inst->operands())
+        {
+            Instruction* pSrc = dyn_cast<Instruction>(operand.get());
+            if (pSrc && SetNamedMetaDataOnCallInstr(pSrc, mdName))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// @brief Reports whether inst is a call instruction carrying metadata named mdName.
+    ///        Only inst itself is inspected; its operands are not searched.
+    bool Builder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName)
+    {
+        if (CallInst* pCall = dyn_cast<CallInst>(inst))
+        {
+            return pCall->getMetadata(mdName) != nullptr;
+        }
+
+        return false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Packetizes the type. Assumes SOA conversion.
+    ///        Arrays, structs and pointers to arrays are rebuilt from packetized members.
+    /// @param pType - type to packetize; vector types are returned unchanged
+    /// @return the packetized type
+    Type* Builder::GetVectorType(Type* pType)
+    {
+        // Already a vector type: pass it through untouched.
+        if (pType->isVectorTy())
+        {
+            return pType;
+        }
+
+        // [N x float] should packetize to [N x <W x float>]
+        if (pType->isArrayTy())
+        {
+            uint32_t numElems = pType->getArrayNumElements();
+            Type*    pVecElem = GetVectorType(pType->getArrayElementType());
+            return ArrayType::get(pVecElem, numElems);
+        }
+
+        // {float,int} should packetize to {<W x float>, <W x int>}
+        // Array types already returned above, so the struct accessors are used here.
+        if (pType->isAggregateType())
+        {
+            SmallVector<Type*, 8> members;
+            uint32_t              count = pType->getStructNumElements();
+            for (uint32_t idx = 0; idx != count; ++idx)
+            {
+                members.push_back(GetVectorType(pType->getStructElementType(idx)));
+            }
+            return StructType::get(JM()->mContext, members);
+        }
+
+        // [N x float]* should packetize to [N x <W x float>]*
+        if (pType->isPointerTy() && pType->getPointerElementType()->isArrayTy())
+        {
+            Type* pVecPointee = GetVectorType(pType->getPointerElementType());
+            return PointerType::get(pVecPointee, pType->getPointerAddressSpace());
+        }
+
+        // <ty> should packetize to <W x <ty>>
+        // W is the vector width held by the JIT manager (JM()->mVWidth).
+        return VectorType::get(pType, JM()->mVWidth);
+    }
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index e2ad1e8..a047f2a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder.h
-* 
-* @brief Includes all the builder related functionality
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder.h
+ *
+ * @brief Includes all the builder related functionality
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "JitManager.h"
@@ -37,90 +37,122 @@
     ///@todo Move this to better place
     enum SHADER_STATS_COUNTER_TYPE
     {
-        STATS_INST_EXECUTED = 0,
-        STATS_SAMPLE_EXECUTED = 1,
-        STATS_SAMPLE_L_EXECUTED = 2,
-        STATS_SAMPLE_B_EXECUTED = 3,
-        STATS_SAMPLE_C_EXECUTED = 4,
-        STATS_SAMPLE_C_LZ_EXECUTED = 5,
-        STATS_SAMPLE_C_D_EXECUTED = 6,
-        STATS_LOD_EXECUTED = 7,
-        STATS_GATHER4_EXECUTED = 8,
-        STATS_GATHER4_C_EXECUTED = 9,
-        STATS_GATHER4_C_PO_EXECUTED = 10,
+        STATS_INST_EXECUTED           = 0,
+        STATS_SAMPLE_EXECUTED         = 1,
+        STATS_SAMPLE_L_EXECUTED       = 2,
+        STATS_SAMPLE_B_EXECUTED       = 3,
+        STATS_SAMPLE_C_EXECUTED       = 4,
+        STATS_SAMPLE_C_LZ_EXECUTED    = 5,
+        STATS_SAMPLE_C_D_EXECUTED     = 6,
+        STATS_LOD_EXECUTED            = 7,
+        STATS_GATHER4_EXECUTED        = 8,
+        STATS_GATHER4_C_EXECUTED      = 9,
+        STATS_GATHER4_C_PO_EXECUTED   = 10,
         STATS_GATHER4_C_PO_C_EXECUTED = 11,
-        STATS_LOAD_RAW_UAV = 12,
-        STATS_LOAD_RAW_RESOURCE = 13,
-        STATS_STORE_RAW_UAV = 14,
-        STATS_STORE_TGSM = 15,
-        STATS_DISCARD = 16,
-        STATS_BARRIER = 17,
+        STATS_LOAD_RAW_UAV            = 12,
+        STATS_LOAD_RAW_RESOURCE       = 13,
+        STATS_STORE_RAW_UAV           = 14,
+        STATS_STORE_TGSM              = 15,
+        STATS_DISCARD                 = 16,
+        STATS_BARRIER                 = 17,
     };
 
     using namespace llvm;
     struct Builder
     {
-        Builder(JitManager *pJitMgr);
+        Builder(JitManager* pJitMgr);
         virtual ~Builder() {}
 
-        IRBuilder<> *IRB() { return mpIRBuilder; };
-        JitManager *JM() { return mpJitMgr; }
+        IRBuilder<>* IRB() { return mpIRBuilder; };
+        JitManager*  JM() { return mpJitMgr; }
 
-        JitManager *mpJitMgr;
-        IRBuilder<> *mpIRBuilder;
+        JitManager*  mpJitMgr;
+        IRBuilder<>* mpIRBuilder;
 
-        uint32_t             mVWidth;   // vector width target simd
-        uint32_t             mVWidth16; // vector width simd16
+        uint32_t mVWidth;   // vector width target simd
+        uint32_t mVWidth16; // vector width simd16
 
         // Built in types: scalar
 
-        Type*                mVoidTy;
-        Type*                mInt1Ty;
-        Type*                mInt8Ty;
-        Type*                mInt16Ty;
-        Type*                mInt32Ty;
-        Type*                mInt64Ty;
-        Type*                mIntPtrTy;
-        Type*                mFP16Ty;
-        Type*                mFP32Ty;
-        Type*                mFP32PtrTy;
-        Type*                mDoubleTy;
-        Type*                mInt8PtrTy;
-        Type*                mInt16PtrTy;
-        Type*                mInt32PtrTy;
+        Type* mVoidTy;
+        Type* mInt1Ty;
+        Type* mInt8Ty;
+        Type* mInt16Ty;
+        Type* mInt32Ty;
+        Type* mInt64Ty;
+        Type* mIntPtrTy;
+        Type* mFP16Ty;
+        Type* mFP32Ty;
+        Type* mFP32PtrTy;
+        Type* mDoubleTy;
+        Type* mInt8PtrTy;
+        Type* mInt16PtrTy;
+        Type* mInt32PtrTy;
 
-        Type*                mSimd4FP64Ty;
+        Type* mSimd4FP64Ty;
 
         // Built in types: target SIMD
 
-        Type*                mSimdFP16Ty;
-        Type*                mSimdFP32Ty;
-        Type*                mSimdInt1Ty;
-        Type*                mSimdInt16Ty;
-        Type*                mSimdInt32Ty;
-        Type*                mSimdInt64Ty;
-        Type*                mSimdIntPtrTy;
-        Type*                mSimdVectorTy;
-        Type*                mSimdVectorTRTy;
-        Type*                mSimdVectorIntTy;
+        Type* mSimdFP16Ty;
+        Type* mSimdFP32Ty;
+        Type* mSimdInt1Ty;
+        Type* mSimdInt16Ty;
+        Type* mSimdInt32Ty;
+        Type* mSimdInt64Ty;
+        Type* mSimdIntPtrTy;
+        Type* mSimdVectorTy;
+        Type* mSimdVectorTRTy;
+        Type* mSimdVectorIntTy;
 
         // Built in types: simd16
 
-        Type*                mSimd16FP16Ty;
-        Type*                mSimd16FP32Ty;
-        Type*                mSimd16Int1Ty;
-        Type*                mSimd16Int16Ty;
-        Type*                mSimd16Int32Ty;
-        Type*                mSimd16Int64Ty;
-        Type*                mSimd16IntPtrTy;
-        Type*                mSimd16VectorTy;
-        Type*                mSimd16VectorTRTy;
+        Type* mSimd16FP16Ty;
+        Type* mSimd16FP32Ty;
+        Type* mSimd16Int1Ty;
+        Type* mSimd16Int16Ty;
+        Type* mSimd16Int32Ty;
+        Type* mSimd16Int64Ty;
+        Type* mSimd16IntPtrTy;
+        Type* mSimd16VectorTy;
+        Type* mSimd16VectorTRTy;
 
-        Type*                mSimd32Int8Ty;
+        Type* mSimd32Int8Ty;
 
-        void SetTargetWidth(uint32_t width);
-        void SetTempAlloca(Value* inst);
-        bool IsTempAlloca(Value* inst);
+        void  SetTargetWidth(uint32_t width);
+        void  SetTempAlloca(Value* inst);
+        bool  IsTempAlloca(Value* inst);
+        bool  SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName);
+        bool  HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName);
+        Type* GetVectorType(Type* pType);
+        // Writes val (as i32) to operand 0 of named metadata s, appending it if s is empty.
+        void  SetMetadata(StringRef s, uint32_t val)
+        {
+            llvm::NamedMDNode* pNamed = mpJitMgr->mpCurrentModule->getOrInsertNamedMetadata(s);
+            llvm::Metadata*    pConst = llvm::ConstantAsMetadata::get(mpIRBuilder->getInt32(val));
+            llvm::MDNode* pNode = llvm::MDNode::get(mpJitMgr->mpCurrentModule->getContext(), pConst);
+            if (pNamed->getNumOperands() == 0)
+            {
+                pNamed->addOperand(pNode);
+            }
+            else
+            {
+                pNamed->setOperand(0, pNode);
+            }
+        }
+        // Returns the i32 stored under s, or 0 if it is missing, empty, or not a ConstantInt.
+        uint32_t GetMetadata(StringRef s)
+        {
+            NamedMDNode* metaData = mpJitMgr->mpCurrentModule->getNamedMetadata(s);
+            if (!metaData || metaData->getNumOperands() == 0)
+            {
+                return 0;
+            }
+            MDNode*   mdNode = metaData->getOperand(0);
+            Metadata* val    = mdNode->getNumOperands() ? mdNode->getOperand(0).get() : nullptr;
+            // dyn_extract_or_null tolerates a null or non-constant operand.
+            ConstantInt* pInt = mdconst::dyn_extract_or_null<ConstantInt>(val);
+            return pInt ? static_cast<uint32_t>(pInt->getZExtValue()) : 0;
+        }
 
 #include "gen_builder.hpp"
 #include "gen_builder_meta.hpp"
@@ -130,17 +162,15 @@
 #include "builder_mem.h"
 
     protected:
-
-        void SetPrivateContext(Value* pPrivateContext) 
-        { 
-            mpPrivateContext = pPrivateContext; 
+        void SetPrivateContext(Value* pPrivateContext)
+        {
+            mpPrivateContext = pPrivateContext;
             NotifyPrivateContextSet();
         }
-        virtual void NotifyPrivateContextSet() {}
+        virtual void  NotifyPrivateContextSet() {}
         inline Value* GetPrivateContext() { return mpPrivateContext; }
 
-    private: 
+    private:
         Value* mpPrivateContext;
-
     };
-}
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
index 6ecd969..c68f3b9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file builder_gfx_mem.cpp
-*
-* @brief Definition of the gfx mem builder
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_gfx_mem.cpp
+ *
+ * @brief Definition of the gfx mem builder
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder.h"
 #include "common/rdtsc_buckets.h"
@@ -37,12 +37,12 @@
 {
     using namespace llvm;
 
-    BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) :
-        Builder(pJitMgr)
+    BuilderGfxMem::BuilderGfxMem(JitManager* pJitMgr) : Builder(pJitMgr)
     {
-        mpTranslationFuncTy = nullptr;
-        mpfnTranslateGfxAddress = nullptr;
-        mpParamSimDC = nullptr;
+        mpTranslationFuncTy             = nullptr;
+        mpfnTranslateGfxAddressForRead  = nullptr;
+        mpfnTranslateGfxAddressForWrite = nullptr;
+        mpParamSimDC                    = nullptr;
 
     }
 
@@ -52,20 +52,51 @@
 
     void BuilderGfxMem::AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage)
     {
-        SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT_INTERNAL), "Internal memory should not be gfxptr_t.");
+        SWR_ASSERT(!(ptr->getType() == mInt64Ty && usage == MEM_CLIENT_INTERNAL),
+                   "Internal memory should not be gfxptr_t.");
     }
 
+
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not  
+    /// @brief Generate a masked gather operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it with loads
     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
     /// @param pBase - Int8* base VB address pointer value
     /// @param vIndices - SIMD wide value of VB byte offsets
     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
     /// @param scale - value to scale indices by
-    Value *BuilderGfxMem::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+    Value* BuilderGfxMem::GATHERPS(Value*         vSrc,
+                                   Value*         pBase,
+                                   Value*         vIndices,
+                                   Value*         vMask,
+                                   uint8_t        scale,
+                                   JIT_MEM_CLIENT usage)
     {
-        Value *vGather;
+        // address may be coming in as 64bit int now so get the pointer
+        if (pBase->getType() == mInt64Ty)
+        {
+            pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
+        }
+
+        Value* vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale);
+        return vGather;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Generate a masked gather operation in LLVM IR.  If not
+    /// supported on the underlying platform, emulate it with loads
+    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
+    /// @param pBase - Int8* base VB address pointer value
+    /// @param vIndices - SIMD wide value of VB byte offsets
+    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
+    /// @param scale - value to scale indices by
+    Value* BuilderGfxMem::GATHERDD(Value*         vSrc,
+                                   Value*         pBase,
+                                   Value*         vIndices,
+                                   Value*         vMask,
+                                   uint8_t        scale,
+                                   JIT_MEM_CLIENT usage)
+    {
 
         // address may be coming in as 64bit int now so get the pointer
         if (pBase->getType() == mInt64Ty)
@@ -73,66 +104,58 @@
             pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
         }
 
-        vGather = Builder::GATHERPS(vSrc, pBase, vIndices, vMask, scale);
+        Value* vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale);
         return vGather;
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not  
-    /// supported on the underlying platform, emulate it with loads
-    /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
-    /// @param pBase - Int8* base VB address pointer value
-    /// @param vIndices - SIMD wide value of VB byte offsets
-    /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
-    /// @param scale - value to scale indices by
-    Value *BuilderGfxMem::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+    void BuilderGfxMem::SCATTERPS(
+        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
     {
-        Value* vGather = VIMMED1(0.0f);
-
 
         // address may be coming in as 64bit int now so get the pointer
-        if (pBase->getType() == mInt64Ty)
+        if (pDst->getType() == mInt64Ty)
         {
-            pBase = INT_TO_PTR(pBase, PointerType::get(mInt8Ty, 0));
+            pDst = INT_TO_PTR(pDst, PointerType::get(mInt8Ty, 0));
         }
 
-        vGather = Builder::GATHERDD(vSrc, pBase, vIndices, vMask, scale);
-        return vGather;
+        Builder::SCATTERPS(pDst, vSrc, vOffsets, vMask, usage);
     }
 
 
-    Value* BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
+    Value* BuilderGfxMem::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
     {
         return ADD(base, offset);
     }
-    
-    Value *BuilderGfxMem::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name)
+
+    Value* BuilderGfxMem::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
     {
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::GEP(Ptr, Idx, nullptr, Name);
     }
 
-    Value *BuilderGfxMem::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name)
+    Value* BuilderGfxMem::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
     {
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::GEP(Ty, Ptr, Idx, Name);
     }
 
-    Value *BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<Value*> &indexList, Type *Ty)
+    Value* BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
     {
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::GEP(Ptr, indexList);
     }
 
-    Value *BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty)
+    Value*
+    BuilderGfxMem::GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
     {
         Ptr = TranslationHelper(Ptr, Ty);
         return Builder::GEP(Ptr, indexList);
     }
 
-    Value* BuilderGfxMem::TranslationHelper(Value *Ptr, Type *Ty)
+    Value* BuilderGfxMem::TranslationHelper(Value* Ptr, Type* Ty)
     {
-        SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr), "Access of GFX pointers must have non-null type specified.");
+        SWR_ASSERT(!(Ptr->getType() == mInt64Ty && Ty == nullptr),
+                   "Access of GFX pointers must have non-null type specified.");
 
 
         // address may be coming in as 64bit int now so get the pointer
@@ -144,7 +167,7 @@
         return Ptr;
     }
 
-    LoadInst* BuilderGfxMem::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
@@ -152,7 +175,7 @@
         return Builder::LOAD(Ptr, Name);
     }
 
-    LoadInst* BuilderGfxMem::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
@@ -160,15 +183,9 @@
         return Builder::LOAD(Ptr, Name);
     }
 
-    LoadInst* BuilderGfxMem::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
-    {
-        AssertGFXMemoryParams(Ptr, usage);
 
-        Ptr = TranslationHelper(Ptr, Ty);
-        return Builder::LOAD(Ty, Ptr, Name);
-    }
-    
-    LoadInst* BuilderGfxMem::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* BuilderGfxMem::LOAD(
+        Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
@@ -176,19 +193,43 @@
         return Builder::LOAD(Ptr, isVolatile, Name);
     }
 
-    LoadInst *BuilderGfxMem::LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* BuilderGfxMem::LOAD(Value*                                 BasePtr,
+                                  const std::initializer_list<uint32_t>& offset,
+                                  const llvm::Twine&                     name,
+                                  Type*                                  Ty,
+                                  JIT_MEM_CLIENT                         usage)
     {
         AssertGFXMemoryParams(BasePtr, usage);
 
-        // This call is just a pass through to the base class.
-        // It needs to be here to compile due to the combination of virtual overrides and signature overloads.
-        // It doesn't do anything meaningful because the implementation in the base class is going to call 
-        // another version of LOAD inside itself where the actual per offset translation will take place 
-        // and we can't just translate the BasePtr once, each address needs individual translation.
-        return Builder::LOAD(BasePtr, offset, name, Ty, usage);
+        bool bNeedTranslation = false;
+        if (BasePtr->getType() == mInt64Ty)
+        {
+            SWR_ASSERT(Ty);
+            BasePtr          = INT_TO_PTR(BasePtr, Ty, name);
+            bNeedTranslation = true;
+        }
+        std::vector<Value*> valIndices;
+        for (auto i : offset)
+        {
+            valIndices.push_back(C(i));
+        }
+        BasePtr = Builder::GEPA(BasePtr, valIndices, name);
+        if (bNeedTranslation)
+        {
+            BasePtr = PTR_TO_INT(BasePtr, mInt64Ty, name);
+        }
+
+        return LOAD(BasePtr, name, Ty, usage);
     }
 
-    CallInst* BuilderGfxMem::MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
+
+    CallInst* BuilderGfxMem::MASKED_LOAD(Value*         Ptr,
+                                         unsigned       Align,
+                                         Value*         Mask,
+                                         Value*         PassThru,
+                                         const Twine&   Name,
+                                         Type*          Ty,
+                                         JIT_MEM_CLIENT usage)
     {
         AssertGFXMemoryParams(Ptr, usage);
 
@@ -196,8 +237,28 @@
         return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
     }
 
-    Value* BuilderGfxMem::TranslateGfxAddress(Value* xpGfxAddress)
+    Value* BuilderGfxMem::TranslateGfxAddressForRead(Value*       xpGfxAddress,
+                                                     Type*        PtrTy,
+                                                     const Twine& Name,
+                                                     JIT_MEM_CLIENT /* usage */)
     {
-        return INT_TO_PTR(xpGfxAddress, PointerType::get(mInt8Ty, 0));
+        if (PtrTy == nullptr)
+        {
+            PtrTy = mInt8PtrTy;
+        }
+        return INT_TO_PTR(xpGfxAddress, PtrTy, Name);
     }
-}
+
+    Value* BuilderGfxMem::TranslateGfxAddressForWrite(Value*       xpGfxAddress,
+                                                      Type*        PtrTy,
+                                                      const Twine& Name,
+                                                      JIT_MEM_CLIENT /* usage */)
+    {
+        if (PtrTy == nullptr)
+        {
+            PtrTy = mInt8PtrTy;
+        }
+        return INT_TO_PTR(xpGfxAddress, PtrTy, Name);
+    }
+
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
index f8ec0ac..aefbbef 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file builder_gfx_mem.h
-*
-* @brief Definition of the builder to support different translation types for gfx memory access
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_gfx_mem.h
+ *
+ * @brief Definition of the builder to support different translation types for gfx memory access
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "builder.h"
@@ -41,44 +41,90 @@
         BuilderGfxMem(JitManager* pJitMgr);
         virtual ~BuilderGfxMem() {}
 
-        virtual Value *GEP(Value *Ptr, Value *Idx, Type *Ty = nullptr, const Twine &Name = "");
-        virtual Value *GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "");
-        virtual Value *GEP(Value* Ptr, const std::initializer_list<Value*> &indexList, Type *Ty = nullptr);
-        virtual Value *GEP(Value* Ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty = nullptr);
+        virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, const Twine& Name = "");
+        virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = "");
+        virtual Value*
+        GEP(Value* Ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr);
+        virtual Value*
+        GEP(Value* Ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr);
 
-        virtual LoadInst* LOAD(Value *Ptr, const char *Name, Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value *Ptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Type *Ty, Value *Ptr, const Twine &Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value *Ptr, bool isVolatile, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-        virtual LoadInst* LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual LoadInst* LOAD(Value*         Ptr,
+                               const char*    Name,
+                               Type*          Ty    = nullptr,
+                               JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual LoadInst* LOAD(Value*         Ptr,
+                               const Twine&   Name  = "",
+                               Type*          Ty    = nullptr,
+                               JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual LoadInst* LOAD(Value*         Ptr,
+                               bool           isVolatile,
+                               const Twine&   Name  = "",
+                               Type*          Ty    = nullptr,
+                               JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual LoadInst* LOAD(Value*                                 BasePtr,
+                               const std::initializer_list<uint32_t>& offset,
+                               const llvm::Twine&                     Name  = "",
+                               Type*                                  Ty    = nullptr,
+                               JIT_MEM_CLIENT                         usage = MEM_CLIENT_INTERNAL);
 
-        virtual CallInst* MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = nullptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-        virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual CallInst* MASKED_LOAD(Value*         Ptr,
+                                      unsigned       Align,
+                                      Value*         Mask,
+                                      Value*         PassThru = nullptr,
+                                      const Twine&   Name     = "",
+                                      Type*          Ty       = nullptr,
+                                      JIT_MEM_CLIENT usage    = MEM_CLIENT_INTERNAL);
 
-        virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual Value* GATHERPS(Value*         src,
+                                Value*         pBase,
+                                Value*         indices,
+                                Value*         mask,
+                                uint8_t        scale = 1,
+                                JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        virtual Value* GATHERDD(Value*         src,
+                                Value*         pBase,
+                                Value*         indices,
+                                Value*         mask,
+                                uint8_t        scale = 1,
+                                JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-        Value* TranslateGfxAddress(Value* xpGfxAddress);
+        virtual void SCATTERPS(Value*         pDst,
+                               Value*         vSrc,
+                               Value*         vOffsets,
+                               Value*         vMask,
+                               JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+
+
+        Value* TranslateGfxAddressForRead(Value*         xpGfxAddress,
+                                          Type*          PtrTy = nullptr,
+                                          const Twine&   Name  = "",
+                                          JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+        Value* TranslateGfxAddressForWrite(Value*         xpGfxAddress,
+                                           Type*          PtrTy = nullptr,
+                                           const Twine&   Name  = "",
+                                           JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
 
     protected:
-
         void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
-            
+
         virtual void NotifyPrivateContextSet();
 
-        virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset);
+        virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
 
-        Value* TranslationHelper(Value *Ptr, Type *Ty);
+        Value* TranslationHelper(Value* Ptr, Type* Ty);
 
         FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
-        Value* GetTranslationFunction() { return mpfnTranslateGfxAddress; }
-        Value* GetParamSimDC() { return mpParamSimDC; }
+        Value*        GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
+        Value*        GetTranslationFunctionForWrite() { return mpfnTranslateGfxAddressForWrite; }
+        Value*        GetParamSimDC() { return mpParamSimDC; }
+
 
     private:
-
         FunctionType* mpTranslationFuncTy;
-        Value* mpfnTranslateGfxAddress;
-        Value* mpParamSimDC;
+        Value*        mpfnTranslateGfxAddressForRead;
+        Value*        mpfnTranslateGfxAddressForWrite;
+        Value*        mpParamSimDC;
     };
-}
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
index 92867ec..02aa6f9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder_math.h
-* 
-* @brief math/alu builder functions
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_math.h
+ *
+ * @brief math/alu builder functions
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 Value* VLOG2PS(Value* src);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 5b70b29..94489f1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -1,57 +1,58 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file builder_misc.cpp
-*
-* @brief Implementation for miscellaneous builder functions
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_mem.cpp
+ *
+ * @brief Implementation for memory builder functions
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder.h"
 #include "common/rdtsc_buckets.h"
 
 #include <cstdarg>
 
-
 namespace SwrJit
 {
     void Builder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage)
     {
-        SWR_ASSERT(ptr->getType() != mInt64Ty, "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
+        SWR_ASSERT(
+            ptr->getType() != mInt64Ty,
+            "Address appears to be GFX access.  Requires translation through BuilderGfxMem.");
     }
 
-    Value *Builder::GEP(Value *Ptr, Value *Idx, Type *Ty, const Twine &Name)
+    Value* Builder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name)
     {
         return IRB()->CreateGEP(Ptr, Idx, Name);
     }
 
-    Value *Builder::GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name)
+    Value* Builder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name)
     {
         return IRB()->CreateGEP(Ty, Ptr, Idx, Name);
     }
 
-    Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty)
+    Value* Builder::GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty)
     {
         std::vector<Value*> indices;
         for (auto i : indexList)
@@ -59,7 +60,7 @@
         return GEPA(ptr, indices);
     }
 
-    Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty)
+    Value* Builder::GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty)
     {
         std::vector<Value*> indices;
         for (auto i : indexList)
@@ -67,17 +68,17 @@
         return GEPA(ptr, indices);
     }
 
-    Value *Builder::GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
+    Value* Builder::GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
     {
         return IRB()->CreateGEP(Ptr, IdxList, Name);
     }
 
-    Value *Builder::GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name)
+    Value* Builder::GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name)
     {
         return IRB()->CreateGEP(Ty, Ptr, IdxList, Name);
     }
 
-    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
+    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList)
     {
         std::vector<Value*> indices;
         for (auto i : indexList)
@@ -85,7 +86,7 @@
         return IN_BOUNDS_GEP(ptr, indices);
     }
 
-    Value *Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList)
+    Value* Builder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList)
     {
         std::vector<Value*> indices;
         for (auto i : indexList)
@@ -93,31 +94,36 @@
         return IN_BOUNDS_GEP(ptr, indices);
     }
 
-    LoadInst* Builder::LOAD(Value *Ptr, const char *Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* Builder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(Ptr, usage);
         return IRB()->CreateLoad(Ptr, Name);
     }
 
-    LoadInst* Builder::LOAD(Value *Ptr, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* Builder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(Ptr, usage);
         return IRB()->CreateLoad(Ptr, Name);
     }
 
-    LoadInst* Builder::LOAD(Type *Ty, Value *Ptr, const Twine &Name, JIT_MEM_CLIENT usage)
+    LoadInst* Builder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(Ptr, usage);
         return IRB()->CreateLoad(Ty, Ptr, Name);
     }
 
-    LoadInst* Builder::LOAD(Value *Ptr, bool isVolatile, const Twine &Name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst*
+    Builder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(Ptr, usage);
         return IRB()->CreateLoad(Ptr, isVolatile, Name);
     }
 
-    LoadInst *Builder::LOAD(Value *basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name, Type *Ty, JIT_MEM_CLIENT usage)
+    LoadInst* Builder::LOAD(Value*                                 basePtr,
+                            const std::initializer_list<uint32_t>& indices,
+                            const llvm::Twine&                     name,
+                            Type*                                  Ty,
+                            JIT_MEM_CLIENT                         usage)
     {
         std::vector<Value*> valIndices;
         for (auto i : indices)
@@ -125,7 +131,9 @@
         return Builder::LOAD(GEPA(basePtr, valIndices), name);
     }
 
-    LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+    LoadInst* Builder::LOADV(Value*                               basePtr,
+                             const std::initializer_list<Value*>& indices,
+                             const llvm::Twine&                   name)
     {
         std::vector<Value*> valIndices;
         for (auto i : indices)
@@ -133,7 +141,8 @@
         return LOAD(GEPA(basePtr, valIndices), name);
     }
 
-    StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+    StoreInst*
+    Builder::STORE(Value* val, Value* basePtr, const std::initializer_list<uint32_t>& indices)
     {
         std::vector<Value*> valIndices;
         for (auto i : indices)
@@ -141,7 +150,8 @@
         return STORE(val, GEPA(basePtr, valIndices));
     }
 
-    StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+    StoreInst*
+    Builder::STOREV(Value* val, Value* basePtr, const std::initializer_list<Value*>& indices)
     {
         std::vector<Value*> valIndices;
         for (auto i : indices)
@@ -149,27 +159,35 @@
         return STORE(val, GEPA(basePtr, valIndices));
     }
 
-    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
+    Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset)
     {
         return GEP(base, offset);
     }
 
-    Value* Builder::MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+    Value* Builder::MEM_ADD(Value*                                 i32Incr,
+                            Value*                                 basePtr,
+                            const std::initializer_list<uint32_t>& indices,
+                            const llvm::Twine&                     name)
     {
-        Value* i32Value = LOAD(GEP(basePtr, indices), name);
+        Value* i32Value  = LOAD(GEP(basePtr, indices), name);
         Value* i32Result = ADD(i32Value, i32Incr);
         return STORE(i32Result, GEP(basePtr, indices));
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not  
+    /// @brief Generate a masked gather operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it with loads
     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
     /// @param pBase - Int8* base VB address pointer value
     /// @param vIndices - SIMD wide value of VB byte offsets
     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
     /// @param scale - value to scale indices by
-    Value *Builder::GATHERPS(Value *vSrc, Value *pBase, Value *vIndices, Value *vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+    Value* Builder::GATHERPS(Value*         vSrc,
+                             Value*         pBase,
+                             Value*         vIndices,
+                             Value*         vMask,
+                             uint8_t        scale,
+                             JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(pBase, usage);
 
@@ -177,14 +195,19 @@
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a masked gather operation in LLVM IR.  If not  
+    /// @brief Generate a masked gather operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it with loads
     /// @param vSrc - SIMD wide value that will be loaded if mask is invalid
     /// @param pBase - Int8* base VB address pointer value
     /// @param vIndices - SIMD wide value of VB byte offsets
     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
     /// @param scale - value to scale indices by
-    Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale, JIT_MEM_CLIENT usage)
+    Value* Builder::GATHERDD(Value*         vSrc,
+                             Value*         pBase,
+                             Value*         vIndices,
+                             Value*         vMask,
+                             uint8_t        scale,
+                             JIT_MEM_CLIENT usage)
     {
         AssertMemoryUsageParams(pBase, usage);
 
@@ -199,7 +222,8 @@
     /// @param vIndices - SIMD wide value of VB byte offsets
     /// @param vMask - SIMD wide mask that controls whether to access memory or the src values
     /// @param scale - value to scale indices by
-    Value *Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
+    Value*
+    Builder::GATHERPD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, uint8_t scale)
     {
         return VGATHERPD(vSrc, pBase, vIndices, vMask, C(scale));
     }
@@ -214,10 +238,15 @@
         return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
     }
 
-    void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
-        Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
+    void Builder::Gather4(const SWR_FORMAT format,
+                          Value*           pSrcBase,
+                          Value*           byteOffsets,
+                          Value*           mask,
+                          Value*           vGatherComponents[],
+                          bool             bPackedOutput,
+                          JIT_MEM_CLIENT   usage)
     {
-        const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
         if (info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32)
         {
             GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput, usage);
@@ -228,8 +257,13 @@
         }
     }
 
-    void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-        Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
+    void Builder::GATHER4PS(const SWR_FORMAT_INFO& info,
+                            Value*                 pSrcBase,
+                            Value*                 byteOffsets,
+                            Value*                 vMask,
+                            Value*                 vGatherComponents[],
+                            bool                   bPackedOutput,
+                            JIT_MEM_CLIENT         usage)
     {
         switch (info.bpp / info.numComps)
         {
@@ -254,10 +288,11 @@
                 // offset base to the next components(zw) in the vertex to gather
                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
 
-                vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
+                vGatherResult[1] =
+                    GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                 // e.g. result of second 8x32bit integer gather for 16bit components
                 // 256i - 0    1    2    3    4    5    6    7
-                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                 //
             }
             else
@@ -282,7 +317,8 @@
                 uint32_t swizzleIndex = info.swizzle[i];
 
                 // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
+                vGatherComponents[swizzleIndex] = GATHERPS(
+                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
 
                 // offset base to the next component to gather
                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
@@ -295,18 +331,24 @@
         }
     }
 
-    void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-        Value* vMask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage)
+    void Builder::GATHER4DD(const SWR_FORMAT_INFO& info,
+                            Value*                 pSrcBase,
+                            Value*                 byteOffsets,
+                            Value*                 vMask,
+                            Value*                 vGatherComponents[],
+                            bool                   bPackedOutput,
+                            JIT_MEM_CLIENT         usage)
     {
         switch (info.bpp / info.numComps)
         {
         case 8:
         {
             Value* vGatherMaskedVal = VIMMED1((int32_t)0);
-            Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
+            Value* vGatherResult =
+                GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
             // e.g. result of an 8x32bit integer gather for 8bit components
             // 256i - 0    1    2    3    4    5    6    7
-            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+            //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
 
             Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
         }
@@ -332,10 +374,11 @@
                 // offset base to the next components(zw) in the vertex to gather
                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
 
-                vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
+                vGatherResult[1] =
+                    GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, 1, usage);
                 // e.g. result of second 8x32bit integer gather for 16bit components
                 // 256i - 0    1    2    3    4    5    6    7
-                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
+                //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
                 //
             }
             else
@@ -345,7 +388,6 @@
 
             // Shuffle gathered components into place, each row is a component
             Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput);
-
         }
         break;
         case 32:
@@ -361,7 +403,8 @@
                 uint32_t swizzleIndex = info.swizzle[i];
 
                 // Gather a SIMD of components
-                vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
+                vGatherComponents[swizzleIndex] = GATHERDD(
+                    vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, 1, usage);
 
                 // offset base to the next component to gather
                 pSrcBase = OFFSET_TO_NEXT_COMPONENT(pSrcBase, C((intptr_t)4));
@@ -374,29 +417,35 @@
         }
     }
 
-    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
+    void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
+                                      Value*                 vGatherInput[2],
+                                      Value*                 vGatherOutput[4],
+                                      bool                   bPackedOutput)
     {
         // cast types
         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
-                                                               // input could either be float or int vector; do shuffle work in int
+        // input could either be float or int vector; do shuffle work in int
         vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
         vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty);
 
         if (bPackedOutput)
         {
-            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+            Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
+                                              mVWidth / 4); // vwidth is units of 32 bits
 
-                                                                                                         // shuffle mask
-            Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
-            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
+            // shuffle mask
+            Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                         0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+            Value* vShufResult =
+                BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy);
             // after pshufb: group components together in each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            Value* vi128XY =
+                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -405,8 +454,10 @@
             Value* vi128ZW = nullptr;
             if (info.numComps > 2)
             {
-                Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+                Value* vShufResult =
+                    BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
+                vi128ZW =
+                    BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
             }
 
             for (uint32_t i = 0; i < 4; i++)
@@ -426,23 +477,23 @@
                 // if x or y, use vi128XY permute result, else use vi128ZW
                 Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
 
-                // extract packed component 128 bit lanes 
+                // extract packed component 128 bit lanes
                 vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane));
             }
-
         }
         else
         {
             // pshufb masks for each component
             Value* vConstMask[2];
             // x/z shuffle mask
-            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+            vConstMask[0] = C<char>({
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+            });
 
             // y/w shuffle mask
-            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
-
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
 
             // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
             // apply defaults
@@ -460,32 +511,41 @@
                 // if x or y, use vi128XY permute result, else use vi128ZW
                 uint32_t selectedGather = (i < 2) ? 0 : 1;
 
-                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                vGatherOutput[swizzleIndex] =
+                    BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty),
+                                   vConstMask[selectedMask]),
+                            vGatherTy);
                 // after pshufb mask for x channel; z uses the same shuffle from the second gather
                 // 256i - 0    1    2    3    4    5    6    7
-                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 
+                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
             }
         }
     }
 
-    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
+    void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
+                                     Value*                 vGatherInput,
+                                     Value*                 vGatherOutput[],
+                                     bool                   bPackedOutput)
     {
         // cast types
         Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-        Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+        Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
         if (bPackedOutput)
         {
-            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
-                                                                                                      // shuffle mask
-            Value* vConstMask = C<char>({ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
-                0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 });
-            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+            Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
+                                           mVWidth / 4); // vwidth is units of 32 bits
+            // shuffle mask
+            Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+                                         0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
+            Value* vShufResult =
+                BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
             // after pshufb: group components together in each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
 
-            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            Value* vi128XY =
+                BITCAST(VPERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
@@ -494,10 +554,12 @@
             Value* vi128ZW = nullptr;
             if (info.numComps > 2)
             {
-                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+                vi128ZW =
+                    BITCAST(VPERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
             }
 
-            // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+            // sign extend all enabled components. If we have a full vVertexElements, output to
+            // current simdvertex
             for (uint32_t i = 0; i < 4; i++)
             {
                 uint32_t swizzleIndex = info.swizzle[i];
@@ -520,7 +582,8 @@
             }
         }
         // else zero extend
-        else {
+        else
+        {
             // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
             // apply defaults
             for (uint32_t i = 0; i < 4; ++i)
@@ -528,7 +591,8 @@
                 vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]);
             }
 
-            for (uint32_t i = 0; i < info.numComps; i++) {
+            for (uint32_t i = 0; i < info.numComps; i++)
+            {
                 uint32_t swizzleIndex = info.swizzle[i];
 
                 // pshufb masks for each component
@@ -537,45 +601,53 @@
                 {
                 case 0:
                     // x shuffle mask
-                    vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
-                        0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
+                    vConstMask =
+                        C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                                 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                     break;
                 case 1:
                     // y shuffle mask
-                    vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
-                        1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
+                    vConstMask =
+                        C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                                 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                     break;
                 case 2:
                     // z shuffle mask
-                    vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
-                        2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
+                    vConstMask =
+                        C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                                 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                     break;
                 case 3:
                     // w shuffle mask
-                    vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
-                        3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
+                    vConstMask =
+                        C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                                 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                     break;
                 default:
                     vConstMask = nullptr;
                     break;
                 }
 
-                vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
+                vGatherOutput[swizzleIndex] =
+                    BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy);
                 // after pshufb for x channel
                 // 256i - 0    1    2    3    4    5    6    7
-                //        x000 x000 x000 x000 x000 x000 x000 x000 
+                //        x000 x000 x000 x000 x000 x000 x000 x000
             }
         }
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief emulates a scatter operation.
-    /// @param pDst - pointer to destination 
+    /// @param pDst - pointer to destination
     /// @param vSrc - vector of src data to scatter
     /// @param vOffsets - vector of byte offsets from pDst
     /// @param vMask - mask of valid lanes
-    void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
+    void Builder::SCATTERPS(
+        Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask, JIT_MEM_CLIENT usage)
     {
+        AssertMemoryUsageParams(pDst, usage);
+
         /* Scatter algorithm
 
         while(Index = BitScanForward(mask))
@@ -587,25 +659,25 @@
         */
 
         BasicBlock* pCurBB = IRB()->GetInsertBlock();
-        Function* pFunc = pCurBB->getParent();
-        Type* pSrcTy = vSrc->getType()->getVectorElementType();
+        Function*   pFunc  = pCurBB->getParent();
+        Type*       pSrcTy = vSrc->getType()->getVectorElementType();
 
         // Store vectors on stack
         if (pScatterStackSrc == nullptr)
         {
             // Save off stack allocations and reuse per scatter. Significantly reduces stack
             // requirements for shaders with a lot of scatters.
-            pScatterStackSrc = CreateEntryAlloca(pFunc, mSimdInt64Ty);
+            pScatterStackSrc     = CreateEntryAlloca(pFunc, mSimdInt64Ty);
             pScatterStackOffsets = CreateEntryAlloca(pFunc, mSimdInt32Ty);
         }
 
-        Value* pSrcArrayPtr = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
+        Value* pSrcArrayPtr     = BITCAST(pScatterStackSrc, PointerType::get(vSrc->getType(), 0));
         Value* pOffsetsArrayPtr = pScatterStackOffsets;
         STORE(vSrc, pSrcArrayPtr);
         STORE(vOffsets, pOffsetsArrayPtr);
 
         // Cast to pointers for random access
-        pSrcArrayPtr = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
+        pSrcArrayPtr     = POINTER_CAST(pSrcArrayPtr, PointerType::get(pSrcTy, 0));
         pOffsetsArrayPtr = POINTER_CAST(pOffsetsArrayPtr, PointerType::get(mInt32Ty, 0));
 
         Value* pMask = VMOVMSK(vMask);
@@ -618,33 +690,44 @@
 
         Value* pIsUndef = ICMP_EQ(pIndex, C(32));
 
-        // Split current block
-        BasicBlock* pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
+        // Split current block or create new one if building inline
+        BasicBlock* pPostLoop;
+        if (pCurBB->getTerminator())
+        {
+            pPostLoop = pCurBB->splitBasicBlock(cast<Instruction>(pIsUndef)->getNextNode());
 
-        // Remove unconditional jump created by splitBasicBlock
-        pCurBB->getTerminator()->eraseFromParent();
+            // Remove unconditional jump created by splitBasicBlock
+            pCurBB->getTerminator()->eraseFromParent();
 
-        // Add terminator to end of original block
-        IRB()->SetInsertPoint(pCurBB);
+            // Add terminator to end of original block
+            IRB()->SetInsertPoint(pCurBB);
 
-        // Add conditional branch
-        COND_BR(pIsUndef, pPostLoop, pLoop);
+            // Add conditional branch
+            COND_BR(pIsUndef, pPostLoop, pLoop);
+        }
+        else
+        {
+            pPostLoop = BasicBlock::Create(mpJitMgr->mContext, "PostScatter_Loop", pFunc);
+
+            // Add conditional branch
+            COND_BR(pIsUndef, pPostLoop, pLoop);
+        }
 
         // Add loop basic block contents
         IRB()->SetInsertPoint(pLoop);
         PHINode* pIndexPhi = PHI(mInt32Ty, 2);
-        PHINode* pMaskPhi = PHI(mInt32Ty, 2);
+        PHINode* pMaskPhi  = PHI(mInt32Ty, 2);
 
         pIndexPhi->addIncoming(pIndex, pCurBB);
         pMaskPhi->addIncoming(pMask, pCurBB);
 
         // Extract elements for this index
-        Value* pSrcElem = LOADV(pSrcArrayPtr, { pIndexPhi });
-        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, { pIndexPhi });
+        Value* pSrcElem    = LOADV(pSrcArrayPtr, {pIndexPhi});
+        Value* pOffsetElem = LOADV(pOffsetsArrayPtr, {pIndexPhi});
 
         // GEP to this offset in dst
-        Value* pCurDst = GEP(pDst, pOffsetElem);
-        pCurDst = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
+        Value* pCurDst = GEP(pDst, pOffsetElem, mInt8PtrTy);
+        pCurDst        = POINTER_CAST(pCurDst, PointerType::get(pSrcTy, 0));
         STORE(pSrcElem, pCurDst);
 
         // Update the mask
@@ -663,4 +746,4 @@
         // Move builder to beginning of post loop
         IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
     }
-}
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
index 9ccac4f..15def96 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -1,100 +1,157 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file builder_misc.h
-*
-* @brief miscellaneous builder functions
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_mem.h
+ *
+ * @brief memory access builder functions (GEP, load/store, gather/scatter)
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 public:
-
 typedef enum _JIT_MEM_CLIENT
 {
     MEM_CLIENT_INTERNAL,
     GFX_MEM_CLIENT_FETCH,
-    GFX_MEM_CLIENT_SAMPLER
+    GFX_MEM_CLIENT_SAMPLER,
+    GFX_MEM_CLIENT_SHADER,
 } JIT_MEM_CLIENT;
 
 protected:
-
-virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset);
-void AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage);
+virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);
+void           AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage);
 
 public:
+virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, const Twine& Name = "");
+virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = "");
+virtual Value* GEP(Value* ptr, const std::initializer_list<Value*>& indexList, Type* Ty = nullptr);
+virtual Value*
+GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList, Type* Ty = nullptr);
 
-virtual Value *GEP(Value *Ptr, Value *Idx, Type *Ty = nullptr, const Twine &Name = "");
-virtual Value *GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "");
-virtual Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList, Type *Ty = nullptr);
-virtual Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList, Type *Ty = nullptr);
+Value* GEPA(Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = "");
+Value* GEPA(Type* Ty, Value* Ptr, ArrayRef<Value*> IdxList, const Twine& Name = "");
 
-Value *GEPA(Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "");
-Value *GEPA(Type *Ty, Value *Ptr, ArrayRef<Value *> IdxList, const Twine &Name = "");
+Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*>& indexList);
+Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t>& indexList);
 
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
-Value *IN_BOUNDS_GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
+virtual LoadInst*
+                  LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual LoadInst* LOAD(Value*         Ptr,
+                       const Twine&   Name  = "",
+                       Type*          Ty    = nullptr,
+                       JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual LoadInst*
+                  LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual LoadInst* LOAD(Value*         Ptr,
+                       bool           isVolatile,
+                       const Twine&   Name  = "",
+                       Type*          Ty    = nullptr,
+                       JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual LoadInst* LOAD(Value*                                 BasePtr,
+                       const std::initializer_list<uint32_t>& offset,
+                       const llvm::Twine&                     Name  = "",
+                       Type*                                  Ty    = nullptr,
+                       JIT_MEM_CLIENT                         usage = MEM_CLIENT_INTERNAL);
 
-virtual LoadInst* LOAD(Value *Ptr, const char *Name, Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value *Ptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Type *Ty, Value *Ptr, const Twine &Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value *Ptr, bool isVolatile, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-virtual LoadInst* LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
-
-virtual CallInst* MASKED_LOAD(Value *Ptr, unsigned Align, Value *Mask, Value *PassThru = nullptr, const Twine &Name = "", Type *Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL)
+virtual CallInst* MASKED_LOAD(Value*         Ptr,
+                              unsigned       Align,
+                              Value*         Mask,
+                              Value*         PassThru = nullptr,
+                              const Twine&   Name     = "",
+                              Type*          Ty       = nullptr,
+                              JIT_MEM_CLIENT usage    = MEM_CLIENT_INTERNAL)
 {
     return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name);
 }
 
-LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = "");
-StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
-StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
+LoadInst*
+           LOADV(Value* BasePtr, const std::initializer_list<Value*>& offset, const llvm::Twine& name = "");
+StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list<uint32_t>& offset);
+StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list<Value*>& offset);
 
-Value* MEM_ADD(Value* i32Incr, Value* basePtr, const std::initializer_list<uint32_t> &indices, const llvm::Twine& name = "");
+Value* MEM_ADD(Value*                                 i32Incr,
+               Value*                                 basePtr,
+               const std::initializer_list<uint32_t>& indices,
+               const llvm::Twine&                     name = "");
 
-void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+void Gather4(const SWR_FORMAT format,
+             Value*           pSrcBase,
+             Value*           byteOffsets,
+             Value*           mask,
+             Value*           vGatherComponents[],
+             bool             bPackedOutput,
+             JIT_MEM_CLIENT   usage = MEM_CLIENT_INTERNAL);
 
-virtual Value *GATHERPS(Value *src, Value *pBase, Value *indices, Value *mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual Value* GATHERPS(Value*         src,
+                        Value*         pBase,
+                        Value*         indices,
+                        Value*         mask,
+                        uint8_t        scale = 1,
+                        JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+void GATHER4PS(const SWR_FORMAT_INFO& info,
+               Value*                 pSrcBase,
+               Value*                 byteOffsets,
+               Value*                 mask,
+               Value*                 vGatherComponents[],
+               bool                   bPackedOutput,
+               JIT_MEM_CLIENT         usage = MEM_CLIENT_INTERNAL);
 
-virtual Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+virtual Value* GATHERDD(Value*         src,
+                        Value*         pBase,
+                        Value*         indices,
+                        Value*         mask,
+                        uint8_t        scale = 1,
+                        JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets,
-    Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
+void GATHER4DD(const SWR_FORMAT_INFO& info,
+               Value*                 pSrcBase,
+               Value*                 byteOffsets,
+               Value*                 mask,
+               Value*                 vGatherComponents[],
+               bool                   bPackedOutput,
+               JIT_MEM_CLIENT         usage = MEM_CLIENT_INTERNAL);
 
-Value *GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
+Value* GATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, uint8_t scale = 1);
 
-Value *GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
+Value* GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru);
 
-void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
+virtual void SCATTERPS(Value*         pDst,
+                       Value*         vSrc,
+                       Value*         vOffsets,
+                       Value*         vMask,
+                       JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
 
-void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
-void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
+void Shuffle8bpcGather4(const SWR_FORMAT_INFO& info,
+                        Value*                 vGatherInput,
+                        Value*                 vGatherOutput[],
+                        bool                   bPackedOutput);
+void Shuffle16bpcGather4(const SWR_FORMAT_INFO& info,
+                         Value*                 vGatherInput[],
+                         Value*                 vGatherOutput[],
+                         bool                   bPackedOutput);
 
 // Static stack allocations for scatter operations
-Value* pScatterStackSrc{ nullptr };
-Value* pScatterStackOffsets{ nullptr };
+Value* pScatterStackSrc{nullptr};
+Value* pScatterStackOffsets{nullptr};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index f893693..26d8688 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder_misc.cpp
-* 
-* @brief Implementation for miscellaneous builder functions
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_misc.cpp
+ *
+ * @brief Implementation for miscellaneous builder functions
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder.h"
 #include "common/rdtsc_buckets.h"
@@ -50,25 +50,25 @@
 
         // Extract the sign, exponent, and mantissa
         uint32_t uf = *(uint32_t*)&val;
-        sign = (uf & 0x80000000) >> 31;
-        exp = (uf & 0x7F800000) >> 23;
-        mant = uf & 0x007FFFFF;
+        sign        = (uf & 0x80000000) >> 31;
+        exp         = (uf & 0x7F800000) >> 23;
+        mant        = uf & 0x007FFFFF;
 
         // Check for out of range
         if (std::isnan(val))
         {
-            exp = 0x1F;
+            exp  = 0x1F;
             mant = 0x200;
-            sign = 1;                     // set the sign bit for NANs
+            sign = 1; // set the sign bit for NANs
         }
         else if (std::isinf(val))
         {
-            exp = 0x1f;
+            exp  = 0x1f;
             mant = 0x0;
         }
         else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
         {
-            exp = 0x1E;
+            exp  = 0x1E;
             mant = 0x3FF;
         }
         else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
@@ -76,12 +76,12 @@
             mant |= 0x00800000;
             for (; exp <= 0x70; mant >>= 1, exp++)
                 ;
-            exp = 0;
+            exp  = 0;
             mant = mant >> 13;
         }
         else if (exp < 0x66) // Too small to represent -> Zero
         {
-            exp = 0;
+            exp  = 0;
             mant = 0;
         }
         else
@@ -89,7 +89,7 @@
             // Saves bits that will be shifted off for rounding
             roundBits = mant & 0x1FFFu;
             // convert exponent and mantissa to 16 bit format
-            exp = exp - 0x70;
+            exp  = exp - 0x70;
             mant = mant >> 13;
 
             // Essentially RTZ, but round up if off by only 1 lsb
@@ -129,7 +129,7 @@
         {
             uint32_t sign = (val & 0x8000) << 16;
             uint32_t mant = (val & 0x3ff) << 13;
-            uint32_t exp = (val >> 10) & 0x1f;
+            uint32_t exp  = (val >> 10) & 0x1f;
             if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals
             {
                 mant <<= 1;
@@ -140,134 +140,94 @@
                 }
                 mant &= (0x3ff << 13);
             }
-            exp = ((exp - 15 + 127) & 0xff) << 23;
+            exp    = ((exp - 15 + 127) & 0xff) << 23;
             result = sign | exp | mant;
         }
 
         return *(float*)&result;
     }
 
-    Constant *Builder::C(bool i)
-    {
-        return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0));
-    }
+    Constant* Builder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); }
 
-    Constant *Builder::C(char i)
-    {
-        return ConstantInt::get(IRB()->getInt8Ty(), i);
-    }
+    Constant* Builder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 
-    Constant *Builder::C(uint8_t i)
-    {
-        return ConstantInt::get(IRB()->getInt8Ty(), i);
-    }
+    Constant* Builder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); }
 
-    Constant *Builder::C(int i)
-    {
-        return ConstantInt::get(IRB()->getInt32Ty(), i);
-    }
+    Constant* Builder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 
-    Constant *Builder::C(int64_t i)
-    {
-        return ConstantInt::get(IRB()->getInt64Ty(), i);
-    }
+    Constant* Builder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 
-    Constant *Builder::C(uint16_t i)
-    {
-        return ConstantInt::get(mInt16Ty,i);
-    }
+    Constant* Builder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); }
 
-    Constant *Builder::C(uint32_t i)
-    {
-        return ConstantInt::get(IRB()->getInt32Ty(), i);
-    }
+    Constant* Builder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); }
 
-    Constant *Builder::C(float i)
-    {
-        return ConstantFP::get(IRB()->getFloatTy(), i);
-    }
+    Constant* Builder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); }
 
-    Constant *Builder::PRED(bool pred)
+    Constant* Builder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); }
+
+    Constant* Builder::PRED(bool pred)
     {
         return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0));
     }
 
-    Value *Builder::VIMMED1(int i)
+    Value* Builder::VIMMED1(int i)
     {
         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VIMMED1_16(int i)
+    Value* Builder::VIMMED1_16(int i)
     {
         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VIMMED1(uint32_t i)
+    Value* Builder::VIMMED1(uint32_t i)
     {
         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VIMMED1_16(uint32_t i)
+    Value* Builder::VIMMED1_16(uint32_t i)
     {
         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VIMMED1(float i)
+    Value* Builder::VIMMED1(float i)
     {
         return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
     }
 
-    Value *Builder::VIMMED1_16(float i)
+    Value* Builder::VIMMED1_16(float i)
     {
         return ConstantVector::getSplat(mVWidth16, cast<ConstantFP>(C(i)));
     }
 
-    Value *Builder::VIMMED1(bool i)
+    Value* Builder::VIMMED1(bool i)
     {
         return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VIMMED1_16(bool i)
+    Value* Builder::VIMMED1_16(bool i)
     {
         return ConstantVector::getSplat(mVWidth16, cast<ConstantInt>(C(i)));
     }
 
-    Value *Builder::VUNDEF_IPTR()
-    {
-        return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
-    }
+    Value* Builder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth)); }
 
-    Value *Builder::VUNDEF(Type* t)
-    {
-        return UndefValue::get(VectorType::get(t, mVWidth));
-    }
+    Value* Builder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); }
 
-    Value *Builder::VUNDEF_I()
-    {
-        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
-    }
+    Value* Builder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); }
 
-    Value *Builder::VUNDEF_I_16()
-    {
-        return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16));
-    }
+    Value* Builder::VUNDEF_I_16() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); }
 
-    Value *Builder::VUNDEF_F()
-    {
-        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
-    }
+    Value* Builder::VUNDEF_F() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); }
 
-    Value *Builder::VUNDEF_F_16()
-    {
-        return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16));
-    }
+    Value* Builder::VUNDEF_F_16() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); }
 
-    Value *Builder::VUNDEF(Type *ty, uint32_t size)
+    Value* Builder::VUNDEF(Type* ty, uint32_t size)
     {
         return UndefValue::get(VectorType::get(ty, size));
     }
 
-    Value *Builder::VBROADCAST(Value *src, const llvm::Twine& name)
+    Value* Builder::VBROADCAST(Value* src, const llvm::Twine& name)
     {
         // check if src is already a vector
         if (src->getType()->isVectorTy())
@@ -278,7 +238,7 @@
         return VECTOR_SPLAT(mVWidth, src, name);
     }
 
-    Value *Builder::VBROADCAST_16(Value *src)
+    Value* Builder::VBROADCAST_16(Value* src)
     {
         // check if src is already a vector
         if (src->getType()->isVectorTy())
@@ -292,18 +252,20 @@
     uint32_t Builder::IMMED(Value* v)
     {
         SWR_ASSERT(isa<ConstantInt>(v));
-        ConstantInt *pValConst = cast<ConstantInt>(v);
+        ConstantInt* pValConst = cast<ConstantInt>(v);
         return pValConst->getZExtValue();
     }
 
     int32_t Builder::S_IMMED(Value* v)
     {
         SWR_ASSERT(isa<ConstantInt>(v));
-        ConstantInt *pValConst = cast<ConstantInt>(v);
+        ConstantInt* pValConst = cast<ConstantInt>(v);
         return pValConst->getSExtValue();
     }
 
-    CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList, const llvm::Twine& name)
+    CallInst* Builder::CALL(Value*                               Callee,
+                            const std::initializer_list<Value*>& argsList,
+                            const llvm::Twine&                   name)
     {
         std::vector<Value*> args;
         for (auto arg : argsList)
@@ -311,14 +273,14 @@
         return CALLA(Callee, args, name);
     }
 
-    CallInst *Builder::CALL(Value *Callee, Value* arg)
+    CallInst* Builder::CALL(Value* Callee, Value* arg)
     {
         std::vector<Value*> args;
         args.push_back(arg);
         return CALLA(Callee, args);
     }
 
-    CallInst *Builder::CALL2(Value *Callee, Value* arg1, Value* arg2)
+    CallInst* Builder::CALL2(Value* Callee, Value* arg1, Value* arg2)
     {
         std::vector<Value*> args;
         args.push_back(arg1);
@@ -326,7 +288,7 @@
         return CALLA(Callee, args);
     }
 
-    CallInst *Builder::CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3)
+    CallInst* Builder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3)
     {
         std::vector<Value*> args;
         args.push_back(arg1);
@@ -335,15 +297,15 @@
         return CALLA(Callee, args);
     }
 
-    Value *Builder::VRCP(Value *va, const llvm::Twine& name)
+    Value* Builder::VRCP(Value* va, const llvm::Twine& name)
     {
-        return FDIV(VIMMED1(1.0f), va, name);  // 1 / a
+        return FDIV(VIMMED1(1.0f), va, name); // 1 / a
     }
 
-    Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+    Value* Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY)
     {
         Value* vOut = FMADDPS(vA, vX, vC);
-        vOut = FMADDPS(vB, vY, vOut);
+        vOut        = FMADDPS(vB, vY, vOut);
         return vOut;
     }
 
@@ -357,7 +319,8 @@
     ///   result from a GEP, printing out the pointer to memory
     /// @param printStr - constant string to print, which includes format specifiers
     /// @param printArgs - initializer list of Value*'s to print to std out
-    CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+    CallInst* Builder::PRINT(const std::string&                   printStr,
+                             const std::initializer_list<Value*>& printArgs)
     {
         // push the arguments to CallPrint into a vector
         std::vector<Value*> printCallArgs;
@@ -365,15 +328,15 @@
         printCallArgs.resize(1);
 
         // search through the format string for special processing
-        size_t pos = 0;
+        size_t      pos = 0;
         std::string tempStr(printStr);
-        pos = tempStr.find('%', pos);
+        pos    = tempStr.find('%', pos);
         auto v = printArgs.begin();
 
         while ((pos != std::string::npos) && (v != printArgs.end()))
         {
-            Value* pArg = *v;
-            Type* pType = pArg->getType();
+            Value* pArg  = *v;
+            Type*  pType = pArg->getType();
 
             if (pType->isVectorTy())
             {
@@ -381,7 +344,7 @@
 
                 if (toupper(tempStr[pos + 1]) == 'X')
                 {
-                    tempStr[pos] = '0';
+                    tempStr[pos]     = '0';
                     tempStr[pos + 1] = 'x';
                     tempStr.insert(pos + 2, "%08X ");
                     pos += 7;
@@ -405,9 +368,11 @@
                     {
                         tempStr.insert(pos, std::string("%f "));
                         pos += 3;
-                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                        printCallArgs.push_back(
+                            FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                     }
-                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                    printCallArgs.push_back(
+                        FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
                 }
                 else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
                 {
@@ -416,9 +381,24 @@
                     {
                         tempStr.insert(pos, std::string("%d "));
                         pos += 3;
-                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                        printCallArgs.push_back(
+                            S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
                     }
-                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    printCallArgs.push_back(
+                        S_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
+                }
+                else if ((tempStr[pos + 1] == 'u') && (pContainedType->isIntegerTy()))
+                {
+                    uint32_t i = 0;
+                    for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                    {
+                        tempStr.insert(pos, std::string("%d "));
+                        pos += 3;
+                        printCallArgs.push_back(
+                            Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
+                    }
+                    printCallArgs.push_back(
+                        Z_EXT(VEXTRACT(pArg, C(i)), Type::getInt32Ty(JM()->mContext)));
                 }
             }
             else
@@ -448,89 +428,82 @@
         }
 
         // create global variable constant string
-        Constant *constString = ConstantDataArray::getString(JM()->mContext,tempStr,true);
-        GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr");
+        Constant*       constString = ConstantDataArray::getString(JM()->mContext, tempStr, true);
+        GlobalVariable* gvPtr       = new GlobalVariable(
+            constString->getType(), true, GlobalValue::InternalLinkage, constString, "printStr");
         JM()->mpCurrentModule->getGlobalList().push_back(gvPtr);
 
         // get a pointer to the first character in the constant string array
-        std::vector<Constant*> geplist{C(0),C(0)};
-        Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
+        std::vector<Constant*> geplist{C(0), C(0)};
+        Constant* strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr, geplist, false);
 
         // insert the pointer to the format string in the argument vector
         printCallArgs[0] = strGEP;
 
         // get pointer to CallPrint function and insert decl into the module if needed
         std::vector<Type*> args;
-        args.push_back(PointerType::get(mInt8Ty,0));
-        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true);
-        Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
+        args.push_back(PointerType::get(mInt8Ty, 0));
+        FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, true);
+        Function*     callPrintFn =
+            cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy));
 
         // if we haven't yet added the symbol to the symbol table
-        if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
+        if ((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr)
         {
-            sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint);
+            sys::DynamicLibrary::AddSymbol("CallPrint", (void*)&CallPrint);
         }
 
         // insert a call to CallPrint
-        return CALLA(callPrintFn,printCallArgs);
+        return CALLA(callPrintFn, printCallArgs);
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Wrapper around PRINT with initializer list.
-    CallInst* Builder::PRINT(const std::string &printStr)
-    {
-        return PRINT(printStr, {});
-    }
+    CallInst* Builder::PRINT(const std::string& printStr) { return PRINT(printStr, {}); }
 
-    Value *Builder::EXTRACT_16(Value *x, uint32_t imm)
+    Value* Builder::EXTRACT_16(Value* x, uint32_t imm)
     {
         if (imm == 0)
         {
-            return VSHUFFLE(x, UndefValue::get(x->getType()), { 0, 1, 2, 3, 4, 5, 6, 7 });
+            return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7});
         }
         else
         {
-            return VSHUFFLE(x, UndefValue::get(x->getType()), { 8, 9, 10, 11, 12, 13, 14, 15 });
+            return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15});
         }
     }
 
-    Value *Builder::JOIN_16(Value *a, Value *b)
+    Value* Builder::JOIN_16(Value* a, Value* b)
     {
-        return VSHUFFLE(a, b, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+        return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief convert x86 <N x float> mask to llvm <N x i1> mask
-    Value *Builder::MASK(Value *vmask)
+    Value* Builder::MASK(Value* vmask)
     {
-        Value *src = BITCAST(vmask, mSimdInt32Ty);
+        Value* src = BITCAST(vmask, mSimdInt32Ty);
         return ICMP_SLT(src, VIMMED1(0));
     }
 
-    Value *Builder::MASK_16(Value *vmask)
+    Value* Builder::MASK_16(Value* vmask)
     {
-        Value *src = BITCAST(vmask, mSimd16Int32Ty);
+        Value* src = BITCAST(vmask, mSimd16Int32Ty);
         return ICMP_SLT(src, VIMMED1_16(0));
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief convert llvm <N x i1> mask to x86 <N x i32> mask
-    Value *Builder::VMASK(Value *mask)
-    {
-        return S_EXT(mask, mSimdInt32Ty);
-    }
+    Value* Builder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); }
 
-    Value *Builder::VMASK_16(Value *mask)
-    {
-        return S_EXT(mask, mSimd16Int32Ty);
-    }
+    Value* Builder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); }
 
     /// @brief Convert <Nxi1> llvm mask to integer
-    Value *Builder::VMOVMSK(Value* mask)
+    Value* Builder::VMOVMSK(Value* mask)
     {
         SWR_ASSERT(mask->getType()->getVectorElementType() == mInt1Ty);
         uint32_t numLanes = mask->getType()->getVectorNumElements();
-        Value* i32Result;
+        Value*   i32Result;
         if (numLanes == 8)
         {
             i32Result = BITCAST(mask, mInt8Ty);
@@ -548,18 +521,18 @@
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not  
+    /// @brief Generate a VPSHUFB operation in LLVM IR.  If not
     /// supported on the underlying platform, emulate it
     /// @param a - 256bit SIMD(32x8bit) of 8bit integer values
     /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
-    /// Byte masks in lower 128 lane of b selects 8 bit values from lower 
-    /// 128bits of a, and vice versa for the upper lanes.  If the mask 
+    /// Byte masks in lower 128 lane of b selects 8 bit values from lower
+    /// 128bits of a, and vice versa for the upper lanes.  If the mask
     /// value is negative, '0' is inserted.
-    Value *Builder::PSHUFB(Value* a, Value* b)
+    Value* Builder::PSHUFB(Value* a, Value* b)
     {
         Value* res;
         // use avx2 pshufb instruction if available
-        if(JM()->mArch.AVX2())
+        if (JM()->mArch.AVX2())
         {
             res = VPSHUFB(a, b);
         }
@@ -573,22 +546,26 @@
 
             // insert an 8 bit value from the high and low lanes of a per loop iteration
             numElms /= 2;
-            for(uint32_t i = 0; i < numElms; i++)
+            for (uint32_t i = 0; i < numElms; i++)
             {
-                ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+                ConstantInt* cLow128b  = cast<ConstantInt>(cB->getAggregateElement(i));
                 ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
 
                 // extract values from constant mask
-                char valLow128bLane =  (char)(cLow128b->getSExtValue());
+                char valLow128bLane  = (char)(cLow128b->getSExtValue());
                 char valHigh128bLane = (char)(cHigh128b->getSExtValue());
 
                 Value* insertValLow128b;
                 Value* insertValHigh128b;
 
                 // if the mask value is negative, insert a '0' in the respective output position
-                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
-                insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
-                insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+                // otherwise, lookup the value at mask position (bits 3..0 of the respective mask
+                // byte) in a and insert in output vector
+                insertValLow128b =
+                    (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+                insertValHigh128b = (valHigh128bLane < 0)
+                                        ? C((char)0)
+                                        : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
 
                 vShuf = VINSERT(vShuf, insertValLow128b, i);
                 vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
@@ -599,11 +576,11 @@
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 
+    /// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
-    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only 
+    /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
     /// lower 8 values are used.
-    Value *Builder::PMOVSXBD(Value* a)
+    Value* Builder::PMOVSXBD(Value* a)
     {
         // VPMOVSXBD output type
         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
@@ -612,10 +589,10 @@
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 
+    /// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
     /// bits)in LLVM IR.  If not supported on the underlying platform, emulate it
     /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
-    Value *Builder::PMOVSXWD(Value* a)
+    Value* Builder::PMOVSXWD(Value* a)
     {
         // VPMOVSXWD output type
         Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
@@ -627,7 +604,7 @@
     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
     /// in LLVM IR.  If not supported on the underlying platform, emulate it
     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-    Value *Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
+    Value* Builder::CVTPH2PS(Value* a, const llvm::Twine& name)
     {
         if (JM()->mArch.F16C())
         {
@@ -635,20 +612,22 @@
         }
         else
         {
-            FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
-            Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
+            FunctionType* pFuncTy   = FunctionType::get(mFP32Ty, mInt16Ty);
+            Function*     pCvtPh2Ps = cast<Function>(
+                JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat16ToFloat32", pFuncTy));
 
             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat16ToFloat32") == nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32", (void *)&ConvertFloat16ToFloat32);
+                sys::DynamicLibrary::AddSymbol("ConvertFloat16ToFloat32",
+                                               (void*)&ConvertFloat16ToFloat32);
             }
 
             Value* pResult = UndefValue::get(mSimdFP32Ty);
             for (uint32_t i = 0; i < mVWidth; ++i)
             {
-                Value* pSrc = VEXTRACT(a, C(i));
+                Value* pSrc  = VEXTRACT(a, C(i));
                 Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
-                pResult = VINSERT(pResult, pConv, C(i));
+                pResult      = VINSERT(pResult, pConv, C(i));
             }
 
             pResult->setName(name);
@@ -660,7 +639,7 @@
     /// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
     /// in LLVM IR.  If not supported on the underlying platform, emulate it
     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
-    Value *Builder::CVTPS2PH(Value* a, Value* rounding)
+    Value* Builder::CVTPS2PH(Value* a, Value* rounding)
     {
         if (JM()->mArch.F16C())
         {
@@ -669,45 +648,47 @@
         else
         {
             // call scalar C function for now
-            FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty);
-            Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
+            FunctionType* pFuncTy   = FunctionType::get(mInt16Ty, mFP32Ty);
+            Function*     pCvtPs2Ph = cast<Function>(
+                JM()->mpCurrentModule->getOrInsertFunction("ConvertFloat32ToFloat16", pFuncTy));
 
             if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertFloat32ToFloat16") == nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16", (void *)&ConvertFloat32ToFloat16);
+                sys::DynamicLibrary::AddSymbol("ConvertFloat32ToFloat16",
+                                               (void*)&ConvertFloat32ToFloat16);
             }
 
             Value* pResult = UndefValue::get(mSimdInt16Ty);
             for (uint32_t i = 0; i < mVWidth; ++i)
             {
-                Value* pSrc = VEXTRACT(a, C(i));
+                Value* pSrc  = VEXTRACT(a, C(i));
                 Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
-                pResult = VINSERT(pResult, pConv, C(i));
+                pResult      = VINSERT(pResult, pConv, C(i));
             }
 
             return pResult;
         }
     }
 
-    Value *Builder::PMAXSD(Value* a, Value* b)
+    Value* Builder::PMAXSD(Value* a, Value* b)
     {
         Value* cmp = ICMP_SGT(a, b);
         return SELECT(cmp, a, b);
     }
 
-    Value *Builder::PMINSD(Value* a, Value* b)
+    Value* Builder::PMINSD(Value* a, Value* b)
     {
         Value* cmp = ICMP_SLT(a, b);
         return SELECT(cmp, a, b);
     }
 
-    Value *Builder::PMAXUD(Value* a, Value* b)
+    Value* Builder::PMAXUD(Value* a, Value* b)
     {
         Value* cmp = ICMP_UGT(a, b);
         return SELECT(cmp, a, b);
     }
 
-    Value *Builder::PMINUD(Value* a, Value* b)
+    Value* Builder::PMINUD(Value* a, Value* b)
     {
         Value* cmp = ICMP_ULT(a, b);
         return SELECT(cmp, a, b);
@@ -717,110 +698,104 @@
     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType)
     {
         auto saveIP = IRB()->saveIP();
-        IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
-                              pFunc->getEntryBlock().begin());
+        IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
         Value* pAlloca = ALLOCA(pType);
-        if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+        if (saveIP.isSet())
+            IRB()->restoreIP(saveIP);
         return pAlloca;
     }
 
     Value* Builder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize)
     {
         auto saveIP = IRB()->saveIP();
-        IRB()->SetInsertPoint(&pFunc->getEntryBlock(),
-            pFunc->getEntryBlock().begin());
+        IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin());
         Value* pAlloca = ALLOCA(pType, pArraySize);
-        if (saveIP.isSet()) IRB()->restoreIP(saveIP);
+        if (saveIP.isSet())
+            IRB()->restoreIP(saveIP);
         return pAlloca;
     }
 
     Value* Builder::VABSPS(Value* a)
     {
-        Value* asInt = BITCAST(a, mSimdInt32Ty);
+        Value* asInt  = BITCAST(a, mSimdInt32Ty);
         Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty);
         return result;
     }
 
-    Value *Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
+    Value* Builder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name)
     {
-        Value *lowCmp = ICMP_SLT(src, low);
-        Value *ret = SELECT(lowCmp, low, src);
+        Value* lowCmp = ICMP_SLT(src, low);
+        Value* ret    = SELECT(lowCmp, low, src);
 
-        Value *highCmp = ICMP_SGT(ret, high);
-        ret = SELECT(highCmp, high, ret, name);
+        Value* highCmp = ICMP_SGT(ret, high);
+        ret            = SELECT(highCmp, high, ret, name);
 
         return ret;
     }
 
-    Value *Builder::FCLAMP(Value* src, Value* low, Value* high)
+    Value* Builder::FCLAMP(Value* src, Value* low, Value* high)
     {
-        Value *lowCmp = FCMP_OLT(src, low);
-        Value *ret = SELECT(lowCmp, low, src);
+        Value* lowCmp = FCMP_OLT(src, low);
+        Value* ret    = SELECT(lowCmp, low, src);
 
-        Value *highCmp = FCMP_OGT(ret, high);
-        ret = SELECT(highCmp, high, ret);
+        Value* highCmp = FCMP_OGT(ret, high);
+        ret            = SELECT(highCmp, high, ret);
 
         return ret;
     }
 
-    Value *Builder::FCLAMP(Value* src, float low, float high)
+    Value* Builder::FCLAMP(Value* src, float low, float high)
     {
         Value* result = VMAXPS(src, VIMMED1(low));
-        result = VMINPS(result, VIMMED1(high));
+        result        = VMINPS(result, VIMMED1(high));
 
         return result;
     }
 
-    Value *Builder::FMADDPS(Value* a, Value* b, Value* c)
+    Value* Builder::FMADDPS(Value* a, Value* b, Value* c)
     {
         Value* vOut;
-        // use FMADs if available
-        if(JM()->mArch.AVX2())
-        {
-            vOut = VFMADDPS(a, b, c);
-        }
-        else
-        {
-            vOut = FADD(FMUL(a, b), c);
-        }
+        // This maps to LLVM fmuladd intrinsic
+        vOut = VFMADDPS(a, b, c);
         return vOut;
     }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief pop count on vector mask (e.g. <8 x i1>)
-    Value* Builder::VPOPCNT(Value* a)
-    {
-        return POPCNT(VMOVMSK(a));
-    }
+    Value* Builder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief C functions called by LLVM IR
     //////////////////////////////////////////////////////////////////////////
 
-    Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
+    Value* Builder::VEXTRACTI128(Value* a, Constant* imm8)
     {
-        bool flag = !imm8->isZeroValue();
-        SmallVector<Constant*,8> idx;
-        for (unsigned i = 0; i < mVWidth / 2; i++) {
+        bool                      flag = !imm8->isZeroValue();
+        SmallVector<Constant*, 8> idx;
+        for (unsigned i = 0; i < mVWidth / 2; i++)
+        {
             idx.push_back(C(flag ? i + mVWidth / 2 : i));
         }
         return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
     }
 
-    Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
+    Value* Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
     {
-        bool flag = !imm8->isZeroValue();
-        SmallVector<Constant*,8> idx;
-        for (unsigned i = 0; i < mVWidth; i++) {
+        bool                      flag = !imm8->isZeroValue();
+        SmallVector<Constant*, 8> idx;
+        for (unsigned i = 0; i < mVWidth; i++)
+        {
             idx.push_back(C(i));
         }
-        Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
+        Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 
-        SmallVector<Constant*,8> idx2;
-        for (unsigned i = 0; i < mVWidth / 2; i++) {
+        SmallVector<Constant*, 8> idx2;
+        for (unsigned i = 0; i < mVWidth / 2; i++)
+        {
             idx2.push_back(C(flag ? i : i + mVWidth));
         }
-        for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+        for (unsigned i = mVWidth / 2; i < mVWidth; i++)
+        {
             idx2.push_back(C(flag ? i + mVWidth / 2 : i));
         }
         return VSHUFFLE(a, inter, ConstantVector::get(idx2));
@@ -829,45 +804,51 @@
     // rdtsc buckets macros
     void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
     {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-        // buckets framework when single threaded
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
+        // call into buckets framework when single threaded
         if (KNOB_SINGLE_THREADED)
         {
             std::vector<Type*> args{
-                PointerType::get(mInt32Ty, 0),   // pBucketMgr
-                mInt32Ty                        // id
+                PointerType::get(mInt32Ty, 0), // pBucketMgr
+                mInt32Ty                       // id
             };
 
             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
-            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+            Function*     pFunc   = cast<Function>(
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") ==
+                nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+                sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket",
+                                               (void*)&BucketManager_StartBucket);
             }
 
-            CALL(pFunc, { pBucketMgr, pId });
+            CALL(pFunc, {pBucketMgr, pId});
         }
     }
 
     void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
     {
-        // @todo due to an issue with thread local storage propagation in llvm, we can only safely call into
-        // buckets framework when single threaded
+        // @todo due to an issue with thread local storage propagation in llvm, we can only safely
+        // call into buckets framework when single threaded
         if (KNOB_SINGLE_THREADED)
         {
             std::vector<Type*> args{
-                PointerType::get(mInt32Ty, 0),   // pBucketMgr
-                mInt32Ty                        // id
+                PointerType::get(mInt32Ty, 0), // pBucketMgr
+                mInt32Ty                       // id
             };
 
             FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
-            Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
-            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+            Function*     pFunc   = cast<Function>(
+                JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+            if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") ==
+                nullptr)
             {
-                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+                sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket",
+                                               (void*)&BucketManager_StopBucket);
             }
 
-            CALL(pFunc, { pBucketMgr, pId });
+            CALL(pFunc, {pBucketMgr, pId});
         }
     }
 
@@ -876,14 +857,14 @@
         if (pType->isStructTy())
         {
             uint32_t numElems = pType->getStructNumElements();
-            Type* pElemTy = pType->getStructElementType(0);
+            Type*    pElemTy  = pType->getStructElementType(0);
             return numElems * GetTypeSize(pElemTy);
         }
 
         if (pType->isArrayTy())
         {
             uint32_t numElems = pType->getArrayNumElements();
-            Type* pElemTy = pType->getArrayElementType();
+            Type*    pElemTy  = pType->getArrayElementType();
             return numElems * GetTypeSize(pElemTy);
         }
 
@@ -911,4 +892,4 @@
         SWR_ASSERT(false, "Unimplemented type.");
         return 0;
     }
-}
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index bd4be9f..f8701f9 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -1,143 +1,164 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-* 
-* @file builder_misc.h
-* 
-* @brief miscellaneous builder functions
-* 
-* Notes:
-* 
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file builder_misc.h
+ *
+ * @brief miscellaneous builder functions
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
-Constant *C(bool i);
-Constant *C(char i);
-Constant *C(uint8_t i);
-Constant *C(int i);
-Constant *C(int64_t i);
-Constant *C(uint16_t i);
-Constant *C(uint32_t i);
-Constant *C(float i);
+Constant* C(bool i);
+Constant* C(char i);
+Constant* C(uint8_t i);
+Constant* C(int i);
+Constant* C(int64_t i);
+Constant* C(uint64_t i);
+Constant* C(uint16_t i);
+Constant* C(uint32_t i);
+Constant* C(float i);
 
-template<typename Ty>
-Constant *C(const std::initializer_list<Ty> &constList)
+template <typename Ty>
+Constant* C(const std::initializer_list<Ty>& constList)
 {
     std::vector<Constant*> vConsts;
-    for(auto i : constList) {
-
+    for (auto i : constList)
+    {
         vConsts.push_back(C((Ty)i));
     }
     return ConstantVector::get(vConsts);
 }
 
-template<typename Ty>
-Constant *CA(LLVMContext& ctx, ArrayRef<Ty> constList)
+template <typename Ty>
+Constant* CA(LLVMContext& ctx, ArrayRef<Ty> constList)
 {
     return ConstantDataArray::get(ctx, constList);
 }
 
-Constant *PRED(bool pred);
+template <typename Ty>
+Constant* CInc(uint32_t base, uint32_t count)
+{
+    std::vector<Constant*> vConsts;
 
-Value *VIMMED1(int i);
-Value *VIMMED1_16(int i);
+    for (uint32_t i = 0; i < count; i++)
+    {
+        vConsts.push_back(C((Ty)base));
+        base++;
+    }
+    return ConstantVector::get(vConsts);
+}
 
-Value *VIMMED1(uint32_t i);
-Value *VIMMED1_16(uint32_t i);
+Constant* PRED(bool pred);
 
-Value *VIMMED1(float i);
-Value *VIMMED1_16(float i);
+Value* VIMMED1(int i);
+Value* VIMMED1_16(int i);
 
-Value *VIMMED1(bool i);
-Value *VIMMED1_16(bool i);
+Value* VIMMED1(uint32_t i);
+Value* VIMMED1_16(uint32_t i);
 
-Value *VUNDEF(Type* t);
+Value* VIMMED1(float i);
+Value* VIMMED1_16(float i);
 
-Value *VUNDEF_F();
-Value *VUNDEF_F_16();
+Value* VIMMED1(bool i);
+Value* VIMMED1_16(bool i);
 
-Value *VUNDEF_I();
-Value *VUNDEF_I_16();
+Value* VUNDEF(Type* t);
 
-Value *VUNDEF(Type* ty, uint32_t size);
+Value* VUNDEF_F();
+Value* VUNDEF_F_16();
 
-Value *VUNDEF_IPTR();
+Value* VUNDEF_I();
+Value* VUNDEF_I_16();
 
-Value *VBROADCAST(Value *src, const llvm::Twine& name = "");
-Value *VBROADCAST_16(Value *src);
+Value* VUNDEF(Type* ty, uint32_t size);
 
-Value *VRCP(Value *va, const llvm::Twine& name = "");
-Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
+Value* VUNDEF_IPTR();
+
+Value* VBROADCAST(Value* src, const llvm::Twine& name = "");
+Value* VBROADCAST_16(Value* src);
+
+Value* VRCP(Value* va, const llvm::Twine& name = "");
+Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY);
 
 uint32_t IMMED(Value* i);
-int32_t S_IMMED(Value* i);
+int32_t  S_IMMED(Value* i);
 
-CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args, const llvm::Twine& name = "");
-CallInst *CALL(Value *Callee) { return CALLA(Callee); }
-CallInst *CALL(Value *Callee, Value* arg);
-CallInst *CALL2(Value *Callee, Value* arg1, Value* arg2);
-CallInst *CALL3(Value *Callee, Value* arg1, Value* arg2, Value* arg3);
+CallInst*
+          CALL(Value* Callee, const std::initializer_list<Value*>& args, const llvm::Twine& name = "");
+CallInst* CALL(Value* Callee)
+{
+    return CALLA(Callee);
+}
+CallInst* CALL(Value* Callee, Value* arg);
+CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2);
+CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3);
 
-Value *MASK(Value *vmask);
-Value *MASK_16(Value *vmask);
+Value* MASK(Value* vmask);
+Value* MASK_16(Value* vmask);
 
-Value *VMASK(Value *mask);
-Value *VMASK_16(Value *mask);
+Value* VMASK(Value* mask);
+Value* VMASK_16(Value* mask);
 
-Value *VMOVMSK(Value *mask);
+Value* VMOVMSK(Value* mask);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief functions that build IR to call x86 intrinsics directly, or
 /// emulate them with other instructions if not available on the host
 //////////////////////////////////////////////////////////////////////////
 
-Value *EXTRACT_16(Value *x, uint32_t imm);
-Value *JOIN_16(Value *a, Value *b);
+Value* EXTRACT_16(Value* x, uint32_t imm);
+Value* JOIN_16(Value* a, Value* b);
 
-Value *PSHUFB(Value* a, Value* b);
-Value *PMOVSXBD(Value* a);
-Value *PMOVSXWD(Value* a);
-Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
-Value *CVTPS2PH(Value* a, Value* rounding);
-Value *PMAXSD(Value* a, Value* b);
-Value *PMINSD(Value* a, Value* b);
-Value *PMAXUD(Value* a, Value* b);
-Value *PMINUD(Value* a, Value* b);
-Value *VABSPS(Value* a);
-Value *FMADDPS(Value* a, Value* b, Value* c);
+Value* PSHUFB(Value* a, Value* b);
+Value* PMOVSXBD(Value* a);
+Value* PMOVSXWD(Value* a);
+Value* CVTPH2PS(Value* a, const llvm::Twine& name = "");
+Value* CVTPS2PH(Value* a, Value* rounding);
+Value* PMAXSD(Value* a, Value* b);
+Value* PMINSD(Value* a, Value* b);
+Value* PMAXUD(Value* a, Value* b);
+Value* PMINUD(Value* a, Value* b);
+Value* VABSPS(Value* a);
+Value* FMADDPS(Value* a, Value* b, Value* c);
 
-Value *ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = "");
-Value *FCLAMP(Value* src, Value* low, Value* high);
-Value *FCLAMP(Value* src, float low, float high);
+Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = "");
+Value* FCLAMP(Value* src, Value* low, Value* high);
+Value* FCLAMP(Value* src, float low, float high);
 
-CallInst *PRINT(const std::string &printStr);
-CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
+CallInst* PRINT(const std::string& printStr);
+CallInst* PRINT(const std::string& printStr, const std::initializer_list<Value*>& printArgs);
 
 Value* VPOPCNT(Value* a);
 
-Value* INT3() { return DEBUGTRAP(); }
+Value* INT3()
+{
+    return DEBUGTRAP();
+}
 
 
-Value *VEXTRACTI128(Value* a, Constant* imm8);
-Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+Value* VEXTRACTI128(Value* a, Constant* imm8);
+Value* VINSERTI128(Value* a, Value* b, Constant* imm8);
 
 // rdtsc buckets macros
 void RDTSC_START(Value* pBucketMgr, Value* pId);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 09590b7..3ad0fab 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file fetch_jit.cpp
-*
-* @brief Implementation of the fetch jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file fetch_jit.cpp
+ *
+ * @brief Implementation of the fetch jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder_gfx_mem.h"
 #include "jit_api.h"
@@ -54,42 +54,64 @@
 //////////////////////////////////////////////////////////////////////////
 struct FetchJit : public BuilderGfxMem
 {
-    FetchJit(JitManager* pJitMgr) :
-        BuilderGfxMem(pJitMgr)
-    {}
+    FetchJit(JitManager* pJitMgr) : BuilderGfxMem(pJitMgr) {}
 
     Function* Create(const FETCH_COMPILE_STATE& fetchState);
 
     Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex);
     Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex);
     Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex);
-    template<typename T> Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
+    template <typename T>
+    Value* GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex);
 
     // package up Shuffle*bpcGatherd args into a tuple for convenience
-    typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType,
-        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4],
-        const uint32_t(&)[4]> Shuffle8bpcArgs;
+    typedef std::tuple<Value*&,
+                       Value*,
+                       const Instruction::CastOps,
+                       const ConversionType,
+                       uint32_t&,
+                       uint32_t&,
+                       const ComponentEnable,
+                       const ComponentControl (&)[4],
+                       Value* (&)[4],
+                       const uint32_t (&)[4]>
+        Shuffle8bpcArgs;
 
-    void Shuffle8bpcGatherd16(Shuffle8bpcArgs &args);
-    void Shuffle8bpcGatherd(Shuffle8bpcArgs &args);
+    void Shuffle8bpcGatherd16(Shuffle8bpcArgs& args);
+    void Shuffle8bpcGatherd(Shuffle8bpcArgs& args);
 
-    typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType,
-        uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs;
+    typedef std::tuple<Value* (&)[2],
+                       Value*,
+                       const Instruction::CastOps,
+                       const ConversionType,
+                       uint32_t&,
+                       uint32_t&,
+                       const ComponentEnable,
+                       const ComponentControl (&)[4],
+                       Value* (&)[4]>
+        Shuffle16bpcArgs;
 
-    void Shuffle16bpcGather16(Shuffle16bpcArgs &args);
-    void Shuffle16bpcGather(Shuffle16bpcArgs &args);
+    void Shuffle16bpcGather16(Shuffle16bpcArgs& args);
+    void Shuffle16bpcGather(Shuffle16bpcArgs& args);
 
-    void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+    void StoreVertexElements(Value*         pVtxOut,
+                             const uint32_t outputElt,
+                             const uint32_t numEltsToStore,
+                             Value* (&vVertexElements)[4]);
 
-    Value *GenerateCompCtrlVector(const ComponentControl ctrl);
+    Value* GenerateCompCtrlVector(const ComponentControl ctrl);
 
-    void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* streams, Value* vIndices, Value* pVtxOut);
+    void JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
+                           Value*                     streams,
+                           Value*                     vIndices,
+                           Value*                     pVtxOut);
 
     bool IsOddFormat(SWR_FORMAT format);
     bool IsUniformFormat(SWR_FORMAT format);
     void UnpackComponents(SWR_FORMAT format, Value* vInput, Value* result[4]);
-    void CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
-    void ConvertFormat(SWR_FORMAT format, Value *texels[4]);
+    void CreateGatherOddFormats(
+        SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
+    void ConvertFormat(SWR_FORMAT format, Value* texels[4]);
 
     Value* mpWorkerData;
     Value* mpFetchInfo;
@@ -100,25 +122,29 @@
     std::stringstream fnName("FCH_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
     fnName << ComputeCRC(0, &fetchState, sizeof(fetchState));
 
-    Function*    fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
-    BasicBlock*    entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
+    Function* fetch = Function::Create(
+        JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
 
     fetch->getParent()->setModuleIdentifier(fetch->getName());
 
     IRB()->SetInsertPoint(entry);
 
-    auto    argitr = fetch->arg_begin();
+    auto argitr = fetch->arg_begin();
 
     // Fetch shader arguments
-    Value* privateContext = &*argitr; ++argitr;
+    Value* privateContext = &*argitr;
+    ++argitr;
     privateContext->setName("privateContext");
     SetPrivateContext(privateContext);
 
-    mpWorkerData = &*argitr; ++argitr;
+    mpWorkerData = &*argitr;
+    ++argitr;
     mpWorkerData->setName("pWorkerData");
-    mpFetchInfo = &*argitr; ++argitr;
+    mpFetchInfo = &*argitr;
+    ++argitr;
     mpFetchInfo->setName("fetchInfo");
-    Value*    pVtxOut = &*argitr;
+    Value* pVtxOut = &*argitr;
     pVtxOut->setName("vtxOutput");
 
     uint32_t baseWidth = mVWidth;
@@ -133,71 +159,77 @@
     pVtxOut = BITCAST(pVtxOut, PointerType::get(mSimdFP32Ty, 0));
 
     // SWR_FETCH_CONTEXT::pStreams
-    Value*    streams = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
+    Value* streams = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_pStreams});
     streams->setName("pStreams");
 
     // SWR_FETCH_CONTEXT::pIndices
-    Value*    indices = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpIndices});
+    Value* indices = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpIndices});
     indices->setName("pIndices");
 
     // SWR_FETCH_CONTEXT::pLastIndex
-    Value*    pLastIndex = LOAD(mpFetchInfo,{0, SWR_FETCH_CONTEXT_xpLastIndex});
+    Value* pLastIndex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_xpLastIndex});
     pLastIndex->setName("pLastIndex");
 
     Value* vIndices;
-    switch(fetchState.indexType)
+    switch (fetchState.indexType)
     {
-        case R8_UINT:
-            indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
-            if(fetchState.bDisableIndexOOBCheck)
-            {
-                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
-                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
-            }
-            else
-            {
-                vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
-            }
-            break;
-        case R16_UINT: 
-            if(fetchState.bDisableIndexOOBCheck)
-            {
-                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
-                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
-            }
-            else
-            {
-                vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
-            }
-            break;
-        case R32_UINT:
-            (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
-                                               : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
-            break; // incoming type is already 32bit int
-        default:
-            SWR_INVALID("Unsupported index type");
-            vIndices = nullptr;
-            break;
+    case R8_UINT:
+        indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
+        if (fetchState.bDisableIndexOOBCheck)
+        {
+            vIndices = LOAD(
+                BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)),
+                {(uint32_t)0});
+            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+        }
+        else
+        {
+            vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
+        }
+        break;
+    case R16_UINT:
+        if (fetchState.bDisableIndexOOBCheck)
+        {
+            vIndices = LOAD(
+                BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)),
+                {(uint32_t)0});
+            vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+        }
+        else
+        {
+            vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
+        }
+        break;
+    case R32_UINT:
+        (fetchState.bDisableIndexOOBCheck)
+            ? vIndices = LOAD(indices, "", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH)
+            : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
+        break; // incoming type is already 32bit int
+    default:
+        SWR_INVALID("Unsupported index type");
+        vIndices = nullptr;
+        break;
     }
 
-    if(fetchState.bForceSequentialAccessEnable)
+    if (fetchState.bForceSequentialAccessEnable)
     {
-        Value* pOffsets = mVWidth == 8 ? C({ 0, 1, 2, 3, 4, 5, 6, 7 }) : 
-            C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+        Value* pOffsets = mVWidth == 8 ? C({0, 1, 2, 3, 4, 5, 6, 7})
+                                       : C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
 
         // VertexData buffers are accessed sequentially, the index is equal to the vertex number
-        vIndices = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
+        vIndices = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
         vIndices = ADD(vIndices, pOffsets);
     }
 
     Value* vVertexId = vIndices;
     if (fetchState.bVertexIDOffsetEnable)
     {
-        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally correct
-        Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
-        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_StartVertex }));
-        vVertexId = ADD(vIndices, vBaseVertex);
-        vVertexId = ADD(vVertexId, vStartVertex);
+        // Assuming one of baseVertex or startVertex is 0, so adding both should be functionally
+        // correct
+        Value* vBaseVertex  = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
+        Value* vStartVertex = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}));
+        vVertexId           = ADD(vIndices, vBaseVertex);
+        vVertexId           = ADD(vVertexId, vStartVertex);
     }
 
     // store out vertex IDs
@@ -206,30 +238,30 @@
         // store out in simd8 halves until core supports 16-wide natively
         auto vVertexIdLo = EXTRACT_16(vVertexId, 0);
         auto vVertexIdHi = EXTRACT_16(vVertexId, 1);
-        STORE(vVertexIdLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
-        STORE(vVertexIdHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 }));
+        STORE(vVertexIdLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
+        STORE(vVertexIdHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2}));
     }
     else if (mVWidth == 8)
     {
-        STORE(vVertexId, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
+        STORE(vVertexId, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID}));
     }
 
     // store out cut mask if enabled
     if (fetchState.bEnableCutIndex)
     {
         Value* vCutIndex = VIMMED1(fetchState.cutIndex);
-        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
-        
+        Value* cutMask   = VMASK(ICMP_EQ(vIndices, vCutIndex));
+
         if (mVWidth == 16)
         {
             auto cutMaskLo = EXTRACT_16(cutMask, 0);
             auto cutMaskHi = EXTRACT_16(cutMask, 1);
-            STORE(cutMaskLo, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
-            STORE(cutMaskHi, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask2 }));
+            STORE(cutMaskLo, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
+            STORE(cutMaskHi, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask2}));
         }
         else if (mVWidth == 8)
         {
-            STORE(cutMask, GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
+            STORE(cutMask, GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CutMask}));
         }
     }
 
@@ -262,14 +294,13 @@
     optPasses.add(createCFGSimplificationPass());
     optPasses.add(createEarlyCSEPass());
     optPasses.add(createInstructionCombiningPass());
-    optPasses.add(createInstructionSimplifierPass());
     optPasses.add(createConstantPropagationPass());
     optPasses.add(createSCCPPass());
     optPasses.add(createAggressiveDCEPass());
 
     optPasses.run(*fetch);
 
-    optPasses.add(createLowerX86Pass(JM(), this));
+    optPasses.add(createLowerX86Pass(this));
     optPasses.run(*fetch);
 
     JitManager::DumpToFile(fetch, "opt");
@@ -279,7 +310,7 @@
 #if USE_SIMD16_SHADERS
     SetTargetWidth(baseWidth);
 #endif
- 
+
     return fetch;
 }
 
@@ -297,9 +328,9 @@
 // format is uniform if all components are the same size and type
 bool FetchJit::IsUniformFormat(SWR_FORMAT format)
 {
-    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
-    uint32_t bpc0 = info.bpc[0];
-    uint32_t type0 = info.type[0];
+    const SWR_FORMAT_INFO& info  = GetFormatInfo(format);
+    uint32_t               bpc0  = info.bpc[0];
+    uint32_t               type0 = info.type[0];
 
     for (uint32_t c = 1; c < info.numComps; ++c)
     {
@@ -323,10 +354,10 @@
     for (uint32_t c = 0; c < info.numComps; ++c)
     {
         uint32_t swizzledIndex = info.swizzle[c];
-        uint32_t compBits = info.bpc[c];
-        uint32_t bitmask = ((1 << compBits) - 1) << bitOffset;
-        Value* comp = AND(vInput, bitmask);
-        comp = LSHR(comp, bitOffset);
+        uint32_t compBits      = info.bpc[c];
+        uint32_t bitmask       = ((1 << compBits) - 1) << bitOffset;
+        Value*   comp          = AND(vInput, bitmask);
+        comp                   = LSHR(comp, bitOffset);
 
         result[swizzledIndex] = comp;
         bitOffset += compBits;
@@ -336,14 +367,15 @@
 // gather for odd component size formats
 // gather SIMD full pixels per lane then shift/mask to move each component to their
 // own vector
-void FetchJit::CreateGatherOddFormats(SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
+void FetchJit::CreateGatherOddFormats(
+    SWR_FORMAT format, Value* pMask, Value* pBase, Value* pOffsets, Value* pResult[4])
 {
-    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 
     // only works if pixel size is <= 32bits
     SWR_ASSERT(info.bpp <= 32);
 
-    Value *pGather;
+    Value* pGather;
     if (info.bpp == 32)
     {
         pGather = GATHERDD(VIMMED1(0), pBase, pOffsets, pMask);
@@ -351,17 +383,17 @@
     else
     {
         // Can't use 32-bit gather for items less than 32-bits, could cause page faults.
-        Value *pMem = ALLOCA(mSimdInt32Ty);
+        Value* pMem = ALLOCA(mSimdInt32Ty);
         STORE(VIMMED1(0u), pMem);
 
-        pBase = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
+        pBase          = BITCAST(pBase, PointerType::get(mInt8Ty, 0));
         Value* pDstMem = BITCAST(pMem, mInt32PtrTy);
 
         for (uint32_t lane = 0; lane < mVWidth; ++lane)
         {
             // Get index
             Value* index = VEXTRACT(pOffsets, C(lane));
-            Value* mask = VEXTRACT(pMask, C(lane));
+            Value* mask  = VEXTRACT(pMask, C(lane));
             switch (info.bpp)
             {
             case 8:
@@ -418,9 +450,9 @@
     pResult[3] = BITCAST(pResult[3], mSimdFP32Ty);
 }
 
-void FetchJit::ConvertFormat(SWR_FORMAT format, Value *texels[4])
+void FetchJit::ConvertFormat(SWR_FORMAT format, Value* texels[4])
 {
-    const SWR_FORMAT_INFO &info = GetFormatInfo(format);
+    const SWR_FORMAT_INFO& info = GetFormatInfo(format);
 
     for (uint32_t c = 0; c < info.numComps; ++c)
     {
@@ -436,13 +468,14 @@
         {
             if (info.type[c] == SWR_TYPE_SNORM)
             {
-                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to -1.0f.
+                /// @todo The most-negative value maps to -1.0f. e.g. the 5-bit value 10000 maps to
+                /// -1.0f.
 
                 /// result = c * (1.0f / (2^(n-1) - 1);
-                uint32_t n = info.bpc[c];
-                uint32_t pow2 = 1 << (n - 1);
-                float scale = 1.0f / (float)(pow2 - 1);
-                Value *vScale = VIMMED1(scale);
+                uint32_t n        = info.bpc[c];
+                uint32_t pow2     = 1 << (n - 1);
+                float    scale    = 1.0f / (float)(pow2 - 1);
+                Value*   vScale   = VIMMED1(scale);
                 texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                 texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                 texels[compIndex] = FMUL(texels[compIndex], vScale);
@@ -452,21 +485,22 @@
                 SWR_ASSERT(info.type[c] == SWR_TYPE_UNORM);
 
                 /// result = c * (1.0f / (2^n - 1))
-                uint32_t n = info.bpc[c];
+                uint32_t n    = info.bpc[c];
                 uint32_t pow2 = 1 << n;
-                // special case 24bit unorm format, which requires a full divide to meet ULP requirement
+                // special case 24bit unorm format, which requires a full divide to meet ULP
+                // requirement
                 if (n == 24)
                 {
-                    float scale = (float)(pow2 - 1);
-                    Value* vScale = VIMMED1(scale);
+                    float  scale      = (float)(pow2 - 1);
+                    Value* vScale     = VIMMED1(scale);
                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                     texels[compIndex] = SI_TO_FP(texels[compIndex], mSimdFP32Ty);
                     texels[compIndex] = FDIV(texels[compIndex], vScale);
                 }
                 else
                 {
-                    float scale = 1.0f / (float)(pow2 - 1);
-                    Value *vScale = VIMMED1(scale);
+                    float  scale      = 1.0f / (float)(pow2 - 1);
+                    Value* vScale     = VIMMED1(scale);
                     texels[compIndex] = BITCAST(texels[compIndex], mSimdInt32Ty);
                     texels[compIndex] = UI_TO_FP(texels[compIndex], mSimdFP32Ty);
                     texels[compIndex] = FMUL(texels[compIndex], vScale);
@@ -483,17 +517,19 @@
 /// @param streams - value pointer to the current vertex stream
 /// @param vIndices - vector value of indices to gather
 /// @param pVtxOut - value pointer to output simdvertex struct
-void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
-    Value* streams, Value* vIndices, Value* pVtxOut)
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE& fetchState,
+                                 Value*                     streams,
+                                 Value*                     vIndices,
+                                 Value*                     pVtxOut)
 {
     uint32_t currentVertexElement = 0;
-    uint32_t outputElt = 0;
-    Value* vVertexElements[4];
+    uint32_t outputElt            = 0;
+    Value*   vVertexElements[4];
 
-    Value* startVertex = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
+    Value* startVertex   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
     Value* startInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
-    Value* curInstance = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
-    Value* vBaseVertex = VBROADCAST(LOAD(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_BaseVertex }));
+    Value* curInstance   = LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
+    Value* vBaseVertex   = VBROADCAST(LOAD(mpFetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
     curInstance->setName("curInstance");
 
     for (uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; nInputElt += 1)
@@ -506,23 +542,25 @@
             continue;
         }
 
-        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
+        const SWR_FORMAT_INFO& info = GetFormatInfo((SWR_FORMAT)ied.Format);
         SWR_ASSERT((info.bpp != 0), "Unsupported format in JitGatherVertices.");
-        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
+        uint32_t bpc =
+            info.bpp /
+            info.numComps; ///@todo Code below assumes all components are same size. Need to fix.
 
-        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
+        Value* stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_xpData});
 
         // VGATHER* takes an *i8 src pointer
-        Value *pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
+        Value* pStreamBase = INT_TO_PTR(stream, PointerType::get(mInt8Ty, 0));
 
-        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
-        Value *vStride = VBROADCAST(stride);
+        Value* stride  = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
+        Value* vStride = VBROADCAST(stride);
 
         // max vertex index that is fully in bounds
-        Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
-        maxVertex = LOAD(maxVertex);
+        Value* maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
+        maxVertex        = LOAD(maxVertex);
 
-        Value *minVertex = NULL;
+        Value* minVertex = NULL;
         if (fetchState.bPartialVertexBuffer)
         {
             // min vertex index for low bounds OOB checking
@@ -536,9 +574,9 @@
             curInstance = ADD(curInstance, startInstance);
         }
 
-        Value *vCurIndices;
-        Value *startOffset;
-        Value *vInstanceStride = VIMMED1(0);
+        Value* vCurIndices;
+        Value* startOffset;
+        Value* vInstanceStride = VIMMED1(0);
 
         if (ied.InstanceEnable)
         {
@@ -546,7 +584,7 @@
 
             // prevent a div by 0 for 0 step rate
             Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
-            stepRate = SELECT(isNonZeroStep, stepRate, C(1));
+            stepRate             = SELECT(isNonZeroStep, stepRate, C(1));
 
             // calc the current offset into instanced data buffer
             Value* calcInstance = UDIV(curInstance, stepRate);
@@ -559,7 +597,8 @@
         }
         else if (ied.InstanceStrideEnable)
         {
-            // grab the instance advancement state, determines stride in bytes from one instance to the next
+            // grab the instance advancement state, determines stride in bytes from one instance to
+            // the next
             Value* stepRate = C(ied.InstanceAdvancementState);
             vInstanceStride = VBROADCAST(MUL(curInstance, stepRate));
 
@@ -576,16 +615,16 @@
             startOffset = startVertex;
         }
 
-        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to 
+        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
         // do 64bit address offset calculations.
 
         // calculate byte offset to the start of the VB
-        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
-        pStreamBase = GEP(pStreamBase, baseOffset);
+        Value* baseOffset     = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
+        pStreamBase           = GEP(pStreamBase, baseOffset);
         Value* pStreamBaseGFX = ADD(stream, baseOffset);
 
         // if we have a start offset, subtract from max vertex. Used for OOB check
-        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
+        maxVertex     = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
         Value* maxNeg = ICMP_SLT(maxVertex, C((int64_t)0));
         // if we have a negative value, we're already OOB. clamp at 0.
         maxVertex = SELECT(maxNeg, C(0), TRUNC(maxVertex, mInt32Ty));
@@ -593,38 +632,39 @@
         if (fetchState.bPartialVertexBuffer)
         {
             // similary for min vertex
-            minVertex = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
-            Value *minNeg = ICMP_SLT(minVertex, C((int64_t)0));
-            minVertex = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
+            minVertex     = SUB(Z_EXT(minVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
+            Value* minNeg = ICMP_SLT(minVertex, C((int64_t)0));
+            minVertex     = SELECT(minNeg, C(0), TRUNC(minVertex, mInt32Ty));
         }
 
         // Load the in bounds size of a partially valid vertex
-        Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
-        partialInboundsSize = LOAD(partialInboundsSize);
-        Value *vPartialVertexSize = VBROADCAST(partialInboundsSize);
-        Value *vBpp = VBROADCAST(C(info.Bpp));
-        Value *vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
+        Value* partialInboundsSize =
+            GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
+        partialInboundsSize       = LOAD(partialInboundsSize);
+        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
+        Value* vBpp               = VBROADCAST(C(info.Bpp));
+        Value* vAlignmentOffsets  = VBROADCAST(C(ied.AlignedByteOffset));
 
         // is the element is <= the partially valid size
-        Value *vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
+        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
 
         // override cur indices with 0 if pitch is 0
         Value* pZeroPitchMask = ICMP_EQ(vStride, VIMMED1(0));
-        vCurIndices = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
+        vCurIndices           = SELECT(pZeroPitchMask, VIMMED1(0), vCurIndices);
 
         // are vertices partially OOB?
-        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vMaxVertex      = VBROADCAST(maxVertex);
         Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
 
         // are vertices fully in bounds?
         Value* vMaxGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
 
-        Value *vGatherMask;
+        Value* vGatherMask;
         if (fetchState.bPartialVertexBuffer)
         {
             // are vertices below minVertex limit?
-            Value *vMinVertex = VBROADCAST(minVertex);
-            Value *vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
+            Value* vMinVertex     = VBROADCAST(minVertex);
+            Value* vMinGatherMask = ICMP_UGE(vCurIndices, vMinVertex);
 
             // only fetch lanes that pass both tests
             vGatherMask = AND(vMaxGatherMask, vMinGatherMask);
@@ -639,23 +679,26 @@
 
         // calculate the actual offsets into the VB
         Value* vOffsets = MUL(vCurIndices, vStride);
-        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+        vOffsets        = ADD(vOffsets, vAlignmentOffsets);
 
         // if instance stride enable is:
         //  true  - add product of the instanceID and advancement state to the offst into the VB
         //  false - value of vInstanceStride has been initialialized to zero
         vOffsets = ADD(vOffsets, vInstanceStride);
 
-        // Packing and component control 
-        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
-        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1, 
-                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3}; 
+        // Packing and component control
+        ComponentEnable        compMask = (ComponentEnable)ied.ComponentPacking;
+        const ComponentControl compCtrl[4]{(ComponentControl)ied.ComponentControl0,
+                                           (ComponentControl)ied.ComponentControl1,
+                                           (ComponentControl)ied.ComponentControl2,
+                                           (ComponentControl)ied.ComponentControl3};
 
         // Special gather/conversion for formats without equal component sizes
         if (IsOddFormat((SWR_FORMAT)ied.Format))
         {
-            Value *pResults[4];
-            CreateGatherOddFormats((SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
+            Value* pResults[4];
+            CreateGatherOddFormats(
+                (SWR_FORMAT)ied.Format, vGatherMask, pStreamBase, vOffsets, pResults);
             ConvertFormat((SWR_FORMAT)ied.Format, pResults);
 
             for (uint32_t c = 0; c < 4; c += 1)
@@ -672,193 +715,214 @@
                 }
             }
         }
-        else if(info.type[0] == SWR_TYPE_FLOAT)
+        else if (info.type[0] == SWR_TYPE_FLOAT)
         {
             ///@todo: support 64 bit vb accesses
-            Value *gatherSrc = VIMMED1(0.0f);
+            Value* gatherSrc = VIMMED1(0.0f);
 
-            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
-                "Unsupported format for standard gather fetch.");
+            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
+                       "Unsupported format for standard gather fetch.");
 
             // Gather components from memory to store in a simdvertex structure
             switch (bpc)
             {
-                case 16:
+            case 16:
+            {
+                Value* vGatherResult[2];
+
+                // if we have at least one component out of x or y to fetch
+                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                 {
-                    Value *vGatherResult[2];
-
-                    // if we have at least one component out of x or y to fetch
-                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-                    {
-                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-                        // e.g. result of first 8x32bit integer gather for 16bit components
-                        // 256i - 0    1    2    3    4    5    6    7
-                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-                        //
-                    }
-
-                    // if we have at least one component out of z or w to fetch
-                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-                    {
-                        // offset base to the next components(zw) in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)4));
-
-                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-                        // e.g. result of second 8x32bit integer gather for 16bit components
-                        // 256i - 0    1    2    3    4    5    6    7
-                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
-                        //
-                    }
-
-                    // if we have at least one component to shuffle into place
-                    if (compMask)
-                    {
-                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
-                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
-
-                        // Shuffle gathered components into place in simdvertex struct
-                        mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args);  // outputs to vVertexElements ref
-                    }
+                    vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+                    // e.g. result of first 8x32bit integer gather for 16bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                    //
                 }
-                    break;
-                case 32:
+
+                // if we have at least one component out of z or w to fetch
+                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                 {
-                    for (uint32_t i = 0; i < 4; i += 1)
+                    // offset base to the next components(zw) in the vertex to gather
+                    pStreamBase = GEP(pStreamBase, C((char)4));
+
+                    vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+                    // e.g. result of second 8x32bit integer gather for 16bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                    //
+                }
+
+                // if we have at least one component to shuffle into place
+                if (compMask)
+                {
+                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
+                                                                  pVtxOut,
+                                                                  Instruction::CastOps::FPExt,
+                                                                  CONVERT_NONE,
+                                                                  currentVertexElement,
+                                                                  outputElt,
+                                                                  compMask,
+                                                                  compCtrl,
+                                                                  vVertexElements);
+
+                    // Shuffle gathered components into place in simdvertex struct
+                    mVWidth == 16 ? Shuffle16bpcGather16(args)
+                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
+                }
+            }
+            break;
+            case 32:
+            {
+                for (uint32_t i = 0; i < 4; i += 1)
+                {
+                    if (isComponentEnabled(compMask, i))
                     {
-                        if (isComponentEnabled(compMask, i))
+                        // if we need to gather the component
+                        if (compCtrl[i] == StoreSrc)
                         {
-                            // if we need to gather the component
-                            if (compCtrl[i] == StoreSrc)
+                            // Gather a SIMD of vertices
+                            // APIs allow a 4GB range for offsets
+                            // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
+                            // But, we know that elements must be aligned for FETCH. :)
+                            // Right shift the offset by a bit and then scale by 2 to remove the
+                            // sign extension.
+                            Value* vShiftedOffsets = LSHR(vOffsets, 1);
+                            vVertexElements[currentVertexElement++] =
+                                GATHERPS(gatherSrc,
+                                         pStreamBaseGFX,
+                                         vShiftedOffsets,
+                                         vGatherMask,
+                                         2,
+                                         GFX_MEM_CLIENT_FETCH);
+                        }
+                        else
+                        {
+                            vVertexElements[currentVertexElement++] =
+                                GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if (currentVertexElement > 3)
+                        {
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+                    }
+
+                    // offset base to the next component in the vertex to gather
+                    pStreamBase    = GEP(pStreamBase, C((char)4));
+                    pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
+                }
+            }
+            break;
+            case 64:
+            {
+                for (uint32_t i = 0; i < 4; i += 1)
+                {
+                    if (isComponentEnabled(compMask, i))
+                    {
+                        // if we need to gather the component
+                        if (compCtrl[i] == StoreSrc)
+                        {
+                            Value* vShufLo;
+                            Value* vShufHi;
+                            Value* vShufAll;
+
+                            if (mVWidth == 8)
                             {
-                                // Gather a SIMD of vertices
-                                // APIs allow a 4GB range for offsets
-                                // However, GATHERPS uses signed 32-bit offsets, so only a 2GB range :(
-                                // But, we know that elements must be aligned for FETCH. :)
-                                // Right shift the offset by a bit and then scale by 2 to remove the sign extension.
-                                Value *vShiftedOffsets = LSHR(vOffsets, 1);
-                                vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBaseGFX, vShiftedOffsets, vGatherMask, 2, GFX_MEM_CLIENT_FETCH);
+                                vShufLo  = C({0, 1, 2, 3});
+                                vShufHi  = C({4, 5, 6, 7});
+                                vShufAll = C({0, 1, 2, 3, 4, 5, 6, 7});
                             }
                             else
                             {
-                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                                SWR_ASSERT(mVWidth == 16);
+                                vShufLo = C({0, 1, 2, 3, 4, 5, 6, 7});
+                                vShufHi = C({8, 9, 10, 11, 12, 13, 14, 15});
+                                vShufAll =
+                                    C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
                             }
 
-                            if (currentVertexElement > 3)
-                            {
-                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                                // reset to the next vVertexElement to output
-                                currentVertexElement = 0;
-                            }
+                            Value* vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
+                            Value* vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
+
+                            Value* vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
+                            Value* vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
+
+                            Value* vZeroDouble = VECTOR_SPLAT(
+                                mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
+
+                            Value* pGatherLo =
+                                GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
+                            Value* pGatherHi =
+                                GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
+
+                            pGatherLo = VCVTPD2PS(pGatherLo);
+                            pGatherHi = VCVTPD2PS(pGatherHi);
+
+                            Value* pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
+
+                            vVertexElements[currentVertexElement++] = pGather;
                         }
-
-                        // offset base to the next component in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)4));
-                        pStreamBaseGFX = ADD(pStreamBaseGFX, C((int64_t)4));
-                    }
-                }
-                    break;
-                case 64:
-                {
-                    for (uint32_t i = 0; i < 4; i += 1)
-                    {
-                        if (isComponentEnabled(compMask, i))
+                        else
                         {
-                            // if we need to gather the component
-                            if (compCtrl[i] == StoreSrc)
-                            {
-                                Value* vShufLo;
-                                Value* vShufHi;
-                                Value* vShufAll;
-
-                                if (mVWidth == 8)
-                                {
-                                    vShufLo = C({ 0, 1, 2, 3 });
-                                    vShufHi = C({ 4, 5, 6, 7 });
-                                    vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
-                                }
-                                else
-                                {
-                                    SWR_ASSERT(mVWidth == 16);
-                                    vShufLo = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
-                                    vShufHi = C({ 8, 9, 10, 11, 12, 13, 14, 15 });
-                                    vShufAll = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
-                                }
-
-                                Value *vMaskLo = VSHUFFLE(vGatherMask, vGatherMask, vShufLo);
-                                Value *vMaskHi = VSHUFFLE(vGatherMask, vGatherMask, vShufHi);
-
-                                Value *vOffsetsLo = VSHUFFLE(vOffsets, vOffsets, vShufLo);
-                                Value *vOffsetsHi = VSHUFFLE(vOffsets, vOffsets, vShufHi);
-
-                                Value *vZeroDouble = VECTOR_SPLAT(mVWidth / 2, ConstantFP::get(IRB()->getDoubleTy(), 0.0f));
-
-                                Value* pGatherLo = GATHERPD(vZeroDouble, pStreamBase, vOffsetsLo, vMaskLo);
-                                Value* pGatherHi = GATHERPD(vZeroDouble, pStreamBase, vOffsetsHi, vMaskHi);
-
-                                pGatherLo = VCVTPD2PS(pGatherLo);
-                                pGatherHi = VCVTPD2PS(pGatherHi);
-
-                                Value *pGather = VSHUFFLE(pGatherLo, pGatherHi, vShufAll);
-
-                                vVertexElements[currentVertexElement++] = pGather;
-                            }
-                            else
-                            {
-                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
-                            }
-
-                            if (currentVertexElement > 3)
-                            {
-                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
-                                // reset to the next vVertexElement to output
-                                currentVertexElement = 0;
-                            }
+                            vVertexElements[currentVertexElement++] =
+                                GenerateCompCtrlVector(compCtrl[i]);
                         }
 
-                        // offset base to the next component  in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)8));
+                        if (currentVertexElement > 3)
+                        {
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
                     }
+
+                    // offset base to the next component  in the vertex to gather
+                    pStreamBase = GEP(pStreamBase, C((char)8));
                 }
-                    break;
-                default:
-                    SWR_INVALID("Tried to fetch invalid FP format");
-                    break;
+            }
+            break;
+            default:
+                SWR_INVALID("Tried to fetch invalid FP format");
+                break;
             }
         }
         else
         {
             Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
-            ConversionType conversionType = CONVERT_NONE;
+            ConversionType       conversionType = CONVERT_NONE;
 
-            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format), 
-                "Unsupported format for standard gather fetch.");
+            SWR_ASSERT(IsUniformFormat((SWR_FORMAT)ied.Format),
+                       "Unsupported format for standard gather fetch.");
 
-            switch(info.type[0])
+            switch (info.type[0])
             {
-                case SWR_TYPE_UNORM: 
-                    conversionType = CONVERT_NORMALIZED;
-                case SWR_TYPE_UINT:
-                    extendCastType = Instruction::CastOps::ZExt;
-                    break;
-                case SWR_TYPE_SNORM:
-                    conversionType = CONVERT_NORMALIZED;
-                case SWR_TYPE_SINT:
-                    extendCastType = Instruction::CastOps::SExt;
-                    break;
-                case SWR_TYPE_USCALED:
-                    conversionType = CONVERT_USCALED;
-                    extendCastType = Instruction::CastOps::UIToFP;
-                    break;
-                case SWR_TYPE_SSCALED:
-                    conversionType = CONVERT_SSCALED;
-                    extendCastType = Instruction::CastOps::SIToFP;
-                    break;
-                case SWR_TYPE_SFIXED:
-                    conversionType = CONVERT_SFIXED;
-                    extendCastType = Instruction::CastOps::SExt;
-                    break;
-                default:
-                    break;
+            case SWR_TYPE_UNORM:
+                conversionType = CONVERT_NORMALIZED;
+            case SWR_TYPE_UINT:
+                extendCastType = Instruction::CastOps::ZExt;
+                break;
+            case SWR_TYPE_SNORM:
+                conversionType = CONVERT_NORMALIZED;
+            case SWR_TYPE_SINT:
+                extendCastType = Instruction::CastOps::SExt;
+                break;
+            case SWR_TYPE_USCALED:
+                conversionType = CONVERT_USCALED;
+                extendCastType = Instruction::CastOps::UIToFP;
+                break;
+            case SWR_TYPE_SSCALED:
+                conversionType = CONVERT_SSCALED;
+                extendCastType = Instruction::CastOps::SIToFP;
+                break;
+            case SWR_TYPE_SFIXED:
+                conversionType = CONVERT_SFIXED;
+                extendCastType = Instruction::CastOps::SExt;
+                break;
+            default:
+                break;
             }
 
             // value substituted when component of gather is masked
@@ -867,113 +931,132 @@
             // Gather components from memory to store in a simdvertex structure
             switch (bpc)
             {
-                case 8:
+            case 8:
+            {
+                // if we have at least one component to fetch
+                if (compMask)
                 {
-                    // if we have at least one component to fetch
-                    if (compMask)
-                    {
-                        Value *vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-                        // e.g. result of an 8x32bit integer gather for 8bit components
-                        // 256i - 0    1    2    3    4    5    6    7
-                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw 
+                    Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+                    // e.g. result of an 8x32bit integer gather for 8bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
 
-                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
-                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                    Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult,
+                                                                 pVtxOut,
+                                                                 extendCastType,
+                                                                 conversionType,
+                                                                 currentVertexElement,
+                                                                 outputElt,
+                                                                 compMask,
+                                                                 compCtrl,
+                                                                 vVertexElements,
+                                                                 info.swizzle);
 
-                        // Shuffle gathered components into place in simdvertex struct
-                        mVWidth == 16 ? Shuffle8bpcGatherd16(args) : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
-                    }
+                    // Shuffle gathered components into place in simdvertex struct
+                    mVWidth == 16 ? Shuffle8bpcGatherd16(args)
+                                  : Shuffle8bpcGatherd(args); // outputs to vVertexElements ref
                 }
-                break;
-                case 16:
+            }
+            break;
+            case 16:
+            {
+                Value* vGatherResult[2];
+
+                // if we have at least one component out of x or y to fetch
+                if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
                 {
-                    Value *vGatherResult[2];
-
-                    // if we have at least one component out of x or y to fetch
-                    if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
-                    {
-                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-                        // e.g. result of first 8x32bit integer gather for 16bit components
-                        // 256i - 0    1    2    3    4    5    6    7
-                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
-                        //
-                    }
-
-                    // if we have at least one component out of z or w to fetch
-                    if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
-                    {
-                        // offset base to the next components(zw) in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)4));
-
-                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-                        // e.g. result of second 8x32bit integer gather for 16bit components
-                        // 256i - 0    1    2    3    4    5    6    7
-                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw 
-                        //
-                    }
-
-                    // if we have at least one component to shuffle into place
-                    if (compMask)
-                    {
-                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
-                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
-
-                        // Shuffle gathered components into place in simdvertex struct
-                        mVWidth == 16 ? Shuffle16bpcGather16(args) : Shuffle16bpcGather(args); // outputs to vVertexElements ref
-                    }
+                    vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+                    // e.g. result of first 8x32bit integer gather for 16bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                    //
                 }
-                break;
-                case 32:
+
+                // if we have at least one component out of z or w to fetch
+                if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
                 {
-                    // Gathered components into place in simdvertex struct
-                    for (uint32_t i = 0; i < 4; i++)
+                    // offset base to the next components(zw) in the vertex to gather
+                    pStreamBase = GEP(pStreamBase, C((char)4));
+
+                    vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+                    // e.g. result of second 8x32bit integer gather for 16bit components
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                    //
+                }
+
+                // if we have at least one component to shuffle into place
+                if (compMask)
+                {
+                    Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult,
+                                                                  pVtxOut,
+                                                                  extendCastType,
+                                                                  conversionType,
+                                                                  currentVertexElement,
+                                                                  outputElt,
+                                                                  compMask,
+                                                                  compCtrl,
+                                                                  vVertexElements);
+
+                    // Shuffle gathered components into place in simdvertex struct
+                    mVWidth == 16 ? Shuffle16bpcGather16(args)
+                                  : Shuffle16bpcGather(args); // outputs to vVertexElements ref
+                }
+            }
+            break;
+            case 32:
+            {
+                // Gathered components into place in simdvertex struct
+                for (uint32_t i = 0; i < 4; i++)
+                {
+                    if (isComponentEnabled(compMask, i))
                     {
-                        if (isComponentEnabled(compMask, i))
+                        // if we need to gather the component
+                        if (compCtrl[i] == StoreSrc)
                         {
-                            // if we need to gather the component
-                            if (compCtrl[i] == StoreSrc)
+                            Value* pGather =
+                                GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
+
+                            if (conversionType == CONVERT_USCALED)
                             {
-                                Value* pGather = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask);
-
-                                if (conversionType == CONVERT_USCALED)
-                                {
-                                    pGather = UI_TO_FP(pGather, mSimdFP32Ty);
-                                }
-                                else if (conversionType == CONVERT_SSCALED)
-                                {
-                                    pGather = SI_TO_FP(pGather, mSimdFP32Ty);
-                                }
-                                else if (conversionType == CONVERT_SFIXED)
-                                {
-                                    pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty), VBROADCAST(C(1/65536.0f)));
-                                }
-
-                                vVertexElements[currentVertexElement++] = pGather;
-
-                                // e.g. result of a single 8x32bit integer gather for 32bit components
-                                // 256i - 0    1    2    3    4    5    6    7
-                                //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx 
+                                pGather = UI_TO_FP(pGather, mSimdFP32Ty);
                             }
-                            else
+                            else if (conversionType == CONVERT_SSCALED)
                             {
-                                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                                pGather = SI_TO_FP(pGather, mSimdFP32Ty);
+                            }
+                            else if (conversionType == CONVERT_SFIXED)
+                            {
+                                pGather = FMUL(SI_TO_FP(pGather, mSimdFP32Ty),
+                                               VBROADCAST(C(1 / 65536.0f)));
                             }
 
-                            if (currentVertexElement > 3)
-                            {
-                                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            vVertexElements[currentVertexElement++] = pGather;
 
-                                // reset to the next vVertexElement to output
-                                currentVertexElement = 0;
-                            }
-
+                            // e.g. result of a single 8x32bit integer gather for 32bit components
+                            // 256i - 0    1    2    3    4    5    6    7
+                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
+                        }
+                        else
+                        {
+                            vVertexElements[currentVertexElement++] =
+                                GenerateCompCtrlVector(compCtrl[i]);
                         }
 
-                        // offset base to the next component  in the vertex to gather
-                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        if (currentVertexElement > 3)
+                        {
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
                     }
+
+                    // offset base to the next component  in the vertex to gather
+                    pStreamBase = GEP(pStreamBase, C((char)4));
                 }
-                break;
+            }
+            break;
             }
         }
     }
@@ -985,17 +1068,16 @@
     }
 }
 
-typedef void*(*PFN_TRANSLATEGFXADDRESS_FUNC)(void* pdc, gfxptr_t va);
-extern "C" void GetSimdValid8bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
-extern "C" void GetSimdValid16bitIndicesGfx(gfxptr_t indices, gfxptr_t lastIndex, uint32_t vWidth, PFN_TRANSLATEGFXADDRESS_FUNC pfnTranslate, void* pdc, uint32_t* outIndices);
-
-template<typename T> Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
+template <typename T>
+Value* FetchJit::GetSimdValidIndicesHelper(Value* pIndices, Value* pLastIndex)
 {
-    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty, "Function expects gfxptr_t for both input parameters.");
+    SWR_ASSERT(pIndices->getType() == mInt64Ty && pLastIndex->getType() == mInt64Ty,
+               "Function expects gfxptr_t for both input parameters.");
 
     Type* Ty = nullptr;
 
-    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t), "Unsupported type for use with GetSimdValidIndicesHelper<T>");
+    static_assert(sizeof(T) == sizeof(uint16_t) || sizeof(T) == sizeof(uint8_t),
+                  "Unsupported type for use with GetSimdValidIndicesHelper<T>");
     constexpr bool bSize = (sizeof(T) == sizeof(uint16_t));
     if (bSize)
     {
@@ -1014,26 +1096,26 @@
 
     {
         // store 0 index on stack to be used to conditionally load from if index address is OOB
-        Value* pZeroIndex = ALLOCA(Ty);
+        Value* pZeroIndex = ALLOCA(Ty->getPointerElementType());
         STORE(C((T)0), pZeroIndex);
 
         // Load a SIMD of index pointers
         for (int64_t lane = 0; lane < mVWidth; lane++)
         {
             // Calculate the address of the requested index
-            Value *pIndex = GEP(pIndices, C(lane), Ty);
+            Value* pIndex = GEP(pIndices, C(lane), Ty);
 
             pLastIndex = INT_TO_PTR(pLastIndex, Ty);
 
-            // check if the address is less than the max index, 
+            // check if the address is less than the max index,
             Value* mask = ICMP_ULT(pIndex, pLastIndex);
 
             // if valid, load the index. if not, load 0 from the stack
             Value* pValid = SELECT(mask, pIndex, pZeroIndex);
-            Value *index = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
+            Value* index  = LOAD(pValid, "valid index", Ty, GFX_MEM_CLIENT_FETCH);
 
             // zero extended index to 32 bits and insert into the correct simd lane
-            index = Z_EXT(index, mInt32Ty);
+            index    = Z_EXT(index, mInt32Ty);
             vIndices = VINSERT(vIndices, index, lane);
         }
     }
@@ -1070,24 +1152,23 @@
 Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
 {
     DataLayout dL(JM()->mpCurrentModule);
-    unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
-    Value* iLastIndex = pLastIndex; 
-    Value* iIndices = pIndices;
+    Value*     iLastIndex = pLastIndex;
+    Value*     iIndices   = pIndices;
 
     // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
-    Value* numIndicesLeft = SUB(iLastIndex,iIndices);
-    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
-    numIndicesLeft = SDIV(numIndicesLeft, C(4));
+    Value* numIndicesLeft = SUB(iLastIndex, iIndices);
+    numIndicesLeft        = TRUNC(numIndicesLeft, mInt32Ty);
+    numIndicesLeft        = SDIV(numIndicesLeft, C(4));
 
     // create a vector of index counts from the base index ptr passed into the fetch
     Constant* vIndexOffsets;
     if (mVWidth == 8)
     {
-        vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7 });
+        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7});
     }
     else
     {
-        vIndexOffsets = C({ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 });
+        vIndexOffsets = C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15});
     }
 
     // compare index count to the max valid index
@@ -1096,16 +1177,22 @@
     //     ------------------------------
     //     vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
     //     vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
-    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
+    Value* vMaxIndex  = VBROADCAST(numIndicesLeft);
     Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
 
     // Load the indices; OOB loads 0
-    return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0), "vIndices", PointerType::get(mSimdInt32Ty, 0), GFX_MEM_CLIENT_FETCH);
+    return MASKED_LOAD(pIndices,
+                       4,
+                       vIndexMask,
+                       VIMMED1(0),
+                       "vIndices",
+                       PointerType::get(mSimdInt32Ty, 0),
+                       GFX_MEM_CLIENT_FETCH);
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends, 
-/// denormalizes if needed, converts to F32 if needed, and positions in 
+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
 //  the proper SIMD rows to be output to the simdvertex structure
 /// @param args: (tuple of args, listed below)
 ///   @param vGatherResult - 8 gathered 8bpc vertices
@@ -1118,60 +1205,67 @@
 ///   @param compCtrl - component control val
 ///   @param vVertexElements[4] - vertex components to output
 ///   @param swizzle[4] - component swizzle location
-void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
+void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs& args)
 {
     // Unpack tuple args
-    Value*& vGatherResult = std::get<0>(args);
-    Value* pVtxOut = std::get<1>(args);
-    const Instruction::CastOps extendType = std::get<2>(args);
-    const ConversionType conversionType = std::get<3>(args);
-    uint32_t &currentVertexElement = std::get<4>(args);
-    uint32_t &outputElt = std::get<5>(args);
-    const ComponentEnable compMask = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
-    Value* (&vVertexElements)[4] = std::get<8>(args);
-    const uint32_t(&swizzle)[4] = std::get<9>(args);
+    Value*&                    vGatherResult        = std::get<0>(args);
+    Value*                     pVtxOut              = std::get<1>(args);
+    const Instruction::CastOps extendType           = std::get<2>(args);
+    const ConversionType       conversionType       = std::get<3>(args);
+    uint32_t&                  currentVertexElement = std::get<4>(args);
+    uint32_t&                  outputElt            = std::get<5>(args);
+    const ComponentEnable      compMask             = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
+    Value*(&vVertexElements)[4]                     = std::get<8>(args);
+    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
 
     // cast types
-    Type *vGatherTy = VectorType::get(mInt32Ty, 8);
-    Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
+    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
+    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP))
     {
-        Type *v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
-        Type *v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
+        Type* v16x8Ty = VectorType::get(mInt8Ty, 16); // 8x16bit ints in a 128bit lane
+        Type* v128Ty  = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
 
         // shuffle mask, including any swizzling
-        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
-        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
-        Value *vConstMask = C<char>({ char(x), char(x + 4), char(x + 8), char(x + 12),
-            char(y), char(y + 4), char(y + 8), char(y + 12),
-            char(z), char(z + 4), char(z + 8), char(z + 12),
-            char(w), char(w + 4), char(w + 8), char(w + 12),
-            char(x), char(x + 4), char(x + 8), char(x + 12),
-            char(y), char(y + 4), char(y + 8), char(y + 12),
-            char(z), char(z + 4), char(z + 8), char(z + 12),
-            char(w), char(w + 4), char(w + 8), char(w + 12) });
+        const char x          = (char)swizzle[0];
+        const char y          = (char)swizzle[1];
+        const char z          = (char)swizzle[2];
+        const char w          = (char)swizzle[3];
+        Value*     vConstMask = C<char>(
+            {char(x),     char(x + 4),  char(x + 8), char(x + 12), char(y),     char(y + 4),
+             char(y + 8), char(y + 12), char(z),     char(z + 4),  char(z + 8), char(z + 12),
+             char(w),     char(w + 4),  char(w + 8), char(w + 12), char(x),     char(x + 4),
+             char(x + 8), char(x + 12), char(y),     char(y + 4),  char(y + 8), char(y + 12),
+             char(z),     char(z + 4),  char(z + 8), char(z + 12), char(w),     char(w + 4),
+             char(w + 8), char(w + 12)});
 
         // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
 
-        Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
-        Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
+        Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
+        Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
 
-        Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
-        Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+        Value* vShufResult_lo =
+            BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+        Value* vShufResult_hi =
+            BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
 
         // after pshufb: group components together in each 128bit lane
         // 256i - 0    1    2    3    4    5    6    7
         //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
 
-        Value *vi128XY_lo = nullptr;
-        Value *vi128XY_hi = nullptr;
+        Value* vi128XY_lo = nullptr;
+        Value* vi128XY_hi = nullptr;
         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
         {
-            vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
-            vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_lo = BITCAST(
+                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
+                v128Ty);
+            vi128XY_hi = BITCAST(
+                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})),
+                v128Ty);
 
             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
@@ -1179,26 +1273,30 @@
         }
 
         // do the same for zw components
-        Value *vi128ZW_lo = nullptr;
-        Value *vi128ZW_hi = nullptr;
+        Value* vi128ZW_lo = nullptr;
+        Value* vi128ZW_hi = nullptr;
         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
         {
-            vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
-            vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_lo = BITCAST(
+                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
+                v128Ty);
+            vi128ZW_hi = BITCAST(
+                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})),
+                v128Ty);
         }
 
         // init denormalize variables if needed
         Instruction::CastOps fpCast;
-        Value *conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            fpCast = Instruction::CastOps::SIToFP;
+            fpCast           = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 127.0));
             break;
         case CONVERT_SSCALED:
-            fpCast = Instruction::CastOps::SIToFP;
+            fpCast           = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0));
             break;
         case CONVERT_USCALED:
@@ -1211,7 +1309,8 @@
             break;
         }
 
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+        // sign extend all enabled components. If we have a fill vVertexElements, output to current
+        // simdvertex
         for (uint32_t i = 0; i < 4; i++)
         {
             if (isComponentEnabled(compMask, i))
@@ -1221,12 +1320,14 @@
                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                     // if x or y, use vi128XY permute result, else use vi128ZW
-                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
-                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
 
                     // sign extend
-                    Value *temp_lo = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
-                    Value *temp_hi = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
+                    Value* temp_lo =
+                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v16x8Ty));
+                    Value* temp_hi =
+                        PMOVSXBD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v16x8Ty));
 
                     Value* temp = JOIN_16(temp_lo, temp_hi);
 
@@ -1255,20 +1356,21 @@
         }
     }
     // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    else if ((extendType == Instruction::CastOps::ZExt) ||
+             (extendType == Instruction::CastOps::UIToFP))
     {
         // init denormalize variables if needed
         Instruction::CastOps fpCast;
-        Value *conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 255.0));
             break;
         case CONVERT_USCALED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0));
             break;
         case CONVERT_SSCALED:
@@ -1289,43 +1391,49 @@
                 if (compCtrl[i] == ComponentControl::StoreSrc)
                 {
                     // pshufb masks for each component
-                    Value *vConstMask;
+                    Value* vConstMask;
                     switch (swizzle[i])
                     {
                     case 0:
                         // x shuffle mask
-                        vConstMask = C<char>({ 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
-                            0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1 });
+                        vConstMask =
+                            C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                                     0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
                         break;
                     case 1:
                         // y shuffle mask
-                        vConstMask = C<char>({ 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
-                            1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1 });
+                        vConstMask =
+                            C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                                     1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
                         break;
                     case 2:
                         // z shuffle mask
-                        vConstMask = C<char>({ 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
-                            2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1 });
+                        vConstMask =
+                            C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                                     2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
                         break;
                     case 3:
                         // w shuffle mask
-                        vConstMask = C<char>({ 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
-                            3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1 });
+                        vConstMask =
+                            C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                                     3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
                         break;
                     default:
                         vConstMask = nullptr;
                         break;
                     }
 
-                    Value *vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
-                    Value *vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
+                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult, 0);
+                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult, 1);
 
-                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
-                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
+                    Value* temp_lo =
+                        BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
+                    Value* temp_hi =
+                        BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
 
                     // after pshufb for x channel
                     // 256i - 0    1    2    3    4    5    6    7
-                    //        x000 x000 x000 x000 x000 x000 x000 x000 
+                    //        x000 x000 x000 x000 x000 x000 x000 x000
 
                     Value* temp = JOIN_16(temp_lo, temp_hi);
 
@@ -1359,19 +1467,19 @@
     }
 }
 
-void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs& args)
 {
     // Unpack tuple args
-    Value*& vGatherResult = std::get<0>(args);
-    Value* pVtxOut = std::get<1>(args);
-    const Instruction::CastOps extendType = std::get<2>(args);
-    const ConversionType conversionType = std::get<3>(args);
-    uint32_t &currentVertexElement = std::get<4>(args);
-    uint32_t &outputElt = std::get<5>(args);
-    const ComponentEnable compMask = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
-    Value* (&vVertexElements)[4] = std::get<8>(args);
-    const uint32_t(&swizzle)[4] = std::get<9>(args);
+    Value*&                    vGatherResult        = std::get<0>(args);
+    Value*                     pVtxOut              = std::get<1>(args);
+    const Instruction::CastOps extendType           = std::get<2>(args);
+    const ConversionType       conversionType       = std::get<3>(args);
+    uint32_t&                  currentVertexElement = std::get<4>(args);
+    uint32_t&                  outputElt            = std::get<5>(args);
+    const ComponentEnable      compMask             = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
+    Value*(&vVertexElements)[4]                     = std::get<8>(args);
+    const uint32_t(&swizzle)[4]                     = std::get<9>(args);
 
     // cast types
     Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
@@ -1384,18 +1492,19 @@
         if (compCtrl[i] == ComponentControl::StoreSrc)
         {
             std::vector<uint32_t> vShuffleMasks[4] = {
-                { 0, 4,  8, 12, 16, 20, 24, 28 }, // x
-                { 1, 5,  9, 13, 17, 21, 25, 29 }, // y
-                { 2, 6, 10, 14, 18, 22, 26, 30 }, // z
-                { 3, 7, 11, 15, 19, 23, 27, 31 }, // w
+                {0, 4, 8, 12, 16, 20, 24, 28},  // x
+                {1, 5, 9, 13, 17, 21, 25, 29},  // y
+                {2, 6, 10, 14, 18, 22, 26, 30}, // z
+                {3, 7, 11, 15, 19, 23, 27, 31}, // w
             };
 
-            Value *val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
-                UndefValue::get(v32x8Ty),
-                vShuffleMasks[swizzle[i]]);
+            Value* val = VSHUFFLE(BITCAST(vGatherResult, v32x8Ty),
+                                  UndefValue::get(v32x8Ty),
+                                  vShuffleMasks[swizzle[i]]);
 
             if ((extendType == Instruction::CastOps::SExt) ||
-                (extendType == Instruction::CastOps::SIToFP)) {
+                (extendType == Instruction::CastOps::SIToFP))
+            {
                 switch (conversionType)
                 {
                 case CONVERT_NORMALIZED:
@@ -1414,7 +1523,8 @@
                 }
             }
             else if ((extendType == Instruction::CastOps::ZExt) ||
-                (extendType == Instruction::CastOps::UIToFP)) {
+                     (extendType == Instruction::CastOps::UIToFP))
+            {
                 switch (conversionType)
                 {
                 case CONVERT_NORMALIZED:
@@ -1454,8 +1564,8 @@
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends, 
-/// denormalizes if needed, converts to F32 if needed, and positions in 
+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
 //  the proper SIMD rows to be output to the simdvertex structure
 /// @param args: (tuple of args, listed below)
 ///   @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
@@ -1467,53 +1577,59 @@
 ///   @param compMask - component packing mask
 ///   @param compCtrl - component control val
 ///   @param vVertexElements[4] - vertex components to output
-void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
+void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs& args)
 {
     // Unpack tuple args
-    Value* (&vGatherResult)[2] = std::get<0>(args);
-    Value* pVtxOut = std::get<1>(args);
-    const Instruction::CastOps extendType = std::get<2>(args);
-    const ConversionType conversionType = std::get<3>(args);
-    uint32_t &currentVertexElement = std::get<4>(args);
-    uint32_t &outputElt = std::get<5>(args);
-    const ComponentEnable compMask = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
-    Value* (&vVertexElements)[4] = std::get<8>(args);
+    Value*(&vGatherResult)[2]                       = std::get<0>(args);
+    Value*                     pVtxOut              = std::get<1>(args);
+    const Instruction::CastOps extendType           = std::get<2>(args);
+    const ConversionType       conversionType       = std::get<3>(args);
+    uint32_t&                  currentVertexElement = std::get<4>(args);
+    uint32_t&                  outputElt            = std::get<5>(args);
+    const ComponentEnable      compMask             = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
+    Value*(&vVertexElements)[4]                     = std::get<8>(args);
 
     // cast types
-    Type *vGatherTy = VectorType::get(mInt32Ty, 8);
-    Type *v32x8Ty = VectorType::get(mInt8Ty, 32);
+    Type* vGatherTy = VectorType::get(mInt32Ty, 8);
+    Type* v32x8Ty   = VectorType::get(mInt8Ty, 32);
 
     // have to do extra work for sign extending
-    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
+    if ((extendType == Instruction::CastOps::SExt) ||
+        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
     {
         // is this PP float?
         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
 
-        Type *v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type *v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
+        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), 2);
 
         // shuffle mask
-        Value *vConstMask = C<uint8_t>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-                                          0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
-        Value *vi128XY_lo = nullptr;
-        Value *vi128XY_hi = nullptr;
+        Value* vConstMask = C<uint8_t>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                        0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+        Value* vi128XY_lo = nullptr;
+        Value* vi128XY_hi = nullptr;
         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
         {
-            // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+            // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for
+            // now..
 
-            Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
-            Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
+            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[0], 0), v32x8Ty);
+            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[0], 1), v32x8Ty);
 
-            Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
-            Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
+            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
+            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
 
             // after pshufb: group components together in each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            vi128XY_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128XY_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_lo = BITCAST(
+                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
+                v128bitTy);
+            vi128XY_hi = BITCAST(
+                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
+                v128bitTy);
 
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
@@ -1521,32 +1637,36 @@
         }
 
         // do the same for zw components
-        Value *vi128ZW_lo = nullptr;
-        Value *vi128ZW_hi = nullptr;
+        Value* vi128ZW_lo = nullptr;
+        Value* vi128ZW_hi = nullptr;
         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
         {
-            Value *vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
-            Value *vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
+            Value* vGatherResult_lo = BITCAST(EXTRACT_16(vGatherResult[1], 0), v32x8Ty);
+            Value* vGatherResult_hi = BITCAST(EXTRACT_16(vGatherResult[1], 1), v32x8Ty);
 
-            Value *vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
-            Value *vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
+            Value* vShufResult_lo = BITCAST(PSHUFB(vGatherResult_lo, vConstMask), vGatherTy);
+            Value* vShufResult_hi = BITCAST(PSHUFB(vGatherResult_hi, vConstMask), vGatherTy);
 
-            vi128ZW_lo = BITCAST(VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128ZW_hi = BITCAST(VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_lo = BITCAST(
+                VSHUFFLE(vShufResult_lo, vShufResult_lo, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
+                v128bitTy);
+            vi128ZW_hi = BITCAST(
+                VSHUFFLE(vShufResult_hi, vShufResult_hi, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})),
+                v128bitTy);
         }
 
         // init denormalize variables if needed
         Instruction::CastOps IntToFpCast;
-        Value *conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            IntToFpCast = Instruction::CastOps::SIToFP;
+            IntToFpCast      = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
             break;
         case CONVERT_SSCALED:
-            IntToFpCast = Instruction::CastOps::SIToFP;
+            IntToFpCast      = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0));
             break;
         case CONVERT_USCALED:
@@ -1559,7 +1679,8 @@
             break;
         }
 
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+        // sign extend all enabled components. If we have a fill vVertexElements, output to current
+        // simdvertex
         for (uint32_t i = 0; i < 4; i++)
         {
             if (isComponentEnabled(compMask, i))
@@ -1569,22 +1690,26 @@
                     // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
                     uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
                     // if x or y, use vi128XY permute result, else use vi128ZW
-                    Value *selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
-                    Value *selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
+                    Value* selectedPermute_lo = (i < 2) ? vi128XY_lo : vi128ZW_lo;
+                    Value* selectedPermute_hi = (i < 2) ? vi128XY_hi : vi128ZW_hi;
 
                     if (bFP)
                     {
                         // extract 128 bit lanes to sign extend each component
-                        Value *temp_lo = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
-                        Value *temp_hi = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+                        Value* temp_lo =
+                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value* temp_hi =
+                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
 
                         vVertexElements[currentVertexElement] = JOIN_16(temp_lo, temp_hi);
                     }
                     else
                     {
                         // extract 128 bit lanes to sign extend each component
-                        Value *temp_lo = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
-                        Value *temp_hi = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
+                        Value* temp_lo =
+                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_lo, C(lane)), v8x16Ty));
+                        Value* temp_hi =
+                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute_hi, C(lane)), v8x16Ty));
 
                         Value* temp = JOIN_16(temp_lo, temp_hi);
 
@@ -1614,37 +1739,40 @@
         }
     }
     // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    else if ((extendType == Instruction::CastOps::ZExt) ||
+             (extendType == Instruction::CastOps::UIToFP))
     {
         // pshufb masks for each component
-        Value *vConstMask[2];
+        Value* vConstMask[2];
 
         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
         {
             // x/z shuffle mask
-            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+            vConstMask[0] = C<char>({
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+            });
         }
 
         if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
         {
             // y/w shuffle mask
-            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
         }
 
         // init denormalize variables if needed
         Instruction::CastOps fpCast;
-        Value* conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
             break;
         case CONVERT_USCALED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0f));
             break;
         case CONVERT_SSCALED:
@@ -1669,17 +1797,22 @@
                     // if x or y, use vi128XY permute result, else use vi128ZW
                     uint32_t selectedGather = (i < 2) ? 0 : 1;
 
-                    // SIMD16 PSHUFB isnt part of AVX-512F, so split into SIMD8 for the sake of KNL, for now..
+                    // SIMD16 PSHUFB isn't part of AVX-512F, so split into SIMD8 for the sake of
+                    // KNL, for now.
 
-                    Value *vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
-                    Value *vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
+                    Value* vGatherResult_lo = EXTRACT_16(vGatherResult[selectedGather], 0);
+                    Value* vGatherResult_hi = EXTRACT_16(vGatherResult[selectedGather], 1);
 
-                    Value *temp_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
-                    Value *temp_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                    Value* temp_lo = BITCAST(
+                        PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask[selectedMask]),
+                        vGatherTy);
+                    Value* temp_hi = BITCAST(
+                        PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask[selectedMask]),
+                        vGatherTy);
 
-                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 
+                    // after pshufb mask for x channel; z uses the same shuffle from the 2nd gather
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
 
                     Value* temp = JOIN_16(temp_lo, temp_hi);
 
@@ -1713,44 +1846,47 @@
     }
 }
 
-void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs& args)
 {
     // Unpack tuple args
-    Value* (&vGatherResult)[2] = std::get<0>(args);
-    Value* pVtxOut = std::get<1>(args);
-    const Instruction::CastOps extendType = std::get<2>(args);
-    const ConversionType conversionType = std::get<3>(args);
-    uint32_t &currentVertexElement = std::get<4>(args);
-    uint32_t &outputElt = std::get<5>(args);
-    const ComponentEnable compMask = std::get<6>(args);
-    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
-    Value* (&vVertexElements)[4] = std::get<8>(args);
+    Value*(&vGatherResult)[2]                       = std::get<0>(args);
+    Value*                     pVtxOut              = std::get<1>(args);
+    const Instruction::CastOps extendType           = std::get<2>(args);
+    const ConversionType       conversionType       = std::get<3>(args);
+    uint32_t&                  currentVertexElement = std::get<4>(args);
+    uint32_t&                  outputElt            = std::get<5>(args);
+    const ComponentEnable      compMask             = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4]            = std::get<7>(args);
+    Value*(&vVertexElements)[4]                     = std::get<8>(args);
 
     // cast types
     Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
+    Type* v32x8Ty   = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
-                                                           // have to do extra work for sign extending
-    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP) ||
-        (extendType == Instruction::CastOps::FPExt))
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) ||
+        (extendType == Instruction::CastOps::SIToFP) || (extendType == Instruction::CastOps::FPExt))
     {
         // is this PP float?
         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
 
-        Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
+        Type* v8x16Ty   = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128),
+                                          mVWidth / 4); // vwidth is units of 32 bits
 
-                                                                                                     // shuffle mask
-        Value* vConstMask = C<char>({ 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
-            0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 });
-        Value* vi128XY = nullptr;
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)) {
-            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+        // shuffle mask
+        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+        Value* vi128XY    = nullptr;
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
+        {
+            Value* vShufResult =
+                BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
             // after pshufb: group components together in each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -1758,23 +1894,25 @@
 
         // do the same for zw components
         Value* vi128ZW = nullptr;
-        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
-            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
-            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+        if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
+        {
+            Value* vShufResult =
+                BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
         }
 
         // init denormalize variables if needed
         Instruction::CastOps IntToFpCast;
-        Value* conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            IntToFpCast = Instruction::CastOps::SIToFP;
+            IntToFpCast      = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 32767.0));
             break;
         case CONVERT_SSCALED:
-            IntToFpCast = Instruction::CastOps::SIToFP;
+            IntToFpCast      = Instruction::CastOps::SIToFP;
             conversionFactor = VIMMED1((float)(1.0));
             break;
         case CONVERT_USCALED:
@@ -1787,7 +1925,8 @@
             break;
         }
 
-        // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
+        // sign extend all enabled components. If we have a full vVertexElements, output to current
+        // simdvertex
         for (uint32_t i = 0; i < 4; i++)
         {
             if (isComponentEnabled(compMask, i))
@@ -1799,17 +1938,26 @@
                     // if x or y, use vi128XY permute result, else use vi128ZW
                     Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
 
-                    if (bFP) {
+                    if (bFP)
+                    {
                         // extract 128 bit lanes to sign extend each component
-                        vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+                        vVertexElements[currentVertexElement] =
+                            CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
                     }
-                    else {
+                    else
+                    {
                         // extract 128 bit lanes to sign extend each component
-                        vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+                        vVertexElements[currentVertexElement] =
+                            PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
 
                         // denormalize if needed
-                        if (conversionType != CONVERT_NONE) {
-                            vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                        if (conversionType != CONVERT_NONE)
+                        {
+                            vVertexElements[currentVertexElement] =
+                                FMUL(CAST(IntToFpCast,
+                                          vVertexElements[currentVertexElement],
+                                          mSimdFP32Ty),
+                                     conversionFactor);
                         }
                     }
                     currentVertexElement++;
@@ -1829,34 +1977,39 @@
         }
     }
     // else zero extend
-    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    else if ((extendType == Instruction::CastOps::ZExt) ||
+             (extendType == Instruction::CastOps::UIToFP))
     {
         // pshufb masks for each component
         Value* vConstMask[2];
-        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)) {
+        if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2))
+        {
             // x/z shuffle mask
-            vConstMask[0] = C<char>({ 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
-                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+            vConstMask[0] = C<char>({
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+            });
         }
 
-        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)) {
+        if (isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3))
+        {
             // y/w shuffle mask
-            vConstMask[1] = C<char>({ 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
-                2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1 });
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
         }
 
         // init denormalize variables if needed
         Instruction::CastOps fpCast;
-        Value* conversionFactor;
+        Value*               conversionFactor;
 
         switch (conversionType)
         {
         case CONVERT_NORMALIZED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0 / 65535.0));
             break;
         case CONVERT_USCALED:
-            fpCast = Instruction::CastOps::UIToFP;
+            fpCast           = Instruction::CastOps::UIToFP;
             conversionFactor = VIMMED1((float)(1.0f));
             break;
         case CONVERT_SSCALED:
@@ -1881,15 +2034,20 @@
                     // if x or y, use vi128XY permute result, else use vi128ZW
                     uint32_t selectedGather = (i < 2) ? 0 : 1;
 
-                    vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
-                    // after pshufb mask for x channel; z uses the same shuffle from the second gather
-                    // 256i - 0    1    2    3    4    5    6    7
-                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 
+                    vVertexElements[currentVertexElement] =
+                        BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty),
+                                       vConstMask[selectedMask]),
+                                vGatherTy);
+                    // after pshufb mask for x channel; z uses the same shuffle from the 2nd gather
+                    // 256i - 0    1    2    3    4    5    6    7
+                    //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
 
                     // denormalize if needed
                     if (conversionType != CONVERT_NONE)
                     {
-                        vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                        vVertexElements[currentVertexElement] =
+                            FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty),
+                                 conversionFactor);
                     }
                     currentVertexElement++;
                 }
@@ -1919,7 +2077,10 @@
 /// @param outputElt - simdvertex offset in VIN to write to
 /// @param numEltsToStore - number of simdvertex rows to write out
 /// @param vVertexElements - LLVM Value*[] simdvertex to write out
-void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+void FetchJit::StoreVertexElements(Value*         pVtxOut,
+                                   const uint32_t outputElt,
+                                   const uint32_t numEltsToStore,
+                                   Value* (&vVertexElements)[4])
 {
     SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
 
@@ -1929,14 +2090,14 @@
         if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
         {
 #if FETCH_DUMP_VERTEX
-            PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
+            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
 #endif
             vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
         }
 #if FETCH_DUMP_VERTEX
         else
         {
-            PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
+            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
         }
 #endif
         // outputElt * 4 = offsetting by the size of a simdvertex
@@ -1947,10 +2108,10 @@
 }
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Generates a constant vector of values based on the 
+/// @brief Generates a constant vector of values based on the
 /// ComponentControl value
 /// @param ctrl - ComponentControl value
-Value *FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
 {
     switch (ctrl)
     {
@@ -1966,21 +2127,23 @@
     {
         if (mVWidth == 16)
         {
-            Type* pSimd8FPTy = VectorType::get(mFP32Ty, 8);
-            Value *pIdLo = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), pSimd8FPTy);
-            Value *pIdHi = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID2 })), pSimd8FPTy);
+            Type*  pSimd8FPTy = VectorType::get(mFP32Ty, 8);
+            Value* pIdLo =
+                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), pSimd8FPTy);
+            Value* pIdHi =
+                BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID2})), pSimd8FPTy);
             return JOIN_16(pIdLo, pIdHi);
         }
         else
         {
-            return BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID })), mSimdFP32Ty);
+            return BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_VertexID})), mSimdFP32Ty);
         }
     }
     case StoreInstanceId:
-        {
-            Value *pId = BITCAST(LOAD(GEP(mpFetchInfo, { 0, SWR_FETCH_CONTEXT_CurInstance })), mFP32Ty);
-            return VBROADCAST(pId);
-        }
+    {
+        Value* pId = BITCAST(LOAD(GEP(mpFetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance})), mFP32Ty);
+        return VBROADCAST(pId);
+    }
 
 
     case StoreSrc:
@@ -1999,15 +2162,20 @@
     switch (component)
     {
         // X
-    case 0: return (enableMask & ComponentEnable::X);
+    case 0:
+        return (enableMask & ComponentEnable::X);
         // Y
-    case 1: return (enableMask & ComponentEnable::Y);
+    case 1:
+        return (enableMask & ComponentEnable::Y);
         // Z
-    case 2: return (enableMask & ComponentEnable::Z);
+    case 2:
+        return (enableMask & ComponentEnable::Z);
         // W
-    case 3: return (enableMask & ComponentEnable::W);
+    case 3:
+        return (enableMask & ComponentEnable::W);
 
-    default: return false;
+    default:
+        return false;
     }
 }
 
@@ -2023,21 +2191,22 @@
 /// @return PFN_FETCH_FUNC - pointer to fetch code
 PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc)
 {
-    const llvm::Function* func = (const llvm::Function*)hFunc;
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-    PFN_FETCH_FUNC pfnFetch;
+    const llvm::Function* func    = (const llvm::Function*)hFunc;
+    JitManager*           pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+    PFN_FETCH_FUNC        pfnFetch;
 
     gFetchCodegenMutex.lock();
     pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    // MCJIT finalizes modules the first time you JIT code from them. Once finalized, you cannot
+    // add new IR to the module
     pJitMgr->mIsModuleFinalized = true;
 
 #if defined(KNOB_SWRC_TRACING)
-    char fName[1024];
-    const char *funcName = func->getName().data();
+    char        fName[1024];
+    const char* funcName = func->getName().data();
     sprintf(fName, "%s.bin", funcName);
-    FILE *fd = fopen(fName, "wb");
-    fwrite((void *)pfnFetch, 1, 2048, fd);
+    FILE* fd = fopen(fName, "wb");
+    fwrite((void*)pfnFetch, 1, 2048, fd);
     fclose(fd);
 #endif
 
@@ -2045,7 +2214,6 @@
     gFetchCodegenMutex.unlock();
 
 
-
     return pfnFetch;
 }
 
@@ -2060,7 +2228,7 @@
     pJitMgr->SetupNewModule();
 
     FetchJit theJit(pJitMgr);
-    HANDLE hFunc = theJit.Create(state);
+    HANDLE   hFunc = theJit.Create(state);
 
     return JitFetchFunc(hJitMgr, hFunc);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
index de0ec4f..abc3091 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file fetch_jit.h
-*
-* @brief Definition of the fetch jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file fetch_jit.h
+ *
+ * @brief Definition of the fetch jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/formats.h"
@@ -41,17 +41,17 @@
     {
         struct
         {
-            uint32_t            AlignedByteOffset : 12;
-            uint32_t            Format : 10;
-            uint32_t            StreamIndex : 6;
-            uint32_t            InstanceEnable : 1;
-            uint32_t            InstanceStrideEnable : 1;
-            uint32_t            ComponentControl0 : 4;
-            uint32_t            ComponentControl1 : 4;
-            uint32_t            ComponentControl2 : 4;
-            uint32_t            ComponentControl3 : 4;
-            uint32_t            ComponentPacking : 4;
-            uint32_t            _reserved : 14;
+            uint32_t AlignedByteOffset : 12;
+            uint32_t Format : 10;
+            uint32_t StreamIndex : 6;
+            uint32_t InstanceEnable : 1;
+            uint32_t InstanceStrideEnable : 1;
+            uint32_t ComponentControl0 : 4;
+            uint32_t ComponentControl1 : 4;
+            uint32_t ComponentControl2 : 4;
+            uint32_t ComponentControl3 : 4;
+            uint32_t ComponentPacking : 4;
+            uint32_t _reserved : 14;
         };
         uint64_t bits;
     };
@@ -95,40 +95,52 @@
 //////////////////////////////////////////////////////////////////////////
 struct FETCH_COMPILE_STATE
 {
-    uint32_t numAttribs{ 0 };
+    uint32_t           numAttribs{0};
     INPUT_ELEMENT_DESC layout[SWR_VTX_NUM_SLOTS];
-    SWR_FORMAT indexType;
-    uint32_t cutIndex{ 0xffffffff };
+    SWR_FORMAT         indexType;
+    uint32_t           cutIndex{0xffffffff};
 
     // Options that effect the JIT'd code
-    bool bDisableIndexOOBCheck;             // If enabled, FetchJit will exclude index OOB check
-    bool bEnableCutIndex{ false };          // Compares indices with the cut index and returns a cut mask
-    bool bVertexIDOffsetEnable{ false };    // Offset vertexID by StartVertex for non-indexed draws or BaseVertex for indexed draws
-    bool bPartialVertexBuffer{ false };     // for indexed draws, map illegal indices to a known resident vertex
+    bool bDisableIndexOOBCheck;        // If enabled, FetchJit will exclude index OOB check
+    bool bEnableCutIndex{false};       // Compares indices with the cut index and returns a cut mask
+    bool bVertexIDOffsetEnable{false}; // Offset vertexID by StartVertex for non-indexed draws or
+                                       // BaseVertex for indexed draws
+    bool bPartialVertexBuffer{
+        false}; // for indexed draws, map illegal indices to a known resident vertex
 
-    bool bForceSequentialAccessEnable{ false };
-    bool bInstanceIDOffsetEnable{ false };
+    bool bForceSequentialAccessEnable{false};
+    bool bInstanceIDOffsetEnable{false};
 
-    FETCH_COMPILE_STATE(bool diableIndexOOBCheck = false):
-        bDisableIndexOOBCheck(diableIndexOOBCheck){ };
+    FETCH_COMPILE_STATE(bool diableIndexOOBCheck = false) :
+        bDisableIndexOOBCheck(diableIndexOOBCheck){};
 
-    bool operator==(const FETCH_COMPILE_STATE &other) const
+    bool operator==(const FETCH_COMPILE_STATE& other) const
     {
-        if (numAttribs != other.numAttribs) return false;
-        if (indexType != other.indexType) return false;
-        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
-        if (bEnableCutIndex != other.bEnableCutIndex) return false;
-        if (cutIndex != other.cutIndex) return false;
-        if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable) return false;
-        if (bPartialVertexBuffer != other.bPartialVertexBuffer) return false;
-        if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable) return false;
-        if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable) return false;
+        if (numAttribs != other.numAttribs)
+            return false;
+        if (indexType != other.indexType)
+            return false;
+        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck)
+            return false;
+        if (bEnableCutIndex != other.bEnableCutIndex)
+            return false;
+        if (cutIndex != other.cutIndex)
+            return false;
+        if (bVertexIDOffsetEnable != other.bVertexIDOffsetEnable)
+            return false;
+        if (bPartialVertexBuffer != other.bPartialVertexBuffer)
+            return false;
+        if (bForceSequentialAccessEnable != other.bForceSequentialAccessEnable)
+            return false;
+        if (bInstanceIDOffsetEnable != other.bInstanceIDOffsetEnable)
+            return false;
 
         for (uint32_t i = 0; i < numAttribs; ++i)
         {
             if ((layout[i].bits != other.layout[i].bits) ||
-               (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) &&
-                (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState))){
+                (((layout[i].InstanceEnable == 1) || (layout[i].InstanceStrideEnable == 1)) &&
+                 (layout[i].InstanceAdvancementState != other.layout[i].InstanceAdvancementState)))
+            {
                 return false;
             }
         }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
index baf3ab5..c34959d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/lower_x86.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file lower_x86.cpp
-*
-* @brief llvm pass to lower meta code to x86
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file lower_x86.cpp
+ *
+ * @brief llvm pass to lower meta code to x86
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 
 #include "jit_pch.hpp"
 #include "passes.h"
@@ -34,12 +34,11 @@
 
 #include <unordered_map>
 
-
 namespace llvm
 {
     // foward declare the initializer
     void initializeLowerX86Pass(PassRegistry&);
-}
+} // namespace llvm
 
 namespace SwrJit
 {
@@ -47,15 +46,15 @@
 
     enum TargetArch
     {
-        AVX = 0,
-        AVX2 = 1,
+        AVX    = 0,
+        AVX2   = 1,
         AVX512 = 2
     };
 
     enum TargetWidth
     {
-        W256 = 0,
-        W512 = 1,
+        W256       = 0,
+        W512       = 1,
         NUM_WIDTHS = 2
     };
 
@@ -66,94 +65,144 @@
     struct X86Intrinsic
     {
         Intrinsic::ID intrin[NUM_WIDTHS];
-        EmuFunc emuFunc;
+        EmuFunc       emuFunc;
     };
 
-    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the previous behavior of
-    // mapping directly to avx/avx2 intrinsics.
+    // Map of intrinsics that haven't been moved to the new mechanism yet. If used, these get the
+    // previous behavior of mapping directly to avx/avx2 intrinsics.
     static std::map<std::string, Intrinsic::ID> intrinsicMap = {
-        {"meta.intrinsic.BEXTR_32",        Intrinsic::x86_bmi_bextr_32},
-        {"meta.intrinsic.VPSHUFB",         Intrinsic::x86_avx2_pshuf_b},
-        {"meta.intrinsic.VCVTPS2PH",       Intrinsic::x86_vcvtps2ph_256},
-        {"meta.intrinsic.VPTESTC",         Intrinsic::x86_avx_ptestc_256},
-        {"meta.intrinsic.VPTESTZ",         Intrinsic::x86_avx_ptestz_256},
-        {"meta.intrinsic.VFMADDPS",        Intrinsic::x86_fma_vfmadd_ps_256},
-        {"meta.intrinsic.VPHADDD",         Intrinsic::x86_avx2_phadd_d},
-        {"meta.intrinsic.PDEP32",          Intrinsic::x86_bmi_pdep_32},
-        {"meta.intrinsic.RDTSC",           Intrinsic::x86_rdtsc},
+        {"meta.intrinsic.BEXTR_32", Intrinsic::x86_bmi_bextr_32},
+        {"meta.intrinsic.VPSHUFB", Intrinsic::x86_avx2_pshuf_b},
+        {"meta.intrinsic.VCVTPS2PH", Intrinsic::x86_vcvtps2ph_256},
+        {"meta.intrinsic.VPTESTC", Intrinsic::x86_avx_ptestc_256},
+        {"meta.intrinsic.VPTESTZ", Intrinsic::x86_avx_ptestz_256},
+        {"meta.intrinsic.VPHADDD", Intrinsic::x86_avx2_phadd_d},
+        {"meta.intrinsic.PDEP32", Intrinsic::x86_bmi_pdep_32},
+        {"meta.intrinsic.RDTSC", Intrinsic::x86_rdtsc},
     };
 
     // Forward decls
     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
-    Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VPERM_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
+    Instruction*
+    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst);
 
-    Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin);
-    
+    Instruction* DOUBLE_EMU(LowerX86*     pThis,
+                            TargetArch    arch,
+                            TargetWidth   width,
+                            CallInst*     pCallInst,
+                            Intrinsic::ID intrin);
+
     static Intrinsic::ID DOUBLE = (Intrinsic::ID)-1;
 
     static std::map<std::string, X86Intrinsic> intrinsicMap2[] = {
         //                              256 wide                                    512 wide
-    {   // AVX
-        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx_rcp_ps_256,              DOUBLE},                                        NO_EMU}},
-        {"meta.intrinsic.VPERMPS",     {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
-        {"meta.intrinsic.VPERMD",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VPERM_EMU}},
-        {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,          Intrinsic::not_intrinsic},                      NO_EMU}},
-        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256,               Intrinsic::not_intrinsic},                      NO_EMU}},
-        {"meta.intrinsic.VROUND",      {{Intrinsic::x86_avx_round_ps_256,            DOUBLE},                                        NO_EMU}},
-        {"meta.intrinsic.VHSUBPS",     {{Intrinsic::x86_avx_hsub_ps_256,             DOUBLE},                                        NO_EMU}},
-    },
-    {   // AVX2
-        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx_rcp_ps_256,              DOUBLE},                                        NO_EMU}},
-        {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx2_permps,                 Intrinsic::not_intrinsic},                      VPERM_EMU}},
-        {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx2_permd,                  Intrinsic::not_intrinsic},                      VPERM_EMU}},
-        {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx_cvt_pd2_ps_256,          Intrinsic::not_intrinsic},                      NO_EMU}},
-        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_vcvtph2ps_256,               Intrinsic::not_intrinsic},                      NO_EMU}},
-        {"meta.intrinsic.VROUND",      {{Intrinsic::x86_avx_round_ps_256,            DOUBLE},                                        NO_EMU}},
-        {"meta.intrinsic.VHSUBPS",     {{Intrinsic::x86_avx_hsub_ps_256,             DOUBLE},                                        NO_EMU}},
-    },
-    {   // AVX512
-        {"meta.intrinsic.VRCPPS",      {{Intrinsic::x86_avx512_rcp14_ps_256,         Intrinsic::x86_avx512_rcp14_ps_512},            NO_EMU}},
-        {"meta.intrinsic.VPERMPS",     {{Intrinsic::x86_avx512_mask_permvar_sf_256,  Intrinsic::x86_avx512_mask_permvar_sf_512},     NO_EMU}},
-        {"meta.intrinsic.VPERMD",      {{Intrinsic::x86_avx512_mask_permvar_si_256,  Intrinsic::x86_avx512_mask_permvar_si_512},     NO_EMU}},
-        {"meta.intrinsic.VGATHERPD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERPS",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VGATHERDD",   {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VGATHER_EMU}},
-        {"meta.intrinsic.VCVTPD2PS",   {{Intrinsic::x86_avx512_mask_cvtpd2ps_256,    Intrinsic::x86_avx512_mask_cvtpd2ps_512 },      NO_EMU}},
-        {"meta.intrinsic.VCVTPH2PS",   {{Intrinsic::x86_avx512_mask_vcvtph2ps_256,   Intrinsic::x86_avx512_mask_vcvtph2ps_512 },     NO_EMU}},
-        {"meta.intrinsic.VROUND",      {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VROUND_EMU}},
-        {"meta.intrinsic.VHSUBPS",     {{Intrinsic::not_intrinsic,                   Intrinsic::not_intrinsic},                      VHSUB_EMU}},
-    }
-    };
+        {
+            // AVX
+            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VPERMPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VPERMD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VGATHERPD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VCVTPD2PS",
+             {{Intrinsic::x86_avx_cvt_pd2_ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+            {"meta.intrinsic.VCVTPH2PS",
+             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+        },
+        {
+            // AVX2
+            {"meta.intrinsic.VRCPPS", {{Intrinsic::x86_avx_rcp_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VPERMPS",
+             {{Intrinsic::x86_avx2_permps, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VPERMD",
+             {{Intrinsic::x86_avx2_permd, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VGATHERPD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VCVTPD2PS", {{Intrinsic::x86_avx_cvt_pd2_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VCVTPH2PS",
+             {{Intrinsic::x86_vcvtph2ps_256, Intrinsic::not_intrinsic}, NO_EMU}},
+            {"meta.intrinsic.VROUND", {{Intrinsic::x86_avx_round_ps_256, DOUBLE}, NO_EMU}},
+            {"meta.intrinsic.VHSUBPS", {{Intrinsic::x86_avx_hsub_ps_256, DOUBLE}, NO_EMU}},
+        },
+        {
+            // AVX512
+            {"meta.intrinsic.VRCPPS",
+             {{Intrinsic::x86_avx512_rcp14_ps_256, Intrinsic::x86_avx512_rcp14_ps_512}, NO_EMU}},
+#if LLVM_VERSION_MAJOR < 7
+            {"meta.intrinsic.VPERMPS",
+             {{Intrinsic::x86_avx512_mask_permvar_sf_256,
+               Intrinsic::x86_avx512_mask_permvar_sf_512},
+              NO_EMU}},
+            {"meta.intrinsic.VPERMD",
+             {{Intrinsic::x86_avx512_mask_permvar_si_256,
+               Intrinsic::x86_avx512_mask_permvar_si_512},
+              NO_EMU}},
+#else
+            {"meta.intrinsic.VPERMPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+            {"meta.intrinsic.VPERMD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VPERM_EMU}},
+#endif
+            {"meta.intrinsic.VGATHERPD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+            {"meta.intrinsic.VGATHERDD",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VGATHER_EMU}},
+#if LLVM_VERSION_MAJOR < 7
+            {"meta.intrinsic.VCVTPD2PS",
+             {{Intrinsic::x86_avx512_mask_cvtpd2ps_256, Intrinsic::x86_avx512_mask_cvtpd2ps_512},
+              NO_EMU}},
+#else
+            {"meta.intrinsic.VCVTPD2PS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VCONVERT_EMU}},
+#endif
+            {"meta.intrinsic.VCVTPH2PS",
+             {{Intrinsic::x86_avx512_mask_vcvtph2ps_256, Intrinsic::x86_avx512_mask_vcvtph2ps_512},
+              NO_EMU}},
+            {"meta.intrinsic.VROUND",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VROUND_EMU}},
+            {"meta.intrinsic.VHSUBPS",
+             {{Intrinsic::not_intrinsic, Intrinsic::not_intrinsic}, VHSUB_EMU}},
+        }};
 
     struct LowerX86 : public FunctionPass
     {
-        LowerX86(JitManager* pJitMgr = nullptr, Builder* b = nullptr)
-            : FunctionPass(ID), mpJitMgr(pJitMgr), B(b)
+        LowerX86(Builder* b = nullptr) : FunctionPass(ID), B(b)
         {
             initializeLowerX86Pass(*PassRegistry::getPassRegistry());
 
             // Determine target arch
-            if (mpJitMgr->mArch.AVX512F())
+            if (JM()->mArch.AVX512F())
             {
                 mTarget = AVX512;
             }
-            else if (mpJitMgr->mArch.AVX2())
+            else if (JM()->mArch.AVX2())
             {
                 mTarget = AVX2;
             }
-            else if (mpJitMgr->mArch.AVX())
+            else if (JM()->mArch.AVX())
             {
                 mTarget = AVX;
-
             }
             else
             {
@@ -166,10 +215,20 @@
         // across all intrinsics, and will have to be rethought. Probably need something
         // similar to llvm's getDeclaration() utility to map a set of inputs to a specific typed
         // intrinsic.
-        void GetRequestedWidthAndType(CallInst* pCallInst, TargetWidth* pWidth, Type** pTy)
+        void GetRequestedWidthAndType(CallInst*       pCallInst,
+                                      const StringRef intrinName,
+                                      TargetWidth*    pWidth,
+                                      Type**          pTy)
         {
-            uint32_t vecWidth;
             Type* pVecTy = pCallInst->getType();
+
+            // Check for intrinsic-specific types
+            // VCVTPD2PS type comes from src, not dst
+            if (intrinName.equals("meta.intrinsic.VCVTPD2PS"))
+            {
+                pVecTy = pCallInst->getOperand(0)->getType();
+            }
+
             if (!pVecTy->isVectorTy())
             {
                 for (auto& op : pCallInst->arg_operands())
@@ -186,9 +245,14 @@
             uint32_t width = cast<VectorType>(pVecTy)->getBitWidth();
             switch (width)
             {
-            case 256: *pWidth = W256; break;
-            case 512: *pWidth = W512; break;
-            default: SWR_ASSERT(false, "Unhandled vector width %d", width);
+            case 256:
+                *pWidth = W256;
+                break;
+            case 512:
+                *pWidth = W512;
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width %d", width);
                 *pWidth = W256;
             }
 
@@ -200,8 +264,14 @@
             uint32_t numElem = 0;
             switch (width)
             {
-            case W256: numElem = 8; break;
-            case W512: numElem = 16; break;
+            case W256:
+                numElem = 8;
+                break;
+            case W512:
+                numElem = 16;
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
             }
 
             return ConstantVector::getNullValue(VectorType::get(pTy, numElem));
@@ -212,8 +282,14 @@
             Value* mask;
             switch (width)
             {
-            case W256: mask = B->C((uint8_t)-1); break;
-            case W512: mask = B->C((uint16_t)-1); break;
+            case W256:
+                mask = B->C((uint8_t)-1);
+                break;
+            case W512:
+                mask = B->C((uint16_t)-1);
+                break;
+            default:
+                SWR_ASSERT(false, "Unhandled vector width type %d\n", width);
             }
             return mask;
         }
@@ -227,11 +303,11 @@
 
         Instruction* ProcessIntrinsicAdvanced(CallInst* pCallInst)
         {
-            Function* pFunc = pCallInst->getCalledFunction();
-            auto& intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
+            Function*   pFunc     = pCallInst->getCalledFunction();
+            auto&       intrinsic = intrinsicMap2[mTarget][pFunc->getName()];
             TargetWidth vecWidth;
-            Type* pElemTy;
-            GetRequestedWidthAndType(pCallInst, &vecWidth, &pElemTy);
+            Type*       pElemTy;
+            GetRequestedWidthAndType(pCallInst, pFunc->getName(), &vecWidth, &pElemTy);
 
             // Check if there is a native intrinsic for this instruction
             Intrinsic::ID id = intrinsic.intrin[vecWidth];
@@ -240,7 +316,8 @@
                 // Double pump the next smaller SIMD intrinsic
                 SWR_ASSERT(vecWidth != 0, "Cannot double pump smallest SIMD width.");
                 Intrinsic::ID id2 = intrinsic.intrin[vecWidth - 1];
-                SWR_ASSERT(id2 != Intrinsic::not_intrinsic, "Cannot find intrinsic to double pump.");
+                SWR_ASSERT(id2 != Intrinsic::not_intrinsic,
+                           "Cannot find intrinsic to double pump.");
                 return DOUBLE_EMU(this, mTarget, vecWidth, pCallInst, id2);
             }
             else if (id != Intrinsic::not_intrinsic)
@@ -252,12 +329,23 @@
                     args.push_back(arg.get());
                 }
 
-                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and full mask for now
-                // Assuming the intrinsics are consistent and place the src operand and mask last in the argument list.
+                // If AVX512, all instructions add a src operand and mask. We'll pass in 0 src and
+                // full mask for now, assuming the intrinsics are consistent and place the src
+                // operand and mask last in the argument list.
                 if (mTarget == AVX512)
                 {
-                    args.push_back(GetZeroVec(vecWidth, pElemTy));
-                    args.push_back(GetMask(vecWidth));
+                    if (pFunc->getName().equals("meta.intrinsic.VCVTPD2PS"))
+                    {
+                        args.push_back(GetZeroVec(W256, pCallInst->getType()->getScalarType()));
+                        args.push_back(GetMask(W256));
+                        // for AVX512 VCVTPD2PS, we also have to add rounding mode
+                        args.push_back(B->C(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+                    }
+                    else
+                    {
+                        args.push_back(GetZeroVec(vecWidth, pElemTy));
+                        args.push_back(GetMask(vecWidth));
+                    }
                 }
 
                 return B->CALLA(pIntrin, args);
@@ -275,17 +363,20 @@
         Instruction* ProcessIntrinsic(CallInst* pCallInst)
         {
             Function* pFunc = pCallInst->getCalledFunction();
-            
+
             // Forward to the advanced support if found
             if (intrinsicMap2[mTarget].find(pFunc->getName()) != intrinsicMap2[mTarget].end())
             {
                 return ProcessIntrinsicAdvanced(pCallInst);
             }
 
-            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(), "Unimplemented intrinsic %s.", pFunc->getName());
+            SWR_ASSERT(intrinsicMap.find(pFunc->getName()) != intrinsicMap.end(),
+                       "Unimplemented intrinsic %s.",
+                       pFunc->getName());
 
             Intrinsic::ID x86Intrinsic = intrinsicMap[pFunc->getName()];
-            Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
+            Function*     pX86IntrinFunc =
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, x86Intrinsic);
 
             SmallVector<Value*, 8> args;
             for (auto& arg : pCallInst->arg_operands())
@@ -320,7 +411,6 @@
                                 pCallInst->replaceAllUsesWith(pReplace);
                             }
                         }
-
                     }
                 }
             }
@@ -335,26 +425,20 @@
             return true;
         }
 
-        virtual void getAnalysisUsage(AnalysisUsage& AU) const
-        {
-        }
+        virtual void getAnalysisUsage(AnalysisUsage& AU) const {}
 
-        JitManager* JM() { return mpJitMgr; }
+        JitManager* JM() { return B->JM(); }
 
-        JitManager* mpJitMgr;
         Builder* B;
 
         TargetArch mTarget;
 
-        static char ID;  ///< Needed by LLVM to generate ID for FunctionPass.
+        static char ID; ///< Needed by LLVM to generate ID for FunctionPass.
     };
 
-    char LowerX86::ID = 0;   // LLVM uses address of ID as the actual ID.
+    char LowerX86::ID = 0; // LLVM uses address of ID as the actual ID.
 
-    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b)
-    {
-        return new LowerX86(pJitMgr, b);
-    }
+    FunctionPass* createLowerX86Pass(Builder* b) { return new LowerX86(b); }
 
     Instruction* NO_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
@@ -367,9 +451,9 @@
         // Only need vperm emulation for AVX
         SWR_ASSERT(arch == AVX);
 
-        Builder* B = pThis->B;
-        auto v32A = pCallInst->getArgOperand(0);
-        auto vi32Index = pCallInst->getArgOperand(1);
+        Builder* B         = pThis->B;
+        auto     v32A      = pCallInst->getArgOperand(0);
+        auto     vi32Index = pCallInst->getArgOperand(1);
 
         Value* v32Result;
         if (isa<Constant>(vi32Index))
@@ -383,49 +467,50 @@
             for (uint32_t l = 0; l < v32A->getType()->getVectorNumElements(); ++l)
             {
                 auto i32Index = B->VEXTRACT(vi32Index, B->C(l));
-                auto val = B->VEXTRACT(v32A, i32Index);
-                v32Result = B->VINSERT(v32Result, val, B->C(l));
+                auto val      = B->VEXTRACT(v32A, i32Index);
+                v32Result     = B->VINSERT(v32Result, val, B->C(l));
             }
         }
         return cast<Instruction>(v32Result);
     }
 
-    Instruction* VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    Instruction*
+    VGATHER_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
-        Builder* B = pThis->B;
-        auto vSrc = pCallInst->getArgOperand(0);
-        auto pBase = pCallInst->getArgOperand(1);
-        auto vi32Indices = pCallInst->getArgOperand(2);
-        auto vi1Mask = pCallInst->getArgOperand(3);
-        auto i8Scale = pCallInst->getArgOperand(4);
+        Builder* B           = pThis->B;
+        auto     vSrc        = pCallInst->getArgOperand(0);
+        auto     pBase       = pCallInst->getArgOperand(1);
+        auto     vi32Indices = pCallInst->getArgOperand(2);
+        auto     vi1Mask     = pCallInst->getArgOperand(3);
+        auto     i8Scale     = pCallInst->getArgOperand(4);
 
-        pBase = B->INT_TO_PTR(pBase, PointerType::get(B->mInt8Ty, 0));
-        uint32_t numElem = vSrc->getType()->getVectorNumElements();
-        auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
-        auto srcTy = vSrc->getType()->getVectorElementType();
-        Value* v32Gather;
+        pBase             = B->POINTER_CAST(pBase, PointerType::get(B->mInt8Ty, 0));
+        uint32_t numElem  = vSrc->getType()->getVectorNumElements();
+        auto     i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
+        auto     srcTy    = vSrc->getType()->getVectorElementType();
+        Value*   v32Gather;
         if (arch == AVX)
         {
             // Full emulation for AVX
             // Store source on stack to provide a valid address to load from inactive lanes
             auto pStack = B->STACKSAVE();
-            auto pTmp = B->ALLOCA(vSrc->getType());
+            auto pTmp   = B->ALLOCA(vSrc->getType());
             B->STORE(vSrc, pTmp);
 
-            v32Gather = UndefValue::get(vSrc->getType());
-            auto vi32Scale = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
+            v32Gather        = UndefValue::get(vSrc->getType());
+            auto vi32Scale   = ConstantVector::getSplat(numElem, cast<ConstantInt>(i32Scale));
             auto vi32Offsets = B->MUL(vi32Indices, vi32Scale);
 
             for (uint32_t i = 0; i < numElem; ++i)
             {
-                auto i32Offset = B->VEXTRACT(vi32Offsets, B->C(i));
-                auto pLoadAddress = B->GEP(pBase, i32Offset);
-                pLoadAddress = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
-                auto pMaskedLoadAddress = B->GEP(pTmp, { 0, i });
-                auto i1Mask = B->VEXTRACT(vi1Mask, B->C(i));
-                auto pValidAddress = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
-                auto val = B->LOAD(pValidAddress);
-                v32Gather = B->VINSERT(v32Gather, val, B->C(i));
+                auto i32Offset          = B->VEXTRACT(vi32Offsets, B->C(i));
+                auto pLoadAddress       = B->GEP(pBase, i32Offset);
+                pLoadAddress            = B->BITCAST(pLoadAddress, PointerType::get(srcTy, 0));
+                auto pMaskedLoadAddress = B->GEP(pTmp, {0, i});
+                auto i1Mask             = B->VEXTRACT(vi1Mask, B->C(i));
+                auto pValidAddress      = B->SELECT(i1Mask, pLoadAddress, pMaskedLoadAddress);
+                auto val                = B->LOAD(pValidAddress);
+                v32Gather               = B->VINSERT(v32Gather, val, B->C(i));
             }
 
             B->STACKRESTORE(pStack);
@@ -435,15 +520,18 @@
             Function* pX86IntrinFunc;
             if (srcTy == B->mFP32Ty)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_ps_256);
-            } 
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_ps_256);
+            }
             else if (srcTy == B->mInt32Ty)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_d_256);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_d_256);
             }
             else if (srcTy == B->mDoubleTy)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx2_gather_d_q_256);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx2_gather_d_q_256);
             }
             else
             {
@@ -453,37 +541,55 @@
             if (width == W256)
             {
                 auto v32Mask = B->BITCAST(pThis->VectorMask(vi1Mask), vSrc->getType());
-                v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, v32Mask, i8Scale });
+                v32Gather = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, v32Mask, i8Scale});
             }
             else if (width == W512)
             {
                 // Double pump 4-wide for 64bit elements
                 if (vSrc->getType()->getVectorElementType() == B->mDoubleTy)
                 {
-                    auto v64Mask = B->S_EXT(pThis->VectorMask(vi1Mask), B->mInt64Ty);
+                    auto v64Mask = pThis->VectorMask(vi1Mask);
+                    v64Mask      = B->S_EXT(
+                        v64Mask,
+                        VectorType::get(B->mInt64Ty, v64Mask->getType()->getVectorNumElements()));
                     v64Mask = B->BITCAST(v64Mask, vSrc->getType());
 
-                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({ 0, 1, 2, 3 }));
-                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({ 4, 5, 6, 7 }));
+                    Value* src0 = B->VSHUFFLE(vSrc, vSrc, B->C({0, 1, 2, 3}));
+                    Value* src1 = B->VSHUFFLE(vSrc, vSrc, B->C({4, 5, 6, 7}));
 
-                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 0, 1, 2, 3 }));
-                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({ 4, 5, 6, 7 }));
+                    Value* indices0 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({0, 1, 2, 3}));
+                    Value* indices1 = B->VSHUFFLE(vi32Indices, vi32Indices, B->C({4, 5, 6, 7}));
 
-                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 0, 1, 2, 3 }));
-                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({ 4, 5, 6, 7 }));
+                    Value* mask0 = B->VSHUFFLE(v64Mask, v64Mask, B->C({0, 1, 2, 3}));
+                    Value* mask1 = B->VSHUFFLE(v64Mask, v64Mask, B->C({4, 5, 6, 7}));
 
-                    Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
-                    Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+                    src0 = B->BITCAST(
+                        src0,
+                        VectorType::get(B->mInt64Ty, src0->getType()->getVectorNumElements()));
+                    mask0 = B->BITCAST(
+                        mask0,
+                        VectorType::get(B->mInt64Ty, mask0->getType()->getVectorNumElements()));
+                    Value* gather0 =
+                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
+                    src1 = B->BITCAST(
+                        src1,
+                        VectorType::get(B->mInt64Ty, src1->getType()->getVectorNumElements()));
+                    mask1 = B->BITCAST(
+                        mask1,
+                        VectorType::get(B->mInt64Ty, mask1->getType()->getVectorNumElements()));
+                    Value* gather1 =
+                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
 
-                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({ 0, 1, 2, 3, 4, 5, 6, 7 }));
+                    v32Gather = B->VSHUFFLE(gather0, gather1, B->C({0, 1, 2, 3, 4, 5, 6, 7}));
+                    v32Gather = B->BITCAST(v32Gather, vSrc->getType());
                 }
                 else
                 {
                     // Double pump 8-wide for 32bit elements
                     auto v32Mask = pThis->VectorMask(vi1Mask);
-                    v32Mask = B->BITCAST(v32Mask, vSrc->getType());
-                    Value* src0 = B->EXTRACT_16(vSrc, 0);
-                    Value* src1 = B->EXTRACT_16(vSrc, 1);
+                    v32Mask      = B->BITCAST(v32Mask, vSrc->getType());
+                    Value* src0  = B->EXTRACT_16(vSrc, 0);
+                    Value* src1  = B->EXTRACT_16(vSrc, 1);
 
                     Value* indices0 = B->EXTRACT_16(vi32Indices, 0);
                     Value* indices1 = B->EXTRACT_16(vi32Indices, 1);
@@ -491,8 +597,10 @@
                     Value* mask0 = B->EXTRACT_16(v32Mask, 0);
                     Value* mask1 = B->EXTRACT_16(v32Mask, 1);
 
-                    Value* gather0 = B->CALL(pX86IntrinFunc, { src0, pBase, indices0, mask0, i8Scale });
-                    Value* gather1 = B->CALL(pX86IntrinFunc, { src1, pBase, indices1, mask1, i8Scale });
+                    Value* gather0 =
+                        B->CALL(pX86IntrinFunc, {src0, pBase, indices0, mask0, i8Scale});
+                    Value* gather1 =
+                        B->CALL(pX86IntrinFunc, {src1, pBase, indices1, mask1, i8Scale});
 
                     v32Gather = B->JOIN_16(gather0, gather1);
                 }
@@ -500,22 +608,25 @@
         }
         else if (arch == AVX512)
         {
-            Value* iMask;
+            Value*    iMask;
             Function* pX86IntrinFunc;
             if (srcTy == B->mFP32Ty)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dps_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dps_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
             }
             else if (srcTy == B->mInt32Ty)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpi_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt16Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dpi_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt16Ty);
             }
             else if (srcTy == B->mDoubleTy)
             {
-                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx512_gather_dpd_512);
-                iMask = B->BITCAST(vi1Mask, B->mInt8Ty);
+                pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                           Intrinsic::x86_avx512_gather_dpd_512);
+                iMask          = B->BITCAST(vi1Mask, B->mInt8Ty);
             }
             else
             {
@@ -523,21 +634,24 @@
             }
 
             auto i32Scale = B->Z_EXT(i8Scale, B->mInt32Ty);
-            v32Gather = B->CALL(pX86IntrinFunc, { vSrc, pBase, vi32Indices, iMask, i32Scale });
+            v32Gather     = B->CALL(pX86IntrinFunc, {vSrc, pBase, vi32Indices, iMask, i32Scale});
         }
 
         return cast<Instruction>(v32Gather);
     }
 
-    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx instructions
-    Instruction* VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    // No support for vroundps in avx512 (it is available in kncni), so emulate with avx
+    // instructions
+    Instruction*
+    VROUND_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
         SWR_ASSERT(arch == AVX512);
 
-        auto B = pThis->B;
+        auto B       = pThis->B;
         auto vf32Src = pCallInst->getOperand(0);
         auto i8Round = pCallInst->getOperand(1);
-        auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
+        auto pfnFunc =
+            Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_round_ps_256);
 
         if (width == W256)
         {
@@ -561,26 +675,56 @@
         return nullptr;
     }
 
+    Instruction*
+    VCONVERT_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
+    {
+        SWR_ASSERT(arch == AVX512);
+        // Builds the instruction returned for pCallInst, chosen by width (W256 or W512 only).
+        auto B       = pThis->B;
+        auto vf32Src = pCallInst->getOperand(0);
+        // NOTE(review): the W256 path hands the intrinsic declaration to FP_TRUNC, not vf32Src.
+        if (width == W256)
+        {
+            auto vf32SrcRound = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                          Intrinsic::x86_avx_round_ps_256);
+            return cast<Instruction>(B->FP_TRUNC(vf32SrcRound, B->mFP32Ty));
+        }
+        else if (width == W512)
+        {
+            // 512 can use intrinsic (called here with vf32Src as its only argument)
+            auto pfnFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule,
+                                                     Intrinsic::x86_avx512_mask_cvtpd2ps_512);
+            return cast<Instruction>(B->CALL(pfnFunc, vf32Src));
+        }
+        else
+        {
+            SWR_ASSERT(false, "Unimplemented vector width.");
+        }
+        // Only the unimplemented-width branch falls through to here.
+        return nullptr;
+    }
+
     // No support for hsub in AVX512
     Instruction* VHSUB_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst)
     {
         SWR_ASSERT(arch == AVX512);
 
-        auto B = pThis->B;
+        auto B    = pThis->B;
         auto src0 = pCallInst->getOperand(0);
         auto src1 = pCallInst->getOperand(1);
 
         // 256b hsub can just use avx intrinsic
         if (width == W256)
         {
-            auto pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
+            auto pX86IntrinFunc =
+                Intrinsic::getDeclaration(B->JM()->mpCurrentModule, Intrinsic::x86_avx_hsub_ps_256);
             return cast<Instruction>(B->CALL2(pX86IntrinFunc, src0, src1));
         }
         else if (width == W512)
         {
             // 512b hsub can be accomplished with shuf/sub combo
-            auto minuend = B->VSHUFFLE(src0, src1, B->C({ 0, 2, 8, 10, 4, 6, 12, 14 }));
-            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({ 1, 3, 9, 11, 5, 7, 13, 15 }));
+            auto minuend    = B->VSHUFFLE(src0, src1, B->C({0, 2, 8, 10, 4, 6, 12, 14}));
+            auto subtrahend = B->VSHUFFLE(src0, src1, B->C({1, 3, 9, 11, 5, 7, 13, 15}));
             return cast<Instruction>(B->SUB(minuend, subtrahend));
         }
         else
@@ -590,30 +734,57 @@
         }
     }
 
-    // Double pump input using Intrin template arg. This blindly extracts lower and upper 256 from each vector argument and
-    // calls the 256 wide intrinsic, then merges the results to 512 wide
-    Instruction* DOUBLE_EMU(LowerX86* pThis, TargetArch arch, TargetWidth width, CallInst* pCallInst, Intrinsic::ID intrin)
+    // Double pump input using the intrin argument. This blindly extracts lower and upper 256 from
+    // each vector argument and calls the 256 wide intrinsic, then merges the results to 512 wide
+    Instruction* DOUBLE_EMU(LowerX86*     pThis,
+                            TargetArch    arch,
+                            TargetWidth   width,
+                            CallInst*     pCallInst,
+                            Intrinsic::ID intrin)
     {
         auto B = pThis->B;
         SWR_ASSERT(width == W512);
-        Value* result[2];
+        Value*    result[2];
         Function* pX86IntrinFunc = Intrinsic::getDeclaration(B->JM()->mpCurrentModule, intrin);
         for (uint32_t i = 0; i < 2; ++i)
         {
             SmallVector<Value*, 8> args;
             for (auto& arg : pCallInst->arg_operands())
             {
-                args.push_back(arg.get()->getType()->isVectorTy() ? B->EXTRACT_16(arg.get(), i) : arg.get());
+                auto argType = arg.get()->getType();
+                if (argType->isVectorTy())
+                {
+                    uint32_t vecWidth  = argType->getVectorNumElements();
+                    Value*   lanes     = B->CInc<int>(i * vecWidth / 2, vecWidth / 2);
+                    Value*   argToPush = B->VSHUFFLE(
+                        arg.get(), B->VUNDEF(argType->getVectorElementType(), vecWidth), lanes);
+                    args.push_back(argToPush);
+                }
+                else
+                {
+                    args.push_back(arg.get());
+                }
             }
             result[i] = B->CALLA(pX86IntrinFunc, args);
         }
-        return cast<Instruction>(B->JOIN_16(result[0], result[1]));
+        uint32_t vecWidth;
+        if (result[0]->getType()->isVectorTy())
+        {
+            assert(result[1]->getType()->isVectorTy());
+            vecWidth = result[0]->getType()->getVectorNumElements() +
+                       result[1]->getType()->getVectorNumElements();
+        }
+        else
+        {
+            vecWidth = 2;
+        }
+        Value* lanes = B->CInc<int>(0, vecWidth);
+        return cast<Instruction>(B->VSHUFFLE(result[0], result[1], lanes));
     }
 
-}
+} // namespace SwrJit
 
 using namespace SwrJit;
 
 INITIALIZE_PASS_BEGIN(LowerX86, "LowerX86", "LowerX86", false, false)
 INITIALIZE_PASS_END(LowerX86, "LowerX86", "LowerX86", false, false)
-
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
index f7373f0..d3c732a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/functionpasses/passes.h
@@ -1,30 +1,30 @@
 /****************************************************************************
-* Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file passes.h
-*
-* @brief Include file for llvm passes
-*
-******************************************************************************/
+ * Copyright (C) 2014-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file passes.h
+ *
+ * @brief Include file for llvm passes
+ *
+ ******************************************************************************/
 
 #include "JitManager.h"
 #include "builder.h"
@@ -33,5 +33,5 @@
 {
     using namespace llvm;
 
-    FunctionPass* createLowerX86Pass(JitManager* pJitMgr, Builder* b);
-}
+    FunctionPass* createLowerX86Pass(Builder* b);
+} // namespace SwrJit
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
index fb6cf9b..cc986a7 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file jit_api.h
-*
-* @brief Platform independent JIT interface
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file jit_api.h
+ *
+ * @brief Platform independent JIT interface
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 #include "common/os.h"
 #include "core/utils.h"
@@ -48,7 +48,6 @@
 #endif
 
 
-
 struct ShaderInfo;
 
 //////////////////////////////////////////////////////////////////////////
@@ -59,15 +58,15 @@
     SWR_SHADER_TYPE type;
     uint32_t        crc;
 
-    const void* pIR;        ///< Pointer to LLVM IR text.
-    size_t irLength;
+    const void* pIR; ///< Pointer to LLVM IR text.
+    size_t      irLength;
 
     bool enableJitSampler;
 
 };
 
-extern "C"
-{
+
+extern "C" {
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Create JIT context.
@@ -82,17 +81,13 @@
 /// @param hJitContext - Jit Context
 /// @param input  - Input containing LLVM IR and other information
 /// @param output - Output containing information about JIT shader
-ShaderInfo* JITCALL JitCompileShader(
-    HANDLE hJitContext,
-    const JIT_COMPILE_INPUT& input);
+ShaderInfo* JITCALL JitCompileShader(HANDLE hJitContext, const JIT_COMPILE_INPUT& input);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief JIT destroy shader.
 /// @param hJitContext - Jit Context
 /// @param pShaderInfo  - pointer to shader object.
-void JITCALL JitDestroyShader(
-    HANDLE hJitContext,
-    ShaderInfo*& pShaderInfo);
+void JITCALL JitDestroyShader(HANDLE hJitContext, ShaderInfo*& pShaderInfo);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief JIT compiles fetch shader
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
index 216938f..47f717b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_pch.hpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2017-2018 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file jit_pch.hpp
-*
-* @brief Pre-compiled header for jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2017-2018 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file jit_pch.hpp
+ *
+ * @brief Pre-compiled header for jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 
 #pragma once
 
@@ -58,7 +58,7 @@
 
 #include "llvm/IR/LegacyPassManager.h"
 using FunctionPassManager = llvm::legacy::FunctionPassManager;
-using PassManager = llvm::legacy::PassManager;
+using PassManager         = llvm::legacy::PassManager;
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
@@ -69,6 +69,7 @@
 #include "llvm/Transforms/Scalar.h"
 #if LLVM_VERSION_MAJOR >= 7
 #include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
 #endif
 #include "llvm/Support/Host.h"
 #include "llvm/Support/DynamicLibrary.h"
@@ -91,7 +92,6 @@
 
 #include "llvm/Transforms/Utils/Cloning.h"
 
-
 #if defined(_WIN32)
 #include "llvm/ADT/Triple.h"
 #endif
@@ -116,16 +116,18 @@
 #endif
 
 #if LLVM_VERSION_MAJOR >= 5
-static const auto Sync_CrossThread = llvm::SyncScope::System;
-static const auto Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex;
-static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b)
+static const auto                Sync_CrossThread     = llvm::SyncScope::System;
+static const auto                Attrib_FunctionIndex = llvm::AttributeList::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext&       ctx,
+                                                  const llvm::AttrBuilder& b)
 {
     return llvm::AttributeSet::get(ctx, b);
 }
 #else
-static const auto Sync_CrossThread = llvm::SynchronizationScope::CrossThread;
-static const auto Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex;
-static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext& ctx, const llvm::AttrBuilder &b)
+static const auto                Sync_CrossThread     = llvm::SynchronizationScope::CrossThread;
+static const auto                Attrib_FunctionIndex = llvm::AttributeSet::FunctionIndex;
+static inline llvm::AttributeSet GetFuncAttribSet(llvm::LLVMContext&       ctx,
+                                                  const llvm::AttrBuilder& b)
 {
     return llvm::AttributeSet::get(ctx, Attrib_FunctionIndex, b);
 }
@@ -133,7 +135,6 @@
 
 #pragma pop_macro("DEBUG")
 
-
 #include <deque>
 #include <list>
 #include <unordered_map>
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
index 54d45e6..1c9db0c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/shader_lib/DebugOutput.cpp
@@ -1,36 +1,35 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file DebugOutput.cpp
-*
-* @brief Shader support library implementation for printed Debug output
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file DebugOutput.cpp
+ *
+ * @brief Shader support library implementation for printed Debug output
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include <stdarg.h>
 #include "common/os.h"
 
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief called in JIT code, inserted by PRINT
 /// output to both stdout and visual studio debug console
@@ -40,7 +39,7 @@
     va_start(args, fmt);
     vprintf(fmt, args);
 
-#if defined( _WIN32 )
+#if defined(_WIN32)
     char strBuf[1024];
     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
     OutputDebugStringA(strBuf);
@@ -48,4 +47,3 @@
 
     va_end(args);
 }
-
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 15a6bc4..11ad365 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file streamout_jit.cpp
-*
-* @brief Implementation of the streamout jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file streamout_jit.cpp
+ *
+ * @brief Implementation of the streamout jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #include "jit_pch.hpp"
 #include "builder.h"
 #include "jit_api.h"
@@ -44,13 +44,12 @@
 {
     StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
 
-    // returns pointer to SWR_STREAMOUT_BUFFER 
+    // returns pointer to SWR_STREAMOUT_BUFFER
     Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
     {
-        return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
+        return LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer});
     }
 
-
     //////////////////////////////////////////////////////////////////////////
     // @brief checks if streamout buffer is oob
     // @return <i1> true/false
@@ -62,28 +61,27 @@
 
         // load enable
         // @todo bool data types should generate <i1> llvm type
-        Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
+        Value* enabled = TRUNC(LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_enable}), IRB()->getInt1Ty());
 
         // load buffer size
-        Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
-        
+        Value* bufferSize = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_bufferSize});
+
         // load current streamOffset
-        Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+        Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
 
         // load buffer pitch
-        Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+        Value* pitch = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
 
         // buffer is considered oob if in use in a decl but not enabled
         returnMask = OR(returnMask, NOT(enabled));
 
         // buffer is oob if cannot fit a prims worth of verts
         Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
-        returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
+        returnMask       = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
 
         return returnMask;
     }
 
-
     //////////////////////////////////////////////////////////////////////////
     // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
     //        packing the active mask bits
@@ -93,8 +91,8 @@
     Value* PackMask(uint32_t bitmask)
     {
         std::vector<Constant*> indices(4, C(0));
-        DWORD index;
-        uint32_t elem = 0;
+        DWORD                  index;
+        uint32_t               elem = 0;
         while (_BitScanForward(&index, bitmask))
         {
             indices[elem++] = C((int)index);
@@ -133,17 +131,17 @@
     void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
     {
         uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
-        uint32_t packedMask = (1 << numComponents) - 1;
+        uint32_t packedMask    = (1 << numComponents) - 1;
         if (!decl.hole)
         {
             // increment stream pointer to correct slot
             Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot));
 
             // load 4 components from stream
-            Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4);
+            Type* simd4Ty    = VectorType::get(IRB()->getFloatTy(), 4);
             Type* simd4PtrTy = PointerType::get(simd4Ty, 0);
-            pAttrib = BITCAST(pAttrib, simd4PtrTy);
-            Value *vattrib = LOAD(pAttrib);
+            pAttrib          = BITCAST(pAttrib, simd4PtrTy);
+            Value* vattrib   = LOAD(pAttrib);
 
             // shuffle/pack enabled components
             Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask));
@@ -178,7 +176,11 @@
         }
     }
 
-    void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc)
+    void buildStream(const STREAMOUT_COMPILE_STATE& state,
+                     const STREAMOUT_STREAM&        streamState,
+                     Value*                         pSoCtx,
+                     BasicBlock*                    returnBB,
+                     Function*                      soFunc)
     {
         // get list of active SO buffers
         std::unordered_set<uint32_t> activeSOBuffers;
@@ -189,9 +191,9 @@
         }
 
         // always increment numPrimStorageNeeded
-        Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
-        numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1));
-        STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded });
+        Value* numPrimStorageNeeded = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
+        numPrimStorageNeeded        = ADD(numPrimStorageNeeded, C(1));
+        STORE(numPrimStorageNeeded, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded});
 
         // check OOB on active SO buffers.  If any buffer is out of bound, don't write
         // the primitive to any buffer
@@ -208,27 +210,27 @@
 
         IRB()->SetInsertPoint(validBB);
 
-        Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
-        numPrimsWritten = ADD(numPrimsWritten, C(1));
-        STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten });
+        Value* numPrimsWritten = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
+        numPrimsWritten        = ADD(numPrimsWritten, C(1));
+        STORE(numPrimsWritten, pSoCtx, {0, SWR_STREAMOUT_CONTEXT_numPrimsWritten});
 
         // compute start pointer for each output buffer
         Value* pOutBuffer[4];
         Value* pOutBufferStartVertex[4];
         Value* outBufferPitch[4];
-        for (uint32_t b: activeSOBuffers)
+        for (uint32_t b : activeSOBuffers)
         {
-            Value* pBuf = getSOBuffer(pSoCtx, b);
-            Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer });
-            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
-            pOutBuffer[b] = GEP(pData, streamOffset);
+            Value* pBuf              = getSOBuffer(pSoCtx, b);
+            Value* pData             = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pBuffer});
+            Value* streamOffset      = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
+            pOutBuffer[b]            = GEP(pData, streamOffset);
             pOutBufferStartVertex[b] = pOutBuffer[b];
 
-            outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+            outBufferPitch[b] = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_pitch});
         }
 
         // loop over the vertices of the prim
-        Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData });
+        Value* pStreamData = LOAD(pSoCtx, {0, SWR_STREAMOUT_CONTEXT_pPrimData});
         for (uint32_t v = 0; v < state.numVertsPerPrim; ++v)
         {
             buildVertex(streamState, pStreamData, pOutBuffer);
@@ -241,23 +243,24 @@
             for (uint32_t b : activeSOBuffers)
             {
                 pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]);
-                pOutBuffer[b] = pOutBufferStartVertex[b];
+                pOutBuffer[b]            = pOutBufferStartVertex[b];
             }
         }
 
         // update each active buffer's streamOffset
         for (uint32_t b : activeSOBuffers)
         {
-            Value* pBuf = getSOBuffer(pSoCtx, b);
-            Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+            Value* pBuf         = getSOBuffer(pSoCtx, b);
+            Value* streamOffset = LOAD(pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
             streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b]));
-            STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+            STORE(streamOffset, pBuf, {0, SWR_STREAMOUT_BUFFER_streamOffset});
         }
     }
 
     Function* Create(const STREAMOUT_COMPILE_STATE& state)
     {
-        std::stringstream fnName("SO_", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+        std::stringstream fnName("SO_",
+                                 std::ios_base::in | std::ios_base::out | std::ios_base::ate);
         fnName << ComputeCRC(0, &state, sizeof(state));
 
         // SO function signature
@@ -267,19 +270,20 @@
             PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
         };
 
-        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
-        Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+        FunctionType* fTy    = FunctionType::get(IRB()->getVoidTy(), args, false);
+        Function*     soFunc = Function::Create(
+            fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
 
         soFunc->getParent()->setModuleIdentifier(soFunc->getName());
 
         // create return basic block
-        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc);
+        BasicBlock* entry    = BasicBlock::Create(JM()->mContext, "entry", soFunc);
         BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc);
 
         IRB()->SetInsertPoint(entry);
 
         // arguments
-        auto argitr = soFunc->arg_begin();
+        auto   argitr = soFunc->arg_begin();
         Value* pSoCtx = &*argitr++;
         pSoCtx->setName("pSoCtx");
 
@@ -302,17 +306,17 @@
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
         passes.add(createInstructionCombiningPass());
-        passes.add(createInstructionSimplifierPass());
         passes.add(createConstantPropagationPass());
         passes.add(createSCCPPass());
         passes.add(createAggressiveDCEPass());
 
-        passes.add(createLowerX86Pass(JM(), this));
+        passes.add(createLowerX86Pass(this));
 
         passes.run(*soFunc);
 
         JitManager::DumpToFile(soFunc, "SoFunc_optimized");
 
+
         return soFunc;
     }
 };
@@ -324,15 +328,17 @@
 /// @return PFN_SO_FUNC - pointer to SOS function
 PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
 {
-    llvm::Function *func = (llvm::Function*)hFunc;
-    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
-    PFN_SO_FUNC pfnStreamOut;
+    llvm::Function* func    = (llvm::Function*)hFunc;
+    JitManager*     pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
+    PFN_SO_FUNC     pfnStreamOut;
     pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
-    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
+    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot
+    // add new IR to the module
     pJitMgr->mIsModuleFinalized = true;
 
     pJitMgr->DumpAsm(func, "SoFunc_optimized");
 
+
     return pfnStreamOut;
 }
 
@@ -340,7 +346,8 @@
 /// @brief JIT compiles streamout shader
 /// @param hJitMgr - JitManager handle
 /// @param state   - SO state to build function from
-extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state)
+extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE                         hJitMgr,
+                                                   const STREAMOUT_COMPILE_STATE& state)
 {
     JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
 
@@ -356,7 +363,7 @@
     pJitMgr->SetupNewModule();
 
     StreamOutJit theJit(pJitMgr);
-    HANDLE hFunc = theJit.Create(soState);
+    HANDLE       hFunc = theJit.Create(soState);
 
     return JitStreamoutFunc(hJitMgr, hFunc);
 }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
index 097f8ab..cee7b57 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h
@@ -1,32 +1,32 @@
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file streamout_jit.h
-*
-* @brief Definition of the streamout jitter
-*
-* Notes:
-*
-******************************************************************************/
+ * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * @file streamout_jit.h
+ *
+ * @brief Definition of the streamout jitter
+ *
+ * Notes:
+ *
+ ******************************************************************************/
 #pragma once
 
 #include "common/formats.h"
@@ -43,7 +43,7 @@
     // attribute to stream
     uint32_t attribSlot;
 
-    // attribute component mask 
+    // attribute component mask
     uint32_t componentMask;
 
     // indicates this decl is a hole
@@ -69,24 +69,31 @@
 {
     // number of verts per primitive
     uint32_t numVertsPerPrim;
-    uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
+    uint32_t
+        offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
 
     uint64_t streamMask;
 
     // stream decls
     STREAMOUT_STREAM stream;
 
-    bool operator==(const STREAMOUT_COMPILE_STATE &other) const
+    bool operator==(const STREAMOUT_COMPILE_STATE& other) const
     {
-        if (numVertsPerPrim != other.numVertsPerPrim) return false;
-        if (stream.numDecls != other.stream.numDecls) return false;
+        if (numVertsPerPrim != other.numVertsPerPrim)
+            return false;
+        if (stream.numDecls != other.stream.numDecls)
+            return false;
 
         for (uint32_t i = 0; i < stream.numDecls; ++i)
         {
-            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
-            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
-            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
-            if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
+            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex)
+                return false;
+            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot)
+                return false;
+            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask)
+                return false;
+            if (stream.decl[i].hole != other.stream.decl[i].hole)
+                return false;
         }
 
         return true;
diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
new file mode 100644
index 0000000..bff96e1
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.cpp
@@ -0,0 +1,39 @@
+/****************************************************************************
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file InitMemory.cpp
+*
+* @brief Provide access to tiles table initialization functions
+*
+******************************************************************************/
+#include "memory/InitMemory.h"
+
+void InitSimLoadTilesTable();
+void InitSimStoreTilesTable();
+void InitSimClearTilesTable();
+
+void InitTilesTable()
+{
+    InitSimLoadTilesTable();
+    InitSimStoreTilesTable();
+    InitSimClearTilesTable();
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h
new file mode 100644
index 0000000..14cca6a
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/InitMemory.h
@@ -0,0 +1,33 @@
+/****************************************************************************
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file InitMemory.h
+*
+* @brief Provide access to tiles table initialization functions
+*
+******************************************************************************/
+#include "common/os.h"
+
+extern "C"
+{
+    SWR_VISIBLE void SWR_API InitTilesTable();
+}
diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp
index 01b9804..c5b2e29 100644
--- a/src/gallium/drivers/swr/swr_loader.cpp
+++ b/src/gallium/drivers/swr/swr_loader.cpp
@@ -21,6 +21,7 @@
  * IN THE SOFTWARE.
  ***************************************************************************/
 
+#include "memory/InitMemory.h"
 #include "util/u_cpu_detect.h"
 #include "util/u_dl.h"
 #include "swr_public.h"
@@ -35,6 +36,7 @@
 #ifdef HAVE_SWR_BUILTIN
    screen->pLibrary = NULL;
    screen->pfnSwrGetInterface = SwrGetInterface;
+   InitTilesTable();
    fprintf(stderr, "(using: builtin).\n");
 #else
    char filename[256] = { 0 };
@@ -48,7 +50,9 @@
 
    util_dl_proc pApiProc = util_dl_get_proc_address(screen->pLibrary,
       "SwrGetInterface");
-   if (!pApiProc) {
+   util_dl_proc pInitFunc = util_dl_get_proc_address(screen->pLibrary,
+      "InitTilesTable");
+   if (!pApiProc || !pInitFunc) {
       fprintf(stderr, "(skipping: %s).\n", util_dl_error());
       util_dl_close(screen->pLibrary);
       screen->pLibrary = NULL;
@@ -56,6 +60,8 @@
    }
 
    screen->pfnSwrGetInterface = (PFNSwrGetInterface)pApiProc;
+   pInitFunc();
+
    fprintf(stderr, "(using: %s).\n", filename);
 #endif
    return true;
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index bf71683..084f55d 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -85,6 +85,7 @@
                         enum pipe_format format,
                         enum pipe_texture_target target,
                         unsigned sample_count,
+                        unsigned storage_sample_count,
                         unsigned bind)
 {
    struct swr_screen *screen = swr_screen(_screen);
@@ -100,6 +101,9 @@
           || target == PIPE_TEXTURE_CUBE
           || target == PIPE_TEXTURE_CUBE_ARRAY);
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    format_desc = util_format_description(format);
    if (!format_desc)
       return FALSE;
@@ -183,6 +187,8 @@
       return 7;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return 330;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return 140;
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
       return 16;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
@@ -342,11 +348,19 @@
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -400,6 +414,10 @@
       return 0.0;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return 16.0; /* arbitrary */
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
    }
    /* should only get here on unhandled cases */
    debug_printf("Unexpected PIPE_CAPF %d query\n", param);
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index 13d8986..afa184f 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -1402,7 +1402,7 @@
 
    // after the gallivm passes, we have to lower the core's intrinsics
    llvm::legacy::FunctionPassManager lowerPass(JM()->mpCurrentModule);
-   lowerPass.add(createLowerX86Pass(mpJitMgr, this));
+   lowerPass.add(createLowerX86Pass(this));
    lowerPass.run(*pFunction);
 
    PFN_PIXEL_KERNEL kernel =
diff --git a/src/gallium/drivers/tegra/tegra_screen.c b/src/gallium/drivers/tegra/tegra_screen.c
index 7283d08..73fdc6329 100644
--- a/src/gallium/drivers/tegra/tegra_screen.c
+++ b/src/gallium/drivers/tegra/tegra_screen.c
@@ -132,12 +132,14 @@
                                  enum pipe_format format,
                                  enum pipe_texture_target target,
                                  unsigned sample_count,
+                                 unsigned storage_sample_count,
                                  unsigned usage)
 {
    struct tegra_screen *screen = to_tegra_screen(pscreen);
 
    return screen->gpu->is_format_supported(screen->gpu, format, target,
-                                           sample_count, usage);
+                                           sample_count, storage_sample_count,
+                                           usage);
 }
 
 static boolean
@@ -229,7 +231,7 @@
 
    memset(&handle, 0, sizeof(handle));
    handle.modifier = DRM_FORMAT_MOD_INVALID;
-   handle.type = DRM_API_HANDLE_TYPE_FD;
+   handle.type = WINSYS_HANDLE_TYPE_FD;
 
    status = screen->gpu->resource_get_handle(screen->gpu, NULL, resource->gpu,
                                              &handle, usage);
@@ -387,7 +389,7 @@
     * to pass buffers into Tegra DRM for display. In all other cases, return
     * the Nouveau handle, assuming they will be used for sharing in DRI2/3.
     */
-   if (handle->type == DRM_API_HANDLE_TYPE_KMS &&
+   if (handle->type == WINSYS_HANDLE_TYPE_KMS &&
        presource->bind & PIPE_BIND_SCANOUT) {
       handle->modifier = resource->modifier;
       handle->handle = resource->handle;
diff --git a/src/gallium/drivers/vc5/.editorconfig b/src/gallium/drivers/v3d/.editorconfig
similarity index 100%
rename from src/gallium/drivers/vc5/.editorconfig
rename to src/gallium/drivers/v3d/.editorconfig
diff --git a/src/gallium/drivers/v3d/Automake.inc b/src/gallium/drivers/v3d/Automake.inc
new file mode 100644
index 0000000..91ae826
--- /dev/null
+++ b/src/gallium/drivers/v3d/Automake.inc
@@ -0,0 +1,16 @@
+if HAVE_GALLIUM_V3D
+
+TARGET_DRIVERS += v3d
+TARGET_CPPFLAGS += -DGALLIUM_V3D
+TARGET_LIB_DEPS += \
+	$(top_builddir)/src/gallium/winsys/v3d/drm/libv3ddrm.la \
+	$(top_builddir)/src/gallium/drivers/v3d/libv3d.la \
+	$(top_builddir)/src/broadcom/libbroadcom.la \
+	$(top_builddir)/src/broadcom/libbroadcom_v33.la \
+	$(top_builddir)/src/broadcom/libbroadcom_v41.la
+
+if !HAVE_GALLIUM_VC4
+TARGET_LIB_DEPS += $(top_builddir)/src/broadcom/cle/libbroadcom_cle.la
+endif
+
+endif
diff --git a/src/gallium/drivers/vc5/Makefile.am b/src/gallium/drivers/v3d/Makefile.am
similarity index 73%
rename from src/gallium/drivers/vc5/Makefile.am
rename to src/gallium/drivers/v3d/Makefile.am
index 419c9ab..5b4ed5d 100644
--- a/src/gallium/drivers/vc5/Makefile.am
+++ b/src/gallium/drivers/v3d/Makefile.am
@@ -24,33 +24,36 @@
 
 AM_CFLAGS = \
 	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/include/drm-uapi \
+	-I$(top_srcdir)/src/broadcom \
 	-I$(top_builddir)/src/broadcom \
+	-I$(top_builddir)/src \
 	$(LIBDRM_CFLAGS) \
-	$(VC5_SIMULATOR_CFLAGS) \
+	$(V3D_SIMULATOR_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(VALGRIND_CFLAGS) \
 	$()
 
 noinst_LTLIBRARIES = \
-	libvc5.la \
-	libvc5_v33.la \
-	libvc5_v41.la \
+	libv3d.la \
+	libv3d_v33.la \
+	libv3d_v41.la \
 	$()
 
-libvc5_v33_la_SOURCES = $(VC5_PER_VERSION_SOURCES)
-libvc5_v33_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=33
+libv3d_v33_la_SOURCES = $(V3D_PER_VERSION_SOURCES)
+libv3d_v33_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=33
 
-libvc5_v41_la_SOURCES = $(VC5_PER_VERSION_SOURCES)
-libvc5_v41_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=41
+libv3d_v41_la_SOURCES = $(V3D_PER_VERSION_SOURCES)
+libv3d_v41_la_CFLAGS = $(AM_CFLAGS) -DV3D_VERSION=41
 
-libvc5_la_SOURCES = $(C_SOURCES)
+libv3d_la_SOURCES = $(C_SOURCES)
 
-libvc5_la_LDFLAGS = \
-	$(VC5_SIMULATOR_LIBS) \
+libv3d_la_LDFLAGS = \
+	$(V3D_SIMULATOR_LIBS) \
 	$(NULL)
-libvc5_la_LIBADD = \
-	libvc5_v33.la \
-	libvc5_v41.la \
+libv3d_la_LIBADD = \
+	libv3d_v33.la \
+	libv3d_v41.la \
 	$()
 
 EXTRA_DIST = meson.build
diff --git a/src/gallium/drivers/v3d/Makefile.sources b/src/gallium/drivers/v3d/Makefile.sources
new file mode 100644
index 0000000..c81ccb4
--- /dev/null
+++ b/src/gallium/drivers/v3d/Makefile.sources
@@ -0,0 +1,36 @@
+C_SOURCES := \
+	v3d_blit.c \
+	v3d_bufmgr.c \
+	v3d_bufmgr.h \
+	v3d_cl.c \
+	v3d_cl.h \
+	v3d_context.c \
+	v3d_context.h \
+	v3d_fence.c \
+	v3d_formats.c \
+	v3d_format_table.h \
+	v3d_job.c \
+	v3d_program.c \
+	v3d_query.c \
+	v3d_resource.c \
+	v3d_resource.h \
+	v3d_screen.c \
+	v3d_screen.h \
+	v3d_simulator.c \
+	v3d_simulator_wrapper.cpp \
+	v3d_simulator_wrapper.h \
+	v3d_tiling.c \
+	v3d_tiling.h \
+	v3d_uniforms.c \
+	$()
+
+V3D_PER_VERSION_SOURCES = \
+	v3dx_context.h \
+	v3dx_draw.c \
+	v3dx_emit.c \
+	v3dx_format_table.c \
+	v3dx_job.c \
+	v3dx_rcl.c \
+	v3dx_simulator.c \
+	v3dx_state.c \
+	$()
diff --git a/src/gallium/drivers/vc5/meson.build b/src/gallium/drivers/v3d/meson.build
similarity index 70%
rename from src/gallium/drivers/vc5/meson.build
rename to src/gallium/drivers/v3d/meson.build
index 4f20c26..18e68a6 100644
--- a/src/gallium/drivers/vc5/meson.build
+++ b/src/gallium/drivers/v3d/meson.build
@@ -18,52 +18,52 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-files_libvc5 = files(
-  'vc5_blit.c',
-  'vc5_bufmgr.c',
-  'vc5_bufmgr.h',
-  'vc5_cl.c',
-  'vc5_cl.h',
-  'vc5_context.c',
-  'vc5_context.h',
-  'vc5_fence.c',
-  'vc5_formats.c',
-  'vc5_job.c',
-  'vc5_program.c',
-  'vc5_query.c',
-  'vc5_resource.c',
-  'vc5_resource.h',
-  'vc5_screen.c',
-  'vc5_screen.h',
-  'vc5_simulator.c',
-  'vc5_simulator_wrapper.cpp',
-  'vc5_tiling.c',
-  'vc5_tiling.h',
-  'vc5_uniforms.c',
+files_libv3d = files(
+  'v3d_blit.c',
+  'v3d_bufmgr.c',
+  'v3d_bufmgr.h',
+  'v3d_cl.c',
+  'v3d_cl.h',
+  'v3d_context.c',
+  'v3d_context.h',
+  'v3d_fence.c',
+  'v3d_formats.c',
+  'v3d_job.c',
+  'v3d_program.c',
+  'v3d_query.c',
+  'v3d_resource.c',
+  'v3d_resource.h',
+  'v3d_screen.c',
+  'v3d_screen.h',
+  'v3d_simulator.c',
+  'v3d_simulator_wrapper.cpp',
+  'v3d_tiling.c',
+  'v3d_tiling.h',
+  'v3d_uniforms.c',
 )
 
 files_per_version = files(
+  'v3dx_draw.c',
+  'v3dx_emit.c',
   'v3dx_format_table.c',
   'v3dx_job.c',
+  'v3dx_rcl.c',
   'v3dx_simulator.c',
-  'vc5_draw.c',
-  'vc5_emit.c',
-  'vc5_rcl.c',
-  'vc5_state.c',
+  'v3dx_state.c',
 )
 
 v3dv3_c_args = []
-dep_v3dv3 = dependency('v3dv3')
+dep_v3dv3 = dependency('v3dv3', required: false)
 if dep_v3dv3.found()
-  v3dv3_c_args = '-DUSE_VC5_SIMULATOR'
+  v3dv3_c_args = '-DUSE_V3D_SIMULATOR'
 endif
 
-vc5_versions = ['33', '41']
+v3d_versions = ['33', '41']
 
 per_version_libs = []
-foreach ver : vc5_versions
+foreach ver : v3d_versions
   per_version_libs += static_library(
-    'vc5-v' + ver,
+    'v3d-v' + ver,
     [files_per_version, v3d_xml_pack, nir_opcodes_h, nir_builder_opcodes_h],
     include_directories : [
       inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
@@ -76,9 +76,9 @@
 
 endforeach
 
-libvc5 = static_library(
-  'vc5',
-  [files_libvc5, v3d_xml_pack],
+libv3d = static_library(
+  'v3d',
+  [files_libv3d, v3d_xml_pack],
   include_directories : [
     inc_src, inc_include, inc_gallium, inc_gallium_aux, inc_broadcom,
     inc_gallium_drivers, inc_drm_uapi,
@@ -89,8 +89,8 @@
   link_with: per_version_libs,
 )
 
-driver_vc5 = declare_dependency(
-  compile_args : '-DGALLIUM_VC5',
-  link_with : [libvc5, libvc5winsys, libbroadcom_cle, libbroadcom_vc5],
+driver_v3d = declare_dependency(
+  compile_args : '-DGALLIUM_V3D',
+  link_with : [libv3d, libv3dwinsys, libbroadcom_cle, libbroadcom_v3d],
   dependencies : idep_nir,
 )
diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
new file mode 100644
index 0000000..1a0406a
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_blit.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright © 2015-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_surface.h"
+#include "util/u_blitter.h"
+#include "v3d_context.h"
+
+#if 0
+static struct pipe_surface *
+v3d_get_blit_surface(struct pipe_context *pctx,
+                     struct pipe_resource *prsc, unsigned level)
+{
+        struct pipe_surface tmpl;
+        /* Template: prsc's own format, miplevel `level`, layer 0 only. */
+        memset(&tmpl, 0, sizeof(tmpl));
+        tmpl.format = prsc->format;
+        tmpl.u.tex.level = level;
+        tmpl.u.tex.first_layer = 0;
+        tmpl.u.tex.last_layer = 0;
+        /* Hands back whatever create_surface() returns, unchecked. */
+        return pctx->create_surface(pctx, prsc, &tmpl);
+}
+
+static bool
+is_tile_unaligned(unsigned size, unsigned tile_size)
+{
+        return size & (tile_size - 1); /* tile_size must be a power of two */
+}
+
+static bool
+v3d_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        bool msaa = (info->src.resource->nr_samples > 1 ||
+                     info->dst.resource->nr_samples > 1);
+        int tile_width = msaa ? 32 : 64;
+        int tile_height = msaa ? 32 : 64;
+        /* Only unscissored, same-box, tile-aligned color copies qualify. */
+        if (util_format_is_depth_or_stencil(info->dst.resource->format))
+                return false;
+
+        if (info->scissor_enable)
+                return false;
+
+        if ((info->mask & PIPE_MASK_RGBA) == 0)
+                return false;
+
+        if (info->dst.box.x != info->src.box.x ||
+            info->dst.box.y != info->src.box.y ||
+            info->dst.box.width != info->src.box.width ||
+            info->dst.box.height != info->src.box.height) {
+                return false;
+        }
+
+        int dst_surface_width = u_minify(info->dst.resource->width0,
+                                         info->dst.level);
+        int dst_surface_height = u_minify(info->dst.resource->height0,
+                                         info->dst.level);
+        if (is_tile_unaligned(info->dst.box.x, tile_width) ||
+            is_tile_unaligned(info->dst.box.y, tile_height) ||
+            (is_tile_unaligned(info->dst.box.width, tile_width) &&
+             info->dst.box.x + info->dst.box.width != dst_surface_width) ||
+            (is_tile_unaligned(info->dst.box.height, tile_height) &&
+             info->dst.box.y + info->dst.box.height != dst_surface_height)) {
+                return false;
+        }
+
+        /* VC5_PACKET_LOAD_TILE_BUFFER_GENERAL uses the
+         * VC5_PACKET_TILE_RENDERING_MODE_CONFIG's width (determined by our
+         * destination surface) to determine the stride.  This may be wrong
+         * when reading from texture miplevels > 0, which are stored in
+         * POT-sized areas.  For MSAA, the tile addresses are computed
+         * explicitly by the RCL, but still use the destination width to
+         * determine the stride (which could be fixed by explicitly supplying
+         * it in the ABI).
+         */
+        struct v3d_resource *rsc = v3d_resource(info->src.resource);
+
+        uint32_t stride;
+
+        if (info->src.resource->nr_samples > 1)
+                stride = align(dst_surface_width, 32) * 4 * rsc->cpp;
+        /* XXX else if (rsc->slices[info->src.level].tiling == VC5_TILING_FORMAT_T)
+           stride = align(dst_surface_width * rsc->cpp, 128); */
+        else
+                stride = align(dst_surface_width * rsc->cpp, 16);
+
+        if (stride != rsc->slices[info->src.level].stride)
+                return false;
+
+        if (info->dst.resource->format != info->src.resource->format)
+                return false;
+
+        if (false) {
+                fprintf(stderr, "RCL blit from %d,%d to %d,%d (%d,%d)\n",
+                        info->src.box.x,
+                        info->src.box.y,
+                        info->dst.box.x,
+                        info->dst.box.y,
+                        info->dst.box.width,
+                        info->dst.box.height);
+        }
+
+        struct pipe_surface *dst_surf =
+                v3d_get_blit_surface(pctx, info->dst.resource, info->dst.level);
+        struct pipe_surface *src_surf =
+                v3d_get_blit_surface(pctx, info->src.resource, info->src.level);
+
+        v3d_flush_jobs_reading_resource(v3d, info->src.resource);
+
+        struct v3d_job *job = v3d_get_job(v3d, dst_surf, NULL);
+        pipe_surface_reference(&job->color_read, src_surf);
+
+        /* If we're resolving from MSAA to single sample, we still need to run
+         * the engine in MSAA mode for the load.
+         */
+        if (!job->msaa && info->src.resource->nr_samples > 1) {
+                job->msaa = true;
+                job->tile_width = 32;
+                job->tile_height = 32;
+        }
+
+        job->draw_min_x = info->dst.box.x;
+        job->draw_min_y = info->dst.box.y;
+        job->draw_max_x = info->dst.box.x + info->dst.box.width;
+        job->draw_max_y = info->dst.box.y + info->dst.box.height;
+        job->draw_width = dst_surf->width;
+        job->draw_height = dst_surf->height;
+        /* NOTE(review): these stores seem to make the fixup above redundant. */
+        job->tile_width = tile_width;
+        job->tile_height = tile_height;
+        job->msaa = msaa;
+        job->needs_flush = true;
+        job->resolve |= PIPE_CLEAR_COLOR;
+
+        v3d_job_submit(v3d, job);
+
+        pipe_surface_reference(&dst_surf, NULL);
+        pipe_surface_reference(&src_surf, NULL);
+        /* Reaching here means the job for this blit was submitted. */
+        return true;
+}
+#endif
+
+/* Stash the currently bound state in v3d->blitter through the
+ * util_blitter_save_*() hooks.  Callers here do this right before blitting. */
+void
+v3d_blitter_save(struct v3d_context *v3d)
+{
+        util_blitter_save_fragment_constant_buffer_slot(v3d->blitter,
+                                                        v3d->constbuf[PIPE_SHADER_FRAGMENT].cb);
+        util_blitter_save_vertex_buffer_slot(v3d->blitter, v3d->vertexbuf.vb);
+        util_blitter_save_vertex_elements(v3d->blitter, v3d->vtx);
+        util_blitter_save_vertex_shader(v3d->blitter, v3d->prog.bind_vs);
+        util_blitter_save_so_targets(v3d->blitter, v3d->streamout.num_targets,
+                                     v3d->streamout.targets);
+        util_blitter_save_rasterizer(v3d->blitter, v3d->rasterizer);
+        util_blitter_save_viewport(v3d->blitter, &v3d->viewport);
+        util_blitter_save_scissor(v3d->blitter, &v3d->scissor);
+        util_blitter_save_fragment_shader(v3d->blitter, v3d->prog.bind_fs);
+        util_blitter_save_blend(v3d->blitter, v3d->blend);
+        util_blitter_save_depth_stencil_alpha(v3d->blitter, v3d->zsa);
+        util_blitter_save_stencil_ref(v3d->blitter, &v3d->stencil_ref);
+        util_blitter_save_sample_mask(v3d->blitter, v3d->sample_mask);
+        util_blitter_save_framebuffer(v3d->blitter, &v3d->framebuffer);
+        util_blitter_save_fragment_sampler_states(v3d->blitter,
+                        v3d->fragtex.num_samplers,
+                        (void **)v3d->fragtex.samplers);
+        util_blitter_save_fragment_sampler_views(v3d->blitter,
+                        v3d->fragtex.num_textures, v3d->fragtex.textures);
+}
+
+/* Blit via u_blitter.  A non-tiled source is first copied into a temporary and
+ * *info is redirected to it (so *info is clobbered).  Returns false on failure.
+ */
+static bool
+v3d_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
+{
+        struct v3d_context *v3d = v3d_context(ctx);
+        struct v3d_resource *src = v3d_resource(info->src.resource);
+        struct pipe_resource *tiled = NULL;
+
+        if (!src->tiled) {
+                struct pipe_box box = {
+                        .x = 0, .y = 0,
+                        .width = u_minify(info->src.resource->width0,
+                                           info->src.level),
+                        .height = u_minify(info->src.resource->height0,
+                                           info->src.level),
+                        .depth = 1,
+                };
+                struct pipe_resource tmpl = {
+                        .target = info->src.resource->target,
+                        .format = info->src.resource->format,
+                        .width0 = box.width,
+                        .height0 = box.height,
+                        .depth0 = 1,
+                        .array_size = 1,
+                };
+                tiled = ctx->screen->resource_create(ctx->screen, &tmpl);
+                if (!tiled) {
+                        fprintf(stderr, "Failed to create tiled blit temp\n");
+                        return false;
+                }
+                ctx->resource_copy_region(ctx, tiled, 0, 0, 0, 0,
+                                          info->src.resource, info->src.level,
+                                          &box);
+                info->src.level = 0;
+                info->src.resource = tiled;
+        }
+
+        bool supported = util_blitter_is_blit_supported(v3d->blitter, info);
+        if (supported) {
+                v3d_blitter_save(v3d);
+                util_blitter_blit(v3d->blitter, info);
+        } else {
+                fprintf(stderr, "blit unsupported %s -> %s\n",
+                    util_format_short_name(info->src.resource->format),
+                    util_format_short_name(info->dst.resource->format));
+        }
+
+        /* Drop the temporary on the unsupported path too, not only on success. */
+        pipe_resource_reference(&tiled, NULL);
+        return supported;
+}
+
+/* Implement stencil blits by reinterpreting the stencil data as a color
+ * texture: R8 for a separate stencil resource, RGBA8888 otherwise.
+ */
+static void
+v3d_stencil_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
+{
+        struct v3d_context *v3d = v3d_context(ctx);
+        struct v3d_resource *src = v3d_resource(info->src.resource);
+        struct v3d_resource *dst = v3d_resource(info->dst.resource);
+        enum pipe_format src_format, dst_format;
+
+        if (src->separate_stencil) {
+                src = src->separate_stencil;
+                src_format = PIPE_FORMAT_R8_UNORM;
+        } else {
+                src_format = PIPE_FORMAT_RGBA8888_UNORM;
+        }
+
+        if (dst->separate_stencil) {
+                dst = dst->separate_stencil;
+                dst_format = PIPE_FORMAT_R8_UNORM;
+        } else {
+                dst_format = PIPE_FORMAT_RGBA8888_UNORM;
+        }
+
+        /* Destination surface: the single layer selected by dst.box.z. */
+        struct pipe_surface dst_tmpl = {
+                .u.tex = {
+                        .level = info->dst.level,
+                        .first_layer = info->dst.box.z,
+                        .last_layer = info->dst.box.z,
+                },
+                .format = dst_format,
+        };
+        struct pipe_surface *dst_surf =
+                ctx->create_surface(ctx, &dst->base, &dst_tmpl);
+
+        /* Sampler view over all layers (depth slices for 3D textures). */
+        struct pipe_sampler_view src_tmpl = {
+                .target = src->base.target,
+                .format = src_format,
+                .u.tex = {
+                        .first_level = info->src.level,
+                        .last_level = info->src.level,
+                        .first_layer = 0,
+                        .last_layer = (src->base.target == PIPE_TEXTURE_3D ?
+                                       u_minify(src->base.depth0,
+                                                info->src.level) - 1 :
+                                       src->base.array_size - 1),
+                },
+                .swizzle_r = PIPE_SWIZZLE_X,
+                .swizzle_g = PIPE_SWIZZLE_Y,
+                .swizzle_b = PIPE_SWIZZLE_Z,
+                .swizzle_a = PIPE_SWIZZLE_W,
+        };
+        struct pipe_sampler_view *src_view =
+                ctx->create_sampler_view(ctx, &src->base, &src_tmpl);
+
+        v3d_blitter_save(v3d);
+        util_blitter_blit_generic(v3d->blitter, dst_surf, &info->dst.box,
+                                  src_view, &info->src.box,
+                                  src->base.width0, src->base.height0,
+                                  PIPE_MASK_R,
+                                  PIPE_TEX_FILTER_NEAREST,
+                                  info->scissor_enable ? &info->scissor : NULL,
+                                  info->alpha_blend);
+
+        pipe_surface_reference(&dst_surf, NULL);
+        pipe_sampler_view_reference(&src_view, NULL);
+}
+
+/* Blit entry point.  Stencil is handled by v3d_stencil_blit() and masked
+ * off; whatever remains goes through the u_blitter path in v3d_render_blit().
+ */
+void
+v3d_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
+{
+        struct pipe_blit_info info = *blit_info;
+
+        if (info.mask & PIPE_MASK_S) {
+                v3d_stencil_blit(pctx, blit_info);
+                info.mask &= ~PIPE_MASK_S;
+        }
+
+#if 0
+        if (v3d_tile_blit(pctx, blit_info))
+                return;
+#endif
+        /* NOTE(review): still called when info.mask is now 0 (stencil-only). */
+        v3d_render_blit(pctx, &info);
+}
diff --git a/src/gallium/drivers/vc5/vc5_bufmgr.c b/src/gallium/drivers/v3d/v3d_bufmgr.c
similarity index 68%
rename from src/gallium/drivers/vc5/vc5_bufmgr.c
rename to src/gallium/drivers/v3d/v3d_bufmgr.c
index 7a9c04a..d01b60e 100644
--- a/src/gallium/drivers/vc5/vc5_bufmgr.c
+++ b/src/gallium/drivers/v3d/v3d_bufmgr.c
@@ -32,8 +32,8 @@
 #include "util/u_memory.h"
 #include "util/ralloc.h"
 
-#include "vc5_context.h"
-#include "vc5_screen.h"
+#include "v3d_context.h"
+#include "v3d_screen.h"
 
 #ifdef HAVE_VALGRIND
 #include <valgrind.h>
@@ -46,25 +46,32 @@
 static bool dump_stats = false;
 
 static void
-vc5_bo_cache_free_all(struct vc5_bo_cache *cache);
+v3d_bo_cache_free_all(struct v3d_bo_cache *cache);
 
 static void
-vc5_bo_dump_stats(struct vc5_screen *screen)
+v3d_bo_dump_stats(struct v3d_screen *screen)
 {
-        struct vc5_bo_cache *cache = &screen->bo_cache;
+        struct v3d_bo_cache *cache = &screen->bo_cache;
+
+        uint32_t cache_count = 0;
+        uint32_t cache_size = 0;
+        list_for_each_entry(struct v3d_bo, bo, &cache->time_list, time_list) {
+                cache_count++;
+                cache_size += bo->size;
+        }
 
         fprintf(stderr, "  BOs allocated:   %d\n", screen->bo_count);
         fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 1024);
-        fprintf(stderr, "  BOs cached:      %d\n", cache->bo_count);
-        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
+        fprintf(stderr, "  BOs cached:      %d\n", cache_count);
+        fprintf(stderr, "  BOs cached size: %dkb\n", cache_size / 1024);
 
         if (!list_empty(&cache->time_list)) {
-                struct vc5_bo *first = LIST_ENTRY(struct vc5_bo,
-                                                  cache->time_list.next,
-                                                  time_list);
-                struct vc5_bo *last = LIST_ENTRY(struct vc5_bo,
-                                                  cache->time_list.prev,
-                                                  time_list);
+                struct v3d_bo *first = list_first_entry(&cache->time_list,
+                                                        struct v3d_bo,
+                                                        time_list);
+                struct v3d_bo *last = list_last_entry(&cache->time_list,
+                                                      struct v3d_bo,
+                                                      time_list);
 
                 fprintf(stderr, "  oldest cache time: %ld\n",
                         (long)first->free_time);
@@ -79,40 +86,38 @@
 }
 
 static void
-vc5_bo_remove_from_cache(struct vc5_bo_cache *cache, struct vc5_bo *bo)
+v3d_bo_remove_from_cache(struct v3d_bo_cache *cache, struct v3d_bo *bo)
 {
         list_del(&bo->time_list);
         list_del(&bo->size_list);
-        cache->bo_count--;
-        cache->bo_size -= bo->size;
 }
 
-static struct vc5_bo *
-vc5_bo_from_cache(struct vc5_screen *screen, uint32_t size, const char *name)
+static struct v3d_bo *
+v3d_bo_from_cache(struct v3d_screen *screen, uint32_t size, const char *name)
 {
-        struct vc5_bo_cache *cache = &screen->bo_cache;
+        struct v3d_bo_cache *cache = &screen->bo_cache;
         uint32_t page_index = size / 4096 - 1;
 
         if (cache->size_list_size <= page_index)
                 return NULL;
 
-        struct vc5_bo *bo = NULL;
+        struct v3d_bo *bo = NULL;
         mtx_lock(&cache->lock);
         if (!list_empty(&cache->size_list[page_index])) {
-                bo = LIST_ENTRY(struct vc5_bo, cache->size_list[page_index].next,
-                                size_list);
+                bo = list_first_entry(&cache->size_list[page_index],
+                                      struct v3d_bo, size_list);
 
                 /* Check that the BO has gone idle.  If not, then we want to
                  * allocate something new instead, since we assume that the
                  * user will proceed to CPU map it and fill it with stuff.
                  */
-                if (!vc5_bo_wait(bo, 0, NULL)) {
+                if (!v3d_bo_wait(bo, 0, NULL)) {
                         mtx_unlock(&cache->lock);
                         return NULL;
                 }
 
                 pipe_reference_init(&bo->reference, 1);
-                vc5_bo_remove_from_cache(cache, bo);
+                v3d_bo_remove_from_cache(cache, bo);
 
                 bo->name = name;
         }
@@ -120,25 +125,29 @@
         return bo;
 }
 
-struct vc5_bo *
-vc5_bo_alloc(struct vc5_screen *screen, uint32_t size, const char *name)
+struct v3d_bo *
+v3d_bo_alloc(struct v3d_screen *screen, uint32_t size, const char *name)
 {
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
         int ret;
 
+        /* The CLIF dumping requires that there is no whitespace in the name.
+         */
+        assert(!strchr(name, ' '));
+
         size = align(size, 4096);
 
-        bo = vc5_bo_from_cache(screen, size, name);
+        bo = v3d_bo_from_cache(screen, size, name);
         if (bo) {
                 if (dump_stats) {
                         fprintf(stderr, "Allocated %s %dkb from cache:\n",
                                 name, size / 1024);
-                        vc5_bo_dump_stats(screen);
+                        v3d_bo_dump_stats(screen);
                 }
                 return bo;
         }
 
-        bo = CALLOC_STRUCT(vc5_bo);
+        bo = CALLOC_STRUCT(v3d_bo);
         if (!bo)
                 return NULL;
 
@@ -152,11 +161,11 @@
         ;
 
         bool cleared_and_retried = false;
-        struct drm_vc5_create_bo create = {
+        struct drm_v3d_create_bo create = {
                 .size = size
         };
 
-        ret = vc5_ioctl(screen->fd, DRM_IOCTL_VC5_CREATE_BO, &create);
+        ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_CREATE_BO, &create);
         bo->handle = create.handle;
         bo->offset = create.offset;
 
@@ -164,7 +173,7 @@
                 if (!list_empty(&screen->bo_cache.time_list) &&
                     !cleared_and_retried) {
                         cleared_and_retried = true;
-                        vc5_bo_cache_free_all(&screen->bo_cache);
+                        v3d_bo_cache_free_all(&screen->bo_cache);
                         goto retry;
                 }
 
@@ -176,31 +185,31 @@
         screen->bo_size += bo->size;
         if (dump_stats) {
                 fprintf(stderr, "Allocated %s %dkb:\n", name, size / 1024);
-                vc5_bo_dump_stats(screen);
+                v3d_bo_dump_stats(screen);
         }
 
         return bo;
 }
 
 void
-vc5_bo_last_unreference(struct vc5_bo *bo)
+v3d_bo_last_unreference(struct v3d_bo *bo)
 {
-        struct vc5_screen *screen = bo->screen;
+        struct v3d_screen *screen = bo->screen;
 
         struct timespec time;
         clock_gettime(CLOCK_MONOTONIC, &time);
         mtx_lock(&screen->bo_cache.lock);
-        vc5_bo_last_unreference_locked_timed(bo, time.tv_sec);
+        v3d_bo_last_unreference_locked_timed(bo, time.tv_sec);
         mtx_unlock(&screen->bo_cache.lock);
 }
 
 static void
-vc5_bo_free(struct vc5_bo *bo)
+v3d_bo_free(struct v3d_bo *bo)
 {
-        struct vc5_screen *screen = bo->screen;
+        struct v3d_screen *screen = bo->screen;
 
         if (bo->map) {
-                if (using_vc5_simulator && bo->name &&
+                if (using_v3d_simulator && bo->name &&
                     strcmp(bo->name, "winsys") == 0) {
                         free(bo->map);
                 } else {
@@ -212,7 +221,7 @@
         struct drm_gem_close c;
         memset(&c, 0, sizeof(c));
         c.handle = bo->handle;
-        int ret = vc5_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);
+        int ret = v3d_ioctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);
         if (ret != 0)
                 fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
 
@@ -224,30 +233,29 @@
                         bo->name ? bo->name : "",
                         bo->name ? " " : "",
                         bo->size / 1024);
-                vc5_bo_dump_stats(screen);
+                v3d_bo_dump_stats(screen);
         }
 
         free(bo);
 }
 
 static void
-free_stale_bos(struct vc5_screen *screen, time_t time)
+free_stale_bos(struct v3d_screen *screen, time_t time)
 {
-        struct vc5_bo_cache *cache = &screen->bo_cache;
+        struct v3d_bo_cache *cache = &screen->bo_cache;
         bool freed_any = false;
 
-        list_for_each_entry_safe(struct vc5_bo, bo, &cache->time_list,
+        list_for_each_entry_safe(struct v3d_bo, bo, &cache->time_list,
                                  time_list) {
-                if (dump_stats && !freed_any) {
-                        fprintf(stderr, "Freeing stale BOs:\n");
-                        vc5_bo_dump_stats(screen);
-                        freed_any = true;
-                }
-
                 /* If it's more than a second old, free it. */
                 if (time - bo->free_time > 2) {
-                        vc5_bo_remove_from_cache(cache, bo);
-                        vc5_bo_free(bo);
+                        if (dump_stats && !freed_any) {
+                                fprintf(stderr, "Freeing stale BOs:\n");
+                                v3d_bo_dump_stats(screen);
+                                freed_any = true;
+                        }
+                        v3d_bo_remove_from_cache(cache, bo);
+                        v3d_bo_free(bo);
                 } else {
                         break;
                 }
@@ -255,31 +263,31 @@
 
         if (dump_stats && freed_any) {
                 fprintf(stderr, "Freed stale BOs:\n");
-                vc5_bo_dump_stats(screen);
+                v3d_bo_dump_stats(screen);
         }
 }
 
 static void
-vc5_bo_cache_free_all(struct vc5_bo_cache *cache)
+v3d_bo_cache_free_all(struct v3d_bo_cache *cache)
 {
         mtx_lock(&cache->lock);
-        list_for_each_entry_safe(struct vc5_bo, bo, &cache->time_list,
+        list_for_each_entry_safe(struct v3d_bo, bo, &cache->time_list,
                                  time_list) {
-                vc5_bo_remove_from_cache(cache, bo);
-                vc5_bo_free(bo);
+                v3d_bo_remove_from_cache(cache, bo);
+                v3d_bo_free(bo);
         }
         mtx_unlock(&cache->lock);
 }
 
 void
-vc5_bo_last_unreference_locked_timed(struct vc5_bo *bo, time_t time)
+v3d_bo_last_unreference_locked_timed(struct v3d_bo *bo, time_t time)
 {
-        struct vc5_screen *screen = bo->screen;
-        struct vc5_bo_cache *cache = &screen->bo_cache;
+        struct v3d_screen *screen = bo->screen;
+        struct v3d_bo_cache *cache = &screen->bo_cache;
         uint32_t page_index = bo->size / 4096 - 1;
 
         if (!bo->private) {
-                vc5_bo_free(bo);
+                v3d_bo_free(bo);
                 return;
         }
 
@@ -311,24 +319,22 @@
         bo->free_time = time;
         list_addtail(&bo->size_list, &cache->size_list[page_index]);
         list_addtail(&bo->time_list, &cache->time_list);
-        cache->bo_count++;
-        cache->bo_size += bo->size;
         if (dump_stats) {
                 fprintf(stderr, "Freed %s %dkb to cache:\n",
                         bo->name, bo->size / 1024);
-                vc5_bo_dump_stats(screen);
+                v3d_bo_dump_stats(screen);
         }
         bo->name = NULL;
 
         free_stale_bos(screen, time);
 }
 
-static struct vc5_bo *
-vc5_bo_open_handle(struct vc5_screen *screen,
+static struct v3d_bo *
+v3d_bo_open_handle(struct v3d_screen *screen,
                    uint32_t winsys_stride,
                    uint32_t handle, uint32_t size)
 {
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
 
         assert(size);
 
@@ -340,7 +346,7 @@
                 goto done;
         }
 
-        bo = CALLOC_STRUCT(vc5_bo);
+        bo = CALLOC_STRUCT(v3d_bo);
         pipe_reference_init(&bo->reference, 1);
         bo->screen = screen;
         bo->handle = handle;
@@ -348,23 +354,25 @@
         bo->name = "winsys";
         bo->private = false;
 
-        struct drm_vc5_get_bo_offset get = {
+#ifdef USE_V3D_SIMULATOR
+        v3d_simulator_open_from_handle(screen->fd, winsys_stride,
+                                       bo->handle, bo->size);
+        bo->map = malloc(bo->size);
+#endif
+
+        struct drm_v3d_get_bo_offset get = {
                 .handle = handle,
         };
-        int ret = vc5_ioctl(screen->fd, DRM_IOCTL_VC5_GET_BO_OFFSET, &get);
+        int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_GET_BO_OFFSET, &get);
         if (ret) {
                 fprintf(stderr, "Failed to get BO offset: %s\n",
                         strerror(errno));
+                free(bo->map);
                 free(bo);
                 return NULL;
         }
         bo->offset = get.offset;
-
-#ifdef USE_VC5_SIMULATOR
-        vc5_simulator_open_from_handle(screen->fd, winsys_stride,
-                                       bo->handle, bo->size);
-        bo->map = malloc(bo->size);
-#endif
+        assert(bo->offset != 0);
 
         util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo);
 
@@ -373,31 +381,31 @@
         return bo;
 }
 
-struct vc5_bo *
-vc5_bo_open_name(struct vc5_screen *screen, uint32_t name,
+struct v3d_bo *
+v3d_bo_open_name(struct v3d_screen *screen, uint32_t name,
                  uint32_t winsys_stride)
 {
         struct drm_gem_open o = {
                 .name = name
         };
-        int ret = vc5_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o);
+        int ret = v3d_ioctl(screen->fd, DRM_IOCTL_GEM_OPEN, &o);
         if (ret) {
                 fprintf(stderr, "Failed to open bo %d: %s\n",
                         name, strerror(errno));
                 return NULL;
         }
 
-        return vc5_bo_open_handle(screen, winsys_stride, o.handle, o.size);
+        return v3d_bo_open_handle(screen, winsys_stride, o.handle, o.size);
 }
 
-struct vc5_bo *
-vc5_bo_open_dmabuf(struct vc5_screen *screen, int fd, uint32_t winsys_stride)
+struct v3d_bo *
+v3d_bo_open_dmabuf(struct v3d_screen *screen, int fd, uint32_t winsys_stride)
 {
         uint32_t handle;
         int ret = drmPrimeFDToHandle(screen->fd, fd, &handle);
         int size;
         if (ret) {
-                fprintf(stderr, "Failed to get vc5 handle for dmabuf %d\n", fd);
+                fprintf(stderr, "Failed to get v3d handle for dmabuf %d\n", fd);
                 return NULL;
         }
 
@@ -408,11 +416,11 @@
                 return NULL;
         }
 
-        return vc5_bo_open_handle(screen, winsys_stride, handle, size);
+        return v3d_bo_open_handle(screen, winsys_stride, handle, size);
 }
 
 int
-vc5_bo_get_dmabuf(struct vc5_bo *bo)
+v3d_bo_get_dmabuf(struct v3d_bo *bo)
 {
         int fd;
         int ret = drmPrimeHandleToFD(bo->screen->fd, bo->handle,
@@ -432,12 +440,12 @@
 }
 
 bool
-vc5_bo_flink(struct vc5_bo *bo, uint32_t *name)
+v3d_bo_flink(struct v3d_bo *bo, uint32_t *name)
 {
         struct drm_gem_flink flink = {
                 .handle = bo->handle,
         };
-        int ret = vc5_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink);
+        int ret = v3d_ioctl(bo->screen->fd, DRM_IOCTL_GEM_FLINK, &flink);
         if (ret) {
                 fprintf(stderr, "Failed to flink bo %d: %s\n",
                         bo->handle, strerror(errno));
@@ -451,13 +459,13 @@
         return true;
 }
 
-static int vc5_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
+static int v3d_wait_bo_ioctl(int fd, uint32_t handle, uint64_t timeout_ns)
 {
-        struct drm_vc5_wait_bo wait = {
+        struct drm_v3d_wait_bo wait = {
                 .handle = handle,
                 .timeout_ns = timeout_ns,
         };
-        int ret = vc5_ioctl(fd, DRM_IOCTL_VC5_WAIT_BO, &wait);
+        int ret = v3d_ioctl(fd, DRM_IOCTL_V3D_WAIT_BO, &wait);
         if (ret == -1)
                 return -errno;
         else
@@ -466,18 +474,18 @@
 }
 
 bool
-vc5_bo_wait(struct vc5_bo *bo, uint64_t timeout_ns, const char *reason)
+v3d_bo_wait(struct v3d_bo *bo, uint64_t timeout_ns, const char *reason)
 {
-        struct vc5_screen *screen = bo->screen;
+        struct v3d_screen *screen = bo->screen;
 
         if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && timeout_ns && reason) {
-                if (vc5_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) {
+                if (v3d_wait_bo_ioctl(screen->fd, bo->handle, 0) == -ETIME) {
                         fprintf(stderr, "Blocking on %s BO for %s\n",
                                 bo->name, reason);
                 }
         }
 
-        int ret = vc5_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns);
+        int ret = v3d_wait_bo_ioctl(screen->fd, bo->handle, timeout_ns);
         if (ret) {
                 if (ret != -ETIME) {
                         fprintf(stderr, "wait failed: %d\n", ret);
@@ -491,7 +499,7 @@
 }
 
 void *
-vc5_bo_map_unsynchronized(struct vc5_bo *bo)
+v3d_bo_map_unsynchronized(struct v3d_bo *bo)
 {
         uint64_t offset;
         int ret;
@@ -499,10 +507,10 @@
         if (bo->map)
                 return bo->map;
 
-        struct drm_vc5_mmap_bo map;
+        struct drm_v3d_mmap_bo map;
         memset(&map, 0, sizeof(map));
         map.handle = bo->handle;
-        ret = vc5_ioctl(bo->screen->fd, DRM_IOCTL_VC5_MMAP_BO, &map);
+        ret = v3d_ioctl(bo->screen->fd, DRM_IOCTL_V3D_MMAP_BO, &map);
         offset = map.offset;
         if (ret != 0) {
                 fprintf(stderr, "map ioctl failure\n");
@@ -522,11 +530,11 @@
 }
 
 void *
-vc5_bo_map(struct vc5_bo *bo)
+v3d_bo_map(struct v3d_bo *bo)
 {
-        void *map = vc5_bo_map_unsynchronized(bo);
+        void *map = v3d_bo_map_unsynchronized(bo);
 
-        bool ok = vc5_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map");
+        bool ok = v3d_bo_wait(bo, PIPE_TIMEOUT_INFINITE, "bo map");
         if (!ok) {
                 fprintf(stderr, "BO wait for map failed\n");
                 abort();
@@ -536,15 +544,15 @@
 }
 
 void
-vc5_bufmgr_destroy(struct pipe_screen *pscreen)
+v3d_bufmgr_destroy(struct pipe_screen *pscreen)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_bo_cache *cache = &screen->bo_cache;
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        struct v3d_bo_cache *cache = &screen->bo_cache;
 
-        vc5_bo_cache_free_all(cache);
+        v3d_bo_cache_free_all(cache);
 
         if (dump_stats) {
                 fprintf(stderr, "BO stats after screen destroy:\n");
-                vc5_bo_dump_stats(screen);
+                v3d_bo_dump_stats(screen);
         }
 }
diff --git a/src/gallium/drivers/vc5/vc5_bufmgr.h b/src/gallium/drivers/v3d/v3d_bufmgr.h
similarity index 72%
rename from src/gallium/drivers/vc5/vc5_bufmgr.h
rename to src/gallium/drivers/v3d/v3d_bufmgr.h
index cca2b22..8fbde5a 100644
--- a/src/gallium/drivers/vc5/vc5_bufmgr.h
+++ b/src/gallium/drivers/v3d/v3d_bufmgr.h
@@ -28,13 +28,13 @@
 #include "util/u_hash_table.h"
 #include "util/u_inlines.h"
 #include "util/list.h"
-#include "vc5_screen.h"
+#include "v3d_screen.h"
 
-struct vc5_context;
+struct v3d_context;
 
-struct vc5_bo {
+struct v3d_bo {
         struct pipe_reference reference;
-        struct vc5_screen *screen;
+        struct v3d_screen *screen;
         void *map;
         const char *name;
         uint32_t handle;
@@ -56,43 +56,43 @@
         bool private;
 };
 
-struct vc5_bo *vc5_bo_alloc(struct vc5_screen *screen, uint32_t size,
+struct v3d_bo *v3d_bo_alloc(struct v3d_screen *screen, uint32_t size,
                             const char *name);
-void vc5_bo_last_unreference(struct vc5_bo *bo);
-void vc5_bo_last_unreference_locked_timed(struct vc5_bo *bo, time_t time);
-struct vc5_bo *vc5_bo_open_name(struct vc5_screen *screen, uint32_t name,
+void v3d_bo_last_unreference(struct v3d_bo *bo);
+void v3d_bo_last_unreference_locked_timed(struct v3d_bo *bo, time_t time);
+struct v3d_bo *v3d_bo_open_name(struct v3d_screen *screen, uint32_t name,
                                 uint32_t winsys_stride);
-struct vc5_bo *vc5_bo_open_dmabuf(struct vc5_screen *screen, int fd,
+struct v3d_bo *v3d_bo_open_dmabuf(struct v3d_screen *screen, int fd,
                                   uint32_t winsys_stride);
-bool vc5_bo_flink(struct vc5_bo *bo, uint32_t *name);
-int vc5_bo_get_dmabuf(struct vc5_bo *bo);
+bool v3d_bo_flink(struct v3d_bo *bo, uint32_t *name);
+int v3d_bo_get_dmabuf(struct v3d_bo *bo);
 
 static inline void
-vc5_bo_set_reference(struct vc5_bo **old_bo, struct vc5_bo *new_bo)
+v3d_bo_set_reference(struct v3d_bo **old_bo, struct v3d_bo *new_bo)
 {
         if (pipe_reference(&(*old_bo)->reference, &new_bo->reference))
-                vc5_bo_last_unreference(*old_bo);
+                v3d_bo_last_unreference(*old_bo);
         *old_bo = new_bo;
 }
 
-static inline struct vc5_bo *
-vc5_bo_reference(struct vc5_bo *bo)
+static inline struct v3d_bo *
+v3d_bo_reference(struct v3d_bo *bo)
 {
         pipe_reference(NULL, &bo->reference);
         return bo;
 }
 
 static inline void
-vc5_bo_unreference(struct vc5_bo **bo)
+v3d_bo_unreference(struct v3d_bo **bo)
 {
-        struct vc5_screen *screen;
+        struct v3d_screen *screen;
         if (!*bo)
                 return;
 
         if ((*bo)->private) {
                 /* Avoid the mutex for private BOs */
                 if (pipe_reference(&(*bo)->reference, NULL))
-                        vc5_bo_last_unreference(*bo);
+                        v3d_bo_last_unreference(*bo);
         } else {
                 screen = (*bo)->screen;
                 mtx_lock(&screen->bo_handles_mutex);
@@ -100,7 +100,7 @@
                 if (pipe_reference(&(*bo)->reference, NULL)) {
                         util_hash_table_remove(screen->bo_handles,
                                                (void *)(uintptr_t)(*bo)->handle);
-                        vc5_bo_last_unreference(*bo);
+                        v3d_bo_last_unreference(*bo);
                 }
 
                 mtx_unlock(&screen->bo_handles_mutex);
@@ -110,31 +110,31 @@
 }
 
 static inline void
-vc5_bo_unreference_locked_timed(struct vc5_bo **bo, time_t time)
+v3d_bo_unreference_locked_timed(struct v3d_bo **bo, time_t time)
 {
         if (!*bo)
                 return;
 
         if (pipe_reference(&(*bo)->reference, NULL))
-                vc5_bo_last_unreference_locked_timed(*bo, time);
+                v3d_bo_last_unreference_locked_timed(*bo, time);
         *bo = NULL;
 }
 
 void *
-vc5_bo_map(struct vc5_bo *bo);
+v3d_bo_map(struct v3d_bo *bo);
 
 void *
-vc5_bo_map_unsynchronized(struct vc5_bo *bo);
+v3d_bo_map_unsynchronized(struct v3d_bo *bo);
 
 bool
-vc5_bo_wait(struct vc5_bo *bo, uint64_t timeout_ns, const char *reason);
+v3d_bo_wait(struct v3d_bo *bo, uint64_t timeout_ns, const char *reason);
 
 bool
-vc5_wait_seqno(struct vc5_screen *screen, uint64_t seqno, uint64_t timeout_ns,
+v3d_wait_seqno(struct v3d_screen *screen, uint64_t seqno, uint64_t timeout_ns,
                const char *reason);
 
 void
-vc5_bufmgr_destroy(struct pipe_screen *pscreen);
+v3d_bufmgr_destroy(struct pipe_screen *pscreen);
 
 #endif /* VC5_BUFMGR_H */
 
diff --git a/src/gallium/drivers/vc5/vc5_cl.c b/src/gallium/drivers/v3d/v3d_cl.c
similarity index 78%
rename from src/gallium/drivers/vc5/vc5_cl.c
rename to src/gallium/drivers/v3d/v3d_cl.c
index a10c164..94e83a8 100644
--- a/src/gallium/drivers/vc5/vc5_cl.c
+++ b/src/gallium/drivers/v3d/v3d_cl.c
@@ -23,14 +23,14 @@
 
 #include "util/u_math.h"
 #include "util/ralloc.h"
-#include "vc5_context.h"
+#include "v3d_context.h"
 /* The branching packets are the same across V3D versions. */
 #define V3D_VERSION 33
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
 
 void
-vc5_init_cl(struct vc5_job *job, struct vc5_cl *cl)
+v3d_init_cl(struct v3d_job *job, struct v3d_cl *cl)
 {
         cl->base = NULL;
         cl->next = cl->base;
@@ -39,7 +39,7 @@
 }
 
 uint32_t
-vc5_cl_ensure_space(struct vc5_cl *cl, uint32_t space, uint32_t alignment)
+v3d_cl_ensure_space(struct v3d_cl *cl, uint32_t space, uint32_t alignment)
 {
         uint32_t offset = align(cl_offset(cl), alignment);
 
@@ -48,9 +48,9 @@
                 return offset;
         }
 
-        vc5_bo_unreference(&cl->bo);
-        cl->bo = vc5_bo_alloc(cl->job->vc5->screen, align(space, 4096), "CL");
-        cl->base = vc5_bo_map(cl->bo);
+        v3d_bo_unreference(&cl->bo);
+        cl->bo = v3d_bo_alloc(cl->job->v3d->screen, align(space, 4096), "CL");
+        cl->base = v3d_bo_map(cl->bo);
         cl->size = cl->bo->size;
         cl->next = cl->base;
 
@@ -58,12 +58,12 @@
 }
 
 void
-vc5_cl_ensure_space_with_branch(struct vc5_cl *cl, uint32_t space)
+v3d_cl_ensure_space_with_branch(struct v3d_cl *cl, uint32_t space)
 {
         if (cl_offset(cl) + space + cl_packet_length(BRANCH) <= cl->size)
                 return;
 
-        struct vc5_bo *new_bo = vc5_bo_alloc(cl->job->vc5->screen, 4096, "CL");
+        struct v3d_bo *new_bo = v3d_bo_alloc(cl->job->v3d->screen, 4096, "CL");
         assert(space <= new_bo->size);
 
         /* Chain to the new BO from the old one. */
@@ -71,20 +71,20 @@
                 cl_emit(cl, BRANCH, branch) {
                         branch.address = cl_address(new_bo, 0);
                 }
-                vc5_bo_unreference(&cl->bo);
+                v3d_bo_unreference(&cl->bo);
         } else {
                 /* Root the first RCL/BCL BO in the job. */
-                vc5_job_add_bo(cl->job, cl->bo);
+                v3d_job_add_bo(cl->job, cl->bo);
         }
 
         cl->bo = new_bo;
-        cl->base = vc5_bo_map(cl->bo);
+        cl->base = v3d_bo_map(cl->bo);
         cl->size = cl->bo->size;
         cl->next = cl->base;
 }
 
 void
-vc5_destroy_cl(struct vc5_cl *cl)
+v3d_destroy_cl(struct v3d_cl *cl)
 {
-        vc5_bo_unreference(&cl->bo);
+        v3d_bo_unreference(&cl->bo);
 }
diff --git a/src/gallium/drivers/vc5/vc5_cl.h b/src/gallium/drivers/v3d/v3d_cl.h
similarity index 73%
rename from src/gallium/drivers/vc5/vc5_cl.h
rename to src/gallium/drivers/v3d/v3d_cl.h
index 7025b5a..0b3058f 100644
--- a/src/gallium/drivers/vc5/vc5_cl.h
+++ b/src/gallium/drivers/v3d/v3d_cl.h
@@ -29,70 +29,70 @@
 #include "util/u_math.h"
 #include "util/macros.h"
 
-struct vc5_bo;
-struct vc5_job;
-struct vc5_cl;
+struct v3d_bo;
+struct v3d_job;
+struct v3d_cl;
 
 /**
  * Undefined structure, used for typechecking that you're passing the pointers
  * to these functions correctly.
  */
-struct vc5_cl_out;
+struct v3d_cl_out;
 
 /** A reference to a BO used in the CL packing functions */
-struct vc5_cl_reloc {
-        struct vc5_bo *bo;
+struct v3d_cl_reloc {
+        struct v3d_bo *bo;
         uint32_t offset;
 };
 
-static inline void cl_pack_emit_reloc(struct vc5_cl *cl, const struct vc5_cl_reloc *);
+static inline void cl_pack_emit_reloc(struct v3d_cl *cl, const struct v3d_cl_reloc *);
 
-#define __gen_user_data struct vc5_cl
-#define __gen_address_type struct vc5_cl_reloc
+#define __gen_user_data struct v3d_cl
+#define __gen_address_type struct v3d_cl_reloc
 #define __gen_address_offset(reloc) (((reloc)->bo ? (reloc)->bo->offset : 0) + \
                                      (reloc)->offset)
 #define __gen_emit_reloc cl_pack_emit_reloc
 
-struct vc5_cl {
+struct v3d_cl {
         void *base;
-        struct vc5_job *job;
-        struct vc5_cl_out *next;
-        struct vc5_bo *bo;
+        struct v3d_job *job;
+        struct v3d_cl_out *next;
+        struct v3d_bo *bo;
         uint32_t size;
 };
 
-void vc5_init_cl(struct vc5_job *job, struct vc5_cl *cl);
-void vc5_destroy_cl(struct vc5_cl *cl);
-void vc5_dump_cl(void *cl, uint32_t size, bool is_render);
-uint32_t vc5_gem_hindex(struct vc5_job *job, struct vc5_bo *bo);
+void v3d_init_cl(struct v3d_job *job, struct v3d_cl *cl);
+void v3d_destroy_cl(struct v3d_cl *cl);
+void v3d_dump_cl(void *cl, uint32_t size, bool is_render);
+uint32_t v3d_gem_hindex(struct v3d_job *job, struct v3d_bo *bo);
 
 struct PACKED unaligned_16 { uint16_t x; };
 struct PACKED unaligned_32 { uint32_t x; };
 
-static inline uint32_t cl_offset(struct vc5_cl *cl)
+static inline uint32_t cl_offset(struct v3d_cl *cl)
 {
         return (char *)cl->next - (char *)cl->base;
 }
 
-static inline struct vc5_cl_reloc cl_get_address(struct vc5_cl *cl)
+static inline struct v3d_cl_reloc cl_get_address(struct v3d_cl *cl)
 {
-        return (struct vc5_cl_reloc){ .bo = cl->bo, .offset = cl_offset(cl) };
+        return (struct v3d_cl_reloc){ .bo = cl->bo, .offset = cl_offset(cl) };
 }
 
 static inline void
-cl_advance(struct vc5_cl_out **cl, uint32_t n)
+cl_advance(struct v3d_cl_out **cl, uint32_t n)
 {
-        (*cl) = (struct vc5_cl_out *)((char *)(*cl) + n);
+        (*cl) = (struct v3d_cl_out *)((char *)(*cl) + n);
 }
 
-static inline struct vc5_cl_out *
-cl_start(struct vc5_cl *cl)
+static inline struct v3d_cl_out *
+cl_start(struct v3d_cl *cl)
 {
         return cl->next;
 }
 
 static inline void
-cl_end(struct vc5_cl *cl, struct vc5_cl_out *next)
+cl_end(struct v3d_cl *cl, struct v3d_cl_out *next)
 {
         cl->next = next;
         assert(cl_offset(cl) <= cl->size);
@@ -100,71 +100,71 @@
 
 
 static inline void
-put_unaligned_32(struct vc5_cl_out *ptr, uint32_t val)
+put_unaligned_32(struct v3d_cl_out *ptr, uint32_t val)
 {
         struct unaligned_32 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-put_unaligned_16(struct vc5_cl_out *ptr, uint16_t val)
+put_unaligned_16(struct v3d_cl_out *ptr, uint16_t val)
 {
         struct unaligned_16 *p = (void *)ptr;
         p->x = val;
 }
 
 static inline void
-cl_u8(struct vc5_cl_out **cl, uint8_t n)
+cl_u8(struct v3d_cl_out **cl, uint8_t n)
 {
         *(uint8_t *)(*cl) = n;
         cl_advance(cl, 1);
 }
 
 static inline void
-cl_u16(struct vc5_cl_out **cl, uint16_t n)
+cl_u16(struct v3d_cl_out **cl, uint16_t n)
 {
         put_unaligned_16(*cl, n);
         cl_advance(cl, 2);
 }
 
 static inline void
-cl_u32(struct vc5_cl_out **cl, uint32_t n)
+cl_u32(struct v3d_cl_out **cl, uint32_t n)
 {
         put_unaligned_32(*cl, n);
         cl_advance(cl, 4);
 }
 
 static inline void
-cl_aligned_u32(struct vc5_cl_out **cl, uint32_t n)
+cl_aligned_u32(struct v3d_cl_out **cl, uint32_t n)
 {
         *(uint32_t *)(*cl) = n;
         cl_advance(cl, 4);
 }
 
 static inline void
-cl_aligned_reloc(struct vc5_cl *cl,
-                 struct vc5_cl_out **cl_out,
-                 struct vc5_bo *bo, uint32_t offset)
+cl_aligned_reloc(struct v3d_cl *cl,
+                 struct v3d_cl_out **cl_out,
+                 struct v3d_bo *bo, uint32_t offset)
 {
         cl_aligned_u32(cl_out, bo->offset + offset);
-        vc5_job_add_bo(cl->job, bo);
+        v3d_job_add_bo(cl->job, bo);
 }
 
 static inline void
-cl_ptr(struct vc5_cl_out **cl, void *ptr)
+cl_ptr(struct v3d_cl_out **cl, void *ptr)
 {
-        *(struct vc5_cl_out **)(*cl) = ptr;
+        *(struct v3d_cl_out **)(*cl) = ptr;
         cl_advance(cl, sizeof(void *));
 }
 
 static inline void
-cl_f(struct vc5_cl_out **cl, float f)
+cl_f(struct v3d_cl_out **cl, float f)
 {
         cl_u32(cl, fui(f));
 }
 
 static inline void
-cl_aligned_f(struct vc5_cl_out **cl, float f)
+cl_aligned_f(struct v3d_cl_out **cl, float f)
 {
         cl_aligned_u32(cl, fui(f));
 }
@@ -172,18 +172,18 @@
 /**
  * Reference to a BO with its associated offset, used in the pack process.
  */
-static inline struct vc5_cl_reloc
-cl_address(struct vc5_bo *bo, uint32_t offset)
+static inline struct v3d_cl_reloc
+cl_address(struct v3d_bo *bo, uint32_t offset)
 {
-        struct vc5_cl_reloc reloc = {
+        struct v3d_cl_reloc reloc = {
                 .bo = bo,
                 .offset = offset,
         };
         return reloc;
 }
 
-uint32_t vc5_cl_ensure_space(struct vc5_cl *cl, uint32_t size, uint32_t align);
-void vc5_cl_ensure_space_with_branch(struct vc5_cl *cl, uint32_t size);
+uint32_t v3d_cl_ensure_space(struct v3d_cl *cl, uint32_t size, uint32_t align);
+void v3d_cl_ensure_space_with_branch(struct v3d_cl *cl, uint32_t size);
 
 #define cl_packet_header(packet) V3DX(packet ## _header)
 #define cl_packet_length(packet) V3DX(packet ## _length)
@@ -191,7 +191,7 @@
 #define cl_packet_struct(packet) V3DX(packet)
 
 static inline void *
-cl_get_emit_space(struct vc5_cl_out **cl, size_t size)
+cl_get_emit_space(struct v3d_cl_out **cl, size_t size)
 {
         void *addr = *cl;
         cl_advance(cl, size);
@@ -219,7 +219,7 @@
         *_loop_terminate = &name;                                \
         __builtin_expect(_loop_terminate != NULL, 1);            \
         ({                                                       \
-                struct vc5_cl_out *cl_out = cl_start(cl);        \
+                struct v3d_cl_out *cl_out = cl_start(cl);        \
                 cl_packet_pack(packet)(cl, (uint8_t *)cl_out, &name); \
                 cl_advance(&cl_out, cl_packet_length(packet));   \
                 cl_end(cl, cl_out);                              \
@@ -233,7 +233,7 @@
         *_loop_terminate = &name;                                \
         __builtin_expect(_loop_terminate != NULL, 1);            \
         ({                                                       \
-                struct vc5_cl_out *cl_out = cl_start(cl);        \
+                struct v3d_cl_out *cl_out = cl_start(cl);        \
                 uint8_t packed[cl_packet_length(packet)];         \
                 cl_packet_pack(packet)(cl, packed, &name);       \
                 for (int _i = 0; _i < cl_packet_length(packet); _i++) \
@@ -243,11 +243,14 @@
                 _loop_terminate = NULL;                          \
         }))                                                      \
 
-#define cl_emit_prepacked(cl, packet) do {                       \
-        memcpy((cl)->next, packet, sizeof(*packet));             \
-        cl_advance(&(cl)->next, sizeof(*packet));                \
+#define cl_emit_prepacked_sized(cl, packet, size) do {                \
+        memcpy((cl)->next, (packet), (size)); /* size is evaluated twice */ \
+        cl_advance(&(cl)->next, (size));              \
 } while (0)
 
+#define cl_emit_prepacked(cl, packet) \
+        cl_emit_prepacked_sized(cl, packet, sizeof(*(packet)))
+
 #define v3dx_pack(packed, packet, name)                          \
         for (struct cl_packet_struct(packet) name = {            \
                 cl_packet_header(packet)                         \
@@ -270,10 +273,10 @@
  * for this exec.
  */
 static inline void
-cl_pack_emit_reloc(struct vc5_cl *cl, const struct vc5_cl_reloc *reloc)
+cl_pack_emit_reloc(struct v3d_cl *cl, const struct v3d_cl_reloc *reloc)
 {
         if (reloc->bo)
-                vc5_job_add_bo(cl->job, reloc->bo);
+                v3d_job_add_bo(cl->job, reloc->bo);
 }
 
 #endif /* VC5_CL_H */
diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
new file mode 100644
index 0000000..2fd2fa0
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_context.c
@@ -0,0 +1,235 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xf86drm.h>
+#include <err.h>
+
+#include "pipe/p_defines.h"
+#include "util/hash_table.h"
+#include "util/ralloc.h"
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_blitter.h"
+#include "util/u_upload_mgr.h"
+#include "indices/u_primconvert.h"
+#include "pipe/p_screen.h"
+
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+
+/**
+ * Submits every job currently tracked in v3d->jobs.
+ */
+void
+v3d_flush(struct pipe_context *pctx)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+
+        struct hash_entry *entry;
+        hash_table_foreach(v3d->jobs, entry) {
+                struct v3d_job *job = entry->data;
+                v3d_job_submit(v3d, job);
+        }
+}
+
+/**
+ * pipe_context::flush hook: submits all tracked jobs and, when the caller
+ * passes a fence pointer, stores a newly created fence in it.
+ *
+ * The flags argument is not consulted.
+ */
+static void
+v3d_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
+               unsigned flags)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+
+        v3d_flush(pctx);
+
+        if (fence) {
+                struct pipe_screen *screen = pctx->screen;
+                struct v3d_fence *f = v3d_fence_create(v3d);
+                /* Let the screen drop whatever *fence referred to before
+                 * it is overwritten with the new fence.
+                 */
+                screen->fence_reference(screen, fence, NULL);
+                *fence = (struct pipe_fence_handle *)f;
+        }
+}
+
+/**
+ * pipe_context::invalidate_resource hook.
+ *
+ * Resets the resource's initialized_buffers mask.  If a job writing to the
+ * resource is tracked in v3d->write_jobs and the resource backs that job's
+ * depth/stencil surface, depth and stencil are also removed from the set of
+ * buffers the job will store.
+ */
+static void
+v3d_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_resource *rsc = v3d_resource(prsc);
+
+        rsc->initialized_buffers = 0;
+
+        struct hash_entry *entry = _mesa_hash_table_search(v3d->write_jobs,
+                                                           prsc);
+        if (!entry)
+                return;
+
+        struct v3d_job *job = entry->data;
+        if (job->key.zsbuf && job->key.zsbuf->texture == prsc)
+                job->store &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
+}
+
+/**
+ * pipe_context::destroy hook: submits outstanding jobs, tears down the
+ * helper objects that were created, drops the framebuffer surface
+ * references taken below, and frees the context.
+ *
+ * Also used as the error path of v3d_context_create(), so the blitter,
+ * primconvert and uploader may still be NULL when this runs.
+ */
+static void
+v3d_context_destroy(struct pipe_context *pctx)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+
+        v3d_flush(pctx);
+
+        if (v3d->blitter)
+                util_blitter_destroy(v3d->blitter);
+
+        if (v3d->primconvert)
+                util_primconvert_destroy(v3d->primconvert);
+
+        if (v3d->uploader)
+                u_upload_destroy(v3d->uploader);
+
+        slab_destroy_child(&v3d->transfer_pool);
+
+        /* NOTE(review): only color buffer 0 and the Z/S buffer are released
+         * here -- confirm no other cbufs[] slot can still hold a reference.
+         */
+        pipe_surface_reference(&v3d->framebuffer.cbufs[0], NULL);
+        pipe_surface_reference(&v3d->framebuffer.zsbuf, NULL);
+
+        v3d_program_fini(pctx);
+
+        /* NOTE(review): v3d->out_sync from drmSyncobjCreate() is not
+         * destroyed here -- confirm it is released elsewhere.
+         */
+        ralloc_free(v3d);
+}
+
+/**
+ * Creates a V3D context for the given screen.
+ *
+ * Returns the context's pipe_context, or NULL when the allocation, the sync
+ * object creation, or the blitter/primconvert creation fails.
+ *
+ * V3D_DEBUG_SHADERDB is masked out of V3D_DEBUG while the context is being
+ * set up and is put back on every exit path, including the failure ones.
+ * The flags argument is not consulted.
+ */
+struct pipe_context *
+v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
+{
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        struct v3d_context *v3d;
+
+        /* Prevent dumping of the shaders built during context setup. */
+        uint32_t saved_shaderdb_flag = V3D_DEBUG & V3D_DEBUG_SHADERDB;
+        V3D_DEBUG &= ~V3D_DEBUG_SHADERDB;
+
+        v3d = rzalloc(NULL, struct v3d_context);
+        if (!v3d) {
+                V3D_DEBUG |= saved_shaderdb_flag;
+                return NULL;
+        }
+        struct pipe_context *pctx = &v3d->base;
+
+        v3d->screen = screen;
+
+        int ret = drmSyncobjCreate(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                                   &v3d->out_sync);
+        if (ret) {
+                ralloc_free(v3d);
+                V3D_DEBUG |= saved_shaderdb_flag;
+                return NULL;
+        }
+
+        pctx->screen = pscreen;
+        pctx->priv = priv;
+        pctx->destroy = v3d_context_destroy;
+        pctx->flush = v3d_pipe_flush;
+        pctx->invalidate_resource = v3d_invalidate_resource;
+
+        /* Pick the 4.1 or 3.3 draw/state init by device version. */
+        if (screen->devinfo.ver >= 41) {
+                v3d41_draw_init(pctx);
+                v3d41_state_init(pctx);
+        } else {
+                v3d33_draw_init(pctx);
+                v3d33_state_init(pctx);
+        }
+        v3d_program_init(pctx);
+        v3d_query_init(pctx);
+        v3d_resource_context_init(pctx);
+
+        v3d_job_init(v3d);
+
+        v3d->fd = screen->fd;
+
+        slab_create_child(&v3d->transfer_pool, &screen->transfer_pool);
+
+        /* One uploader serves both stream and constant uploads. */
+        v3d->uploader = u_upload_create_default(&v3d->base);
+        v3d->base.stream_uploader = v3d->uploader;
+        v3d->base.const_uploader = v3d->uploader;
+
+        v3d->blitter = util_blitter_create(pctx);
+        if (!v3d->blitter)
+                goto fail;
+        v3d->blitter->use_index_buffer = true;
+
+        v3d->primconvert = util_primconvert_create(pctx,
+                                                   (1 << PIPE_PRIM_QUADS) - 1);
+        if (!v3d->primconvert)
+                goto fail;
+
+        V3D_DEBUG |= saved_shaderdb_flag;
+
+        v3d->sample_mask = (1 << VC5_MAX_SAMPLES) - 1;
+        v3d->active_queries = true;
+
+        return &v3d->base;
+
+fail:
+        pctx->destroy(pctx);
+        /* Don't leave shader-db dumping disabled after a failed create. */
+        V3D_DEBUG |= saved_shaderdb_flag;
+        return NULL;
+}
diff --git a/src/gallium/drivers/vc5/vc5_context.h b/src/gallium/drivers/v3d/v3d_context.h
similarity index 67%
rename from src/gallium/drivers/vc5/vc5_context.h
rename to src/gallium/drivers/v3d/v3d_context.h
index e08a2a5..f6720e2 100644
--- a/src/gallium/drivers/vc5/vc5_context.h
+++ b/src/gallium/drivers/v3d/v3d_context.h
@@ -36,23 +36,21 @@
 #include "util/bitset.h"
 #include "util/slab.h"
 #include "xf86drm.h"
-#include "vc5_drm.h"
-#include "vc5_screen.h"
+#include "v3d_drm.h"
+#include "v3d_screen.h"
 
-struct vc5_job;
-struct vc5_bo;
-void vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo);
+struct v3d_job;
+struct v3d_bo;
+void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
 
-#define __user
-#include "vc5_drm.h"
-#include "vc5_bufmgr.h"
-#include "vc5_resource.h"
-#include "vc5_cl.h"
+#include "v3d_bufmgr.h"
+#include "v3d_resource.h"
+#include "v3d_cl.h"
 
-#ifdef USE_VC5_SIMULATOR
-#define using_vc5_simulator true
+#ifdef USE_V3D_SIMULATOR
+#define using_v3d_simulator true
 #else
-#define using_vc5_simulator false
+#define using_v3d_simulator false
 #endif
 
 #define VC5_DIRTY_BLEND         (1 <<  0)
@@ -63,7 +61,7 @@
 
 #define VC5_DIRTY_BLEND_COLOR   (1 <<  7)
 #define VC5_DIRTY_STENCIL_REF   (1 <<  8)
-#define VC5_DIRTY_SAMPLE_MASK   (1 <<  9)
+#define VC5_DIRTY_SAMPLE_STATE  (1 <<  9)
 #define VC5_DIRTY_FRAMEBUFFER   (1 << 10)
 #define VC5_DIRTY_STIPPLE       (1 << 11)
 #define VC5_DIRTY_VIEWPORT      (1 << 12)
@@ -82,10 +80,12 @@
 #define VC5_DIRTY_FS_INPUTS     (1 << 26)
 #define VC5_DIRTY_STREAMOUT     (1 << 27)
 #define VC5_DIRTY_OQ            (1 << 28)
+#define VC5_DIRTY_CENTROID_FLAGS (1 << 29)
+#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30)
 
 #define VC5_MAX_FS_INPUTS 64
 
-struct vc5_sampler_view {
+struct v3d_sampler_view {
         struct pipe_sampler_view base;
         uint32_t p0;
         uint32_t p1;
@@ -94,10 +94,10 @@
 
         uint8_t texture_shader_state[32];
         /* V3D 4.x: Texture state struct. */
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
 };
 
-struct vc5_sampler_state {
+struct v3d_sampler_state {
         struct pipe_sampler_state base;
         uint32_t p0;
         uint32_t p1;
@@ -105,24 +105,24 @@
         /* V3D 3.x: Packed texture state. */
         uint8_t texture_shader_state[32];
         /* V3D 4.x: Sampler state struct. */
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
 };
 
-struct vc5_texture_stateobj {
+struct v3d_texture_stateobj {
         struct pipe_sampler_view *textures[PIPE_MAX_SAMPLERS];
         unsigned num_textures;
         struct pipe_sampler_state *samplers[PIPE_MAX_SAMPLERS];
         unsigned num_samplers;
-        struct vc5_cl_reloc texture_state[PIPE_MAX_SAMPLERS];
+        struct v3d_cl_reloc texture_state[PIPE_MAX_SAMPLERS];
 };
 
-struct vc5_shader_uniform_info {
+struct v3d_shader_uniform_info {
         enum quniform_contents *contents;
         uint32_t *data;
         uint32_t count;
 };
 
-struct vc5_uncompiled_shader {
+struct v3d_uncompiled_shader {
         /** A name for this program, so you can track it in shader-db output. */
         uint32_t program_id;
         /** How many variants of this program were compiled, for shader-db. */
@@ -142,8 +142,8 @@
         bool was_tgsi;
 };
 
-struct vc5_compiled_shader {
-        struct vc5_bo *bo;
+struct v3d_compiled_shader {
+        struct v3d_bo *bo;
 
         union {
                 struct v3d_prog_data *base;
@@ -152,54 +152,56 @@
         } prog_data;
 
         /**
-         * VC5_DIRTY_* flags that, when set in vc5->dirty, mean that the
+         * VC5_DIRTY_* flags that, when set in v3d->dirty, mean that the
          * uniforms have to be rewritten (and therefore the shader state
          * reemitted).
          */
         uint32_t uniform_dirty_bits;
 };
 
-struct vc5_program_stateobj {
-        struct vc5_uncompiled_shader *bind_vs, *bind_fs;
-        struct vc5_compiled_shader *cs, *vs, *fs;
+struct v3d_program_stateobj {
+        struct v3d_uncompiled_shader *bind_vs, *bind_fs;
+        struct v3d_compiled_shader *cs, *vs, *fs;
 
-        struct vc5_bo *spill_bo;
+        struct v3d_bo *spill_bo;
         int spill_size_per_thread;
 };
 
-struct vc5_constbuf_stateobj {
+struct v3d_constbuf_stateobj {
         struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
         uint32_t enabled_mask;
         uint32_t dirty_mask;
 };
 
-struct vc5_vertexbuf_stateobj {
+struct v3d_vertexbuf_stateobj {
         struct pipe_vertex_buffer vb[PIPE_MAX_ATTRIBS];
         unsigned count;
         uint32_t enabled_mask;
         uint32_t dirty_mask;
 };
 
-struct vc5_vertex_stateobj {
+struct v3d_vertex_stateobj {
         struct pipe_vertex_element pipe[VC5_MAX_ATTRIBUTES];
         unsigned num_elements;
 
-        uint8_t attrs[12 * VC5_MAX_ATTRIBUTES];
-        struct vc5_bo *default_attribute_values;
+        uint8_t attrs[16 * VC5_MAX_ATTRIBUTES];
+        struct v3d_bo *default_attribute_values;
 };
 
-struct vc5_streamout_stateobj {
+struct v3d_streamout_stateobj {
         struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
+        /* Number of vertices we've written into the buffer so far. */
+        uint32_t offsets[PIPE_MAX_SO_BUFFERS];
         unsigned num_targets;
 };
 
-/* Hash table key for vc5->jobs */
-struct vc5_job_key {
+/* Hash table key for v3d->jobs */
+struct v3d_job_key {
         struct pipe_surface *cbufs[4];
         struct pipe_surface *zsbuf;
 };
 
-enum vc5_ez_state {
+enum v3d_ez_state {
         VC5_EZ_UNDECIDED = 0,
         VC5_EZ_GT_GE,
         VC5_EZ_LT_LE,
@@ -215,16 +217,16 @@
  * target (which would mean reading back from the old render target when
  * starting to render to it again).
  */
-struct vc5_job {
-        struct vc5_context *vc5;
-        struct vc5_cl bcl;
-        struct vc5_cl rcl;
-        struct vc5_cl indirect;
-        struct vc5_bo *tile_alloc;
-        struct vc5_bo *tile_state;
+struct v3d_job {
+        struct v3d_context *v3d;
+        struct v3d_cl bcl;
+        struct v3d_cl rcl;
+        struct v3d_cl indirect;
+        struct v3d_bo *tile_alloc;
+        struct v3d_bo *tile_state;
         uint32_t shader_rec_count;
 
-        struct drm_vc5_submit_cl submit;
+        struct drm_v3d_submit_cl submit;
 
         /**
          * Set of all BOs referenced by the job.  This will be used for making
@@ -278,11 +280,15 @@
         /* Bitmask of PIPE_CLEAR_* of buffers that were cleared before the
          * first rendering.
          */
-        uint32_t cleared;
-        /* Bitmask of PIPE_CLEAR_* of buffers that have been rendered to
-         * (either clears or draws).
+        uint32_t clear;
+        /* Bitmask of PIPE_CLEAR_* of buffers that have been read by a draw
+         * call without having been cleared first.
          */
-        uint32_t resolve;
+        uint32_t load;
+        /* Bitmask of PIPE_CLEAR_* of buffers that have been rendered to
+         * (either clears or draws) and should be stored.
+         */
+        uint32_t store;
         uint32_t clear_color[4][4];
         float clear_z;
         uint8_t clear_s;
@@ -311,12 +317,12 @@
          * Current EZ state for drawing. Updated at the start of draw after
          * we've decided on the shader being rendered.
          */
-        enum vc5_ez_state ez_state;
+        enum v3d_ez_state ez_state;
         /**
          * The first EZ state that was used for drawing with a decided EZ
          * direction (so either UNDECIDED, GT, or LT).
          */
-        enum vc5_ez_state first_ez_state;
+        enum v3d_ez_state first_ez_state;
 
         /**
          * Number of draw calls (not counting full buffer clears) queued in
@@ -324,24 +330,24 @@
          */
         uint32_t draw_calls_queued;
 
-        struct vc5_job_key key;
+        struct v3d_job_key key;
 };
 
-struct vc5_context {
+struct v3d_context {
         struct pipe_context base;
 
         int fd;
-        struct vc5_screen *screen;
+        struct v3d_screen *screen;
 
         /** The 3D rendering job for the currently bound FBO. */
-        struct vc5_job *job;
+        struct v3d_job *job;
 
-        /* Map from struct vc5_job_key to the job for that FBO.
+        /* Map from struct v3d_job_key to the job for that FBO.
          */
         struct hash_table *jobs;
 
         /**
-         * Map from vc5_resource to a job writing to that resource.
+         * Map from v3d_resource to a job writing to that resource.
          *
          * Primarily for flushing jobs rendering to textures that are now
          * being read from.
@@ -360,7 +366,7 @@
         uint32_t next_uncompiled_program_id;
         uint64_t next_compiled_program_id;
 
-        struct vc5_compiler_state *compiler_state;
+        struct v3d_compiler_state *compiler_state;
 
         uint8_t prim_mode;
 
@@ -374,15 +380,15 @@
 
         /** @{ Current pipeline state objects */
         struct pipe_scissor_state scissor;
-        struct pipe_blend_state *blend;
-        struct vc5_rasterizer_state *rasterizer;
-        struct vc5_depth_stencil_alpha_state *zsa;
+        struct v3d_blend_state *blend;
+        struct v3d_rasterizer_state *rasterizer;
+        struct v3d_depth_stencil_alpha_state *zsa;
 
-        struct vc5_texture_stateobj verttex, fragtex;
+        struct v3d_texture_stateobj verttex, fragtex;
 
-        struct vc5_program_stateobj prog;
+        struct v3d_program_stateobj prog;
 
-        struct vc5_vertex_stateobj *vtx;
+        struct v3d_vertex_stateobj *vtx;
 
         struct {
                 struct pipe_blend_color f;
@@ -415,141 +421,129 @@
         struct pipe_poly_stipple stipple;
         struct pipe_clip_state clip;
         struct pipe_viewport_state viewport;
-        struct vc5_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
-        struct vc5_vertexbuf_stateobj vertexbuf;
-        struct vc5_streamout_stateobj streamout;
-        struct vc5_bo *current_oq;
+        struct v3d_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
+        struct v3d_vertexbuf_stateobj vertexbuf;
+        struct v3d_streamout_stateobj streamout;
+        struct v3d_bo *current_oq;
         /** @} */
 };
 
-struct vc5_rasterizer_state {
+struct v3d_rasterizer_state {
         struct pipe_rasterizer_state base;
 
-        /* VC5_CONFIGURATION_BITS */
-        uint8_t config_bits[3];
-
         float point_size;
 
-        /**
-         * Half-float (1/8/7 bits) value of polygon offset units for
-         * VC5_PACKET_DEPTH_OFFSET
-         */
-        uint16_t offset_units;
-        /**
-         * Half-float (1/8/7 bits) value of polygon offset scale for
-         * VC5_PACKET_DEPTH_OFFSET
-         */
-        uint16_t offset_factor;
+        uint8_t depth_offset[9];
+        uint8_t depth_offset_z16[9];
 };
 
-struct vc5_depth_stencil_alpha_state {
+struct v3d_depth_stencil_alpha_state {
         struct pipe_depth_stencil_alpha_state base;
 
-        enum vc5_ez_state ez_state;
-
-        /** Uniforms for stencil state.
-         *
-         * Index 0 is either the front config, or the front-and-back config.
-         * Index 1 is the back config if doing separate back stencil.
-         * Index 2 is the writemask config if it's not a common mask value.
-         */
-        uint32_t stencil_uniforms[3];
+        enum v3d_ez_state ez_state;
 
         uint8_t stencil_front[6];
         uint8_t stencil_back[6];
 };
 
+struct v3d_blend_state {
+        struct pipe_blend_state base;
+
+        /* Per-RT mask of whether blending is enabled. */
+        uint8_t blend_enables;
+};
+
 #define perf_debug(...) do {                            \
         if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))       \
                 fprintf(stderr, __VA_ARGS__);           \
 } while (0)
 
-static inline struct vc5_context *
-vc5_context(struct pipe_context *pcontext)
+static inline struct v3d_context *
+v3d_context(struct pipe_context *pcontext)
 {
-        return (struct vc5_context *)pcontext;
+        return (struct v3d_context *)pcontext;
 }
 
-static inline struct vc5_sampler_view *
-vc5_sampler_view(struct pipe_sampler_view *psview)
+static inline struct v3d_sampler_view *
+v3d_sampler_view(struct pipe_sampler_view *psview)
 {
-        return (struct vc5_sampler_view *)psview;
+        return (struct v3d_sampler_view *)psview;
 }
 
-static inline struct vc5_sampler_state *
-vc5_sampler_state(struct pipe_sampler_state *psampler)
+static inline struct v3d_sampler_state *
+v3d_sampler_state(struct pipe_sampler_state *psampler)
 {
-        return (struct vc5_sampler_state *)psampler;
+        return (struct v3d_sampler_state *)psampler;
 }
 
-struct pipe_context *vc5_context_create(struct pipe_screen *pscreen,
+struct pipe_context *v3d_context_create(struct pipe_screen *pscreen,
                                         void *priv, unsigned flags);
-void vc5_program_init(struct pipe_context *pctx);
-void vc5_program_fini(struct pipe_context *pctx);
-void vc5_query_init(struct pipe_context *pctx);
+void v3d_program_init(struct pipe_context *pctx);
+void v3d_program_fini(struct pipe_context *pctx);
+void v3d_query_init(struct pipe_context *pctx);
 
-void vc5_simulator_init(struct vc5_screen *screen);
-void vc5_simulator_destroy(struct vc5_screen *screen);
-int vc5_simulator_flush(struct vc5_context *vc5,
-                        struct drm_vc5_submit_cl *args,
-                        struct vc5_job *job);
-int vc5_simulator_ioctl(int fd, unsigned long request, void *arg);
-void vc5_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+void v3d_simulator_init(struct v3d_screen *screen);
+void v3d_simulator_destroy(struct v3d_screen *screen);
+int v3d_simulator_flush(struct v3d_context *v3d,
+                        struct drm_v3d_submit_cl *args,
+                        struct v3d_job *job);
+int v3d_simulator_ioctl(int fd, unsigned long request, void *arg);
+void v3d_simulator_open_from_handle(int fd, uint32_t winsys_stride,
                                     int handle, uint32_t size);
 
 static inline int
-vc5_ioctl(int fd, unsigned long request, void *arg)
+v3d_ioctl(int fd, unsigned long request, void *arg)
 {
-        if (using_vc5_simulator)
-                return vc5_simulator_ioctl(fd, request, arg);
+        if (using_v3d_simulator)
+                return v3d_simulator_ioctl(fd, request, arg);
         else
                 return drmIoctl(fd, request, arg);
 }
 
-void vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader);
-struct vc5_cl_reloc vc5_write_uniforms(struct vc5_context *vc5,
-                                       struct vc5_compiled_shader *shader,
-                                       struct vc5_constbuf_stateobj *cb,
-                                       struct vc5_texture_stateobj *texstate);
+void v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader);
+struct v3d_cl_reloc v3d_write_uniforms(struct v3d_context *v3d,
+                                       struct v3d_compiled_shader *shader,
+                                       struct v3d_constbuf_stateobj *cb,
+                                       struct v3d_texture_stateobj *texstate);
 
-void vc5_flush(struct pipe_context *pctx);
-void vc5_job_init(struct vc5_context *vc5);
-struct vc5_job *vc5_get_job(struct vc5_context *vc5,
+void v3d_flush(struct pipe_context *pctx);
+void v3d_job_init(struct v3d_context *v3d);
+struct v3d_job *v3d_get_job(struct v3d_context *v3d,
                             struct pipe_surface **cbufs,
                             struct pipe_surface *zsbuf);
-struct vc5_job *vc5_get_job_for_fbo(struct vc5_context *vc5);
-void vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo);
-void vc5_job_add_write_resource(struct vc5_job *job, struct pipe_resource *prsc);
-void vc5_job_submit(struct vc5_context *vc5, struct vc5_job *job);
-void vc5_flush_jobs_writing_resource(struct vc5_context *vc5,
+struct v3d_job *v3d_get_job_for_fbo(struct v3d_context *v3d);
+void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
+void v3d_job_add_write_resource(struct v3d_job *job, struct pipe_resource *prsc);
+void v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job);
+void v3d_flush_jobs_writing_resource(struct v3d_context *v3d,
                                      struct pipe_resource *prsc);
-void vc5_flush_jobs_reading_resource(struct vc5_context *vc5,
+void v3d_flush_jobs_reading_resource(struct v3d_context *v3d,
                                      struct pipe_resource *prsc);
-void vc5_update_compiled_shaders(struct vc5_context *vc5, uint8_t prim_mode);
+void v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode);
 
-bool vc5_rt_format_supported(const struct v3d_device_info *devinfo,
+bool v3d_rt_format_supported(const struct v3d_device_info *devinfo,
                              enum pipe_format f);
-bool vc5_tex_format_supported(const struct v3d_device_info *devinfo,
+bool v3d_tex_format_supported(const struct v3d_device_info *devinfo,
                               enum pipe_format f);
-uint8_t vc5_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f);
-uint8_t vc5_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f);
-uint8_t vc5_get_tex_return_size(const struct v3d_device_info *devinfo,
+uint8_t v3d_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f);
+uint8_t v3d_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f);
+uint8_t v3d_get_tex_return_size(const struct v3d_device_info *devinfo,
                                 enum pipe_format f,
                                 enum pipe_tex_compare compare);
-uint8_t vc5_get_tex_return_channels(const struct v3d_device_info *devinfo,
+uint8_t v3d_get_tex_return_channels(const struct v3d_device_info *devinfo,
                                     enum pipe_format f);
-const uint8_t *vc5_get_format_swizzle(const struct v3d_device_info *devinfo,
+const uint8_t *v3d_get_format_swizzle(const struct v3d_device_info *devinfo,
                                       enum pipe_format f);
-void vc5_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
+void v3d_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
                                                  uint32_t format,
                                                  uint32_t *type,
                                                  uint32_t *bpp);
 
-void vc5_init_query_functions(struct vc5_context *vc5);
-void vc5_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info);
-void vc5_blitter_save(struct vc5_context *vc5);
+void v3d_init_query_functions(struct v3d_context *v3d);
+void v3d_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info);
+void v3d_blitter_save(struct v3d_context *v3d);
 
-struct vc5_fence *vc5_fence_create(struct vc5_context *vc5);
+struct v3d_fence *v3d_fence_create(struct v3d_context *v3d);
 
 #ifdef v3dX
 #  include "v3dx_context.h"
diff --git a/src/gallium/drivers/v3d/v3d_fence.c b/src/gallium/drivers/v3d/v3d_fence.c
new file mode 100644
index 0000000..0edcb66
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3d_fence.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright © 2014 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file v3d_fence.c
+ *
+ * Seqno-based fence management.
+ *
+ * We have two mechanisms for waiting in our kernel API: You can wait on a BO
+ * to have all rendering to it from any process completed, or wait on a
+ * seqno for that particular seqno to be passed.  The fence API we're
+ * implementing is based on waiting for all rendering in the context to have
+ * completed (with no reference to what other processes might be doing with
+ * the same BOs), so we can just use the seqno of the last rendering we'd
+ * fired off as our fence marker.
+ */
+
+#include "util/u_inlines.h"
+#include "util/os_time.h"
+
+#include "v3d_context.h"
+#include "v3d_bufmgr.h"
+
+struct v3d_fence {
+        struct pipe_reference reference; /* refcount used by fence_reference */
+        int fd; /* sync file fd; closed when the fence is freed */
+};
+
+static void
+v3d_fence_reference(struct pipe_screen *pscreen,
+                    struct pipe_fence_handle **pp,
+                    struct pipe_fence_handle *pf)
+{
+        /* Point *pp at pf, releasing whatever fence was held there before. */
+        struct v3d_fence **slot = (struct v3d_fence **)pp;
+        struct v3d_fence *incoming = (struct v3d_fence *)pf;
+        struct v3d_fence *prev = *slot;
+        if (pipe_reference(&prev->reference, &incoming->reference)) {
+                close(prev->fd);
+                free(prev);
+        }
+        *slot = incoming;
+}
+
+static boolean
+v3d_fence_finish(struct pipe_screen *pscreen,
+                 struct pipe_context *ctx,
+                 struct pipe_fence_handle *pf,
+                 uint64_t timeout_ns)
+{
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        struct v3d_fence *f = (struct v3d_fence *)pf;
+        unsigned syncobj;
+
+        /* Waits go through a temporary syncobj wrapping the fence's fd. */
+        int ret = drmSyncobjCreate(screen->fd, 0, &syncobj);
+        if (ret) {
+                fprintf(stderr, "Failed to create syncobj to wait on: %d\n",
+                        ret);
+                return false;
+        }
+
+        ret = drmSyncobjImportSyncFile(screen->fd, syncobj, f->fd);
+        if (ret) {
+                fprintf(stderr, "Failed to import fence to syncobj: %d\n", ret);
+                drmSyncobjDestroy(screen->fd, syncobj); /* don't leak it */
+                return false;
+        }
+
+        uint64_t abs_timeout = os_time_get_absolute_timeout(timeout_ns);
+        if (abs_timeout == OS_TIMEOUT_INFINITE)
+                abs_timeout = INT64_MAX;
+
+        ret = drmSyncobjWait(screen->fd, &syncobj, 1, abs_timeout, 0, NULL);
+        drmSyncobjDestroy(screen->fd, syncobj);
+
+        return ret >= 0; /* false when the wait reported an error */
+}
+
+struct v3d_fence *
+v3d_fence_create(struct v3d_context *v3d)
+{
+        struct v3d_fence *f = calloc(1, sizeof(*f));
+        if (!f)
+                return NULL;
+
+        /* Snapshot the last V3D rendering's out fence.  We'd rather have
+         * another syncobj instead of a sync file, but this is all we get.
+         * (HandleToFD/FDToHandle just gives you another syncobj ID for the
+         * same syncobj).  f->fd is 0 from calloc(), so the return code is
+         * checked as well in case a failed export leaves it untouched.
+         */
+        if (drmSyncobjExportSyncFile(v3d->fd, v3d->out_sync, &f->fd) ||
+            f->fd == -1) {
+                fprintf(stderr, "export failed\n");
+                free(f);
+                return NULL;
+        }
+
+        pipe_reference_init(&f->reference, 1);
+        return f;
+}
+/* Registers this file's fence_reference/fence_finish hooks on the screen. */
+void
+v3d_fence_init(struct v3d_screen *screen)
+{
+        screen->base.fence_reference = v3d_fence_reference;
+        screen->base.fence_finish = v3d_fence_finish;
+}
diff --git a/src/gallium/drivers/vc5/vc5_format_table.h b/src/gallium/drivers/v3d/v3d_format_table.h
similarity index 98%
rename from src/gallium/drivers/vc5/vc5_format_table.h
rename to src/gallium/drivers/v3d/v3d_format_table.h
index 8b80113..b291708 100644
--- a/src/gallium/drivers/vc5/vc5_format_table.h
+++ b/src/gallium/drivers/v3d/v3d_format_table.h
@@ -26,7 +26,7 @@
 #include <stdbool.h>
 #include <stdint.h>
 
-struct vc5_format {
+struct v3d_format {
         /** Set if the pipe format is defined in the table. */
         bool present;
 
diff --git a/src/gallium/drivers/vc5/vc5_formats.c b/src/gallium/drivers/v3d/v3d_formats.c
similarity index 75%
rename from src/gallium/drivers/vc5/vc5_formats.c
rename to src/gallium/drivers/v3d/v3d_formats.c
index b65b7cd..bdb8693 100644
--- a/src/gallium/drivers/vc5/vc5_formats.c
+++ b/src/gallium/drivers/v3d/v3d_formats.c
@@ -22,7 +22,7 @@
  */
 
 /**
- * @file vc5_formats.c
+ * @file v3d_formats.c
  *
  * Contains the table and accessors for VC5 texture and render target format
  * support.
@@ -34,10 +34,10 @@
 
 #include "util/macros.h"
 
-#include "vc5_context.h"
-#include "vc5_format_table.h"
+#include "v3d_context.h"
+#include "v3d_format_table.h"
 
-static const struct vc5_format *
+static const struct v3d_format *
 get_format(const struct v3d_device_info *devinfo, enum pipe_format f)
 {
         if (devinfo->ver >= 41)
@@ -47,10 +47,10 @@
 }
 
 bool
-vc5_rt_format_supported(const struct v3d_device_info *devinfo,
+v3d_rt_format_supported(const struct v3d_device_info *devinfo,
                         enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         if (!vf)
                 return false;
@@ -59,9 +59,9 @@
 }
 
 uint8_t
-vc5_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f)
+v3d_get_rt_format(const struct v3d_device_info *devinfo, enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         if (!vf)
                 return 0;
@@ -70,18 +70,18 @@
 }
 
 bool
-vc5_tex_format_supported(const struct v3d_device_info *devinfo,
+v3d_tex_format_supported(const struct v3d_device_info *devinfo,
                          enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         return vf != NULL;
 }
 
 uint8_t
-vc5_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f)
+v3d_get_tex_format(const struct v3d_device_info *devinfo, enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         if (!vf)
                 return 0;
@@ -90,10 +90,10 @@
 }
 
 uint8_t
-vc5_get_tex_return_size(const struct v3d_device_info *devinfo,
+v3d_get_tex_return_size(const struct v3d_device_info *devinfo,
                         enum pipe_format f, enum pipe_tex_compare compare)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         if (!vf)
                 return 0;
@@ -105,10 +105,10 @@
 }
 
 uint8_t
-vc5_get_tex_return_channels(const struct v3d_device_info *devinfo,
+v3d_get_tex_return_channels(const struct v3d_device_info *devinfo,
                             enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
 
         if (!vf)
                 return 0;
@@ -117,9 +117,9 @@
 }
 
 const uint8_t *
-vc5_get_format_swizzle(const struct v3d_device_info *devinfo, enum pipe_format f)
+v3d_get_format_swizzle(const struct v3d_device_info *devinfo, enum pipe_format f)
 {
-        const struct vc5_format *vf = get_format(devinfo, f);
+        const struct v3d_format *vf = get_format(devinfo, f);
         static const uint8_t fallback[] = {0, 1, 2, 3};
 
         if (!vf)
@@ -129,7 +129,7 @@
 }
 
 void
-vc5_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
+v3d_get_internal_type_bpp_for_output_format(const struct v3d_device_info *devinfo,
                                             uint32_t format,
                                             uint32_t *type,
                                             uint32_t *bpp)
diff --git a/src/gallium/drivers/vc5/vc5_job.c b/src/gallium/drivers/v3d/v3d_job.c
similarity index 61%
rename from src/gallium/drivers/vc5/vc5_job.c
rename to src/gallium/drivers/v3d/v3d_job.c
index 9947fb4..3645305 100644
--- a/src/gallium/drivers/vc5/vc5_job.c
+++ b/src/gallium/drivers/v3d/v3d_job.c
@@ -21,13 +21,13 @@
  * IN THE SOFTWARE.
  */
 
-/** @file vc5_job.c
+/** @file v3d_job.c
  *
  * Functions for submitting VC5 render jobs to the kernel.
  */
 
 #include <xf86drm.h>
-#include "vc5_context.h"
+#include "v3d_context.h"
 /* The OQ/semaphore packets are the same across V3D versions. */
 #define V3D_VERSION 33
 #include "broadcom/cle/v3dx_pack.h"
@@ -45,16 +45,16 @@
 }
 
 static void
-vc5_job_free(struct vc5_context *vc5, struct vc5_job *job)
+v3d_job_free(struct v3d_context *v3d, struct v3d_job *job)
 {
         struct set_entry *entry;
 
         set_foreach(job->bos, entry) {
-                struct vc5_bo *bo = (struct vc5_bo *)entry->key;
-                vc5_bo_unreference(&bo);
+                struct v3d_bo *bo = (struct v3d_bo *)entry->key;
+                v3d_bo_unreference(&bo);
         }
 
-        remove_from_ht(vc5->jobs, &job->key);
+        remove_from_ht(v3d->jobs, &job->key);
 
         if (job->write_prscs) {
                 struct set_entry *entry;
@@ -62,43 +62,48 @@
                 set_foreach(job->write_prscs, entry) {
                         const struct pipe_resource *prsc = entry->key;
 
-                        remove_from_ht(vc5->write_jobs, (void *)prsc);
+                        remove_from_ht(v3d->write_jobs, (void *)prsc);
                 }
         }
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 if (job->cbufs[i]) {
-                        remove_from_ht(vc5->write_jobs, job->cbufs[i]->texture);
+                        remove_from_ht(v3d->write_jobs, job->cbufs[i]->texture);
                         pipe_surface_reference(&job->cbufs[i], NULL);
                 }
         }
         if (job->zsbuf) {
-                remove_from_ht(vc5->write_jobs, job->zsbuf->texture);
+                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
+                if (rsc->separate_stencil)
+                        remove_from_ht(v3d->write_jobs,
+                                       &rsc->separate_stencil->base);
+
+                remove_from_ht(v3d->write_jobs, job->zsbuf->texture);
                 pipe_surface_reference(&job->zsbuf, NULL);
         }
 
-        if (vc5->job == job)
-                vc5->job = NULL;
+        if (v3d->job == job)
+                v3d->job = NULL;
 
-        vc5_destroy_cl(&job->bcl);
-        vc5_destroy_cl(&job->rcl);
-        vc5_destroy_cl(&job->indirect);
-        vc5_bo_unreference(&job->tile_alloc);
-        vc5_bo_unreference(&job->tile_state);
+        v3d_destroy_cl(&job->bcl);
+        v3d_destroy_cl(&job->rcl);
+        v3d_destroy_cl(&job->indirect);
+        v3d_bo_unreference(&job->tile_alloc);
+        v3d_bo_unreference(&job->tile_state);
 
         ralloc_free(job);
 }
 
-static struct vc5_job *
-vc5_job_create(struct vc5_context *vc5)
+static struct v3d_job *
+v3d_job_create(struct v3d_context *v3d)
 {
-        struct vc5_job *job = rzalloc(vc5, struct vc5_job);
+        struct v3d_job *job = rzalloc(v3d, struct v3d_job);
 
-        job->vc5 = vc5;
+        job->v3d = v3d;
 
-        vc5_init_cl(job, &job->bcl);
-        vc5_init_cl(job, &job->rcl);
-        vc5_init_cl(job, &job->indirect);
+        v3d_init_cl(job, &job->bcl);
+        v3d_init_cl(job, &job->rcl);
+        v3d_init_cl(job, &job->indirect);
 
         job->draw_min_x = ~0;
         job->draw_min_y = ~0;
@@ -112,7 +117,7 @@
 }
 
 void
-vc5_job_add_bo(struct vc5_job *job, struct vc5_bo *bo)
+v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo)
 {
         if (!bo)
                 return;
@@ -120,7 +125,7 @@
         if (_mesa_set_search(job->bos, bo))
                 return;
 
-        vc5_bo_reference(bo);
+        v3d_bo_reference(bo);
         _mesa_set_add(job->bos, bo);
         job->referenced_size += bo->size;
 
@@ -136,9 +141,9 @@
 }
 
 void
-vc5_job_add_write_resource(struct vc5_job *job, struct pipe_resource *prsc)
+v3d_job_add_write_resource(struct v3d_job *job, struct pipe_resource *prsc)
 {
-        struct vc5_context *vc5 = job->vc5;
+        struct v3d_context *v3d = job->v3d;
 
         if (!job->write_prscs) {
                 job->write_prscs = _mesa_set_create(job,
@@ -147,36 +152,36 @@
         }
 
         _mesa_set_add(job->write_prscs, prsc);
-        _mesa_hash_table_insert(vc5->write_jobs, prsc, job);
+        _mesa_hash_table_insert(v3d->write_jobs, prsc, job);
 }
 
 void
-vc5_flush_jobs_writing_resource(struct vc5_context *vc5,
+v3d_flush_jobs_writing_resource(struct v3d_context *v3d,
                                 struct pipe_resource *prsc)
 {
-        struct hash_entry *entry = _mesa_hash_table_search(vc5->write_jobs,
+        struct hash_entry *entry = _mesa_hash_table_search(v3d->write_jobs,
                                                            prsc);
         if (entry) {
-                struct vc5_job *job = entry->data;
-                vc5_job_submit(vc5, job);
+                struct v3d_job *job = entry->data;
+                v3d_job_submit(v3d, job);
         }
 }
 
 void
-vc5_flush_jobs_reading_resource(struct vc5_context *vc5,
+v3d_flush_jobs_reading_resource(struct v3d_context *v3d,
                                 struct pipe_resource *prsc)
 {
-        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct v3d_resource *rsc = v3d_resource(prsc);
 
-        vc5_flush_jobs_writing_resource(vc5, prsc);
+        v3d_flush_jobs_writing_resource(v3d, prsc);
 
         struct hash_entry *entry;
-        hash_table_foreach(vc5->jobs, entry) {
-                struct vc5_job *job = entry->data;
+        hash_table_foreach(v3d->jobs, entry) {
+                struct v3d_job *job = entry->data;
 
                 if (_mesa_set_search(job->bos, rsc->bo)) {
-                        vc5_job_submit(vc5, job);
-                        /* Reminder: vc5->jobs is safe to keep iterating even
+                        v3d_job_submit(v3d, job);
+                        /* Reminder: v3d->jobs is safe to keep iterating even
                          * after deletion of an entry.
                          */
                         continue;
@@ -185,7 +190,7 @@
 }
 
 static void
-vc5_job_set_tile_buffer_size(struct vc5_job *job)
+v3d_job_set_tile_buffer_size(struct v3d_job *job)
 {
         static const uint8_t tile_sizes[] = {
                 64, 64,
@@ -206,7 +211,7 @@
         int max_bpp = RENDER_TARGET_MAXIMUM_32BPP;
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 if (job->cbufs[i]) {
-                        struct vc5_surface *surf = vc5_surface(job->cbufs[i]);
+                        struct v3d_surface *surf = v3d_surface(job->cbufs[i]);
                         max_bpp = MAX2(max_bpp, surf->internal_bpp);
                 }
         }
@@ -220,19 +225,19 @@
 }
 
 /**
- * Returns a vc5_job struture for tracking V3D rendering to a particular FBO.
+ * Returns a v3d_job structure for tracking V3D rendering to a particular FBO.
  *
  * If we've already started rendering to this FBO, then return old same job,
  * otherwise make a new one.  If we're beginning rendering to an FBO, make
  * sure that any previous reads of the FBO (or writes to its color/Z surfaces)
  * have been flushed.
  */
-struct vc5_job *
-vc5_get_job(struct vc5_context *vc5,
+struct v3d_job *
+v3d_get_job(struct v3d_context *v3d,
             struct pipe_surface **cbufs, struct pipe_surface *zsbuf)
 {
         /* Return the existing job for this FBO if we have one */
-        struct vc5_job_key local_key = {
+        struct v3d_job_key local_key = {
                 .cbufs = {
                         cbufs[0],
                         cbufs[1],
@@ -241,7 +246,7 @@
                 },
                 .zsbuf = zsbuf,
         };
-        struct hash_entry *entry = _mesa_hash_table_search(vc5->jobs,
+        struct hash_entry *entry = _mesa_hash_table_search(v3d->jobs,
                                                            &local_key);
         if (entry)
                 return entry->data;
@@ -249,11 +254,11 @@
         /* Creating a new job.  Make sure that any previous jobs reading or
          * writing these buffers are flushed.
          */
-        struct vc5_job *job = vc5_job_create(vc5);
+        struct v3d_job *job = v3d_job_create(v3d);
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 if (cbufs[i]) {
-                        vc5_flush_jobs_reading_resource(vc5, cbufs[i]->texture);
+                        v3d_flush_jobs_reading_resource(v3d, cbufs[i]->texture);
                         pipe_surface_reference(&job->cbufs[i], cbufs[i]);
 
                         if (cbufs[i]->texture->nr_samples > 1)
@@ -261,135 +266,132 @@
                 }
         }
         if (zsbuf) {
-                vc5_flush_jobs_reading_resource(vc5, zsbuf->texture);
+                v3d_flush_jobs_reading_resource(v3d, zsbuf->texture);
                 pipe_surface_reference(&job->zsbuf, zsbuf);
                 if (zsbuf->texture->nr_samples > 1)
                         job->msaa = true;
         }
 
-        vc5_job_set_tile_buffer_size(job);
+        v3d_job_set_tile_buffer_size(job);
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 if (cbufs[i])
-                        _mesa_hash_table_insert(vc5->write_jobs,
+                        _mesa_hash_table_insert(v3d->write_jobs,
                                                 cbufs[i]->texture, job);
         }
-        if (zsbuf)
-                _mesa_hash_table_insert(vc5->write_jobs, zsbuf->texture, job);
+        if (zsbuf) {
+                _mesa_hash_table_insert(v3d->write_jobs, zsbuf->texture, job);
+
+                struct v3d_resource *rsc = v3d_resource(zsbuf->texture);
+                if (rsc->separate_stencil) {
+                        v3d_flush_jobs_reading_resource(v3d,
+                                                        &rsc->separate_stencil->base);
+                        _mesa_hash_table_insert(v3d->write_jobs,
+                                                &rsc->separate_stencil->base,
+                                                job);
+                }
+        }
 
         memcpy(&job->key, &local_key, sizeof(local_key));
-        _mesa_hash_table_insert(vc5->jobs, &job->key, job);
+        _mesa_hash_table_insert(v3d->jobs, &job->key, job);
 
         return job;
 }
 
-struct vc5_job *
-vc5_get_job_for_fbo(struct vc5_context *vc5)
+struct v3d_job *
+v3d_get_job_for_fbo(struct v3d_context *v3d)
 {
-        if (vc5->job)
-                return vc5->job;
+        if (v3d->job)
+                return v3d->job;
 
-        struct pipe_surface **cbufs = vc5->framebuffer.cbufs;
-        struct pipe_surface *zsbuf = vc5->framebuffer.zsbuf;
-        struct vc5_job *job = vc5_get_job(vc5, cbufs, zsbuf);
+        struct pipe_surface **cbufs = v3d->framebuffer.cbufs;
+        struct pipe_surface *zsbuf = v3d->framebuffer.zsbuf;
+        struct v3d_job *job = v3d_get_job(v3d, cbufs, zsbuf);
 
-        /* The dirty flags are tracking what's been updated while vc5->job has
+        /* The dirty flags are tracking what's been updated while v3d->job has
          * been bound, so set them all to ~0 when switching between jobs.  We
          * also need to reset all state at the start of rendering.
          */
-        vc5->dirty = ~0;
+        v3d->dirty = ~0;
 
         /* If we're binding to uninitialized buffers, no need to load their
          * contents before drawing.
          */
         for (int i = 0; i < 4; i++) {
                 if (cbufs[i]) {
-                        struct vc5_resource *rsc = vc5_resource(cbufs[i]->texture);
+                        struct v3d_resource *rsc = v3d_resource(cbufs[i]->texture);
                         if (!rsc->writes)
-                                job->cleared |= PIPE_CLEAR_COLOR0 << i;
+                                job->clear |= PIPE_CLEAR_COLOR0 << i;
                 }
         }
 
         if (zsbuf) {
-                struct vc5_resource *rsc = vc5_resource(zsbuf->texture);
+                struct v3d_resource *rsc = v3d_resource(zsbuf->texture);
                 if (!rsc->writes)
-                        job->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
+                        job->clear |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
         }
 
-        job->draw_tiles_x = DIV_ROUND_UP(vc5->framebuffer.width,
+        job->draw_tiles_x = DIV_ROUND_UP(v3d->framebuffer.width,
                                          job->tile_width);
-        job->draw_tiles_y = DIV_ROUND_UP(vc5->framebuffer.height,
+        job->draw_tiles_y = DIV_ROUND_UP(v3d->framebuffer.height,
                                          job->tile_height);
 
-        vc5->job = job;
+        v3d->job = job;
 
         return job;
 }
 
-static bool
-vc5_clif_dump_lookup(void *data, uint32_t addr, void **vaddr)
-{
-        struct vc5_job *job = data;
-        struct set_entry *entry;
-
-        set_foreach(job->bos, entry) {
-                struct vc5_bo *bo = (void *)entry->key;
-
-                if (addr >= bo->offset &&
-                    addr < bo->offset + bo->size) {
-                        vc5_bo_map(bo);
-                        *vaddr = bo->map + addr - bo->offset;
-                        return true;
-                }
-        }
-
-        return false;
-}
-
 static void
-vc5_clif_dump(struct vc5_context *vc5, struct vc5_job *job)
+v3d_clif_dump(struct v3d_context *v3d, struct v3d_job *job)
 {
-        if (!(V3D_DEBUG & V3D_DEBUG_CL))
+        if (!(V3D_DEBUG & (V3D_DEBUG_CL | V3D_DEBUG_CLIF)))
                 return;
 
-        struct clif_dump *clif = clif_dump_init(&vc5->screen->devinfo,
-                                                stderr, vc5_clif_dump_lookup,
-                                                job);
+        struct clif_dump *clif = clif_dump_init(&v3d->screen->devinfo,
+                                                stderr,
+                                                V3D_DEBUG & V3D_DEBUG_CL);
 
-        fprintf(stderr, "BCL: 0x%08x..0x%08x\n",
-                job->submit.bcl_start, job->submit.bcl_end);
+        struct set_entry *entry;
+        set_foreach(job->bos, entry) {
+                struct v3d_bo *bo = (void *)entry->key;
+                char *name = ralloc_asprintf(NULL, "%s_0x%x",
+                                             bo->name, bo->offset);
 
-        clif_dump_add_cl(clif, job->submit.bcl_start, job->submit.bcl_end);
+                v3d_bo_map(bo);
+                clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
 
-        fprintf(stderr, "RCL: 0x%08x..0x%08x\n",
-                job->submit.rcl_start, job->submit.rcl_end);
-        clif_dump_add_cl(clif, job->submit.rcl_start, job->submit.rcl_end);
+                ralloc_free(name);
+        }
+
+        clif_dump(clif, &job->submit);
+
+        clif_dump_destroy(clif);
 }
 
 /**
  * Submits the job to the kernel and then reinitializes it.
  */
 void
-vc5_job_submit(struct vc5_context *vc5, struct vc5_job *job)
+v3d_job_submit(struct v3d_context *v3d, struct v3d_job *job)
 {
-        MAYBE_UNUSED struct vc5_screen *screen = vc5->screen;
+        MAYBE_UNUSED struct v3d_screen *screen = v3d->screen;
 
         if (!job->needs_flush)
                 goto done;
 
-        if (vc5->screen->devinfo.ver >= 41)
+        if (v3d->screen->devinfo.ver >= 41)
                 v3d41_emit_rcl(job);
         else
                 v3d33_emit_rcl(job);
 
         if (cl_offset(&job->bcl) > 0) {
                 if (screen->devinfo.ver >= 41)
-                        v3d41_bcl_epilogue(vc5, job);
+                        v3d41_bcl_epilogue(v3d, job);
                 else
-                        v3d33_bcl_epilogue(vc5, job);
+                        v3d33_bcl_epilogue(v3d, job);
         }
 
-        job->submit.out_sync = vc5->out_sync;
+        job->submit.out_sync = v3d->out_sync;
         job->submit.bcl_end = job->bcl.bo->offset + cl_offset(&job->bcl);
         job->submit.rcl_end = job->rcl.bo->offset + cl_offset(&job->rcl);
 
@@ -397,23 +399,23 @@
          * instead of binner packets.
          */
         if (screen->devinfo.ver >= 41) {
-                vc5_job_add_bo(job, job->tile_alloc);
+                v3d_job_add_bo(job, job->tile_alloc);
                 job->submit.qma = job->tile_alloc->offset;
                 job->submit.qms = job->tile_alloc->size;
 
-                vc5_job_add_bo(job, job->tile_state);
+                v3d_job_add_bo(job, job->tile_state);
                 job->submit.qts = job->tile_state->offset;
         }
 
-        vc5_clif_dump(vc5, job);
+        v3d_clif_dump(v3d, job);
 
         if (!(V3D_DEBUG & V3D_DEBUG_NORAST)) {
                 int ret;
 
-#ifndef USE_VC5_SIMULATOR
-                ret = drmIoctl(vc5->fd, DRM_IOCTL_VC5_SUBMIT_CL, &job->submit);
+#ifndef USE_V3D_SIMULATOR
+                ret = drmIoctl(v3d->fd, DRM_IOCTL_V3D_SUBMIT_CL, &job->submit);
 #else
-                ret = vc5_simulator_flush(vc5, &job->submit, job);
+                ret = v3d_simulator_flush(v3d, &job->submit, job);
 #endif
                 static bool warned = false;
                 if (ret && !warned) {
@@ -424,28 +426,28 @@
         }
 
 done:
-        vc5_job_free(vc5, job);
+        v3d_job_free(v3d, job);
 }
 
 static bool
-vc5_job_compare(const void *a, const void *b)
+v3d_job_compare(const void *a, const void *b)
 {
-        return memcmp(a, b, sizeof(struct vc5_job_key)) == 0;
+        return memcmp(a, b, sizeof(struct v3d_job_key)) == 0;
 }
 
 static uint32_t
-vc5_job_hash(const void *key)
+v3d_job_hash(const void *key)
 {
-        return _mesa_hash_data(key, sizeof(struct vc5_job_key));
+        return _mesa_hash_data(key, sizeof(struct v3d_job_key));
 }
 
 void
-vc5_job_init(struct vc5_context *vc5)
+v3d_job_init(struct v3d_context *v3d)
 {
-        vc5->jobs = _mesa_hash_table_create(vc5,
-                                            vc5_job_hash,
-                                            vc5_job_compare);
-        vc5->write_jobs = _mesa_hash_table_create(vc5,
+        v3d->jobs = _mesa_hash_table_create(v3d,
+                                            v3d_job_hash,
+                                            v3d_job_compare);
+        v3d->write_jobs = _mesa_hash_table_create(v3d,
                                                   _mesa_hash_pointer,
                                                   _mesa_key_pointer_equal);
 }
diff --git a/src/gallium/drivers/vc5/vc5_program.c b/src/gallium/drivers/v3d/v3d_program.c
similarity index 68%
rename from src/gallium/drivers/vc5/vc5_program.c
rename to src/gallium/drivers/v3d/v3d_program.c
index 7bad80a..8555458 100644
--- a/src/gallium/drivers/vc5/vc5_program.c
+++ b/src/gallium/drivers/v3d/v3d_program.c
@@ -33,12 +33,12 @@
 #include "compiler/nir/nir_builder.h"
 #include "nir/tgsi_to_nir.h"
 #include "compiler/v3d_compiler.h"
-#include "vc5_context.h"
+#include "v3d_context.h"
 #include "broadcom/cle/v3d_packet_v33_pack.h"
 #include "mesa/state_tracker/st_glsl_types.h"
 
 static gl_varying_slot
-vc5_get_slot_for_driver_location(nir_shader *s, uint32_t driver_location)
+v3d_get_slot_for_driver_location(nir_shader *s, uint32_t driver_location)
 {
         nir_foreach_variable(var, &s->outputs) {
                 if (var->data.driver_location == driver_location) {
@@ -58,7 +58,7 @@
  * varyings together in a single data spec.
  */
 static void
-vc5_set_transform_feedback_outputs(struct vc5_uncompiled_shader *so,
+v3d_set_transform_feedback_outputs(struct v3d_uncompiled_shader *so,
                                    const struct pipe_stream_output_info *stream_output)
 {
         if (!stream_output->num_outputs)
@@ -96,7 +96,7 @@
                          */
                         for (int j = 0; j < output->num_components; j++) {
                                 gl_varying_slot slot =
-                                        vc5_get_slot_for_driver_location(so->base.ir.nir, output->register_index);
+                                        v3d_get_slot_for_driver_location(so->base.ir.nir, output->register_index);
 
                                 slots[slot_count] =
                                         v3d_slot_from_slot_and_component(slot,
@@ -121,10 +121,14 @@
                                  * values at the start.
                                  */
                                 .first_shaded_vertex_value_to_output = vpm_start_offset,
-                                .number_of_consecutive_vertex_values_to_output_as_32_bit_values_minus_1 = write_size - 1,
+                                .number_of_consecutive_vertex_values_to_output_as_32_bit_values = write_size,
                                 .output_buffer_to_write_to = buffer,
                         };
 
+                        /* GFXH-1559 */
+                        assert(unpacked.first_shaded_vertex_value_to_output != 8 ||
+                               so->num_tf_specs != 0);
+
                         assert(so->num_tf_specs != ARRAY_SIZE(so->tf_specs));
                         V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC_pack(NULL,
                                                                        (void *)&so->tf_specs[so->num_tf_specs],
@@ -136,6 +140,11 @@
                          * though.
                          */
                         unpacked.first_shaded_vertex_value_to_output++;
+
+                        /* GFXH-1559 */
+                        assert(unpacked.first_shaded_vertex_value_to_output != 8 ||
+                               so->num_tf_specs != 0);
+
                         V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC_pack(NULL,
                                                                        (void *)&so->tf_specs_psiz[so->num_tf_specs],
                                                                        &unpacked);
@@ -143,6 +152,8 @@
                         vpm_start_offset += write_size;
                         vpm_size -= write_size;
                 }
+                so->base.stream_output.stride[buffer] =
+                        stream_output->stride[buffer];
         }
 
         so->num_tf_outputs = slot_count;
@@ -164,15 +175,15 @@
 }
 
 static void *
-vc5_shader_state_create(struct pipe_context *pctx,
+v3d_shader_state_create(struct pipe_context *pctx,
                         const struct pipe_shader_state *cso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_uncompiled_shader *so = CALLOC_STRUCT(vc5_uncompiled_shader);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_uncompiled_shader *so = CALLOC_STRUCT(v3d_uncompiled_shader);
         if (!so)
                 return NULL;
 
-        so->program_id = vc5->next_uncompiled_program_id++;
+        so->program_id = v3d->next_uncompiled_program_id++;
 
         nir_shader *s;
 
@@ -218,7 +229,7 @@
         so->base.type = PIPE_SHADER_IR_NIR;
         so->base.ir.nir = s;
 
-        vc5_set_transform_feedback_outputs(so, &cso->stream_output);
+        v3d_set_transform_feedback_outputs(so, &cso->stream_output);
 
         if (V3D_DEBUG & (V3D_DEBUG_NIR |
                          v3d_debug_flag_for_shader_stage(s->info.stage))) {
@@ -232,19 +243,19 @@
         return so;
 }
 
-static struct vc5_compiled_shader *
-vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key)
+static struct v3d_compiled_shader *
+v3d_get_compiled_shader(struct v3d_context *v3d, struct v3d_key *key)
 {
-        struct vc5_uncompiled_shader *shader_state = key->shader_state;
+        struct v3d_uncompiled_shader *shader_state = key->shader_state;
         nir_shader *s = shader_state->base.ir.nir;
 
         struct hash_table *ht;
         uint32_t key_size;
         if (s->info.stage == MESA_SHADER_FRAGMENT) {
-                ht = vc5->fs_cache;
+                ht = v3d->fs_cache;
                 key_size = sizeof(struct v3d_fs_key);
         } else {
-                ht = vc5->vs_cache;
+                ht = v3d->vs_cache;
                 key_size = sizeof(struct v3d_vs_key);
         }
 
@@ -252,8 +263,8 @@
         if (entry)
                 return entry->data;
 
-        struct vc5_compiled_shader *shader =
-                rzalloc(NULL, struct vc5_compiled_shader);
+        struct v3d_compiled_shader *shader =
+                rzalloc(NULL, struct v3d_compiled_shader);
 
         int program_id = shader_state->program_id;
         int variant_id =
@@ -265,7 +276,7 @@
         case MESA_SHADER_VERTEX:
                 shader->prog_data.vs = rzalloc(shader, struct v3d_vs_prog_data);
 
-                qpu_insts = v3d_compile_vs(vc5->screen->compiler,
+                qpu_insts = v3d_compile_vs(v3d->screen->compiler,
                                            (struct v3d_vs_key *)key,
                                            shader->prog_data.vs, s,
                                            program_id, variant_id,
@@ -274,7 +285,7 @@
         case MESA_SHADER_FRAGMENT:
                 shader->prog_data.fs = rzalloc(shader, struct v3d_fs_prog_data);
 
-                qpu_insts = v3d_compile_fs(vc5->screen->compiler,
+                qpu_insts = v3d_compile_fs(v3d->screen->compiler,
                                            (struct v3d_fs_key *)key,
                                            shader->prog_data.fs, s,
                                            program_id, variant_id,
@@ -284,31 +295,31 @@
                 unreachable("bad stage");
         }
 
-        vc5_set_shader_uniform_dirty_flags(shader);
+        v3d_set_shader_uniform_dirty_flags(shader);
 
-        shader->bo = vc5_bo_alloc(vc5->screen, shader_size, "shader");
-        vc5_bo_map(shader->bo);
+        shader->bo = v3d_bo_alloc(v3d->screen, shader_size, "shader");
+        v3d_bo_map(shader->bo);
         memcpy(shader->bo->map, qpu_insts, shader_size);
 
         free(qpu_insts);
 
-        struct vc5_key *dup_key;
+        struct v3d_key *dup_key;
         dup_key = ralloc_size(shader, key_size);
         memcpy(dup_key, key, key_size);
         _mesa_hash_table_insert(ht, dup_key, shader);
 
         if (shader->prog_data.base->spill_size >
-            vc5->prog.spill_size_per_thread) {
+            v3d->prog.spill_size_per_thread) {
                 /* Max 4 QPUs per slice, 3 slices per core. We only do single
                  * core so far.  This overallocates memory on smaller cores.
                  */
                 int total_spill_size =
                         4 * 3 * shader->prog_data.base->spill_size;
 
-                vc5_bo_unreference(&vc5->prog.spill_bo);
-                vc5->prog.spill_bo = vc5_bo_alloc(vc5->screen,
+                v3d_bo_unreference(&v3d->prog.spill_bo);
+                v3d->prog.spill_bo = v3d_bo_alloc(v3d->screen,
                                                   total_spill_size, "spill");
-                vc5->prog.spill_size_per_thread =
+                v3d->prog.spill_size_per_thread =
                         shader->prog_data.base->spill_size;
         }
 
@@ -316,14 +327,14 @@
 }
 
 static void
-vc5_setup_shared_key(struct vc5_context *vc5, struct v3d_key *key,
-                     struct vc5_texture_stateobj *texstate)
+v3d_setup_shared_key(struct v3d_context *v3d, struct v3d_key *key,
+                     struct v3d_texture_stateobj *texstate)
 {
-        const struct v3d_device_info *devinfo = &vc5->screen->devinfo;
+        const struct v3d_device_info *devinfo = &v3d->screen->devinfo;
 
         for (int i = 0; i < texstate->num_textures; i++) {
                 struct pipe_sampler_view *sampler = texstate->textures[i];
-                struct vc5_sampler_view *vc5_sampler = vc5_sampler_view(sampler);
+                struct v3d_sampler_view *v3d_sampler = v3d_sampler_view(sampler);
                 struct pipe_sampler_state *sampler_state =
                         texstate->samplers[i];
 
@@ -331,7 +342,7 @@
                         continue;
 
                 key->tex[i].return_size =
-                        vc5_get_tex_return_size(devinfo,
+                        v3d_get_tex_return_size(devinfo,
                                                 sampler->format,
                                                 sampler_state->compare_mode);
 
@@ -345,14 +356,14 @@
                         key->tex[i].return_channels = 4;
                 } else {
                         key->tex[i].return_channels =
-                                vc5_get_tex_return_channels(devinfo,
+                                v3d_get_tex_return_channels(devinfo,
                                                             sampler->format);
                 }
 
                 if (key->tex[i].return_size == 32 && devinfo->ver < 40) {
                         memcpy(key->tex[i].swizzle,
-                               vc5_sampler->swizzle,
-                               sizeof(vc5_sampler->swizzle));
+                               v3d_sampler->swizzle,
+                               sizeof(v3d_sampler->swizzle));
                 } else {
                         /* For 16-bit returns, we let the sampler state handle
                          * the swizzle.
@@ -363,10 +374,7 @@
                         key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
                 }
 
-                if (sampler->texture->nr_samples > 1) {
-                        key->tex[i].msaa_width = sampler->texture->width0;
-                        key->tex[i].msaa_height = sampler->texture->height0;
-                } else if (sampler){
+                if (sampler) {
                         key->tex[i].compare_mode = sampler_state->compare_mode;
                         key->tex[i].compare_func = sampler_state->compare_func;
                         key->tex[i].clamp_s =
@@ -378,62 +386,62 @@
                 }
         }
 
-        key->ucp_enables = vc5->rasterizer->base.clip_plane_enable;
+        key->ucp_enables = v3d->rasterizer->base.clip_plane_enable;
 }
 
 static void
-vc5_update_compiled_fs(struct vc5_context *vc5, uint8_t prim_mode)
+v3d_update_compiled_fs(struct v3d_context *v3d, uint8_t prim_mode)
 {
-        struct vc5_job *job = vc5->job;
+        struct v3d_job *job = v3d->job;
         struct v3d_fs_key local_key;
         struct v3d_fs_key *key = &local_key;
 
-        if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
+        if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE |
                             VC5_DIRTY_BLEND |
                             VC5_DIRTY_FRAMEBUFFER |
                             VC5_DIRTY_ZSA |
                             VC5_DIRTY_RASTERIZER |
-                            VC5_DIRTY_SAMPLE_MASK |
+                            VC5_DIRTY_SAMPLE_STATE |
                             VC5_DIRTY_FRAGTEX |
                             VC5_DIRTY_UNCOMPILED_FS))) {
                 return;
         }
 
         memset(key, 0, sizeof(*key));
-        vc5_setup_shared_key(vc5, &key->base, &vc5->fragtex);
-        key->base.shader_state = vc5->prog.bind_fs;
+        v3d_setup_shared_key(v3d, &key->base, &v3d->fragtex);
+        key->base.shader_state = v3d->prog.bind_fs;
         key->is_points = (prim_mode == PIPE_PRIM_POINTS);
         key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
                          prim_mode <= PIPE_PRIM_LINE_STRIP);
-        key->clamp_color = vc5->rasterizer->base.clamp_fragment_color;
-        if (vc5->blend->logicop_enable) {
-                key->logicop_func = vc5->blend->logicop_func;
+        key->clamp_color = v3d->rasterizer->base.clamp_fragment_color;
+        if (v3d->blend->base.logicop_enable) {
+                key->logicop_func = v3d->blend->base.logicop_func;
         } else {
                 key->logicop_func = PIPE_LOGICOP_COPY;
         }
         if (job->msaa) {
-                key->msaa = vc5->rasterizer->base.multisample;
-                key->sample_coverage = (vc5->rasterizer->base.multisample &&
-                                        vc5->sample_mask != (1 << VC5_MAX_SAMPLES) - 1);
-                key->sample_alpha_to_coverage = vc5->blend->alpha_to_coverage;
-                key->sample_alpha_to_one = vc5->blend->alpha_to_one;
+                key->msaa = v3d->rasterizer->base.multisample;
+                key->sample_coverage = (v3d->rasterizer->base.multisample &&
+                                        v3d->sample_mask != (1 << VC5_MAX_SAMPLES) - 1);
+                key->sample_alpha_to_coverage = v3d->blend->base.alpha_to_coverage;
+                key->sample_alpha_to_one = v3d->blend->base.alpha_to_one;
         }
 
-        key->depth_enabled = (vc5->zsa->base.depth.enabled ||
-                              vc5->zsa->base.stencil[0].enabled);
-        if (vc5->zsa->base.alpha.enabled) {
+        key->depth_enabled = (v3d->zsa->base.depth.enabled ||
+                              v3d->zsa->base.stencil[0].enabled);
+        if (v3d->zsa->base.alpha.enabled) {
                 key->alpha_test = true;
-                key->alpha_test_func = vc5->zsa->base.alpha.func;
+                key->alpha_test_func = v3d->zsa->base.alpha.func;
         }
 
         /* gl_FragColor's propagation to however many bound color buffers
          * there are means that the buffer count needs to be in the key.
          */
-        key->nr_cbufs = vc5->framebuffer.nr_cbufs;
-        key->swap_color_rb = vc5->swap_color_rb;
+        key->nr_cbufs = v3d->framebuffer.nr_cbufs;
+        key->swap_color_rb = v3d->swap_color_rb;
 
         for (int i = 0; i < key->nr_cbufs; i++) {
-                struct pipe_surface *cbuf = vc5->framebuffer.cbufs[i];
+                struct pipe_surface *cbuf = v3d->framebuffer.cbufs[i];
                 if (!cbuf)
                         continue;
 
@@ -445,7 +453,7 @@
                         key->f32_color_rb |= 1 << i;
                 }
 
-                if (vc5->prog.bind_fs->was_tgsi) {
+                if (v3d->prog.bind_fs->was_tgsi) {
                         if (util_format_is_pure_uint(cbuf->format))
                                 key->uint_color_rb |= 1 << i;
                         else if (util_format_is_pure_sint(cbuf->format))
@@ -455,42 +463,53 @@
 
         if (key->is_points) {
                 key->point_sprite_mask =
-                        vc5->rasterizer->base.sprite_coord_enable;
+                        v3d->rasterizer->base.sprite_coord_enable;
                 key->point_coord_upper_left =
-                        (vc5->rasterizer->base.sprite_coord_mode ==
+                        (v3d->rasterizer->base.sprite_coord_mode ==
                          PIPE_SPRITE_COORD_UPPER_LEFT);
         }
 
-        key->light_twoside = vc5->rasterizer->base.light_twoside;
-        key->shade_model_flat = vc5->rasterizer->base.flatshade;
+        key->light_twoside = v3d->rasterizer->base.light_twoside;
+        key->shade_model_flat = v3d->rasterizer->base.flatshade;
 
-        struct vc5_compiled_shader *old_fs = vc5->prog.fs;
-        vc5->prog.fs = vc5_get_compiled_shader(vc5, &key->base);
-        if (vc5->prog.fs == old_fs)
+        struct v3d_compiled_shader *old_fs = v3d->prog.fs;
+        v3d->prog.fs = v3d_get_compiled_shader(v3d, &key->base);
+        if (v3d->prog.fs == old_fs)
                 return;
 
-        vc5->dirty |= VC5_DIRTY_COMPILED_FS;
+        v3d->dirty |= VC5_DIRTY_COMPILED_FS;
 
-        if (old_fs &&
-            vc5->prog.fs->prog_data.fs->flat_shade_flags !=
-            old_fs->prog_data.fs->flat_shade_flags) {
-                vc5->dirty |= VC5_DIRTY_FLAT_SHADE_FLAGS;
+        if (old_fs) {
+                if (v3d->prog.fs->prog_data.fs->flat_shade_flags !=
+                    old_fs->prog_data.fs->flat_shade_flags) {
+                        v3d->dirty |= VC5_DIRTY_FLAT_SHADE_FLAGS;
+                }
+
+                if (v3d->prog.fs->prog_data.fs->noperspective_flags !=
+                    old_fs->prog_data.fs->noperspective_flags) {
+                        v3d->dirty |= VC5_DIRTY_NOPERSPECTIVE_FLAGS;
+                }
+
+                if (v3d->prog.fs->prog_data.fs->centroid_flags !=
+                    old_fs->prog_data.fs->centroid_flags) {
+                        v3d->dirty |= VC5_DIRTY_CENTROID_FLAGS;
+                }
         }
 
-        if (old_fs && memcmp(vc5->prog.fs->prog_data.fs->input_slots,
+        if (old_fs && memcmp(v3d->prog.fs->prog_data.fs->input_slots,
                              old_fs->prog_data.fs->input_slots,
-                             sizeof(vc5->prog.fs->prog_data.fs->input_slots))) {
-                vc5->dirty |= VC5_DIRTY_FS_INPUTS;
+                             sizeof(v3d->prog.fs->prog_data.fs->input_slots))) {
+                v3d->dirty |= VC5_DIRTY_FS_INPUTS;
         }
 }
 
 static void
-vc5_update_compiled_vs(struct vc5_context *vc5, uint8_t prim_mode)
+v3d_update_compiled_vs(struct v3d_context *v3d, uint8_t prim_mode)
 {
         struct v3d_vs_key local_key;
         struct v3d_vs_key *key = &local_key;
 
-        if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
+        if (!(v3d->dirty & (VC5_DIRTY_PRIM_MODE |
                             VC5_DIRTY_RASTERIZER |
                             VC5_DIRTY_VERTTEX |
                             VC5_DIRTY_VTXSTATE |
@@ -500,29 +519,29 @@
         }
 
         memset(key, 0, sizeof(*key));
-        vc5_setup_shared_key(vc5, &key->base, &vc5->verttex);
-        key->base.shader_state = vc5->prog.bind_vs;
-        key->num_fs_inputs = vc5->prog.fs->prog_data.fs->base.num_inputs;
+        v3d_setup_shared_key(v3d, &key->base, &v3d->verttex);
+        key->base.shader_state = v3d->prog.bind_vs;
+        key->num_fs_inputs = v3d->prog.fs->prog_data.fs->base.num_inputs;
         STATIC_ASSERT(sizeof(key->fs_inputs) ==
-                      sizeof(vc5->prog.fs->prog_data.fs->input_slots));
-        memcpy(key->fs_inputs, vc5->prog.fs->prog_data.fs->input_slots,
+                      sizeof(v3d->prog.fs->prog_data.fs->input_slots));
+        memcpy(key->fs_inputs, v3d->prog.fs->prog_data.fs->input_slots,
                sizeof(key->fs_inputs));
-        key->clamp_color = vc5->rasterizer->base.clamp_vertex_color;
+        key->clamp_color = v3d->rasterizer->base.clamp_vertex_color;
 
         key->per_vertex_point_size =
                 (prim_mode == PIPE_PRIM_POINTS &&
-                 vc5->rasterizer->base.point_size_per_vertex);
+                 v3d->rasterizer->base.point_size_per_vertex);
 
-        struct vc5_compiled_shader *vs =
-                vc5_get_compiled_shader(vc5, &key->base);
-        if (vs != vc5->prog.vs) {
-                vc5->prog.vs = vs;
-                vc5->dirty |= VC5_DIRTY_COMPILED_VS;
+        struct v3d_compiled_shader *vs =
+                v3d_get_compiled_shader(v3d, &key->base);
+        if (vs != v3d->prog.vs) {
+                v3d->prog.vs = vs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_VS;
         }
 
         key->is_coord = true;
         /* Coord shaders only output varyings used by transform feedback. */
-        struct vc5_uncompiled_shader *shader_state = key->base.shader_state;
+        struct v3d_uncompiled_shader *shader_state = key->base.shader_state;
         memcpy(key->fs_inputs, shader_state->tf_outputs,
                sizeof(*key->fs_inputs) * shader_state->num_tf_outputs);
         if (shader_state->num_tf_outputs < key->num_fs_inputs) {
@@ -533,19 +552,19 @@
         }
         key->num_fs_inputs = shader_state->num_tf_outputs;
 
-        struct vc5_compiled_shader *cs =
-                vc5_get_compiled_shader(vc5, &key->base);
-        if (cs != vc5->prog.cs) {
-                vc5->prog.cs = cs;
-                vc5->dirty |= VC5_DIRTY_COMPILED_CS;
+        struct v3d_compiled_shader *cs =
+                v3d_get_compiled_shader(v3d, &key->base);
+        if (cs != v3d->prog.cs) {
+                v3d->prog.cs = cs;
+                v3d->dirty |= VC5_DIRTY_COMPILED_CS;
         }
 }
 
 void
-vc5_update_compiled_shaders(struct vc5_context *vc5, uint8_t prim_mode)
+v3d_update_compiled_shaders(struct v3d_context *v3d, uint8_t prim_mode)
 {
-        vc5_update_compiled_fs(vc5, prim_mode);
-        vc5_update_compiled_vs(vc5, prim_mode);
+        v3d_update_compiled_fs(v3d, prim_mode);
+        v3d_update_compiled_vs(v3d, prim_mode);
 }
 
 static uint32_t
@@ -574,16 +593,16 @@
 
 static void
 delete_from_cache_if_matches(struct hash_table *ht,
-                             struct vc5_compiled_shader **last_compile,
+                             struct v3d_compiled_shader **last_compile,
                              struct hash_entry *entry,
-                             struct vc5_uncompiled_shader *so)
+                             struct v3d_uncompiled_shader *so)
 {
         const struct v3d_key *key = entry->key;
 
         if (key->shader_state == so) {
-                struct vc5_compiled_shader *shader = entry->data;
+                struct v3d_compiled_shader *shader = entry->data;
                 _mesa_hash_table_remove(ht, entry);
-                vc5_bo_unreference(&shader->bo);
+                v3d_bo_unreference(&shader->bo);
 
                 if (shader == *last_compile)
                         *last_compile = NULL;
@@ -593,18 +612,18 @@
 }
 
 static void
-vc5_shader_state_delete(struct pipe_context *pctx, void *hwcso)
+v3d_shader_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_uncompiled_shader *so = hwcso;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_uncompiled_shader *so = hwcso;
 
         struct hash_entry *entry;
-        hash_table_foreach(vc5->fs_cache, entry) {
-                delete_from_cache_if_matches(vc5->fs_cache, &vc5->prog.fs,
+        hash_table_foreach(v3d->fs_cache, entry) {
+                delete_from_cache_if_matches(v3d->fs_cache, &v3d->prog.fs,
                                              entry, so);
         }
-        hash_table_foreach(vc5->vs_cache, entry) {
-                delete_from_cache_if_matches(vc5->vs_cache, &vc5->prog.vs,
+        hash_table_foreach(v3d->vs_cache, entry) {
+                delete_from_cache_if_matches(v3d->vs_cache, &v3d->prog.vs,
                                              entry, so);
         }
 
@@ -613,58 +632,60 @@
 }
 
 static void
-vc5_fp_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_fp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->prog.bind_fs = hwcso;
-        vc5->dirty |= VC5_DIRTY_UNCOMPILED_FS;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->prog.bind_fs = hwcso;
+        v3d->dirty |= VC5_DIRTY_UNCOMPILED_FS;
 }
 
 static void
-vc5_vp_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_vp_state_bind(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->prog.bind_vs = hwcso;
-        vc5->dirty |= VC5_DIRTY_UNCOMPILED_VS;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->prog.bind_vs = hwcso;
+        v3d->dirty |= VC5_DIRTY_UNCOMPILED_VS;
 }
 
 void
-vc5_program_init(struct pipe_context *pctx)
+v3d_program_init(struct pipe_context *pctx)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
-        pctx->create_vs_state = vc5_shader_state_create;
-        pctx->delete_vs_state = vc5_shader_state_delete;
+        pctx->create_vs_state = v3d_shader_state_create;
+        pctx->delete_vs_state = v3d_shader_state_delete;
 
-        pctx->create_fs_state = vc5_shader_state_create;
-        pctx->delete_fs_state = vc5_shader_state_delete;
+        pctx->create_fs_state = v3d_shader_state_create;
+        pctx->delete_fs_state = v3d_shader_state_delete;
 
-        pctx->bind_fs_state = vc5_fp_state_bind;
-        pctx->bind_vs_state = vc5_vp_state_bind;
+        pctx->bind_fs_state = v3d_fp_state_bind;
+        pctx->bind_vs_state = v3d_vp_state_bind;
 
-        vc5->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
+        v3d->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
                                                 fs_cache_compare);
-        vc5->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
+        v3d->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
                                                 vs_cache_compare);
 }
 
 void
-vc5_program_fini(struct pipe_context *pctx)
+v3d_program_fini(struct pipe_context *pctx)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
         struct hash_entry *entry;
-        hash_table_foreach(vc5->fs_cache, entry) {
-                struct vc5_compiled_shader *shader = entry->data;
-                vc5_bo_unreference(&shader->bo);
+        hash_table_foreach(v3d->fs_cache, entry) {
+                struct v3d_compiled_shader *shader = entry->data;
+                v3d_bo_unreference(&shader->bo);
                 ralloc_free(shader);
-                _mesa_hash_table_remove(vc5->fs_cache, entry);
+                _mesa_hash_table_remove(v3d->fs_cache, entry);
         }
 
-        hash_table_foreach(vc5->vs_cache, entry) {
-                struct vc5_compiled_shader *shader = entry->data;
-                vc5_bo_unreference(&shader->bo);
+        hash_table_foreach(v3d->vs_cache, entry) {
+                struct v3d_compiled_shader *shader = entry->data;
+                v3d_bo_unreference(&shader->bo);
                 ralloc_free(shader);
-                _mesa_hash_table_remove(vc5->vs_cache, entry);
+                _mesa_hash_table_remove(v3d->vs_cache, entry);
         }
+
+        v3d_bo_unreference(&v3d->prog.spill_bo);
 }
diff --git a/src/gallium/drivers/vc5/vc5_query.c b/src/gallium/drivers/v3d/v3d_query.c
similarity index 62%
rename from src/gallium/drivers/vc5/vc5_query.c
rename to src/gallium/drivers/v3d/v3d_query.c
index 9aa80cf..d344779 100644
--- a/src/gallium/drivers/vc5/vc5_query.c
+++ b/src/gallium/drivers/v3d/v3d_query.c
@@ -33,21 +33,21 @@
  * do the calculations in software at draw time.
  */
 
-#include "vc5_context.h"
+#include "v3d_context.h"
 #include "broadcom/cle/v3d_packet_v33_pack.h"
 
-struct vc5_query
+struct v3d_query
 {
         enum pipe_query_type type;
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
 
         uint32_t start, end;
 };
 
 static struct pipe_query *
-vc5_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index)
+v3d_create_query(struct pipe_context *pctx, unsigned query_type, unsigned index)
 {
-        struct vc5_query *q = calloc(1, sizeof(*q));
+        struct v3d_query *q = calloc(1, sizeof(*q));
 
         q->type = query_type;
 
@@ -56,34 +56,34 @@
 }
 
 static void
-vc5_destroy_query(struct pipe_context *pctx, struct pipe_query *query)
+v3d_destroy_query(struct pipe_context *pctx, struct pipe_query *query)
 {
-        struct vc5_query *q = (struct vc5_query *)query;
+        struct v3d_query *q = (struct v3d_query *)query;
 
-        vc5_bo_unreference(&q->bo);
+        v3d_bo_unreference(&q->bo);
         free(q);
 }
 
 static boolean
-vc5_begin_query(struct pipe_context *pctx, struct pipe_query *query)
+v3d_begin_query(struct pipe_context *pctx, struct pipe_query *query)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_query *q = (struct vc5_query *)query;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_query *q = (struct v3d_query *)query;
 
         switch (q->type) {
         case PIPE_QUERY_PRIMITIVES_GENERATED:
-                q->start = vc5->prims_generated;
+                q->start = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
-                q->start = vc5->tf_prims_generated;
+                q->start = v3d->tf_prims_generated;
                 break;
         default:
-                q->bo = vc5_bo_alloc(vc5->screen, 4096, "query");
+                q->bo = v3d_bo_alloc(v3d->screen, 4096, "query");
 
-                uint32_t *map = vc5_bo_map(q->bo);
+                uint32_t *map = v3d_bo_map(q->bo);
                 *map = 0;
-                vc5->current_oq = q->bo;
-                vc5->dirty |= VC5_DIRTY_OQ;
+                v3d->current_oq = q->bo;
+                v3d->dirty |= VC5_DIRTY_OQ;
                 break;
         }
 
@@ -91,21 +91,21 @@
 }
 
 static bool
-vc5_end_query(struct pipe_context *pctx, struct pipe_query *query)
+v3d_end_query(struct pipe_context *pctx, struct pipe_query *query)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_query *q = (struct vc5_query *)query;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_query *q = (struct v3d_query *)query;
 
         switch (q->type) {
         case PIPE_QUERY_PRIMITIVES_GENERATED:
-                q->end = vc5->prims_generated;
+                q->end = v3d->prims_generated;
                 break;
         case PIPE_QUERY_PRIMITIVES_EMITTED:
-                q->end = vc5->tf_prims_generated;
+                q->end = v3d->tf_prims_generated;
                 break;
         default:
-                vc5->current_oq = NULL;
-                vc5->dirty |= VC5_DIRTY_OQ;
+                v3d->current_oq = NULL;
+                v3d->dirty |= VC5_DIRTY_OQ;
                 break;
         }
 
@@ -113,29 +113,29 @@
 }
 
 static boolean
-vc5_get_query_result(struct pipe_context *pctx, struct pipe_query *query,
+v3d_get_query_result(struct pipe_context *pctx, struct pipe_query *query,
                      boolean wait, union pipe_query_result *vresult)
 {
-        struct vc5_query *q = (struct vc5_query *)query;
+        struct v3d_query *q = (struct v3d_query *)query;
         uint32_t result = 0;
 
         if (q->bo) {
                 /* XXX: Only flush the jobs using this BO. */
-                vc5_flush(pctx);
+                v3d_flush(pctx);
 
                 if (wait) {
-                        if (!vc5_bo_wait(q->bo, 0, "query"))
+                        if (!v3d_bo_wait(q->bo, ~0ull, "query")) /* block */
                                 return false;
                 } else {
-                        if (!vc5_bo_wait(q->bo, ~0ull, "query"))
+                        if (!v3d_bo_wait(q->bo, 0, "query")) /* poll */
                                 return false;
                 }
 
                 /* XXX: Sum up per-core values. */
-                uint32_t *map = vc5_bo_map(q->bo);
+                uint32_t *map = v3d_bo_map(q->bo);
                 result = *map;
 
-                vc5_bo_unreference(&q->bo);
+                v3d_bo_unreference(&q->bo);
         }
 
         switch (q->type) {
@@ -158,23 +158,23 @@
 }
 
 static void
-vc5_set_active_query_state(struct pipe_context *pctx, boolean enable)
+v3d_set_active_query_state(struct pipe_context *pctx, boolean enable)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
-        vc5->active_queries = enable;
-        vc5->dirty |= VC5_DIRTY_OQ;
-        vc5->dirty |= VC5_DIRTY_STREAMOUT;
+        v3d->active_queries = enable;
+        v3d->dirty |= VC5_DIRTY_OQ;
+        v3d->dirty |= VC5_DIRTY_STREAMOUT;
 }
 
 void
-vc5_query_init(struct pipe_context *pctx)
+v3d_query_init(struct pipe_context *pctx)
 {
-        pctx->create_query = vc5_create_query;
-        pctx->destroy_query = vc5_destroy_query;
-        pctx->begin_query = vc5_begin_query;
-        pctx->end_query = vc5_end_query;
-        pctx->get_query_result = vc5_get_query_result;
-        pctx->set_active_query_state = vc5_set_active_query_state;
+        pctx->create_query = v3d_create_query;
+        pctx->destroy_query = v3d_destroy_query;
+        pctx->begin_query = v3d_begin_query;
+        pctx->end_query = v3d_end_query;
+        pctx->get_query_result = v3d_get_query_result;
+        pctx->set_active_query_state = v3d_set_active_query_state;
 }
 
diff --git a/src/gallium/drivers/vc5/vc5_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
similarity index 79%
rename from src/gallium/drivers/vc5/vc5_resource.c
rename to src/gallium/drivers/v3d/v3d_resource.c
index c8c99cf..8bf6a97 100644
--- a/src/gallium/drivers/vc5/vc5_resource.c
+++ b/src/gallium/drivers/v3d/v3d_resource.c
@@ -33,14 +33,14 @@
 #include "util/u_format_zs.h"
 
 #include "drm_fourcc.h"
-#include "vc5_screen.h"
-#include "vc5_context.h"
-#include "vc5_resource.h"
-#include "vc5_tiling.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+#include "v3d_tiling.h"
 #include "broadcom/cle/v3d_packet_v33_pack.h"
 
 static void
-vc5_debug_resource_layout(struct vc5_resource *rsc, const char *caller)
+v3d_debug_resource_layout(struct v3d_resource *rsc, const char *caller)
 {
         if (!(V3D_DEBUG & V3D_DEBUG_SURFACE))
                 return;
@@ -68,7 +68,7 @@
         };
 
         for (int i = 0; i <= prsc->last_level; i++) {
-                struct vc5_resource_slice *slice = &rsc->slices[i];
+                struct v3d_resource_slice *slice = &rsc->slices[i];
 
                 int level_width = slice->stride / rsc->cpp;
                 int level_height = slice->padded_height;
@@ -94,17 +94,17 @@
 }
 
 static bool
-vc5_resource_bo_alloc(struct vc5_resource *rsc)
+v3d_resource_bo_alloc(struct v3d_resource *rsc)
 {
         struct pipe_resource *prsc = &rsc->base;
         struct pipe_screen *pscreen = prsc->screen;
-        struct vc5_bo *bo;
+        struct v3d_bo *bo;
 
-        bo = vc5_bo_alloc(vc5_screen(pscreen), rsc->size, "resource");
+        bo = v3d_bo_alloc(v3d_screen(pscreen), rsc->size, "resource");
         if (bo) {
-                vc5_bo_unreference(&rsc->bo);
+                v3d_bo_unreference(&rsc->bo);
                 rsc->bo = bo;
-                vc5_debug_resource_layout(rsc, "alloc");
+                v3d_debug_resource_layout(rsc, "alloc");
                 return true;
         } else {
                 return false;
@@ -112,23 +112,23 @@
 }
 
 static void
-vc5_resource_transfer_unmap(struct pipe_context *pctx,
+v3d_resource_transfer_unmap(struct pipe_context *pctx,
                             struct pipe_transfer *ptrans)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_transfer *trans = vc5_transfer(ptrans);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_transfer *trans = v3d_transfer(ptrans);
 
         if (trans->map) {
-                struct vc5_resource *rsc = vc5_resource(ptrans->resource);
-                struct vc5_resource_slice *slice = &rsc->slices[ptrans->level];
+                struct v3d_resource *rsc = v3d_resource(ptrans->resource);
+                struct v3d_resource_slice *slice = &rsc->slices[ptrans->level];
 
                 if (ptrans->usage & PIPE_TRANSFER_WRITE) {
                         for (int z = 0; z < ptrans->box.depth; z++) {
                                 void *dst = rsc->bo->map +
-                                        vc5_layer_offset(&rsc->base,
+                                        v3d_layer_offset(&rsc->base,
                                                          ptrans->level,
                                                          ptrans->box.z + z);
-                                vc5_store_tiled_image(dst,
+                                v3d_store_tiled_image(dst,
                                                       slice->stride,
                                                       (trans->map +
                                                        ptrans->stride *
@@ -143,19 +143,19 @@
         }
 
         pipe_resource_reference(&ptrans->resource, NULL);
-        slab_free(&vc5->transfer_pool, ptrans);
+        slab_free(&v3d->transfer_pool, ptrans);
 }
 
 static void *
-vc5_resource_transfer_map(struct pipe_context *pctx,
+v3d_resource_transfer_map(struct pipe_context *pctx,
                           struct pipe_resource *prsc,
                           unsigned level, unsigned usage,
                           const struct pipe_box *box,
                           struct pipe_transfer **pptrans)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_resource *rsc = vc5_resource(prsc);
-        struct vc5_transfer *trans;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_resource *rsc = v3d_resource(prsc);
+        struct v3d_transfer *trans;
         struct pipe_transfer *ptrans;
         enum pipe_format format = prsc->format;
         char *buf;
@@ -168,7 +168,7 @@
          */
         if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
             !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) &&
-            !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) &&
+            !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) &&
             prsc->last_level == 0 &&
             prsc->width0 == box->width &&
             prsc->height0 == box->height &&
@@ -179,20 +179,20 @@
         }
 
         if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
-                if (vc5_resource_bo_alloc(rsc)) {
+                if (v3d_resource_bo_alloc(rsc)) {
                         /* If it might be bound as one of our vertex buffers
                          * or UBOs, make sure we re-emit vertex buffer state
                          * or uniforms.
                          */
                         if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
-                                vc5->dirty |= VC5_DIRTY_VTXBUF;
+                                v3d->dirty |= VC5_DIRTY_VTXBUF;
                         if (prsc->bind & PIPE_BIND_CONSTANT_BUFFER)
-                                vc5->dirty |= VC5_DIRTY_CONSTBUF;
+                                v3d->dirty |= VC5_DIRTY_CONSTBUF;
                 } else {
                         /* If we failed to reallocate, flush users so that we
                          * don't violate any syncing requirements.
                          */
-                        vc5_flush_jobs_reading_resource(vc5, prsc);
+                        v3d_flush_jobs_reading_resource(v3d, prsc);
                 }
         } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                 /* If we're writing and the buffer is being used by the CL, we
@@ -200,9 +200,9 @@
                  * to flush if the CL has written our buffer.
                  */
                 if (usage & PIPE_TRANSFER_WRITE)
-                        vc5_flush_jobs_reading_resource(vc5, prsc);
+                        v3d_flush_jobs_reading_resource(v3d, prsc);
                 else
-                        vc5_flush_jobs_writing_resource(vc5, prsc);
+                        v3d_flush_jobs_writing_resource(v3d, prsc);
         }
 
         if (usage & PIPE_TRANSFER_WRITE) {
@@ -210,7 +210,7 @@
                 rsc->initialized_buffers = ~0;
         }
 
-        trans = slab_alloc(&vc5->transfer_pool);
+        trans = slab_alloc(&v3d->transfer_pool);
         if (!trans)
                 return NULL;
 
@@ -230,9 +230,9 @@
          */
 
         if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
-                buf = vc5_bo_map_unsynchronized(rsc->bo);
+                buf = v3d_bo_map_unsynchronized(rsc->bo);
         else
-                buf = vc5_bo_map(rsc->bo);
+                buf = v3d_bo_map(rsc->bo);
         if (!buf) {
                 fprintf(stderr, "Failed to map bo\n");
                 goto fail;
@@ -248,7 +248,7 @@
         ptrans->box.height = DIV_ROUND_UP(ptrans->box.height,
                                           util_format_get_blockheight(format));
 
-        struct vc5_resource_slice *slice = &rsc->slices[level];
+        struct v3d_resource_slice *slice = &rsc->slices[level];
         if (rsc->tiled) {
                 /* No direct mappings of tiled, since we need to manually
                  * tile/untile.
@@ -264,10 +264,10 @@
                 if (usage & PIPE_TRANSFER_READ) {
                         for (int z = 0; z < ptrans->box.depth; z++) {
                                 void *src = rsc->bo->map +
-                                        vc5_layer_offset(&rsc->base,
+                                        v3d_layer_offset(&rsc->base,
                                                          ptrans->level,
                                                          ptrans->box.z + z);
-                                vc5_load_tiled_image((trans->map +
+                                v3d_load_tiled_image((trans->map +
                                                       ptrans->stride *
                                                       ptrans->box.height * z),
                                                      ptrans->stride,
@@ -281,7 +281,7 @@
                 return trans->map;
         } else {
                 ptrans->stride = slice->stride;
-                ptrans->layer_stride = ptrans->stride;
+                ptrans->layer_stride = rsc->cube_map_stride;
 
                 return buf + slice->offset +
                         ptrans->box.y * ptrans->stride +
@@ -291,31 +291,32 @@
 
 
 fail:
-        vc5_resource_transfer_unmap(pctx, ptrans);
+        v3d_resource_transfer_unmap(pctx, ptrans);
         return NULL;
 }
 
 static void
-vc5_resource_destroy(struct pipe_screen *pscreen,
+v3d_resource_destroy(struct pipe_screen *pscreen,
                      struct pipe_resource *prsc)
 {
-        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct v3d_resource *rsc = v3d_resource(prsc);
 
-        vc5_bo_unreference(&rsc->bo);
+        v3d_bo_unreference(&rsc->bo);
         free(rsc);
 }
 
 static boolean
-vc5_resource_get_handle(struct pipe_screen *pscreen,
+v3d_resource_get_handle(struct pipe_screen *pscreen,
                         struct pipe_context *pctx,
                         struct pipe_resource *prsc,
                         struct winsys_handle *whandle,
                         unsigned usage)
 {
-        struct vc5_resource *rsc = vc5_resource(prsc);
-        struct vc5_bo *bo = rsc->bo;
+        struct v3d_resource *rsc = v3d_resource(prsc);
+        struct v3d_bo *bo = rsc->bo;
 
         whandle->stride = rsc->slices[0].stride;
+        whandle->offset = 0;
 
         /* If we're passing some reference to our BO out to some other part of
          * the system, then we can't do any optimizations about only us being
@@ -323,14 +324,25 @@
          */
         bo->private = false;
 
+        if (rsc->tiled) {
+                /* A shared tiled buffer should always be allocated as UIF,
+                 * not UBLINEAR or LT.
+                 */
+                assert(rsc->slices[0].tiling == VC5_TILING_UIF_XOR ||
+                       rsc->slices[0].tiling == VC5_TILING_UIF_NO_XOR);
+                whandle->modifier = DRM_FORMAT_MOD_BROADCOM_UIF;
+        } else {
+                whandle->modifier = DRM_FORMAT_MOD_LINEAR;
+        }
+
         switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
-                return vc5_bo_flink(bo, &whandle->handle);
-        case DRM_API_HANDLE_TYPE_KMS:
+        case WINSYS_HANDLE_TYPE_SHARED:
+                return v3d_bo_flink(bo, &whandle->handle);
+        case WINSYS_HANDLE_TYPE_KMS:
                 whandle->handle = bo->handle;
                 return TRUE;
-        case DRM_API_HANDLE_TYPE_FD:
-                whandle->handle = vc5_bo_get_dmabuf(bo);
+        case WINSYS_HANDLE_TYPE_FD:
+                whandle->handle = v3d_bo_get_dmabuf(bo);
                 return whandle->handle != -1;
         }
 
@@ -350,9 +362,9 @@
  * between columns of UIF blocks.
  */
 static uint32_t
-vc5_get_ub_pad(struct vc5_resource *rsc, uint32_t height)
+v3d_get_ub_pad(struct v3d_resource *rsc, uint32_t height)
 {
-        uint32_t utile_h = vc5_utile_height(rsc->cpp);
+        uint32_t utile_h = v3d_utile_height(rsc->cpp);
         uint32_t uif_block_h = utile_h * 2;
         uint32_t height_ub = height / uif_block_h;
 
@@ -384,7 +396,7 @@
 }
 
 static void
-vc5_setup_slices(struct vc5_resource *rsc)
+v3d_setup_slices(struct v3d_resource *rsc)
 {
         struct pipe_resource *prsc = &rsc->base;
         uint32_t width = prsc->width0;
@@ -399,8 +411,8 @@
         uint32_t pot_height = 2 * util_next_power_of_two(u_minify(height, 1));
         uint32_t pot_depth = 2 * util_next_power_of_two(u_minify(depth, 1));
         uint32_t offset = 0;
-        uint32_t utile_w = vc5_utile_width(rsc->cpp);
-        uint32_t utile_h = vc5_utile_height(rsc->cpp);
+        uint32_t utile_w = v3d_utile_width(rsc->cpp);
+        uint32_t utile_h = v3d_utile_height(rsc->cpp);
         uint32_t uif_block_w = utile_w * 2;
         uint32_t uif_block_h = utile_h * 2;
         uint32_t block_width = util_format_get_blockwidth(prsc->format);
@@ -412,7 +424,7 @@
         bool uif_top = msaa;
 
         for (int i = prsc->last_level; i >= 0; i--) {
-                struct vc5_resource_slice *slice = &rsc->slices[i];
+                struct v3d_resource_slice *slice = &rsc->slices[i];
 
                 uint32_t level_width, level_height, level_depth;
                 if (i < 2) {
@@ -466,7 +478,7 @@
                                 level_height = align(level_height,
                                                      uif_block_h);
 
-                                slice->ub_pad = vc5_get_ub_pad(rsc,
+                                slice->ub_pad = v3d_get_ub_pad(rsc,
                                                                level_height);
                                 level_height += slice->ub_pad * uif_block_h;
 
@@ -539,10 +551,10 @@
 }
 
 uint32_t
-vc5_layer_offset(struct pipe_resource *prsc, uint32_t level, uint32_t layer)
+v3d_layer_offset(struct pipe_resource *prsc, uint32_t level, uint32_t layer)
 {
-        struct vc5_resource *rsc = vc5_resource(prsc);
-        struct vc5_resource_slice *slice = &rsc->slices[level];
+        struct v3d_resource *rsc = v3d_resource(prsc);
+        struct v3d_resource_slice *slice = &rsc->slices[level];
 
         if (prsc->target == PIPE_TEXTURE_3D)
                 return slice->offset + layer * slice->size;
@@ -550,12 +562,12 @@
                 return slice->offset + layer * rsc->cube_map_stride;
 }
 
-static struct vc5_resource *
-vc5_resource_setup(struct pipe_screen *pscreen,
+static struct v3d_resource *
+v3d_resource_setup(struct pipe_screen *pscreen,
                    const struct pipe_resource *tmpl)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_resource *rsc = CALLOC_STRUCT(vc5_resource);
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        struct v3d_resource *rsc = CALLOC_STRUCT(v3d_resource);
         if (!rsc)
                 return NULL;
         struct pipe_resource *prsc = &rsc->base;
@@ -566,16 +578,18 @@
         prsc->screen = pscreen;
 
         if (prsc->nr_samples <= 1 ||
+            screen->devinfo.ver >= 40 ||
             util_format_is_depth_or_stencil(prsc->format)) {
-                rsc->cpp = util_format_get_blocksize(prsc->format) *
-                        MAX2(prsc->nr_samples, 1);
+                rsc->cpp = util_format_get_blocksize(prsc->format);
+                if (screen->devinfo.ver < 40 && prsc->nr_samples > 1)
+                        rsc->cpp *= prsc->nr_samples;
         } else {
-                assert(vc5_rt_format_supported(&screen->devinfo, prsc->format));
+                assert(v3d_rt_format_supported(&screen->devinfo, prsc->format));
                 uint32_t output_image_format =
-                        vc5_get_rt_format(&screen->devinfo, prsc->format);
+                        v3d_get_rt_format(&screen->devinfo, prsc->format);
                 uint32_t internal_type;
                 uint32_t internal_bpp;
-                vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                v3d_get_internal_type_bpp_for_output_format(&screen->devinfo,
                                                             output_image_format,
                                                             &internal_type,
                                                             &internal_bpp);
@@ -611,13 +625,13 @@
 }
 
 static struct pipe_resource *
-vc5_resource_create_with_modifiers(struct pipe_screen *pscreen,
+v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
                                    const struct pipe_resource *tmpl,
                                    const uint64_t *modifiers,
                                    int count)
 {
         bool linear_ok = find_modifier(DRM_FORMAT_MOD_LINEAR, modifiers, count);
-        struct vc5_resource *rsc = vc5_resource_setup(pscreen, tmpl);
+        struct v3d_resource *rsc = v3d_resource_setup(pscreen, tmpl);
         struct pipe_resource *prsc = &rsc->base;
         /* Use a tiled layout if we can, for better 3D performance. */
         bool should_tile = true;
@@ -639,7 +653,7 @@
         /* Scanout BOs for simulator need to be linear for interaction with
          * i965.
          */
-        if (using_vc5_simulator &&
+        if (using_v3d_simulator &&
             tmpl->bind & (PIPE_BIND_SHARED | PIPE_BIND_SCANOUT))
                 should_tile = false;
 
@@ -648,7 +662,7 @@
                 linear_ok = true;
                 rsc->tiled = should_tile;
         } else if (should_tile &&
-                   find_modifier(DRM_FORMAT_MOD_BROADCOM_VC4_T_TILED,
+                   find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF,
                                  modifiers, count)) {
                 rsc->tiled = true;
         } else if (linear_ok) {
@@ -660,34 +674,34 @@
 
         rsc->internal_format = prsc->format;
 
-        vc5_setup_slices(rsc);
-        if (!vc5_resource_bo_alloc(rsc))
+        v3d_setup_slices(rsc);
+        if (!v3d_resource_bo_alloc(rsc))
                 goto fail;
 
         return prsc;
 fail:
-        vc5_resource_destroy(pscreen, prsc);
+        v3d_resource_destroy(pscreen, prsc);
         return NULL;
 }
 
 struct pipe_resource *
-vc5_resource_create(struct pipe_screen *pscreen,
+v3d_resource_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *tmpl)
 {
         const uint64_t mod = DRM_FORMAT_MOD_INVALID;
-        return vc5_resource_create_with_modifiers(pscreen, tmpl, &mod, 1);
+        return v3d_resource_create_with_modifiers(pscreen, tmpl, &mod, 1);
 }
 
 static struct pipe_resource *
-vc5_resource_from_handle(struct pipe_screen *pscreen,
+v3d_resource_from_handle(struct pipe_screen *pscreen,
                          const struct pipe_resource *tmpl,
                          struct winsys_handle *whandle,
                          unsigned usage)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_resource *rsc = vc5_resource_setup(pscreen, tmpl);
+        struct v3d_screen *screen = v3d_screen(pscreen);
+        struct v3d_resource *rsc = v3d_resource_setup(pscreen, tmpl);
         struct pipe_resource *prsc = &rsc->base;
-        struct vc5_resource_slice *slice = &rsc->slices[0];
+        struct v3d_resource_slice *slice = &rsc->slices[0];
 
         if (!rsc)
                 return NULL;
@@ -696,7 +710,10 @@
         case DRM_FORMAT_MOD_LINEAR:
                 rsc->tiled = false;
                 break;
-        /* XXX: UIF */
+        case DRM_FORMAT_MOD_BROADCOM_UIF:
+        case DRM_FORMAT_MOD_INVALID:
+                rsc->tiled = true;
+                break;
         default:
                 fprintf(stderr,
                         "Attempt to import unsupported modifier 0x%llx\n",
@@ -712,12 +729,12 @@
         }
 
         switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
-                rsc->bo = vc5_bo_open_name(screen,
+        case WINSYS_HANDLE_TYPE_SHARED:
+                rsc->bo = v3d_bo_open_name(screen,
                                            whandle->handle, whandle->stride);
                 break;
-        case DRM_API_HANDLE_TYPE_FD:
-                rsc->bo = vc5_bo_open_dmabuf(screen,
+        case WINSYS_HANDLE_TYPE_FD:
+                rsc->bo = v3d_bo_open_dmabuf(screen,
                                              whandle->handle, whandle->stride);
                 break;
         default:
@@ -730,8 +747,10 @@
         if (!rsc->bo)
                 goto fail;
 
-        vc5_setup_slices(rsc);
-        vc5_debug_resource_layout(rsc, "import");
+        rsc->internal_format = prsc->format;
+
+        v3d_setup_slices(rsc);
+        v3d_debug_resource_layout(rsc, "import");
 
         if (whandle->stride != slice->stride) {
                 static bool warned = false;
@@ -751,19 +770,19 @@
         return prsc;
 
 fail:
-        vc5_resource_destroy(pscreen, prsc);
+        v3d_resource_destroy(pscreen, prsc);
         return NULL;
 }
 
 static struct pipe_surface *
-vc5_create_surface(struct pipe_context *pctx,
+v3d_create_surface(struct pipe_context *pctx,
                    struct pipe_resource *ptex,
                    const struct pipe_surface *surf_tmpl)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_screen *screen = vc5->screen;
-        struct vc5_surface *surface = CALLOC_STRUCT(vc5_surface);
-        struct vc5_resource *rsc = vc5_resource(ptex);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_screen *screen = v3d->screen;
+        struct v3d_surface *surface = CALLOC_STRUCT(v3d_surface);
+        struct v3d_resource *rsc = v3d_resource(ptex);
 
         if (!surface)
                 return NULL;
@@ -772,7 +791,7 @@
 
         struct pipe_surface *psurf = &surface->base;
         unsigned level = surf_tmpl->u.tex.level;
-        struct vc5_resource_slice *slice = &rsc->slices[level];
+        struct v3d_resource_slice *slice = &rsc->slices[level];
 
         pipe_reference_init(&psurf->reference, 1);
         pipe_resource_reference(&psurf->texture, ptex);
@@ -785,11 +804,11 @@
         psurf->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
         psurf->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
 
-        surface->offset = vc5_layer_offset(ptex, level,
+        surface->offset = v3d_layer_offset(ptex, level,
                                            psurf->u.tex.first_layer);
         surface->tiling = slice->tiling;
 
-        surface->format = vc5_get_rt_format(&screen->devinfo, psurf->format);
+        surface->format = v3d_get_rt_format(&screen->devinfo, psurf->format);
 
         if (util_format_is_depth_or_stencil(psurf->format)) {
                 switch (psurf->format) {
@@ -805,7 +824,7 @@
                 }
         } else {
                 uint32_t bpp, type;
-                vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                v3d_get_internal_type_bpp_for_output_format(&screen->devinfo,
                                                             surface->format,
                                                             &type, &bpp);
                 surface->internal_type = type;
@@ -816,12 +835,12 @@
             surface->tiling == VC5_TILING_UIF_XOR) {
                 surface->padded_height_of_output_image_in_uif_blocks =
                         (slice->padded_height /
-                         (2 * vc5_utile_height(rsc->cpp)));
+                         (2 * v3d_utile_height(rsc->cpp)));
         }
 
         if (rsc->separate_stencil) {
                 surface->separate_stencil =
-                        vc5_create_surface(pctx, &rsc->separate_stencil->base,
+                        v3d_create_surface(pctx, &rsc->separate_stencil->base,
                                            surf_tmpl);
         }
 
@@ -829,9 +848,9 @@
 }
 
 static void
-vc5_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf)
+v3d_surface_destroy(struct pipe_context *pctx, struct pipe_surface *psurf)
 {
-        struct vc5_surface *surf = vc5_surface(psurf);
+        struct v3d_surface *surf = v3d_surface(psurf);
 
         if (surf->separate_stencil)
                 pipe_surface_reference(&surf->separate_stencil, NULL);
@@ -841,7 +860,7 @@
 }
 
 static void
-vc5_flush_resource(struct pipe_context *pctx, struct pipe_resource *resource)
+v3d_flush_resource(struct pipe_context *pctx, struct pipe_resource *resource)
 {
         /* All calls to flush_resource are followed by a flush of the context,
          * so there's nothing to do.
@@ -849,61 +868,61 @@
 }
 
 static enum pipe_format
-vc5_resource_get_internal_format(struct pipe_resource *prsc)
+v3d_resource_get_internal_format(struct pipe_resource *prsc)
 {
-        return vc5_resource(prsc)->internal_format;
+        return v3d_resource(prsc)->internal_format;
 }
 
 static void
-vc5_resource_set_stencil(struct pipe_resource *prsc,
+v3d_resource_set_stencil(struct pipe_resource *prsc,
                          struct pipe_resource *stencil)
 {
-        vc5_resource(prsc)->separate_stencil = vc5_resource(stencil);
+        v3d_resource(prsc)->separate_stencil = v3d_resource(stencil);
 }
 
 static struct pipe_resource *
-vc5_resource_get_stencil(struct pipe_resource *prsc)
+v3d_resource_get_stencil(struct pipe_resource *prsc)
 {
-        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct v3d_resource *rsc = v3d_resource(prsc);
 
         return &rsc->separate_stencil->base;
 }
 
 static const struct u_transfer_vtbl transfer_vtbl = {
-        .resource_create          = vc5_resource_create,
-        .resource_destroy         = vc5_resource_destroy,
-        .transfer_map             = vc5_resource_transfer_map,
-        .transfer_unmap           = vc5_resource_transfer_unmap,
+        .resource_create          = v3d_resource_create,
+        .resource_destroy         = v3d_resource_destroy,
+        .transfer_map             = v3d_resource_transfer_map,
+        .transfer_unmap           = v3d_resource_transfer_unmap,
         .transfer_flush_region    = u_default_transfer_flush_region,
-        .get_internal_format      = vc5_resource_get_internal_format,
-        .set_stencil              = vc5_resource_set_stencil,
-        .get_stencil              = vc5_resource_get_stencil,
+        .get_internal_format      = v3d_resource_get_internal_format,
+        .set_stencil              = v3d_resource_set_stencil,
+        .get_stencil              = v3d_resource_get_stencil,
 };
 
 void
-vc5_resource_screen_init(struct pipe_screen *pscreen)
+v3d_resource_screen_init(struct pipe_screen *pscreen)
 {
         pscreen->resource_create_with_modifiers =
-                vc5_resource_create_with_modifiers;
+                v3d_resource_create_with_modifiers;
         pscreen->resource_create = u_transfer_helper_resource_create;
-        pscreen->resource_from_handle = vc5_resource_from_handle;
-        pscreen->resource_get_handle = vc5_resource_get_handle;
+        pscreen->resource_from_handle = v3d_resource_from_handle;
+        pscreen->resource_get_handle = v3d_resource_get_handle;
         pscreen->resource_destroy = u_transfer_helper_resource_destroy;
         pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl,
                                                             true, true, true);
 }
 
 void
-vc5_resource_context_init(struct pipe_context *pctx)
+v3d_resource_context_init(struct pipe_context *pctx)
 {
         pctx->transfer_map = u_transfer_helper_transfer_map;
         pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;
         pctx->transfer_unmap = u_transfer_helper_transfer_unmap;
         pctx->buffer_subdata = u_default_buffer_subdata;
         pctx->texture_subdata = u_default_texture_subdata;
-        pctx->create_surface = vc5_create_surface;
-        pctx->surface_destroy = vc5_surface_destroy;
+        pctx->create_surface = v3d_create_surface;
+        pctx->surface_destroy = v3d_surface_destroy;
         pctx->resource_copy_region = util_resource_copy_region;
-        pctx->blit = vc5_blit;
-        pctx->flush_resource = vc5_flush_resource;
+        pctx->blit = v3d_blit;
+        pctx->flush_resource = v3d_flush_resource;
 }
diff --git a/src/gallium/drivers/vc5/vc5_resource.h b/src/gallium/drivers/v3d/v3d_resource.h
similarity index 81%
rename from src/gallium/drivers/vc5/vc5_resource.h
rename to src/gallium/drivers/v3d/v3d_resource.h
index 2af3553..141c4ca 100644
--- a/src/gallium/drivers/vc5/vc5_resource.h
+++ b/src/gallium/drivers/v3d/v3d_resource.h
@@ -25,7 +25,7 @@
 #ifndef VC5_RESOURCE_H
 #define VC5_RESOURCE_H
 
-#include "vc5_screen.h"
+#include "v3d_screen.h"
 #include "util/u_transfer.h"
 
 /* A UIFblock is a 256-byte region of memory that's 256-byte aligned.  These
@@ -38,10 +38,10 @@
  */
 
 /**
- * Tiling mode enum used for vc5_resource.c, which maps directly to the Memory
+ * Tiling mode enum used for v3d_resource.c, which maps directly to the Memory
  * Format field of render target and Z/Stencil config.
  */
-enum vc5_tiling_mode {
+enum v3d_tiling_mode {
         /* Untiled resources.  Not valid as texture inputs. */
         VC5_TILING_RASTER,
 
@@ -65,12 +65,12 @@
         VC5_TILING_UIF_XOR,
 };
 
-struct vc5_transfer {
+struct v3d_transfer {
         struct pipe_transfer base;
         void *map;
 };
 
-struct vc5_resource_slice {
+struct v3d_resource_slice {
         uint32_t offset;
         uint32_t stride;
         uint32_t padded_height;
@@ -80,13 +80,13 @@
          */
         uint32_t size;
         uint8_t ub_pad;
-        enum vc5_tiling_mode tiling;
+        enum v3d_tiling_mode tiling;
 };
 
-struct vc5_surface {
+struct v3d_surface {
         struct pipe_surface base;
         uint32_t offset;
-        enum vc5_tiling_mode tiling;
+        enum v3d_tiling_mode tiling;
         /**
          * Output image format for TILE_RENDERING_MODE_CONFIGURATION
          */
@@ -112,10 +112,10 @@
         struct pipe_surface *separate_stencil;
 };
 
-struct vc5_resource {
+struct v3d_resource {
         struct pipe_resource base;
-        struct vc5_bo *bo;
-        struct vc5_resource_slice slices[VC5_MAX_MIP_LEVELS];
+        struct v3d_bo *bo;
+        struct v3d_resource_slice slices[VC5_MAX_MIP_LEVELS];
         uint32_t cube_map_stride;
         uint32_t size;
         int cpp;
@@ -143,32 +143,32 @@
         enum pipe_format internal_format;
 
         /* Resource storing the S8 part of a Z32F_S8 resource, or NULL. */
-        struct vc5_resource *separate_stencil;
+        struct v3d_resource *separate_stencil;
 };
 
-static inline struct vc5_resource *
-vc5_resource(struct pipe_resource *prsc)
+static inline struct v3d_resource *
+v3d_resource(struct pipe_resource *prsc)
 {
-        return (struct vc5_resource *)prsc;
+        return (struct v3d_resource *)prsc;
 }
 
-static inline struct vc5_surface *
-vc5_surface(struct pipe_surface *psurf)
+static inline struct v3d_surface *
+v3d_surface(struct pipe_surface *psurf)
 {
-        return (struct vc5_surface *)psurf;
+        return (struct v3d_surface *)psurf;
 }
 
-static inline struct vc5_transfer *
-vc5_transfer(struct pipe_transfer *ptrans)
+static inline struct v3d_transfer *
+v3d_transfer(struct pipe_transfer *ptrans)
 {
-        return (struct vc5_transfer *)ptrans;
+        return (struct v3d_transfer *)ptrans;
 }
 
-void vc5_resource_screen_init(struct pipe_screen *pscreen);
-void vc5_resource_context_init(struct pipe_context *pctx);
-struct pipe_resource *vc5_resource_create(struct pipe_screen *pscreen,
+void v3d_resource_screen_init(struct pipe_screen *pscreen);
+void v3d_resource_context_init(struct pipe_context *pctx);
+struct pipe_resource *v3d_resource_create(struct pipe_screen *pscreen,
                                           const struct pipe_resource *tmpl);
-uint32_t vc5_layer_offset(struct pipe_resource *prsc, uint32_t level,
+uint32_t v3d_layer_offset(struct pipe_resource *prsc, uint32_t level,
                           uint32_t layer);
 
 
diff --git a/src/gallium/drivers/vc5/vc5_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
similarity index 85%
rename from src/gallium/drivers/vc5/vc5_screen.c
rename to src/gallium/drivers/v3d/v3d_screen.c
index 271c2c8..2f08479 100644
--- a/src/gallium/drivers/vc5/vc5_screen.c
+++ b/src/gallium/drivers/v3d/v3d_screen.c
@@ -34,16 +34,15 @@
 #include "util/ralloc.h"
 
 #include <xf86drm.h>
-#include "vc5_drm.h"
-#include "vc5_screen.h"
-#include "vc5_context.h"
-#include "vc5_resource.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
 #include "compiler/v3d_compiler.h"
 
 static const char *
-vc5_screen_get_name(struct pipe_screen *pscreen)
+v3d_screen_get_name(struct pipe_screen *pscreen)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct v3d_screen *screen = v3d_screen(pscreen);
 
         if (!screen->name) {
                 screen->name = ralloc_asprintf(screen,
@@ -56,22 +55,22 @@
 }
 
 static const char *
-vc5_screen_get_vendor(struct pipe_screen *pscreen)
+v3d_screen_get_vendor(struct pipe_screen *pscreen)
 {
         return "Broadcom";
 }
 
 static void
-vc5_screen_destroy(struct pipe_screen *pscreen)
+v3d_screen_destroy(struct pipe_screen *pscreen)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct v3d_screen *screen = v3d_screen(pscreen);
 
         util_hash_table_destroy(screen->bo_handles);
-        vc5_bufmgr_destroy(pscreen);
+        v3d_bufmgr_destroy(pscreen);
         slab_destroy_parent(&screen->transfer_pool);
 
-        if (using_vc5_simulator)
-                vc5_simulator_destroy(screen);
+        if (using_v3d_simulator)
+                v3d_simulator_destroy(screen);
 
         v3d_compiler_free(screen->compiler);
 
@@ -80,9 +79,9 @@
 }
 
 static int
-vc5_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct v3d_screen *screen = v3d_screen(pscreen);
 
         switch (param) {
                 /* Supported features (boolean caps). */
@@ -110,6 +109,7 @@
         case PIPE_CAP_DRAW_INDIRECT:
         case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
         case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
+        case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
                 return 1;
 
         case PIPE_CAP_INDEP_BLEND_ENABLE:
@@ -124,6 +124,9 @@
         case PIPE_CAP_GLSL_FEATURE_LEVEL:
                 return 400;
 
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+		return 140;
+
         case PIPE_CAP_MAX_VIEWPORTS:
                 return 1;
 
@@ -258,9 +261,17 @@
         case PIPE_CAP_TILE_RASTER_ORDER:
         case PIPE_CAP_STREAM_OUTPUT_INTERLEAVE_BUFFERS:
         case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
         case PIPE_CAP_CONTEXT_PRIORITY_MASK:
-	case PIPE_CAP_CONSTBUF0_FLAGS:
+        case PIPE_CAP_CONSTBUF0_FLAGS:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
         case PIPE_CAP_PACKED_UNIFORMS:
+        case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
                 return 0;
 
                 /* Geometry shader output, unsupported. */
@@ -318,7 +329,7 @@
 }
 
 static float
-vc5_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
+v3d_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 {
         switch (param) {
         case PIPE_CAPF_MAX_LINE_WIDTH:
@@ -333,6 +344,11 @@
                 return 0.0f;
         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
                 return 16.0f;
+
+        case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+                return 0.0f;
         default:
                 fprintf(stderr, "unknown paramf %d\n", param);
                 return 0;
@@ -340,7 +356,7 @@
 }
 
 static int
-vc5_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
+v3d_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
                            enum pipe_shader_cap param)
 {
         if (shader != PIPE_SHADER_VERTEX &&
@@ -363,7 +379,7 @@
                 if (shader == PIPE_SHADER_FRAGMENT)
                         return VC5_MAX_FS_INPUTS / 4;
                 else
-                        return 16;
+                        return VC5_MAX_ATTRIBUTES;
         case PIPE_SHADER_CAP_MAX_OUTPUTS:
                 if (shader == PIPE_SHADER_FRAGMENT)
                         return 4;
@@ -397,6 +413,8 @@
         case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
         case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
                 return 0;
+        case PIPE_SHADER_CAP_SCALAR_ISA:
+                return 1;
         case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
         case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
         case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
@@ -419,19 +437,22 @@
 }
 
 static boolean
-vc5_screen_is_format_supported(struct pipe_screen *pscreen,
+v3d_screen_is_format_supported(struct pipe_screen *pscreen,
                                enum pipe_format format,
                                enum pipe_texture_target target,
                                unsigned sample_count,
+                               unsigned storage_sample_count,
                                unsigned usage)
 {
-        struct vc5_screen *screen = vc5_screen(pscreen);
+        struct v3d_screen *screen = v3d_screen(pscreen);
+
+        if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+                return false;
 
         if (sample_count > 1 && sample_count != VC5_MAX_SAMPLES)
                 return FALSE;
 
-        if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-            !util_format_is_supported(format, usage)) {
+        if (target >= PIPE_MAX_TEXTURE_TYPES) {
                 return FALSE;
         }
 
@@ -496,12 +517,12 @@
         }
 
         if ((usage & PIPE_BIND_RENDER_TARGET) &&
-            !vc5_rt_format_supported(&screen->devinfo, format)) {
+            !v3d_rt_format_supported(&screen->devinfo, format)) {
                 return FALSE;
         }
 
         if ((usage & PIPE_BIND_SAMPLER_VIEW) &&
-            !vc5_tex_format_supported(&screen->devinfo, format)) {
+            !v3d_tex_format_supported(&screen->devinfo, format)) {
                 return FALSE;
         }
 
@@ -537,23 +558,23 @@
 }
 
 static bool
-vc5_get_device_info(struct vc5_screen *screen)
+v3d_get_device_info(struct v3d_screen *screen)
 {
-        struct drm_vc5_get_param ident0 = {
-                .param = DRM_VC5_PARAM_V3D_CORE0_IDENT0,
+        struct drm_v3d_get_param ident0 = {
+                .param = DRM_V3D_PARAM_V3D_CORE0_IDENT0,
         };
-        struct drm_vc5_get_param ident1 = {
-                .param = DRM_VC5_PARAM_V3D_CORE0_IDENT1,
+        struct drm_v3d_get_param ident1 = {
+                .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
         };
         int ret;
 
-        ret = vc5_ioctl(screen->fd, DRM_IOCTL_VC5_GET_PARAM, &ident0);
+        ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
         if (ret != 0) {
                 fprintf(stderr, "Couldn't get V3D core IDENT0: %s\n",
                         strerror(errno));
                 return false;
         }
-        ret = vc5_ioctl(screen->fd, DRM_IOCTL_VC5_GET_PARAM, &ident1);
+        ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_GET_PARAM, &ident1);
         if (ret != 0) {
                 fprintf(stderr, "Couldn't get V3D core IDENT1: %s\n",
                         strerror(errno));
@@ -564,6 +585,8 @@
         uint32_t minor = (ident1.value >> 0) & 0xf;
         screen->devinfo.ver = major * 10 + minor;
 
+        screen->devinfo.vpm_size = (ident1.value >> 28 & 0xf) * 8192;
+
         switch (screen->devinfo.ver) {
         case 33:
         case 41:
@@ -581,53 +604,53 @@
 }
 
 static const void *
-vc5_screen_get_compiler_options(struct pipe_screen *pscreen,
+v3d_screen_get_compiler_options(struct pipe_screen *pscreen,
                                 enum pipe_shader_ir ir, unsigned shader)
 {
         return &v3d_nir_options;
 }
 
 struct pipe_screen *
-vc5_screen_create(int fd)
+v3d_screen_create(int fd)
 {
-        struct vc5_screen *screen = rzalloc(NULL, struct vc5_screen);
+        struct v3d_screen *screen = rzalloc(NULL, struct v3d_screen);
         struct pipe_screen *pscreen;
 
         pscreen = &screen->base;
 
-        pscreen->destroy = vc5_screen_destroy;
-        pscreen->get_param = vc5_screen_get_param;
-        pscreen->get_paramf = vc5_screen_get_paramf;
-        pscreen->get_shader_param = vc5_screen_get_shader_param;
-        pscreen->context_create = vc5_context_create;
-        pscreen->is_format_supported = vc5_screen_is_format_supported;
+        pscreen->destroy = v3d_screen_destroy;
+        pscreen->get_param = v3d_screen_get_param;
+        pscreen->get_paramf = v3d_screen_get_paramf;
+        pscreen->get_shader_param = v3d_screen_get_shader_param;
+        pscreen->context_create = v3d_context_create;
+        pscreen->is_format_supported = v3d_screen_is_format_supported;
 
         screen->fd = fd;
         list_inithead(&screen->bo_cache.time_list);
         (void)mtx_init(&screen->bo_handles_mutex, mtx_plain);
         screen->bo_handles = util_hash_table_create(handle_hash, handle_compare);
 
-#if defined(USE_VC5_SIMULATOR)
-        vc5_simulator_init(screen);
+#if defined(USE_V3D_SIMULATOR)
+        v3d_simulator_init(screen);
 #endif
 
-        if (!vc5_get_device_info(screen))
+        if (!v3d_get_device_info(screen))
                 goto fail;
 
-        slab_create_parent(&screen->transfer_pool, sizeof(struct vc5_transfer), 16);
+        slab_create_parent(&screen->transfer_pool, sizeof(struct v3d_transfer), 16);
 
-        vc5_fence_init(screen);
+        v3d_fence_init(screen);
 
         v3d_process_debug_variable();
 
-        vc5_resource_screen_init(pscreen);
+        v3d_resource_screen_init(pscreen);
 
         screen->compiler = v3d_compiler_init(&screen->devinfo);
 
-        pscreen->get_name = vc5_screen_get_name;
-        pscreen->get_vendor = vc5_screen_get_vendor;
-        pscreen->get_device_vendor = vc5_screen_get_vendor;
-        pscreen->get_compiler_options = vc5_screen_get_compiler_options;
+        pscreen->get_name = v3d_screen_get_name;
+        pscreen->get_vendor = v3d_screen_get_vendor;
+        pscreen->get_device_vendor = v3d_screen_get_vendor;
+        pscreen->get_compiler_options = v3d_screen_get_compiler_options;
 
         return pscreen;
 
diff --git a/src/gallium/drivers/vc5/vc5_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
similarity index 82%
rename from src/gallium/drivers/vc5/vc5_screen.h
rename to src/gallium/drivers/v3d/v3d_screen.h
index 9a7c11a..4d30ef3 100644
--- a/src/gallium/drivers/vc5/vc5_screen.h
+++ b/src/gallium/drivers/v3d/v3d_screen.h
@@ -32,7 +32,7 @@
 #include "broadcom/common/v3d_debug.h"
 #include "broadcom/common/v3d_device_info.h"
 
-struct vc5_bo;
+struct v3d_bo;
 
 #define VC5_MAX_MIP_LEVELS 12
 #define VC5_MAX_TEXTURE_SAMPLERS 32
@@ -51,9 +51,9 @@
 #define VC5_UIFBLOCK_SIZE (4 * VC5_UBLOCK_SIZE)
 #define VC5_UIFBLOCK_ROW_SIZE (4 * VC5_UIFBLOCK_SIZE)
 
-struct vc5_simulator_file;
+struct v3d_simulator_file;
 
-struct vc5_screen {
+struct v3d_screen {
         struct pipe_screen base;
         int fd;
 
@@ -63,17 +63,14 @@
 
         struct slab_parent_pool transfer_pool;
 
-        struct vc5_bo_cache {
-                /** List of struct vc5_bo freed, by age. */
+        struct v3d_bo_cache {
+                /** List of struct v3d_bo freed, by age. */
                 struct list_head time_list;
-                /** List of struct vc5_bo freed, per size, by age. */
+                /** List of struct v3d_bo freed, per size, by age. */
                 struct list_head *size_list;
                 uint32_t size_list_size;
 
                 mtx_t lock;
-
-                uint32_t bo_size;
-                uint32_t bo_count;
         } bo_cache;
 
         const struct v3d_compiler *compiler;
@@ -84,18 +81,18 @@
         uint32_t bo_size;
         uint32_t bo_count;
 
-        struct vc5_simulator_file *sim_file;
+        struct v3d_simulator_file *sim_file;
 };
 
-static inline struct vc5_screen *
-vc5_screen(struct pipe_screen *screen)
+static inline struct v3d_screen *
+v3d_screen(struct pipe_screen *screen)
 {
-        return (struct vc5_screen *)screen;
+        return (struct v3d_screen *)screen;
 }
 
-struct pipe_screen *vc5_screen_create(int fd);
+struct pipe_screen *v3d_screen_create(int fd);
 
 void
-vc5_fence_init(struct vc5_screen *screen);
+v3d_fence_init(struct v3d_screen *screen);
 
 #endif /* VC5_SCREEN_H */
diff --git a/src/gallium/drivers/vc5/vc5_simulator.c b/src/gallium/drivers/v3d/v3d_simulator.c
similarity index 75%
rename from src/gallium/drivers/vc5/vc5_simulator.c
rename to src/gallium/drivers/v3d/v3d_simulator.c
index d677293..8ef88db 100644
--- a/src/gallium/drivers/vc5/vc5_simulator.c
+++ b/src/gallium/drivers/v3d/v3d_simulator.c
@@ -22,7 +22,7 @@
  */
 
 /**
- * @file vc5_simulator.c
+ * @file v3d_simulator.c
  *
  * Implements VC5 simulation on top of a non-VC5 GEM fd.
  *
@@ -46,7 +46,7 @@
  * BOs).
  */
 
-#ifdef USE_VC5_SIMULATOR
+#ifdef USE_V3D_SIMULATOR
 
 #include <sys/mman.h>
 #include "util/hash_table.h"
@@ -54,13 +54,13 @@
 #include "util/set.h"
 #include "util/u_memory.h"
 #include "util/u_mm.h"
-#include "vc5_simulator_wrapper.h"
+#include "v3d_simulator_wrapper.h"
 
-#include "vc5_screen.h"
-#include "vc5_context.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
 
 /** Global (across GEM fds) state for the simulator */
-static struct vc5_simulator_state {
+static struct v3d_simulator_state {
         mtx_t mutex;
 
         struct v3d_hw *v3d;
@@ -76,7 +76,7 @@
         struct mem_block *heap;
         struct mem_block *overflow;
 
-        /** Mapping from GEM handle to struct vc5_simulator_bo * */
+        /** Mapping from GEM handle to struct v3d_simulator_bo * */
         struct hash_table *fd_map;
 
         int refcount;
@@ -85,19 +85,19 @@
 };
 
 /** Per-GEM-fd state for the simulator. */
-struct vc5_simulator_file {
+struct v3d_simulator_file {
         int fd;
 
-        /** Mapping from GEM handle to struct vc5_simulator_bo * */
+        /** Mapping from GEM handle to struct v3d_simulator_bo * */
         struct hash_table *bo_map;
 
         struct mem_block *gmp;
         void *gmp_vaddr;
 };
 
-/** Wrapper for drm_vc5_bo tracking the simulator-specific state. */
-struct vc5_simulator_bo {
-        struct vc5_simulator_file *file;
+/** Wrapper for drm_v3d_bo tracking the simulator-specific state. */
+struct v3d_simulator_bo {
+        struct v3d_simulator_file *file;
 
         /** Area for this BO within sim_state->mem */
         struct mem_block *block;
@@ -116,8 +116,8 @@
         return (void *)(uintptr_t)key;
 }
 
-static struct vc5_simulator_file *
-vc5_get_simulator_file_for_fd(int fd)
+static struct v3d_simulator_file *
+v3d_get_simulator_file_for_fd(int fd)
 {
         struct hash_entry *entry = _mesa_hash_table_search(sim_state.fd_map,
                                                            int_to_key(fd + 1));
@@ -137,7 +137,7 @@
  * permissions (bit 0 = read, bit 1 = write, write-only forbidden).
  */
 static void
-set_gmp_flags(struct vc5_simulator_file *file,
+set_gmp_flags(struct v3d_simulator_file *file,
               uint32_t offset, uint32_t size, uint32_t flag)
 {
         assert((offset & ((1 << GMP_ALIGN2) - 1)) == 0);
@@ -158,12 +158,12 @@
  * Allocates space in simulator memory and returns a tracking struct for it
  * that also contains the drm_gem_cma_object struct.
  */
-static struct vc5_simulator_bo *
-vc5_create_simulator_bo(int fd, int handle, unsigned size)
+static struct v3d_simulator_bo *
+v3d_create_simulator_bo(int fd, int handle, unsigned size)
 {
-        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
-        struct vc5_simulator_bo *sim_bo = rzalloc(file,
-                                                  struct vc5_simulator_bo);
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        struct v3d_simulator_bo *sim_bo = rzalloc(file,
+                                                  struct v3d_simulator_bo);
         size = align(size, 4096);
 
         sim_bo->file = file;
@@ -182,7 +182,7 @@
 
         *(uint32_t *)(sim_bo->vaddr + sim_bo->size) = BO_SENTINEL;
 
-        /* A handle of 0 is used for vc5_gem.c internal allocations that
+        /* A handle of 0 is used for v3d_gem.c internal allocations that
          * don't need to go in the lookup table.
          */
         if (handle != 0) {
@@ -196,9 +196,9 @@
 }
 
 static void
-vc5_free_simulator_bo(struct vc5_simulator_bo *sim_bo)
+v3d_free_simulator_bo(struct v3d_simulator_bo *sim_bo)
 {
-        struct vc5_simulator_file *sim_file = sim_bo->file;
+        struct v3d_simulator_file *sim_file = sim_bo->file;
 
         if (sim_bo->winsys_map)
                 munmap(sim_bo->winsys_map, sim_bo->size);
@@ -217,8 +217,8 @@
         ralloc_free(sim_bo);
 }
 
-static struct vc5_simulator_bo *
-vc5_get_simulator_bo(struct vc5_simulator_file *file, int gem_handle)
+static struct v3d_simulator_bo *
+v3d_get_simulator_bo(struct v3d_simulator_file *file, int gem_handle)
 {
         mtx_lock(&sim_state.mutex);
         struct hash_entry *entry =
@@ -229,17 +229,17 @@
 }
 
 static int
-vc5_simulator_pin_bos(int fd, struct vc5_job *job)
+v3d_simulator_pin_bos(int fd, struct v3d_job *job)
 {
-        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
         struct set_entry *entry;
 
         set_foreach(job->bos, entry) {
-                struct vc5_bo *bo = (struct vc5_bo *)entry->key;
-                struct vc5_simulator_bo *sim_bo =
-                        vc5_get_simulator_bo(file, bo->handle);
+                struct v3d_bo *bo = (struct v3d_bo *)entry->key;
+                struct v3d_simulator_bo *sim_bo =
+                        v3d_get_simulator_bo(file, bo->handle);
 
-                vc5_bo_map(bo);
+                v3d_bo_map(bo);
                 memcpy(sim_bo->vaddr, bo->map, bo->size);
         }
 
@@ -247,22 +247,22 @@
 }
 
 static int
-vc5_simulator_unpin_bos(int fd, struct vc5_job *job)
+v3d_simulator_unpin_bos(int fd, struct v3d_job *job)
 {
-        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
         struct set_entry *entry;
 
         set_foreach(job->bos, entry) {
-                struct vc5_bo *bo = (struct vc5_bo *)entry->key;
-                struct vc5_simulator_bo *sim_bo =
-                        vc5_get_simulator_bo(file, bo->handle);
+                struct v3d_bo *bo = (struct v3d_bo *)entry->key;
+                struct v3d_simulator_bo *sim_bo =
+                        v3d_get_simulator_bo(file, bo->handle);
 
                 if (*(uint32_t *)(sim_bo->vaddr +
                                   sim_bo->size) != BO_SENTINEL) {
                         fprintf(stderr, "Buffer overflow in %s\n", bo->name);
                 }
 
-                vc5_bo_map(bo);
+                v3d_bo_map(bo);
                 memcpy(bo->map, sim_bo->vaddr, bo->size);
         }
 
@@ -271,20 +271,20 @@
 
 #if 0
 static void
-vc5_dump_to_file(struct vc5_exec_info *exec)
+v3d_dump_to_file(struct v3d_exec_info *exec)
 {
         static int dumpno = 0;
-        struct drm_vc5_get_hang_state *state;
-        struct drm_vc5_get_hang_state_bo *bo_state;
+        struct drm_v3d_get_hang_state *state;
+        struct drm_v3d_get_hang_state_bo *bo_state;
         unsigned int dump_version = 0;
 
-        if (!(vc5_debug & VC5_DEBUG_DUMP))
+        if (!(v3d_debug & VC5_DEBUG_DUMP))
                 return;
 
         state = calloc(1, sizeof(*state));
 
         int unref_count = 0;
-        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+        list_for_each_entry_safe(struct drm_v3d_bo, bo, &exec->unref_list,
                                  unref_head) {
                 unref_count++;
         }
@@ -294,7 +294,7 @@
         bo_state = calloc(state->bo_count, sizeof(*bo_state));
 
         char *filename = NULL;
-        asprintf(&filename, "vc5-dri-%d.dump", dumpno++);
+        asprintf(&filename, "v3d-dri-%d.dump", dumpno++);
         FILE *f = fopen(filename, "w+");
         if (!f) {
                 fprintf(stderr, "Couldn't open %s: %s", filename,
@@ -320,7 +320,7 @@
                 bo_state[i].size = cma_bo->base.size;
         }
 
-        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+        list_for_each_entry_safe(struct drm_v3d_bo, bo, &exec->unref_list,
                                  unref_head) {
                 struct drm_gem_cma_object *cma_bo = &bo->base;
                 bo_state[i].handle = 0;
@@ -342,7 +342,7 @@
                 fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
         }
 
-        list_for_each_entry_safe(struct drm_vc5_bo, bo, &exec->unref_list,
+        list_for_each_entry_safe(struct drm_v3d_bo, bo, &exec->unref_list,
                                  unref_head) {
                 struct drm_gem_cma_object *cma_bo = &bo->base;
                 fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
@@ -359,15 +359,15 @@
 #endif
 
 int
-vc5_simulator_flush(struct vc5_context *vc5,
-                    struct drm_vc5_submit_cl *submit, struct vc5_job *job)
+v3d_simulator_flush(struct v3d_context *v3d,
+                    struct drm_v3d_submit_cl *submit, struct v3d_job *job)
 {
-        struct vc5_screen *screen = vc5->screen;
+        struct v3d_screen *screen = v3d->screen;
         int fd = screen->fd;
-        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
-        struct vc5_surface *csurf = vc5_surface(vc5->framebuffer.cbufs[0]);
-        struct vc5_resource *ctex = csurf ? vc5_resource(csurf->base.texture) : NULL;
-        struct vc5_simulator_bo *csim_bo = ctex ? vc5_get_simulator_bo(file, ctex->bo->handle) : NULL;
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        struct v3d_surface *csurf = v3d_surface(v3d->framebuffer.cbufs[0]);
+        struct v3d_resource *ctex = csurf ? v3d_resource(csurf->base.texture) : NULL;
+        struct v3d_simulator_bo *csim_bo = ctex ? v3d_get_simulator_bo(file, ctex->bo->handle) : NULL;
         uint32_t winsys_stride = ctex ? csim_bo->winsys_stride : 0;
         uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0;
         uint32_t row_len = MIN2(sim_stride, winsys_stride);
@@ -389,18 +389,18 @@
                 }
         }
 
-        ret = vc5_simulator_pin_bos(fd, job);
+        ret = v3d_simulator_pin_bos(fd, job);
         if (ret)
                 return ret;
 
-        //vc5_dump_to_file(&exec);
+        //v3d_dump_to_file(&exec);
 
         if (sim_state.ver >= 41)
                 v3d41_simulator_flush(sim_state.v3d, submit, file->gmp->ofs);
         else
                 v3d33_simulator_flush(sim_state.v3d, submit, file->gmp->ofs);
 
-        ret = vc5_simulator_unpin_bos(fd, job);
+        ret = v3d_simulator_unpin_bos(fd, job);
         if (ret)
                 return ret;
 
@@ -419,7 +419,7 @@
  * Map the underlying GEM object from the real hardware GEM handle.
  */
 static void *
-vc5_simulator_map_winsys_bo(int fd, struct vc5_simulator_bo *sim_bo)
+v3d_simulator_map_winsys_bo(int fd, struct v3d_simulator_bo *sim_bo)
 {
         int ret;
         void *map;
@@ -453,14 +453,14 @@
  * time, but we're still using drmPrimeFDToHandle() so we have this helper to
  * be called afterward instead.
  */
-void vc5_simulator_open_from_handle(int fd, uint32_t winsys_stride,
+void v3d_simulator_open_from_handle(int fd, uint32_t winsys_stride,
                                     int handle, uint32_t size)
 {
-        struct vc5_simulator_bo *sim_bo =
-                vc5_create_simulator_bo(fd, handle, size);
+        struct v3d_simulator_bo *sim_bo =
+                v3d_create_simulator_bo(fd, handle, size);
 
         sim_bo->winsys_stride = winsys_stride;
-        sim_bo->winsys_map = vc5_simulator_map_winsys_bo(fd, sim_bo);
+        sim_bo->winsys_map = v3d_simulator_map_winsys_bo(fd, sim_bo);
 }
 
 /**
@@ -469,7 +469,7 @@
  * Making a VC5 BO is just a matter of making a corresponding BO on the host.
  */
 static int
-vc5_simulator_create_bo_ioctl(int fd, struct drm_vc5_create_bo *args)
+v3d_simulator_create_bo_ioctl(int fd, struct drm_v3d_create_bo *args)
 {
         int ret;
         struct drm_mode_create_dumb create = {
@@ -483,8 +483,8 @@
 
         args->handle = create.handle;
 
-        struct vc5_simulator_bo *sim_bo =
-                vc5_create_simulator_bo(fd, create.handle, args->size);
+        struct v3d_simulator_bo *sim_bo =
+                v3d_create_simulator_bo(fd, create.handle, args->size);
 
         args->offset = sim_bo->block->ofs;
 
@@ -497,7 +497,7 @@
  * We just pass this straight through to dumb mmap.
  */
 static int
-vc5_simulator_mmap_bo_ioctl(int fd, struct drm_vc5_mmap_bo *args)
+v3d_simulator_mmap_bo_ioctl(int fd, struct drm_v3d_mmap_bo *args)
 {
         int ret;
         struct drm_mode_map_dumb map = {
@@ -511,21 +511,33 @@
 }
 
 static int
-vc5_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+v3d_simulator_get_bo_offset_ioctl(int fd, struct drm_v3d_get_bo_offset *args)
 {
-        /* Free the simulator's internal tracking. */
-        struct vc5_simulator_file *file = vc5_get_simulator_file_for_fd(fd);
-        struct vc5_simulator_bo *sim_bo = vc5_get_simulator_bo(file,
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        struct v3d_simulator_bo *sim_bo = v3d_get_simulator_bo(file,
                                                                args->handle);
 
-        vc5_free_simulator_bo(sim_bo);
+        args->offset = sim_bo->block->ofs;
+
+        return 0;
+}
+
+static int
+v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+{
+        /* Free the simulator's internal tracking. */
+        struct v3d_simulator_file *file = v3d_get_simulator_file_for_fd(fd);
+        struct v3d_simulator_bo *sim_bo = v3d_get_simulator_bo(file,
+                                                               args->handle);
+
+        v3d_free_simulator_bo(sim_bo);
 
         /* Pass the call on down. */
         return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
 }
 
 static int
-vc5_simulator_get_param_ioctl(int fd, struct drm_vc5_get_param *args)
+v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
 {
         if (sim_state.ver >= 41)
                 return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
@@ -534,27 +546,29 @@
 }
 
 int
-vc5_simulator_ioctl(int fd, unsigned long request, void *args)
+v3d_simulator_ioctl(int fd, unsigned long request, void *args)
 {
         switch (request) {
-        case DRM_IOCTL_VC5_CREATE_BO:
-                return vc5_simulator_create_bo_ioctl(fd, args);
-        case DRM_IOCTL_VC5_MMAP_BO:
-                return vc5_simulator_mmap_bo_ioctl(fd, args);
+        case DRM_IOCTL_V3D_CREATE_BO:
+                return v3d_simulator_create_bo_ioctl(fd, args);
+        case DRM_IOCTL_V3D_MMAP_BO:
+                return v3d_simulator_mmap_bo_ioctl(fd, args);
+        case DRM_IOCTL_V3D_GET_BO_OFFSET:
+                return v3d_simulator_get_bo_offset_ioctl(fd, args);
 
-        case DRM_IOCTL_VC5_WAIT_BO:
-                /* We do all of the vc5 rendering synchronously, so we just
+        case DRM_IOCTL_V3D_WAIT_BO:
+                /* We do all of the v3d rendering synchronously, so we just
                  * return immediately on the wait ioctls.  This ignores any
                  * native rendering to the host BO, so it does mean we race on
                  * front buffer rendering.
                  */
                 return 0;
 
-        case DRM_IOCTL_VC5_GET_PARAM:
-                return vc5_simulator_get_param_ioctl(fd, args);
+        case DRM_IOCTL_V3D_GET_PARAM:
+                return v3d_simulator_get_param_ioctl(fd, args);
 
         case DRM_IOCTL_GEM_CLOSE:
-                return vc5_simulator_gem_close_ioctl(fd, args);
+                return v3d_simulator_gem_close_ioctl(fd, args);
 
         case DRM_IOCTL_GEM_OPEN:
         case DRM_IOCTL_GEM_FLINK:
@@ -566,7 +580,7 @@
 }
 
 static void
-vc5_simulator_init_global(const struct v3d_device_info *devinfo)
+v3d_simulator_init_global(const struct v3d_device_info *devinfo)
 {
         mtx_lock(&sim_state.mutex);
         if (sim_state.refcount++) {
@@ -580,7 +594,11 @@
                 v3d_hw_get_mem(sim_state.v3d, &sim_state.mem_size,
                                &sim_state.mem);
 
-        sim_state.heap = u_mmInit(0, sim_state.mem_size);
+        /* Allocate from anywhere from 4096 up.  We don't allocate at 0,
+         * because for OQs and some other addresses in the HW, 0 means
+         * disabled.
+         */
+        sim_state.heap = u_mmInit(4096, sim_state.mem_size - 4096);
 
         /* Make a block of 0xd0 at address 0 to make sure we don't screw up
          * and land there.
@@ -604,12 +622,12 @@
 }
 
 void
-vc5_simulator_init(struct vc5_screen *screen)
+v3d_simulator_init(struct v3d_screen *screen)
 {
-        vc5_simulator_init_global(&screen->devinfo);
+        v3d_simulator_init_global(&screen->devinfo);
 
-        screen->sim_file = rzalloc(screen, struct vc5_simulator_file);
-        struct vc5_simulator_file *sim_file = screen->sim_file;
+        screen->sim_file = rzalloc(screen, struct v3d_simulator_file);
+        struct v3d_simulator_file *sim_file = screen->sim_file;
 
         screen->sim_file->bo_map =
                 _mesa_hash_table_create(screen->sim_file,
@@ -627,7 +645,7 @@
 }
 
 void
-vc5_simulator_destroy(struct vc5_screen *screen)
+v3d_simulator_destroy(struct v3d_screen *screen)
 {
         mtx_lock(&sim_state.mutex);
         if (!--sim_state.refcount) {
@@ -639,4 +657,4 @@
         mtx_unlock(&sim_state.mutex);
 }
 
-#endif /* USE_VC5_SIMULATOR */
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/vc5/vc5_simulator_wrapper.cpp b/src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp
similarity index 94%
rename from src/gallium/drivers/vc5/vc5_simulator_wrapper.cpp
rename to src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp
index 5776aea..7b04ded 100644
--- a/src/gallium/drivers/vc5/vc5_simulator_wrapper.cpp
+++ b/src/gallium/drivers/v3d/v3d_simulator_wrapper.cpp
@@ -24,12 +24,12 @@
 /** @file
  *
  * Wraps bits of the V3D simulator interface in a C interface for the
- * vc5_simulator.c code to use.
+ * v3d_simulator.c code to use.
  */
 
-#ifdef USE_VC5_SIMULATOR
+#ifdef USE_V3D_SIMULATOR
 
-#include "vc5_simulator_wrapper.h"
+#include "v3d_simulator_wrapper.h"
 
 #define V3D_TECH_VERSION 3
 #define V3D_REVISION 3
@@ -85,4 +85,4 @@
 
 }
 
-#endif /* USE_VC5_SIMULATOR */
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/vc5/vc5_simulator_wrapper.h b/src/gallium/drivers/v3d/v3d_simulator_wrapper.h
similarity index 100%
rename from src/gallium/drivers/vc5/vc5_simulator_wrapper.h
rename to src/gallium/drivers/v3d/v3d_simulator_wrapper.h
diff --git a/src/gallium/drivers/vc5/vc5_tiling.c b/src/gallium/drivers/v3d/v3d_tiling.c
similarity index 79%
rename from src/gallium/drivers/vc5/vc5_tiling.c
rename to src/gallium/drivers/v3d/v3d_tiling.c
index cbd86d5..bc227be 100644
--- a/src/gallium/drivers/vc5/vc5_tiling.c
+++ b/src/gallium/drivers/v3d/v3d_tiling.c
@@ -21,20 +21,20 @@
  * IN THE SOFTWARE.
  */
 
-/** @file vc5_tiling.c
+/** @file v3d_tiling.c
  *
  * Handles information about the VC5 tiling formats, and loading and storing
  * from them.
  */
 
 #include <stdint.h>
-#include "vc5_screen.h"
-#include "vc5_context.h"
-#include "vc5_tiling.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_tiling.h"
 
 /** Return the width in pixels of a 64-byte microtile. */
 uint32_t
-vc5_utile_width(int cpp)
+v3d_utile_width(int cpp)
 {
         switch (cpp) {
         case 1:
@@ -52,7 +52,7 @@
 
 /** Return the height in pixels of a 64-byte microtile. */
 uint32_t
-vc5_utile_height(int cpp)
+v3d_utile_height(int cpp)
 {
         switch (cpp) {
         case 1:
@@ -75,10 +75,10 @@
  * arrangement.
  */
 static inline uint32_t
-vc5_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y)
+v3d_get_utile_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y)
 {
-        uint32_t utile_w = vc5_utile_width(cpp);
-        uint32_t utile_h = vc5_utile_height(cpp);
+        uint32_t utile_w = v3d_utile_width(cpp);
+        uint32_t utile_h = v3d_utile_height(cpp);
 
         assert(x < utile_w && y < utile_h);
 
@@ -91,17 +91,17 @@
  * LINEARTILE is a single line of utiles in either the X or Y direction.
  */
 static inline uint32_t
-vc5_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y)
+v3d_get_lt_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y)
 {
-        uint32_t utile_w = vc5_utile_width(cpp);
-        uint32_t utile_h = vc5_utile_height(cpp);
+        uint32_t utile_w = v3d_utile_width(cpp);
+        uint32_t utile_h = v3d_utile_height(cpp);
         uint32_t utile_index_x = x / utile_w;
         uint32_t utile_index_y = y / utile_h;
 
         assert(utile_index_x == 0 || utile_index_y == 0);
 
         return (64 * (utile_index_x + utile_index_y) +
-                vc5_get_utile_pixel_offset(cpp,
+                v3d_get_utile_pixel_offset(cpp,
                                            x & (utile_w - 1),
                                            y & (utile_h - 1)));
 }
@@ -113,11 +113,11 @@
  * utiles), and the UIF blocks are in 1 or 2 columns in raster order.
  */
 static inline uint32_t
-vc5_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y,
+v3d_get_ublinear_pixel_offset(uint32_t cpp, uint32_t x, uint32_t y,
                               int ublinear_number)
 {
-        uint32_t utile_w = vc5_utile_width(cpp);
-        uint32_t utile_h = vc5_utile_height(cpp);
+        uint32_t utile_w = v3d_utile_width(cpp);
+        uint32_t utile_h = v3d_utile_height(cpp);
         uint32_t ub_w = utile_w * 2;
         uint32_t ub_h = utile_h * 2;
         uint32_t ub_x = x / ub_w;
@@ -127,23 +127,23 @@
                        ub_x) +
                 ((x & utile_w) ? 64 : 0) +
                 ((y & utile_h) ? 128 : 0) +
-                + vc5_get_utile_pixel_offset(cpp,
+                + v3d_get_utile_pixel_offset(cpp,
                                              x & (utile_w - 1),
                                              y & (utile_h - 1)));
 }
 
 static inline uint32_t
-vc5_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h,
+v3d_get_ublinear_2_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                        uint32_t x, uint32_t y)
 {
-        return vc5_get_ublinear_pixel_offset(cpp, x, y, 2);
+        return v3d_get_ublinear_pixel_offset(cpp, x, y, 2);
 }
 
 static inline uint32_t
-vc5_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h,
+v3d_get_ublinear_1_column_pixel_offset(uint32_t cpp, uint32_t image_h,
                                        uint32_t x, uint32_t y)
 {
-        return vc5_get_ublinear_pixel_offset(cpp, x, y, 1);
+        return v3d_get_ublinear_pixel_offset(cpp, x, y, 1);
 }
 
 /**
@@ -154,11 +154,11 @@
  * 4x4 groups, and those 4x4 groups are then stored in raster order.
  */
 static inline uint32_t
-vc5_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y,
+v3d_get_uif_pixel_offset(uint32_t cpp, uint32_t image_h, uint32_t x, uint32_t y,
                          bool do_xor)
 {
-        uint32_t utile_w = vc5_utile_width(cpp);
-        uint32_t utile_h = vc5_utile_height(cpp);
+        uint32_t utile_w = v3d_utile_width(cpp);
+        uint32_t utile_h = v3d_utile_height(cpp);
         uint32_t mb_width = utile_w * 2;
         uint32_t mb_height = utile_h * 2;
         uint32_t log2_mb_width = ffs(mb_width) - 1;
@@ -190,7 +190,7 @@
 
         uint32_t mb_pixel_address = (mb_base_addr +
                                      mb_tile_offset +
-                                     vc5_get_utile_pixel_offset(cpp,
+                                     v3d_get_utile_pixel_offset(cpp,
                                                                 utile_x,
                                                                 utile_y));
 
@@ -198,21 +198,21 @@
 }
 
 static inline uint32_t
-vc5_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
+v3d_get_uif_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                              uint32_t x, uint32_t y)
 {
-        return vc5_get_uif_pixel_offset(cpp, image_h, x, y, true);
+        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, true);
 }
 
 static inline uint32_t
-vc5_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
+v3d_get_uif_no_xor_pixel_offset(uint32_t cpp, uint32_t image_h,
                                 uint32_t x, uint32_t y)
 {
-        return vc5_get_uif_pixel_offset(cpp, image_h, x, y, false);
+        return v3d_get_uif_pixel_offset(cpp, image_h, x, y, false);
 }
 
 static inline void
-vc5_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
+v3d_move_pixels_general_percpp(void *gpu, uint32_t gpu_stride,
                                void *cpu, uint32_t cpu_stride,
                                int cpp, uint32_t image_h,
                                const struct pipe_box *box,
@@ -249,7 +249,7 @@
 }
 
 static inline void
-vc5_move_pixels_general(void *gpu, uint32_t gpu_stride,
+v3d_move_pixels_general(void *gpu, uint32_t gpu_stride,
                                void *cpu, uint32_t cpu_stride,
                                int cpp, uint32_t image_h,
                                const struct pipe_box *box,
@@ -260,35 +260,35 @@
 {
         switch (cpp) {
         case 1:
-                vc5_move_pixels_general_percpp(gpu, gpu_stride,
+                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                                cpu, cpu_stride,
                                                1, image_h, box,
                                                get_pixel_offset,
                                                is_load);
                 break;
         case 2:
-                vc5_move_pixels_general_percpp(gpu, gpu_stride,
+                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                                cpu, cpu_stride,
                                                2, image_h, box,
                                                get_pixel_offset,
                                                is_load);
                 break;
         case 4:
-                vc5_move_pixels_general_percpp(gpu, gpu_stride,
+                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                                cpu, cpu_stride,
                                                4, image_h, box,
                                                get_pixel_offset,
                                                is_load);
                 break;
         case 8:
-                vc5_move_pixels_general_percpp(gpu, gpu_stride,
+                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                                cpu, cpu_stride,
                                                8, image_h, box,
                                                get_pixel_offset,
                                                is_load);
                 break;
         case 16:
-                vc5_move_pixels_general_percpp(gpu, gpu_stride,
+                v3d_move_pixels_general_percpp(gpu, gpu_stride,
                                                cpu, cpu_stride,
                                                16, image_h, box,
                                                get_pixel_offset,
@@ -298,9 +298,9 @@
 }
 
 static inline void
-vc5_move_tiled_image(void *gpu, uint32_t gpu_stride,
+v3d_move_tiled_image(void *gpu, uint32_t gpu_stride,
                      void *cpu, uint32_t cpu_stride,
-                     enum vc5_tiling_mode tiling_format,
+                     enum v3d_tiling_mode tiling_format,
                      int cpp,
                      uint32_t image_h,
                      const struct pipe_box *box,
@@ -308,38 +308,38 @@
 {
         switch (tiling_format) {
         case VC5_TILING_UIF_XOR:
-                vc5_move_pixels_general(gpu, gpu_stride,
+                v3d_move_pixels_general(gpu, gpu_stride,
                                         cpu, cpu_stride,
                                         cpp, image_h, box,
-                                        vc5_get_uif_xor_pixel_offset,
+                                        v3d_get_uif_xor_pixel_offset,
                                         is_load);
                 break;
         case VC5_TILING_UIF_NO_XOR:
-                vc5_move_pixels_general(gpu, gpu_stride,
+                v3d_move_pixels_general(gpu, gpu_stride,
                                         cpu, cpu_stride,
                                         cpp, image_h, box,
-                                        vc5_get_uif_no_xor_pixel_offset,
+                                        v3d_get_uif_no_xor_pixel_offset,
                                         is_load);
                 break;
         case VC5_TILING_UBLINEAR_2_COLUMN:
-                vc5_move_pixels_general(gpu, gpu_stride,
+                v3d_move_pixels_general(gpu, gpu_stride,
                                         cpu, cpu_stride,
                                         cpp, image_h, box,
-                                        vc5_get_ublinear_2_column_pixel_offset,
+                                        v3d_get_ublinear_2_column_pixel_offset,
                                         is_load);
                 break;
         case VC5_TILING_UBLINEAR_1_COLUMN:
-                vc5_move_pixels_general(gpu, gpu_stride,
+                v3d_move_pixels_general(gpu, gpu_stride,
                                         cpu, cpu_stride,
                                         cpp, image_h, box,
-                                        vc5_get_ublinear_1_column_pixel_offset,
+                                        v3d_get_ublinear_1_column_pixel_offset,
                                         is_load);
                 break;
         case VC5_TILING_LINEARTILE:
-                vc5_move_pixels_general(gpu, gpu_stride,
+                v3d_move_pixels_general(gpu, gpu_stride,
                                         cpu, cpu_stride,
                                         cpp, image_h, box,
-                                        vc5_get_lt_pixel_offset,
+                                        v3d_get_lt_pixel_offset,
                                         is_load);
                 break;
         default:
@@ -353,13 +353,13 @@
  * start of \p dst according to the given tiling format.
  */
 void
-vc5_load_tiled_image(void *dst, uint32_t dst_stride,
+v3d_load_tiled_image(void *dst, uint32_t dst_stride,
                      void *src, uint32_t src_stride,
-                     enum vc5_tiling_mode tiling_format, int cpp,
+                     enum v3d_tiling_mode tiling_format, int cpp,
                      uint32_t image_h,
                      const struct pipe_box *box)
 {
-        vc5_move_tiled_image(src, src_stride,
+        v3d_move_tiled_image(src, src_stride,
                              dst, dst_stride,
                              tiling_format,
                              cpp,
@@ -373,13 +373,13 @@
  * \p dst according to the given tiling format.
  */
 void
-vc5_store_tiled_image(void *dst, uint32_t dst_stride,
+v3d_store_tiled_image(void *dst, uint32_t dst_stride,
                       void *src, uint32_t src_stride,
-                      enum vc5_tiling_mode tiling_format, int cpp,
+                      enum v3d_tiling_mode tiling_format, int cpp,
                       uint32_t image_h,
                       const struct pipe_box *box)
 {
-        vc5_move_tiled_image(dst, dst_stride,
+        v3d_move_tiled_image(dst, dst_stride,
                              src, src_stride,
                              tiling_format,
                              cpp,
diff --git a/src/gallium/drivers/vc5/vc5_tiling.h b/src/gallium/drivers/v3d/v3d_tiling.h
similarity index 74%
rename from src/gallium/drivers/vc5/vc5_tiling.h
rename to src/gallium/drivers/v3d/v3d_tiling.h
index d3cf48c..7957744 100644
--- a/src/gallium/drivers/vc5/vc5_tiling.h
+++ b/src/gallium/drivers/v3d/v3d_tiling.h
@@ -24,19 +24,19 @@
 #ifndef VC5_TILING_H
 #define VC5_TILING_H
 
-uint32_t vc5_utile_width(int cpp) ATTRIBUTE_CONST;
-uint32_t vc5_utile_height(int cpp) ATTRIBUTE_CONST;
-bool vc5_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
-void vc5_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
-void vc5_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
-void vc5_load_tiled_image(void *dst, uint32_t dst_stride,
+uint32_t v3d_utile_width(int cpp) ATTRIBUTE_CONST;
+uint32_t v3d_utile_height(int cpp) ATTRIBUTE_CONST;
+bool v3d_size_is_lt(uint32_t width, uint32_t height, int cpp) ATTRIBUTE_CONST;
+void v3d_load_utile(void *dst, void *src, uint32_t dst_stride, uint32_t cpp);
+void v3d_store_utile(void *dst, void *src, uint32_t src_stride, uint32_t cpp);
+void v3d_load_tiled_image(void *dst, uint32_t dst_stride,
                           void *src, uint32_t src_stride,
-                          enum vc5_tiling_mode tiling_format, int cpp,
+                          enum v3d_tiling_mode tiling_format, int cpp,
                           uint32_t image_h,
                           const struct pipe_box *box);
-void vc5_store_tiled_image(void *dst, uint32_t dst_stride,
+void v3d_store_tiled_image(void *dst, uint32_t dst_stride,
                            void *src, uint32_t src_stride,
-                           enum vc5_tiling_mode tiling_format, int cpp,
+                           enum v3d_tiling_mode tiling_format, int cpp,
                            uint32_t image_h,
                            const struct pipe_box *box);
 
diff --git a/src/gallium/drivers/vc5/vc5_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
similarity index 81%
rename from src/gallium/drivers/vc5/vc5_uniforms.c
rename to src/gallium/drivers/v3d/v3d_uniforms.c
index 03b6d83..6f0ef91 100644
--- a/src/gallium/drivers/vc5/vc5_uniforms.c
+++ b/src/gallium/drivers/v3d/v3d_uniforms.c
@@ -24,7 +24,7 @@
 #include "util/u_pack_color.h"
 #include "util/format_srgb.h"
 
-#include "vc5_context.h"
+#include "v3d_context.h"
 #include "compiler/v3d_compiler.h"
 #include "broadcom/cle/v3d_packet_v33_pack.h"
 
@@ -38,14 +38,14 @@
 }
 
 static void
-write_texture_border_color(struct vc5_job *job,
-                           struct vc5_cl_out **uniforms,
-                           struct vc5_texture_stateobj *texstate,
+write_texture_border_color(struct v3d_job *job,
+                           struct v3d_cl_out **uniforms,
+                           struct v3d_texture_stateobj *texstate,
                            uint32_t unit)
 {
         struct pipe_sampler_state *sampler = texstate->samplers[unit];
         struct pipe_sampler_view *texture = texstate->textures[unit];
-        struct vc5_resource *rsc = vc5_resource(texture->texture);
+        struct v3d_resource *rsc = v3d_resource(texture->texture);
         union util_color uc;
 
         const struct util_format_description *tex_format_desc =
@@ -68,15 +68,15 @@
                                  border_color,
                                  tex_format_desc->swizzle);
 
-        /* Now, pack so that when the vc5_format-sampled texture contents are
-         * replaced with our border color, the vc5_get_format_swizzle()
+        /* Now, pack so that when the v3d_format-sampled texture contents are
+         * replaced with our border color, the v3d_get_format_swizzle()
          * swizzling will get the right channels.
          */
         if (util_format_is_depth_or_stencil(texture->format)) {
                 uc.ui[0] = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
                                        sampler->border_color.f[0]) << 8;
         } else {
-                switch (rsc->vc5_format) {
+                switch (rsc->v3d_format) {
                 default:
                 case VC5_TEXTURE_TYPE_RGBA8888:
                         util_pack_color(storage_color,
@@ -105,7 +105,7 @@
 #endif
 
 static uint32_t
-get_texrect_scale(struct vc5_texture_stateobj *texstate,
+get_texrect_scale(struct v3d_texture_stateobj *texstate,
                   enum quniform_contents contents,
                   uint32_t data)
 {
@@ -121,7 +121,7 @@
 }
 
 static uint32_t
-get_texture_size(struct vc5_texture_stateobj *texstate,
+get_texture_size(struct v3d_texture_stateobj *texstate,
                  enum quniform_contents contents,
                  uint32_t data)
 {
@@ -147,18 +147,18 @@
         }
 }
 
-static struct vc5_bo *
-vc5_upload_ubo(struct vc5_context *vc5,
-               struct vc5_compiled_shader *shader,
+static struct v3d_bo *
+v3d_upload_ubo(struct v3d_context *v3d,
+               struct v3d_compiled_shader *shader,
                const uint32_t *gallium_uniforms)
 {
         if (!shader->prog_data.base->ubo_size)
                 return NULL;
 
-        struct vc5_bo *ubo = vc5_bo_alloc(vc5->screen,
+        struct v3d_bo *ubo = v3d_bo_alloc(v3d->screen,
                                           shader->prog_data.base->ubo_size,
                                           "ubo");
-        void *data = vc5_bo_map(ubo);
+        void *data = v3d_bo_map(ubo);
         for (uint32_t i = 0; i < shader->prog_data.base->num_ubo_ranges; i++) {
                 memcpy(data + shader->prog_data.base->ubo_ranges[i].dst_offset,
                        ((const void *)gallium_uniforms +
@@ -177,23 +177,23 @@
  * two together here.
  */
 static void
-write_texture_p0(struct vc5_job *job,
-                 struct vc5_cl_out **uniforms,
-                 struct vc5_texture_stateobj *texstate,
+write_texture_p0(struct v3d_job *job,
+                 struct v3d_cl_out **uniforms,
+                 struct v3d_texture_stateobj *texstate,
                  uint32_t unit,
                  uint32_t shader_data)
 {
         struct pipe_sampler_state *psampler = texstate->samplers[unit];
-        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+        struct v3d_sampler_state *sampler = v3d_sampler_state(psampler);
 
         cl_aligned_u32(uniforms, shader_data | sampler->p0);
 }
 
 /** Writes the V3D 3.x P1 (CFG_MODE=1) texture parameter. */
 static void
-write_texture_p1(struct vc5_job *job,
-                 struct vc5_cl_out **uniforms,
-                 struct vc5_texture_stateobj *texstate,
+write_texture_p1(struct v3d_job *job,
+                 struct v3d_cl_out **uniforms,
+                 struct v3d_texture_stateobj *texstate,
                  uint32_t data)
 {
         /* Extract the texture unit from the top bits, and the compiler's
@@ -203,7 +203,7 @@
         uint32_t p1 = data & 0x1f;
 
         struct pipe_sampler_view *psview = texstate->textures[unit];
-        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
+        struct v3d_sampler_view *sview = v3d_sampler_view(psview);
 
         struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 unpacked = {
                 .texture_state_record_base_address = texstate->texture_state[unit],
@@ -219,9 +219,9 @@
 
 /** Writes the V3D 4.x TMU configuration parameter 0. */
 static void
-write_tmu_p0(struct vc5_job *job,
-             struct vc5_cl_out **uniforms,
-             struct vc5_texture_stateobj *texstate,
+write_tmu_p0(struct v3d_job *job,
+             struct v3d_cl_out **uniforms,
+             struct v3d_texture_stateobj *texstate,
              uint32_t data)
 {
         /* Extract the texture unit from the top bits, and the compiler's
@@ -231,18 +231,18 @@
         uint32_t p0 = data & 0x00ffffff;
 
         struct pipe_sampler_view *psview = texstate->textures[unit];
-        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
-        struct vc5_resource *rsc = vc5_resource(psview->texture);
+        struct v3d_sampler_view *sview = v3d_sampler_view(psview);
+        struct v3d_resource *rsc = v3d_resource(psview->texture);
 
         cl_aligned_reloc(&job->indirect, uniforms, sview->bo, p0);
-        vc5_job_add_bo(job, rsc->bo);
+        v3d_job_add_bo(job, rsc->bo);
 }
 
 /** Writes the V3D 4.x TMU configuration parameter 1. */
 static void
-write_tmu_p1(struct vc5_job *job,
-             struct vc5_cl_out **uniforms,
-             struct vc5_texture_stateobj *texstate,
+write_tmu_p1(struct v3d_job *job,
+             struct v3d_cl_out **uniforms,
+             struct v3d_texture_stateobj *texstate,
              uint32_t data)
 {
         /* Extract the texture unit from the top bits, and the compiler's
@@ -252,30 +252,30 @@
         uint32_t p0 = data & 0x00ffffff;
 
         struct pipe_sampler_state *psampler = texstate->samplers[unit];
-        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+        struct v3d_sampler_state *sampler = v3d_sampler_state(psampler);
 
         cl_aligned_reloc(&job->indirect, uniforms, sampler->bo, p0);
 }
 
-struct vc5_cl_reloc
-vc5_write_uniforms(struct vc5_context *vc5, struct vc5_compiled_shader *shader,
-                   struct vc5_constbuf_stateobj *cb,
-                   struct vc5_texture_stateobj *texstate)
+struct v3d_cl_reloc
+v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader,
+                   struct v3d_constbuf_stateobj *cb,
+                   struct v3d_texture_stateobj *texstate)
 {
         struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
-        struct vc5_job *job = vc5->job;
+        struct v3d_job *job = v3d->job;
         const uint32_t *gallium_uniforms = cb->cb[0].user_buffer;
-        struct vc5_bo *ubo = vc5_upload_ubo(vc5, shader, gallium_uniforms);
+        struct v3d_bo *ubo = v3d_upload_ubo(v3d, shader, gallium_uniforms);
 
         /* We always need to return some space for uniforms, because the HW
          * will be prefetching, even if we don't read any in the program.
          */
-        vc5_cl_ensure_space(&job->indirect, MAX2(uinfo->count, 1) * 4, 4);
+        v3d_cl_ensure_space(&job->indirect, MAX2(uinfo->count, 1) * 4, 4);
 
-        struct vc5_cl_reloc uniform_stream = cl_get_address(&job->indirect);
-        vc5_bo_reference(uniform_stream.bo);
+        struct v3d_cl_reloc uniform_stream = cl_get_address(&job->indirect);
+        v3d_bo_reference(uniform_stream.bo);
 
-        struct vc5_cl_out *uniforms =
+        struct v3d_cl_out *uniforms =
                 cl_start(&job->indirect);
 
         for (int i = 0; i < uinfo->count; i++) {
@@ -289,22 +289,22 @@
                                        gallium_uniforms[uinfo->data[i]]);
                         break;
                 case QUNIFORM_VIEWPORT_X_SCALE:
-                        cl_aligned_f(&uniforms, vc5->viewport.scale[0] * 256.0f);
+                        cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f);
                         break;
                 case QUNIFORM_VIEWPORT_Y_SCALE:
-                        cl_aligned_f(&uniforms, vc5->viewport.scale[1] * 256.0f);
+                        cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f);
                         break;
 
                 case QUNIFORM_VIEWPORT_Z_OFFSET:
-                        cl_aligned_f(&uniforms, vc5->viewport.translate[2]);
+                        cl_aligned_f(&uniforms, v3d->viewport.translate[2]);
                         break;
                 case QUNIFORM_VIEWPORT_Z_SCALE:
-                        cl_aligned_f(&uniforms, vc5->viewport.scale[2]);
+                        cl_aligned_f(&uniforms, v3d->viewport.scale[2]);
                         break;
 
                 case QUNIFORM_USER_CLIP_PLANE:
                         cl_aligned_f(&uniforms,
-                                     vc5->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
+                                     v3d->clip.ucp[uinfo->data[i] / 4][uinfo->data[i] % 4]);
                         break;
 
                 case QUNIFORM_TMU_CONFIG_P0:
@@ -348,21 +348,13 @@
                                                         uinfo->data[i]));
                         break;
 
-                case QUNIFORM_STENCIL:
-                        cl_aligned_u32(&uniforms,
-                                       vc5->zsa->stencil_uniforms[uinfo->data[i]] |
-                                       (uinfo->data[i] <= 1 ?
-                                        (vc5->stencil_ref.ref_value[uinfo->data[i]] << 8) :
-                                        0));
-                        break;
-
                 case QUNIFORM_ALPHA_REF:
                         cl_aligned_f(&uniforms,
-                                     vc5->zsa->base.alpha.ref_value);
+                                     v3d->zsa->base.alpha.ref_value);
                         break;
 
                 case QUNIFORM_SAMPLE_MASK:
-                        cl_aligned_u32(&uniforms, vc5->sample_mask);
+                        cl_aligned_u32(&uniforms, v3d->sample_mask);
                         break;
 
                 case QUNIFORM_UBO_ADDR:
@@ -371,8 +363,8 @@
                                                  ubo, 0);
                         } else {
                                 int ubo_index = uinfo->data[i];
-                                struct vc5_resource *rsc =
-                                        vc5_resource(cb->cb[ubo_index].buffer);
+                                struct v3d_resource *rsc =
+                                        v3d_resource(cb->cb[ubo_index].buffer);
 
                                 cl_aligned_reloc(&job->indirect, &uniforms,
                                                  rsc->bo,
@@ -391,12 +383,12 @@
 
                 case QUNIFORM_SPILL_OFFSET:
                         cl_aligned_reloc(&job->indirect, &uniforms,
-                                         vc5->prog.spill_bo, 0);
+                                         v3d->prog.spill_bo, 0);
                         break;
 
                 case QUNIFORM_SPILL_SIZE_PER_THREAD:
                         cl_aligned_u32(&uniforms,
-                                       vc5->prog.spill_size_per_thread);
+                                       v3d->prog.spill_size_per_thread);
                         break;
 
                 default:
@@ -419,13 +411,13 @@
 
         cl_end(&job->indirect, uniforms);
 
-        vc5_bo_unreference(&ubo);
+        v3d_bo_unreference(&ubo);
 
         return uniform_stream;
 }
 
 void
-vc5_set_shader_uniform_dirty_flags(struct vc5_compiled_shader *shader)
+v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader)
 {
         uint32_t dirty = 0;
 
@@ -469,13 +461,12 @@
                         dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX;
                         break;
 
-                case QUNIFORM_STENCIL:
                 case QUNIFORM_ALPHA_REF:
                         dirty |= VC5_DIRTY_ZSA;
                         break;
 
                 case QUNIFORM_SAMPLE_MASK:
-                        dirty |= VC5_DIRTY_SAMPLE_MASK;
+                        dirty |= VC5_DIRTY_SAMPLE_STATE;
                         break;
 
                 default:
diff --git a/src/gallium/drivers/vc5/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
similarity index 83%
rename from src/gallium/drivers/vc5/v3dx_context.h
rename to src/gallium/drivers/v3d/v3dx_context.h
index f9edd1c..bb90e1c 100644
--- a/src/gallium/drivers/vc5/v3dx_context.h
+++ b/src/gallium/drivers/v3d/v3dx_context.h
@@ -23,25 +23,25 @@
  */
 
 /* This file generates the per-v3d-version function prototypes.  It must only
- * be included from vc5_context.h.
+ * be included from v3d_context.h.
  */
 
 struct v3d_hw;
-struct vc5_format;
+struct v3d_format;
 
 void v3dX(emit_state)(struct pipe_context *pctx);
-void v3dX(emit_rcl)(struct vc5_job *job);
+void v3dX(emit_rcl)(struct v3d_job *job);
 void v3dX(draw_init)(struct pipe_context *pctx);
 void v3dX(state_init)(struct pipe_context *pctx);
 
-void v3dX(bcl_epilogue)(struct vc5_context *vc5, struct vc5_job *job);
+void v3dX(bcl_epilogue)(struct v3d_context *v3d, struct v3d_job *job);
 
 void v3dX(simulator_init_regs)(struct v3d_hw *v3d);
 int v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
-                                    struct drm_vc5_get_param *args);
-void v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_vc5_submit_cl *submit,
+                                    struct drm_v3d_get_param *args);
+void v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_v3d_submit_cl *submit,
                            uint32_t gmp_ofs);
-const struct vc5_format *v3dX(get_format_desc)(enum pipe_format f);
+const struct v3d_format *v3dX(get_format_desc)(enum pipe_format f);
 void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
                                                    uint32_t *type,
                                                    uint32_t *bpp);
diff --git a/src/gallium/drivers/vc5/vc5_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
similarity index 61%
rename from src/gallium/drivers/vc5/vc5_draw.c
rename to src/gallium/drivers/v3d/v3dx_draw.c
index ff14d1c..bfb4af1 100644
--- a/src/gallium/drivers/vc5/vc5_draw.c
+++ b/src/gallium/drivers/v3d/v3dx_draw.c
@@ -29,9 +29,9 @@
 #include "util/u_upload_mgr.h"
 #include "indices/u_primconvert.h"
 
-#include "vc5_context.h"
-#include "vc5_resource.h"
-#include "vc5_cl.h"
+#include "v3d_context.h"
+#include "v3d_resource.h"
+#include "v3d_cl.h"
 #include "broadcom/compiler/v3d_compiler.h"
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
@@ -40,9 +40,9 @@
  * Does the initial bining command list setup for drawing to a given FBO.
  */
 static void
-vc5_start_draw(struct vc5_context *vc5)
+v3d_start_draw(struct v3d_context *v3d)
 {
-        struct vc5_job *job = vc5->job;
+        struct v3d_job *job = v3d->job;
 
         if (job->needs_flush)
                 return;
@@ -50,39 +50,43 @@
         /* Get space to emit our BCL state, using a branch to jump to a new BO
          * if necessary.
          */
-        vc5_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
+        v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
 
         job->submit.bcl_start = job->bcl.bo->offset;
-        vc5_job_add_bo(job, job->bcl.bo);
+        v3d_job_add_bo(job, job->bcl.bo);
 
-        job->tile_alloc = vc5_bo_alloc(vc5->screen, 1024 * 1024, "tile alloc");
-        uint32_t tsda_per_tile_size = vc5->screen->devinfo.ver >= 40 ? 256 : 64;
-        job->tile_state = vc5_bo_alloc(vc5->screen,
+        job->tile_alloc = v3d_bo_alloc(v3d->screen, 1024 * 1024, "tile_alloc");
+        uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
+        job->tile_state = v3d_bo_alloc(v3d->screen,
                                        job->draw_tiles_y *
                                        job->draw_tiles_x *
                                        tsda_per_tile_size,
                                        "TSDA");
 
-#if V3D_VERSION < 40
+#if V3D_VERSION >= 40
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+                config.width_in_pixels = v3d->framebuffer.width;
+                config.height_in_pixels = v3d->framebuffer.height;
+                config.number_of_render_targets =
+                        MAX2(v3d->framebuffer.nr_cbufs, 1);
+
+                config.multisample_mode_4x = job->msaa;
+
+                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+        }
+#else /* V3D_VERSION < 40 */
         /* "Binning mode lists start with a Tile Binning Mode Configuration
          * item (120)"
          *
          * Part1 signals the end of binning config setup.
          */
-        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION_PART2, config) {
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART2, config) {
                 config.tile_allocation_memory_address =
                         cl_address(job->tile_alloc, 0);
                 config.tile_allocation_memory_size = job->tile_alloc->size;
         }
-#endif
 
-        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION_PART1, config) {
-#if V3D_VERSION >= 40
-                config.width_in_pixels_minus_1 = vc5->framebuffer.width - 1;
-                config.height_in_pixels_minus_1 = vc5->framebuffer.height - 1;
-                config.number_of_render_targets_minus_1 =
-                        MAX2(vc5->framebuffer.nr_cbufs, 1) - 1;
-#else /* V3D_VERSION < 40 */
+        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART1, config) {
                 config.tile_state_data_array_base_address =
                         cl_address(job->tile_state, 0);
 
@@ -90,13 +94,13 @@
                 config.height_in_tiles = job->draw_tiles_y;
                 /* Must be >= 1 */
                 config.number_of_render_targets =
-                        MAX2(vc5->framebuffer.nr_cbufs, 1);
-#endif /* V3D_VERSION < 40 */
+                        MAX2(v3d->framebuffer.nr_cbufs, 1);
 
                 config.multisample_mode_4x = job->msaa;
 
                 config.maximum_bpp_of_all_render_targets = job->internal_bpp;
         }
+#endif /* V3D_VERSION < 40 */
 
         /* There's definitely nothing in the VCD cache we want. */
         cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
@@ -110,53 +114,53 @@
         cl_emit(&job->bcl, START_TILE_BINNING, bin);
 
         job->needs_flush = true;
-        job->draw_width = vc5->framebuffer.width;
-        job->draw_height = vc5->framebuffer.height;
+        job->draw_width = v3d->framebuffer.width;
+        job->draw_height = v3d->framebuffer.height;
 }
 
 static void
-vc5_predraw_check_textures(struct pipe_context *pctx,
-                           struct vc5_texture_stateobj *stage_tex)
+v3d_predraw_check_textures(struct pipe_context *pctx,
+                           struct v3d_texture_stateobj *stage_tex)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
         for (int i = 0; i < stage_tex->num_textures; i++) {
                 struct pipe_sampler_view *view = stage_tex->textures[i];
                 if (!view)
                         continue;
 
-                vc5_flush_jobs_writing_resource(vc5, view->texture);
+                v3d_flush_jobs_writing_resource(v3d, view->texture);
         }
 }
 
 static void
-vc5_emit_gl_shader_state(struct vc5_context *vc5,
+v3d_emit_gl_shader_state(struct v3d_context *v3d,
                          const struct pipe_draw_info *info)
 {
-        struct vc5_job *job = vc5->job;
+        struct v3d_job *job = v3d->job;
         /* VC5_DIRTY_VTXSTATE */
-        struct vc5_vertex_stateobj *vtx = vc5->vtx;
+        struct v3d_vertex_stateobj *vtx = v3d->vtx;
         /* VC5_DIRTY_VTXBUF */
-        struct vc5_vertexbuf_stateobj *vertexbuf = &vc5->vertexbuf;
+        struct v3d_vertexbuf_stateobj *vertexbuf = &v3d->vertexbuf;
 
         /* Upload the uniforms to the indirect CL first */
-        struct vc5_cl_reloc fs_uniforms =
-                vc5_write_uniforms(vc5, vc5->prog.fs,
-                                   &vc5->constbuf[PIPE_SHADER_FRAGMENT],
-                                   &vc5->fragtex);
-        struct vc5_cl_reloc vs_uniforms =
-                vc5_write_uniforms(vc5, vc5->prog.vs,
-                                   &vc5->constbuf[PIPE_SHADER_VERTEX],
-                                   &vc5->verttex);
-        struct vc5_cl_reloc cs_uniforms =
-                vc5_write_uniforms(vc5, vc5->prog.cs,
-                                   &vc5->constbuf[PIPE_SHADER_VERTEX],
-                                   &vc5->verttex);
+        struct v3d_cl_reloc fs_uniforms =
+                v3d_write_uniforms(v3d, v3d->prog.fs,
+                                   &v3d->constbuf[PIPE_SHADER_FRAGMENT],
+                                   &v3d->fragtex);
+        struct v3d_cl_reloc vs_uniforms =
+                v3d_write_uniforms(v3d, v3d->prog.vs,
+                                   &v3d->constbuf[PIPE_SHADER_VERTEX],
+                                   &v3d->verttex);
+        struct v3d_cl_reloc cs_uniforms =
+                v3d_write_uniforms(v3d, v3d->prog.cs,
+                                   &v3d->constbuf[PIPE_SHADER_VERTEX],
+                                   &v3d->verttex);
 
         /* See GFXH-930 workaround below */
         uint32_t num_elements_to_emit = MAX2(vtx->num_elements, 1);
         uint32_t shader_rec_offset =
-                vc5_cl_ensure_space(&job->indirect,
+                v3d_cl_ensure_space(&job->indirect,
                                     cl_packet_length(GL_SHADER_STATE_RECORD) +
                                     num_elements_to_emit *
                                     cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
@@ -167,27 +171,32 @@
                 /* VC5_DIRTY_PRIM_MODE | VC5_DIRTY_RASTERIZER */
                 shader.point_size_in_shaded_vertex_data =
                         (info->mode == PIPE_PRIM_POINTS &&
-                         vc5->rasterizer->base.point_size_per_vertex);
+                         v3d->rasterizer->base.point_size_per_vertex);
 
                 /* Must be set if the shader modifies Z, discards, or modifies
                  * the sample mask.  For any of these cases, the fragment
                  * shader needs to write the Z value (even just discards).
                  */
                 shader.fragment_shader_does_z_writes =
-                        (vc5->prog.fs->prog_data.fs->writes_z ||
-                         vc5->prog.fs->prog_data.fs->discard);
+                        (v3d->prog.fs->prog_data.fs->writes_z ||
+                         v3d->prog.fs->prog_data.fs->discard);
+
+                shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
+                        v3d->prog.fs->prog_data.fs->uses_center_w;
 
                 shader.number_of_varyings_in_fragment_shader =
-                        vc5->prog.fs->prog_data.base->num_inputs;
+                        v3d->prog.fs->prog_data.base->num_inputs;
 
-                shader.propagate_nans = true;
+                shader.coordinate_shader_propagate_nans = true;
+                shader.vertex_shader_propagate_nans = true;
+                shader.fragment_shader_propagate_nans = true;
 
                 shader.coordinate_shader_code_address =
-                        cl_address(vc5->prog.cs->bo, 0);
+                        cl_address(v3d->prog.cs->bo, 0);
                 shader.vertex_shader_code_address =
-                        cl_address(vc5->prog.vs->bo, 0);
+                        cl_address(v3d->prog.vs->bo, 0);
                 shader.fragment_shader_code_address =
-                        cl_address(vc5->prog.fs->bo, 0);
+                        cl_address(v3d->prog.fs->bo, 0);
 
                 /* XXX: Use combined input/output size flag in the common
                  * case.
@@ -195,56 +204,59 @@
                 shader.coordinate_shader_has_separate_input_and_output_vpm_blocks = true;
                 shader.vertex_shader_has_separate_input_and_output_vpm_blocks = true;
                 shader.coordinate_shader_input_vpm_segment_size =
-                        MAX2(vc5->prog.cs->prog_data.vs->vpm_input_size, 1);
+                        MAX2(v3d->prog.cs->prog_data.vs->vpm_input_size, 1);
                 shader.vertex_shader_input_vpm_segment_size =
-                        MAX2(vc5->prog.vs->prog_data.vs->vpm_input_size, 1);
+                        MAX2(v3d->prog.vs->prog_data.vs->vpm_input_size, 1);
 
                 shader.coordinate_shader_output_vpm_segment_size =
-                        vc5->prog.cs->prog_data.vs->vpm_output_size;
+                        v3d->prog.cs->prog_data.vs->vpm_output_size;
                 shader.vertex_shader_output_vpm_segment_size =
-                        vc5->prog.vs->prog_data.vs->vpm_output_size;
+                        v3d->prog.vs->prog_data.vs->vpm_output_size;
 
                 shader.coordinate_shader_uniforms_address = cs_uniforms;
                 shader.vertex_shader_uniforms_address = vs_uniforms;
                 shader.fragment_shader_uniforms_address = fs_uniforms;
 
 #if V3D_VERSION >= 41
+                shader.min_coord_shader_input_segments_required_in_play = 1;
+                shader.min_vertex_shader_input_segments_required_in_play = 1;
+
                 shader.coordinate_shader_4_way_threadable =
-                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                        v3d->prog.cs->prog_data.vs->base.threads == 4;
                 shader.vertex_shader_4_way_threadable =
-                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                        v3d->prog.vs->prog_data.vs->base.threads == 4;
                 shader.fragment_shader_4_way_threadable =
-                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+                        v3d->prog.fs->prog_data.fs->base.threads == 4;
 
                 shader.coordinate_shader_start_in_final_thread_section =
-                        vc5->prog.cs->prog_data.vs->base.single_seg;
+                        v3d->prog.cs->prog_data.vs->base.single_seg;
                 shader.vertex_shader_start_in_final_thread_section =
-                        vc5->prog.vs->prog_data.vs->base.single_seg;
+                        v3d->prog.vs->prog_data.vs->base.single_seg;
                 shader.fragment_shader_start_in_final_thread_section =
-                        vc5->prog.fs->prog_data.fs->base.single_seg;
+                        v3d->prog.fs->prog_data.fs->base.single_seg;
 #else
                 shader.coordinate_shader_4_way_threadable =
-                        vc5->prog.cs->prog_data.vs->base.threads == 4;
+                        v3d->prog.cs->prog_data.vs->base.threads == 4;
                 shader.coordinate_shader_2_way_threadable =
-                        vc5->prog.cs->prog_data.vs->base.threads == 2;
+                        v3d->prog.cs->prog_data.vs->base.threads == 2;
                 shader.vertex_shader_4_way_threadable =
-                        vc5->prog.vs->prog_data.vs->base.threads == 4;
+                        v3d->prog.vs->prog_data.vs->base.threads == 4;
                 shader.vertex_shader_2_way_threadable =
-                        vc5->prog.vs->prog_data.vs->base.threads == 2;
+                        v3d->prog.vs->prog_data.vs->base.threads == 2;
                 shader.fragment_shader_4_way_threadable =
-                        vc5->prog.fs->prog_data.fs->base.threads == 4;
+                        v3d->prog.fs->prog_data.fs->base.threads == 4;
                 shader.fragment_shader_2_way_threadable =
-                        vc5->prog.fs->prog_data.fs->base.threads == 2;
+                        v3d->prog.fs->prog_data.fs->base.threads == 2;
 #endif
 
                 shader.vertex_id_read_by_coordinate_shader =
-                        vc5->prog.cs->prog_data.vs->uses_vid;
+                        v3d->prog.cs->prog_data.vs->uses_vid;
                 shader.instance_id_read_by_coordinate_shader =
-                        vc5->prog.cs->prog_data.vs->uses_iid;
+                        v3d->prog.cs->prog_data.vs->uses_iid;
                 shader.vertex_id_read_by_vertex_shader =
-                        vc5->prog.vs->prog_data.vs->uses_vid;
+                        v3d->prog.vs->prog_data.vs->uses_vid;
                 shader.instance_id_read_by_vertex_shader =
-                        vc5->prog.vs->prog_data.vs->uses_iid;
+                        v3d->prog.vs->prog_data.vs->uses_iid;
 
                 shader.address_of_default_attribute_values =
                         cl_address(vtx->default_attribute_values, 0);
@@ -254,7 +266,7 @@
                 struct pipe_vertex_element *elem = &vtx->pipe[i];
                 struct pipe_vertex_buffer *vb =
                         &vertexbuf->vb[elem->vertex_buffer_index];
-                struct vc5_resource *rsc = vc5_resource(vb->buffer.resource);
+                struct v3d_resource *rsc = v3d_resource(vb->buffer.resource);
 
                 const uint32_t size =
                         cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
@@ -266,13 +278,14 @@
                                                   vb->buffer_offset +
                                                   elem->src_offset);
                         attr.number_of_values_read_by_coordinate_shader =
-                                vc5->prog.cs->prog_data.vs->vattr_sizes[i];
+                                v3d->prog.cs->prog_data.vs->vattr_sizes[i];
                         attr.number_of_values_read_by_vertex_shader =
-                                vc5->prog.vs->prog_data.vs->vattr_sizes[i];
+                                v3d->prog.vs->prog_data.vs->vattr_sizes[i];
 #if V3D_VERSION >= 41
                         attr.maximum_index = 0xffffff;
 #endif
                 }
+                STATIC_ASSERT(sizeof(vtx->attrs) >= VC5_MAX_ATTRIBUTES * size);
         }
 
         if (vtx->num_elements == 0) {
@@ -293,14 +306,21 @@
                 }
         }
 
+        cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
+                vcm.number_of_16_vertex_batches_for_binning =
+                        v3d->prog.cs->prog_data.vs->vcm_cache_size;
+                vcm.number_of_16_vertex_batches_for_rendering =
+                        v3d->prog.vs->prog_data.vs->vcm_cache_size;
+        }
+
         cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                 state.address = cl_address(job->indirect.bo, shader_rec_offset);
                 state.number_of_attribute_arrays = num_elements_to_emit;
         }
 
-        vc5_bo_unreference(&cs_uniforms.bo);
-        vc5_bo_unreference(&vs_uniforms.bo);
-        vc5_bo_unreference(&fs_uniforms.bo);
+        v3d_bo_unreference(&cs_uniforms.bo);
+        v3d_bo_unreference(&vs_uniforms.bo);
+        v3d_bo_unreference(&fs_uniforms.bo);
 
         job->shader_rec_count++;
 }
@@ -310,26 +330,26 @@
  * recorded by CL packets.
  */
 static void
-vc5_tf_statistics_record(struct vc5_context *vc5,
+v3d_tf_statistics_record(struct v3d_context *v3d,
                          const struct pipe_draw_info *info,
                          bool prim_tf)
 {
-        if (!vc5->active_queries)
+        if (!v3d->active_queries)
                 return;
 
         uint32_t prims = u_prims_for_vertices(info->mode, info->count);
-        vc5->prims_generated += prims;
+        v3d->prims_generated += prims;
 
         if (prim_tf) {
                 /* XXX: Only count if we didn't overflow. */
-                vc5->tf_prims_generated += prims;
+                v3d->tf_prims_generated += prims;
         }
 }
 
 static void
-vc5_update_job_ez(struct vc5_context *vc5, struct vc5_job *job)
+v3d_update_job_ez(struct v3d_context *v3d, struct v3d_job *job)
 {
-        switch (vc5->zsa->ez_state) {
+        switch (v3d->zsa->ez_state) {
         case VC5_EZ_UNDECIDED:
                 /* If the Z/S state didn't pick a direction but didn't
                  * disable, then go along with the current EZ state.  This
@@ -343,8 +363,8 @@
                  * the current direction if we've decided on one.
                  */
                 if (job->ez_state == VC5_EZ_UNDECIDED)
-                        job->ez_state = vc5->zsa->ez_state;
-                else if (job->ez_state != vc5->zsa->ez_state)
+                        job->ez_state = v3d->zsa->ez_state;
+                else if (job->ez_state != v3d->zsa->ez_state)
                         job->ez_state = VC5_EZ_DISABLED;
                 break;
 
@@ -361,18 +381,19 @@
          * the chosen EZ direction (though we could use
          * ARB_conservative_depth's hints to avoid this)
          */
-        if (vc5->prog.fs->prog_data.fs->writes_z) {
+        if (v3d->prog.fs->prog_data.fs->writes_z) {
                 job->ez_state = VC5_EZ_DISABLED;
         }
 
-        if (job->first_ez_state == VC5_EZ_UNDECIDED)
+        if (job->first_ez_state == VC5_EZ_UNDECIDED &&
+            (job->ez_state != VC5_EZ_DISABLED || job->draw_calls_queued == 0))
                 job->first_ez_state = job->ez_state;
 }
 
 static void
-vc5_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
+v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
         if (!info->count_from_stream_output && !info->indirect &&
             !info->primitive_restart &&
@@ -400,8 +421,8 @@
         }
 
         if (info->mode >= PIPE_PRIM_QUADS) {
-                util_primconvert_save_rasterizer_state(vc5->primconvert, &vc5->rasterizer->base);
-                util_primconvert_draw_vbo(vc5->primconvert, info);
+                util_primconvert_save_rasterizer_state(v3d->primconvert, &v3d->rasterizer->base);
+                util_primconvert_draw_vbo(v3d->primconvert, info);
                 perf_debug("Fallback conversion for %d %s vertices\n",
                            info->count, u_prim_name(info->mode));
                 return;
@@ -410,24 +431,38 @@
         /* Before setting up the draw, flush anything writing to the textures
          * that we read from.
          */
-        vc5_predraw_check_textures(pctx, &vc5->verttex);
-        vc5_predraw_check_textures(pctx, &vc5->fragtex);
+        v3d_predraw_check_textures(pctx, &v3d->verttex);
+        v3d_predraw_check_textures(pctx, &v3d->fragtex);
 
-        struct vc5_job *job = vc5_get_job_for_fbo(vc5);
+        struct v3d_job *job = v3d_get_job_for_fbo(v3d);
+
+        /* If vertex texturing depends on the output of rendering, we need to
+         * ensure that that rendering is complete before we run a coordinate
+         * shader that depends on it.
+         *
+         * Given that doing that is unusual, for now we just block the binner
+         * on the last submitted render, rather than tracking the last
+         * rendering to each texture's BO.
+         */
+        if (v3d->verttex.num_textures) {
+                perf_debug("Blocking binner on last render "
+                           "due to vertex texturing.\n");
+                job->submit.in_sync_bcl = v3d->out_sync;
+        }
 
         /* Get space to emit our draw call into the BCL, using a branch to
          * jump to a new BO if necessary.
          */
-        vc5_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
+        v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
 
-        if (vc5->prim_mode != info->mode) {
-                vc5->prim_mode = info->mode;
-                vc5->dirty |= VC5_DIRTY_PRIM_MODE;
+        if (v3d->prim_mode != info->mode) {
+                v3d->prim_mode = info->mode;
+                v3d->dirty |= VC5_DIRTY_PRIM_MODE;
         }
 
-        vc5_start_draw(vc5);
-        vc5_update_compiled_shaders(vc5, info->mode);
-        vc5_update_job_ez(vc5, job);
+        v3d_start_draw(v3d);
+        v3d_update_compiled_shaders(v3d, info->mode);
+        v3d_update_job_ez(v3d, job);
 
 #if V3D_VERSION >= 41
         v3d41_emit_state(pctx);
@@ -435,20 +470,20 @@
         v3d33_emit_state(pctx);
 #endif
 
-        if (vc5->dirty & (VC5_DIRTY_VTXBUF |
+        if (v3d->dirty & (VC5_DIRTY_VTXBUF |
                           VC5_DIRTY_VTXSTATE |
                           VC5_DIRTY_PRIM_MODE |
                           VC5_DIRTY_RASTERIZER |
                           VC5_DIRTY_COMPILED_CS |
                           VC5_DIRTY_COMPILED_VS |
                           VC5_DIRTY_COMPILED_FS |
-                          vc5->prog.cs->uniform_dirty_bits |
-                          vc5->prog.vs->uniform_dirty_bits |
-                          vc5->prog.fs->uniform_dirty_bits)) {
-                vc5_emit_gl_shader_state(vc5, info);
+                          v3d->prog.cs->uniform_dirty_bits |
+                          v3d->prog.vs->uniform_dirty_bits |
+                          v3d->prog.fs->uniform_dirty_bits)) {
+                v3d_emit_gl_shader_state(v3d, info);
         }
 
-        vc5->dirty = 0;
+        v3d->dirty = 0;
 
         /* The Base Vertex/Base Instance packet sets those values to nonzero
          * for the next draw call only.
@@ -465,11 +500,11 @@
         /* V3D 3.x: The HW only processes transform feedback on primitives
          * with the flag set.
          */
-        if (vc5->streamout.num_targets)
+        if (v3d->streamout.num_targets)
                 prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
 #endif
 
-        vc5_tf_statistics_record(vc5, info, vc5->streamout.num_targets);
+        v3d_tf_statistics_record(v3d, info, v3d->streamout.num_targets);
 
         /* Note that the primitive type fields match with OpenGL/gallium
          * definitions, up to but not including QUADS.
@@ -480,14 +515,14 @@
                 struct pipe_resource *prsc;
                 if (info->has_user_indices) {
                         prsc = NULL;
-                        u_upload_data(vc5->uploader, 0,
+                        u_upload_data(v3d->uploader, 0,
                                       info->count * info->index_size, 4,
                                       info->index.user,
                                       &offset, &prsc);
                 } else {
                         prsc = info->index.resource;
                 }
-                struct vc5_resource *rsc = vc5_resource(prsc);
+                struct v3d_resource *rsc = v3d_resource(prsc);
 
 #if V3D_VERSION >= 40
                 cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
@@ -497,7 +532,7 @@
 #endif
 
                 if (info->instance_count > 1) {
-                        cl_emit(&job->bcl, INDEXED_INSTANCED_PRIMITIVE_LIST, prim) {
+                        cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
                                 prim.index_type = ffs(info->index_size) - 1;
 #if V3D_VERSION >= 40
                                 prim.index_offset = offset;
@@ -513,7 +548,7 @@
                                 prim.instance_length = info->count;
                         }
                 } else {
-                        cl_emit(&job->bcl, INDEXED_PRIMITIVE_LIST, prim) {
+                        cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
                                 prim.index_type = ffs(info->index_size) - 1;
                                 prim.length = info->count;
 #if V3D_VERSION >= 40
@@ -534,74 +569,138 @@
                         pipe_resource_reference(&prsc, NULL);
         } else {
                 if (info->instance_count > 1) {
-                        cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMITIVES, prim) {
+                        cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
                                 prim.mode = info->mode | prim_tf_enable;
                                 prim.index_of_first_vertex = info->start;
                                 prim.number_of_instances = info->instance_count;
                                 prim.instance_length = info->count;
                         }
                 } else {
-                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, prim) {
+                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
                                 prim.mode = info->mode | prim_tf_enable;
                                 prim.length = info->count;
                                 prim.index_of_first_vertex = info->start;
                         }
                 }
         }
+
+        /* A flush is required in between a TF draw and any following TF specs
+         * packet, or the GPU may hang.  Just flush each time for now.
+         */
+        if (v3d->streamout.num_targets)
+                cl_emit(&job->bcl, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT, flush);
+
         job->draw_calls_queued++;
 
-        if (vc5->zsa && job->zsbuf &&
-            (vc5->zsa->base.depth.enabled ||
-             vc5->zsa->base.stencil[0].enabled)) {
-                struct vc5_resource *rsc = vc5_resource(job->zsbuf->texture);
-                vc5_job_add_bo(job, rsc->bo);
+        /* Increment the TF offsets by how many verts we wrote.  XXX: This
+         * needs some clamping to the buffer size.
+         */
+        for (int i = 0; i < v3d->streamout.num_targets; i++)
+                v3d->streamout.offsets[i] += info->count;
 
-                if (vc5->zsa->base.depth.enabled) {
-                        job->resolve |= PIPE_CLEAR_DEPTH;
-                        rsc->initialized_buffers = PIPE_CLEAR_DEPTH;
-                }
+        if (v3d->zsa && job->zsbuf && v3d->zsa->base.depth.enabled) {
+                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
+                v3d_job_add_bo(job, rsc->bo);
 
-                if (vc5->zsa->base.stencil[0].enabled) {
-                        job->resolve |= PIPE_CLEAR_STENCIL;
-                        rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
+                job->load |= PIPE_CLEAR_DEPTH & ~job->clear;
+                if (v3d->zsa->base.depth.writemask)
+                        job->store |= PIPE_CLEAR_DEPTH;
+                rsc->initialized_buffers = PIPE_CLEAR_DEPTH;
+        }
+
+        if (v3d->zsa && job->zsbuf && v3d->zsa->base.stencil[0].enabled) {
+                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
+                if (rsc->separate_stencil)
+                        rsc = rsc->separate_stencil;
+
+                v3d_job_add_bo(job, rsc->bo);
+
+                job->load |= PIPE_CLEAR_STENCIL & ~job->clear;
+                if (v3d->zsa->base.stencil[0].writemask ||
+                    v3d->zsa->base.stencil[1].writemask) {
+                        job->store |= PIPE_CLEAR_STENCIL;
                 }
+                rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
         }
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 uint32_t bit = PIPE_CLEAR_COLOR0 << i;
+                int blend_rt = v3d->blend->base.independent_blend_enable ? i : 0;
 
-                if (job->resolve & bit || !job->cbufs[i])
+                if (job->store & bit || !job->cbufs[i])
                         continue;
-                struct vc5_resource *rsc = vc5_resource(job->cbufs[i]->texture);
+                struct v3d_resource *rsc = v3d_resource(job->cbufs[i]->texture);
 
-                job->resolve |= bit;
-                vc5_job_add_bo(job, rsc->bo);
+                job->load |= bit & ~job->clear;
+                if (v3d->blend->base.rt[blend_rt].colormask)
+                        job->store |= bit;
+                v3d_job_add_bo(job, rsc->bo);
         }
 
         if (job->referenced_size > 768 * 1024 * 1024) {
                 perf_debug("Flushing job with %dkb to try to free up memory\n",
                         job->referenced_size / 1024);
-                vc5_flush(pctx);
+                v3d_flush(pctx);
         }
 
         if (V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)
-                vc5_flush(pctx);
+                v3d_flush(pctx);
 }
 
+/**
+ * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles.
+ */
 static void
-vc5_clear(struct pipe_context *pctx, unsigned buffers,
-          const union pipe_color_union *color, double depth, unsigned stencil)
+v3d_draw_clear(struct v3d_context *v3d,
+               unsigned buffers,
+               const union pipe_color_union *color,
+               double depth, unsigned stencil)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_job *job = vc5_get_job_for_fbo(vc5);
+        static const union pipe_color_union dummy_color = {};
 
-        /* We can't flag new buffers for clearing once we've queued draws.  We
-         * could avoid this by using the 3d engine to clear.
+        /* The blitter util dereferences the color regardless, even though the
+         * gallium clear API may not pass one in when only Z/S are cleared.
          */
+        if (!color)
+                color = &dummy_color;
+
+        v3d_blitter_save(v3d);
+        util_blitter_clear(v3d->blitter,
+                           v3d->framebuffer.width,
+                           v3d->framebuffer.height,
+                           util_framebuffer_get_num_layers(&v3d->framebuffer),
+                           buffers, color, depth, stencil);
+}
+
+/**
+ * Attempts to perform the GL clear by using the TLB's fast clear at the start
+ * of the frame.
+ */
+static unsigned
+v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
+              const union pipe_color_union *color,
+              double depth, unsigned stencil)
+{
+        struct v3d_context *v3d = job->v3d;
+
         if (job->draw_calls_queued) {
-                perf_debug("Flushing rendering to process new clear.\n");
-                vc5_job_submit(vc5, job);
-                job = vc5_get_job_for_fbo(vc5);
+                /* If anything in the CL has drawn using the buffer, then the
+                 * TLB clear we're trying to add now would happen before that
+                 * drawing.
+                 */
+                buffers &= ~(job->load | job->store);
+        }
+
+        /* GFXH-1461: If we were to emit a load of just depth or just stencil,
+         * then the clear for the other may get lost.  We need to decide now
+         * if it would be possible to need to emit a load of just one after
+         * we've set up our TLB clears.
+         */
+        if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
+            (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
+            job->zsbuf &&
+            util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
+                buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
         }
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
@@ -609,15 +708,15 @@
                 if (!(buffers & bit))
                         continue;
 
-                struct pipe_surface *psurf = vc5->framebuffer.cbufs[i];
-                struct vc5_surface *surf = vc5_surface(psurf);
-                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+                struct pipe_surface *psurf = v3d->framebuffer.cbufs[i];
+                struct v3d_surface *surf = v3d_surface(psurf);
+                struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
                 union util_color uc;
                 uint32_t internal_size = 4 << surf->internal_bpp;
 
                 static union pipe_color_union swapped_color;
-                if (vc5->swap_color_rb & (1 << i)) {
+                if (v3d->swap_color_rb & (1 << i)) {
                         swapped_color.f[0] = color->f[2];
                         swapped_color.f[1] = color->f[1];
                         swapped_color.f[2] = color->f[0];
@@ -662,8 +761,8 @@
 
         unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
         if (zsclear) {
-                struct vc5_resource *rsc =
-                        vc5_resource(vc5->framebuffer.zsbuf->texture);
+                struct v3d_resource *rsc =
+                        v3d_resource(v3d->framebuffer.zsbuf->texture);
 
                 if (zsclear & PIPE_CLEAR_DEPTH)
                         job->clear_z = depth;
@@ -675,16 +774,31 @@
 
         job->draw_min_x = 0;
         job->draw_min_y = 0;
-        job->draw_max_x = vc5->framebuffer.width;
-        job->draw_max_y = vc5->framebuffer.height;
-        job->cleared |= buffers;
-        job->resolve |= buffers;
+        job->draw_max_x = v3d->framebuffer.width;
+        job->draw_max_y = v3d->framebuffer.height;
+        job->clear |= buffers;
+        job->store |= buffers;
 
-        vc5_start_draw(vc5);
+        v3d_start_draw(v3d);
+
+        return buffers;
 }
 
 static void
-vc5_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
+v3d_clear(struct pipe_context *pctx, unsigned buffers,
+          const union pipe_color_union *color, double depth, unsigned stencil)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_job *job = v3d_get_job_for_fbo(v3d);
+
+        buffers &= ~v3d_tlb_clear(job, buffers, color, depth, stencil);
+
+        if (buffers)
+                v3d_draw_clear(v3d, buffers, color, depth, stencil);
+}
+
+static void
+v3d_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                         const union pipe_color_union *color,
                         unsigned x, unsigned y, unsigned w, unsigned h,
                         bool render_condition_enabled)
@@ -693,7 +807,7 @@
 }
 
 static void
-vc5_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
+v3d_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                         unsigned buffers, double depth, unsigned stencil,
                         unsigned x, unsigned y, unsigned w, unsigned h,
                         bool render_condition_enabled)
@@ -704,8 +818,8 @@
 void
 v3dX(draw_init)(struct pipe_context *pctx)
 {
-        pctx->draw_vbo = vc5_draw_vbo;
-        pctx->clear = vc5_clear;
-        pctx->clear_render_target = vc5_clear_render_target;
-        pctx->clear_depth_stencil = vc5_clear_depth_stencil;
+        pctx->draw_vbo = v3d_draw_vbo;
+        pctx->clear = v3d_clear;
+        pctx->clear_render_target = v3d_clear_render_target;
+        pctx->clear_depth_stencil = v3d_clear_depth_stencil;
 }
diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
new file mode 100644
index 0000000..537dd1f
--- /dev/null
+++ b/src/gallium/drivers/v3d/v3dx_emit.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright © 2014-2017 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_format.h"
+#include "util/u_half.h"
+#include "v3d_context.h"
+#include "broadcom/common/v3d_macros.h"
+#include "broadcom/cle/v3dx_pack.h"
+#include "broadcom/compiler/v3d_compiler.h"
+
+static uint8_t
+v3d_factor(enum pipe_blendfactor factor, bool dst_alpha_one)
+{
+        /* We may get a bad blendfactor when blending is disabled. */
+        if (factor == 0)
+                return V3D_BLEND_FACTOR_ZERO;
+
+        switch (factor) {
+        case PIPE_BLENDFACTOR_ZERO:
+                return V3D_BLEND_FACTOR_ZERO;
+        case PIPE_BLENDFACTOR_ONE:
+                return V3D_BLEND_FACTOR_ONE;
+        case PIPE_BLENDFACTOR_SRC_COLOR:
+                return V3D_BLEND_FACTOR_SRC_COLOR;
+        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+                return V3D_BLEND_FACTOR_INV_SRC_COLOR;
+        case PIPE_BLENDFACTOR_DST_COLOR:
+                return V3D_BLEND_FACTOR_DST_COLOR;
+        case PIPE_BLENDFACTOR_INV_DST_COLOR:
+                return V3D_BLEND_FACTOR_INV_DST_COLOR;
+        case PIPE_BLENDFACTOR_SRC_ALPHA:
+                return V3D_BLEND_FACTOR_SRC_ALPHA;
+        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+                return V3D_BLEND_FACTOR_INV_SRC_ALPHA;
+        case PIPE_BLENDFACTOR_DST_ALPHA:
+                return (dst_alpha_one ?
+                        V3D_BLEND_FACTOR_ONE :
+                        V3D_BLEND_FACTOR_DST_ALPHA);
+        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+                return (dst_alpha_one ?
+                        V3D_BLEND_FACTOR_ZERO :
+                        V3D_BLEND_FACTOR_INV_DST_ALPHA);
+        case PIPE_BLENDFACTOR_CONST_COLOR:
+                return V3D_BLEND_FACTOR_CONST_COLOR;
+        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+                return V3D_BLEND_FACTOR_INV_CONST_COLOR;
+        case PIPE_BLENDFACTOR_CONST_ALPHA:
+                return V3D_BLEND_FACTOR_CONST_ALPHA;
+        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+                return V3D_BLEND_FACTOR_INV_CONST_ALPHA;
+        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+                return (dst_alpha_one ?
+                        V3D_BLEND_FACTOR_ZERO :
+                        V3D_BLEND_FACTOR_SRC_ALPHA_SATURATE);
+        default:
+                unreachable("Bad blend factor");
+        }
+}
+
+static inline uint16_t
+swizzled_border_color(const struct v3d_device_info *devinfo,
+                      struct pipe_sampler_state *sampler,
+                      struct v3d_sampler_view *sview,
+                      int chan)
+{
+        const struct util_format_description *desc =
+                util_format_description(sview->base.format);
+        uint8_t swiz = chan;
+
+        /* If we're doing swizzling in the sampler, then only rearrange the
+         * border color for the mismatch between the VC5 texture format and
+         * the PIPE_FORMAT, since GL_ARB_texture_swizzle will be handled by
+         * the sampler's swizzle.
+         *
+         * For swizzling in the shader, we don't do any pre-swizzling of the
+         * border color.
+         */
+        if (v3d_get_tex_return_size(devinfo, sview->base.format,
+                                    sampler->compare_mode) != 32)
+                swiz = desc->swizzle[swiz];
+
+        switch (swiz) {
+        case PIPE_SWIZZLE_0:
+                return util_float_to_half(0.0);
+        case PIPE_SWIZZLE_1:
+                return util_float_to_half(1.0);
+        default:
+                return util_float_to_half(sampler->border_color.f[swiz]);
+        }
+}
+
+#if V3D_VERSION < 40
+static uint32_t
+translate_swizzle(unsigned char pipe_swizzle)
+{
+        switch (pipe_swizzle) {
+        case PIPE_SWIZZLE_0:
+                return 0;
+        case PIPE_SWIZZLE_1:
+                return 1;
+        case PIPE_SWIZZLE_X:
+        case PIPE_SWIZZLE_Y:
+        case PIPE_SWIZZLE_Z:
+        case PIPE_SWIZZLE_W:
+                return 2 + pipe_swizzle;
+        default:
+                unreachable("unknown swizzle");
+        }
+}
+
+static void
+emit_one_texture(struct v3d_context *v3d, struct v3d_texture_stateobj *stage_tex,
+                 int i)
+{
+        struct v3d_job *job = v3d->job;
+        struct pipe_sampler_state *psampler = stage_tex->samplers[i];
+        struct v3d_sampler_state *sampler = v3d_sampler_state(psampler);
+        struct pipe_sampler_view *psview = stage_tex->textures[i];
+        struct v3d_sampler_view *sview = v3d_sampler_view(psview);
+        struct pipe_resource *prsc = psview->texture;
+        struct v3d_resource *rsc = v3d_resource(prsc);
+        const struct v3d_device_info *devinfo = &v3d->screen->devinfo;
+
+        stage_tex->texture_state[i].offset =
+                v3d_cl_ensure_space(&job->indirect,
+                                    cl_packet_length(TEXTURE_SHADER_STATE),
+                                    32);
+        v3d_bo_set_reference(&stage_tex->texture_state[i].bo,
+                             job->indirect.bo);
+
+        uint32_t return_size = v3d_get_tex_return_size(devinfo, psview->format,
+                                                       psampler->compare_mode);
+
+        struct V3D33_TEXTURE_SHADER_STATE unpacked = {
+                /* XXX */
+                .border_color_red = swizzled_border_color(devinfo, psampler,
+                                                          sview, 0),
+                .border_color_green = swizzled_border_color(devinfo, psampler,
+                                                            sview, 1),
+                .border_color_blue = swizzled_border_color(devinfo, psampler,
+                                                           sview, 2),
+                .border_color_alpha = swizzled_border_color(devinfo, psampler,
+                                                            sview, 3),
+
+                /* In the normal texturing path, the LOD gets clamped between
+                 * min/max, and the base_level field (set in the sampler view
+                 * from first_level) only decides where the min/mag switch
+                 * happens, so we need to use the LOD clamps to keep us
+                 * between min and max.
+                 *
+                 * For txf, the LOD clamp is still used, despite GL not
+                 * wanting that.  We will need to have a separate
+                 * TEXTURE_SHADER_STATE that ignores psampler->min/max_lod to
+                 * support txf properly.
+                 */
+                .min_level_of_detail = MIN2(psview->u.tex.first_level +
+                                            MAX2(psampler->min_lod, 0),
+                                            psview->u.tex.last_level),
+                .max_level_of_detail = MIN2(psview->u.tex.first_level +
+                                            psampler->max_lod,
+                                            psview->u.tex.last_level),
+
+                .texture_base_pointer = cl_address(rsc->bo,
+                                                   rsc->slices[0].offset),
+
+                .output_32_bit = return_size == 32,
+        };
+
+        /* Set up the sampler swizzle if we're doing 16-bit sampling.  For
+         * 32-bit, we leave swizzling up to the shader compiler.
+         *
+         * Note: Contrary to the docs, the swizzle still applies even if the
+         * return size is 32.  It's just that you probably want to swizzle in
+         * the shader, because you need the Y/Z/W channels to be defined.
+         */
+        if (return_size == 32) {
+                unpacked.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X);
+                unpacked.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y);
+                unpacked.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z);
+                unpacked.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W);
+        } else {
+                unpacked.swizzle_r = translate_swizzle(sview->swizzle[0]);
+                unpacked.swizzle_g = translate_swizzle(sview->swizzle[1]);
+                unpacked.swizzle_b = translate_swizzle(sview->swizzle[2]);
+                unpacked.swizzle_a = translate_swizzle(sview->swizzle[3]);
+        }
+
+        int min_img_filter = psampler->min_img_filter;
+        int min_mip_filter = psampler->min_mip_filter;
+        int mag_img_filter = psampler->mag_img_filter;
+
+        if (return_size == 32) {
+                min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
+                min_img_filter = PIPE_TEX_FILTER_NEAREST;
+                mag_img_filter = PIPE_TEX_FILTER_NEAREST;
+        }
+
+        bool min_nearest = min_img_filter == PIPE_TEX_FILTER_NEAREST;
+        switch (min_mip_filter) {
+        case PIPE_TEX_MIPFILTER_NONE:
+                unpacked.filter += min_nearest ? 2 : 0;
+                break;
+        case PIPE_TEX_MIPFILTER_NEAREST:
+                unpacked.filter += min_nearest ? 4 : 8;
+                break;
+        case PIPE_TEX_MIPFILTER_LINEAR:
+                unpacked.filter += min_nearest ? 4 : 8;
+                unpacked.filter += 2;
+                break;
+        }
+
+        if (mag_img_filter == PIPE_TEX_FILTER_NEAREST)
+                unpacked.filter++;
+
+        if (psampler->max_anisotropy > 8)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_16_1;
+        else if (psampler->max_anisotropy > 4)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_8_1;
+        else if (psampler->max_anisotropy > 2)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_4_1;
+        else if (psampler->max_anisotropy)
+                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_2_1;
+
+        uint8_t packed[cl_packet_length(TEXTURE_SHADER_STATE)];
+        cl_packet_pack(TEXTURE_SHADER_STATE)(&job->indirect, packed, &unpacked);
+
+        for (int i = 0; i < ARRAY_SIZE(packed); i++)
+                packed[i] |= sview->texture_shader_state[i] | sampler->texture_shader_state[i];
+
+        /* TMU indirect structs need to be 32b aligned. */
+        v3d_cl_ensure_space(&job->indirect, ARRAY_SIZE(packed), 32);
+        cl_emit_prepacked(&job->indirect, &packed);
+}
+
+static void
+emit_textures(struct v3d_context *v3d, struct v3d_texture_stateobj *stage_tex)
+{
+        for (int i = 0; i < stage_tex->num_textures; i++) {
+                if (stage_tex->textures[i])
+                        emit_one_texture(v3d, stage_tex, i);
+        }
+}
+#endif /* V3D_VERSION < 40 */
+
+static uint32_t
+translate_colormask(struct v3d_context *v3d, uint32_t colormask, int rt)
+{
+        if (v3d->swap_color_rb & (1 << rt)) {
+                colormask = ((colormask & (2 | 8)) |
+                             ((colormask & 1) << 2) |
+                             ((colormask & 4) >> 2));
+        }
+
+        return (~colormask) & 0xf;
+}
+
+static void
+emit_rt_blend(struct v3d_context *v3d, struct v3d_job *job,
+              struct pipe_blend_state *blend, int rt)
+{
+        struct pipe_rt_blend_state *rtblend = &blend->rt[rt];
+
+#if V3D_VERSION >= 40
+        /* We don't need to emit blend state for disabled RTs. */
+        if (!rtblend->blend_enable)
+                return;
+#endif
+
+        cl_emit(&job->bcl, BLEND_CFG, config) {
+#if V3D_VERSION >= 40
+                if (blend->independent_blend_enable)
+                        config.render_target_mask = 1 << rt;
+                else
+                        config.render_target_mask = (1 << VC5_MAX_DRAW_BUFFERS) - 1;
+#else
+                assert(rt == 0);
+#endif
+
+                config.color_blend_mode = rtblend->rgb_func;
+                config.color_blend_dst_factor =
+                        v3d_factor(rtblend->rgb_dst_factor,
+                                   v3d->blend_dst_alpha_one);
+                config.color_blend_src_factor =
+                        v3d_factor(rtblend->rgb_src_factor,
+                                   v3d->blend_dst_alpha_one);
+
+                config.alpha_blend_mode = rtblend->alpha_func;
+                config.alpha_blend_dst_factor =
+                        v3d_factor(rtblend->alpha_dst_factor,
+                                   v3d->blend_dst_alpha_one);
+                config.alpha_blend_src_factor =
+                        v3d_factor(rtblend->alpha_src_factor,
+                                   v3d->blend_dst_alpha_one);
+        }
+}
+
+static void
+emit_flat_shade_flags(struct v3d_job *job,
+                      int varying_offset,
+                      uint32_t varyings,
+                      enum V3DX(Varying_Flags_Action) lower,
+                      enum V3DX(Varying_Flags_Action) higher)
+{
+        cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
+                flags.varying_offset_v0 = varying_offset;
+                flags.flat_shade_flags_for_varyings_v024 = varyings;
+                flags.action_for_flat_shade_flags_of_lower_numbered_varyings =
+                        lower;
+                flags.action_for_flat_shade_flags_of_higher_numbered_varyings =
+                        higher;
+        }
+}
+
+#if V3D_VERSION >= 40
+static void
+emit_noperspective_flags(struct v3d_job *job,
+                         int varying_offset,
+                         uint32_t varyings,
+                         enum V3DX(Varying_Flags_Action) lower,
+                         enum V3DX(Varying_Flags_Action) higher)
+{
+        cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
+                flags.varying_offset_v0 = varying_offset;
+                flags.non_perspective_flags_for_varyings_v024 = varyings;
+                flags.action_for_non_perspective_flags_of_lower_numbered_varyings =
+                        lower;
+                flags.action_for_non_perspective_flags_of_higher_numbered_varyings =
+                        higher;
+        }
+}
+
+static void
+emit_centroid_flags(struct v3d_job *job,
+                    int varying_offset,
+                    uint32_t varyings,
+                    enum V3DX(Varying_Flags_Action) lower,
+                    enum V3DX(Varying_Flags_Action) higher)
+{
+        cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
+                flags.varying_offset_v0 = varying_offset;
+                flags.centroid_flags_for_varyings_v024 = varyings;
+                flags.action_for_centroid_flags_of_lower_numbered_varyings =
+                        lower;
+                flags.action_for_centroid_flags_of_higher_numbered_varyings =
+                        higher;
+        }
+}
+#endif /* V3D_VERSION >= 40 */
+
+static bool
+emit_varying_flags(struct v3d_job *job, uint32_t *flags,
+                   void (*flag_emit_callback)(struct v3d_job *job,
+                                              int varying_offset,
+                                              uint32_t flags,
+                                              enum V3DX(Varying_Flags_Action) lower,
+                                              enum V3DX(Varying_Flags_Action) higher))
+{
+        struct v3d_context *v3d = job->v3d;
+        bool emitted_any = false;
+
+        for (int i = 0; i < ARRAY_SIZE(v3d->prog.fs->prog_data.fs->flat_shade_flags); i++) {
+                if (!flags[i])
+                        continue;
+
+                if (emitted_any) {
+                        flag_emit_callback(job, i, flags[i],
+                                           V3D_VARYING_FLAGS_ACTION_UNCHANGED,
+                                           V3D_VARYING_FLAGS_ACTION_UNCHANGED);
+                } else if (i == 0) {
+                        flag_emit_callback(job, i, flags[i],
+                                           V3D_VARYING_FLAGS_ACTION_UNCHANGED,
+                                           V3D_VARYING_FLAGS_ACTION_ZEROED);
+                } else {
+                        flag_emit_callback(job, i, flags[i],
+                                           V3D_VARYING_FLAGS_ACTION_ZEROED,
+                                           V3D_VARYING_FLAGS_ACTION_ZEROED);
+                }
+                emitted_any = true;
+        }
+
+        return emitted_any;
+}
+
+void
+v3dX(emit_state)(struct pipe_context *pctx)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_job *job = v3d->job;
+        bool rasterizer_discard = v3d->rasterizer->base.rasterizer_discard;
+        /* Emit job->bcl packets for each state group flagged in v3d->dirty. */
+        if (v3d->dirty & (VC5_DIRTY_SCISSOR | VC5_DIRTY_VIEWPORT |
+                          VC5_DIRTY_RASTERIZER)) {
+                float *vpscale = v3d->viewport.scale;
+                float *vptranslate = v3d->viewport.translate;
+                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
+                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
+                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
+                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
+
+                /* Clip to the scissor if it's enabled, but still clip to the
+                 * drawable regardless since that controls where the binner
+                 * tries to put things.
+                 *
+                 * Additionally, always clip the rendering to the viewport,
+                 * since the hardware does guardband clipping, meaning
+                 * primitives would rasterize outside of the view volume.
+                 */
+                uint32_t minx, miny, maxx, maxy;
+                if (!v3d->rasterizer->base.scissor) {
+                        minx = MAX2(vp_minx, 0);
+                        miny = MAX2(vp_miny, 0);
+                        maxx = MIN2(vp_maxx, job->draw_width);
+                        maxy = MIN2(vp_maxy, job->draw_height);
+                } else {
+                        minx = MAX2(vp_minx, v3d->scissor.minx);
+                        miny = MAX2(vp_miny, v3d->scissor.miny);
+                        maxx = MIN2(vp_maxx, v3d->scissor.maxx);
+                        maxy = MIN2(vp_maxy, v3d->scissor.maxy);
+                }
+
+                cl_emit(&job->bcl, CLIP_WINDOW, clip) {
+                        clip.clip_window_left_pixel_coordinate = minx;
+                        clip.clip_window_bottom_pixel_coordinate = miny;
+                        if (maxx > minx && maxy > miny) {
+                                clip.clip_window_width_in_pixels = maxx - minx;
+                                clip.clip_window_height_in_pixels = maxy - miny;
+                        } else if (V3D_VERSION < 41) {
+                                /* The HW won't entirely clip out when scissor
+                                 * w/h is 0.  Just treat it the same as
+                                 * rasterizer discard.
+                                 */
+                                rasterizer_discard = true;
+                                clip.clip_window_width_in_pixels = 1;
+                                clip.clip_window_height_in_pixels = 1;
+                        }
+                }
+
+                job->draw_min_x = MIN2(job->draw_min_x, minx);
+                job->draw_min_y = MIN2(job->draw_min_y, miny);
+                job->draw_max_x = MAX2(job->draw_max_x, maxx);
+                job->draw_max_y = MAX2(job->draw_max_y, maxy);
+        }
+        /* CFG_BITS combines rasterizer, depth/stencil and blend settings. */
+        if (v3d->dirty & (VC5_DIRTY_RASTERIZER |
+                          VC5_DIRTY_ZSA |
+                          VC5_DIRTY_BLEND |
+                          VC5_DIRTY_COMPILED_FS)) {
+                cl_emit(&job->bcl, CFG_BITS, config) {
+                        config.enable_forward_facing_primitive =
+                                !rasterizer_discard &&
+                                !(v3d->rasterizer->base.cull_face &
+                                  PIPE_FACE_FRONT);
+                        config.enable_reverse_facing_primitive =
+                                !rasterizer_discard &&
+                                !(v3d->rasterizer->base.cull_face &
+                                  PIPE_FACE_BACK);
+                        /* This seems backwards, but it's what gets the
+                         * clipflat test to pass.
+                         */
+                        config.clockwise_primitives =
+                                v3d->rasterizer->base.front_ccw;
+
+                        config.enable_depth_offset =
+                                v3d->rasterizer->base.offset_tri;
+
+                        /* V3D follows GL behavior where the sample mask only
+                         * applies when MSAA is enabled.  Gallium has sample
+                         * mask apply anyway, and the MSAA blit shaders will
+                         * set sample mask without explicitly setting
+                         * rasterizer oversample.  Just force it on here,
+                         * since the blit shaders are the only way to have
+                         * !multisample && samplemask != 0xf.
+                         */
+                        config.rasterizer_oversample_mode =
+                                v3d->rasterizer->base.multisample ||
+                                v3d->sample_mask != 0xf;
+
+                        config.direct3d_provoking_vertex =
+                                v3d->rasterizer->base.flatshade_first;
+
+                        config.blend_enable = v3d->blend->blend_enables;
+
+                        /* Note: EZ state may update based on the compiled FS,
+                         * along with ZSA
+                         */
+                        config.early_z_updates_enable =
+                                (job->ez_state != VC5_EZ_DISABLED);
+                        if (v3d->zsa->base.depth.enabled) {
+                                config.z_updates_enable =
+                                        v3d->zsa->base.depth.writemask;
+                                config.early_z_enable =
+                                        config.early_z_updates_enable;
+                                config.depth_test_function =
+                                        v3d->zsa->base.depth.func;
+                        } else {
+                                config.depth_test_function = PIPE_FUNC_ALWAYS;
+                        }
+
+                        config.stencil_enable =
+                                v3d->zsa->base.stencil[0].enabled;
+                }
+
+        }
+        /* Depth offset is prepacked; a Z16 zsbuf uses its own variant. */
+        if (v3d->dirty & VC5_DIRTY_RASTERIZER &&
+            v3d->rasterizer->base.offset_tri) {
+                if (job->zsbuf &&
+                    job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) {
+                        cl_emit_prepacked_sized(&job->bcl,
+                                                v3d->rasterizer->depth_offset_z16,
+                                                cl_packet_length(DEPTH_OFFSET));
+                } else {
+                        cl_emit_prepacked_sized(&job->bcl,
+                                                v3d->rasterizer->depth_offset,
+                                                cl_packet_length(DEPTH_OFFSET));
+                }
+        }
+
+        if (v3d->dirty & VC5_DIRTY_RASTERIZER) {
+                cl_emit(&job->bcl, POINT_SIZE, point_size) {
+                        point_size.point_size = v3d->rasterizer->point_size;
+                }
+
+                cl_emit(&job->bcl, LINE_WIDTH, line_width) {
+                        line_width.line_width = v3d->rasterizer->base.line_width;
+                }
+        }
+        /* Viewport scale/translate feed the clipper and viewport offset. */
+        if (v3d->dirty & VC5_DIRTY_VIEWPORT) {
+                cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+                        clip.viewport_half_width_in_1_256th_of_pixel =
+                                v3d->viewport.scale[0] * 256.0f;
+                        clip.viewport_half_height_in_1_256th_of_pixel =
+                                v3d->viewport.scale[1] * 256.0f;
+                }
+
+                cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+                        clip.viewport_z_offset_zc_to_zs =
+                                v3d->viewport.translate[2];
+                        clip.viewport_z_scale_zc_to_zs =
+                                v3d->viewport.scale[2];
+                }
+                cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
+                        float z1 = (v3d->viewport.translate[2] -
+                                    v3d->viewport.scale[2]);
+                        float z2 = (v3d->viewport.translate[2] +
+                                    v3d->viewport.scale[2]);
+                        clip.minimum_zw = MIN2(z1, z2);
+                        clip.maximum_zw = MAX2(z1, z2);
+                }
+
+                cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
+                        vp.viewport_centre_x_coordinate =
+                                v3d->viewport.translate[0];
+                        vp.viewport_centre_y_coordinate =
+                                v3d->viewport.translate[1];
+                }
+        }
+
+        if (v3d->dirty & VC5_DIRTY_BLEND) {
+                struct v3d_blend_state *blend = v3d->blend;
+
+                if (blend->blend_enables) {
+#if V3D_VERSION >= 40
+                        cl_emit(&job->bcl, BLEND_ENABLES, enables) {
+                                enables.mask = blend->blend_enables;
+                        }
+#endif
+
+                        if (blend->base.independent_blend_enable) {
+                                for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++)
+                                        emit_rt_blend(v3d, job, &blend->base, i);
+                        } else {
+                                emit_rt_blend(v3d, job, &blend->base, 0);
+                        }
+                }
+        }
+        /* Pack one 4-bit colormask per RT; RT 0's is shared if !independent. */
+        if (v3d->dirty & VC5_DIRTY_BLEND) {
+                struct pipe_blend_state *blend = &v3d->blend->base;
+
+                cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
+                        for (int i = 0; i < 4; i++) {
+                                int rt = blend->independent_blend_enable ? i : 0;
+                                int rt_mask = blend->rt[rt].colormask;
+
+                                mask.mask |= translate_colormask(v3d, rt_mask,
+                                                                 i) << (4 * i);
+                        }
+                }
+        }
+
+        /* GFXH-1431: On V3D 3.x, writing BLEND_CONFIG resets the constant
+         * color.
+         */
+        if (v3d->dirty & VC5_DIRTY_BLEND_COLOR ||
+            (V3D_VERSION < 41 && (v3d->dirty & VC5_DIRTY_BLEND))) {
+                cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
+                        color.red_f16 = (v3d->swap_color_rb ?
+                                          v3d->blend_color.hf[2] :
+                                          v3d->blend_color.hf[0]);
+                        color.green_f16 = v3d->blend_color.hf[1];
+                        color.blue_f16 = (v3d->swap_color_rb ?
+                                           v3d->blend_color.hf[0] :
+                                           v3d->blend_color.hf[2]);
+                        color.alpha_f16 = v3d->blend_color.hf[3];
+                }
+        }
+
+        if (v3d->dirty & (VC5_DIRTY_ZSA | VC5_DIRTY_STENCIL_REF)) {
+                struct pipe_stencil_state *front = &v3d->zsa->base.stencil[0];
+                struct pipe_stencil_state *back = &v3d->zsa->base.stencil[1];
+
+                if (front->enabled) {
+                        cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
+                                               v3d->zsa->stencil_front, config) {
+                                config.stencil_ref_value =
+                                        v3d->stencil_ref.ref_value[0];
+                        }
+                }
+
+                if (back->enabled) {
+                        cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
+                                               v3d->zsa->stencil_back, config) {
+                                config.stencil_ref_value =
+                                        v3d->stencil_ref.ref_value[1];
+                        }
+                }
+        }
+
+#if V3D_VERSION < 40
+        /* Pre-4.x, we have texture state that depends on both the sampler and
+         * the view, so we merge them together at draw time.
+         */
+        if (v3d->dirty & VC5_DIRTY_FRAGTEX)
+                emit_textures(v3d, &v3d->fragtex);
+
+        if (v3d->dirty & VC5_DIRTY_VERTTEX)
+                emit_textures(v3d, &v3d->verttex);
+#endif
+
+        if (v3d->dirty & VC5_DIRTY_FLAT_SHADE_FLAGS) {
+                if (!emit_varying_flags(job,
+                                        v3d->prog.fs->prog_data.fs->flat_shade_flags,
+                                        emit_flat_shade_flags)) {
+                        cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
+                }
+        }
+
+#if V3D_VERSION >= 40
+        if (v3d->dirty & VC5_DIRTY_NOPERSPECTIVE_FLAGS) {
+                if (!emit_varying_flags(job,
+                                        v3d->prog.fs->prog_data.fs->noperspective_flags,
+                                        emit_noperspective_flags)) {
+                        cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
+                }
+        }
+
+        if (v3d->dirty & VC5_DIRTY_CENTROID_FLAGS) {
+                if (!emit_varying_flags(job,
+                                        v3d->prog.fs->prog_data.fs->centroid_flags,
+                                        emit_centroid_flags)) {
+                        cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
+                }
+        }
+#endif
+
+        /* Set up the transform feedback data specs (which VPM entries to
+         * output to which buffers).
+         */
+        if (v3d->dirty & (VC5_DIRTY_STREAMOUT |
+                          VC5_DIRTY_RASTERIZER |
+                          VC5_DIRTY_PRIM_MODE)) {
+                struct v3d_streamout_stateobj *so = &v3d->streamout;
+
+                if (so->num_targets) {
+                        bool psiz_per_vertex = (v3d->prim_mode == PIPE_PRIM_POINTS &&
+                                                v3d->rasterizer->base.point_size_per_vertex);
+                        uint16_t *tf_specs = (psiz_per_vertex ?
+                                              v3d->prog.bind_vs->tf_specs_psiz :
+                                              v3d->prog.bind_vs->tf_specs);
+
+#if V3D_VERSION >= 40
+                        job->tf_enabled = (v3d->prog.bind_vs->num_tf_specs != 0 &&
+                                           v3d->active_queries);
+
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
+                                tfe.number_of_16_bit_output_data_specs_following =
+                                        v3d->prog.bind_vs->num_tf_specs;
+                                tfe.enable = job->tf_enabled;
+                        };
+#else /* V3D_VERSION < 40 */
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_ENABLE, tfe) {
+                                tfe.number_of_32_bit_output_buffer_address_following =
+                                        so->num_targets;
+                                tfe.number_of_16_bit_output_data_specs_following =
+                                        v3d->prog.bind_vs->num_tf_specs;
+                        };
+#endif /* V3D_VERSION < 40 */
+                        for (int i = 0; i < v3d->prog.bind_vs->num_tf_specs; i++) {
+                                cl_emit_prepacked(&job->bcl, &tf_specs[i]);
+                        }
+                } else if (job->tf_enabled) {
+#if V3D_VERSION >= 40
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
+                                tfe.enable = false;
+                        };
+                        job->tf_enabled = false;
+#endif /* V3D_VERSION >= 40 */
+                }
+        }
+
+        /* Set up the transform feedback buffers. */
+        if (v3d->dirty & VC5_DIRTY_STREAMOUT) {
+                struct v3d_streamout_stateobj *so = &v3d->streamout;
+                for (int i = 0; i < so->num_targets; i++) {
+                        const struct pipe_stream_output_target *target =
+                                so->targets[i];
+                        struct v3d_resource *rsc = target ?
+                                v3d_resource(target->buffer) : NULL;
+                        struct pipe_shader_state *vs = &v3d->prog.bind_vs->base;
+                        struct pipe_stream_output_info *info = &vs->stream_output;
+                        uint32_t offset = (v3d->streamout.offsets[i] *
+                                           info->stride[i] * 4);
+
+#if V3D_VERSION >= 40
+                        if (!target)
+                                continue;
+
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_BUFFER, output) {
+                                output.buffer_address =
+                                        cl_address(rsc->bo,
+                                                   target->buffer_offset +
+                                                   offset);
+                                output.buffer_size_in_32_bit_words =
+                                        (target->buffer_size - offset) >> 2;
+                                output.buffer_number = i;
+                        }
+#else /* V3D_VERSION < 40 */
+                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_OUTPUT_ADDRESS, output) {
+                                if (target) {
+                                        output.address =
+                                                cl_address(rsc->bo,
+                                                           target->buffer_offset +
+                                                           offset);
+                                }
+                        };
+#endif /* V3D_VERSION < 40 */
+                        if (target) {
+                                v3d_job_add_write_resource(v3d->job,
+                                                           target->buffer);
+                        }
+                        /* XXX: buffer_size? */
+                }
+        }
+        /* Counter address is set only when active_queries && current_oq. */
+        if (v3d->dirty & VC5_DIRTY_OQ) {
+                cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
+                        job->oq_enabled = v3d->active_queries && v3d->current_oq;
+                        if (job->oq_enabled) {
+                                counter.address = cl_address(v3d->current_oq, 0);
+                        }
+                }
+        }
+
+#if V3D_VERSION >= 40
+        if (v3d->dirty & VC5_DIRTY_SAMPLE_STATE) {
+                cl_emit(&job->bcl, SAMPLE_STATE, state) {
+                        /* Note: SampleCoverage was handled at the
+                         * state_tracker level by converting to sample_mask.
+                         */
+                        state.coverage = 1.0;
+                        state.mask = job->msaa ? v3d->sample_mask : 0xf;
+                }
+        }
+#endif
+}
diff --git a/src/gallium/drivers/vc5/v3dx_format_table.c b/src/gallium/drivers/v3d/v3dx_format_table.c
similarity index 97%
rename from src/gallium/drivers/vc5/v3dx_format_table.c
rename to src/gallium/drivers/v3d/v3dx_format_table.c
index cc356fc..c0caba7 100644
--- a/src/gallium/drivers/vc5/v3dx_format_table.c
+++ b/src/gallium/drivers/v3d/v3dx_format_table.c
@@ -23,10 +23,10 @@
 
 #include "util/u_format.h"
 
-#include "vc5_context.h"
+#include "v3d_context.h"
 #include "broadcom/cle/v3dx_pack.h"
 #include "broadcom/common/v3d_macros.h"
-#include "vc5_format_table.h"
+#include "v3d_format_table.h"
 
 #define SWIZ(x,y,z,w) {          \
         PIPE_SWIZZLE_##x, \
@@ -58,7 +58,7 @@
 #define SWIZ_XXXX	SWIZ(X, X, X, X)
 #define SWIZ_000X	SWIZ(0, 0, 0, X)
 
-static const struct vc5_format format_table[] = {
+static const struct v3d_format format_table[] = {
         FORMAT(B8G8R8A8_UNORM,    RGBA8,        RGBA8,       SWIZ_ZYXW, 16, 0),
         FORMAT(B8G8R8X8_UNORM,    RGBA8,        RGBA8,       SWIZ_ZYX1, 16, 0),
         FORMAT(B8G8R8A8_SRGB,     SRGB8_ALPHA8, RGBA8,       SWIZ_ZYXW, 16, 0),
@@ -145,12 +145,12 @@
 #if V3D_VERSION >= 40
         FORMAT(S8_UINT_Z24_UNORM, D24S8,        DEPTH24_X8,  SWIZ_XXXX, 32, 1),
         FORMAT(X8Z24_UNORM,       D24S8,        DEPTH24_X8,  SWIZ_XXXX, 32, 1),
-        FORMAT(S8X24_UINT,        S8,           R32F,        SWIZ_XXXX, 32, 1),
-        FORMAT(Z32_FLOAT,         D32F,         R32F,        SWIZ_XXXX, 32, 1),
+        FORMAT(S8X24_UINT,        S8,           DEPTH_COMP32F, SWIZ_XXXX, 32, 1),
+        FORMAT(Z32_FLOAT,         D32F,         DEPTH_COMP32F, SWIZ_XXXX, 32, 1),
         FORMAT(Z16_UNORM,         D16,          DEPTH_COMP16,SWIZ_XXXX, 32, 1),
 
         /* Pretend we support this, but it'll be separate Z32F depth and S8. */
-        FORMAT(Z32_FLOAT_S8X24_UINT, D32F,      R32F,        SWIZ_XXXX, 32, 1),
+        FORMAT(Z32_FLOAT_S8X24_UINT, D32F,      DEPTH_COMP32F, SWIZ_XXXX, 32, 1),
 #else
         FORMAT(S8_UINT_Z24_UNORM, ZS_DEPTH24_STENCIL8, DEPTH24_X8, SWIZ_XXXX, 32, 1),
         FORMAT(X8Z24_UNORM,       ZS_DEPTH24_STENCIL8, DEPTH24_X8, SWIZ_XXXX, 32, 1),
@@ -178,7 +178,7 @@
         FORMAT(DXT5_RGBA,         NO,           BC3,         SWIZ_XYZ1, 16, 0),
 };
 
-const struct vc5_format *
+const struct v3d_format *
 v3dX(get_format_desc)(enum pipe_format f)
 {
         if (f < ARRAY_SIZE(format_table) && format_table[f].present)
diff --git a/src/gallium/drivers/vc5/v3dx_job.c b/src/gallium/drivers/v3d/v3dx_job.c
similarity index 84%
rename from src/gallium/drivers/vc5/v3dx_job.c
rename to src/gallium/drivers/v3d/v3dx_job.c
index ca3831c..d87dcea 100644
--- a/src/gallium/drivers/vc5/v3dx_job.c
+++ b/src/gallium/drivers/v3d/v3dx_job.c
@@ -27,17 +27,16 @@
  * kernel.
  */
 
-#include "vc5_context.h"
+#include "v3d_context.h"
 #include "broadcom/cle/v3dx_pack.h"
 
-void v3dX(bcl_epilogue)(struct vc5_context *vc5, struct vc5_job *job)
+void v3dX(bcl_epilogue)(struct v3d_context *v3d, struct v3d_job *job)
 {
-                vc5_cl_ensure_space_with_branch(&job->bcl,
+                v3d_cl_ensure_space_with_branch(&job->bcl,
                                                 cl_packet_length(OCCLUSION_QUERY_COUNTER) +
 #if V3D_VERSION >= 41
                                                 cl_packet_length(TRANSFORM_FEEDBACK_SPECS) +
 #endif
-                                                cl_packet_length(INCREMENT_SEMAPHORE) +
                                                 cl_packet_length(FLUSH_ALL_STATE));
 
                 if (job->oq_enabled) {
@@ -61,12 +60,6 @@
                 }
 #endif /* V3D_VERSION >= 41 */
 
-                /* Increment the semaphore indicating that binning is done and
-                 * unblocking the render thread.  Note that this doesn't act
-                 * until the FLUSH completes.
-                 */
-                cl_emit(&job->bcl, INCREMENT_SEMAPHORE, incr);
-
                 /* The FLUSH_ALL emits any unwritten state changes in each
                  * tile.  We can use this to reset any state that needs to be
                  * present at the start of the next tile, as we do with
diff --git a/src/gallium/drivers/vc5/vc5_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
similarity index 76%
rename from src/gallium/drivers/vc5/vc5_rcl.c
rename to src/gallium/drivers/v3d/v3dx_rcl.c
index 2b1309bc..3a76b0f 100644
--- a/src/gallium/drivers/vc5/vc5_rcl.c
+++ b/src/gallium/drivers/v3d/v3dx_rcl.c
@@ -22,8 +22,8 @@
  */
 
 #include "util/u_format.h"
-#include "vc5_context.h"
-#include "vc5_tiling.h"
+#include "v3d_context.h"
+#include "v3d_tiling.h"
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
 
@@ -40,7 +40,7 @@
  * dummy store.
  */
 static void
-flush_last_load(struct vc5_cl *cl)
+flush_last_load(struct v3d_cl *cl)
 {
         if (V3D_VERSION >= 40)
                 return;
@@ -52,17 +52,17 @@
 }
 
 static void
-load_general(struct vc5_cl *cl, struct pipe_surface *psurf, int buffer,
+load_general(struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
              uint32_t pipe_bit, uint32_t *loads_pending)
 {
-        struct vc5_surface *surf = vc5_surface(psurf);
+        struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
         if (separate_stencil) {
                 psurf = surf->separate_stencil;
-                surf = vc5_surface(psurf);
+                surf = v3d_surface(psurf);
         }
 
-        struct vc5_resource *rsc = vc5_resource(psurf->texture);
+        struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
         cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
                 load.buffer_to_load = buffer;
@@ -80,12 +80,16 @@
                         load.height_in_ub_or_stride =
                                 surf->padded_height_of_output_image_in_uif_blocks;
                 } else if (surf->tiling == VC5_TILING_RASTER) {
-                        struct vc5_resource_slice *slice =
+                        struct v3d_resource_slice *slice =
                                 &rsc->slices[psurf->u.tex.level];
                         load.height_in_ub_or_stride = slice->stride;
                 }
 
-                /* XXX: MSAA */
+                if (psurf->texture->nr_samples > 1)
+                        load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
+                else
+                        load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
+
 #else /* V3D_VERSION < 40 */
                 /* Can't do raw ZSTENCIL loads -- need to load/store them to
                  * separate buffers for Z and stencil.
@@ -103,21 +107,21 @@
 }
 
 static void
-store_general(struct vc5_job *job,
-              struct vc5_cl *cl, struct pipe_surface *psurf, int buffer,
+store_general(struct v3d_job *job,
+              struct v3d_cl *cl, struct pipe_surface *psurf, int buffer,
               int pipe_bit, uint32_t *stores_pending, bool general_color_clear)
 {
-        struct vc5_surface *surf = vc5_surface(psurf);
+        struct v3d_surface *surf = v3d_surface(psurf);
         bool separate_stencil = surf->separate_stencil && buffer == STENCIL;
         if (separate_stencil) {
                 psurf = surf->separate_stencil;
-                surf = vc5_surface(psurf);
+                surf = v3d_surface(psurf);
         }
 
         *stores_pending &= ~pipe_bit;
         bool last_store = !(*stores_pending);
 
-        struct vc5_resource *rsc = vc5_resource(psurf->texture);
+        struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
         rsc->writes++;
 
@@ -126,10 +130,7 @@
                 store.address = cl_address(rsc->bo, surf->offset);
 
 #if V3D_VERSION >= 40
-                store.clear_buffer_being_stored =
-                        ((job->cleared & pipe_bit) &&
-                         (general_color_clear ||
-                          !(pipe_bit & PIPE_CLEAR_COLOR_BUFFERS)));
+                store.clear_buffer_being_stored = false;
 
                 if (separate_stencil)
                         store.output_image_format = V3D_OUTPUT_IMAGE_FORMAT_S8;
@@ -143,10 +144,16 @@
                         store.height_in_ub_or_stride =
                                 surf->padded_height_of_output_image_in_uif_blocks;
                 } else if (surf->tiling == VC5_TILING_RASTER) {
-                        struct vc5_resource_slice *slice =
+                        struct v3d_resource_slice *slice =
                                 &rsc->slices[psurf->u.tex.level];
                         store.height_in_ub_or_stride = slice->stride;
                 }
+
+                if (psurf->texture->nr_samples > 1)
+                        store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
+                else
+                        store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
+
 #else /* V3D_VERSION < 40 */
                 /* Can't do raw ZSTENCIL stores -- need to load/store them to
                  * separate buffers for Z and stencil.
@@ -154,18 +161,18 @@
                 assert(buffer != ZSTENCIL);
                 store.raw_mode = true;
                 if (!last_store) {
-                        store.disable_colour_buffers_clear_on_write = true;
+                        store.disable_color_buffers_clear_on_write = true;
                         store.disable_z_buffer_clear_on_write = true;
                         store.disable_stencil_buffer_clear_on_write = true;
                 } else {
-                        store.disable_colour_buffers_clear_on_write =
+                        store.disable_color_buffers_clear_on_write =
                                 !(((pipe_bit & PIPE_CLEAR_COLOR_BUFFERS) &&
                                    general_color_clear &&
-                                   (job->cleared & pipe_bit)));
+                                   (job->clear & pipe_bit)));
                         store.disable_z_buffer_clear_on_write =
-                                !(job->cleared & PIPE_CLEAR_DEPTH);
+                                !(job->clear & PIPE_CLEAR_DEPTH);
                         store.disable_stencil_buffer_clear_on_write =
-                                !(job->cleared & PIPE_CLEAR_STENCIL);
+                                !(job->clear & PIPE_CLEAR_STENCIL);
                 }
                 store.padded_height_of_output_image_in_uif_blocks =
                         surf->padded_height_of_output_image_in_uif_blocks;
@@ -194,9 +201,9 @@
 }
 
 static void
-vc5_rcl_emit_loads(struct vc5_job *job, struct vc5_cl *cl)
+v3d_rcl_emit_loads(struct v3d_job *job, struct v3d_cl *cl)
 {
-        uint32_t loads_pending = job->resolve & ~job->cleared;
+        uint32_t loads_pending = job->load;
 
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 uint32_t bit = PIPE_CLEAR_COLOR0 << i;
@@ -216,10 +223,22 @@
         if ((loads_pending & PIPE_CLEAR_DEPTHSTENCIL) &&
             (V3D_VERSION >= 40 ||
              (job->zsbuf && job->zsbuf->texture->nr_samples > 1))) {
-                load_general(cl, job->zsbuf,
-                             zs_buffer_from_pipe_bits(loads_pending),
-                             PIPE_CLEAR_DEPTHSTENCIL,
-                             &loads_pending);
+                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
+
+                if (rsc->separate_stencil &&
+                    (loads_pending & PIPE_CLEAR_STENCIL)) {
+                        load_general(cl, job->zsbuf,
+                                     STENCIL,
+                                     PIPE_CLEAR_STENCIL,
+                                     &loads_pending);
+                }
+
+                if (loads_pending & PIPE_CLEAR_DEPTHSTENCIL) {
+                        load_general(cl, job->zsbuf,
+                                     zs_buffer_from_pipe_bits(loads_pending),
+                                     loads_pending & PIPE_CLEAR_DEPTHSTENCIL,
+                                     &loads_pending);
+                }
         }
 
 #if V3D_VERSION < 40
@@ -227,8 +246,8 @@
          * tile coordinates.
          */
         if (loads_pending) {
-                cl_emit(cl, RELOAD_TILE_COLOUR_BUFFER, load) {
-                        load.disable_colour_buffer_load =
+                cl_emit(cl, RELOAD_TILE_COLOR_BUFFER, load) {
+                        load.disable_color_buffer_load =
                                 (~loads_pending &
                                  PIPE_CLEAR_COLOR_BUFFERS) >>
                                 PIPE_FIRST_COLOR_BUFFER_BIT;
@@ -245,11 +264,12 @@
 }
 
 static void
-vc5_rcl_emit_stores(struct vc5_job *job, struct vc5_cl *cl)
+v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl)
 {
-        MAYBE_UNUSED bool needs_color_clear = job->cleared & PIPE_CLEAR_COLOR_BUFFERS;
-        MAYBE_UNUSED bool needs_z_clear = job->cleared & PIPE_CLEAR_DEPTH;
-        MAYBE_UNUSED bool needs_s_clear = job->cleared & PIPE_CLEAR_STENCIL;
+#if V3D_VERSION < 40
+        MAYBE_UNUSED bool needs_color_clear = job->clear & PIPE_CLEAR_COLOR_BUFFERS;
+        MAYBE_UNUSED bool needs_z_clear = job->clear & PIPE_CLEAR_DEPTH;
+        MAYBE_UNUSED bool needs_s_clear = job->clear & PIPE_CLEAR_STENCIL;
 
         /* For clearing color in a TLB general on V3D 3.3:
          *
@@ -266,10 +286,13 @@
          * TLB color buffers.
          */
         bool general_color_clear = (needs_color_clear &&
-                                    (job->cleared & PIPE_CLEAR_COLOR_BUFFERS) ==
-                                    (job->resolve & PIPE_CLEAR_COLOR_BUFFERS));
+                                    (job->clear & PIPE_CLEAR_COLOR_BUFFERS) ==
+                                    (job->store & PIPE_CLEAR_COLOR_BUFFERS));
+#else
+        bool general_color_clear = false;
+#endif
 
-        uint32_t stores_pending = job->resolve;
+        uint32_t stores_pending = job->store;
 
         /* For V3D 4.1, use general stores for all TLB stores.
          *
@@ -282,7 +305,7 @@
          */
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
                 uint32_t bit = PIPE_CLEAR_COLOR0 << i;
-                if (!(job->resolve & bit))
+                if (!(job->store & bit))
                         continue;
 
                 struct pipe_surface *psurf = job->cbufs[i];
@@ -295,18 +318,18 @@
                               &stores_pending, general_color_clear);
         }
 
-        if (job->resolve & PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf &&
+        if (job->store & PIPE_CLEAR_DEPTHSTENCIL && job->zsbuf &&
             !(V3D_VERSION < 40 && job->zsbuf->texture->nr_samples <= 1)) {
-                struct vc5_resource *rsc = vc5_resource(job->zsbuf->texture);
+                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
                 if (rsc->separate_stencil) {
-                        if (job->resolve & PIPE_CLEAR_DEPTH) {
+                        if (job->store & PIPE_CLEAR_DEPTH) {
                                 store_general(job, cl, job->zsbuf, Z,
                                               PIPE_CLEAR_DEPTH,
                                               &stores_pending,
                                               general_color_clear);
                         }
 
-                        if (job->resolve & PIPE_CLEAR_STENCIL) {
+                        if (job->store & PIPE_CLEAR_STENCIL) {
                                 store_general(job, cl, job->zsbuf, STENCIL,
                                               PIPE_CLEAR_STENCIL,
                                               &stores_pending,
@@ -314,14 +337,14 @@
                         }
                 } else {
                         store_general(job, cl, job->zsbuf,
-                                      zs_buffer_from_pipe_bits(job->resolve),
-                                      job->resolve & PIPE_CLEAR_DEPTHSTENCIL,
+                                      zs_buffer_from_pipe_bits(job->store),
+                                      job->store & PIPE_CLEAR_DEPTHSTENCIL,
                                       &stores_pending, general_color_clear);
                 }
         }
 
-        if (stores_pending) {
 #if V3D_VERSION < 40
+        if (stores_pending) {
                 cl_emit(cl, STORE_MULTI_SAMPLE_RESOLVED_TILE_COLOR_BUFFER_EXTENDED, store) {
 
                         store.disable_color_buffer_write =
@@ -333,41 +356,47 @@
                         /* Note that when set this will clear all of the color
                          * buffers.
                          */
-                        store.disable_colour_buffers_clear_on_write =
+                        store.disable_color_buffers_clear_on_write =
                                 !needs_color_clear;
                         store.disable_z_buffer_clear_on_write =
                                 !needs_z_clear;
                         store.disable_stencil_buffer_clear_on_write =
                                 !needs_s_clear;
                 };
-#else /* V3D_VERSION >= 40 */
-                unreachable("All color buffers should have been stored.");
-#endif /* V3D_VERSION >= 40 */
         } else if (needs_color_clear && !general_color_clear) {
                 /* If we didn't do our color clears in the general packet,
                  * then emit a packet to clear all the TLB color buffers now.
                  */
-#if V3D_VERSION < 40
                 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
                         store.buffer_to_store = NONE;
                 }
+        }
 #else /* V3D_VERSION >= 40 */
+        assert(!stores_pending);
+
+        /* GFXH-1461/GFXH-1689: The per-buffer store command's clear
+         * buffer bit is broken for depth/stencil.  In addition, the
+         * clear packet's Z/S bit is broken, but the RTs bit ends up
+         * clearing Z/S.
+         */
+        if (job->clear) {
                 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+                        clear.clear_z_stencil_buffer = true;
                         clear.clear_all_render_targets = true;
                 }
-#endif /* V3D_VERSION >= 40 */
         }
+#endif /* V3D_VERSION >= 40 */
 }
 
 static void
-vc5_rcl_emit_generic_per_tile_list(struct vc5_job *job, int last_cbuf)
+v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int last_cbuf)
 {
         /* Emit the generic list in our indirect state -- the rcl will just
          * have pointers into it.
          */
-        struct vc5_cl *cl = &job->indirect;
-        vc5_cl_ensure_space(cl, 200, 1);
-        struct vc5_cl_reloc tile_list_start = cl_get_address(cl);
+        struct v3d_cl *cl = &job->indirect;
+        v3d_cl_ensure_space(cl, 200, 1);
+        struct v3d_cl_reloc tile_list_start = cl_get_address(cl);
 
         if (V3D_VERSION >= 40) {
                 /* V3D 4.x only requires a single tile coordinates, and
@@ -376,7 +405,7 @@
                 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
         }
 
-        vc5_rcl_emit_loads(job, cl);
+        v3d_rcl_emit_loads(job, cl);
 
         if (V3D_VERSION < 40) {
                 /* Tile Coordinates triggers the last reload and sets where
@@ -388,14 +417,13 @@
         /* The binner starts out writing tiles assuming that the initial mode
          * is triangles, so make sure that's the case.
          */
-        cl_emit(cl, PRIMITIVE_LIST_FORMAT, fmt) {
-                fmt.data_type = LIST_INDEXED;
+        cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
                 fmt.primitive_type = LIST_TRIANGLES;
         }
 
         cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
 
-        vc5_rcl_emit_stores(job, cl);
+        v3d_rcl_emit_stores(job, cl);
 
 #if V3D_VERSION >= 40
         cl_emit(cl, END_OF_TILE_MARKER, end);
@@ -411,13 +439,13 @@
 
 #if V3D_VERSION >= 40
 static void
-v3d_setup_render_target(struct vc5_job *job, int cbuf,
+v3d_setup_render_target(struct v3d_job *job, int cbuf,
                         uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
 {
         if (!job->cbufs[cbuf])
                 return;
 
-        struct vc5_surface *surf = vc5_surface(job->cbufs[cbuf]);
+        struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
         *rt_bpp = surf->internal_bpp;
         *rt_type = surf->internal_type;
         *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
@@ -426,10 +454,10 @@
 #else /* V3D_VERSION < 40 */
 
 static void
-v3d_emit_z_stencil_config(struct vc5_job *job, struct vc5_surface *surf,
-                          struct vc5_resource *rsc, bool is_separate_stencil)
+v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
+                          struct v3d_resource *rsc, bool is_separate_stencil)
 {
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CONFIG, zs) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_Z_STENCIL, zs) {
                 zs.address = cl_address(rsc->bo, surf->offset);
 
                 if (!is_separate_stencil) {
@@ -446,9 +474,9 @@
                 zs.memory_format = surf->tiling;
         }
 
-        if (job->resolve & (is_separate_stencil ?
-                            PIPE_CLEAR_STENCIL :
-                            PIPE_CLEAR_DEPTHSTENCIL)) {
+        if (job->store & (is_separate_stencil ?
+                          PIPE_CLEAR_STENCIL :
+                          PIPE_CLEAR_DEPTHSTENCIL)) {
                 rsc->writes++;
         }
 }
@@ -457,15 +485,15 @@
 #define div_round_up(a, b) (((a) + (b) - 1) / b)
 
 void
-v3dX(emit_rcl)(struct vc5_job *job)
+v3dX(emit_rcl)(struct v3d_job *job)
 {
         /* The RCL list should be empty. */
         assert(!job->rcl.bo);
 
-        vc5_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
+        v3d_cl_ensure_space_with_branch(&job->rcl, 200 + 256 *
                                         cl_packet_length(SUPERTILE_COORDINATES));
         job->submit.rcl_start = job->rcl.bo->offset;
-        vc5_job_add_bo(job, job->rcl.bo);
+        v3d_job_add_bo(job, job->rcl.bo);
 
         int nr_cbufs = 0;
         for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
@@ -473,18 +501,17 @@
                         nr_cbufs = i + 1;
         }
 
-        /* Comon config must be the first TILE_RENDERING_MODE_CONFIGURATION
+        /* Common config must be the first TILE_RENDERING_MODE_CFG
          * and Z_STENCIL_CLEAR_VALUES must be last.  The ones in between are
          * optional updates to the previous HW state.
          */
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_COMMON_CONFIGURATION,
-                config) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
 #if V3D_VERSION < 40
-                config.enable_z_store = job->resolve & PIPE_CLEAR_DEPTH;
-                config.enable_stencil_store = job->resolve & PIPE_CLEAR_STENCIL;
+                config.enable_z_store = job->store & PIPE_CLEAR_DEPTH;
+                config.enable_stencil_store = job->store & PIPE_CLEAR_STENCIL;
 #else /* V3D_VERSION >= 40 */
                 if (job->zsbuf) {
-                        struct vc5_surface *surf = vc5_surface(job->zsbuf);
+                        struct v3d_surface *surf = v3d_surface(job->zsbuf);
                         config.internal_depth_type = surf->internal_type;
                 }
 #endif /* V3D_VERSION >= 40 */
@@ -510,8 +537,7 @@
                 config.image_width_pixels = job->draw_width;
                 config.image_height_pixels = job->draw_height;
 
-                config.number_of_render_targets_minus_1 =
-                        MAX2(nr_cbufs, 1) - 1;
+                config.number_of_render_targets = MAX2(nr_cbufs, 1);
 
                 config.multisample_mode_4x = job->msaa;
 
@@ -522,8 +548,8 @@
                 struct pipe_surface *psurf = job->cbufs[i];
                 if (!psurf)
                         continue;
-                struct vc5_surface *surf = vc5_surface(psurf);
-                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+                struct v3d_surface *surf = v3d_surface(psurf);
+                struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
                 MAYBE_UNUSED uint32_t config_pad = 0;
                 uint32_t clear_pad = 0;
@@ -531,7 +557,7 @@
                 /* XXX: Set the pad for raster. */
                 if (surf->tiling == VC5_TILING_UIF_NO_XOR ||
                     surf->tiling == VC5_TILING_UIF_XOR) {
-                        int uif_block_height = vc5_utile_height(rsc->cpp) * 2;
+                        int uif_block_height = v3d_utile_height(rsc->cpp) * 2;
                         uint32_t implicit_padded_height = (align(job->draw_height, uif_block_height) /
                                                            uif_block_height);
                         if (surf->padded_height_of_output_image_in_uif_blocks -
@@ -545,7 +571,7 @@
                 }
 
 #if V3D_VERSION < 40
-                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
                         rt.address = cl_address(rsc->bo, surf->offset);
                         rt.internal_type = surf->internal_type;
                         rt.output_image_format = surf->format;
@@ -554,12 +580,12 @@
                         rt.render_target_number = i;
                         rt.pad = config_pad;
 
-                        if (job->resolve & PIPE_CLEAR_COLOR0 << i)
+                        if (job->store & PIPE_CLEAR_COLOR0 << i)
                                 rsc->writes++;
                 }
 #endif /* V3D_VERSION < 40 */
 
-                cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART1,
+                cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
                         clear) {
                         clear.clear_color_low_32_bits = job->clear_color[i][0];
                         clear.clear_color_next_24_bits = job->clear_color[i][1] & 0xffffff;
@@ -567,7 +593,7 @@
                 };
 
                 if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
-                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART2,
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2,
                                 clear) {
                                 clear.clear_color_mid_low_32_bits =
                                         ((job->clear_color[i][1] >> 24) |
@@ -580,7 +606,7 @@
                 }
 
                 if (surf->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
-                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_CLEAR_COLORS_PART3,
+                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3,
                                 clear) {
                                 clear.uif_padded_height_in_uif_blocks = clear_pad;
                                 clear.clear_color_high_16_bits = job->clear_color[i][3] >> 16;
@@ -590,7 +616,7 @@
         }
 
 #if V3D_VERSION >= 40
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_RENDER_TARGET_CONFIG, rt) {
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
                 v3d_setup_render_target(job, 0,
                                         &rt.render_target_0_internal_bpp,
                                         &rt.render_target_0_internal_type,
@@ -614,8 +640,8 @@
         /* TODO: Don't bother emitting if we don't load/clear Z/S. */
         if (job->zsbuf) {
                 struct pipe_surface *psurf = job->zsbuf;
-                struct vc5_surface *surf = vc5_surface(psurf);
-                struct vc5_resource *rsc = vc5_resource(psurf->texture);
+                struct v3d_surface *surf = v3d_surface(psurf);
+                struct v3d_resource *rsc = v3d_resource(psurf->texture);
 
                 v3d_emit_z_stencil_config(job, surf, rsc, false);
 
@@ -625,17 +651,17 @@
                  */
                 if (surf->separate_stencil) {
                         v3d_emit_z_stencil_config(job,
-                                                  vc5_surface(surf->separate_stencil),
+                                                  v3d_surface(surf->separate_stencil),
                                                   rsc->separate_stencil, true);
                 }
         }
 #endif /* V3D_VERSION < 40 */
 
         /* Ends rendering mode config. */
-        cl_emit(&job->rcl, TILE_RENDERING_MODE_CONFIGURATION_Z_STENCIL_CLEAR_VALUES,
+        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES,
                 clear) {
                 clear.z_clear_value = job->clear_z;
-                clear.stencil_vg_mask_clear_value = job->clear_s;
+                clear.stencil_clear_value = job->clear_s;
         };
 
         /* Always set initial block size before the first branch, which needs
@@ -656,7 +682,7 @@
                 list.address = cl_address(job->tile_alloc, 0);
         }
 
-        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CONFIGURATION, config) {
+        cl_emit(&job->rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
                 uint32_t frame_w_in_supertiles, frame_h_in_supertiles;
                 const uint32_t max_supertiles = 256;
 
@@ -677,11 +703,12 @@
                                 supertile_h++;
                 }
 
+                config.number_of_bin_tile_lists = 1;
                 config.total_frame_width_in_tiles = job->draw_tiles_x;
                 config.total_frame_height_in_tiles = job->draw_tiles_y;
 
-                config.supertile_width_in_tiles_minus_1 = supertile_w - 1;
-                config.supertile_height_in_tiles_minus_1 = supertile_h - 1;
+                config.supertile_width_in_tiles = supertile_w;
+                config.supertile_height_in_tiles = supertile_h;
 
                 config.total_frame_width_in_supertiles = frame_w_in_supertiles;
                 config.total_frame_height_in_supertiles = frame_h_in_supertiles;
@@ -693,27 +720,44 @@
                 coords.tile_row_number = 0;
         }
 
+        /* Emit an initial clear of the tile buffers.  This is necessary for
+         * any buffers that should be cleared (since clearing normally happens
+         * at the *end* of the generic tile list), but it's also nice to clear
+         * everything so the first tile doesn't inherit any contents from some
+         * previous frame.
+         *
+         * Also, implement the GFXH-1742 workaround.  There's a race in the HW
+         * between the RCL updating the TLB's internal type/size and the
+         * spawning of the QPU instances using the TLB's current internal
+         * type/size.  To make sure the QPUs get the right state, we need 1
+         * dummy store in between internal type/size changes on V3D 3.x, and 2
+         * dummy stores on 4.x.
+         */
 #if V3D_VERSION < 40
         cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
                 store.buffer_to_store = NONE;
         }
 #else
-        cl_emit(&job->rcl, END_OF_LOADS, end);
-        cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
-                store.buffer_to_store = NONE;
+        for (int i = 0; i < 2; i++) {
+                if (i > 0)
+                        cl_emit(&job->rcl, TILE_COORDINATES, coords);
+                cl_emit(&job->rcl, END_OF_LOADS, end);
+                cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                        store.buffer_to_store = NONE;
+                }
+                if (i == 0) {
+                        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                clear.clear_z_stencil_buffer = true;
+                                clear.clear_all_render_targets = true;
+                        }
+                }
+                cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
         }
-        cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
-                clear.clear_z_stencil_buffer = true;
-                clear.clear_all_render_targets = true;
-        }
-        cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
 #endif
 
         cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
 
-        vc5_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
-
-        cl_emit(&job->rcl, WAIT_ON_SEMAPHORE, sem);
+        v3d_rcl_emit_generic_per_tile_list(job, nr_cbufs - 1);
 
         /* XXX: Use Morton order */
         uint32_t supertile_w_in_pixels = job->tile_width * supertile_w;
diff --git a/src/gallium/drivers/vc5/v3dx_simulator.c b/src/gallium/drivers/v3d/v3dx_simulator.c
similarity index 82%
rename from src/gallium/drivers/vc5/v3dx_simulator.c
rename to src/gallium/drivers/v3d/v3dx_simulator.c
index 90fafae..a1e72e0 100644
--- a/src/gallium/drivers/vc5/v3dx_simulator.c
+++ b/src/gallium/drivers/v3d/v3dx_simulator.c
@@ -22,7 +22,7 @@
  */
 
 /**
- * @file vc5_simulator_hw.c
+ * @file v3d_simulator_hw.c
  *
  * Implements the actual HW interaction betweeh the GL driver's VC5 simulator and the simulator.
  *
@@ -31,11 +31,11 @@
  * we support.
  */
 
-#ifdef USE_VC5_SIMULATOR
+#ifdef USE_V3D_SIMULATOR
 
-#include "vc5_screen.h"
-#include "vc5_context.h"
-#include "vc5_simulator_wrapper.h"
+#include "v3d_screen.h"
+#include "v3d_context.h"
+#include "v3d_simulator_wrapper.h"
 
 #define HW_REGISTER_RO(x) (x)
 #define HW_REGISTER_RW(x) (x)
@@ -49,7 +49,7 @@
 #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
 
 static void
-vc5_flush_l3(struct v3d_hw *v3d)
+v3d_flush_l3(struct v3d_hw *v3d)
 {
         if (!v3d_hw_has_gca(v3d))
                 return;
@@ -64,7 +64,7 @@
 
 /* Invalidates the L2 cache.  This is a read-only cache. */
 static void
-vc5_flush_l2(struct v3d_hw *v3d)
+v3d_flush_l2(struct v3d_hw *v3d)
 {
         V3D_WRITE(V3D_CTL_0_L2CACTL,
                   V3D_CTL_0_L2CACTL_L2CCLR_SET |
@@ -73,7 +73,7 @@
 
 /* Invalidates texture L2 cachelines */
 static void
-vc5_flush_l2t(struct v3d_hw *v3d)
+v3d_flush_l2t(struct v3d_hw *v3d)
 {
         V3D_WRITE(V3D_CTL_0_L2TFLSTA, 0);
         V3D_WRITE(V3D_CTL_0_L2TFLEND, ~0);
@@ -84,32 +84,32 @@
 
 /* Invalidates the slice caches.  These are read-only caches. */
 static void
-vc5_flush_slices(struct v3d_hw *v3d)
+v3d_flush_slices(struct v3d_hw *v3d)
 {
         V3D_WRITE(V3D_CTL_0_SLCACTL, ~0);
 }
 
 static void
-vc5_flush_caches(struct v3d_hw *v3d)
+v3d_flush_caches(struct v3d_hw *v3d)
 {
-        vc5_flush_l3(v3d);
-        vc5_flush_l2(v3d);
-        vc5_flush_l2t(v3d);
-        vc5_flush_slices(v3d);
+        v3d_flush_l3(v3d);
+        v3d_flush_l2(v3d);
+        v3d_flush_l2t(v3d);
+        v3d_flush_slices(v3d);
 }
 
 int
 v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
-                                struct drm_vc5_get_param *args)
+                                struct drm_v3d_get_param *args)
 {
         static const uint32_t reg_map[] = {
-                [DRM_VC5_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
-                [DRM_VC5_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
-                [DRM_VC5_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
-                [DRM_VC5_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
-                [DRM_VC5_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
-                [DRM_VC5_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
-                [DRM_VC5_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
+                [DRM_V3D_PARAM_V3D_UIFCFG] = V3D_HUB_CTL_UIFCFG,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT1] = V3D_HUB_CTL_IDENT1,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT2] = V3D_HUB_CTL_IDENT2,
+                [DRM_V3D_PARAM_V3D_HUB_IDENT3] = V3D_HUB_CTL_IDENT3,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT0] = V3D_CTL_0_IDENT0,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT1] = V3D_CTL_0_IDENT1,
+                [DRM_V3D_PARAM_V3D_CORE0_IDENT2] = V3D_CTL_0_IDENT2,
         };
 
         if (args->param < ARRAY_SIZE(reg_map) && reg_map[args->param]) {
@@ -139,7 +139,7 @@
 }
 
 void
-v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_vc5_submit_cl *submit,
+v3dX(simulator_flush)(struct v3d_hw *v3d, struct drm_v3d_submit_cl *submit,
                       uint32_t gmp_ofs)
 {
         /* Completely reset the GMP. */
@@ -152,7 +152,7 @@
                 ;
         }
 
-        vc5_flush_caches(v3d);
+        v3d_flush_caches(v3d);
 
         if (submit->qma) {
                 V3D_WRITE(V3D_CLE_0_CT0QMA, submit->qma);
@@ -187,4 +187,4 @@
         }
 }
 
-#endif /* USE_VC5_SIMULATOR */
+#endif /* USE_V3D_SIMULATOR */
diff --git a/src/gallium/drivers/vc5/vc5_state.c b/src/gallium/drivers/v3d/v3dx_state.c
similarity index 68%
rename from src/gallium/drivers/vc5/vc5_state.c
rename to src/gallium/drivers/v3d/v3dx_state.c
index ba2d748..4bba899 100644
--- a/src/gallium/drivers/vc5/vc5_state.c
+++ b/src/gallium/drivers/v3d/v3dx_state.c
@@ -24,85 +24,70 @@
 
 #include "pipe/p_state.h"
 #include "util/u_format.h"
+#include "util/u_framebuffer.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_half.h"
 #include "util/u_helpers.h"
 
-#include "vc5_context.h"
-#include "vc5_tiling.h"
+#include "v3d_context.h"
+#include "v3d_tiling.h"
 #include "broadcom/common/v3d_macros.h"
 #include "broadcom/cle/v3dx_pack.h"
 
-static void *
-vc5_generic_cso_state_create(const void *src, uint32_t size)
-{
-        void *dst = calloc(1, size);
-        if (!dst)
-                return NULL;
-        memcpy(dst, src, size);
-        return dst;
-}
-
 static void
-vc5_generic_cso_state_delete(struct pipe_context *pctx, void *hwcso)
+v3d_generic_cso_state_delete(struct pipe_context *pctx, void *hwcso)
 {
         free(hwcso);
 }
 
 static void
-vc5_set_blend_color(struct pipe_context *pctx,
+v3d_set_blend_color(struct pipe_context *pctx,
                     const struct pipe_blend_color *blend_color)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->blend_color.f = *blend_color;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->blend_color.f = *blend_color;
         for (int i = 0; i < 4; i++) {
-                vc5->blend_color.hf[i] =
+                v3d->blend_color.hf[i] =
                         util_float_to_half(blend_color->color[i]);
         }
-        vc5->dirty |= VC5_DIRTY_BLEND_COLOR;
+        v3d->dirty |= VC5_DIRTY_BLEND_COLOR;
 }
 
 static void
-vc5_set_stencil_ref(struct pipe_context *pctx,
+v3d_set_stencil_ref(struct pipe_context *pctx,
                     const struct pipe_stencil_ref *stencil_ref)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->stencil_ref = *stencil_ref;
-        vc5->dirty |= VC5_DIRTY_STENCIL_REF;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->stencil_ref = *stencil_ref;
+        v3d->dirty |= VC5_DIRTY_STENCIL_REF;
 }
 
 static void
-vc5_set_clip_state(struct pipe_context *pctx,
+v3d_set_clip_state(struct pipe_context *pctx,
                    const struct pipe_clip_state *clip)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->clip = *clip;
-        vc5->dirty |= VC5_DIRTY_CLIP;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->clip = *clip;
+        v3d->dirty |= VC5_DIRTY_CLIP;
 }
 
 static void
-vc5_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask)
+v3d_set_sample_mask(struct pipe_context *pctx, unsigned sample_mask)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->sample_mask = sample_mask & ((1 << VC5_MAX_SAMPLES) - 1);
-        vc5->dirty |= VC5_DIRTY_SAMPLE_MASK;
-}
-
-static uint16_t
-float_to_187_half(float f)
-{
-        return fui(f) >> 16;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->sample_mask = sample_mask & ((1 << VC5_MAX_SAMPLES) - 1);
+        v3d->dirty |= VC5_DIRTY_SAMPLE_STATE;
 }
 
 static void *
-vc5_create_rasterizer_state(struct pipe_context *pctx,
+v3d_create_rasterizer_state(struct pipe_context *pctx,
                             const struct pipe_rasterizer_state *cso)
 {
-        struct vc5_rasterizer_state *so;
+        struct v3d_rasterizer_state *so;
 
-        so = CALLOC_STRUCT(vc5_rasterizer_state);
+        so = CALLOC_STRUCT(v3d_rasterizer_state);
         if (!so)
                 return NULL;
 
@@ -113,9 +98,19 @@
          */
         so->point_size = MAX2(cso->point_size, .125f);
 
-        if (cso->offset_tri) {
-                so->offset_units = float_to_187_half(cso->offset_units);
-                so->offset_factor = float_to_187_half(cso->offset_scale);
+        STATIC_ASSERT(sizeof(so->depth_offset) >=
+                      cl_packet_length(DEPTH_OFFSET));
+        v3dx_pack(&so->depth_offset, DEPTH_OFFSET, depth) {
+                depth.depth_offset_factor = cso->offset_scale;
+                depth.depth_offset_units = cso->offset_units;
+        }
+
+        /* The HW treats polygon offset units based on a Z24 buffer, so we
+         * need to scale up offset_units if we're only Z16.
+         */
+        v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) {
+                depth.depth_offset_factor = cso->offset_scale;
+                depth.depth_offset_units = cso->offset_units * 256.0;
         }
 
         return so;
@@ -123,10 +118,31 @@
 
 /* Blend state is baked into shaders. */
 static void *
-vc5_create_blend_state(struct pipe_context *pctx,
+v3d_create_blend_state(struct pipe_context *pctx,
                        const struct pipe_blend_state *cso)
 {
-        return vc5_generic_cso_state_create(cso, sizeof(*cso));
+        struct v3d_blend_state *so;
+
+        so = CALLOC_STRUCT(v3d_blend_state);
+        if (!so)
+                return NULL;
+
+        so->base = *cso;
+
+        if (cso->independent_blend_enable) {
+                for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++) {
+                        so->blend_enables |= cso->rt[i].blend_enable << i;
+
+                        /* V3D 4.x is when we got independent blend enables. */
+                        assert(V3D_VERSION >= 40 ||
+                               cso->rt[i].blend_enable == cso->rt[0].blend_enable);
+                }
+        } else {
+                if (cso->rt[0].blend_enable)
+                        so->blend_enables = (1 << VC5_MAX_DRAW_BUFFERS) - 1;
+        }
+
+        return so;
 }
 
 static uint32_t
@@ -146,12 +162,12 @@
 }
 
 static void *
-vc5_create_depth_stencil_alpha_state(struct pipe_context *pctx,
+v3d_create_depth_stencil_alpha_state(struct pipe_context *pctx,
                                      const struct pipe_depth_stencil_alpha_state *cso)
 {
-        struct vc5_depth_stencil_alpha_state *so;
+        struct v3d_depth_stencil_alpha_state *so;
 
-        so = CALLOC_STRUCT(vc5_depth_stencil_alpha_state);
+        so = CALLOC_STRUCT(v3d_depth_stencil_alpha_state);
         if (!so)
                 return NULL;
 
@@ -193,7 +209,9 @@
         const struct pipe_stencil_state *back = &cso->stencil[1];
 
         if (front->enabled) {
-                v3dx_pack(&so->stencil_front, STENCIL_CONFIG, config) {
+                STATIC_ASSERT(sizeof(so->stencil_front) >=
+                              cl_packet_length(STENCIL_CFG));
+                v3dx_pack(&so->stencil_front, STENCIL_CFG, config) {
                         config.front_config = true;
                         /* If !back->enabled, then the front values should be
                          * used for both front and back-facing primitives.
@@ -213,7 +231,9 @@
                 }
         }
         if (back->enabled) {
-                v3dx_pack(&so->stencil_back, STENCIL_CONFIG, config) {
+                STATIC_ASSERT(sizeof(so->stencil_back) >=
+                              cl_packet_length(STENCIL_CFG));
+                v3dx_pack(&so->stencil_back, STENCIL_CFG, config) {
                         config.front_config = false;
                         config.back_config = true;
 
@@ -234,82 +254,82 @@
 }
 
 static void
-vc5_set_polygon_stipple(struct pipe_context *pctx,
+v3d_set_polygon_stipple(struct pipe_context *pctx,
                         const struct pipe_poly_stipple *stipple)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->stipple = *stipple;
-        vc5->dirty |= VC5_DIRTY_STIPPLE;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->stipple = *stipple;
+        v3d->dirty |= VC5_DIRTY_STIPPLE;
 }
 
 static void
-vc5_set_scissor_states(struct pipe_context *pctx,
+v3d_set_scissor_states(struct pipe_context *pctx,
                        unsigned start_slot,
                        unsigned num_scissors,
                        const struct pipe_scissor_state *scissor)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
+        struct v3d_context *v3d = v3d_context(pctx);
 
-        vc5->scissor = *scissor;
-        vc5->dirty |= VC5_DIRTY_SCISSOR;
+        v3d->scissor = *scissor;
+        v3d->dirty |= VC5_DIRTY_SCISSOR;
 }
 
 static void
-vc5_set_viewport_states(struct pipe_context *pctx,
+v3d_set_viewport_states(struct pipe_context *pctx,
                         unsigned start_slot,
                         unsigned num_viewports,
                         const struct pipe_viewport_state *viewport)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->viewport = *viewport;
-        vc5->dirty |= VC5_DIRTY_VIEWPORT;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->viewport = *viewport;
+        v3d->dirty |= VC5_DIRTY_VIEWPORT;
 }
 
 static void
-vc5_set_vertex_buffers(struct pipe_context *pctx,
+v3d_set_vertex_buffers(struct pipe_context *pctx,
                        unsigned start_slot, unsigned count,
                        const struct pipe_vertex_buffer *vb)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_vertexbuf_stateobj *so = &vc5->vertexbuf;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_vertexbuf_stateobj *so = &v3d->vertexbuf;
 
         util_set_vertex_buffers_mask(so->vb, &so->enabled_mask, vb,
                                      start_slot, count);
         so->count = util_last_bit(so->enabled_mask);
 
-        vc5->dirty |= VC5_DIRTY_VTXBUF;
+        v3d->dirty |= VC5_DIRTY_VTXBUF;
 }
 
 static void
-vc5_blend_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_blend_state_bind(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->blend = hwcso;
-        vc5->dirty |= VC5_DIRTY_BLEND;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->blend = hwcso;
+        v3d->dirty |= VC5_DIRTY_BLEND;
 }
 
 static void
-vc5_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_rasterizer_state_bind(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->rasterizer = hwcso;
-        vc5->dirty |= VC5_DIRTY_RASTERIZER;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->rasterizer = hwcso;
+        v3d->dirty |= VC5_DIRTY_RASTERIZER;
 }
 
 static void
-vc5_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->zsa = hwcso;
-        vc5->dirty |= VC5_DIRTY_ZSA;
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->zsa = hwcso;
+        v3d->dirty |= VC5_DIRTY_ZSA;
 }
 
 static void *
-vc5_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
+v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
                         const struct pipe_vertex_element *elements)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_vertex_stateobj *so = CALLOC_STRUCT(vc5_vertex_stateobj);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_vertex_stateobj *so = CALLOC_STRUCT(v3d_vertex_stateobj);
 
         if (!so)
                 return NULL;
@@ -384,11 +404,11 @@
         /* Set up the default attribute values in case any of the vertex
          * elements use them.
          */
-        so->default_attribute_values = vc5_bo_alloc(vc5->screen,
+        so->default_attribute_values = v3d_bo_alloc(v3d->screen,
                                                     VC5_MAX_ATTRIBUTES *
                                                     4 * sizeof(float),
-                                                    "default attributes");
-        uint32_t *attrs = vc5_bo_map(so->default_attribute_values);
+                                                    "default_attributes");
+        uint32_t *attrs = v3d_bo_map(so->default_attribute_values);
         for (int i = 0; i < VC5_MAX_ATTRIBUTES; i++) {
                 attrs[i * 4 + 0] = 0;
                 attrs[i * 4 + 1] = 0;
@@ -405,19 +425,28 @@
 }
 
 static void
-vc5_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
+v3d_vertex_state_delete(struct pipe_context *pctx, void *hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        vc5->vtx = hwcso;
-        vc5->dirty |= VC5_DIRTY_VTXSTATE;
+        struct v3d_vertex_stateobj *so = hwcso;
+
+        v3d_bo_unreference(&so->default_attribute_values);
+        free(so);
 }
 
 static void
-vc5_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index,
+v3d_vertex_state_bind(struct pipe_context *pctx, void *hwcso)
+{
+        struct v3d_context *v3d = v3d_context(pctx);
+        v3d->vtx = hwcso;
+        v3d->dirty |= VC5_DIRTY_VTXSTATE;
+}
+
+static void
+v3d_set_constant_buffer(struct pipe_context *pctx, uint shader, uint index,
                         const struct pipe_constant_buffer *cb)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_constbuf_stateobj *so = &vc5->constbuf[shader];
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_constbuf_stateobj *so = &v3d->constbuf[shader];
 
         util_copy_constant_buffer(&so->cb[index], cb);
 
@@ -432,35 +461,24 @@
 
         so->enabled_mask |= 1 << index;
         so->dirty_mask |= 1 << index;
-        vc5->dirty |= VC5_DIRTY_CONSTBUF;
+        v3d->dirty |= VC5_DIRTY_CONSTBUF;
 }
 
 static void
-vc5_set_framebuffer_state(struct pipe_context *pctx,
+v3d_set_framebuffer_state(struct pipe_context *pctx,
                           const struct pipe_framebuffer_state *framebuffer)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct pipe_framebuffer_state *cso = &vc5->framebuffer;
-        unsigned i;
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct pipe_framebuffer_state *cso = &v3d->framebuffer;
 
-        vc5->job = NULL;
+        v3d->job = NULL;
 
-        for (i = 0; i < framebuffer->nr_cbufs; i++)
-                pipe_surface_reference(&cso->cbufs[i], framebuffer->cbufs[i]);
-        for (; i < vc5->framebuffer.nr_cbufs; i++)
-                pipe_surface_reference(&cso->cbufs[i], NULL);
+        util_copy_framebuffer_state(cso, framebuffer);
 
-        cso->nr_cbufs = framebuffer->nr_cbufs;
-
-        pipe_surface_reference(&cso->zsbuf, framebuffer->zsbuf);
-
-        cso->width = framebuffer->width;
-        cso->height = framebuffer->height;
-
-        vc5->swap_color_rb = 0;
-        vc5->blend_dst_alpha_one = 0;
-        for (int i = 0; i < vc5->framebuffer.nr_cbufs; i++) {
-                struct pipe_surface *cbuf = vc5->framebuffer.cbufs[i];
+        v3d->swap_color_rb = 0;
+        v3d->blend_dst_alpha_one = 0;
+        for (int i = 0; i < v3d->framebuffer.nr_cbufs; i++) {
+                struct pipe_surface *cbuf = v3d->framebuffer.cbufs[i];
                 if (!cbuf)
                         continue;
 
@@ -472,27 +490,27 @@
                  */
                 if (desc->swizzle[0] == PIPE_SWIZZLE_Z &&
                     cbuf->format != PIPE_FORMAT_B5G6R5_UNORM) {
-                        vc5->swap_color_rb |= 1 << i;
+                        v3d->swap_color_rb |= 1 << i;
                 }
 
                 if (desc->swizzle[3] == PIPE_SWIZZLE_1)
-                        vc5->blend_dst_alpha_one |= 1 << i;
+                        v3d->blend_dst_alpha_one |= 1 << i;
         }
 
-        vc5->dirty |= VC5_DIRTY_FRAMEBUFFER;
+        v3d->dirty |= VC5_DIRTY_FRAMEBUFFER;
 }
 
-static struct vc5_texture_stateobj *
-vc5_get_stage_tex(struct vc5_context *vc5, enum pipe_shader_type shader)
+static struct v3d_texture_stateobj *
+v3d_get_stage_tex(struct v3d_context *v3d, enum pipe_shader_type shader)
 {
         switch (shader) {
         case PIPE_SHADER_FRAGMENT:
-                vc5->dirty |= VC5_DIRTY_FRAGTEX;
-                return &vc5->fragtex;
+                v3d->dirty |= VC5_DIRTY_FRAGTEX;
+                return &v3d->fragtex;
                 break;
         case PIPE_SHADER_VERTEX:
-                vc5->dirty |= VC5_DIRTY_VERTTEX;
-                return &vc5->verttex;
+                v3d->dirty |= VC5_DIRTY_VERTTEX;
+                return &v3d->verttex;
                 break;
         default:
                 fprintf(stderr, "Unknown shader target %d\n", shader);
@@ -520,11 +538,11 @@
 
 
 static void *
-vc5_create_sampler_state(struct pipe_context *pctx,
+v3d_create_sampler_state(struct pipe_context *pctx,
                          const struct pipe_sampler_state *cso)
 {
-        MAYBE_UNUSED struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_sampler_state *so = CALLOC_STRUCT(vc5_sampler_state);
+        MAYBE_UNUSED struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_sampler_state *so = CALLOC_STRUCT(v3d_sampler_state);
 
         if (!so)
                 return NULL;
@@ -536,9 +554,9 @@
                  cso->min_img_filter == PIPE_TEX_MIPFILTER_NEAREST);
 
 #if V3D_VERSION >= 40
-        so->bo = vc5_bo_alloc(vc5->screen, cl_packet_length(SAMPLER_STATE),
+        so->bo = v3d_bo_alloc(v3d->screen, cl_packet_length(SAMPLER_STATE),
                               "sampler");
-        void *map = vc5_bo_map(so->bo);
+        void *map = v3d_bo_map(so->bo);
 
         v3dx_pack(map, SAMPLER_STATE, sampler) {
                 sampler.wrap_i_border = false;
@@ -561,9 +579,17 @@
                                                    15);
                 sampler.max_level_of_detail = MIN2(cso->max_lod, 15);
 
+                /* If we're not doing inter-miplevel filtering, we need to
+                 * clamp the LOD so that we only sample from baselevel.
+                 * However, we need to still allow the calculated LOD to be
+                 * fractionally over the baselevel, so that the HW can decide
+                 * between the min and mag filters.
+                 */
                 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-                        sampler.min_level_of_detail = 0;
-                        sampler.max_level_of_detail = 0;
+                        sampler.min_level_of_detail =
+                                MIN2(sampler.min_level_of_detail, 1.0 / 256.0);
+                        sampler.max_level_of_detail =
+                                MIN2(sampler.max_level_of_detail, 1.0 / 256.0);
                 }
 
                 if (cso->max_anisotropy) {
@@ -577,21 +603,21 @@
                                 sampler.maximum_anisotropy = 1;
                 }
 
-                sampler.border_colour_mode = V3D_BORDER_COLOUR_FOLLOWS;
-                /* XXX: The border colour field is in the TMU blending format
+                sampler.border_color_mode = V3D_BORDER_COLOR_FOLLOWS;
+                /* XXX: The border color field is in the TMU blending format
                  * (32, f16, or i16), and we need to customize it based on
                  * that.
                  *
                  * XXX: for compat alpha formats, we need the alpha field to
                  * be in the red channel.
                  */
-                sampler.border_colour_red =
+                sampler.border_color_red =
                         util_float_to_half(cso->border_color.f[0]);
-                sampler.border_colour_green =
+                sampler.border_color_green =
                         util_float_to_half(cso->border_color.f[1]);
-                sampler.border_colour_blue =
+                sampler.border_color_blue =
                         util_float_to_half(cso->border_color.f[2]);
-                sampler.border_colour_alpha =
+                sampler.border_color_alpha =
                         util_float_to_half(cso->border_color.f[3]);
         }
 
@@ -611,12 +637,12 @@
 }
 
 static void
-vc5_sampler_states_bind(struct pipe_context *pctx,
+v3d_sampler_states_bind(struct pipe_context *pctx,
                         enum pipe_shader_type shader, unsigned start,
                         unsigned nr, void **hwcso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_texture_stateobj *stage_tex = vc5_get_stage_tex(vc5, shader);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_texture_stateobj *stage_tex = v3d_get_stage_tex(v3d, shader);
 
         assert(start == 0);
         unsigned i;
@@ -636,13 +662,13 @@
 }
 
 static void
-vc5_sampler_state_delete(struct pipe_context *pctx,
+v3d_sampler_state_delete(struct pipe_context *pctx,
                          void *hwcso)
 {
         struct pipe_sampler_state *psampler = hwcso;
-        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
+        struct v3d_sampler_state *sampler = v3d_sampler_state(psampler);
 
-        vc5_bo_unreference(&sampler->bo);
+        v3d_bo_unreference(&sampler->bo);
         free(psampler);
 }
 
@@ -667,13 +693,13 @@
 #endif
 
 static struct pipe_sampler_view *
-vc5_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
+v3d_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                         const struct pipe_sampler_view *cso)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_screen *screen = vc5->screen;
-        struct vc5_sampler_view *so = CALLOC_STRUCT(vc5_sampler_view);
-        struct vc5_resource *rsc = vc5_resource(prsc);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_screen *screen = v3d->screen;
+        struct v3d_sampler_view *so = CALLOC_STRUCT(v3d_sampler_view);
+        struct v3d_resource *rsc = v3d_resource(prsc);
 
         if (!so)
                 return NULL;
@@ -693,7 +719,7 @@
                 cso->swizzle_a
         };
         const uint8_t *fmt_swizzle =
-                vc5_get_format_swizzle(&screen->devinfo, so->base.format);
+                v3d_get_format_swizzle(&screen->devinfo, so->base.format);
         util_format_compose_swizzles(fmt_swizzle, view_swizzle, so->swizzle);
 
         so->base.texture = prsc;
@@ -703,12 +729,14 @@
         int msaa_scale = prsc->nr_samples > 1 ? 2 : 1;
 
 #if V3D_VERSION >= 40
-        so->bo = vc5_bo_alloc(vc5->screen, cl_packet_length(SAMPLER_STATE),
-                              "sampler");
-        void *map = vc5_bo_map(so->bo);
+        so->bo = v3d_bo_alloc(v3d->screen,
+                              cl_packet_length(TEXTURE_SHADER_STATE), "sampler");
+        void *map = v3d_bo_map(so->bo);
 
         v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
 #else /* V3D_VERSION < 40 */
+        STATIC_ASSERT(sizeof(so->texture_shader_state) >=
+                      cl_packet_length(TEXTURE_SHADER_STATE));
         v3dx_pack(&so->texture_shader_state, TEXTURE_SHADER_STATE, tex) {
 #endif
 
@@ -743,7 +771,9 @@
                  */
                 tex.texture_base_pointer = cl_address(NULL,
                                                       rsc->bo->offset +
-                                                      rsc->slices[0].offset),
+                                                      rsc->slices[0].offset +
+                                                      cso->u.tex.first_layer *
+                                                      rsc->cube_map_stride),
 
                 tex.swizzle_r = translate_swizzle(so->swizzle[0]);
                 tex.swizzle_g = translate_swizzle(so->swizzle[1]);
@@ -752,7 +782,7 @@
 #endif
                 tex.array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
 
-                if (prsc->nr_samples > 1) {
+                if (prsc->nr_samples > 1 && V3D_VERSION < 40) {
                         /* Using texture views to reinterpret formats on our
                          * MSAA textures won't work, because we don't lay out
                          * the bits in memory as it's expected -- for example,
@@ -760,14 +790,19 @@
                          * ARB_texture_view spec, but in HW we lay them out as
                          * 32bpp RGBA8 and 64bpp RGBA16F.  Just assert for now
                          * to catch failures.
+                         *
+                         * We explicitly allow remapping S8Z24 to RGBA8888 for
+                         * v3d_blit.c's stencil blits.
                          */
-                        assert(util_format_linear(cso->format) ==
-                               util_format_linear(prsc->format));
+                        assert((util_format_linear(cso->format) ==
+                                util_format_linear(prsc->format)) ||
+                               (prsc->format == PIPE_FORMAT_S8_UINT_Z24_UNORM &&
+                                cso->format == PIPE_FORMAT_R8G8B8A8_UNORM));
                         uint32_t output_image_format =
-                                vc5_get_rt_format(&screen->devinfo, cso->format);
+                                v3d_get_rt_format(&screen->devinfo, cso->format);
                         uint32_t internal_type;
                         uint32_t internal_bpp;
-                        vc5_get_internal_type_bpp_for_output_format(&screen->devinfo,
+                        v3d_get_internal_type_bpp_for_output_format(&screen->devinfo,
                                                                     output_image_format,
                                                                     &internal_type,
                                                                     &internal_bpp);
@@ -791,7 +826,7 @@
                          */
                         tex.srgb = false;
                 } else {
-                        tex.texture_type = vc5_get_tex_format(&screen->devinfo,
+                        tex.texture_type = v3d_get_tex_format(&screen->devinfo,
                                                               cso->format);
                 }
 
@@ -822,24 +857,24 @@
 }
 
 static void
-vc5_sampler_view_destroy(struct pipe_context *pctx,
+v3d_sampler_view_destroy(struct pipe_context *pctx,
                          struct pipe_sampler_view *psview)
 {
-        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
+        struct v3d_sampler_view *sview = v3d_sampler_view(psview);
 
-        vc5_bo_unreference(&sview->bo);
+        v3d_bo_unreference(&sview->bo);
         pipe_resource_reference(&psview->texture, NULL);
         free(psview);
 }
 
 static void
-vc5_set_sampler_views(struct pipe_context *pctx,
+v3d_set_sampler_views(struct pipe_context *pctx,
                       enum pipe_shader_type shader,
                       unsigned start, unsigned nr,
                       struct pipe_sampler_view **views)
 {
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_texture_stateobj *stage_tex = vc5_get_stage_tex(vc5, shader);
+        struct v3d_context *v3d = v3d_context(pctx);
+        struct v3d_texture_stateobj *stage_tex = v3d_get_stage_tex(v3d, shader);
         unsigned i;
         unsigned new_nr = 0;
 
@@ -859,7 +894,7 @@
 }
 
 static struct pipe_stream_output_target *
-vc5_create_stream_output_target(struct pipe_context *pctx,
+v3d_create_stream_output_target(struct pipe_context *pctx,
                                 struct pipe_resource *prsc,
                                 unsigned buffer_offset,
                                 unsigned buffer_size)
@@ -881,7 +916,7 @@
 }
 
 static void
-vc5_stream_output_target_destroy(struct pipe_context *pctx,
+v3d_stream_output_target_destroy(struct pipe_context *pctx,
                                  struct pipe_stream_output_target *target)
 {
         pipe_resource_reference(&target->buffer, NULL);
@@ -889,19 +924,23 @@
 }
 
 static void
-vc5_set_stream_output_targets(struct pipe_context *pctx,
+v3d_set_stream_output_targets(struct pipe_context *pctx,
                               unsigned num_targets,
                               struct pipe_stream_output_target **targets,
                               const unsigned *offsets)
 {
-        struct vc5_context *ctx = vc5_context(pctx);
-        struct vc5_streamout_stateobj *so = &ctx->streamout;
+        struct v3d_context *ctx = v3d_context(pctx);
+        struct v3d_streamout_stateobj *so = &ctx->streamout;
         unsigned i;
 
         assert(num_targets <= ARRAY_SIZE(so->targets));
 
-        for (i = 0; i < num_targets; i++)
+        for (i = 0; i < num_targets; i++) {
+                if (offsets[i] != -1)
+                        so->offsets[i] = offsets[i];
+
                 pipe_so_target_reference(&so->targets[i], targets[i]);
+        }
 
         for (; i < so->num_targets; i++)
                 pipe_so_target_reference(&so->targets[i], NULL);
@@ -914,43 +953,43 @@
 void
 v3dX(state_init)(struct pipe_context *pctx)
 {
-        pctx->set_blend_color = vc5_set_blend_color;
-        pctx->set_stencil_ref = vc5_set_stencil_ref;
-        pctx->set_clip_state = vc5_set_clip_state;
-        pctx->set_sample_mask = vc5_set_sample_mask;
-        pctx->set_constant_buffer = vc5_set_constant_buffer;
-        pctx->set_framebuffer_state = vc5_set_framebuffer_state;
-        pctx->set_polygon_stipple = vc5_set_polygon_stipple;
-        pctx->set_scissor_states = vc5_set_scissor_states;
-        pctx->set_viewport_states = vc5_set_viewport_states;
+        pctx->set_blend_color = v3d_set_blend_color;
+        pctx->set_stencil_ref = v3d_set_stencil_ref;
+        pctx->set_clip_state = v3d_set_clip_state;
+        pctx->set_sample_mask = v3d_set_sample_mask;
+        pctx->set_constant_buffer = v3d_set_constant_buffer;
+        pctx->set_framebuffer_state = v3d_set_framebuffer_state;
+        pctx->set_polygon_stipple = v3d_set_polygon_stipple;
+        pctx->set_scissor_states = v3d_set_scissor_states;
+        pctx->set_viewport_states = v3d_set_viewport_states;
 
-        pctx->set_vertex_buffers = vc5_set_vertex_buffers;
+        pctx->set_vertex_buffers = v3d_set_vertex_buffers;
 
-        pctx->create_blend_state = vc5_create_blend_state;
-        pctx->bind_blend_state = vc5_blend_state_bind;
-        pctx->delete_blend_state = vc5_generic_cso_state_delete;
+        pctx->create_blend_state = v3d_create_blend_state;
+        pctx->bind_blend_state = v3d_blend_state_bind;
+        pctx->delete_blend_state = v3d_generic_cso_state_delete;
 
-        pctx->create_rasterizer_state = vc5_create_rasterizer_state;
-        pctx->bind_rasterizer_state = vc5_rasterizer_state_bind;
-        pctx->delete_rasterizer_state = vc5_generic_cso_state_delete;
+        pctx->create_rasterizer_state = v3d_create_rasterizer_state;
+        pctx->bind_rasterizer_state = v3d_rasterizer_state_bind;
+        pctx->delete_rasterizer_state = v3d_generic_cso_state_delete;
 
-        pctx->create_depth_stencil_alpha_state = vc5_create_depth_stencil_alpha_state;
-        pctx->bind_depth_stencil_alpha_state = vc5_zsa_state_bind;
-        pctx->delete_depth_stencil_alpha_state = vc5_generic_cso_state_delete;
+        pctx->create_depth_stencil_alpha_state = v3d_create_depth_stencil_alpha_state;
+        pctx->bind_depth_stencil_alpha_state = v3d_zsa_state_bind;
+        pctx->delete_depth_stencil_alpha_state = v3d_generic_cso_state_delete;
 
-        pctx->create_vertex_elements_state = vc5_vertex_state_create;
-        pctx->delete_vertex_elements_state = vc5_generic_cso_state_delete;
-        pctx->bind_vertex_elements_state = vc5_vertex_state_bind;
+        pctx->create_vertex_elements_state = v3d_vertex_state_create;
+        pctx->delete_vertex_elements_state = v3d_vertex_state_delete;
+        pctx->bind_vertex_elements_state = v3d_vertex_state_bind;
 
-        pctx->create_sampler_state = vc5_create_sampler_state;
-        pctx->delete_sampler_state = vc5_sampler_state_delete;
-        pctx->bind_sampler_states = vc5_sampler_states_bind;
+        pctx->create_sampler_state = v3d_create_sampler_state;
+        pctx->delete_sampler_state = v3d_sampler_state_delete;
+        pctx->bind_sampler_states = v3d_sampler_states_bind;
 
-        pctx->create_sampler_view = vc5_create_sampler_view;
-        pctx->sampler_view_destroy = vc5_sampler_view_destroy;
-        pctx->set_sampler_views = vc5_set_sampler_views;
+        pctx->create_sampler_view = v3d_create_sampler_view;
+        pctx->sampler_view_destroy = v3d_sampler_view_destroy;
+        pctx->set_sampler_views = v3d_set_sampler_views;
 
-        pctx->create_stream_output_target = vc5_create_stream_output_target;
-        pctx->stream_output_target_destroy = vc5_stream_output_target_destroy;
-        pctx->set_stream_output_targets = vc5_set_stream_output_targets;
+        pctx->create_stream_output_target = v3d_create_stream_output_target;
+        pctx->stream_output_target_destroy = v3d_stream_output_target_destroy;
+        pctx->set_stream_output_targets = v3d_set_stream_output_targets;
 }
diff --git a/src/gallium/drivers/vc4/Automake.inc b/src/gallium/drivers/vc4/Automake.inc
index b1aa972..650466e 100644
--- a/src/gallium/drivers/vc4/Automake.inc
+++ b/src/gallium/drivers/vc4/Automake.inc
@@ -2,9 +2,18 @@
 
 TARGET_DRIVERS += vc4
 TARGET_CPPFLAGS += -DGALLIUM_VC4
+
+if !HAVE_GALLIUM_V3D
+TARGET_LIB_DEPS += \
+	$(top_builddir)/src/broadcom/libbroadcom.la \
+	$(top_builddir)/src/broadcom/libbroadcom_v33.la \
+	$(top_builddir)/src/broadcom/libbroadcom_v41.la
+endif
+
 TARGET_LIB_DEPS += \
 	$(top_builddir)/src/gallium/winsys/vc4/drm/libvc4drm.la \
 	$(top_builddir)/src/gallium/drivers/vc4/libvc4.la \
 	$(top_builddir)/src/broadcom/cle/libbroadcom_cle.la
 
+
 endif
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index d65bf20..4c7dd84 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -30,7 +30,8 @@
 	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_srcdir)/include/drm-uapi \
 	-I$(top_builddir)/src \
-	-I$(top_srcdir)/src/broadcom/cle \
+	-I$(top_srcdir)/src/broadcom \
+	-I$(top_builddir)/src/broadcom \
 	$(LIBDRM_CFLAGS) \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(SIM_CFLAGS) \
diff --git a/src/gallium/drivers/vc4/meson.build b/src/gallium/drivers/vc4/meson.build
index ef7e7bc..50adcc2 100644
--- a/src/gallium/drivers/vc4/meson.build
+++ b/src/gallium/drivers/vc4/meson.build
@@ -115,6 +115,6 @@
 
 driver_vc4 = declare_dependency(
   compile_args : '-DGALLIUM_VC4',
-  link_with : [libvc4, libvc4winsys, libbroadcom_cle],
+  link_with : [libvc4, libvc4winsys, libbroadcom_cle, libbroadcom_v3d],
   dependencies : idep_nir,
 )
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index ca1b9a3..a6ae0cf 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -28,6 +28,7 @@
 #include "kernel/vc4_packet.h"
 
 #include "broadcom/cle/v3d_decoder.h"
+#include "broadcom/clif/clif_dump.h"
 
 void
 vc4_dump_cl(void *cl, uint32_t size, bool is_render)
@@ -41,6 +42,8 @@
         };
         struct v3d_spec *spec = v3d_spec_load(&devinfo);
 
+        struct clif_dump *clif = clif_dump_init(&devinfo, stderr, true);
+
         uint32_t offset = 0, hw_offset = 0;
         uint8_t *p = cl;
 
@@ -60,7 +63,7 @@
                 fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n",
                         offset, hw_offset, header, v3d_group_get_name(inst));
 
-                v3d_print_group(stderr, inst, offset, p, "");
+                v3d_print_group(clif, inst, offset, p);
 
                 switch (header) {
                 case VC4_PACKET_HALT:
@@ -75,5 +78,7 @@
                         hw_offset += length;
                 p += length;
         }
+
+        clif_dump_destroy(clif);
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index c1e041d..9ff39c2 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -59,8 +59,17 @@
 
         if (fence) {
                 struct pipe_screen *screen = pctx->screen;
+                int fd = -1;
+
+                if (flags & PIPE_FLUSH_FENCE_FD) {
+                        /* The vc4_fence takes ownership of the returned fd. */
+                        drmSyncobjExportSyncFile(vc4->fd, vc4->job_syncobj,
+                                                 &fd);
+                }
+
                 struct vc4_fence *f = vc4_fence_create(vc4->screen,
-                                                       vc4->last_emit_seqno);
+                                                       vc4->last_emit_seqno,
+                                                       fd);
                 screen->fence_reference(screen, fence, NULL);
                 *fence = (struct pipe_fence_handle *)f;
         }
@@ -124,6 +133,13 @@
 
         vc4_program_fini(pctx);
 
+        if (vc4->screen->has_syncobj) {
+                drmSyncobjDestroy(vc4->fd, vc4->job_syncobj);
+                drmSyncobjDestroy(vc4->fd, vc4->in_syncobj);
+        }
+        if (vc4->in_fence_fd >= 0)
+                close(vc4->in_fence_fd);
+
         ralloc_free(vc4);
 }
 
@@ -132,6 +148,7 @@
 {
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_context *vc4;
+        int err;
 
         /* Prevent dumping of the shaders built during context setup. */
         uint32_t saved_shaderdb_flag = vc4_debug & VC4_DEBUG_SHADERDB;
@@ -157,10 +174,16 @@
         vc4_query_init(pctx);
         vc4_resource_context_init(pctx);
 
-        vc4_job_init(vc4);
-
         vc4->fd = screen->fd;
 
+        err = vc4_job_init(vc4);
+        if (err)
+                goto fail;
+
+        err = vc4_fence_context_init(vc4);
+        if (err)
+                goto fail;
+
         slab_create_child(&vc4->transfer_pool, &screen->transfer_pool);
 
 	vc4->uploader = u_upload_create_default(&vc4->base);
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 16bebee..ce8bcff 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -408,6 +408,13 @@
 
         struct vc4_hwperfmon *perfmon;
         /** @} */
+
+        /** Handle of syncobj containing the last submitted job fence. */
+        uint32_t job_syncobj;
+
+        int in_fence_fd;
+        /** Handle of the syncobj that holds in_fence_fd for submission. */
+        uint32_t in_syncobj;
 };
 
 struct vc4_rasterizer_state {
@@ -502,7 +509,8 @@
                         struct vc4_texture_stateobj *texstate);
 
 void vc4_flush(struct pipe_context *pctx);
-void vc4_job_init(struct vc4_context *vc4);
+int vc4_job_init(struct vc4_context *vc4);
+int vc4_fence_context_init(struct vc4_context *vc4);
 struct vc4_job *vc4_get_job(struct vc4_context *vc4,
                             struct pipe_surface *cbuf,
                             struct pipe_surface *zsbuf);
diff --git a/src/gallium/drivers/vc4/vc4_fence.c b/src/gallium/drivers/vc4/vc4_fence.c
index f61e7c6..0dbfbe9 100644
--- a/src/gallium/drivers/vc4/vc4_fence.c
+++ b/src/gallium/drivers/vc4/vc4_fence.c
@@ -34,26 +34,39 @@
  * fired off as our fence marker.
  */
 
+#include <libsync.h>
+#include <fcntl.h>
+
 #include "util/u_inlines.h"
 
 #include "vc4_screen.h"
+#include "vc4_context.h"
 #include "vc4_bufmgr.h"
 
 struct vc4_fence {
         struct pipe_reference reference;
         uint64_t seqno;
+        int fd;
 };
 
+static inline struct vc4_fence *
+vc4_fence(struct pipe_fence_handle *pfence)
+{
+        return (struct vc4_fence *)pfence;
+}
+
 static void
 vc4_fence_reference(struct pipe_screen *pscreen,
                     struct pipe_fence_handle **pp,
                     struct pipe_fence_handle *pf)
 {
         struct vc4_fence **p = (struct vc4_fence **)pp;
-        struct vc4_fence *f = (struct vc4_fence *)pf;
+        struct vc4_fence *f = vc4_fence(pf);
         struct vc4_fence *old = *p;
 
         if (pipe_reference(&(*p)->reference, &f->reference)) {
+                if (old->fd >= 0)
+                        close(old->fd);
                 free(old);
         }
         *p = f;
@@ -66,13 +79,16 @@
                  uint64_t timeout_ns)
 {
         struct vc4_screen *screen = vc4_screen(pscreen);
-        struct vc4_fence *f = (struct vc4_fence *)pf;
+        struct vc4_fence *f = vc4_fence(pf);
+
+        if (f->fd >= 0)
+                return sync_wait(f->fd, timeout_ns / 1000000) == 0;
 
         return vc4_wait_seqno(screen, f->seqno, timeout_ns, "fence wait");
 }
 
 struct vc4_fence *
-vc4_fence_create(struct vc4_screen *screen, uint64_t seqno)
+vc4_fence_create(struct vc4_screen *screen, uint64_t seqno, int fd)
 {
         struct vc4_fence *f = calloc(1, sizeof(*f));
 
@@ -81,13 +97,64 @@
 
         pipe_reference_init(&f->reference, 1);
         f->seqno = seqno;
+        f->fd = fd;
 
         return f;
 }
 
+static void
+vc4_fence_create_fd(struct pipe_context *pctx, struct pipe_fence_handle **pf,
+                    int fd, enum pipe_fd_type type)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+        struct vc4_fence **fence = (struct vc4_fence **)pf;
+
+        assert(type == PIPE_FD_TYPE_NATIVE_SYNC);
+        *fence = vc4_fence_create(vc4->screen, vc4->last_emit_seqno,
+                                  fcntl(fd, F_DUPFD_CLOEXEC, 3));
+}
+
+static void
+vc4_fence_server_sync(struct pipe_context *pctx,
+                      struct pipe_fence_handle *pfence)
+{
+        struct vc4_context *vc4 = vc4_context(pctx);
+        struct vc4_fence *fence = vc4_fence(pfence);
+
+        if (fence->fd >= 0)
+                sync_accumulate("vc4", &vc4->in_fence_fd, fence->fd);
+}
+
+static int
+vc4_fence_get_fd(struct pipe_screen *screen, struct pipe_fence_handle *pfence)
+{
+        struct vc4_fence *fence = vc4_fence(pfence);
+
+        return fcntl(fence->fd, F_DUPFD_CLOEXEC, 3);
+}
+
+int
+vc4_fence_context_init(struct vc4_context *vc4)
+{
+        vc4->base.create_fence_fd = vc4_fence_create_fd;
+        vc4->base.fence_server_sync = vc4_fence_server_sync;
+        vc4->in_fence_fd = -1;
+
+        /* Since we initialize the in_fence_fd to -1 (no wait necessary),
+         * we also need to initialize our in_syncobj as signaled.
+         */
+        if (vc4->screen->has_syncobj) {
+                return drmSyncobjCreate(vc4->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
+                                        &vc4->in_syncobj);
+        } else {
+                return 0;
+        }
+}
+
 void
-vc4_fence_init(struct vc4_screen *screen)
+vc4_fence_screen_init(struct vc4_screen *screen)
 {
         screen->base.fence_reference = vc4_fence_reference;
         screen->base.fence_finish = vc4_fence_finish;
+        screen->base.fence_get_fd = vc4_fence_get_fd;
 }
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 41c274c..7256976 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -477,6 +477,19 @@
         }
         submit.flags |= job->flags;
 
+        if (vc4->screen->has_syncobj) {
+                submit.out_sync = vc4->job_syncobj;
+
+                if (vc4->in_fence_fd >= 0) {
+                        /* This replaces the fence in the syncobj. */
+                        drmSyncobjImportSyncFile(vc4->fd, vc4->in_syncobj,
+                                                 vc4->in_fence_fd);
+                        submit.in_sync = vc4->in_syncobj;
+                        close(vc4->in_fence_fd);
+                        vc4->in_fence_fd = -1;
+                }
+        }
+
         if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                 int ret;
 
@@ -530,7 +543,7 @@
         return _mesa_hash_data(key, sizeof(struct vc4_job_key));
 }
 
-void
+int
 vc4_job_init(struct vc4_context *vc4)
 {
         vc4->jobs = _mesa_hash_table_create(vc4,
@@ -539,5 +552,24 @@
         vc4->write_jobs = _mesa_hash_table_create(vc4,
                                                   _mesa_hash_pointer,
                                                   _mesa_key_pointer_equal);
+
+        if (vc4->screen->has_syncobj) {
+                /* Create the syncobj as signaled since with no job executed
+                 * there is nothing to wait on.
+                 */
+                int ret = drmSyncobjCreate(vc4->fd,
+                                           DRM_SYNCOBJ_CREATE_SIGNALED,
+                                           &vc4->job_syncobj);
+                if (ret) {
+                        /* If the screen indicated syncobj support, we should
+                         * be able to create a signaled syncobj.
+                         * At this point it is too late to pretend the screen
+                         * has no syncobj support.
+                         */
+                        return ret;
+                }
+        }
+
+        return 0;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index 1085243..92b9e89 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -46,7 +46,6 @@
 
         nir_tex_instr *txf = nir_tex_instr_create(c->s, 1);
         txf->op = nir_texop_txf;
-        txf->texture = txf_ms->texture;
         txf->texture_index = txf_ms->texture_index;
         txf->coord_components = txf_ms->coord_components;
         txf->is_shadow = txf_ms->is_shadow;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index cb8e9af..1f46b64 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -687,24 +687,44 @@
 }
 
 static struct qreg
+ntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x)
+{
+        /* Since we're using a Taylor approximation, we want to have a small
+         * number of coefficients and take advantage of sin/cos repeating
+         * every 2pi.  We keep our x as close to 0 as we can, since the series
+         * will be less accurate as |x| increases.  (Also, be careful of
+         * shifting the input x value to be tricky with sin/cos relations,
+         * because getting accurate values for x==0 is very important for SDL
+         * rendering)
+         */
+        struct qreg scaled_x =
+                qir_FMUL(c, x,
+                         qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
+        /* Note: FTOI truncates toward 0. */
+        struct qreg x_frac = qir_FSUB(c, scaled_x,
+                                      qir_ITOF(c, qir_FTOI(c, scaled_x)));
+        /* Map [0.5, 1] to [-0.5, 0] */
+        qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5)));
+        qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC;
+        /* Map [-1, -0.5] to [0, 0.5] */
+        qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5)));
+        qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS;
+
+        return x_frac;
+}
+
+static struct qreg
 ntq_fsin(struct vc4_compile *c, struct qreg src)
 {
         float coeff[] = {
-                -2.0 * M_PI,
-                pow(2.0 * M_PI, 3) / (3 * 2 * 1),
-                -pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
-                pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
-                -pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
+                2.0 * M_PI,
+                -pow(2.0 * M_PI, 3) / (3 * 2 * 1),
+                pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1),
+                -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1),
+                pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
         };
 
-        struct qreg scaled_x =
-                qir_FMUL(c,
-                         src,
-                         qir_uniform_f(c, 1.0 / (M_PI * 2.0)));
-
-        struct qreg x = qir_FADD(c,
-                                 ntq_ffract(c, scaled_x),
-                                 qir_uniform_f(c, -0.5));
+        struct qreg x = ntq_shrink_sincos_input_range(c, src);
         struct qreg x2 = qir_FMUL(c, x, x);
         struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0]));
         for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
@@ -722,21 +742,15 @@
 ntq_fcos(struct vc4_compile *c, struct qreg src)
 {
         float coeff[] = {
-                -1.0f,
-                pow(2.0 * M_PI, 2) / (2 * 1),
-                -pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
-                pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
-                -pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
-                pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
+                1.0f,
+                -pow(2.0 * M_PI, 2) / (2 * 1),
+                pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1),
+                -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1),
+                pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
+                -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
         };
 
-        struct qreg scaled_x =
-                qir_FMUL(c, src,
-                         qir_uniform_f(c, 1.0f / (M_PI * 2.0f)));
-        struct qreg x_frac = qir_FADD(c,
-                                      ntq_ffract(c, scaled_x),
-                                      qir_uniform_f(c, -0.5));
-
+        struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
         struct qreg sum = qir_uniform_f(c, coeff[0]);
         struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
         struct qreg x = x2; /* Current x^2, x^4, or x^6 */
@@ -2221,6 +2235,7 @@
         .lower_all_io_to_temps = true,
         .lower_extract_byte = true,
         .lower_extract_word = true,
+        .lower_fdiv = true,
         .lower_ffma = true,
         .lower_flrp32 = true,
         .lower_fpow = true,
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 85a3c04..914bcc2 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -28,6 +28,7 @@
 #include "util/u_format.h"
 #include "util/u_inlines.h"
 #include "util/u_surface.h"
+#include "util/u_transfer_helper.h"
 #include "util/u_upload_mgr.h"
 
 #include "drm_fourcc.h"
@@ -76,15 +77,8 @@
         struct vc4_transfer *trans = vc4_transfer(ptrans);
 
         if (trans->map) {
-                struct vc4_resource *rsc;
-                struct vc4_resource_slice *slice;
-                if (trans->ss_resource) {
-                        rsc = vc4_resource(trans->ss_resource);
-                        slice = &rsc->slices[0];
-                } else {
-                        rsc = vc4_resource(ptrans->resource);
-                        slice = &rsc->slices[ptrans->level];
-                }
+                struct vc4_resource *rsc = vc4_resource(ptrans->resource);
+                struct vc4_resource_slice *slice = &rsc->slices[ptrans->level];
 
                 if (ptrans->usage & PIPE_TRANSFER_WRITE) {
                         vc4_store_tiled_image(rsc->bo->map + slice->offset +
@@ -97,51 +91,10 @@
                 free(trans->map);
         }
 
-        if (trans->ss_resource && (ptrans->usage & PIPE_TRANSFER_WRITE)) {
-                struct pipe_blit_info blit;
-                memset(&blit, 0, sizeof(blit));
-
-                blit.src.resource = trans->ss_resource;
-                blit.src.format = trans->ss_resource->format;
-                blit.src.box.width = trans->ss_box.width;
-                blit.src.box.height = trans->ss_box.height;
-                blit.src.box.depth = 1;
-
-                blit.dst.resource = ptrans->resource;
-                blit.dst.format = ptrans->resource->format;
-                blit.dst.level = ptrans->level;
-                blit.dst.box = trans->ss_box;
-
-                blit.mask = util_format_get_mask(ptrans->resource->format);
-                blit.filter = PIPE_TEX_FILTER_NEAREST;
-
-                pctx->blit(pctx, &blit);
-
-                pipe_resource_reference(&trans->ss_resource, NULL);
-        }
-
         pipe_resource_reference(&ptrans->resource, NULL);
         slab_free(&vc4->transfer_pool, ptrans);
 }
 
-static struct pipe_resource *
-vc4_get_temp_resource(struct pipe_context *pctx,
-                      struct pipe_resource *prsc,
-                      const struct pipe_box *box)
-{
-        struct pipe_resource temp_setup;
-
-        memset(&temp_setup, 0, sizeof(temp_setup));
-        temp_setup.target = prsc->target;
-        temp_setup.format = prsc->format;
-        temp_setup.width0 = box->width;
-        temp_setup.height0 = box->height;
-        temp_setup.depth0 = 1;
-        temp_setup.array_size = 1;
-
-        return pctx->screen->resource_create(pctx->screen, &temp_setup);
-}
-
 static void *
 vc4_resource_transfer_map(struct pipe_context *pctx,
                           struct pipe_resource *prsc,
@@ -215,50 +168,6 @@
         ptrans->usage = usage;
         ptrans->box = *box;
 
-        /* If the resource is multisampled, we need to resolve to single
-         * sample.  This seems like it should be handled at a higher layer.
-         */
-        if (prsc->nr_samples > 1) {
-                trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box);
-                if (!trans->ss_resource)
-                        goto fail;
-                assert(!trans->ss_resource->nr_samples);
-
-                /* The ptrans->box gets modified for tile alignment, so save
-                 * the original box for unmap time.
-                 */
-                trans->ss_box = *box;
-
-                if (usage & PIPE_TRANSFER_READ) {
-                        struct pipe_blit_info blit;
-                        memset(&blit, 0, sizeof(blit));
-
-                        blit.src.resource = ptrans->resource;
-                        blit.src.format = ptrans->resource->format;
-                        blit.src.level = ptrans->level;
-                        blit.src.box = trans->ss_box;
-
-                        blit.dst.resource = trans->ss_resource;
-                        blit.dst.format = trans->ss_resource->format;
-                        blit.dst.box.width = trans->ss_box.width;
-                        blit.dst.box.height = trans->ss_box.height;
-                        blit.dst.box.depth = 1;
-
-                        blit.mask = util_format_get_mask(prsc->format);
-                        blit.filter = PIPE_TEX_FILTER_NEAREST;
-
-                        pctx->blit(pctx, &blit);
-                        vc4_flush_jobs_writing_resource(vc4, blit.dst.resource);
-                }
-
-                /* The rest of the mapping process should use our temporary. */
-                prsc = trans->ss_resource;
-                rsc = vc4_resource(prsc);
-                ptrans->box.x = 0;
-                ptrans->box.y = 0;
-                ptrans->box.z = 0;
-        }
-
         if (usage & PIPE_TRANSFER_UNSYNCHRONIZED)
                 buf = vc4_bo_map_unsynchronized(rsc->bo);
         else
@@ -403,7 +312,7 @@
                 whandle->modifier = DRM_FORMAT_MOD_LINEAR;
 
         switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
+        case WINSYS_HANDLE_TYPE_SHARED:
                 if (screen->ro) {
                         /* This could probably be supported, assuming that a
                          * control node was used for pl111.
@@ -413,12 +322,12 @@
                 }
 
                 return vc4_bo_flink(rsc->bo, &whandle->handle);
-        case DRM_API_HANDLE_TYPE_KMS:
+        case WINSYS_HANDLE_TYPE_KMS:
                 if (screen->ro && renderonly_get_handle(rsc->scanout, whandle))
                         return TRUE;
                 whandle->handle = rsc->bo->handle;
                 return TRUE;
-        case DRM_API_HANDLE_TYPE_FD:
+        case WINSYS_HANDLE_TYPE_FD:
                 /* FDs are cross-device, so we can export directly from vc4.
                  */
                 whandle->handle = vc4_bo_get_dmabuf(rsc->bo);
@@ -667,7 +576,15 @@
                         goto fail;
         }
 
-        if (screen->ro && tmpl->bind & PIPE_BIND_SCANOUT) {
+        /* Set up the "scanout resource" (the dmabuf export of our buffer to
+         * the KMS handle) if the buffer might ever have
+         * resource_get_handle(WINSYS_HANDLE_TYPE_KMS) called on it.
+         * create_with_modifiers() doesn't give us usage flags, so we have to
+         * assume that all calls with modifiers are scanout-possible.
+         */
+        if (screen->ro &&
+            ((tmpl->bind & PIPE_BIND_SCANOUT) ||
+             !(count == 1 && modifiers[0] == DRM_FORMAT_MOD_INVALID))) {
                 rsc->scanout =
                         renderonly_scanout_for_resource(prsc, screen->ro, NULL);
                 if (!rsc->scanout)
@@ -708,11 +625,11 @@
                 return NULL;
 
         switch (whandle->type) {
-        case DRM_API_HANDLE_TYPE_SHARED:
+        case WINSYS_HANDLE_TYPE_SHARED:
                 rsc->bo = vc4_bo_open_name(screen,
                                            whandle->handle, whandle->stride);
                 break;
-        case DRM_API_HANDLE_TYPE_FD:
+        case WINSYS_HANDLE_TYPE_FD:
                 rsc->bo = vc4_bo_open_dmabuf(screen,
                                              whandle->handle, whandle->stride);
                 break;
@@ -1203,6 +1120,14 @@
         return shadow_rsc;
 }
 
+static const struct u_transfer_vtbl transfer_vtbl = {
+        .resource_create          = vc4_resource_create,
+        .resource_destroy         = vc4_resource_destroy,
+        .transfer_map             = vc4_resource_transfer_map,
+        .transfer_unmap           = vc4_resource_transfer_unmap,
+        .transfer_flush_region    = u_default_transfer_flush_region,
+};
+
 void
 vc4_resource_screen_init(struct pipe_screen *pscreen)
 {
@@ -1215,6 +1140,8 @@
         pscreen->resource_destroy = u_resource_destroy_vtbl;
         pscreen->resource_get_handle = vc4_resource_get_handle;
         pscreen->resource_destroy = vc4_resource_destroy;
+        pscreen->transfer_helper = u_transfer_helper_create(&transfer_vtbl,
+                                                            false, false, true);
 
         /* Test if the kernel has GET_TILING; it will return -EINVAL if the
          * ioctl does not exist, but -ENOENT if we pass an impossible handle.
@@ -1231,9 +1158,9 @@
 void
 vc4_resource_context_init(struct pipe_context *pctx)
 {
-        pctx->transfer_map = vc4_resource_transfer_map;
-        pctx->transfer_flush_region = u_default_transfer_flush_region;
-        pctx->transfer_unmap = vc4_resource_transfer_unmap;
+        pctx->transfer_map = u_transfer_helper_transfer_map;
+        pctx->transfer_flush_region = u_transfer_helper_transfer_flush_region;
+        pctx->transfer_unmap = u_transfer_helper_transfer_unmap;
         pctx->buffer_subdata = u_default_buffer_subdata;
         pctx->texture_subdata = u_default_texture_subdata;
         pctx->create_surface = vc4_create_surface;
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index d4c491e..8c0aadb 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -32,9 +32,6 @@
 struct vc4_transfer {
         struct pipe_transfer base;
         void *map;
-
-        struct pipe_resource *ss_resource;
-        struct pipe_box ss_box;
 };
 
 struct vc4_resource_slice {
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index cead71b..646e7fa 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -148,6 +148,9 @@
         case PIPE_CAP_TEXTURE_BARRIER:
                 return 1;
 
+        case PIPE_CAP_NATIVE_FENCE_FD:
+                return screen->has_syncobj;
+
         case PIPE_CAP_TILE_RASTER_ORDER:
                 return vc4_has_feature(screen,
                                        DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER);
@@ -161,6 +164,7 @@
                 return 256;
 
         case PIPE_CAP_GLSL_FEATURE_LEVEL:
+	case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
                 return 120;
 
         case PIPE_CAP_MAX_VIEWPORTS:
@@ -263,7 +267,6 @@
         case PIPE_CAP_VIEWPORT_SUBPIXEL_BITS:
         case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
         case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
-        case PIPE_CAP_NATIVE_FENCE_FD:
         case PIPE_CAP_TGSI_FS_FBFETCH:
         case PIPE_CAP_TGSI_MUL_ZERO_WINS:
         case PIPE_CAP_DOUBLES:
@@ -284,11 +287,19 @@
         case PIPE_CAP_LOAD_CONSTBUF:
         case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
         case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+	case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
         case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
         case PIPE_CAP_CONTEXT_PRIORITY_MASK:
         case PIPE_CAP_FENCE_SIGNAL:
-	case PIPE_CAP_CONSTBUF0_FLAGS:
+        case PIPE_CAP_CONSTBUF0_FLAGS:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+        case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+        case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
         case PIPE_CAP_PACKED_UNIFORMS:
+        case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
                 return 0;
 
                 /* Stream output. */
@@ -375,6 +386,11 @@
                 return 0.0f;
         case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
                 return 0.0f;
+
+        case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+        case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+                return 0.0f;
         default:
                 fprintf(stderr, "unknown paramf %d\n", param);
                 return 0;
@@ -450,6 +466,8 @@
         case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
         case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
                 return 0;
+        case PIPE_SHADER_CAP_SCALAR_ISA:
+                return 1;
         default:
                 fprintf(stderr, "unknown shader param %d\n", param);
                 return 0;
@@ -462,15 +480,18 @@
                                enum pipe_format format,
                                enum pipe_texture_target target,
                                unsigned sample_count,
+                               unsigned storage_sample_count,
                                unsigned usage)
 {
         struct vc4_screen *screen = vc4_screen(pscreen);
 
+        if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+                return false;
+
         if (sample_count > 1 && sample_count != VC4_MAX_SAMPLES)
                 return FALSE;
 
-        if ((target >= PIPE_MAX_TEXTURE_TYPES) ||
-            !util_format_is_supported(format, usage)) {
+        if (target >= PIPE_MAX_TEXTURE_TYPES) {
                 return FALSE;
         }
 
@@ -648,7 +669,9 @@
 vc4_screen_create(int fd, struct renderonly *ro)
 {
         struct vc4_screen *screen = rzalloc(NULL, struct vc4_screen);
+        uint64_t syncobj_cap = 0;
         struct pipe_screen *pscreen;
+        int err;
 
         pscreen = &screen->base;
 
@@ -684,6 +707,10 @@
         screen->has_perfmon_ioctl =
                 vc4_has_feature(screen, DRM_VC4_PARAM_SUPPORTS_PERFMON);
 
+        err = drmGetCap(fd, DRM_CAP_SYNCOBJ, &syncobj_cap);
+        if (err == 0 && syncobj_cap)
+                screen->has_syncobj = true;
+
         if (!vc4_get_chip_info(screen))
                 goto fail;
 
@@ -691,7 +718,7 @@
 
         slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16);
 
-        vc4_fence_init(screen);
+        vc4_fence_screen_init(screen);
 
         vc4_debug = debug_get_option_vc4_debug();
         if (vc4_debug & VC4_DEBUG_SHADERDB)
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 0b88442..f4550d1 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -98,6 +98,7 @@
         bool has_madvise;
         bool has_tiling_ioctl;
         bool has_perfmon_ioctl;
+        bool has_syncobj;
 
         struct vc4_simulator_file *sim_file;
 };
@@ -118,9 +119,9 @@
 extern uint32_t vc4_debug;
 
 void
-vc4_fence_init(struct vc4_screen *screen);
+vc4_fence_screen_init(struct vc4_screen *screen);
 
 struct vc4_fence *
-vc4_fence_create(struct vc4_screen *screen, uint64_t seqno);
+vc4_fence_create(struct vc4_screen *screen, uint64_t seqno, int fd);
 
 #endif /* VC4_SCREEN_H */
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 4067004..1e4657a 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -23,6 +23,7 @@
  */
 
 #include "pipe/p_state.h"
+#include "util/u_framebuffer.h"
 #include "util/u_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
@@ -414,21 +415,10 @@
 {
         struct vc4_context *vc4 = vc4_context(pctx);
         struct pipe_framebuffer_state *cso = &vc4->framebuffer;
-        unsigned i;
 
         vc4->job = NULL;
 
-        for (i = 0; i < framebuffer->nr_cbufs; i++)
-                pipe_surface_reference(&cso->cbufs[i], framebuffer->cbufs[i]);
-        for (; i < vc4->framebuffer.nr_cbufs; i++)
-                pipe_surface_reference(&cso->cbufs[i], NULL);
-
-        cso->nr_cbufs = framebuffer->nr_cbufs;
-
-        pipe_surface_reference(&cso->zsbuf, framebuffer->zsbuf);
-
-        cso->width = framebuffer->width;
-        cso->height = framebuffer->height;
+        util_copy_framebuffer_state(cso, framebuffer);
 
         /* Nonzero texture mipmap levels are laid out as if they were in
          * power-of-two-sized spaces.  The renderbuffer config infers its
diff --git a/src/gallium/drivers/vc5/Automake.inc b/src/gallium/drivers/vc5/Automake.inc
deleted file mode 100644
index 57c8a28..0000000
--- a/src/gallium/drivers/vc5/Automake.inc
+++ /dev/null
@@ -1,14 +0,0 @@
-if HAVE_GALLIUM_VC5
-
-TARGET_DRIVERS += vc5
-TARGET_CPPFLAGS += -DGALLIUM_VC5
-TARGET_LIB_DEPS += \
-	$(top_builddir)/src/gallium/winsys/vc5/drm/libvc5drm.la \
-	$(top_builddir)/src/gallium/drivers/vc5/libvc5.la \
-	$(top_builddir)/src/broadcom/libbroadcom.la
-
-if !HAVE_GALLIUM_VC4
-TARGET_LIB_DEPS += $(top_builddir)/src/broadcom/cle/libbroadcom_cle.la
-endif
-
-endif
diff --git a/src/gallium/drivers/vc5/Makefile.sources b/src/gallium/drivers/vc5/Makefile.sources
deleted file mode 100644
index c1e4e0b..0000000
--- a/src/gallium/drivers/vc5/Makefile.sources
+++ /dev/null
@@ -1,37 +0,0 @@
-C_SOURCES := \
-	vc5_blit.c \
-	vc5_bufmgr.c \
-	vc5_bufmgr.h \
-	vc5_cl.c \
-	vc5_cl.h \
-	vc5_context.c \
-	vc5_context.h \
-	vc5_drm.h \
-	vc5_fence.c \
-	vc5_formats.c \
-	vc5_format_table.h \
-	vc5_job.c \
-	vc5_program.c \
-	vc5_query.c \
-	vc5_resource.c \
-	vc5_resource.h \
-	vc5_screen.c \
-	vc5_screen.h \
-	vc5_simulator.c \
-	vc5_simulator_wrapper.cpp \
-	vc5_simulator_wrapper.h \
-	vc5_tiling.c \
-	vc5_tiling.h \
-	vc5_uniforms.c \
-	$()
-
-VC5_PER_VERSION_SOURCES = \
-	v3dx_context.h \
-	v3dx_format_table.c \
-	v3dx_job.c \
-	v3dx_simulator.c \
-	vc5_draw.c \
-	vc5_emit.c \
-	vc5_rcl.c \
-	vc5_state.c \
-	$()
diff --git a/src/gallium/drivers/vc5/vc5_blit.c b/src/gallium/drivers/vc5/vc5_blit.c
deleted file mode 100644
index 6481141..0000000
--- a/src/gallium/drivers/vc5/vc5_blit.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright © 2015-2017 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "util/u_format.h"
-#include "util/u_surface.h"
-#include "util/u_blitter.h"
-#include "vc5_context.h"
-
-#if 0
-static struct pipe_surface *
-vc5_get_blit_surface(struct pipe_context *pctx,
-                     struct pipe_resource *prsc, unsigned level)
-{
-        struct pipe_surface tmpl;
-
-        memset(&tmpl, 0, sizeof(tmpl));
-        tmpl.format = prsc->format;
-        tmpl.u.tex.level = level;
-        tmpl.u.tex.first_layer = 0;
-        tmpl.u.tex.last_layer = 0;
-
-        return pctx->create_surface(pctx, prsc, &tmpl);
-}
-
-static bool
-is_tile_unaligned(unsigned size, unsigned tile_size)
-{
-        return size & (tile_size - 1);
-}
-
-static bool
-vc5_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-        bool msaa = (info->src.resource->nr_samples > 1 ||
-                     info->dst.resource->nr_samples > 1);
-        int tile_width = msaa ? 32 : 64;
-        int tile_height = msaa ? 32 : 64;
-
-        if (util_format_is_depth_or_stencil(info->dst.resource->format))
-                return false;
-
-        if (info->scissor_enable)
-                return false;
-
-        if ((info->mask & PIPE_MASK_RGBA) == 0)
-                return false;
-
-        if (info->dst.box.x != info->src.box.x ||
-            info->dst.box.y != info->src.box.y ||
-            info->dst.box.width != info->src.box.width ||
-            info->dst.box.height != info->src.box.height) {
-                return false;
-        }
-
-        int dst_surface_width = u_minify(info->dst.resource->width0,
-                                         info->dst.level);
-        int dst_surface_height = u_minify(info->dst.resource->height0,
-                                         info->dst.level);
-        if (is_tile_unaligned(info->dst.box.x, tile_width) ||
-            is_tile_unaligned(info->dst.box.y, tile_height) ||
-            (is_tile_unaligned(info->dst.box.width, tile_width) &&
-             info->dst.box.x + info->dst.box.width != dst_surface_width) ||
-            (is_tile_unaligned(info->dst.box.height, tile_height) &&
-             info->dst.box.y + info->dst.box.height != dst_surface_height)) {
-                return false;
-        }
-
-        /* VC5_PACKET_LOAD_TILE_BUFFER_GENERAL uses the
-         * VC5_PACKET_TILE_RENDERING_MODE_CONFIG's width (determined by our
-         * destination surface) to determine the stride.  This may be wrong
-         * when reading from texture miplevels > 0, which are stored in
-         * POT-sized areas.  For MSAA, the tile addresses are computed
-         * explicitly by the RCL, but still use the destination width to
-         * determine the stride (which could be fixed by explicitly supplying
-         * it in the ABI).
-         */
-        struct vc5_resource *rsc = vc5_resource(info->src.resource);
-
-        uint32_t stride;
-
-        if (info->src.resource->nr_samples > 1)
-                stride = align(dst_surface_width, 32) * 4 * rsc->cpp;
-        /* XXX else if (rsc->slices[info->src.level].tiling == VC5_TILING_FORMAT_T)
-           stride = align(dst_surface_width * rsc->cpp, 128); */
-        else
-                stride = align(dst_surface_width * rsc->cpp, 16);
-
-        if (stride != rsc->slices[info->src.level].stride)
-                return false;
-
-        if (info->dst.resource->format != info->src.resource->format)
-                return false;
-
-        if (false) {
-                fprintf(stderr, "RCL blit from %d,%d to %d,%d (%d,%d)\n",
-                        info->src.box.x,
-                        info->src.box.y,
-                        info->dst.box.x,
-                        info->dst.box.y,
-                        info->dst.box.width,
-                        info->dst.box.height);
-        }
-
-        struct pipe_surface *dst_surf =
-                vc5_get_blit_surface(pctx, info->dst.resource, info->dst.level);
-        struct pipe_surface *src_surf =
-                vc5_get_blit_surface(pctx, info->src.resource, info->src.level);
-
-        vc5_flush_jobs_reading_resource(vc5, info->src.resource);
-
-        struct vc5_job *job = vc5_get_job(vc5, dst_surf, NULL);
-        pipe_surface_reference(&job->color_read, src_surf);
-
-        /* If we're resolving from MSAA to single sample, we still need to run
-         * the engine in MSAA mode for the load.
-         */
-        if (!job->msaa && info->src.resource->nr_samples > 1) {
-                job->msaa = true;
-                job->tile_width = 32;
-                job->tile_height = 32;
-        }
-
-        job->draw_min_x = info->dst.box.x;
-        job->draw_min_y = info->dst.box.y;
-        job->draw_max_x = info->dst.box.x + info->dst.box.width;
-        job->draw_max_y = info->dst.box.y + info->dst.box.height;
-        job->draw_width = dst_surf->width;
-        job->draw_height = dst_surf->height;
-
-        job->tile_width = tile_width;
-        job->tile_height = tile_height;
-        job->msaa = msaa;
-        job->needs_flush = true;
-        job->resolve |= PIPE_CLEAR_COLOR;
-
-        vc5_job_submit(vc5, job);
-
-        pipe_surface_reference(&dst_surf, NULL);
-        pipe_surface_reference(&src_surf, NULL);
-
-        return true;
-}
-#endif
-
-void
-vc5_blitter_save(struct vc5_context *vc5)
-{
-        util_blitter_save_fragment_constant_buffer_slot(vc5->blitter,
-                                                        vc5->constbuf[PIPE_SHADER_FRAGMENT].cb);
-        util_blitter_save_vertex_buffer_slot(vc5->blitter, vc5->vertexbuf.vb);
-        util_blitter_save_vertex_elements(vc5->blitter, vc5->vtx);
-        util_blitter_save_vertex_shader(vc5->blitter, vc5->prog.bind_vs);
-        util_blitter_save_so_targets(vc5->blitter, vc5->streamout.num_targets,
-                                     vc5->streamout.targets);
-        util_blitter_save_rasterizer(vc5->blitter, vc5->rasterizer);
-        util_blitter_save_viewport(vc5->blitter, &vc5->viewport);
-        util_blitter_save_scissor(vc5->blitter, &vc5->scissor);
-        util_blitter_save_fragment_shader(vc5->blitter, vc5->prog.bind_fs);
-        util_blitter_save_blend(vc5->blitter, vc5->blend);
-        util_blitter_save_depth_stencil_alpha(vc5->blitter, vc5->zsa);
-        util_blitter_save_stencil_ref(vc5->blitter, &vc5->stencil_ref);
-        util_blitter_save_sample_mask(vc5->blitter, vc5->sample_mask);
-        util_blitter_save_framebuffer(vc5->blitter, &vc5->framebuffer);
-        util_blitter_save_fragment_sampler_states(vc5->blitter,
-                        vc5->fragtex.num_samplers,
-                        (void **)vc5->fragtex.samplers);
-        util_blitter_save_fragment_sampler_views(vc5->blitter,
-                        vc5->fragtex.num_textures, vc5->fragtex.textures);
-        util_blitter_save_so_targets(vc5->blitter, vc5->streamout.num_targets,
-                                     vc5->streamout.targets);
-}
-
-static bool
-vc5_render_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
-{
-        struct vc5_context *vc5 = vc5_context(ctx);
-
-        if (!util_blitter_is_blit_supported(vc5->blitter, info)) {
-                fprintf(stderr, "blit unsupported %s -> %s\n",
-                    util_format_short_name(info->src.resource->format),
-                    util_format_short_name(info->dst.resource->format));
-                return false;
-        }
-
-        vc5_blitter_save(vc5);
-        util_blitter_blit(vc5->blitter, info);
-
-        return true;
-}
-
-/* Optimal hardware path for blitting pixels.
- * Scaling, format conversion, up- and downsampling (resolve) are allowed.
- */
-void
-vc5_blit(struct pipe_context *pctx, const struct pipe_blit_info *blit_info)
-{
-        struct pipe_blit_info info = *blit_info;
-
-#if 0
-        if (vc5_tile_blit(pctx, blit_info))
-                return;
-#endif
-
-        vc5_render_blit(pctx, &info);
-}
diff --git a/src/gallium/drivers/vc5/vc5_context.c b/src/gallium/drivers/vc5/vc5_context.c
deleted file mode 100644
index b6d1234..0000000
--- a/src/gallium/drivers/vc5/vc5_context.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright © 2014-2017 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include <xf86drm.h>
-#include <err.h>
-
-#include "pipe/p_defines.h"
-#include "util/hash_table.h"
-#include "util/ralloc.h"
-#include "util/u_inlines.h"
-#include "util/u_memory.h"
-#include "util/u_blitter.h"
-#include "util/u_upload_mgr.h"
-#include "indices/u_primconvert.h"
-#include "pipe/p_screen.h"
-
-#include "vc5_screen.h"
-#include "vc5_context.h"
-#include "vc5_resource.h"
-
-void
-vc5_flush(struct pipe_context *pctx)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-
-        struct hash_entry *entry;
-        hash_table_foreach(vc5->jobs, entry) {
-                struct vc5_job *job = entry->data;
-                vc5_job_submit(vc5, job);
-        }
-}
-
-static void
-vc5_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
-               unsigned flags)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-
-        vc5_flush(pctx);
-
-        if (fence) {
-                struct pipe_screen *screen = pctx->screen;
-                struct vc5_fence *f = vc5_fence_create(vc5);
-                screen->fence_reference(screen, fence, NULL);
-                *fence = (struct pipe_fence_handle *)f;
-        }
-}
-
-static void
-vc5_invalidate_resource(struct pipe_context *pctx, struct pipe_resource *prsc)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_resource *rsc = vc5_resource(prsc);
-
-        rsc->initialized_buffers = 0;
-
-        struct hash_entry *entry = _mesa_hash_table_search(vc5->write_jobs,
-                                                           prsc);
-        if (!entry)
-                return;
-
-        struct vc5_job *job = entry->data;
-        if (job->key.zsbuf && job->key.zsbuf->texture == prsc)
-                job->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
-}
-
-static void
-vc5_context_destroy(struct pipe_context *pctx)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-
-        vc5_flush(pctx);
-
-        if (vc5->blitter)
-                util_blitter_destroy(vc5->blitter);
-
-        if (vc5->primconvert)
-                util_primconvert_destroy(vc5->primconvert);
-
-        if (vc5->uploader)
-                u_upload_destroy(vc5->uploader);
-
-        slab_destroy_child(&vc5->transfer_pool);
-
-        pipe_surface_reference(&vc5->framebuffer.cbufs[0], NULL);
-        pipe_surface_reference(&vc5->framebuffer.zsbuf, NULL);
-
-        vc5_program_fini(pctx);
-
-        ralloc_free(vc5);
-}
-
-struct pipe_context *
-vc5_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
-{
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_context *vc5;
-
-        /* Prevent dumping of the shaders built during context setup. */
-        uint32_t saved_shaderdb_flag = V3D_DEBUG & V3D_DEBUG_SHADERDB;
-        V3D_DEBUG &= ~V3D_DEBUG_SHADERDB;
-
-        vc5 = rzalloc(NULL, struct vc5_context);
-        if (!vc5)
-                return NULL;
-        struct pipe_context *pctx = &vc5->base;
-
-        vc5->screen = screen;
-
-        int ret = drmSyncobjCreate(screen->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
-                                   &vc5->out_sync);
-        if (ret) {
-                ralloc_free(vc5);
-                return NULL;
-        }
-
-        pctx->screen = pscreen;
-        pctx->priv = priv;
-        pctx->destroy = vc5_context_destroy;
-        pctx->flush = vc5_pipe_flush;
-        pctx->invalidate_resource = vc5_invalidate_resource;
-
-        if (screen->devinfo.ver >= 41) {
-                v3d41_draw_init(pctx);
-                v3d41_state_init(pctx);
-        } else {
-                v3d33_draw_init(pctx);
-                v3d33_state_init(pctx);
-        }
-        vc5_program_init(pctx);
-        vc5_query_init(pctx);
-        vc5_resource_context_init(pctx);
-
-        vc5_job_init(vc5);
-
-        vc5->fd = screen->fd;
-
-        slab_create_child(&vc5->transfer_pool, &screen->transfer_pool);
-
-        vc5->uploader = u_upload_create_default(&vc5->base);
-        vc5->base.stream_uploader = vc5->uploader;
-        vc5->base.const_uploader = vc5->uploader;
-
-        vc5->blitter = util_blitter_create(pctx);
-        if (!vc5->blitter)
-                goto fail;
-
-        vc5->primconvert = util_primconvert_create(pctx,
-                                                   (1 << PIPE_PRIM_QUADS) - 1);
-        if (!vc5->primconvert)
-                goto fail;
-
-        V3D_DEBUG |= saved_shaderdb_flag;
-
-        vc5->sample_mask = (1 << VC5_MAX_SAMPLES) - 1;
-        vc5->active_queries = true;
-
-        return &vc5->base;
-
-fail:
-        pctx->destroy(pctx);
-        return NULL;
-}
diff --git a/src/gallium/drivers/vc5/vc5_emit.c b/src/gallium/drivers/vc5/vc5_emit.c
deleted file mode 100644
index 0d11d7e..0000000
--- a/src/gallium/drivers/vc5/vc5_emit.c
+++ /dev/null
@@ -1,683 +0,0 @@
-/*
- * Copyright © 2014-2017 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "util/u_format.h"
-#include "util/u_half.h"
-#include "vc5_context.h"
-#include "broadcom/common/v3d_macros.h"
-#include "broadcom/cle/v3dx_pack.h"
-#include "broadcom/compiler/v3d_compiler.h"
-
-static uint8_t
-vc5_factor(enum pipe_blendfactor factor, bool dst_alpha_one)
-{
-        /* We may get a bad blendfactor when blending is disabled. */
-        if (factor == 0)
-                return V3D_BLEND_FACTOR_ZERO;
-
-        switch (factor) {
-        case PIPE_BLENDFACTOR_ZERO:
-                return V3D_BLEND_FACTOR_ZERO;
-        case PIPE_BLENDFACTOR_ONE:
-                return V3D_BLEND_FACTOR_ONE;
-        case PIPE_BLENDFACTOR_SRC_COLOR:
-                return V3D_BLEND_FACTOR_SRC_COLOR;
-        case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-                return V3D_BLEND_FACTOR_INV_SRC_COLOR;
-        case PIPE_BLENDFACTOR_DST_COLOR:
-                return V3D_BLEND_FACTOR_DST_COLOR;
-        case PIPE_BLENDFACTOR_INV_DST_COLOR:
-                return V3D_BLEND_FACTOR_INV_DST_COLOR;
-        case PIPE_BLENDFACTOR_SRC_ALPHA:
-                return V3D_BLEND_FACTOR_SRC_ALPHA;
-        case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-                return V3D_BLEND_FACTOR_INV_SRC_ALPHA;
-        case PIPE_BLENDFACTOR_DST_ALPHA:
-                return (dst_alpha_one ?
-                        V3D_BLEND_FACTOR_ONE :
-                        V3D_BLEND_FACTOR_DST_ALPHA);
-        case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-                return (dst_alpha_one ?
-                        V3D_BLEND_FACTOR_ZERO :
-                        V3D_BLEND_FACTOR_INV_DST_ALPHA);
-        case PIPE_BLENDFACTOR_CONST_COLOR:
-                return V3D_BLEND_FACTOR_CONST_COLOR;
-        case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-                return V3D_BLEND_FACTOR_INV_CONST_COLOR;
-        case PIPE_BLENDFACTOR_CONST_ALPHA:
-                return V3D_BLEND_FACTOR_CONST_ALPHA;
-        case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-                return V3D_BLEND_FACTOR_INV_CONST_ALPHA;
-        case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-                return V3D_BLEND_FACTOR_SRC_ALPHA_SATURATE;
-        default:
-                unreachable("Bad blend factor");
-        }
-}
-
-static inline uint16_t
-swizzled_border_color(const struct v3d_device_info *devinfo,
-                      struct pipe_sampler_state *sampler,
-                      struct vc5_sampler_view *sview,
-                      int chan)
-{
-        const struct util_format_description *desc =
-                util_format_description(sview->base.format);
-        uint8_t swiz = chan;
-
-        /* If we're doing swizzling in the sampler, then only rearrange the
-         * border color for the mismatch between the VC5 texture format and
-         * the PIPE_FORMAT, since GL_ARB_texture_swizzle will be handled by
-         * the sampler's swizzle.
-         *
-         * For swizzling in the shader, we don't do any pre-swizzling of the
-         * border color.
-         */
-        if (vc5_get_tex_return_size(devinfo, sview->base.format,
-                                    sampler->compare_mode) != 32)
-                swiz = desc->swizzle[swiz];
-
-        switch (swiz) {
-        case PIPE_SWIZZLE_0:
-                return util_float_to_half(0.0);
-        case PIPE_SWIZZLE_1:
-                return util_float_to_half(1.0);
-        default:
-                return util_float_to_half(sampler->border_color.f[swiz]);
-        }
-}
-
-#if V3D_VERSION < 40
-static uint32_t
-translate_swizzle(unsigned char pipe_swizzle)
-{
-        switch (pipe_swizzle) {
-        case PIPE_SWIZZLE_0:
-                return 0;
-        case PIPE_SWIZZLE_1:
-                return 1;
-        case PIPE_SWIZZLE_X:
-        case PIPE_SWIZZLE_Y:
-        case PIPE_SWIZZLE_Z:
-        case PIPE_SWIZZLE_W:
-                return 2 + pipe_swizzle;
-        default:
-                unreachable("unknown swizzle");
-        }
-}
-
-static void
-emit_one_texture(struct vc5_context *vc5, struct vc5_texture_stateobj *stage_tex,
-                 int i)
-{
-        struct vc5_job *job = vc5->job;
-        struct pipe_sampler_state *psampler = stage_tex->samplers[i];
-        struct vc5_sampler_state *sampler = vc5_sampler_state(psampler);
-        struct pipe_sampler_view *psview = stage_tex->textures[i];
-        struct vc5_sampler_view *sview = vc5_sampler_view(psview);
-        struct pipe_resource *prsc = psview->texture;
-        struct vc5_resource *rsc = vc5_resource(prsc);
-        const struct v3d_device_info *devinfo = &vc5->screen->devinfo;
-
-        stage_tex->texture_state[i].offset =
-                vc5_cl_ensure_space(&job->indirect,
-                                    cl_packet_length(TEXTURE_SHADER_STATE),
-                                    32);
-        vc5_bo_set_reference(&stage_tex->texture_state[i].bo,
-                             job->indirect.bo);
-
-        uint32_t return_size = vc5_get_tex_return_size(devinfo, psview->format,
-                                                       psampler->compare_mode);
-
-        struct V3D33_TEXTURE_SHADER_STATE unpacked = {
-                /* XXX */
-                .border_color_red = swizzled_border_color(devinfo, psampler,
-                                                          sview, 0),
-                .border_color_green = swizzled_border_color(devinfo, psampler,
-                                                            sview, 1),
-                .border_color_blue = swizzled_border_color(devinfo, psampler,
-                                                           sview, 2),
-                .border_color_alpha = swizzled_border_color(devinfo, psampler,
-                                                            sview, 3),
-
-                /* In the normal texturing path, the LOD gets clamped between
-                 * min/max, and the base_level field (set in the sampler view
-                 * from first_level) only decides where the min/mag switch
-                 * happens, so we need to use the LOD clamps to keep us
-                 * between min and max.
-                 *
-                 * For txf, the LOD clamp is still used, despite GL not
-                 * wanting that.  We will need to have a separate
-                 * TEXTURE_SHADER_STATE that ignores psview->min/max_lod to
-                 * support txf properly.
-                 */
-                .min_level_of_detail = MIN2(psview->u.tex.first_level +
-                                            MAX2(psampler->min_lod, 0),
-                                            psview->u.tex.last_level),
-                .max_level_of_detail = MIN2(psview->u.tex.first_level +
-                                            psampler->max_lod,
-                                            psview->u.tex.last_level),
-
-                .texture_base_pointer = cl_address(rsc->bo,
-                                                   rsc->slices[0].offset),
-
-                .output_32_bit = return_size == 32,
-        };
-
-        /* Set up the sampler swizzle if we're doing 16-bit sampling.  For
-         * 32-bit, we leave swizzling up to the shader compiler.
-         *
-         * Note: Contrary to the docs, the swizzle still applies even if the
-         * return size is 32.  It's just that you probably want to swizzle in
-         * the shader, because you need the Y/Z/W channels to be defined.
-         */
-        if (return_size == 32) {
-                unpacked.swizzle_r = translate_swizzle(PIPE_SWIZZLE_X);
-                unpacked.swizzle_g = translate_swizzle(PIPE_SWIZZLE_Y);
-                unpacked.swizzle_b = translate_swizzle(PIPE_SWIZZLE_Z);
-                unpacked.swizzle_a = translate_swizzle(PIPE_SWIZZLE_W);
-        } else {
-                unpacked.swizzle_r = translate_swizzle(sview->swizzle[0]);
-                unpacked.swizzle_g = translate_swizzle(sview->swizzle[1]);
-                unpacked.swizzle_b = translate_swizzle(sview->swizzle[2]);
-                unpacked.swizzle_a = translate_swizzle(sview->swizzle[3]);
-        }
-
-        int min_img_filter = psampler->min_img_filter;
-        int min_mip_filter = psampler->min_mip_filter;
-        int mag_img_filter = psampler->mag_img_filter;
-
-        if (return_size == 32) {
-                min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
-                mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-                mag_img_filter = PIPE_TEX_FILTER_NEAREST;
-        }
-
-        bool min_nearest = min_img_filter == PIPE_TEX_FILTER_NEAREST;
-        switch (min_mip_filter) {
-        case PIPE_TEX_MIPFILTER_NONE:
-                unpacked.filter += min_nearest ? 2 : 0;
-                break;
-        case PIPE_TEX_MIPFILTER_NEAREST:
-                unpacked.filter += min_nearest ? 4 : 8;
-                break;
-        case PIPE_TEX_MIPFILTER_LINEAR:
-                unpacked.filter += min_nearest ? 4 : 8;
-                unpacked.filter += 2;
-                break;
-        }
-
-        if (mag_img_filter == PIPE_TEX_FILTER_NEAREST)
-                unpacked.filter++;
-
-        if (psampler->max_anisotropy > 8)
-                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_16_1;
-        else if (psampler->max_anisotropy > 4)
-                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_8_1;
-        else if (psampler->max_anisotropy > 2)
-                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_4_1;
-        else if (psampler->max_anisotropy)
-                unpacked.filter = V3D_TMU_FILTER_ANISOTROPIC_2_1;
-
-        uint8_t packed[cl_packet_length(TEXTURE_SHADER_STATE)];
-        cl_packet_pack(TEXTURE_SHADER_STATE)(&job->indirect, packed, &unpacked);
-
-        for (int i = 0; i < ARRAY_SIZE(packed); i++)
-                packed[i] |= sview->texture_shader_state[i] | sampler->texture_shader_state[i];
-
-        /* TMU indirect structs need to be 32b aligned. */
-        vc5_cl_ensure_space(&job->indirect, ARRAY_SIZE(packed), 32);
-        cl_emit_prepacked(&job->indirect, &packed);
-}
-
-static void
-emit_textures(struct vc5_context *vc5, struct vc5_texture_stateobj *stage_tex)
-{
-        for (int i = 0; i < stage_tex->num_textures; i++) {
-                if (stage_tex->textures[i])
-                        emit_one_texture(vc5, stage_tex, i);
-        }
-}
-#endif /* V3D_VERSION < 40 */
-
-static uint32_t
-translate_colormask(struct vc5_context *vc5, uint32_t colormask, int rt)
-{
-        if (vc5->swap_color_rb & (1 << rt)) {
-                colormask = ((colormask & (2 | 8)) |
-                             ((colormask & 1) << 2) |
-                             ((colormask & 4) >> 2));
-        }
-
-        return (~colormask) & 0xf;
-}
-
-static void
-emit_rt_blend(struct vc5_context *vc5, struct vc5_job *job,
-              struct pipe_blend_state *blend, int rt)
-{
-        cl_emit(&job->bcl, BLEND_CONFIG, config) {
-                struct pipe_rt_blend_state *rtblend = &blend->rt[rt];
-
-#if V3D_VERSION >= 40
-                config.render_target_mask = 1 << rt;
-#else
-                assert(rt == 0);
-#endif
-
-                config.colour_blend_mode = rtblend->rgb_func;
-                config.colour_blend_dst_factor =
-                        vc5_factor(rtblend->rgb_dst_factor,
-                                   vc5->blend_dst_alpha_one);
-                config.colour_blend_src_factor =
-                        vc5_factor(rtblend->rgb_src_factor,
-                                   vc5->blend_dst_alpha_one);
-
-                config.alpha_blend_mode = rtblend->alpha_func;
-                config.alpha_blend_dst_factor =
-                        vc5_factor(rtblend->alpha_dst_factor,
-                                   vc5->blend_dst_alpha_one);
-                config.alpha_blend_src_factor =
-                        vc5_factor(rtblend->alpha_src_factor,
-                                   vc5->blend_dst_alpha_one);
-        }
-}
-
-void
-v3dX(emit_state)(struct pipe_context *pctx)
-{
-        struct vc5_context *vc5 = vc5_context(pctx);
-        struct vc5_job *job = vc5->job;
-        bool rasterizer_discard = vc5->rasterizer->base.rasterizer_discard;
-
-        if (vc5->dirty & (VC5_DIRTY_SCISSOR | VC5_DIRTY_VIEWPORT |
-                          VC5_DIRTY_RASTERIZER)) {
-                float *vpscale = vc5->viewport.scale;
-                float *vptranslate = vc5->viewport.translate;
-                float vp_minx = -fabsf(vpscale[0]) + vptranslate[0];
-                float vp_maxx = fabsf(vpscale[0]) + vptranslate[0];
-                float vp_miny = -fabsf(vpscale[1]) + vptranslate[1];
-                float vp_maxy = fabsf(vpscale[1]) + vptranslate[1];
-
-                /* Clip to the scissor if it's enabled, but still clip to the
-                 * drawable regardless since that controls where the binner
-                 * tries to put things.
-                 *
-                 * Additionally, always clip the rendering to the viewport,
-                 * since the hardware does guardband clipping, meaning
-                 * primitives would rasterize outside of the view volume.
-                 */
-                uint32_t minx, miny, maxx, maxy;
-                if (!vc5->rasterizer->base.scissor) {
-                        minx = MAX2(vp_minx, 0);
-                        miny = MAX2(vp_miny, 0);
-                        maxx = MIN2(vp_maxx, job->draw_width);
-                        maxy = MIN2(vp_maxy, job->draw_height);
-                } else {
-                        minx = MAX2(vp_minx, vc5->scissor.minx);
-                        miny = MAX2(vp_miny, vc5->scissor.miny);
-                        maxx = MIN2(vp_maxx, vc5->scissor.maxx);
-                        maxy = MIN2(vp_maxy, vc5->scissor.maxy);
-                }
-
-                cl_emit(&job->bcl, CLIP_WINDOW, clip) {
-                        clip.clip_window_left_pixel_coordinate = minx;
-                        clip.clip_window_bottom_pixel_coordinate = miny;
-                        clip.clip_window_width_in_pixels = maxx - minx;
-                        clip.clip_window_height_in_pixels = maxy - miny;
-
-#if V3D_VERSION < 41
-                        /* The HW won't entirely clip out when scissor w/h is
-                         * 0.  Just treat it the same as rasterizer discard.
-                         */
-                        if (clip.clip_window_width_in_pixels == 0 ||
-                            clip.clip_window_height_in_pixels == 0) {
-                                rasterizer_discard = true;
-                                clip.clip_window_width_in_pixels = 1;
-                                clip.clip_window_height_in_pixels = 1;
-                        }
-#endif
-                }
-
-                job->draw_min_x = MIN2(job->draw_min_x, minx);
-                job->draw_min_y = MIN2(job->draw_min_y, miny);
-                job->draw_max_x = MAX2(job->draw_max_x, maxx);
-                job->draw_max_y = MAX2(job->draw_max_y, maxy);
-        }
-
-        if (vc5->dirty & (VC5_DIRTY_RASTERIZER |
-                          VC5_DIRTY_ZSA |
-                          VC5_DIRTY_BLEND |
-                          VC5_DIRTY_COMPILED_FS)) {
-                cl_emit(&job->bcl, CONFIGURATION_BITS, config) {
-                        config.enable_forward_facing_primitive =
-                                !rasterizer_discard &&
-                                !(vc5->rasterizer->base.cull_face &
-                                  PIPE_FACE_FRONT);
-                        config.enable_reverse_facing_primitive =
-                                !rasterizer_discard &&
-                                !(vc5->rasterizer->base.cull_face &
-                                  PIPE_FACE_BACK);
-                        /* This seems backwards, but it's what gets the
-                         * clipflat test to pass.
-                         */
-                        config.clockwise_primitives =
-                                vc5->rasterizer->base.front_ccw;
-
-                        config.enable_depth_offset =
-                                vc5->rasterizer->base.offset_tri;
-
-                        config.rasterizer_oversample_mode =
-                                vc5->rasterizer->base.multisample;
-
-                        config.direct3d_provoking_vertex =
-                                vc5->rasterizer->base.flatshade_first;
-
-                        config.blend_enable = vc5->blend->rt[0].blend_enable;
-
-                        /* Note: EZ state may update based on the compiled FS,
-                         * along with ZSA
-                         */
-                        config.early_z_updates_enable =
-                                (job->ez_state != VC5_EZ_DISABLED);
-                        if (vc5->zsa->base.depth.enabled) {
-                                config.z_updates_enable =
-                                        vc5->zsa->base.depth.writemask;
-                                config.early_z_enable =
-                                        config.early_z_updates_enable;
-                                config.depth_test_function =
-                                        vc5->zsa->base.depth.func;
-                        } else {
-                                config.depth_test_function = PIPE_FUNC_ALWAYS;
-                        }
-
-                        config.stencil_enable =
-                                vc5->zsa->base.stencil[0].enabled;
-                }
-
-        }
-
-        if (vc5->dirty & VC5_DIRTY_RASTERIZER &&
-            vc5->rasterizer->base.offset_tri) {
-                cl_emit(&job->bcl, DEPTH_OFFSET, depth) {
-                        depth.depth_offset_factor =
-                                vc5->rasterizer->offset_factor;
-                        depth.depth_offset_units =
-                                vc5->rasterizer->offset_units;
-                }
-        }
-
-        if (vc5->dirty & VC5_DIRTY_RASTERIZER) {
-                cl_emit(&job->bcl, POINT_SIZE, point_size) {
-                        point_size.point_size = vc5->rasterizer->point_size;
-                }
-
-                cl_emit(&job->bcl, LINE_WIDTH, line_width) {
-                        line_width.line_width = vc5->rasterizer->base.line_width;
-                }
-        }
-
-        if (vc5->dirty & VC5_DIRTY_VIEWPORT) {
-                cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
-                        clip.viewport_half_width_in_1_256th_of_pixel =
-                                vc5->viewport.scale[0] * 256.0f;
-                        clip.viewport_half_height_in_1_256th_of_pixel =
-                                vc5->viewport.scale[1] * 256.0f;
-                }
-
-                cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
-                        clip.viewport_z_offset_zc_to_zs =
-                                vc5->viewport.translate[2];
-                        clip.viewport_z_scale_zc_to_zs =
-                                vc5->viewport.scale[2];
-                }
-                cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
-                        clip.minimum_zw = (vc5->viewport.translate[2] -
-                                           vc5->viewport.scale[2]);
-                        clip.maximum_zw = (vc5->viewport.translate[2] +
-                                           vc5->viewport.scale[2]);
-                }
-
-                cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
-                        vp.viewport_centre_x_coordinate =
-                                vc5->viewport.translate[0];
-                        vp.viewport_centre_y_coordinate =
-                                vc5->viewport.translate[1];
-                }
-        }
-
-        if (vc5->dirty & VC5_DIRTY_BLEND && vc5->blend->rt[0].blend_enable) {
-                struct pipe_blend_state *blend = vc5->blend;
-
-                if (blend->independent_blend_enable) {
-                        for (int i = 0; i < VC5_MAX_DRAW_BUFFERS; i++)
-                                emit_rt_blend(vc5, job, blend, i);
-                } else {
-                        emit_rt_blend(vc5, job, blend, 0);
-                }
-        }
-
-        if (vc5->dirty & VC5_DIRTY_BLEND) {
-                struct pipe_blend_state *blend = vc5->blend;
-
-                cl_emit(&job->bcl, COLOUR_WRITE_MASKS, mask) {
-                        if (blend->independent_blend_enable) {
-                                mask.render_target_0_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[0].colormask, 0);
-                                mask.render_target_1_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[1].colormask, 1);
-                                mask.render_target_2_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[2].colormask, 2);
-                                mask.render_target_3_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[3].colormask, 3);
-                        } else {
-                                mask.render_target_0_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[0].colormask, 0);
-                                mask.render_target_1_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[0].colormask, 1);
-                                mask.render_target_2_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[0].colormask, 2);
-                                mask.render_target_3_per_colour_component_write_masks =
-                                        translate_colormask(vc5, blend->rt[0].colormask, 3);
-                        }
-                }
-        }
-
-        /* GFXH-1431: On V3D 3.x, writing BLEND_CONFIG resets the constant
-         * color.
-         */
-        if (vc5->dirty & VC5_DIRTY_BLEND_COLOR ||
-            (V3D_VERSION < 41 && (vc5->dirty & VC5_DIRTY_BLEND))) {
-                cl_emit(&job->bcl, BLEND_CONSTANT_COLOUR, colour) {
-                        colour.red_f16 = (vc5->swap_color_rb ?
-                                          vc5->blend_color.hf[2] :
-                                          vc5->blend_color.hf[0]);
-                        colour.green_f16 = vc5->blend_color.hf[1];
-                        colour.blue_f16 = (vc5->swap_color_rb ?
-                                           vc5->blend_color.hf[0] :
-                                           vc5->blend_color.hf[2]);
-                        colour.alpha_f16 = vc5->blend_color.hf[3];
-                }
-        }
-
-        if (vc5->dirty & (VC5_DIRTY_ZSA | VC5_DIRTY_STENCIL_REF)) {
-                struct pipe_stencil_state *front = &vc5->zsa->base.stencil[0];
-                struct pipe_stencil_state *back = &vc5->zsa->base.stencil[1];
-
-                if (front->enabled) {
-                        cl_emit_with_prepacked(&job->bcl, STENCIL_CONFIG,
-                                               vc5->zsa->stencil_front, config) {
-                                config.stencil_ref_value =
-                                        vc5->stencil_ref.ref_value[0];
-                        }
-                }
-
-                if (back->enabled) {
-                        cl_emit_with_prepacked(&job->bcl, STENCIL_CONFIG,
-                                               vc5->zsa->stencil_back, config) {
-                                config.stencil_ref_value =
-                                        vc5->stencil_ref.ref_value[1];
-                        }
-                }
-        }
-
-#if V3D_VERSION < 40
-        /* Pre-4.x, we have texture state that depends on both the sampler and
-         * the view, so we merge them together at draw time.
-         */
-        if (vc5->dirty & VC5_DIRTY_FRAGTEX)
-                emit_textures(vc5, &vc5->fragtex);
-
-        if (vc5->dirty & VC5_DIRTY_VERTTEX)
-                emit_textures(vc5, &vc5->verttex);
-#endif
-
-        if (vc5->dirty & VC5_DIRTY_FLAT_SHADE_FLAGS) {
-                bool emitted_any = false;
-
-                for (int i = 0; i < ARRAY_SIZE(vc5->prog.fs->prog_data.fs->flat_shade_flags); i++) {
-                        if (!vc5->prog.fs->prog_data.fs->flat_shade_flags[i])
-                                continue;
-
-                        cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
-                                flags.varying_offset_v0 = i;
-
-                                if (emitted_any) {
-                                        flags.action_for_flat_shade_flags_of_lower_numbered_varyings =
-                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
-                                        flags.action_for_flat_shade_flags_of_higher_numbered_varyings =
-                                                V3D_VARYING_FLAGS_ACTION_UNCHANGED;
-                                } else {
-                                        flags.action_for_flat_shade_flags_of_lower_numbered_varyings =
-                                                ((i == 0) ?
-                                                 V3D_VARYING_FLAGS_ACTION_UNCHANGED :
-                                                 V3D_VARYING_FLAGS_ACTION_ZEROED);
-
-                                        flags.action_for_flat_shade_flags_of_higher_numbered_varyings =
-                                                V3D_VARYING_FLAGS_ACTION_ZEROED;
-                                }
-
-                                flags.flat_shade_flags_for_varyings_v024 =
-                                        vc5->prog.fs->prog_data.fs->flat_shade_flags[i];
-                        }
-
-                        emitted_any = true;
-                }
-
-                if (!emitted_any) {
-                        cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
-                }
-        }
-
-        /* Set up the transform feedback data specs (which VPM entries to
-         * output to which buffers).
-         */
-        if (vc5->dirty & (VC5_DIRTY_STREAMOUT |
-                          VC5_DIRTY_RASTERIZER |
-                          VC5_DIRTY_PRIM_MODE)) {
-                struct vc5_streamout_stateobj *so = &vc5->streamout;
-
-                if (so->num_targets) {
-                        bool psiz_per_vertex = (vc5->prim_mode == PIPE_PRIM_POINTS &&
-                                                vc5->rasterizer->base.point_size_per_vertex);
-                        uint16_t *tf_specs = (psiz_per_vertex ?
-                                              vc5->prog.bind_vs->tf_specs_psiz :
-                                              vc5->prog.bind_vs->tf_specs);
-
-#if V3D_VERSION >= 40
-                        job->tf_enabled = (vc5->prog.bind_vs->num_tf_specs != 0 &&
-                                           vc5->active_queries);
-
-                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
-                                tfe.number_of_16_bit_output_data_specs_following =
-                                        vc5->prog.bind_vs->num_tf_specs;
-                                tfe.enable = job->tf_enabled;
-                        };
-#else /* V3D_VERSION < 40 */
-                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_ENABLE, tfe) {
-                                tfe.number_of_32_bit_output_buffer_address_following =
-                                        so->num_targets;
-                                tfe.number_of_16_bit_output_data_specs_following =
-                                        vc5->prog.bind_vs->num_tf_specs;
-                        };
-#endif /* V3D_VERSION < 40 */
-                        for (int i = 0; i < vc5->prog.bind_vs->num_tf_specs; i++) {
-                                cl_emit_prepacked(&job->bcl, &tf_specs[i]);
-                        }
-                } else if (job->tf_enabled) {
-#if V3D_VERSION >= 40
-                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_SPECS, tfe) {
-                                tfe.enable = false;
-                        };
-                        job->tf_enabled = false;
-#endif /* V3D_VERSION >= 40 */
-                }
-        }
-
-        /* Set up the trasnform feedback buffers. */
-        if (vc5->dirty & VC5_DIRTY_STREAMOUT) {
-                struct vc5_streamout_stateobj *so = &vc5->streamout;
-                for (int i = 0; i < so->num_targets; i++) {
-                        const struct pipe_stream_output_target *target =
-                                so->targets[i];
-                        struct vc5_resource *rsc = target ?
-                                vc5_resource(target->buffer) : NULL;
-
-#if V3D_VERSION >= 40
-                        if (!target)
-                                continue;
-
-                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_BUFFER, output) {
-                                output.buffer_address =
-                                        cl_address(rsc->bo,
-                                                   target->buffer_offset);
-                                output.buffer_size_in_32_bit_words =
-                                        target->buffer_size >> 2;
-                                output.buffer_number = i;
-                        }
-#else /* V3D_VERSION < 40 */
-                        cl_emit(&job->bcl, TRANSFORM_FEEDBACK_OUTPUT_ADDRESS, output) {
-                                if (target) {
-                                        output.address =
-                                                cl_address(rsc->bo,
-                                                           target->buffer_offset);
-                                }
-                        };
-#endif /* V3D_VERSION < 40 */
-                        if (target) {
-                                vc5_job_add_write_resource(vc5->job,
-                                                           target->buffer);
-                        }
-                        /* XXX: buffer_size? */
-                }
-        }
-
-        if (vc5->dirty & VC5_DIRTY_OQ) {
-                cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
-                        job->oq_enabled = vc5->active_queries && vc5->current_oq;
-                        if (job->oq_enabled) {
-                                counter.address = cl_address(vc5->current_oq, 0);
-                        }
-                }
-        }
-}
diff --git a/src/gallium/drivers/vc5/vc5_fence.c b/src/gallium/drivers/vc5/vc5_fence.c
deleted file mode 100644
index 731dd6d..0000000
--- a/src/gallium/drivers/vc5/vc5_fence.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright © 2014 Broadcom
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/** @file vc5_fence.c
- *
- * Seqno-based fence management.
- *
- * We have two mechanisms for waiting in our kernel API: You can wait on a BO
- * to have all rendering to from any process to be completed, or wait on a
- * seqno for that particular seqno to be passed.  The fence API we're
- * implementing is based on waiting for all rendering in the context to have
- * completed (with no reference to what other processes might be doing with
- * the same BOs), so we can just use the seqno of the last rendering we'd
- * fired off as our fence marker.
- */
-
-#include "util/u_inlines.h"
-
-#include "vc5_context.h"
-#include "vc5_bufmgr.h"
-
-struct vc5_fence {
-        struct pipe_reference reference;
-        uint32_t sync;
-};
-
-static void
-vc5_fence_reference(struct pipe_screen *pscreen,
-                    struct pipe_fence_handle **pp,
-                    struct pipe_fence_handle *pf)
-{
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_fence **p = (struct vc5_fence **)pp;
-        struct vc5_fence *f = (struct vc5_fence *)pf;
-        struct vc5_fence *old = *p;
-
-        if (pipe_reference(&(*p)->reference, &f->reference)) {
-                drmSyncobjDestroy(screen->fd, old->sync);
-                free(old);
-        }
-        *p = f;
-}
-
-static boolean
-vc5_fence_finish(struct pipe_screen *pscreen,
-		 struct pipe_context *ctx,
-                 struct pipe_fence_handle *pf,
-                 uint64_t timeout_ns)
-{
-        struct vc5_screen *screen = vc5_screen(pscreen);
-        struct vc5_fence *f = (struct vc5_fence *)pf;
-
-        return drmSyncobjWait(screen->fd, &f->sync, 1, timeout_ns, 0, NULL);
-}
-
-struct vc5_fence *
-vc5_fence_create(struct vc5_context *vc5)
-{
-        struct vc5_fence *f = calloc(1, sizeof(*f));
-        if (!f)
-                return NULL;
-
-        uint32_t new_sync;
-        /* Make a new sync object for the context. */
-        int ret = drmSyncobjCreate(vc5->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
-                                   &new_sync);
-        if (ret) {
-                free(f);
-                return NULL;
-        }
-
-        pipe_reference_init(&f->reference, 1);
-        f->sync = vc5->out_sync;
-        vc5->out_sync = new_sync;
-
-        return f;
-}
-
-void
-vc5_fence_init(struct vc5_screen *screen)
-{
-        screen->base.fence_reference = vc5_fence_reference;
-        screen->base.fence_finish = vc5_fence_finish;
-}
diff --git a/src/gallium/drivers/virgl/virgl_buffer.c b/src/gallium/drivers/virgl/virgl_buffer.c
index 2e63aeb..88a22b5 100644
--- a/src/gallium/drivers/virgl/virgl_buffer.c
+++ b/src/gallium/drivers/virgl/virgl_buffer.c
@@ -164,6 +164,9 @@
    vbind = pipe_to_virgl_bind(template->bind);
    size = template->width0;
 
+   /* SSBOs and texture buffers can be written to by host compute shaders. */
+   if (vbind == VIRGL_BIND_SHADER_BUFFER || vbind == VIRGL_BIND_SAMPLER_VIEW)
+      buf->base.clean = FALSE;
    buf->base.hw_res = vs->vws->resource_create(vs->vws, template->target, template->format, vbind, template->width0, 1, 1, 1, 0, 0, size);
 
    util_range_set_empty(&buf->valid_buffer_range);
diff --git a/src/gallium/drivers/virgl/virgl_context.c b/src/gallium/drivers/virgl/virgl_context.c
index 8d701bb..d1a1c98 100644
--- a/src/gallium/drivers/virgl/virgl_context.c
+++ b/src/gallium/drivers/virgl/virgl_context.c
@@ -168,6 +168,34 @@
    }
 }
 
+static void virgl_attach_res_shader_buffers(struct virgl_context *vctx,
+                                            enum pipe_shader_type shader_type)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+   unsigned i;
+   for (i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
+      res = virgl_resource(vctx->ssbos[shader_type][i]);
+      if (res) {
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+      }
+   }
+}
+
+static void virgl_attach_res_shader_images(struct virgl_context *vctx,
+                                           enum pipe_shader_type shader_type)
+{
+   struct virgl_winsys *vws = virgl_screen(vctx->base.screen)->vws;
+   struct virgl_resource *res;
+   unsigned i;
+   for (i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
+      res = virgl_resource(vctx->images[shader_type][i]);
+      if (res) {
+         vws->emit_res(vws, vctx->cbuf, res->hw_res, FALSE);
+      }
+   }
+}
+
 /*
  * after flushing, the hw context still has a bunch of
  * resources bound, so we need to rebind those here.
@@ -183,6 +211,8 @@
    for (shader_type = 0; shader_type < PIPE_SHADER_TYPES; shader_type++) {
       virgl_attach_res_sampler_views(vctx, shader_type);
       virgl_attach_res_uniform_buffers(vctx, shader_type);
+      virgl_attach_res_shader_buffers(vctx, shader_type);
+      virgl_attach_res_shader_images(vctx, shader_type);
    }
    virgl_attach_res_vertex_buffers(vctx);
    virgl_attach_res_so_targets(vctx);
@@ -476,7 +506,7 @@
    handle = virgl_object_assign_handle();
    /* encode VS state */
    ret = virgl_encode_shader_state(vctx, handle, type,
-                                   &shader->stream_output,
+                                   &shader->stream_output, 0,
                                    new_tokens);
    if (ret) {
       return NULL;
@@ -492,6 +522,18 @@
    return virgl_shader_encoder(ctx, shader, PIPE_SHADER_VERTEX);
 }
 
+static void *virgl_create_tcs_state(struct pipe_context *ctx,
+                                   const struct pipe_shader_state *shader)
+{
+   return virgl_shader_encoder(ctx, shader, PIPE_SHADER_TESS_CTRL);
+}
+
+static void *virgl_create_tes_state(struct pipe_context *ctx,
+                                   const struct pipe_shader_state *shader)
+{
+   return virgl_shader_encoder(ctx, shader, PIPE_SHADER_TESS_EVAL);
+}
+
 static void *virgl_create_gs_state(struct pipe_context *ctx,
                                    const struct pipe_shader_state *shader)
 {
@@ -534,6 +576,26 @@
    virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
 }
 
+static void
+virgl_delete_tcs_state(struct pipe_context *ctx,
+                       void *tcs)
+{
+   uint32_t handle = (unsigned long)tcs;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
+static void
+virgl_delete_tes_state(struct pipe_context *ctx,
+                      void *tes)
+{
+   uint32_t handle = (unsigned long)tes;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
 static void virgl_bind_vs_state(struct pipe_context *ctx,
                                         void *vss)
 {
@@ -543,6 +605,24 @@
    virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_VERTEX);
 }
 
+static void virgl_bind_tcs_state(struct pipe_context *ctx,
+                               void *vss)
+{
+   uint32_t handle = (unsigned long)vss;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_TESS_CTRL);
+}
+
+static void virgl_bind_tes_state(struct pipe_context *ctx,
+                               void *vss)
+{
+   uint32_t handle = (unsigned long)vss;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_TESS_EVAL);
+}
+
 static void virgl_bind_gs_state(struct pipe_context *ctx,
                                void *vss)
 {
@@ -724,6 +804,12 @@
    virgl_attach_res_sampler_views(vctx, shader_type);
 }
 
+static void
+virgl_texture_barrier(struct pipe_context *pctx, unsigned flags)
+{
+   /* stub */
+}
+
 static void virgl_destroy_sampler_view(struct pipe_context *ctx,
                                  struct pipe_sampler_view *view)
 {
@@ -794,6 +880,17 @@
    virgl_encoder_set_sample_mask(vctx, sample_mask);
 }
 
+static void virgl_set_min_samples(struct pipe_context *ctx,
+                                 unsigned min_samples)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   if (!(rs->caps.caps.v2.capability_bits & VIRGL_CAP_SET_MIN_SAMPLES))
+      return;
+   virgl_encoder_set_min_samples(vctx, min_samples);
+}
+
 static void virgl_set_clip_state(struct pipe_context *ctx,
                                 const struct pipe_clip_state *clip)
 {
@@ -801,6 +898,18 @@
    virgl_encoder_set_clip_state(vctx, clip);
 }
 
+static void virgl_set_tess_state(struct pipe_context *ctx,
+                                 const float default_outer_level[4],
+                                 const float default_inner_level[2])
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   if (!rs->caps.caps.v1.bset.has_tessellation_shaders)
+      return;
+   virgl_encode_set_tess_state(vctx, default_outer_level, default_inner_level);
+}
+
 static void virgl_resource_copy_region(struct pipe_context *ctx,
                                       struct pipe_resource *dst,
                                       unsigned dst_level,
@@ -838,6 +947,117 @@
                     blit);
 }
 
+static void virgl_set_shader_buffers(struct pipe_context *ctx,
+                                     enum pipe_shader_type shader,
+                                     unsigned start_slot, unsigned count,
+                                     const struct pipe_shader_buffer *buffers)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   for (unsigned i = 0; i < count; i++) {
+      unsigned idx = start_slot + i;
+
+      if (buffers) {
+         if (buffers[i].buffer) {
+            pipe_resource_reference(&vctx->ssbos[shader][idx], buffers[i].buffer);
+            continue;
+         }
+      }
+      pipe_resource_reference(&vctx->ssbos[shader][idx], NULL);
+   }
+
+   uint32_t max_shader_buffer = (shader == PIPE_SHADER_FRAGMENT || shader == PIPE_SHADER_COMPUTE) ?
+      rs->caps.caps.v2.max_shader_buffer_frag_compute :
+      rs->caps.caps.v2.max_shader_buffer_other_stages;
+   if (!max_shader_buffer)
+      return;
+   virgl_encode_set_shader_buffers(vctx, shader, start_slot, count, buffers);
+}
+
+static void virgl_set_shader_images(struct pipe_context *ctx,
+                                    enum pipe_shader_type shader,
+                                    unsigned start_slot, unsigned count,
+                                    const struct pipe_image_view *images)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   for (unsigned i = 0; i < count; i++) {
+      unsigned idx = start_slot + i;
+
+      if (images) {
+         if (images[i].resource) {
+            pipe_resource_reference(&vctx->images[shader][idx], images[i].resource);
+            continue;
+         }
+      }
+      pipe_resource_reference(&vctx->images[shader][idx], NULL);
+   }
+
+   uint32_t max_shader_images = (shader == PIPE_SHADER_FRAGMENT || shader == PIPE_SHADER_COMPUTE) ?
+     rs->caps.caps.v2.max_shader_image_frag_compute :
+     rs->caps.caps.v2.max_shader_image_other_stages;
+   if (!max_shader_images)
+      return;
+   virgl_encode_set_shader_images(vctx, shader, start_slot, count, images);
+}
+
+static void virgl_memory_barrier(struct pipe_context *ctx,
+                                 unsigned flags)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *rs = virgl_screen(ctx->screen);
+
+   if (!(rs->caps.caps.v2.capability_bits & VIRGL_CAP_MEMORY_BARRIER))
+      return;
+   virgl_encode_memory_barrier(vctx, flags);
+}
+
+static void *virgl_create_compute_state(struct pipe_context *ctx,
+                                        const struct pipe_compute_state *state)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   uint32_t handle;
+   const struct tgsi_token *new_tokens = state->prog;
+   struct pipe_stream_output_info so_info = {};
+   int ret;
+
+   handle = virgl_object_assign_handle();
+   ret = virgl_encode_shader_state(vctx, handle, PIPE_SHADER_COMPUTE,
+                                   &so_info,
+                                   state->req_local_mem,
+                                   new_tokens);
+   if (ret) {
+      return NULL;
+   }
+
+   return (void *)(unsigned long)handle;
+}
+
+static void virgl_bind_compute_state(struct pipe_context *ctx, void *state)
+{
+   uint32_t handle = (unsigned long)state;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_bind_shader(vctx, handle, PIPE_SHADER_COMPUTE);
+}
+
+static void virgl_delete_compute_state(struct pipe_context *ctx, void *state)
+{
+   uint32_t handle = (unsigned long)state;
+   struct virgl_context *vctx = virgl_context(ctx);
+
+   virgl_encode_delete_object(vctx, handle, VIRGL_OBJECT_SHADER);
+}
+
+static void virgl_launch_grid(struct pipe_context *ctx,
+                              const struct pipe_grid_info *info)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   virgl_encode_launch_grid(vctx, info);
+}
+
 static void
 virgl_context_destroy( struct pipe_context *ctx )
 {
@@ -858,6 +1078,42 @@
    FREE(vctx);
 }
 
+static void virgl_get_sample_position(struct pipe_context *ctx,
+				      unsigned sample_count,
+				      unsigned index,
+				      float *out_value)
+{
+   struct virgl_context *vctx = virgl_context(ctx);
+   struct virgl_screen *vs = virgl_screen(vctx->base.screen);
+
+   if (sample_count > vs->caps.caps.v1.max_samples) {
+      debug_printf("VIRGL: requested %d MSAA samples, but only %d supported\n",
+		   sample_count, vs->caps.caps.v1.max_samples);
+      return;
+   }
+
+   /* The following is basically copied from gen6_get_sample_position() in
+    * dri/i965. The only addition is that we hold the msaa positions for all
+    * sample counts in a flat array. */
+   uint32_t bits = 0;
+   if (sample_count == 1) {
+      out_value[0] = out_value[1] = 0.5f;
+      return;
+   } else if (sample_count == 2) {
+      bits = vs->caps.caps.v2.msaa_sample_positions[0] >> (8 * index);
+   } else if (sample_count <= 4) {
+      bits = vs->caps.caps.v2.msaa_sample_positions[1] >> (8 * index);
+   } else if (sample_count <= 8) {
+      bits = vs->caps.caps.v2.msaa_sample_positions[2 + (index >> 2)] >> (8 * (index & 3));
+   } else if (sample_count <= 16) {
+      bits = vs->caps.caps.v2.msaa_sample_positions[4 + (index >> 2)] >> (8 * (index & 3));
+   }
+   out_value[0] = ((bits >> 4) & 0xf) / 16.0f;
+   out_value[1] = (bits & 0xf) / 16.0f;
+   debug_printf("VIRGL: sample postion [%2d/%2d] = (%f, %f)\n",
+		index, sample_count, out_value[0], out_value[1]);
+}
+
 struct pipe_context *virgl_context_create(struct pipe_screen *pscreen,
                                           void *priv,
                                           unsigned flags)
@@ -893,18 +1149,30 @@
    vctx->base.set_vertex_buffers = virgl_set_vertex_buffers;
    vctx->base.set_constant_buffer = virgl_set_constant_buffer;
 
+   vctx->base.set_tess_state = virgl_set_tess_state;
    vctx->base.create_vs_state = virgl_create_vs_state;
+   vctx->base.create_tcs_state = virgl_create_tcs_state;
+   vctx->base.create_tes_state = virgl_create_tes_state;
    vctx->base.create_gs_state = virgl_create_gs_state;
    vctx->base.create_fs_state = virgl_create_fs_state;
 
    vctx->base.bind_vs_state = virgl_bind_vs_state;
+   vctx->base.bind_tcs_state = virgl_bind_tcs_state;
+   vctx->base.bind_tes_state = virgl_bind_tes_state;
    vctx->base.bind_gs_state = virgl_bind_gs_state;
    vctx->base.bind_fs_state = virgl_bind_fs_state;
 
    vctx->base.delete_vs_state = virgl_delete_vs_state;
+   vctx->base.delete_tcs_state = virgl_delete_tcs_state;
+   vctx->base.delete_tes_state = virgl_delete_tes_state;
    vctx->base.delete_gs_state = virgl_delete_gs_state;
    vctx->base.delete_fs_state = virgl_delete_fs_state;
 
+   vctx->base.create_compute_state = virgl_create_compute_state;
+   vctx->base.bind_compute_state = virgl_bind_compute_state;
+   vctx->base.delete_compute_state = virgl_delete_compute_state;
+   vctx->base.launch_grid = virgl_launch_grid;
+
    vctx->base.clear = virgl_clear;
    vctx->base.draw_vbo = virgl_draw_vbo;
    vctx->base.flush = virgl_flush_from_st;
@@ -912,6 +1180,7 @@
    vctx->base.create_sampler_view = virgl_create_sampler_view;
    vctx->base.sampler_view_destroy = virgl_destroy_sampler_view;
    vctx->base.set_sampler_views = virgl_set_sampler_views;
+   vctx->base.texture_barrier = virgl_texture_barrier;
 
    vctx->base.create_sampler_state = virgl_create_sampler_state;
    vctx->base.delete_sampler_state = virgl_delete_sampler_state;
@@ -920,15 +1189,22 @@
    vctx->base.set_polygon_stipple = virgl_set_polygon_stipple;
    vctx->base.set_scissor_states = virgl_set_scissor_states;
    vctx->base.set_sample_mask = virgl_set_sample_mask;
+   vctx->base.set_min_samples = virgl_set_min_samples;
    vctx->base.set_stencil_ref = virgl_set_stencil_ref;
    vctx->base.set_clip_state = virgl_set_clip_state;
 
    vctx->base.set_blend_color = virgl_set_blend_color;
 
+   vctx->base.get_sample_position = virgl_get_sample_position;
+
    vctx->base.resource_copy_region = virgl_resource_copy_region;
    vctx->base.flush_resource = virgl_flush_resource;
    vctx->base.blit =  virgl_blit;
 
+   vctx->base.set_shader_buffers = virgl_set_shader_buffers;
+   vctx->base.set_shader_images = virgl_set_shader_images;
+   vctx->base.memory_barrier = virgl_memory_barrier;
+
    virgl_init_context_resource_functions(&vctx->base);
    virgl_init_query_functions(vctx);
    virgl_init_so_functions(vctx);
diff --git a/src/gallium/drivers/virgl/virgl_context.h b/src/gallium/drivers/virgl/virgl_context.h
index 3492dcf..38d1f45 100644
--- a/src/gallium/drivers/virgl/virgl_context.h
+++ b/src/gallium/drivers/virgl/virgl_context.h
@@ -68,6 +68,9 @@
    unsigned num_so_targets;
 
    struct pipe_resource *ubos[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
+
+   struct pipe_resource *ssbos[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
+   struct pipe_resource *images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
    int num_transfers;
    int num_draws;
    struct list_head to_flush_bufs;
diff --git a/src/gallium/drivers/virgl/virgl_encode.c b/src/gallium/drivers/virgl/virgl_encode.c
index c038ac7..9aece38 100644
--- a/src/gallium/drivers/virgl/virgl_encode.c
+++ b/src/gallium/drivers/virgl/virgl_encode.c
@@ -232,7 +232,7 @@
            VIRGL_OBJ_SHADER_SO_OUTPUT_BUFFER(so_info->output[i].output_buffer) |
            VIRGL_OBJ_SHADER_SO_OUTPUT_DST_OFFSET(so_info->output[i].dst_offset);
          virgl_encoder_write_dword(ctx->cbuf, tmp);
-         virgl_encoder_write_dword(ctx->cbuf, 0);
+         virgl_encoder_write_dword(ctx->cbuf, so_info->output[i].stream);
       }
    }
 }
@@ -241,6 +241,7 @@
                               uint32_t handle,
                               uint32_t type,
                               const struct pipe_stream_output_info *so_info,
+                              uint32_t cs_req_local_mem,
                               const struct tgsi_token *tokens)
 {
    char *str, *sptr;
@@ -298,7 +299,10 @@
 
       virgl_emit_shader_header(ctx, handle, len, type, offlen, num_tokens);
 
-      virgl_emit_shader_streamout(ctx, first_pass ? so_info : NULL);
+      if (type == PIPE_SHADER_COMPUTE)
+         virgl_encoder_write_dword(ctx->cbuf, cs_req_local_mem);
+      else
+         virgl_emit_shader_streamout(ctx, first_pass ? so_info : NULL);
 
       virgl_encoder_write_block(ctx->cbuf, (uint8_t *)sptr, length);
 
@@ -346,6 +350,12 @@
       virgl_encoder_write_dword(ctx->cbuf, surf ? surf->handle : 0);
    }
 
+   struct virgl_screen *rs = virgl_screen(ctx->base.screen);
+   if (rs->caps.caps.v2.capability_bits & VIRGL_CAP_FB_NO_ATTACH) {
+      virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_FRAMEBUFFER_STATE_NO_ATTACH, 0, VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_SIZE));
+      virgl_encoder_write_dword(ctx->cbuf, state->width | (state->height << 16));
+      virgl_encoder_write_dword(ctx->cbuf, state->layers | (state->samples << 16));
+   }
    return 0;
 }
 
@@ -419,6 +429,8 @@
                           const struct pipe_draw_info *info)
 {
    uint32_t length = VIRGL_DRAW_VBO_SIZE;
+   if (info->mode == PIPE_PRIM_PATCHES)
+      length = VIRGL_DRAW_VBO_SIZE_TESS;
    if (info->indirect)
       length = VIRGL_DRAW_VBO_SIZE_INDIRECT;
    virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_DRAW_VBO, 0, length));
@@ -437,9 +449,11 @@
       virgl_encoder_write_dword(ctx->cbuf, info->count_from_stream_output->buffer_size);
    else
       virgl_encoder_write_dword(ctx->cbuf, 0);
+   if (length >= VIRGL_DRAW_VBO_SIZE_TESS) {
+      virgl_encoder_write_dword(ctx->cbuf, info->vertices_per_patch); /* vertices per patch */
+      virgl_encoder_write_dword(ctx->cbuf, info->drawid); /* drawid */
+   }
    if (length == VIRGL_DRAW_VBO_SIZE_INDIRECT) {
-      virgl_encoder_write_dword(ctx->cbuf, 0); /* vertices per patch */
-      virgl_encoder_write_dword(ctx->cbuf, 0); /* drawid */
       virgl_encoder_write_res(ctx, virgl_resource(info->indirect->buffer));
       virgl_encoder_write_dword(ctx->cbuf, info->indirect->offset);
       virgl_encoder_write_dword(ctx->cbuf, 0); /* indirect stride */
@@ -585,12 +599,15 @@
                              const struct pipe_sampler_view *state)
 {
    unsigned elem_size = util_format_get_blocksize(state->format);
-
+   struct virgl_screen *rs = virgl_screen(ctx->base.screen);
    uint32_t tmp;
+   uint32_t dword_fmt_target = state->format;
    virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_CREATE_OBJECT, VIRGL_OBJECT_SAMPLER_VIEW, VIRGL_OBJ_SAMPLER_VIEW_SIZE));
    virgl_encoder_write_dword(ctx->cbuf, handle);
    virgl_encoder_write_res(ctx, res);
-   virgl_encoder_write_dword(ctx->cbuf, state->format);
+   if (rs->caps.caps.v2.capability_bits & VIRGL_CAP_TEXTURE_VIEW)
+     dword_fmt_target |= (state->target << 24);
+   virgl_encoder_write_dword(ctx->cbuf, dword_fmt_target);
    if (res->u.b.target == PIPE_BUFFER) {
       virgl_encoder_write_dword(ctx->cbuf, state->u.buf.offset / elem_size);
       virgl_encoder_write_dword(ctx->cbuf, (state->u.buf.offset + state->u.buf.size) / elem_size - 1);
@@ -719,6 +736,13 @@
    virgl_encoder_write_dword(ctx->cbuf, sample_mask);
 }
 
+/* Emit VIRGL_CCMD_SET_MIN_SAMPLES with min_samples as its single payload dword. */
+void virgl_encoder_set_min_samples(struct virgl_context *ctx, unsigned min_samples)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_MIN_SAMPLES, 0, VIRGL_SET_MIN_SAMPLES_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, min_samples);
+}
+
 void virgl_encoder_set_clip_state(struct virgl_context *ctx,
                                  const struct pipe_clip_state *clip)
 {
@@ -891,3 +915,97 @@
    virgl_encoder_write_dword(ctx->cbuf, type);
    return 0;
 }
+
+/* Emit VIRGL_CCMD_SET_TESS_STATE: outer[0..3] then inner[0..1], each via fui(). */
+int virgl_encode_set_tess_state(struct virgl_context *ctx,
+                                const float outer[4], const float inner[2])
+{
+   int i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_TESS_STATE, 0, VIRGL_TESS_STATE_SIZE));
+   for (i = 0; i < 4; i++)
+      virgl_encoder_write_dword(ctx->cbuf, fui(outer[i]));
+   for (i = 0; i < 2; i++)
+      virgl_encoder_write_dword(ctx->cbuf, fui(inner[i]));
+   return 0;
+}
+
+/* Emit VIRGL_CCMD_SET_SHADER_BUFFERS for `count` slots starting at start_slot.
+ * Each slot is sent as offset, size, resource; NULL `buffers` sends zeroed slots. */
+int virgl_encode_set_shader_buffers(struct virgl_context *ctx,
+                                    enum pipe_shader_type shader,
+                                    unsigned start_slot, unsigned count,
+                                    const struct pipe_shader_buffer *buffers)
+{
+   unsigned i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SHADER_BUFFERS, 0, VIRGL_SET_SHADER_BUFFER_SIZE(count)));
+
+   virgl_encoder_write_dword(ctx->cbuf, shader);
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (i = 0; i < count; i++) {
+      /* buf stays NULL when the caller passed no array at all */
+      const struct pipe_shader_buffer *buf = buffers ? &buffers[i] : NULL;
+      virgl_encoder_write_dword(ctx->cbuf, buf ? buf->buffer_offset : 0);
+      virgl_encoder_write_dword(ctx->cbuf, buf ? buf->buffer_size : 0);
+      if (buf)
+         virgl_encoder_write_res(ctx, virgl_resource(buf->buffer));
+      else
+         virgl_encoder_write_dword(ctx->cbuf, 0);
+   }
+   return 0;
+}
+
+/* Emit VIRGL_CCMD_SET_SHADER_IMAGES for `count` slots starting at start_slot.
+ * Each slot is sent as format, access, two view dwords and the resource; a
+ * NULL `images` sends zeroed slots.  Always returns 0. */
+int virgl_encode_set_shader_images(struct virgl_context *ctx,
+                                   enum pipe_shader_type shader,
+                                   unsigned start_slot, unsigned count,
+                                   const struct pipe_image_view *images)
+{
+   unsigned i;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_SET_SHADER_IMAGES, 0, VIRGL_SET_SHADER_IMAGE_SIZE(count)));
+
+   virgl_encoder_write_dword(ctx->cbuf, shader);
+   virgl_encoder_write_dword(ctx->cbuf, start_slot);
+   for (i = 0; i < count; i++) {
+      /* u.buf.offset/u.buf.size fill the dwords that virgl_protocol.h names
+       * LAYER_OFFSET and LEVEL_SIZE, whatever the resource is. */
+      const struct pipe_image_view *view = images ? &images[i] : NULL;
+      virgl_encoder_write_dword(ctx->cbuf, view ? view->format : 0);
+      virgl_encoder_write_dword(ctx->cbuf, view ? view->access : 0);
+      virgl_encoder_write_dword(ctx->cbuf, view ? view->u.buf.offset : 0);
+      virgl_encoder_write_dword(ctx->cbuf, view ? view->u.buf.size : 0);
+      if (view)
+         virgl_encoder_write_res(ctx, virgl_resource(view->resource));
+      else
+         virgl_encoder_write_dword(ctx->cbuf, 0);
+   }
+   return 0;
+}
+
+/* Emit VIRGL_CCMD_MEMORY_BARRIER with `flags` as its only payload dword. */
+int virgl_encode_memory_barrier(struct virgl_context *ctx, unsigned flags)
+{
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_MEMORY_BARRIER, 0, VIRGL_MEMORY_BARRIER_SIZE));
+   virgl_encoder_write_dword(ctx->cbuf, flags);
+   return 0;
+}
+
+/* Emit VIRGL_CCMD_LAUNCH_GRID: block[0..2], grid[0..2], then the indirect
+ * buffer (or 0 when there is none) and indirect_offset.  Always returns 0. */
+int virgl_encode_launch_grid(struct virgl_context *ctx,
+                             const struct pipe_grid_info *grid_info)
+{
+   unsigned axis;
+   virgl_encoder_write_cmd_dword(ctx, VIRGL_CMD0(VIRGL_CCMD_LAUNCH_GRID, 0, VIRGL_LAUNCH_GRID_SIZE));
+   for (axis = 0; axis < 3; axis++)
+      virgl_encoder_write_dword(ctx->cbuf, grid_info->block[axis]);
+   for (axis = 0; axis < 3; axis++)
+      virgl_encoder_write_dword(ctx->cbuf, grid_info->grid[axis]);
+   if (grid_info->indirect)
+      virgl_encoder_write_res(ctx, virgl_resource(grid_info->indirect));
+   else
+      virgl_encoder_write_dword(ctx->cbuf, 0);
+   virgl_encoder_write_dword(ctx->cbuf, grid_info->indirect_offset);
+   return 0;
+}
diff --git a/src/gallium/drivers/virgl/virgl_encode.h b/src/gallium/drivers/virgl/virgl_encode.h
index 02c032d..c982eb9 100644
--- a/src/gallium/drivers/virgl/virgl_encode.h
+++ b/src/gallium/drivers/virgl/virgl_encode.h
@@ -90,6 +90,7 @@
                                      uint32_t handle,
                                      uint32_t type,
                                      const struct pipe_stream_output_info *so_info,
+                                     uint32_t cs_req_local_mem,
                                      const struct tgsi_token *tokens);
 
 int virgl_encode_stream_output_info(struct virgl_context *ctx,
@@ -211,6 +212,9 @@
 void virgl_encoder_set_sample_mask(struct virgl_context *ctx,
                                   unsigned sample_mask);
 
+void virgl_encoder_set_min_samples(struct virgl_context *ctx,
+                                  unsigned min_samples);
+
 void virgl_encoder_set_clip_state(struct virgl_context *ctx,
                                  const struct pipe_clip_state *clip);
 
@@ -251,4 +255,21 @@
 
 int virgl_encode_bind_shader(struct virgl_context *ctx,
                              uint32_t handle, uint32_t type);
+
+int virgl_encode_set_tess_state(struct virgl_context *ctx,
+                                const float outer[4],
+                                const float inner[2]);
+
+int virgl_encode_set_shader_buffers(struct virgl_context *ctx,
+                                    enum pipe_shader_type shader,
+                                    unsigned start_slot, unsigned count,
+                                    const struct pipe_shader_buffer *buffers);
+int virgl_encode_set_shader_images(struct virgl_context *ctx,
+                                   enum pipe_shader_type shader,
+                                   unsigned start_slot, unsigned count,
+                                   const struct pipe_image_view *images);
+int virgl_encode_memory_barrier(struct virgl_context *ctx,
+                                unsigned flags);
+int virgl_encode_launch_grid(struct virgl_context *ctx,
+                             const struct pipe_grid_info *grid_info);
 #endif
diff --git a/src/gallium/drivers/virgl/virgl_hw.h b/src/gallium/drivers/virgl/virgl_hw.h
index 93849c0..02cedcd 100644
--- a/src/gallium/drivers/virgl/virgl_hw.h
+++ b/src/gallium/drivers/virgl/virgl_hw.h
@@ -83,6 +83,7 @@
    VIRGL_FORMAT_L8A8_SRGB               = 96,
    VIRGL_FORMAT_B8G8R8A8_SRGB           = 100,
    VIRGL_FORMAT_B8G8R8X8_SRGB           = 101,
+   VIRGL_FORMAT_R8G8B8A8_SRGB           = 104,
 
    /* compressed formats */
    VIRGL_FORMAT_DXT1_RGB                = 105,
@@ -196,6 +197,20 @@
    VIRGL_FORMAT_MAX,
 };
 
+/* These are used by the capability_bits field in virgl_caps_v2. */
+#define VIRGL_CAP_NONE 0
+#define VIRGL_CAP_TGSI_INVARIANT       (1 << 0)
+#define VIRGL_CAP_TEXTURE_VIEW         (1 << 1)
+#define VIRGL_CAP_SET_MIN_SAMPLES      (1 << 2)
+#define VIRGL_CAP_COPY_IMAGE           (1 << 3)
+#define VIRGL_CAP_TGSI_PRECISE         (1 << 4)
+#define VIRGL_CAP_TXQS                 (1 << 5)
+#define VIRGL_CAP_MEMORY_BARRIER       (1 << 6)
+#define VIRGL_CAP_COMPUTE_SHADER       (1 << 7)
+#define VIRGL_CAP_FB_NO_ATTACH         (1 << 8)
+#define VIRGL_CAP_ROBUST_BUFFER_ACCESS (1 << 9)
+#define VIRGL_CAP_TGSI_FBFETCH         (1 << 10)
+
 #define VIRGL_BIND_DEPTH_STENCIL (1 << 0)
 #define VIRGL_BIND_RENDER_TARGET (1 << 1)
 #define VIRGL_BIND_SAMPLER_VIEW  (1 << 3)
@@ -204,6 +219,7 @@
 #define VIRGL_BIND_CONSTANT_BUFFER (1 << 6)
 #define VIRGL_BIND_DISPLAY_TARGET (1 << 7)
 #define VIRGL_BIND_STREAM_OUTPUT (1 << 11)
+#define VIRGL_BIND_SHADER_BUFFER (1 << 14)
 #define VIRGL_BIND_CURSOR        (1 << 16)
 #define VIRGL_BIND_CUSTOM        (1 << 17)
 #define VIRGL_BIND_SCANOUT       (1 << 18)
@@ -237,6 +253,11 @@
         unsigned has_indirect_draw:1;
         unsigned has_sample_shading:1;
         unsigned has_cull:1;
+        unsigned conditional_render_inverted:1;
+        unsigned derivative_control:1;
+        unsigned polygon_offset_clamp:1;
+        unsigned transform_feedback_overflow_query:1;
+        /* DO NOT ADD ANYMORE MEMBERS - need to add another 32-bit to v2 caps */
 };
 
 /* endless expansion capabilites - current gallium has 252 formats */
@@ -286,6 +307,19 @@
         int32_t max_texture_gather_offset;
         uint32_t texture_buffer_offset_alignment;
         uint32_t uniform_buffer_offset_alignment;
+        uint32_t shader_buffer_offset_alignment;
+        uint32_t capability_bits;
+        uint32_t msaa_sample_positions[8];
+        uint32_t max_vertex_attrib_stride;
+        uint32_t max_shader_buffer_frag_compute;
+        uint32_t max_shader_buffer_other_stages;
+        uint32_t max_shader_image_frag_compute;
+        uint32_t max_shader_image_other_stages;
+        uint32_t max_image_samples;
+        uint32_t max_compute_work_group_invocations;
+        uint32_t max_compute_shared_memory_size;
+        uint32_t max_compute_grid_size[3];
+        uint32_t max_compute_block_size[3];
 };
 
 union virgl_caps {
diff --git a/src/gallium/drivers/virgl/virgl_protocol.h b/src/gallium/drivers/virgl/virgl_protocol.h
index 5dc2874..982bc5c 100644
--- a/src/gallium/drivers/virgl/virgl_protocol.h
+++ b/src/gallium/drivers/virgl/virgl_protocol.h
@@ -83,6 +83,14 @@
    VIRGL_CCMD_CREATE_SUB_CTX,
    VIRGL_CCMD_DESTROY_SUB_CTX,
    VIRGL_CCMD_BIND_SHADER,
+
+   VIRGL_CCMD_SET_TESS_STATE,
+   VIRGL_CCMD_SET_MIN_SAMPLES,
+   VIRGL_CCMD_SET_SHADER_BUFFERS,
+   VIRGL_CCMD_SET_SHADER_IMAGES,
+   VIRGL_CCMD_MEMORY_BARRIER,
+   VIRGL_CCMD_LAUNCH_GRID,
+   VIRGL_CCMD_SET_FRAMEBUFFER_STATE_NO_ATTACH,
 };
 
 /*
@@ -481,4 +489,54 @@
 #define VIRGL_BIND_SHADER_HANDLE 1
 #define VIRGL_BIND_SHADER_TYPE 2
 
+/* tess state */
+#define VIRGL_TESS_STATE_SIZE 6
+
+/* set min samples */
+#define VIRGL_SET_MIN_SAMPLES_SIZE 1
+#define VIRGL_SET_MIN_SAMPLES_MASK 1
+
+/* set shader buffers: 2 header dwords, then 3 dwords per buffer */
+#define VIRGL_SET_SHADER_BUFFER_ELEMENT_SIZE 3
+#define VIRGL_SET_SHADER_BUFFER_SIZE(x) ((VIRGL_SET_SHADER_BUFFER_ELEMENT_SIZE * (x)) + 2)
+#define VIRGL_SET_SHADER_BUFFER_SHADER_TYPE 1
+#define VIRGL_SET_SHADER_BUFFER_START_SLOT 2
+#define VIRGL_SET_SHADER_BUFFER_OFFSET(x) ((x) * VIRGL_SET_SHADER_BUFFER_ELEMENT_SIZE + 3)
+#define VIRGL_SET_SHADER_BUFFER_LENGTH(x) ((x) * VIRGL_SET_SHADER_BUFFER_ELEMENT_SIZE + 4)
+#define VIRGL_SET_SHADER_BUFFER_RES_HANDLE(x) ((x) * VIRGL_SET_SHADER_BUFFER_ELEMENT_SIZE + 5)
+
+/* set shader images: 2 header dwords, then 5 dwords per image */
+#define VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE 5
+#define VIRGL_SET_SHADER_IMAGE_SIZE(x) ((VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE * (x)) + 2)
+#define VIRGL_SET_SHADER_IMAGE_SHADER_TYPE 1
+#define VIRGL_SET_SHADER_IMAGE_START_SLOT 2
+#define VIRGL_SET_SHADER_IMAGE_FORMAT(x) ((x) * VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE + 3)
+#define VIRGL_SET_SHADER_IMAGE_ACCESS(x) ((x) * VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE + 4)
+#define VIRGL_SET_SHADER_IMAGE_LAYER_OFFSET(x) ((x) * VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE + 5)
+#define VIRGL_SET_SHADER_IMAGE_LEVEL_SIZE(x) ((x) * VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE + 6)
+#define VIRGL_SET_SHADER_IMAGE_RES_HANDLE(x) ((x) * VIRGL_SET_SHADER_IMAGE_ELEMENT_SIZE + 7)
+
+/* memory barrier */
+#define VIRGL_MEMORY_BARRIER_SIZE 1
+#define VIRGL_MEMORY_BARRIER_FLAGS 1
+
+#define VIRGL_LAUNCH_GRID_SIZE 8
+#define VIRGL_LAUNCH_BLOCK_X 1
+#define VIRGL_LAUNCH_BLOCK_Y 2
+#define VIRGL_LAUNCH_BLOCK_Z 3
+#define VIRGL_LAUNCH_GRID_X 4
+#define VIRGL_LAUNCH_GRID_Y 5
+#define VIRGL_LAUNCH_GRID_Z 6
+#define VIRGL_LAUNCH_INDIRECT_HANDLE 7
+#define VIRGL_LAUNCH_INDIRECT_OFFSET 8
+
+/* framebuffer state no attachment: two dwords, each packing two fields */
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_SIZE 2
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_WIDTH_HEIGHT 1
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_WIDTH(x) ((x) & 0xffff)
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_HEIGHT(x) (((x) >> 16) & 0xffff)
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_LAYERS_SAMPLES 2
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_LAYERS(x) ((x) & 0xffff)
+#define VIRGL_SET_FRAMEBUFFER_STATE_NO_ATTACH_SAMPLES(x) (((x) >> 16) & 0xff)
+
 #endif
diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c
index 3a930d2..a75b29c 100644
--- a/src/gallium/drivers/virgl/virgl_query.c
+++ b/src/gallium/drivers/virgl/virgl_query.c
@@ -48,12 +48,14 @@
 #define VIRGL_QUERY_SO_OVERFLOW_PREDICATE 8
 #define VIRGL_QUERY_GPU_FINISHED          9
 #define VIRGL_QUERY_PIPELINE_STATISTICS  10
+#define VIRGL_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE 11
+#define VIRGL_QUERY_SO_OVERFLOW_ANY_PREDICATE 12
 
 static const int pquery_map[] =
 {
    VIRGL_QUERY_OCCLUSION_COUNTER,
    VIRGL_QUERY_OCCLUSION_PREDICATE,
-   -1,
+   VIRGL_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE,
    VIRGL_QUERY_TIMESTAMP,
    VIRGL_QUERY_TIMESTAMP_DISJOINT,
    VIRGL_QUERY_TIME_ELAPSED,
@@ -61,7 +63,7 @@
    VIRGL_QUERY_PRIMITIVES_EMITTED,
    VIRGL_QUERY_SO_STATISTICS,
    VIRGL_QUERY_SO_OVERFLOW_PREDICATE,
-   -1,
+   VIRGL_QUERY_SO_OVERFLOW_ANY_PREDICATE,
    VIRGL_QUERY_GPU_FINISHED,
    VIRGL_QUERY_PIPELINE_STATISTICS,
 };
diff --git a/src/gallium/drivers/virgl/virgl_resource.h b/src/gallium/drivers/virgl/virgl_resource.h
index bab9bcb..297bc72 100644
--- a/src/gallium/drivers/virgl/virgl_resource.h
+++ b/src/gallium/drivers/virgl/virgl_resource.h
@@ -134,6 +134,8 @@
       outbind |= VIRGL_BIND_CUSTOM;
    if (pbind & PIPE_BIND_SCANOUT)
       outbind |= VIRGL_BIND_SCANOUT;
+   if (pbind & PIPE_BIND_SHADER_BUFFER)
+      outbind |= VIRGL_BIND_SHADER_BUFFER;
    return outbind;
 }
 
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 02613f1..e17d257 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -88,9 +88,10 @@
    case PIPE_CAP_INDEP_BLEND_FUNC:
       return vscreen->caps.caps.v1.bset.indep_blend_func;
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+      return 1;
+   case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
       return vscreen->caps.caps.v1.bset.fragment_coord_conventions;
    case PIPE_CAP_DEPTH_CLIP_DISABLE:
       return vscreen->caps.caps.v1.bset.depth_clip_disable;
@@ -133,10 +134,12 @@
       return 1;
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
       return vscreen->caps.caps.v1.glsl_level;
+   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
+      return MIN2(vscreen->caps.caps.v1.glsl_level, 140);
    case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
       return 0;
    case PIPE_CAP_COMPUTE:
-      return 0;
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_COMPUTE_SHADER;
    case PIPE_CAP_USER_VERTEX_BUFFERS:
       return 0;
    case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT:
@@ -198,49 +201,67 @@
       return vscreen->caps.caps.v1.bset.has_sample_shading;
    case PIPE_CAP_CULL_DISTANCE:
       return vscreen->caps.caps.v1.bset.has_cull;
+   case PIPE_CAP_MAX_VERTEX_STREAMS:
+      return vscreen->caps.caps.v1.glsl_level >= 400 ? 4 : 1;
+   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+      return vscreen->caps.caps.v1.bset.conditional_render_inverted;
+   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+      return vscreen->caps.caps.v1.bset.derivative_control;
+   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+      return vscreen->caps.caps.v1.bset.polygon_offset_clamp;
+   case PIPE_CAP_QUERY_SO_OVERFLOW:
+      return vscreen->caps.caps.v1.bset.transform_feedback_overflow_query;
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+      return vscreen->caps.caps.v2.shader_buffer_offset_alignment;
+   case PIPE_CAP_DOUBLES:
+      return vscreen->caps.caps.v1.bset.has_fp64;
+   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+      return vscreen->caps.caps.v2.max_shader_patch_varyings;
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_TEXTURE_VIEW;
+   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
+      return vscreen->caps.caps.v2.max_vertex_attrib_stride;
+   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_COPY_IMAGE;
+   case PIPE_CAP_TGSI_TXQS:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_TXQS;
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_FB_NO_ATTACH;
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_ROBUST_BUFFER_ACCESS;
+   case PIPE_CAP_TGSI_FS_FBFETCH:
+      return vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_TGSI_FBFETCH;
    case PIPE_CAP_TEXTURE_GATHER_SM5:
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
-   case PIPE_CAP_MAX_VERTEX_STREAMS:
    case PIPE_CAP_MULTI_DRAW_INDIRECT:
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
-   case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-   case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
-   case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
-   case PIPE_CAP_SAMPLER_VIEW_TARGET:
    case PIPE_CAP_CLIP_HALFZ:
    case PIPE_CAP_VERTEXID_NOBASE:
-   case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
    case PIPE_CAP_DEPTH_BOUNDS_TEST:
-   case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_DRAW_PARAMETERS:
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
-   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
    case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
    case PIPE_CAP_QUERY_BUFFER_OBJECT:
-   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_STRING_MARKER:
    case PIPE_CAP_QUERY_MEMORY_INFO:
    case PIPE_CAP_PCI_GROUP:
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
-   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
-   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
    case PIPE_CAP_PRIMITIVE_RESTART_FOR_PATCHES:
    case PIPE_CAP_TGSI_VOTE:
    case PIPE_CAP_MAX_WINDOW_RECTANGLES:
@@ -249,7 +270,6 @@
    case PIPE_CAP_TGSI_ARRAY_COMPONENTS:
    case PIPE_CAP_TGSI_CAN_READ_OUTPUTS:
    case PIPE_CAP_GLSL_OPTIMIZE_CONSERVATIVELY:
-   case PIPE_CAP_TGSI_FS_FBFETCH:
    case PIPE_CAP_TGSI_MUL_ZERO_WINS:
    case PIPE_CAP_INT64:
    case PIPE_CAP_INT64_DIVMOD:
@@ -258,24 +278,30 @@
    case PIPE_CAP_POLYGON_MODE_FILL_RECTANGLE:
    case PIPE_CAP_SPARSE_BUFFER_PAGE_SIZE:
    case PIPE_CAP_TGSI_BALLOT:
-   case PIPE_CAP_DOUBLES:
    case PIPE_CAP_TGSI_TES_LAYER_VIEWPORT:
    case PIPE_CAP_CAN_BIND_CONST_BUFFER_AS_VERTEX:
    case PIPE_CAP_ALLOW_MAPPED_BUFFERS_DURING_EXECUTION:
    case PIPE_CAP_POST_DEPTH_COVERAGE:
    case PIPE_CAP_BINDLESS_TEXTURE:
    case PIPE_CAP_NIR_SAMPLERS_AS_DEREF:
-   case PIPE_CAP_QUERY_SO_OVERFLOW:
    case PIPE_CAP_MEMOBJ:
    case PIPE_CAP_LOAD_CONSTBUF:
    case PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS:
    case PIPE_CAP_TILE_RASTER_ORDER:
    case PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES:
+   case PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS:
    case PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET:
    case PIPE_CAP_CONTEXT_PRIORITY_MASK:
    case PIPE_CAP_FENCE_SIGNAL:
    case PIPE_CAP_CONSTBUF0_FLAGS:
    case PIPE_CAP_PACKED_UNIFORMS:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES:
+   case PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE:
+   case PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS:
+   case PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
@@ -300,11 +326,23 @@
                        enum pipe_shader_cap param)
 {
    struct virgl_screen *vscreen = virgl_screen(screen);
+
+   if ((shader == PIPE_SHADER_TESS_CTRL || shader == PIPE_SHADER_TESS_EVAL) &&
+       !vscreen->caps.caps.v1.bset.has_tessellation_shaders)
+      return 0;
+
+   if (shader == PIPE_SHADER_COMPUTE &&
+       !(vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_COMPUTE_SHADER))
+     return 0;
+
    switch(shader)
    {
    case PIPE_SHADER_FRAGMENT:
    case PIPE_SHADER_VERTEX:
    case PIPE_SHADER_GEOMETRY:
+   case PIPE_SHADER_TESS_CTRL:
+   case PIPE_SHADER_TESS_EVAL:
+   case PIPE_SHADER_COMPUTE:
       switch (param) {
       case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
       case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS:
@@ -342,12 +380,27 @@
          return 32;
       case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
          return 4096 * sizeof(float[4]);
+      case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+         if (shader == PIPE_SHADER_FRAGMENT || shader == PIPE_SHADER_COMPUTE)
+            return vscreen->caps.caps.v2.max_shader_buffer_frag_compute;
+         else
+            return vscreen->caps.caps.v2.max_shader_buffer_other_stages;
+      case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+         if (shader == PIPE_SHADER_FRAGMENT || shader == PIPE_SHADER_COMPUTE)
+            return vscreen->caps.caps.v2.max_shader_image_frag_compute;
+         else
+            return vscreen->caps.caps.v2.max_shader_image_other_stages;
+      case PIPE_SHADER_CAP_SUPPORTED_IRS:
+         return (1 << PIPE_SHADER_IR_TGSI);
       case PIPE_SHADER_CAP_LOWER_IF_THRESHOLD:
       case PIPE_SHADER_CAP_TGSI_SKIP_MERGE_REGISTERS:
       case PIPE_SHADER_CAP_INT64_ATOMICS:
       case PIPE_SHADER_CAP_FP16:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS:
       case PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS:
+         return 0;
+      case PIPE_SHADER_CAP_SCALAR_ISA:
+         return 1;
       default:
          return 0;
       }
@@ -373,12 +426,61 @@
       return 16.0;
    case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
       return vscreen->caps.caps.v2.max_texture_lod_bias;
+   case PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE:
+   case PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY:
+      return 0.0f;
    }
    /* should only get here on unhandled cases */
    debug_printf("Unexpected PIPE_CAPF %d query\n", param);
    return 0.0;
 }
 
+/* get_compute_param hook.  Reports the grid, block, threads-per-block and
+ * local-memory limits out of caps.v2; any other cap, and every query when
+ * VIRGL_CAP_COMPUTE_SHADER is not set, yields 0.  A non-NULL `ret` receives
+ * the value(s) as uint64_t; the return value is their size in bytes. */
+static int
+virgl_get_compute_param(struct pipe_screen *screen,
+                        enum pipe_shader_ir ir_type,
+                        enum pipe_compute_cap param,
+                        void *ret)
+{
+   struct virgl_screen *vscreen = virgl_screen(screen);
+   uint64_t *out = ret;
+   unsigned axis;
+
+   if (!(vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_COMPUTE_SHADER))
+      return 0;
+
+   /* With a NULL ret nothing is stored; only the size is returned. */
+   switch (param) {
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      for (axis = 0; out && axis < 3; axis++)
+         out[axis] = vscreen->caps.caps.v2.max_compute_grid_size[axis];
+      return 3 * sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      for (axis = 0; out && axis < 3; axis++)
+         out[axis] = vscreen->caps.caps.v2.max_compute_block_size[axis];
+      return 3 * sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      if (out)
+         *out = vscreen->caps.caps.v2.max_compute_work_group_invocations;
+      return sizeof(uint64_t);
+
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
+      if (out)
+         *out = vscreen->caps.caps.v2.max_compute_shared_memory_size;
+      return sizeof(uint64_t);
+
+   default:
+      break;
+   }
+   return 0;
+}
+
 static boolean
 virgl_is_vertex_format_supported(struct pipe_screen *screen,
                                  enum pipe_format format)
@@ -428,12 +530,16 @@
                                  enum pipe_format format,
                                  enum pipe_texture_target target,
                                  unsigned sample_count,
+                                 unsigned storage_sample_count,
                                  unsigned bind)
 {
    struct virgl_screen *vscreen = virgl_screen(screen);
    const struct util_format_description *format_desc;
    int i;
 
+   if (MAX2(1, sample_count) != MAX2(1, storage_sample_count))
+      return false;
+
    assert(target == PIPE_BUFFER ||
           target == PIPE_TEXTURE_1D ||
           target == PIPE_TEXTURE_1D_ARRAY ||
@@ -454,6 +560,12 @@
    if (sample_count > 1) {
       if (!vscreen->caps.caps.v1.bset.texture_multisample)
          return FALSE;
+
+      if (bind & PIPE_BIND_SHADER_IMAGE) {
+         if (sample_count > vscreen->caps.caps.v2.max_image_samples)
+            return FALSE;
+      }
+
       if (sample_count > vscreen->caps.caps.v1.max_samples)
          return FALSE;
    }
@@ -462,7 +574,18 @@
       return virgl_is_vertex_format_supported(screen, format);
    }
 
+   /* Allow 3-comp 32 bit textures only for TBOs (needed for ARB_tbo_rgb32) */
+   if ((format == PIPE_FORMAT_R32G32B32_FLOAT ||
+       format == PIPE_FORMAT_R32G32B32_SINT ||
+       format == PIPE_FORMAT_R32G32B32_UINT) &&
+       target != PIPE_BUFFER)
+      return FALSE;
+
    if (bind & PIPE_BIND_RENDER_TARGET) {
+      /* For ARB_framebuffer_no_attachments. */
+      if (format == PIPE_FORMAT_NONE)
+         return TRUE;
+
       if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
          return FALSE;
 
@@ -602,6 +725,7 @@
    screen->base.get_vendor = virgl_get_vendor;
    screen->base.get_param = virgl_get_param;
    screen->base.get_shader_param = virgl_get_shader_param;
+   screen->base.get_compute_param = virgl_get_compute_param;
    screen->base.get_paramf = virgl_get_paramf;
    screen->base.is_format_supported = virgl_is_format_supported;
    screen->base.destroy = virgl_destroy_screen;
diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
index ff5abf6..d1f785d 100644
--- a/src/gallium/drivers/virgl/virgl_tgsi.c
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -31,6 +31,7 @@
 struct virgl_transform_context {
    struct tgsi_transform_context base;
    bool cull_enabled;
+   bool has_precise;
 };
 
 static void
@@ -76,7 +77,8 @@
 virgl_tgsi_transform_instruction(struct tgsi_transform_context *ctx,
 				 struct tgsi_full_instruction *inst)
 {
-   if (inst->Instruction.Precise)
+   struct virgl_transform_context *vtctx = (struct virgl_transform_context *)ctx;
+   if (!vtctx->has_precise && inst->Instruction.Precise)
       inst->Instruction.Precise = 0;
 
    for (unsigned i = 0; i < inst->Instruction.NumSrcRegs; i++) {
@@ -104,6 +106,7 @@
    transform.base.transform_property = virgl_tgsi_transform_property;
    transform.base.transform_instruction = virgl_tgsi_transform_instruction;
    transform.cull_enabled = vscreen->caps.caps.v1.bset.has_cull;
+   transform.has_precise = vscreen->caps.caps.v2.capability_bits & VIRGL_CAP_TGSI_PRECISE;
    tgsi_transform_shader(tokens_in, new_tokens, newLen, &transform.base);
 
    return new_tokens;
diff --git a/src/gallium/drivers/virgl/virgl_winsys.h b/src/gallium/drivers/virgl/virgl_winsys.h
index 690e610..0e6cb795 100644
--- a/src/gallium/drivers/virgl/virgl_winsys.h
+++ b/src/gallium/drivers/virgl/virgl_winsys.h
@@ -127,12 +127,18 @@
    caps->caps.v2.max_geom_total_output_components = 1024;
    caps->caps.v2.max_vertex_outputs = 32;
    caps->caps.v2.max_vertex_attribs = 16;
-   caps->caps.v2.max_shader_patch_varyings = 0;
+   caps->caps.v2.max_shader_patch_varyings = 30;
    caps->caps.v2.min_texel_offset = -8;
    caps->caps.v2.max_texel_offset = 7;
    caps->caps.v2.min_texture_gather_offset = -8;
    caps->caps.v2.max_texture_gather_offset = 7;
    caps->caps.v2.texture_buffer_offset_alignment = 0;
    caps->caps.v2.uniform_buffer_offset_alignment = 256;
+   caps->caps.v2.shader_buffer_offset_alignment = 32;
+   caps->caps.v2.capability_bits = 0;
+   caps->caps.v2.max_vertex_attrib_stride = 0;
+   caps->caps.v2.max_image_samples = 0;
+   caps->caps.v2.max_compute_work_group_invocations = 0;
+   caps->caps.v2.max_compute_shared_memory_size = 0;
 }
 #endif
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index c3dc5ed..7cf037f 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -279,8 +279,35 @@
    void (*set_framebuffer_state)( struct pipe_context *,
                                   const struct pipe_framebuffer_state * );
 
+   /**
+    * Set the sample locations used during rasterization. When NULL or sized
+    * zero, the default locations are used.
+    *
+    * Note that get_sample_position() still returns the default locations.
+    *
+    * The samples are accessed with
+    * locations[(pixel_y*grid_w+pixel_x)*ms+i],
+    * where:
+    * ms      = the sample count
+    * grid_w  = the pixel grid width for the sample count
+    * grid_h  = the pixel grid height for the sample count
+    * pixel_x = the window x coordinate modulo grid_w
+    * pixel_y = the window y coordinate modulo grid_h
+    * i       = the sample index
+    * This gives a result with the x coordinate as the low 4 bits and the y
+    * coordinate as the high 4 bits. For each coordinate 0 is the left or top
+    * edge of the pixel's rectangle and 16 (not 15) is the right or bottom edge.
+    *
+    * Out of bounds accesses return undefined values.
+    *
+    * The pixel grid is used to vary sample locations across pixels and its
+    * size can be queried with get_sample_pixel_grid().
+    */
+   void (*set_sample_locations)( struct pipe_context *,
+                                 size_t size, const uint8_t *locations );
+
    void (*set_polygon_stipple)( struct pipe_context *,
-				const struct pipe_poly_stipple * );
+                                const struct pipe_poly_stipple * );
 
    void (*set_scissor_states)( struct pipe_context *,
                                unsigned start_slot,
@@ -485,6 +512,16 @@
                         int clear_value_size);
 
    /**
+    * If a depth buffer is rendered with different sample location state than
+    * what is current at the time of reading, the values may differ because
+    * depth buffer compression can depend on the sample locations.
+    *
+    * This function is a hint to decompress the current depth buffer to avoid
+    * such problems.
+    */
+   void (*evaluate_depth_buffer)(struct pipe_context *pipe);
+
+   /**
     * Flush draw commands.
     *
     * This guarantees that the new fence (if any) will finish in finite time,
@@ -720,7 +757,7 @@
    /*@}*/
 
    /**
-    * Get sample position for an individual sample point.
+    * Get the default sample position for an individual sample point.
     *
     * \param sample_count - total number of samples
     * \param sample_index - sample to get the position values for
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index c4ae053..9909b2b 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -607,6 +607,17 @@
 
 
 /**
+ * Conservative rasterization modes.
+ */
+enum pipe_conservative_raster_mode
+{
+   PIPE_CONSERVATIVE_RASTER_OFF,
+   PIPE_CONSERVATIVE_RASTER_POST_SNAP,
+   PIPE_CONSERVATIVE_RASTER_PRE_SNAP,
+};
+
+
+/**
  * resource_get_handle flags.
  */
 /* Requires pipe_context::flush_resource before external use. */
@@ -675,6 +686,7 @@
    PIPE_CAP_VERTEX_COLOR_UNCLAMPED,
    PIPE_CAP_VERTEX_COLOR_CLAMPED,
    PIPE_CAP_GLSL_FEATURE_LEVEL,
+   PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY,
    PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION,
    PIPE_CAP_USER_VERTEX_BUFFERS,
    PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY,
@@ -790,11 +802,19 @@
    PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS,
    PIPE_CAP_TILE_RASTER_ORDER,
    PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES,
+   PIPE_CAP_FRAMEBUFFER_MSAA_CONSTRAINTS,
    PIPE_CAP_SIGNED_VERTEX_BUFFER_OFFSET,
    PIPE_CAP_CONTEXT_PRIORITY_MASK,
    PIPE_CAP_FENCE_SIGNAL,
    PIPE_CAP_CONSTBUF0_FLAGS,
    PIPE_CAP_PACKED_UNIFORMS,
+   PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES,
+   PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES,
+   PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES,
+   PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES,
+   PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS,
+   PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE,
+   PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS,
 };
 
 /**
@@ -834,6 +854,9 @@
    PIPE_CAPF_MAX_POINT_WIDTH_AA,
    PIPE_CAPF_MAX_TEXTURE_ANISOTROPY,
    PIPE_CAPF_MAX_TEXTURE_LOD_BIAS,
+   PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE,
+   PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE,
+   PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY,
 };
 
 /** Shader caps not specific to any single stage */
@@ -876,6 +899,7 @@
    PIPE_SHADER_CAP_TGSI_LDEXP_SUPPORTED,
    PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTERS,
    PIPE_SHADER_CAP_MAX_HW_ATOMIC_COUNTER_BUFFERS,
+   PIPE_SHADER_CAP_SCALAR_ISA,
 };
 
 /**
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 101e229..c4d6e1c 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -132,6 +132,17 @@
 			    void *ret);
 
    /**
+    * Get the sample pixel grid's size. This function requires
+    * PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS to be callable.
+    *
+    * \param sample_count - total number of samples
+    * \param out_width - the width of the pixel grid
+    * \param out_height - the height of the pixel grid
+    */
+   void (*get_sample_pixel_grid)(struct pipe_screen *, unsigned sample_count,
+                                 unsigned *out_width, unsigned *out_height);
+
+   /**
     * Query a timestamp in nanoseconds. The returned value should match
     * PIPE_QUERY_TIMESTAMP. This function returns immediately and doesn't
     * wait for rendering to complete (which cannot be achieved with queries).
@@ -157,6 +168,7 @@
                                    enum pipe_format format,
                                    enum pipe_texture_target target,
                                    unsigned sample_count,
+                                   unsigned storage_sample_count,
                                    unsigned bindings );
 
    /**
@@ -191,7 +203,7 @@
     * another process by first creating a pipe texture and then calling
     * resource_get_handle.
     *
-    * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller
+    * NOTE: in the case of WINSYS_HANDLE_TYPE_FD handles, the caller
     * retains ownership of the FD.  (This is consistent with
     * EGL_EXT_image_dma_buf_import)
     *
@@ -238,7 +250,7 @@
     * the resource into a format compatible for sharing. The use case is
     * OpenGL-OpenCL interop. The context parameter is allowed to be NULL.
     *
-    * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller
+    * NOTE: in the case of WINSYS_HANDLE_TYPE_FD handles, the caller
     * takes ownership of the FD.  (This is consistent with
     * EGL_MESA_image_dma_buf_export)
     *
@@ -389,7 +401,7 @@
     * Then the underlying memory object is then exported through interfaces
     * compatible with EXT_external_resources.
     *
-    * Note: For DRM_API_HANDLE_TYPE_FD handles, the caller retains ownership
+    * Note: For WINSYS_HANDLE_TYPE_FD handles, the caller retains ownership
     * of the fd.
     *
     * \param handle  A handle representing the memory object to import
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index f4e45c2..08ed081 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -506,6 +506,8 @@
 
    TGSI_OPCODE_LOAD               = 161,
    TGSI_OPCODE_STORE              = 162,
+   TGSI_OPCODE_IMG2HND            = 163,
+   TGSI_OPCODE_SAMP2HND           = 164,
    /* gap */
    TGSI_OPCODE_BARRIER            = 166,
 
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 4dce399..671cccd 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -74,6 +74,7 @@
 #define PIPE_MAX_CLIP_OR_CULL_DISTANCE_COUNT 8
 #define PIPE_MAX_CLIP_OR_CULL_DISTANCE_ELEMENT_COUNT 2
 #define PIPE_MAX_WINDOW_RECTANGLES 8
+#define PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE 4
 
 #define PIPE_MAX_HW_ATOMIC_BUFFERS 32
 
@@ -113,6 +114,7 @@
    unsigned line_smooth:1;
    unsigned line_stipple_enable:1;
    unsigned line_last_pixel:1;
+   unsigned conservative_raster_mode:2; /**< PIPE_CONSERVATIVE_RASTER_x */
 
    /**
     * Use the first vertex of a primitive as the provoking vertex for
@@ -123,6 +125,12 @@
    unsigned half_pixel_center:1;
    unsigned bottom_edge_rule:1;
 
+   /*
+    * Conservative rasterization subpixel precision bias in bits
+    */
+   unsigned subpixel_precision_x:4;
+   unsigned subpixel_precision_y:4;
+
    /**
     * When true, rasterization is disabled and no pixels are written.
     * This only makes sense with the Stream Out functionality.
@@ -186,6 +194,7 @@
    float offset_units;
    float offset_scale;
    float offset_clamp;
+   float conservative_raster_dilate;
 };
 
 
@@ -510,7 +519,6 @@
 struct pipe_resource
 {
    struct pipe_reference reference;
-   struct pipe_screen *screen; /**< screen that this texture belongs to */
 
    unsigned width0; /**< Used by both buffers and textures. */
    uint16_t height0; /* Textures: The maximum height/depth/array_size is 16k. */
@@ -520,9 +528,20 @@
    enum pipe_format format:16;         /**< PIPE_FORMAT_x */
    enum pipe_texture_target target:8; /**< PIPE_TEXTURE_x */
    unsigned last_level:8;    /**< Index of last mipmap level present/defined */
-   unsigned nr_samples:8;    /**< for multisampled surfaces, nr of samples */
-   unsigned usage:8;         /**< PIPE_USAGE_x (not a bitmask) */
 
+   /** Number of samples determining quality, driving rasterizer, shading,
+    *  and framebuffer.
+    */
+   unsigned nr_samples:8;
+
+   /** Multiple samples within a pixel can have the same value.
+    *  nr_storage_samples determines how many slots for different values
+    *  there are per pixel. Only color buffers can set this lower than
+    *  nr_samples.
+    */
+   unsigned nr_storage_samples:8;
+
+   unsigned usage:8;         /**< PIPE_USAGE_x (not a bitmask) */
    unsigned bind;            /**< bitmask of PIPE_BIND_x */
    unsigned flags;           /**< bitmask of PIPE_RESOURCE_FLAG_x */
 
@@ -531,6 +550,8 @@
     * next plane.
     */
    struct pipe_resource *next;
+   /* The screen pointer should be last for optimal structure packing. */
+   struct pipe_screen *screen; /**< screen that this texture belongs to */
 };
 
 
diff --git a/src/gallium/include/state_tracker/drisw_api.h b/src/gallium/include/state_tracker/drisw_api.h
index 03d5ee4..e365ab8 100644
--- a/src/gallium/include/state_tracker/drisw_api.h
+++ b/src/gallium/include/state_tracker/drisw_api.h
@@ -2,6 +2,7 @@
 #define _DRISW_API_H_
 
 #include "pipe/p_compiler.h"
+#include "sw_winsys.h"
 
 struct pipe_screen;
 struct dri_drawable;
@@ -18,6 +19,9 @@
                       void *data, unsigned width, unsigned height);
    void (*put_image2) (struct dri_drawable *dri_drawable,
                        void *data, int x, int y, unsigned width, unsigned height, unsigned stride);
+   void (*put_image_shm) (struct dri_drawable *dri_drawable,
+                          int shmid, char *shmaddr, unsigned offset,
+                          int x, int y, unsigned width, unsigned height, unsigned stride);
 };
 
 #endif
diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index f188b5a..19cd19f 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -4,58 +4,13 @@
 
 #include "pipe/p_compiler.h"
 
+#include "winsys_handle.h"
+
 struct pipe_screen;
 struct pipe_screen_config;
 struct pipe_context;
 struct pipe_resource;
 
-#define DRM_API_HANDLE_TYPE_SHARED 0
-#define DRM_API_HANDLE_TYPE_KMS    1
-#define DRM_API_HANDLE_TYPE_FD     2
-
-
-/**
- * For use with pipe_screen::{texture_from_handle|texture_get_handle}.
- */
-struct winsys_handle
-{
-   /**
-    * Input for texture_from_handle, valid values are
-    * DRM_API_HANDLE_TYPE_SHARED or DRM_API_HANDLE_TYPE_FD.
-    * Input to texture_get_handle,
-    * to select handle for kms, flink, or prime.
-    */
-   unsigned type;
-   /**
-    * Input for texture_get_handle, allows to export the offset
-    * of a specific layer of an array texture.
-    */
-   unsigned layer;
-   /**
-    * Input to texture_from_handle.
-    * Output for texture_get_handle.
-    */
-   unsigned handle;
-   /**
-    * Input to texture_from_handle.
-    * Output for texture_get_handle.
-    */
-   unsigned stride;
-   /**
-    * Input to texture_from_handle.
-    * Output for texture_get_handle.
-    */
-   unsigned offset;
-
-   /**
-    * Input to resource_from_handle.
-    * Output from resource_get_handle.
-    */
-   uint64_t modifier;
-};
-
-
-
 /**
  * Configuration queries.
  */
diff --git a/src/gallium/include/state_tracker/st_api.h b/src/gallium/include/state_tracker/st_api.h
index ec6e784..03377a3 100644
--- a/src/gallium/include/state_tracker/st_api.h
+++ b/src/gallium/include/state_tracker/st_api.h
@@ -190,6 +190,8 @@
  */
 struct st_visual
 {
+   bool no_config;
+
    /**
     * Available buffers.  Bitfield of ST_ATTACHMENT_*_MASK bits.
     */
@@ -222,6 +224,8 @@
    boolean force_glsl_extensions_warn;
    unsigned force_glsl_version;
    boolean allow_glsl_extension_directive_midshader;
+   boolean allow_glsl_builtin_const_expression;
+   boolean allow_glsl_relaxed_es;
    boolean allow_glsl_builtin_variable_redeclaration;
    boolean allow_higher_compat_version;
    boolean glsl_zero_init;
diff --git a/src/gallium/include/state_tracker/sw_winsys.h b/src/gallium/include/state_tracker/sw_winsys.h
index 0b792cd..cd5838a 100644
--- a/src/gallium/include/state_tracker/sw_winsys.h
+++ b/src/gallium/include/state_tracker/sw_winsys.h
@@ -37,14 +37,13 @@
 
 #include "pipe/p_compiler.h" /* for boolean */
 #include "pipe/p_format.h"
-
+#include "state_tracker/winsys_handle.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 
-struct winsys_handle;
 struct pipe_screen;
 struct pipe_context;
 struct pipe_resource;
diff --git a/src/gallium/include/state_tracker/winsys_handle.h b/src/gallium/include/state_tracker/winsys_handle.h
new file mode 100644
index 0000000..167c1a9
--- /dev/null
+++ b/src/gallium/include/state_tracker/winsys_handle.h
@@ -0,0 +1,58 @@
+
+#ifndef _WINSYS_HANDLE_H_
+#define _WINSYS_HANDLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define WINSYS_HANDLE_TYPE_SHARED 0
+#define WINSYS_HANDLE_TYPE_KMS    1
+#define WINSYS_HANDLE_TYPE_FD     2
+#define WINSYS_HANDLE_TYPE_SHMID   3
+
+/**
+ * For use with pipe_screen::{texture_from_handle|texture_get_handle}.
+ */
+struct winsys_handle
+{
+   /**
+    * Input for texture_from_handle, valid values are
+    * WINSYS_HANDLE_TYPE_SHARED or WINSYS_HANDLE_TYPE_FD.
+    * Input to texture_get_handle,
+    * to select handle for kms, flink, or prime.
+    */
+   unsigned type;
+   /**
+    * Input for texture_get_handle, allows to export the offset
+    * of a specific layer of an array texture.
+    */
+   unsigned layer;
+   /**
+    * Input to texture_from_handle.
+    * Output for texture_get_handle.
+    */
+   unsigned handle;
+   /**
+    * Input to texture_from_handle.
+    * Output for texture_get_handle.
+    */
+   unsigned stride;
+   /**
+    * Input to texture_from_handle.
+    * Output for texture_get_handle.
+    */
+   unsigned offset;
+
+   /**
+    * Input to resource_from_handle.
+    * Output from resource_get_handle.
+    */
+   uint64_t modifier;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _WINSYS_HANDLE_H_ */
diff --git a/src/gallium/meson.build b/src/gallium/meson.build
index 0239172..561af9d 100644
--- a/src/gallium/meson.build
+++ b/src/gallium/meson.build
@@ -40,9 +40,6 @@
   subdir('winsys/sw/hgl')
 endif
 if with_gallium_swr
-  if meson.version().version_compare('< 0.44.0')
-    error('SWR requires meson 0.44.0 or greater.')
-  endif
   subdir('drivers/swr')
 else
   driver_swr = declare_dependency()
@@ -97,11 +94,11 @@
 else
   driver_pl111 = declare_dependency()
 endif
-if with_gallium_vc5
-  subdir('winsys/vc5/drm')
-  subdir('drivers/vc5')
+if with_gallium_v3d
+  subdir('winsys/v3d/drm')
+  subdir('drivers/v3d')
 else
-  driver_vc5 = declare_dependency()
+  driver_v3d = declare_dependency()
 endif
 if with_gallium_etnaviv
   subdir('winsys/etnaviv/drm')
@@ -144,9 +141,6 @@
   # consumer
   subdir('targets/pipe-loader')
 
-  if meson.version().version_compare('< 0.44.0')
-    error('OpenCL requires meson 0.44.0 or greater.')
-  endif
   subdir('state_trackers/clover')
   subdir('targets/opencl')
 endif
diff --git a/src/gallium/state_trackers/clover/core/format.cpp b/src/gallium/state_trackers/clover/core/format.cpp
index 5701292..dee1872 100644
--- a/src/gallium/state_trackers/clover/core/format.cpp
+++ b/src/gallium/state_trackers/clover/core/format.cpp
@@ -152,7 +152,7 @@
       for (auto f : formats) {
          if (all_of([=](const device &dev) {
                   return dev.pipe->is_format_supported(
-                     dev.pipe, f.second, target, 1, bindings);
+                     dev.pipe, f.second, target, 1, 1, bindings);
                }, ctx.devices()))
             s.insert(f.first);
       }
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index 4ba6ff4..5d46854 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -93,6 +93,7 @@
          /// Free any resources that were allocated in bind().
          virtual void unbind(exec_context &ctx) = 0;
 
+         virtual ~argument() {};
       protected:
          argument();
 
diff --git a/src/gallium/state_trackers/clover/llvm/codegen/native.cpp b/src/gallium/state_trackers/clover/llvm/codegen/native.cpp
index 409f8ac..b8ed01c 100644
--- a/src/gallium/state_trackers/clover/llvm/codegen/native.cpp
+++ b/src/gallium/state_trackers/clover/llvm/codegen/native.cpp
@@ -114,8 +114,7 @@
 
       std::unique_ptr<TargetMachine> tm {
          t->createTargetMachine(target.triple, target.cpu, "", {},
-                                compat::default_reloc_model,
-                                compat::default_code_model,
+                                ::llvm::None, compat::default_code_model,
                                 ::llvm::CodeGenOpt::Default) };
       if (!tm)
          fail(r_log, build_error(),
@@ -124,15 +123,14 @@
       ::llvm::SmallVector<char, 1024> data;
 
       {
-         compat::pass_manager pm;
+         ::llvm::legacy::PassManager pm;
          ::llvm::raw_svector_ostream os { data };
-         compat::raw_ostream_to_emit_file fos(os);
 
-         mod.setDataLayout(compat::get_data_layout(*tm));
+         mod.setDataLayout(tm->createDataLayout());
          tm->Options.MCOptions.AsmVerbose =
             (ft == TargetMachine::CGFT_AssemblyFile);
 
-         if (tm->addPassesToEmitFile(pm, fos, ft))
+         if (compat::add_passes_to_emit_file(*tm, pm, os, ft))
             fail(r_log, build_error(), "TargetMachine can't emit this file");
 
          pm.run(mod);
diff --git a/src/gallium/state_trackers/clover/llvm/compat.hpp b/src/gallium/state_trackers/clover/llvm/compat.hpp
index 5d553e5..975012c 100644
--- a/src/gallium/state_trackers/clover/llvm/compat.hpp
+++ b/src/gallium/state_trackers/clover/llvm/compat.hpp
@@ -54,15 +54,8 @@
 #include <llvm/Support/ErrorOr.h>
 #endif
 
-#if HAVE_LLVM >= 0x0307
 #include <llvm/IR/LegacyPassManager.h>
 #include <llvm/Analysis/TargetLibraryInfo.h>
-#else
-#include <llvm/PassManager.h>
-#include <llvm/Target/TargetLibraryInfo.h>
-#include <llvm/Target/TargetSubtargetInfo.h>
-#include <llvm/Support/FormattedStream.h>
-#endif
 
 #include <clang/Basic/TargetInfo.h>
 #include <clang/Frontend/CodeGenOptions.h>
@@ -71,12 +64,6 @@
 namespace clover {
    namespace llvm {
       namespace compat {
-#if HAVE_LLVM >= 0x0307
-         typedef ::llvm::TargetLibraryInfoImpl target_library_info;
-#else
-         typedef ::llvm::TargetLibraryInfo target_library_info;
-#endif
-
          template<typename T, typename AS>
          unsigned target_address_space(const T &target, const AS lang_as) {
             const auto &map = target.getAddressSpaceMap();
@@ -96,19 +83,6 @@
 #endif
 
          inline void
-         set_lang_defaults(clang::CompilerInvocation &inv,
-                           clang::LangOptions &lopts, clang::InputKind ik,
-                           const ::llvm::Triple &t,
-                           clang::PreprocessorOptions &ppopts,
-                           clang::LangStandard::Kind std) {
-#if HAVE_LLVM >= 0x0309
-            inv.setLangDefaults(lopts, ik, t, ppopts, std);
-#else
-            inv.setLangDefaults(lopts, ik, std);
-#endif
-         }
-
-         inline void
          add_link_bitcode_file(clang::CodeGenOptions &opts,
                                const std::string &path) {
 #if HAVE_LLVM >= 0x0500
@@ -118,78 +92,8 @@
             F.PropagateAttrs = true;
             F.LinkFlags = ::llvm::Linker::Flags::None;
             opts.LinkBitcodeFiles.emplace_back(F);
-#elif HAVE_LLVM >= 0x0308
+#else
             opts.LinkBitcodeFiles.emplace_back(::llvm::Linker::Flags::None, path);
-#else
-            opts.LinkBitcodeFile = path;
-#endif
-         }
-
-#if HAVE_LLVM >= 0x0307
-         typedef ::llvm::legacy::PassManager pass_manager;
-#else
-         typedef ::llvm::PassManager pass_manager;
-#endif
-
-         inline void
-         add_data_layout_pass(pass_manager &pm) {
-#if HAVE_LLVM < 0x0307
-            pm.add(new ::llvm::DataLayoutPass());
-#endif
-         }
-
-         inline void
-         add_internalize_pass(pass_manager &pm,
-                              const std::vector<std::string> &names) {
-#if HAVE_LLVM >= 0x0309
-            pm.add(::llvm::createInternalizePass(
-                      [=](const ::llvm::GlobalValue &gv) {
-                         return std::find(names.begin(), names.end(),
-                                          gv.getName()) != names.end();
-                      }));
-#else
-            pm.add(::llvm::createInternalizePass(std::vector<const char *>(
-                      map(std::mem_fn(&std::string::data), names))));
-#endif
-         }
-
-         inline std::unique_ptr< ::llvm::Linker>
-         create_linker(::llvm::Module &mod) {
-#if HAVE_LLVM >= 0x0308
-            return std::unique_ptr< ::llvm::Linker>(new ::llvm::Linker(mod));
-#else
-            return std::unique_ptr< ::llvm::Linker>(new ::llvm::Linker(&mod));
-#endif
-         }
-
-         inline bool
-         link_in_module(::llvm::Linker &linker,
-                        std::unique_ptr< ::llvm::Module> mod) {
-#if HAVE_LLVM >= 0x0308
-            return linker.linkInModule(std::move(mod));
-#else
-            return linker.linkInModule(mod.get());
-#endif
-         }
-
-#if HAVE_LLVM >= 0x0307
-         typedef ::llvm::raw_svector_ostream &raw_ostream_to_emit_file;
-#else
-         typedef ::llvm::formatted_raw_ostream raw_ostream_to_emit_file;
-#endif
-
-#if HAVE_LLVM >= 0x0307
-         typedef ::llvm::DataLayout data_layout;
-#else
-         typedef const ::llvm::DataLayout *data_layout;
-#endif
-
-         inline data_layout
-         get_data_layout(::llvm::TargetMachine &tm) {
-#if HAVE_LLVM >= 0x0307
-            return tm.createDataLayout();
-#else
-            return tm.getSubtargetImpl()->getDataLayout();
 #endif
          }
 
@@ -199,12 +103,6 @@
          const auto default_code_model = ::llvm::CodeModel::Default;
 #endif
 
-#if HAVE_LLVM >= 0x0309
-         const auto default_reloc_model = ::llvm::None;
-#else
-         const auto default_reloc_model = ::llvm::Reloc::Default;
-#endif
-
          template<typename M, typename F> void
          handle_module_error(M &mod, const F &f) {
 #if HAVE_LLVM >= 0x0400
@@ -248,6 +146,16 @@
 #endif
 	}
 
+	template<typename TM, typename PM, typename OS, typename FT>
+	bool add_passes_to_emit_file(TM &tm, PM &pm, OS &os, FT &ft)
+	{
+#if HAVE_LLVM >= 0x0700
+		return tm.addPassesToEmitFile(pm, os, nullptr, ft);
+#else
+		return tm.addPassesToEmitFile(pm, os, ft);
+#endif
+	}
+
 	template<typename T, typename M>
 	T get_abi_type(const T &arg_type, const M &mod) {
 #if HAVE_LLVM >= 0x0700
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index b2c64bc..0a677ce 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -216,7 +216,7 @@
       // http://www.llvm.org/bugs/show_bug.cgi?id=19735
       c->getDiagnosticOpts().ShowCarets = false;
 
-      compat::set_lang_defaults(c->getInvocation(), c->getLangOpts(),
+      c->getInvocation().setLangDefaults(c->getLangOpts(),
                                 compat::ik_opencl, ::llvm::Triple(target.triple),
                                 c->getPreprocessorOpts(),
                                 get_language_version(opts, device_clc_version));
@@ -314,9 +314,7 @@
    void
    optimize(Module &mod, unsigned optimization_level,
             bool internalize_symbols) {
-      compat::pass_manager pm;
-
-      compat::add_data_layout_pass(pm);
+      ::llvm::legacy::PassManager pm;
 
       // By default, the function internalizer pass will look for a function
       // called "main" and then mark all other functions as internal.  Marking
@@ -330,13 +328,19 @@
       // list of kernel functions to the internalizer.  The internalizer will
       // treat the functions in the list as "main" functions and internalize
       // all of the other functions.
-      if (internalize_symbols)
-         compat::add_internalize_pass(pm, map(std::mem_fn(&Function::getName),
-                                              get_kernels(mod)));
+      if (internalize_symbols) {
+         std::vector<std::string> names =
+            map(std::mem_fn(&Function::getName), get_kernels(mod));
+         pm.add(::llvm::createInternalizePass(
+                      [=](const ::llvm::GlobalValue &gv) {
+                         return std::find(names.begin(), names.end(),
+                                          gv.getName()) != names.end();
+                      }));
+      }
 
       ::llvm::PassManagerBuilder pmb;
       pmb.OptLevel = optimization_level;
-      pmb.LibraryInfo = new compat::target_library_info(
+      pmb.LibraryInfo = new ::llvm::TargetLibraryInfoImpl(
          ::llvm::Triple(mod.getTargetTriple()));
       pmb.populateModulePassManager(pm);
       pm.run(mod);
@@ -346,11 +350,10 @@
    link(LLVMContext &ctx, const clang::CompilerInstance &c,
         const std::vector<module> &modules, std::string &r_log) {
       std::unique_ptr<Module> mod { new Module("link", ctx) };
-      auto linker = compat::create_linker(*mod);
+      std::unique_ptr< ::llvm::Linker> linker { new ::llvm::Linker(*mod) };
 
       for (auto &m : modules) {
-         if (compat::link_in_module(*linker,
-                                    parse_module_library(m, ctx, r_log)))
+         if (linker->linkInModule(parse_module_library(m, ctx, r_log)))
             throw build_error();
       }
 
diff --git a/src/gallium/state_trackers/clover/meson.build b/src/gallium/state_trackers/clover/meson.build
index d1497e6..1a09d8f 100644
--- a/src/gallium/state_trackers/clover/meson.build
+++ b/src/gallium/state_trackers/clover/meson.build
@@ -115,7 +115,7 @@
 
 libclover = static_library(
   'clover',
-  clover_files,
+  [clover_files, sha1_h],
   include_directories : clover_incs,
   cpp_args : [clover_cpp_args, cpp_vis_args],
   link_with : [libcltgsi, libclllvm],
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 1754d86..2411a36 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -620,9 +620,9 @@
 
    memset(&whandle, 0, sizeof(whandle));
    if (screen->can_share_buffer)
-      whandle.type = DRM_API_HANDLE_TYPE_SHARED;
+      whandle.type = WINSYS_HANDLE_TYPE_SHARED;
    else
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
 
    screen->base.screen->resource_get_handle(screen->base.screen, NULL,
          buffer->resource, &whandle,
@@ -808,9 +808,9 @@
          whandle.offset = 0;
          whandle.modifier = DRM_FORMAT_MOD_INVALID;
          if (screen->can_share_buffer)
-            whandle.type = DRM_API_HANDLE_TYPE_SHARED;
+            whandle.type = WINSYS_HANDLE_TYPE_SHARED;
          else
-            whandle.type = DRM_API_HANDLE_TYPE_KMS;
+            whandle.type = WINSYS_HANDLE_TYPE_KMS;
          drawable->textures[statt] =
             screen->base.screen->resource_from_handle(screen->base.screen,
                   &templ, &whandle,
@@ -832,6 +832,7 @@
             templ.bind = drawable->textures[statt]->bind &
                          ~(PIPE_BIND_SCANOUT | PIPE_BIND_SHARED);
             templ.nr_samples = drawable->stvis.samples;
+            templ.nr_storage_samples = drawable->stvis.samples;
 
             /* Try to reuse the resource.
              * (the other resource parameters should be constant)
@@ -883,10 +884,12 @@
 
          if (drawable->stvis.samples > 1) {
             templ.nr_samples = drawable->stvis.samples;
+            templ.nr_storage_samples = drawable->stvis.samples;
             zsbuf = &drawable->msaa_textures[statt];
          }
          else {
             templ.nr_samples = 0;
+            templ.nr_storage_samples = 0;
             zsbuf = &drawable->textures[statt];
          }
 
@@ -1068,7 +1071,7 @@
    enum pipe_format pf;
 
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type = DRM_API_HANDLE_TYPE_SHARED;
+   whandle.type = WINSYS_HANDLE_TYPE_SHARED;
    whandle.handle = name;
    whandle.modifier = DRM_FORMAT_MOD_INVALID;
 
@@ -1127,7 +1130,7 @@
          goto exit;
       }
 
-      whandles[i].type = DRM_API_HANDLE_TYPE_FD;
+      whandles[i].type = WINSYS_HANDLE_TYPE_FD;
       whandles[i].handle = (unsigned)fds[i];
       whandles[i].stride = (unsigned)strides[i];
       whandles[i].offset = (unsigned)offsets[i];
@@ -1267,35 +1270,35 @@
 
    switch (attrib) {
    case __DRI_IMAGE_ATTRIB_STRIDE:
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
             NULL, image->texture, &whandle, usage))
          return GL_FALSE;
       *value = whandle.stride;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_OFFSET:
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
             NULL, image->texture, &whandle, usage))
          return GL_FALSE;
       *value = whandle.offset;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_HANDLE:
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
          NULL, image->texture, &whandle, usage))
          return GL_FALSE;
       *value = whandle.handle;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_NAME:
-      whandle.type = DRM_API_HANDLE_TYPE_SHARED;
+      whandle.type = WINSYS_HANDLE_TYPE_SHARED;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
          NULL, image->texture, &whandle, usage))
          return GL_FALSE;
       *value = whandle.handle;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_FD:
-      whandle.type= DRM_API_HANDLE_TYPE_FD;
+      whandle.type= WINSYS_HANDLE_TYPE_FD;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
             NULL, image->texture, &whandle, usage))
          return GL_FALSE;
@@ -1318,12 +1321,12 @@
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_FOURCC:
       *value = convert_to_fourcc(image->dri_format);
-      return GL_TRUE;
+      return *value != -1;
    case __DRI_IMAGE_ATTRIB_NUM_PLANES:
       *value = 1;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_MODIFIER_UPPER:
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
       whandle.modifier = DRM_FORMAT_MOD_INVALID;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
             NULL, image->texture, &whandle, usage))
@@ -1333,7 +1336,7 @@
       *value = (whandle.modifier >> 32) & 0xffffffff;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_MODIFIER_LOWER:
-      whandle.type = DRM_API_HANDLE_TYPE_KMS;
+      whandle.type = WINSYS_HANDLE_TYPE_KMS;
       whandle.modifier = DRM_FORMAT_MOD_INVALID;
       if (!image->texture->screen->resource_get_handle(image->texture->screen,
             NULL, image->texture, &whandle, usage))
@@ -1413,7 +1416,7 @@
       return NULL;
 
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type = DRM_API_HANDLE_TYPE_SHARED;
+   whandle.type = WINSYS_HANDLE_TYPE_SHARED;
    whandle.handle = names[0];
    whandle.stride = strides[0];
    whandle.offset = offsets[0];
@@ -1492,7 +1495,7 @@
                                        fourcc_to_pipe_format(
                                           fourcc_formats[i]),
                                        screen->target,
-                                       0, bind)) {
+                                       0, 0, bind)) {
          if (j < max)
             formats[j] = fourcc_formats[i];
          j++;
@@ -1513,7 +1516,8 @@
    const unsigned usage = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
 
    if (pscreen->query_dmabuf_modifiers != NULL &&
-       pscreen->is_format_supported(pscreen, format, screen->target, 0, usage)) {
+       pscreen->is_format_supported(pscreen, format, screen->target, 0, 0,
+                                    usage)) {
       pscreen->query_dmabuf_modifiers(pscreen, format, max, modifiers,
                                       external_only, count);
       return true;
@@ -1951,7 +1955,7 @@
    }
 
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
 
    success = screen->resource_get_handle(screen, st->pipe, res, &whandle,
                                          usage);
diff --git a/src/gallium/state_trackers/dri/dri_context.c b/src/gallium/state_trackers/dri/dri_context.c
index fb30733..af9e332 100644
--- a/src/gallium/state_trackers/dri/dri_context.c
+++ b/src/gallium/state_trackers/dri/dri_context.c
@@ -62,6 +62,7 @@
       __DRIVER_CONTEXT_ATTRIB_RELEASE_BEHAVIOR;
    const __DRIbackgroundCallableExtension *backgroundCallable =
       screen->sPriv->dri2.backgroundCallable;
+   const struct driOptionCache *optionCache = &screen->dev->option_cache;
 
    if (screen->has_reset_status_query) {
       allowed_flags |= __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS;
@@ -88,8 +89,13 @@
       break;
    case API_OPENGL_COMPAT:
    case API_OPENGL_CORE:
-      attribs.profile = api == API_OPENGL_COMPAT ? ST_PROFILE_DEFAULT
-                                                 : ST_PROFILE_OPENGL_CORE;
+      if (driQueryOptionb(optionCache, "force_compat_profile")) {
+         attribs.profile = ST_PROFILE_DEFAULT;
+      } else {
+         attribs.profile = api == API_OPENGL_COMPAT ? ST_PROFILE_DEFAULT
+                                                    : ST_PROFILE_OPENGL_CORE;
+      }
+
       attribs.major = ctx_config->major_version;
       attribs.minor = ctx_config->minor_version;
 
diff --git a/src/gallium/state_trackers/dri/dri_helpers.c b/src/gallium/state_trackers/dri/dri_helpers.c
index 5d42873..25095bb 100644
--- a/src/gallium/state_trackers/dri/dri_helpers.c
+++ b/src/gallium/state_trackers/dri/dri_helpers.c
@@ -296,12 +296,6 @@
    img->dri_format = driGLFormatToImageFormat(rb->Format);
    img->loader_private = loaderPrivate;
 
-   if (img->dri_format == __DRI_IMAGE_FORMAT_NONE) {
-      *error = __DRI_IMAGE_ERROR_BAD_PARAMETER;
-      free(img);
-      return NULL;
-   }
-
    pipe_resource_reference(&img->texture, tex);
 
    *error = __DRI_IMAGE_ERROR_SUCCESS;
@@ -379,12 +373,6 @@
 
    img->loader_private = loaderPrivate;
 
-   if (img->dri_format == __DRI_IMAGE_FORMAT_NONE) {
-      *error = __DRI_IMAGE_ERROR_BAD_PARAMETER;
-      free(img);
-      return NULL;
-   }
-
    pipe_resource_reference(&img->texture, tex);
 
    *error = __DRI_IMAGE_ERROR_SUCCESS;
diff --git a/src/gallium/state_trackers/dri/dri_query_renderer.c b/src/gallium/state_trackers/dri/dri_query_renderer.c
index 80847e3..2417f40 100644
--- a/src/gallium/state_trackers/dri/dri_query_renderer.c
+++ b/src/gallium/state_trackers/dri/dri_query_renderer.c
@@ -52,7 +52,7 @@
       value[0] =
          screen->base.screen->is_format_supported(screen->base.screen,
                                                   PIPE_FORMAT_B8G8R8A8_SRGB,
-                                                  PIPE_TEXTURE_2D, 0,
+                                                  PIPE_TEXTURE_2D, 0, 0,
                                                   PIPE_BIND_RENDER_TARGET);
       return 0;
    case __DRI2_RENDERER_HAS_CONTEXT_PRIORITY:
diff --git a/src/gallium/state_trackers/dri/dri_screen.c b/src/gallium/state_trackers/dri/dri_screen.c
index aaee987..a0dcdb5 100644
--- a/src/gallium/state_trackers/dri/dri_screen.c
+++ b/src/gallium/state_trackers/dri/dri_screen.c
@@ -74,6 +74,10 @@
       driQueryOptioni(optionCache, "force_glsl_version");
    options->allow_glsl_extension_directive_midshader =
       driQueryOptionb(optionCache, "allow_glsl_extension_directive_midshader");
+   options->allow_glsl_builtin_const_expression =
+      driQueryOptionb(optionCache, "allow_glsl_builtin_const_expression");
+   options->allow_glsl_relaxed_es =
+      driQueryOptionb(optionCache, "allow_glsl_relaxed_es");
    options->allow_glsl_builtin_variable_redeclaration =
       driQueryOptionb(optionCache, "allow_glsl_builtin_variable_redeclaration");
    options->allow_higher_compat_version =
@@ -185,22 +189,22 @@
       ? MSAA_VISUAL_MAX_SAMPLES : 1;
 
    pf_x8z24 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_Z24X8_UNORM,
-					    PIPE_TEXTURE_2D, 0,
+					    PIPE_TEXTURE_2D, 0, 0,
                                             PIPE_BIND_DEPTH_STENCIL);
    pf_z24x8 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_X8Z24_UNORM,
-					    PIPE_TEXTURE_2D, 0,
+					    PIPE_TEXTURE_2D, 0, 0,
                                             PIPE_BIND_DEPTH_STENCIL);
    pf_s8z24 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_Z24_UNORM_S8_UINT,
-					    PIPE_TEXTURE_2D, 0,
+					    PIPE_TEXTURE_2D, 0, 0,
                                             PIPE_BIND_DEPTH_STENCIL);
    pf_z24s8 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_S8_UINT_Z24_UNORM,
-					    PIPE_TEXTURE_2D, 0,
+					    PIPE_TEXTURE_2D, 0, 0,
                                             PIPE_BIND_DEPTH_STENCIL);
    pf_z16 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_Z16_UNORM,
-                                          PIPE_TEXTURE_2D, 0,
+                                          PIPE_TEXTURE_2D, 0, 0,
                                           PIPE_BIND_DEPTH_STENCIL);
    pf_z32 = p_screen->is_format_supported(p_screen, PIPE_FORMAT_Z32_UNORM,
-                                          PIPE_TEXTURE_2D, 0,
+                                          PIPE_TEXTURE_2D, 0, 0,
                                           PIPE_BIND_DEPTH_STENCIL);
 
    if (pf_z16) {
@@ -248,7 +252,7 @@
          continue;
 
       if (!p_screen->is_format_supported(p_screen, pipe_formats[format],
-                                         PIPE_TEXTURE_2D, 0,
+                                         PIPE_TEXTURE_2D, 0, 0,
                                          PIPE_BIND_RENDER_TARGET |
                                          PIPE_BIND_DISPLAY_TARGET))
          continue;
@@ -257,7 +261,7 @@
          int samples = i > 1 ? i : 0;
 
          if (p_screen->is_format_supported(p_screen, pipe_formats[format],
-                                           PIPE_TEXTURE_2D, samples,
+                                           PIPE_TEXTURE_2D, samples, samples,
                                            PIPE_BIND_RENDER_TARGET)) {
             msaa_modes[num_msaa_modes++] = samples;
          }
@@ -298,13 +302,16 @@
  * Roughly the converse of dri_fill_in_modes.
  */
 void
-dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
+dri_fill_st_visual(struct st_visual *stvis,
+                   const struct dri_screen *screen,
                    const struct gl_config *mode)
 {
    memset(stvis, 0, sizeof(*stvis));
 
-   if (!mode)
+   if (!mode) {
+      stvis->no_config = true;
       return;
+   }
 
    /* Deduce the color format. */
    switch (mode->redMask) {
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index 677e945..e410aa9 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -127,7 +127,8 @@
 }
 
 void
-dri_fill_st_visual(struct st_visual *stvis, struct dri_screen *screen,
+dri_fill_st_visual(struct st_visual *stvis,
+                   const struct dri_screen *screen,
                    const struct gl_config *mode);
 
 void
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index eb57523..e24fcba 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -26,14 +26,6 @@
  *
  **************************************************************************/
 
-/* TODO:
- *
- * xshm / EGLImage:
- *
- * Allow the loaders to use the XSHM extension. It probably requires callbacks
- * for createImage/destroyImage similar to DRI2 getBuffers.
- */
-
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
@@ -87,6 +79,19 @@
 }
 
 static inline void
+put_image_shm(__DRIdrawable *dPriv, int shmid, char *shmaddr,
+              unsigned offset, int x, int y,
+              unsigned width, unsigned height, unsigned stride)
+{
+   /* Hand the region to the loader's putImageShm hook as a SWAP operation. */
+   const __DRIswrastLoaderExtension *ext = dPriv->driScreenPriv->swrast_loader;
+
+   ext->putImageShm(dPriv, __DRI_SWRAST_IMAGE_OP_SWAP,
+                    x, y, width, height, stride,
+                    shmid, shmaddr, offset, dPriv->loaderPrivate);
+}
+
+static inline void
 get_image(__DRIdrawable *dPriv, int x, int y, int width, int height, void *data)
 {
    __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -112,6 +117,26 @@
                      data, dPriv->loaderPrivate);
 }
 
+/* Read back through the loader's getImageShm; FALSE if that path is unusable. */
+static inline bool
+get_image_shm(__DRIdrawable *dPriv, int x, int y, int width, int height,
+              struct pipe_resource *res)
+{
+   __DRIscreen *sPriv = dPriv->driScreenPriv;
+   const __DRIswrastLoaderExtension *loader = sPriv->swrast_loader;
+   /* Zero-init so no indeterminate field is handed to resource_get_handle(). */
+   struct winsys_handle whandle = { .type = WINSYS_HANDLE_TYPE_SHMID };
+
+   if (loader->base.version < 4 || !loader->getImageShm)
+      return FALSE;
+
+   if (!res->screen->resource_get_handle(res->screen, NULL, res, &whandle, PIPE_HANDLE_USAGE_WRITE))
+      return FALSE;
+
+   loader->getImageShm(dPriv, x, y, width, height, whandle.handle, dPriv->loaderPrivate);
+   return TRUE;
+}
+
 static void
 drisw_update_drawable_info(struct dri_drawable *drawable)
 {
@@ -153,6 +178,17 @@
 }
 
 static inline void
+drisw_put_image_shm(struct dri_drawable *drawable,
+                    int shmid, char *shmaddr, unsigned offset,
+                    int x, int y, unsigned width, unsigned height,
+                    unsigned stride)
+{
+   /* Unwrap the __DRIdrawable and delegate to put_image_shm(). */
+   put_image_shm(drawable->dPriv, shmid, shmaddr, offset,
+                 x, y, width, height, stride);
+}
+
+static inline void
 drisw_present_texture(__DRIdrawable *dPriv,
                       struct pipe_resource *ptex, struct pipe_box *sub_box)
 {
@@ -348,7 +384,8 @@
                            x, y, w, h, &transfer);
 
    /* Copy the Drawable content to the mapped texture buffer */
-   get_image(dPriv, x, y, w, h, map);
+   if (!get_image_shm(dPriv, x, y, w, h, res))
+      get_image(dPriv, x, y, w, h, map);
 
    /* The pipe transfer has a pitch rounded up to the nearest 64 pixels.
       get_image() has a pitch rounded up to 4 bytes.  */
@@ -394,6 +431,7 @@
 static const __DRIconfig **
 drisw_init_screen(__DRIscreen * sPriv)
 {
+   const __DRIswrastLoaderExtension *loader = sPriv->swrast_loader;
    const __DRIconfig **configs;
    struct dri_screen *screen;
    struct pipe_screen *pscreen = NULL;
@@ -409,6 +447,10 @@
 
    sPriv->driverPrivate = (void *)screen;
    sPriv->extensions = drisw_screen_extensions;
+   if (loader->base.version >= 4) {
+      if (loader->putImageShm)
+         drisw_lf.put_image_shm = drisw_put_image_shm;
+   }
 
    if (pipe_loader_sw_probe_dri(&screen->dev, &drisw_lf)) {
       dri_init_options(screen);
diff --git a/src/gallium/state_trackers/glx/xlib/glx_getproc.c b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
index 58b4763..6b94f2c 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_getproc.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_getproc.c
@@ -43,7 +43,7 @@
 };
 
 
-static struct name_address_pair GLX_functions[] = {
+static const struct name_address_pair GLX_functions[] = {
    /*** GLX_VERSION_1_0 ***/
    { "glXChooseVisual", (__GLXextFuncPtr) glXChooseVisual },
    { "glXCopyContext", (__GLXextFuncPtr) glXCopyContext },
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index e4c9408..b560ffc 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -489,7 +489,7 @@
    for (i = 0; i < count; i++) {
       if (xmdpy->screen->is_format_supported(xmdpy->screen, formats[i],
                                              target, sample_count,
-                                             tex_usage)) {
+                                             sample_count, tex_usage)) {
          fmt = formats[i];
          break;
       }
@@ -892,6 +892,7 @@
    if (!xmdpy->screen->is_format_supported(xmdpy->screen,
                                            v->stvis.color_format,
                                            PIPE_TEXTURE_2D, num_samples,
+                                           num_samples,
                                            PIPE_BIND_RENDER_TARGET))
       v->stvis.color_format = PIPE_FORMAT_NONE;
 
diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c
index 9def70f..2fa80f4 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_st.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_st.c
@@ -136,6 +136,7 @@
    templ.array_size = 1;
    templ.last_level = 0;
    templ.nr_samples = xstfb->stvis.samples;
+   templ.nr_storage_samples = xstfb->stvis.samples;
 
    for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
       enum pipe_format format;
diff --git a/src/gallium/state_trackers/nine/basetexture9.c b/src/gallium/state_trackers/nine/basetexture9.c
index 0a9034c..911eee6 100644
--- a/src/gallium/state_trackers/nine/basetexture9.c
+++ b/src/gallium/state_trackers/nine/basetexture9.c
@@ -557,7 +557,7 @@
     srgb_format = util_format_srgb(resource->format);
     if (sRGB && srgb_format != PIPE_FORMAT_NONE &&
         screen->is_format_supported(screen, srgb_format,
-                                    resource->target, 0, resource->bind))
+                                    resource->target, 0, 0, resource->bind))
         templ.format = srgb_format;
     else
         templ.format = resource->format;
diff --git a/src/gallium/state_trackers/nine/buffer9.c b/src/gallium/state_trackers/nine/buffer9.c
index ca4e438..69b08e8 100644
--- a/src/gallium/state_trackers/nine/buffer9.c
+++ b/src/gallium/state_trackers/nine/buffer9.c
@@ -121,6 +121,7 @@
     info->array_size = 1;
     info->last_level = 0;
     info->nr_samples = 0;
+    info->nr_storage_samples = 0;
 
     hr = NineResource9_ctor(&This->base, pParams, NULL, TRUE,
                             Type, Pool, Usage);
diff --git a/src/gallium/state_trackers/nine/cubetexture9.c b/src/gallium/state_trackers/nine/cubetexture9.c
index 65251ad..8982168 100644
--- a/src/gallium/state_trackers/nine/cubetexture9.c
+++ b/src/gallium/state_trackers/nine/cubetexture9.c
@@ -90,6 +90,7 @@
         info->last_level = util_logbase2(EdgeLength);
     info->array_size = 6;
     info->nr_samples = 0;
+    info->nr_storage_samples = 0;
     info->bind = PIPE_BIND_SAMPLER_VIEW;
     info->usage = PIPE_USAGE_DEFAULT;
     info->flags = 0;
diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 127f2ae..61eb5d9 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -784,6 +784,10 @@
 
     DBG("This=%p X=%d Y=%d Flags=%d\n", This, X, Y, Flags);
 
+    if (This->cursor.pos.x == X &&
+        This->cursor.pos.y == Y)
+        return;
+
     This->cursor.pos.x = X;
     This->cursor.pos.y = Y;
 
@@ -1580,6 +1584,7 @@
     user_assert(screen->is_format_supported(screen, src_res->format,
                                             src_res->target,
                                             src_res->nr_samples,
+                                            src_res->nr_storage_samples,
                                             PIPE_BIND_SAMPLER_VIEW),
                 D3DERR_INVALIDCALL);
 
@@ -1705,6 +1710,7 @@
         user_assert(screen->is_format_supported(screen, dst_res->format,
                                                 dst_res->target,
                                                 dst_res->nr_samples,
+                                                dst_res->nr_storage_samples,
                                                 zs ? PIPE_BIND_DEPTH_STENCIL :
                                                 PIPE_BIND_RENDER_TARGET),
                     D3DERR_INVALIDCALL);
@@ -3008,7 +3014,7 @@
         templ.bind = PIPE_BIND_STREAM_OUTPUT;
         templ.usage = PIPE_USAGE_STREAM;
         templ.height0 = templ.depth0 = templ.array_size = 1;
-        templ.last_level = templ.nr_samples = 0;
+        templ.last_level = templ.nr_samples = templ.nr_storage_samples = 0;
 
         resource = screen_sw->resource_create(screen_sw, &templ);
         if (!resource)
diff --git a/src/gallium/state_trackers/nine/nine_pipe.h b/src/gallium/state_trackers/nine/nine_pipe.h
index 6bd4a0c..7b68c09 100644
--- a/src/gallium/state_trackers/nine/nine_pipe.h
+++ b/src/gallium/state_trackers/nine/nine_pipe.h
@@ -201,7 +201,7 @@
 static inline boolean
 depth_stencil_format( D3DFORMAT fmt )
 {
-    static D3DFORMAT allowed[] = {
+    static const D3DFORMAT allowed[] = {
         D3DFMT_D16_LOCKABLE,
         D3DFMT_D32,
         D3DFMT_D15S1,
@@ -288,7 +288,7 @@
 
 #define format_check_internal(pipe_format) \
     screen->is_format_supported(screen, pipe_format, target, \
-                                sample_count, bindings)
+                                sample_count, sample_count, bindings)
 
 static inline enum pipe_format
 d3d9_to_pipe_format_checked(struct pipe_screen *screen,
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 422df93..f9e6b96 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -378,7 +378,7 @@
     struct sm1_src_param dst_rel[1];
     struct sm1_dst_param dst[1];
 
-    struct sm1_op_info *info;
+    const struct sm1_op_info *info;
 };
 
 static void
@@ -483,7 +483,7 @@
         struct ureg_dst a0;
         struct ureg_dst tS[8]; /* texture stage registers */
         struct ureg_dst tdst; /* scratch dst if we need extra modifiers */
-        struct ureg_dst t[5]; /* scratch TEMPs */
+        struct ureg_dst t[8]; /* scratch TEMPs */
         struct ureg_src vC[2]; /* PS color in */
         struct ureg_src vT[8]; /* PS texcoord in */
         struct ureg_dst rL[NINE_MAX_LOOP_DEPTH]; /* loop ctr */
@@ -2273,6 +2273,18 @@
     return D3D_OK;
 }
 
+DECL_SPECIAL(RCP) /* reciprocal, clamped to [-FLT_MAX, FLT_MAX] */
+{
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_dst out = tx_dst_param(tx, &tx->insn.dst[0]);
+    struct ureg_src in = tx_src_param(tx, &tx->insn.src[0]);
+    struct ureg_dst recip = tx_scratch(tx);
+    ureg_RCP(ureg, recip, in);
+    ureg_MIN(ureg, recip, ureg_imm1f(ureg, FLT_MAX), ureg_src(recip)); /* cap at +FLT_MAX */
+    ureg_MAX(ureg, out, ureg_imm1f(ureg, -FLT_MAX), ureg_src(recip)); /* floor at -FLT_MAX */
+    return D3D_OK;
+}
+
 DECL_SPECIAL(RSQ)
 {
     struct ureg_program *ureg = tx->ureg;
@@ -2901,7 +2913,7 @@
 #define _OPI(o,t,vv1,vv2,pv1,pv2,d,s,h) \
     { D3DSIO_##o, TGSI_OPCODE_##t, { vv1, vv2 }, { pv1, pv2, }, d, s, h }
 
-struct sm1_op_info inst_table[] =
+static const struct sm1_op_info inst_table[] =
 {
     _OPI(NOP, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(NOP)), /* 0 */
     _OPI(MOV, MOV, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL),
@@ -2909,7 +2921,7 @@
     _OPI(SUB, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(SUB)), /* 3 */
     _OPI(MAD, MAD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 4 */
     _OPI(MUL, MUL, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 5 */
-    _OPI(RCP, RCP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL), /* 6 */
+    _OPI(RCP, RCP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RCP)), /* 6 */
     _OPI(RSQ, RSQ, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RSQ)), /* 7 */
     _OPI(DP3, DP3, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 8 */
     _OPI(DP4, DP4, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 9 */
@@ -3008,10 +3020,10 @@
     _OPI(BREAKP, BRK,  V(0,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(BREAKP))
 };
 
-struct sm1_op_info inst_phase =
+static const struct sm1_op_info inst_phase =
     _OPI(PHASE, NOP, V(0,0), V(0,0), V(1,4), V(1,4), 0, 0, SPECIAL(PHASE));
 
-struct sm1_op_info inst_comment =
+static const struct sm1_op_info inst_comment =
     _OPI(COMMENT, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(COMMENT));
 
 static void
@@ -3279,7 +3291,7 @@
     struct sm1_instruction *insn = &tx->insn;
     HRESULT hr;
     DWORD tok;
-    struct sm1_op_info *info = NULL;
+    const struct sm1_op_info *info = NULL;
     unsigned i;
 
     sm1_parse_comments(tx, TRUE);
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index d917fa1..71aa4f4 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -104,6 +104,7 @@
     This->base.info.last_level = 0;
     This->base.info.array_size = 1;
     This->base.info.nr_samples = multisample_type;
+    This->base.info.nr_storage_samples = multisample_type;
     This->base.info.usage = PIPE_USAGE_DEFAULT;
     This->base.info.bind = PIPE_BIND_SAMPLER_VIEW; /* StretchRect */
 
@@ -242,7 +243,7 @@
     srgb_format = util_format_srgb(resource->format);
     if (srgb_format == PIPE_FORMAT_NONE ||
         !screen->is_format_supported(screen, srgb_format,
-                                     resource->target, 0, resource->bind))
+                                     resource->target, 0, 0, resource->bind))
         srgb_format = resource->format;
 
     memset(&templ, 0, sizeof(templ));
@@ -803,6 +804,7 @@
     This->desc.Width = This->base.info.width0 = resource->width0;
     This->desc.Height = This->base.info.height0 = resource->height0;
     This->base.info.nr_samples = resource->nr_samples;
+    This->base.info.nr_storage_samples = resource->nr_storage_samples;
 
     This->stride = nine_format_get_stride(This->base.info.format,
                                           This->desc.Width);
diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c
index f24a7d0..aa485a6 100644
--- a/src/gallium/state_trackers/nine/swapchain9.c
+++ b/src/gallium/state_trackers/nine/swapchain9.c
@@ -96,7 +96,7 @@
     HRESULT hr;
 
     memset(&whandle, 0, sizeof(whandle));
-    whandle.type = DRM_API_HANDLE_TYPE_FD;
+    whandle.type = WINSYS_HANDLE_TYPE_FD;
     This->screen->resource_get_handle(This->screen, pipe, resource,
                                       &whandle,
                                       for_frontbuffer_reading ?
@@ -307,6 +307,7 @@
     for (i = 0; i < newBufferCount; ++i) {
         tmplt.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
         tmplt.nr_samples = multisample_type;
+        tmplt.nr_storage_samples = multisample_type;
         if (!has_present_buffers)
             tmplt.bind |= NINE_BIND_PRESENTBUFFER_FLAGS;
         tmplt.format = d3d9_to_pipe_format_checked(This->screen,
@@ -345,6 +346,7 @@
             tmplt.format = PIPE_FORMAT_B8G8R8X8_UNORM;
             tmplt.bind = NINE_BIND_PRESENTBUFFER_FLAGS;
             tmplt.nr_samples = 0;
+            tmplt.nr_storage_samples = 0;
             if (This->actx->linear_framebuffer)
                 tmplt.bind |= PIPE_BIND_LINEAR;
             if (pParams->SwapEffect != D3DSWAPEFFECT_DISCARD)
@@ -361,6 +363,7 @@
     if (pParams->EnableAutoDepthStencil) {
         tmplt.bind = d3d9_get_pipe_depth_format_bindings(pParams->AutoDepthStencilFormat);
         tmplt.nr_samples = multisample_type;
+        tmplt.nr_storage_samples = multisample_type;
         tmplt.format = d3d9_to_pipe_format_checked(This->screen,
                                                    pParams->AutoDepthStencilFormat,
                                                    PIPE_TEXTURE_2D,
diff --git a/src/gallium/state_trackers/nine/texture9.c b/src/gallium/state_trackers/nine/texture9.c
index 78ca4ad..fca5e60 100644
--- a/src/gallium/state_trackers/nine/texture9.c
+++ b/src/gallium/state_trackers/nine/texture9.c
@@ -131,6 +131,7 @@
         info->last_level = util_logbase2(MAX2(Width, Height));
     info->array_size = 1;
     info->nr_samples = 0;
+    info->nr_storage_samples = 0;
     info->bind = PIPE_BIND_SAMPLER_VIEW;
     info->usage = PIPE_USAGE_DEFAULT;
     info->flags = 0;
diff --git a/src/gallium/state_trackers/nine/threadpool.c b/src/gallium/state_trackers/nine/threadpool.c
index cc62fd2..19721aa 100644
--- a/src/gallium/state_trackers/nine/threadpool.c
+++ b/src/gallium/state_trackers/nine/threadpool.c
@@ -37,6 +37,7 @@
 #include "os/os_thread.h"
 #include "threadpool.h"
 
+/* POSIX thread function */
 static void *
 threadpool_worker(void *data)
 {
@@ -76,6 +77,15 @@
     return NULL;
 }
 
+/* Windows thread function: runs the POSIX worker and always reports 0. */
+static DWORD NINE_WINAPI
+wthreadpool_worker(void *data)
+{
+    /* The worker's pointer result has no use as a DWORD; discard it. */
+    (void)threadpool_worker(data);
+    return 0;
+}
+
 struct threadpool *
 _mesa_threadpool_create(struct NineSwapChain9 *swapchain)
 {
@@ -87,7 +97,9 @@
     pthread_mutex_init(&pool->m, NULL);
     pthread_cond_init(&pool->new_work, NULL);
 
-    pool->wthread = NineSwapChain9_CreateThread(swapchain, threadpool_worker, pool);
+    /* This uses WINE's CreateThread, so the thread function needs to use
+     * the Windows ABI */
+    pool->wthread = NineSwapChain9_CreateThread(swapchain, wthreadpool_worker, pool);
     if (!pool->wthread) {
         /* using pthread as fallback */
         pthread_create(&pool->pthread, NULL, threadpool_worker, pool);
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index 62af3e6..ec811ae 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -92,6 +92,7 @@
     This->info.last_level = 0;
     This->info.array_size = 1;
     This->info.nr_samples = 0;
+    This->info.nr_storage_samples = 0;
     This->info.usage = PIPE_USAGE_DEFAULT;
     This->info.bind = PIPE_BIND_SAMPLER_VIEW;
     This->info.flags = 0;
diff --git a/src/gallium/state_trackers/nine/volumetexture9.c b/src/gallium/state_trackers/nine/volumetexture9.c
index c836dd2..5dec484 100644
--- a/src/gallium/state_trackers/nine/volumetexture9.c
+++ b/src/gallium/state_trackers/nine/volumetexture9.c
@@ -88,6 +88,7 @@
         info->last_level = util_logbase2(MAX2(MAX2(Width, Height), Depth));
     info->array_size = 1;
     info->nr_samples = 0;
+    info->nr_storage_samples = 0;
     info->bind = PIPE_BIND_SAMPLER_VIEW;
     info->usage = PIPE_USAGE_DEFAULT;
     info->flags = 0;
diff --git a/src/gallium/state_trackers/omx/tizonia/Makefile.am b/src/gallium/state_trackers/omx/tizonia/Makefile.am
index 0eac85a..ac72aff 100644
--- a/src/gallium/state_trackers/omx/tizonia/Makefile.am
+++ b/src/gallium/state_trackers/omx/tizonia/Makefile.am
@@ -29,7 +29,6 @@
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/egl \
 	-I$(top_srcdir)/src/egl/drivers/dri2 \
-	-I$(top_srcdir)/src/egl/wayland/wayland-egl \
 	-I$(top_srcdir)/src/egl/main \
 	-I$(top_srcdir)/src/gbm/main \
 	-I$(top_srcdir)/src/loader \
@@ -38,6 +37,7 @@
 	-I$(top_srcdir)/src/gallium/state_trackers/omx \
 	$(GALLIUM_CFLAGS) \
 	$(LIBDRM_CFLAGS) \
+	$(WAYLAND_EGL_CFLAGS) \
 	$(VISIBILITY_CFLAGS) \
 	$(VL_CFLAGS) \
 	$(XCB_DRI3_CFLAGS) \
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index deaeb19..42ec973 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -305,7 +305,7 @@
          drv->pipe->flush(drv->pipe, NULL, 0);
 
          memset(&whandle, 0, sizeof(whandle));
-         whandle.type = DRM_API_HANDLE_TYPE_FD;
+         whandle.type = WINSYS_HANDLE_TYPE_FD;
 
          if (!screen->resource_get_handle(screen, drv->pipe,
                                           buf->derived_surface.resource,
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index 3f892c9..807fc83 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -353,6 +353,23 @@
       return VA_STATUS_ERROR_INVALID_IMAGE;
    }
 
+   if (x < 0 || y < 0) {
+      mtx_unlock(&drv->mutex);
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+   }
+
+   if (x + width > surf->templat.width ||
+       y + height > surf->templat.height) {
+      mtx_unlock(&drv->mutex);
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+   }
+
+   if (width > vaimage->width ||
+       height > vaimage->height) {
+      mtx_unlock(&drv->mutex);
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+   }
+
    img_buf = handle_table_get(drv->htab, vaimage->buf);
    if (!img_buf) {
       mtx_unlock(&drv->mutex);
@@ -400,11 +417,19 @@
    }
 
    for (i = 0; i < vaimage->num_planes; i++) {
-      unsigned width, height;
+      unsigned box_w = align(width, 2);
+      unsigned box_h = align(height, 2);
+      unsigned box_x = x & ~1;
+      unsigned box_y = y & ~1;
       if (!views[i]) continue;
-      vlVaVideoSurfaceSize(surf, i, &width, &height);
+      vl_video_buffer_adjust_size(&box_w, &box_h, i,
+                                  surf->templat.chroma_format,
+                                  surf->templat.interlaced);
+      vl_video_buffer_adjust_size(&box_x, &box_y, i,
+                                  surf->templat.chroma_format,
+                                  surf->templat.interlaced);
       for (j = 0; j < views[i]->texture->array_size; ++j) {
-         struct pipe_box box = {0, 0, j, width, height, 1};
+         struct pipe_box box = {box_x, box_y, j, box_w, box_h, 1};
          struct pipe_transfer *transfer;
          uint8_t *map;
          map = drv->pipe->transfer_map(drv->pipe, views[i]->texture, 0,
diff --git a/src/gallium/state_trackers/va/subpicture.c b/src/gallium/state_trackers/va/subpicture.c
index 981a99c..b213e9a 100644
--- a/src/gallium/state_trackers/va/subpicture.c
+++ b/src/gallium/state_trackers/va/subpicture.c
@@ -218,7 +218,7 @@
    tex_temp.flags = 0;
    if (!drv->pipe->screen->is_format_supported(
           drv->pipe->screen, tex_temp.format, tex_temp.target,
-          tex_temp.nr_samples, tex_temp.bind)) {
+          tex_temp.nr_samples, tex_temp.nr_storage_samples, tex_temp.bind)) {
       mtx_unlock(&drv->mutex);
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
    }
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8604136..d693139 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -525,71 +525,82 @@
 }
 
 static VAStatus
-suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
-                            VASurfaceAttribExternalBuffers *memory_attibute,
-                            unsigned index, struct pipe_video_buffer *templat)
+surface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
+                             VASurfaceAttribExternalBuffers *memory_attribute,
+                             unsigned index, struct pipe_video_buffer *templat)
 {
    vlVaDriver *drv;
    struct pipe_screen *pscreen;
-   struct pipe_resource *resource;
    struct pipe_resource res_templ;
    struct winsys_handle whandle;
    struct pipe_resource *resources[VL_NUM_COMPONENTS];
+   const enum pipe_format *resource_formats = NULL;
+   VAStatus result;
+   int i;
 
    pscreen = VL_VA_PSCREEN(ctx);
    drv = VL_VA_DRIVER(ctx);
 
-   if (!memory_attibute || !memory_attibute->buffers ||
-       index > memory_attibute->num_buffers)
+   if (!memory_attribute || !memory_attribute->buffers ||
+       index > memory_attribute->num_buffers)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-   if (surface->templat.width != memory_attibute->width ||
-       surface->templat.height != memory_attibute->height ||
-       memory_attibute->num_planes < 1)
+   if (surface->templat.width != memory_attribute->width ||
+       surface->templat.height != memory_attribute->height ||
+       memory_attribute->num_planes < 1)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-   switch (memory_attibute->pixel_format) {
-   case VA_FOURCC_RGBA:
-   case VA_FOURCC_RGBX:
-   case VA_FOURCC_BGRA:
-   case VA_FOURCC_BGRX:
-      if (memory_attibute->num_planes != 1)
-         return VA_STATUS_ERROR_INVALID_PARAMETER;
-      break;
-   default:
+   if (memory_attribute->num_planes > VL_NUM_COMPONENTS)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
-   }
+
+   resource_formats = vl_video_buffer_formats(pscreen, templat->buffer_format);
+   if (!resource_formats)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
 
    memset(&res_templ, 0, sizeof(res_templ));
    res_templ.target = PIPE_TEXTURE_2D;
    res_templ.last_level = 0;
    res_templ.depth0 = 1;
    res_templ.array_size = 1;
-   res_templ.width0 = memory_attibute->width;
-   res_templ.height0 = memory_attibute->height;
-   res_templ.format = surface->templat.buffer_format;
+   res_templ.width0 = memory_attribute->width;
+   res_templ.height0 = memory_attribute->height;
    res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
    res_templ.usage = PIPE_USAGE_DEFAULT;
 
    memset(&whandle, 0, sizeof(struct winsys_handle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
-   whandle.handle = memory_attibute->buffers[index];
-   whandle.stride = memory_attibute->pitches[index];
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
+   whandle.handle = memory_attribute->buffers[index];
 
-   resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle,
-                                            PIPE_HANDLE_USAGE_READ_WRITE);
-
-   if (!resource)
-      return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
+   // Create a resource for each plane.
    memset(resources, 0, sizeof resources);
-   resources[0] = resource;
+   for (i = 0; i < memory_attribute->num_planes; i++) {
+      res_templ.format = resource_formats[i];
+      if (res_templ.format == PIPE_FORMAT_NONE) {
+         result = VA_STATUS_ERROR_INVALID_PARAMETER;
+         goto fail;
+      }
+
+      whandle.stride = memory_attribute->pitches[i];
+      whandle.offset = memory_attribute->offsets[i];
+      resources[i] = pscreen->resource_from_handle(pscreen, &res_templ, &whandle,
+                                                   PIPE_HANDLE_USAGE_READ_WRITE);
+      if (!resources[i]) {
+         result = VA_STATUS_ERROR_ALLOCATION_FAILED;
+         goto fail;
+      }
+   }
 
    surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
-   if (!surface->buffer)
-      return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
+   if (!surface->buffer) {
+      result = VA_STATUS_ERROR_ALLOCATION_FAILED;
+      goto fail;
+   }
    return VA_STATUS_SUCCESS;
+
+fail:
+   for (i = 0; i < VL_NUM_COMPONENTS; i++)
+      pipe_resource_reference(&resources[i], NULL);
+   return result;
 }
 
 VAStatus
@@ -629,7 +640,7 @@
                     VASurfaceAttrib *attrib_list, unsigned int num_attribs)
 {
    vlVaDriver *drv;
-   VASurfaceAttribExternalBuffers *memory_attibute;
+   VASurfaceAttribExternalBuffers *memory_attribute;
    struct pipe_video_buffer templat;
    struct pipe_screen *pscreen;
    int i;
@@ -655,7 +666,7 @@
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    /* Default. */
-   memory_attibute = NULL;
+   memory_attribute = NULL;
    memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
    expected_fourcc = 0;
 
@@ -687,7 +698,7 @@
           (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
          if (attrib_list[i].value.type != VAGenericValueTypePointer)
             return VA_STATUS_ERROR_INVALID_PARAMETER;
-         memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
+         memory_attribute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
       }
    }
 
@@ -703,10 +714,10 @@
    case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
       break;
    case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-      if (!memory_attibute)
+      if (!memory_attribute)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
 
-      expected_fourcc = memory_attibute->pixel_format;
+      expected_fourcc = memory_attribute->pixel_format;
       break;
    default:
       assert(0);
@@ -730,7 +741,7 @@
    if (expected_fourcc) {
       enum pipe_format expected_format = VaFourccToPipeFormat(expected_fourcc);
 
-      if (expected_format != templat.buffer_format || memory_attibute)
+      if (expected_format != templat.buffer_format || memory_attribute)
         templat.interlaced = 0;
 
       templat.buffer_format = expected_format;
@@ -757,10 +768,10 @@
       case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
          /* The application will clear the TILING flag when the surface is
           * intended to be exported as dmabuf. Adding shared flag because not
-          * null memory_attibute means VASurfaceAttribExternalBuffers is used.
+          * null memory_attribute means VASurfaceAttribExternalBuffers is used.
           */
-         if (memory_attibute &&
-             !(memory_attibute->flags & VA_SURFACE_EXTBUF_DESC_ENABLE_TILING))
+         if (memory_attribute &&
+             !(memory_attribute->flags & VA_SURFACE_EXTBUF_DESC_ENABLE_TILING))
             templat.bind = PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
 
 	 vaStatus = vlVaHandleSurfaceAllocate(drv, surf, &templat);
@@ -769,7 +780,7 @@
          break;
 
       case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-         vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, &templat);
+         vaStatus = surface_from_external_memory(ctx, surf, memory_attribute, i, &templat);
          if (vaStatus != VA_STATUS_SUCCESS)
             goto free_surf;
          break;
@@ -1037,7 +1048,7 @@
       }
 
       memset(&whandle, 0, sizeof(whandle));
-      whandle.type = DRM_API_HANDLE_TYPE_FD;
+      whandle.type = WINSYS_HANDLE_TYPE_FD;
 
       if (!screen->resource_get_handle(screen, drv->pipe, resource,
                                        &whandle, usage)) {
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 8ef8268..6ef7a40 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -805,7 +805,7 @@
    vlsurface->device->context->flush(vlsurface->device->context, NULL, 0);
 
    memset(&whandle, 0, sizeof(struct winsys_handle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
 
    pscreen = vlsurface->surface->texture->screen;
    if (!pscreen->resource_get_handle(pscreen, vlsurface->device->context,
diff --git a/src/gallium/state_trackers/vdpau/query.c b/src/gallium/state_trackers/vdpau/query.c
index 6b8b5a6..2c4ebe6 100644
--- a/src/gallium/state_trackers/vdpau/query.c
+++ b/src/gallium/state_trackers/vdpau/query.c
@@ -247,7 +247,7 @@
    mtx_lock(&dev->mutex);
    *is_supported = pscreen->is_format_supported
    (
-      pscreen, format, PIPE_TEXTURE_3D, 1,
+      pscreen, format, PIPE_TEXTURE_3D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET
    );
    if (*is_supported) {
@@ -299,7 +299,7 @@
    mtx_lock(&dev->mutex);
    *is_supported = pscreen->is_format_supported
    (
-      pscreen, format, PIPE_TEXTURE_2D, 1,
+      pscreen, format, PIPE_TEXTURE_2D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET
    );
    mtx_unlock(&dev->mutex);
@@ -348,19 +348,19 @@
    mtx_lock(&dev->mutex);
    *is_supported = pscreen->is_format_supported
    (
-      pscreen, rgba_format, PIPE_TEXTURE_2D, 1,
+      pscreen, rgba_format, PIPE_TEXTURE_2D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET
    );
 
    *is_supported &= pscreen->is_format_supported
    (
-      pscreen, index_format, PIPE_TEXTURE_2D, 1,
+      pscreen, index_format, PIPE_TEXTURE_2D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW
    );
 
    *is_supported &= pscreen->is_format_supported
    (
-      pscreen, colortbl_format, PIPE_TEXTURE_1D, 1,
+      pscreen, colortbl_format, PIPE_TEXTURE_1D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW
    );
    mtx_unlock(&dev->mutex);
@@ -403,7 +403,7 @@
    mtx_lock(&dev->mutex);
    *is_supported = pscreen->is_format_supported
    (
-      pscreen, rgba_format, PIPE_TEXTURE_2D, 1,
+      pscreen, rgba_format, PIPE_TEXTURE_2D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET
    );
 
@@ -447,7 +447,7 @@
    mtx_lock(&dev->mutex);
    *is_supported = pscreen->is_format_supported
    (
-      pscreen, format, PIPE_TEXTURE_3D, 1,
+      pscreen, format, PIPE_TEXTURE_3D, 1, 1,
       PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET
    );
    if (*is_supported) {
diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index 012d303..95bab87 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -526,7 +526,7 @@
    }
 
    memset(&whandle, 0, sizeof(struct winsys_handle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
    whandle.layer = surf->u.tex.first_layer;
 
    pscreen = surf->texture->screen;
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index b36c0c4..420573b 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -343,8 +343,9 @@
 CheckSurfaceParams(struct pipe_screen *screen,
                    const struct pipe_resource *templ)
 {
-   return screen->is_format_supported(
-         screen, templ->format, templ->target, templ->nr_samples, templ->bind);
+   return screen->is_format_supported(screen, templ->format, templ->target,
+                                      templ->nr_samples,
+                                      templ->nr_storage_samples, templ->bind);
 }
 
 typedef struct
diff --git a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
index d709faa..02ccb76 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
@@ -101,45 +101,47 @@
       return 0;
    }
 
-   for (piAttrib = piAttribList; *piAttrib; piAttrib++) {
-      switch (*piAttrib) {
-      case WGL_PBUFFER_LARGEST_ARB:
-         piAttrib++;
-         useLargest = *piAttrib;
-         break;
-       case WGL_TEXTURE_FORMAT_ARB:
-          /* WGL_ARB_render_texture */
-          piAttrib++;
-          textureFormat = *piAttrib;
-          if (textureFormat != WGL_TEXTURE_RGB_ARB &&
-             textureFormat != WGL_TEXTURE_RGBA_ARB &&
-             textureFormat != WGL_NO_TEXTURE_ARB) {
-             SetLastError(ERROR_INVALID_DATA);
-             return 0;
-          }
-          break;
-       case WGL_TEXTURE_TARGET_ARB:
-          /* WGL_ARB_render_texture */
-          piAttrib++;
-          textureTarget = *piAttrib;
-          if (textureTarget != WGL_TEXTURE_CUBE_MAP_ARB &&
-              textureTarget != WGL_TEXTURE_1D_ARB &&
-              textureTarget != WGL_TEXTURE_2D_ARB &&
-              textureTarget != WGL_NO_TEXTURE_ARB) {
-             SetLastError(ERROR_INVALID_DATA);
-             return 0;
-          }
-          break;
-      case WGL_MIPMAP_TEXTURE_ARB:
-         /* WGL_ARB_render_texture */
-         piAttrib++;
-         textureMipmap = !!*piAttrib;
-         break;
-      default:
-         SetLastError(ERROR_INVALID_DATA);
-         debug_printf("wgl: Unsupported attribute 0x%x in %s\n",
-                      *piAttrib, __func__);
-         return 0;
+   if (piAttribList) {
+      for (piAttrib = piAttribList; *piAttrib; piAttrib++) {
+         switch (*piAttrib) {
+         case WGL_PBUFFER_LARGEST_ARB:
+            piAttrib++;
+            useLargest = *piAttrib;
+            break;
+          case WGL_TEXTURE_FORMAT_ARB:
+             /* WGL_ARB_render_texture */
+             piAttrib++;
+             textureFormat = *piAttrib;
+             if (textureFormat != WGL_TEXTURE_RGB_ARB &&
+                textureFormat != WGL_TEXTURE_RGBA_ARB &&
+                textureFormat != WGL_NO_TEXTURE_ARB) {
+                SetLastError(ERROR_INVALID_DATA);
+                return 0;
+             }
+             break;
+          case WGL_TEXTURE_TARGET_ARB:
+             /* WGL_ARB_render_texture */
+             piAttrib++;
+             textureTarget = *piAttrib;
+             if (textureTarget != WGL_TEXTURE_CUBE_MAP_ARB &&
+                 textureTarget != WGL_TEXTURE_1D_ARB &&
+                 textureTarget != WGL_TEXTURE_2D_ARB &&
+                 textureTarget != WGL_NO_TEXTURE_ARB) {
+                SetLastError(ERROR_INVALID_DATA);
+                return 0;
+             }
+             break;
+         case WGL_MIPMAP_TEXTURE_ARB:
+            /* WGL_ARB_render_texture */
+            piAttrib++;
+            textureMipmap = !!*piAttrib;
+            break;
+         default:
+            SetLastError(ERROR_INVALID_DATA);
+            debug_printf("wgl: Unsupported attribute 0x%x in %s\n",
+                         *piAttrib, __func__);
+            return 0;
+         }
       }
    }
 
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index 833308d..d4e7a94 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -254,7 +254,7 @@
 
       for (cfmt = 0; cfmt < num_color_formats; cfmt++) {
          if (!screen->is_format_supported(screen, color_formats[cfmt].format,
-                                          PIPE_TEXTURE_2D, samples,
+                                          PIPE_TEXTURE_2D, samples, samples,
                                           bind_flags)) {
             continue;
          }
@@ -267,6 +267,7 @@
 
                if (!screen->is_format_supported(screen, depth->format,
                                                 PIPE_TEXTURE_2D, samples,
+                                                samples,
                                                 PIPE_BIND_DEPTH_STENCIL)) {
                   continue;
                }
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index 7cf18f0..2445c33 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -95,6 +95,7 @@
    templ.array_size = 1;
    templ.last_level = 0;
    templ.nr_samples = stwfb->stvis.samples;
+   templ.nr_storage_samples = stwfb->stvis.samples;;
 
    for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
       enum pipe_format format;
diff --git a/src/gallium/state_trackers/xa/meson.build b/src/gallium/state_trackers/xa/meson.build
index 109abc1..aff3963 100644
--- a/src/gallium/state_trackers/xa/meson.build
+++ b/src/gallium/state_trackers/xa/meson.build
@@ -18,7 +18,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-xa_version = ['2', '3', '0']
+xa_version = ['2', '4', '0']
 
 xa_conf = configuration_data()
 xa_conf.set('XA_MAJOR', xa_version[0])
diff --git a/src/gallium/state_trackers/xa/xa_context.c b/src/gallium/state_trackers/xa/xa_context.c
index 1f47170..ba22087 100644
--- a/src/gallium/state_trackers/xa/xa_context.c
+++ b/src/gallium/state_trackers/xa/xa_context.c
@@ -201,7 +201,7 @@
     }
 
     if (!screen->is_format_supported(screen,  dst->tex->format,
-				     PIPE_TEXTURE_2D, 0,
+				     PIPE_TEXTURE_2D, 0, 0,
 				     PIPE_BIND_RENDER_TARGET))
 	return -XA_ERR_INVAL;
 
diff --git a/src/gallium/state_trackers/xa/xa_renderer.c b/src/gallium/state_trackers/xa/xa_renderer.c
index 27497d3..e331123 100644
--- a/src/gallium/state_trackers/xa/xa_renderer.c
+++ b/src/gallium/state_trackers/xa/xa_renderer.c
@@ -417,7 +417,7 @@
     uint32_t fs_traits = FS_COMPOSITE;
 
     assert(screen->is_format_supported(screen, dst_surface->format,
-				       PIPE_TEXTURE_2D, 0,
+				       PIPE_TEXTURE_2D, 0, 0,
 				       PIPE_BIND_RENDER_TARGET));
     (void)screen;
 
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index 03a3abf..c046a3a 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -104,7 +104,7 @@
 	break;
     case xa_format_a8:
         if (xa->screen->is_format_supported(xa->screen, PIPE_FORMAT_R8_UNORM,
-                                            PIPE_TEXTURE_2D, 0,
+                                            PIPE_TEXTURE_2D, 0, 0,
                                             stype_bind[xa_type_a] |
                                             PIPE_BIND_RENDER_TARGET))
             fdesc.format = PIPE_FORMAT_R8_UNORM;
@@ -134,7 +134,7 @@
 	break;
     case xa_format_yuv8:
         if (xa->screen->is_format_supported(xa->screen, PIPE_FORMAT_R8_UNORM,
-                                            PIPE_TEXTURE_2D, 0,
+                                            PIPE_TEXTURE_2D, 0, 0,
                                             stype_bind[xa_type_yuv_component]))
             fdesc.format = PIPE_FORMAT_R8_UNORM;
         else
@@ -196,7 +196,7 @@
                 xa_get_pipe_format(xa, xa_format);
 
 	    if (xa->screen->is_format_supported(xa->screen, fdesc.format,
-						PIPE_TEXTURE_2D, 0, bind)) {
+						PIPE_TEXTURE_2D, 0, 0, bind)) {
 		if (xa->format_map[stype][0] == 0)
 		    xa->format_map[stype][0] = num_formats;
 		xa->format_map[stype][1] = num_formats;
@@ -300,7 +300,7 @@
 	bind |= PIPE_BIND_SCANOUT;
 
     if (!xa->screen->is_format_supported(xa->screen, fdesc.format,
-					 PIPE_TEXTURE_2D, 0, bind))
+					 PIPE_TEXTURE_2D, 0, 0, bind))
 	return -XA_ERR_INVAL;
 
     return XA_ERR_NONE;
@@ -311,12 +311,12 @@
 {
     switch (type) {
     case xa_handle_type_kms:
-	return DRM_API_HANDLE_TYPE_KMS;
+	return WINSYS_HANDLE_TYPE_KMS;
     case xa_handle_type_fd:
-        return DRM_API_HANDLE_TYPE_FD;
+        return WINSYS_HANDLE_TYPE_FD;
     case xa_handle_type_shared:
     default:
-	return DRM_API_HANDLE_TYPE_SHARED;
+	return WINSYS_HANDLE_TYPE_SHARED;
     }
 }
 
@@ -404,7 +404,7 @@
 		  uint32_t handle, uint32_t stride)
 {
     return xa_surface_from_handle2(xa, width, height, depth, stype, xa_format,
-                                   DRM_API_HANDLE_TYPE_SHARED, flags, handle,
+                                   WINSYS_HANDLE_TYPE_SHARED, flags, handle,
                                    stride);
 }
 
@@ -470,7 +470,7 @@
 	    return -XA_ERR_INVAL;
 
 	if (!xa->screen->is_format_supported(xa->screen, fdesc.format,
-					     PIPE_TEXTURE_2D, 0,
+					     PIPE_TEXTURE_2D, 0, 0,
 					     template->bind |
 					     PIPE_BIND_RENDER_TARGET))
 	    return -XA_ERR_INVAL;
diff --git a/src/gallium/state_trackers/xvmc/subpicture.c b/src/gallium/state_trackers/xvmc/subpicture.c
index bc26976..7a6dc89 100644
--- a/src/gallium/state_trackers/xvmc/subpicture.c
+++ b/src/gallium/state_trackers/xvmc/subpicture.c
@@ -62,14 +62,14 @@
    case FOURCC_AI44:
       ret = PIPE_FORMAT_R4A4_UNORM;
       if (!screen->is_format_supported(
-                screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW))
+                screen, ret, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW))
          ret = PIPE_FORMAT_B4G4R4A4_UNORM;
       break;
 
    case FOURCC_IA44:
       ret = PIPE_FORMAT_A4R4_UNORM;
       if (!screen->is_format_supported(
-                screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW))
+                screen, ret, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW))
          ret = PIPE_FORMAT_B4G4R4A4_UNORM;
       break;
 
@@ -79,7 +79,7 @@
    }
 
    if (!screen->is_format_supported(
-             screen, ret, PIPE_TEXTURE_2D, 0, PIPE_BIND_SAMPLER_VIEW)) {
+             screen, ret, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW)) {
       XVMC_MSG(XVMC_ERR, "[XvMC] Unsupported 2D format %s for Xv image ID 0x%08X.\n", util_format_name(ret), xvimage_id);
       ret = PIPE_FORMAT_NONE;
    }
@@ -125,7 +125,7 @@
       component_order[2] = 'V';
       component_order[3] = 'A';
       if (!screen->is_format_supported(
-                screen, *palette_format, PIPE_TEXTURE_1D, 0,
+                screen, *palette_format, PIPE_TEXTURE_1D, 0, 0,
                 PIPE_BIND_SAMPLER_VIEW)) {
          /* One of these formats better be supported... */
          *palette_format = PIPE_FORMAT_B8G8R8X8_UNORM;
diff --git a/src/gallium/targets/d3dadapter9/drm.c b/src/gallium/targets/d3dadapter9/drm.c
index 9c5bd8a..507a267 100644
--- a/src/gallium/targets/d3dadapter9/drm.c
+++ b/src/gallium/targets/d3dadapter9/drm.c
@@ -107,7 +107,7 @@
     if (drm->dev)
         pipe_loader_release(&drm->dev, 1);
 
-    close(drm->fd);
+    /* The pipe loader takes ownership of the fd */
     FREE(ctx);
 }
 
diff --git a/src/gallium/targets/d3dadapter9/meson.build b/src/gallium/targets/d3dadapter9/meson.build
index bd05b4f9..bc72b11 100644
--- a/src/gallium/targets/d3dadapter9/meson.build
+++ b/src/gallium/targets/d3dadapter9/meson.build
@@ -53,7 +53,7 @@
     libswkmsdri,
   ],
   dependencies : [
-    dep_selinux, dep_expat, dep_libdrm, dep_llvm,
+    dep_selinux, dep_expat, dep_libdrm, dep_llvm, dep_thread,
     driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
     driver_i915, driver_svga,
   ],
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index a0778b6..a857b51 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -76,8 +76,8 @@
 
 include $(top_srcdir)/src/gallium/drivers/tegra/Automake.inc
 
+include $(top_srcdir)/src/gallium/drivers/v3d/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/vc4/Automake.inc
-include $(top_srcdir)/src/gallium/drivers/vc5/Automake.inc
 include $(top_srcdir)/src/gallium/drivers/pl111/Automake.inc
 
 include $(top_srcdir)/src/gallium/drivers/virgl/Automake.inc
diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
index bdbd458..e3202c9 100644
--- a/src/gallium/targets/dri/meson.build
+++ b/src/gallium/targets/dri/meson.build
@@ -56,7 +56,7 @@
   dependencies : [
     dep_selinux, dep_expat, dep_libdrm, dep_llvm, dep_thread,
     driver_swrast, driver_r300, driver_r600, driver_radeonsi, driver_nouveau,
-    driver_pl111, driver_vc4, driver_vc5, driver_freedreno, driver_etnaviv,
+    driver_pl111, driver_v3d, driver_vc4, driver_freedreno, driver_etnaviv,
     driver_imx, driver_tegra, driver_i915, driver_svga, driver_virgl,
     driver_swr,
   ],
@@ -68,8 +68,8 @@
              [with_gallium_freedreno, ['msm_dri.so', 'kgsl_dri.so']],
              [with_gallium_softpipe or with_gallium_swr, 'swrast_dri.so'],
              [with_gallium_softpipe and with_gallium_drisw_kms, 'kms_swrast_dri.so'],
+             [with_gallium_v3d, 'v3d_dri.so'],
              [with_gallium_vc4, 'vc4_dri.so'],
-             [with_gallium_vc5, 'vc5_dri.so'],
              [with_gallium_etnaviv, 'etnaviv_dri.so'],
              [with_gallium_imx, 'imx-drm_dri.so'],
              [with_gallium_tegra, 'tegra_dri.so'],
diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
index e09e776..835d125 100644
--- a/src/gallium/targets/dri/target.c
+++ b/src/gallium/targets/dri/target.c
@@ -71,6 +71,10 @@
 DEFINE_LOADER_DRM_ENTRYPOINT(virtio_gpu)
 #endif
 
+#if defined(GALLIUM_V3D)
+DEFINE_LOADER_DRM_ENTRYPOINT(v3d)
+#endif
+
 #if defined(GALLIUM_VC4)
 DEFINE_LOADER_DRM_ENTRYPOINT(vc4)
 #if defined(GALLIUM_PL111)
@@ -78,10 +82,6 @@
 #endif
 #endif
 
-#if defined(GALLIUM_VC5)
-DEFINE_LOADER_DRM_ENTRYPOINT(vc5)
-#endif
-
 #if defined(GALLIUM_ETNAVIV)
 DEFINE_LOADER_DRM_ENTRYPOINT(imx_drm)
 DEFINE_LOADER_DRM_ENTRYPOINT(etnaviv)
diff --git a/src/gallium/tests/graw/clear.c b/src/gallium/tests/graw/clear.c
index 45b0cc0..2a08ae1 100644
--- a/src/gallium/tests/graw/clear.c
+++ b/src/gallium/tests/graw/clear.c
@@ -73,7 +73,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/fs-test.c b/src/gallium/tests/graw/fs-test.c
index d1ade1d..cc87b02 100644
--- a/src/gallium/tests/graw/fs-test.c
+++ b/src/gallium/tests/graw/fs-test.c
@@ -301,7 +301,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_SAMPLER_VIEW;
 
    
@@ -411,7 +410,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/graw_util.h b/src/gallium/tests/graw/graw_util.h
index 36064e1..c919ec7 100644
--- a/src/gallium/tests/graw/graw_util.h
+++ b/src/gallium/tests/graw/graw_util.h
@@ -77,7 +77,6 @@
       resource_temp.depth0 = 1;
       resource_temp.array_size = 1;
       resource_temp.last_level = 0;
-      resource_temp.nr_samples = 1;
       resource_temp.bind = (PIPE_BIND_RENDER_TARGET |
                             PIPE_BIND_DISPLAY_TARGET);
       info->color_buf[i] = info->screen->resource_create(info->screen,
@@ -109,7 +108,6 @@
    resource_temp.depth0 = 1;
    resource_temp.array_size = 1;
    resource_temp.last_level = 0;
-   resource_temp.nr_samples = 1;
    resource_temp.bind = PIPE_BIND_DEPTH_STENCIL;
    info->zs_buf = info->screen->resource_create(info->screen, &resource_temp);
    if (!info->zs_buf) {
@@ -233,7 +231,6 @@
    temp.depth0 = 1;
    temp.last_level = 0;
    temp.array_size = 1;
-   temp.nr_samples = 1;
    temp.bind = PIPE_BIND_SAMPLER_VIEW;
    
    tex = info->screen->resource_create(info->screen, &temp);
diff --git a/src/gallium/tests/graw/gs-test.c b/src/gallium/tests/graw/gs-test.c
index dad3298..9c3c29b 100644
--- a/src/gallium/tests/graw/gs-test.c
+++ b/src/gallium/tests/graw/gs-test.c
@@ -158,7 +158,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_CONSTANT_BUFFER;
 
    constbuf1 = screen->resource_create(screen, &templat);
@@ -392,7 +391,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_SAMPLER_VIEW;
 
    
@@ -502,7 +500,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/quad-sample.c b/src/gallium/tests/graw/quad-sample.c
index 7917420..d532e60 100644
--- a/src/gallium/tests/graw/quad-sample.c
+++ b/src/gallium/tests/graw/quad-sample.c
@@ -216,7 +216,6 @@
    templat.height0 = SIZE;
    templat.depth0 = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_SAMPLER_VIEW;
 
    
@@ -326,7 +325,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/shader-leak.c b/src/gallium/tests/graw/shader-leak.c
index fb4344c..4178448 100644
--- a/src/gallium/tests/graw/shader-leak.c
+++ b/src/gallium/tests/graw/shader-leak.c
@@ -199,7 +199,6 @@
    templat.height0 = HEIGHT;
    templat.depth0 = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/tri-gs.c b/src/gallium/tests/graw/tri-gs.c
index 2ca36ce..5efc9e8 100644
--- a/src/gallium/tests/graw/tri-gs.c
+++ b/src/gallium/tests/graw/tri-gs.c
@@ -207,7 +207,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/tri-instanced.c b/src/gallium/tests/graw/tri-instanced.c
index 6c6783c..9bd2ff5 100644
--- a/src/gallium/tests/graw/tri-instanced.c
+++ b/src/gallium/tests/graw/tri-instanced.c
@@ -258,7 +258,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/graw/vs-test.c b/src/gallium/tests/graw/vs-test.c
index e3b50ea..26976f4 100644
--- a/src/gallium/tests/graw/vs-test.c
+++ b/src/gallium/tests/graw/vs-test.c
@@ -90,7 +90,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_CONSTANT_BUFFER;
 
    constbuf = screen->resource_create(screen,
@@ -290,7 +289,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = PIPE_BIND_SAMPLER_VIEW;
 
    
@@ -400,7 +398,6 @@
    templat.depth0 = 1;
    templat.array_size = 1;
    templat.last_level = 0;
-   templat.nr_samples = 1;
    templat.bind = (PIPE_BIND_RENDER_TARGET |
                    PIPE_BIND_DISPLAY_TARGET);
    
diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c
index 443451e..afe5d3e 100644
--- a/src/gallium/tests/trivial/compute.c
+++ b/src/gallium/tests/trivial/compute.c
@@ -1131,7 +1131,7 @@
                 printf("   - %s\n", util_format_name(surface_fmts[i]));
 
                 if (!ctx->screen->is_format_supported(ctx->screen,
-                       surface_fmts[i], PIPE_TEXTURE_2D, 1,
+                       surface_fmts[i], PIPE_TEXTURE_2D, 1, 1,
                        PIPE_BIND_COMPUTE_RESOURCE)) {
                    printf("(unsupported)\n");
                    continue;
@@ -1251,7 +1251,7 @@
                 printf("   - %s\n", util_format_name(surface_fmts[i]));
 
                 if (!ctx->screen->is_format_supported(ctx->screen,
-                       surface_fmts[i], PIPE_TEXTURE_2D, 1,
+                       surface_fmts[i], PIPE_TEXTURE_2D, 1, 1,
                        PIPE_BIND_COMPUTE_RESOURCE)) {
                    printf("(unsupported)\n");
                    continue;
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index df0e130..1f29306 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -27,8 +27,8 @@
 #define USE_TRACE 0
 #define WIDTH 300
 #define HEIGHT 300
-#define NEAR 30
-#define FAR 1000
+#define NEAR 0
+#define FAR 1
 #define FLIP 0
 
 /* pipe_*_state structs */
@@ -174,6 +174,7 @@
 		memset(&box, 0, sizeof(box));
 		box.width = 2;
 		box.height = 2;
+		box.depth = 1;
 
 		ptr = p->pipe->transfer_map(p->pipe, p->tex, 0, PIPE_TRANSFER_WRITE, &box, &t);
 		ptr[0] = 0xffff0000;
@@ -226,7 +227,7 @@
 	{
 		float x = 0;
 		float y = 0;
-		float z = FAR;
+		float z = NEAR;
 		float half_width = (float)WIDTH / 2.0f;
 		float half_height = (float)HEIGHT / 2.0f;
 		float half_depth = ((float)FAR - (float)NEAR) / 2.0f;
diff --git a/src/gallium/tests/trivial/tri.c b/src/gallium/tests/trivial/tri.c
index 71e9702..87a335f 100644
--- a/src/gallium/tests/trivial/tri.c
+++ b/src/gallium/tests/trivial/tri.c
@@ -27,8 +27,8 @@
 #define USE_TRACE 0
 #define WIDTH 300
 #define HEIGHT 300
-#define NEAR 30
-#define FAR 1000
+#define NEAR 0
+#define FAR 1
 #define FLIP 0
 
 /* pipe_*_state structs */
@@ -171,7 +171,7 @@
 	{
 		float x = 0;
 		float y = 0;
-		float z = FAR;
+		float z = NEAR;
 		float half_width = (float)WIDTH / 2.0f;
 		float half_height = (float)HEIGHT / 2.0f;
 		float half_depth = ((float)FAR - (float)NEAR) / 2.0f;
diff --git a/src/gallium/tools/trace/dump_state.py b/src/gallium/tools/trace/dump_state.py
index 4531843..2622d13 100755
--- a/src/gallium/tools/trace/dump_state.py
+++ b/src/gallium/tools/trace/dump_state.py
@@ -137,6 +137,7 @@
     def visit_struct(self, node):
         struct = Struct()
         for member_name, member_node in node.members:
+            member_name = member_name.replace('.', '_')
             member_value = self.visit(member_node)
             setattr(struct, member_name, member_value)
         self.result = struct
@@ -185,7 +186,7 @@
     def destroy(self):
         pass
 
-    def context_create(self):
+    def context_create(self, priv=None, flags=0):
         return Context(self.interpreter)
     
     def is_format_supported(self, format, target, sample_count, bind, geom_flags):
@@ -498,10 +499,11 @@
             vertex = []
             for velem in self._state.vertex_elements:
                 vbuf = self._state.vertex_buffers[velem.vertex_buffer_index]
-                if vbuf.buffer is None:
+                resource = vbuf.buffer_resource
+                if resource is None:
                     continue
 
-                data = vbuf.buffer.data
+                data = resource.data
 
                 offset = vbuf.buffer_offset + velem.src_offset + vbuf.stride*index
                 format = {
@@ -523,7 +525,7 @@
                     'PIPE_FORMAT_R16G16B16_SNORM': '3h',
                 }[velem.src_format]
 
-                data = vbuf.buffer.data
+                data = resource.data
                 attribute = unpack_from(format, data, offset)
                 vertex.append(attribute)
 
@@ -551,7 +553,7 @@
 
         self._state.draw = info
 
-        if info.indexed:
+        if info.index_size != 0:
             min_index, max_index = self._merge_indices(info)
         else:
             min_index = info.start
@@ -637,6 +639,25 @@
     def tex_transfer_destroy(self, transfer):
         self.interpreter.unregister_object(transfer)
 
+    def buffer_subdata(self, resource, usage, data, box=None, offset=None, size=None, level=None, stride=None, layer_stride=None):
+        if box is not None:
+            # XXX trace_context_transfer_unmap generates broken buffer_subdata
+            assert offset is None
+            assert size is None
+            assert level == 0
+            offset = box.x
+            size = box.width
+            box = None
+
+        if resource is not None and resource.target == PIPE_BUFFER:
+            data = data.getValue()
+            assert len(data) >= size
+            assert offset + size <= len(resource.data)
+            resource.data[offset : offset + size] = data[:size]
+
+    def texture_subdata(self, resource, level, usage, box, data, stride, layer_stride):
+        pass
+
     def transfer_inline_write(self, resource, level, usage, box, stride, layer_stride, data):
         if resource is not None and resource.target == PIPE_BUFFER:
             data = data.getValue()
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 9b6d6e8..5b7c3cf 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -28,6 +28,7 @@
 #include "amdgpu_cs.h"
 
 #include "util/os_time.h"
+#include "util/u_hash_table.h"
 #include "state_tracker/drm_driver.h"
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
@@ -168,16 +169,21 @@
 void amdgpu_bo_destroy(struct pb_buffer *_buf)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   struct amdgpu_winsys *ws = bo->ws;
 
    assert(bo->bo && "must not be called for slab entries");
 
-   if (bo->ws->debug_all_bos) {
-      simple_mtx_lock(&bo->ws->global_bo_list_lock);
+   if (ws->debug_all_bos) {
+      simple_mtx_lock(&ws->global_bo_list_lock);
       LIST_DEL(&bo->u.real.global_list_item);
-      bo->ws->num_buffers--;
-      simple_mtx_unlock(&bo->ws->global_bo_list_lock);
+      ws->num_buffers--;
+      simple_mtx_unlock(&ws->global_bo_list_lock);
    }
 
+   simple_mtx_lock(&ws->bo_export_table_lock);
+   util_hash_table_remove(ws->bo_export_table, bo->bo);
+   simple_mtx_unlock(&ws->bo_export_table_lock);
+
    amdgpu_bo_va_op(bo->bo, 0, bo->base.size, bo->va, 0, AMDGPU_VA_OP_UNMAP);
    amdgpu_va_range_free(bo->u.real.va_handle);
    amdgpu_bo_free(bo->bo);
@@ -185,16 +191,16 @@
    amdgpu_bo_remove_fences(bo);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-      bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->info.gart_page_size);
+      ws->allocated_vram -= align64(bo->base.size, ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-      bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->info.gart_page_size);
+      ws->allocated_gtt -= align64(bo->base.size, ws->info.gart_page_size);
 
    if (bo->u.real.map_count >= 1) {
       if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-         bo->ws->mapped_vram -= bo->base.size;
+         ws->mapped_vram -= bo->base.size;
       else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-         bo->ws->mapped_gtt -= bo->base.size;
-      bo->ws->num_mapped_buffers--;
+         ws->mapped_gtt -= bo->base.size;
+      ws->num_mapped_buffers--;
    }
 
    FREE(bo);
@@ -213,7 +219,7 @@
 }
 
 static void *amdgpu_bo_map(struct pb_buffer *buf,
-                           struct radeon_winsys_cs *rcs,
+                           struct radeon_cmdbuf *rcs,
                            enum pipe_transfer_usage usage)
 {
    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
@@ -428,6 +434,9 @@
    if (flags & RADEON_FLAG_NO_INTERPROCESS_SHARING &&
        ws->info.has_local_buffers)
       request.flags |= AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
+   if (ws->zero_all_vram_allocs &&
+       (request.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM))
+      request.flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
 
    r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    if (r) {
@@ -1261,57 +1270,78 @@
                                                unsigned *offset)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
-   struct amdgpu_winsys_bo *bo;
+   struct amdgpu_winsys_bo *bo = NULL;
    enum amdgpu_bo_handle_type type;
    struct amdgpu_bo_import_result result = {0};
    uint64_t va;
-   amdgpu_va_handle va_handle;
+   amdgpu_va_handle va_handle = NULL;
    struct amdgpu_bo_info info = {0};
    enum radeon_bo_domain initial = 0;
    int r;
 
-   /* Initialize the structure. */
-   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
-   if (!bo) {
-      return NULL;
-   }
-
    switch (whandle->type) {
-   case DRM_API_HANDLE_TYPE_SHARED:
+   case WINSYS_HANDLE_TYPE_SHARED:
       type = amdgpu_bo_handle_type_gem_flink_name;
       break;
-   case DRM_API_HANDLE_TYPE_FD:
+   case WINSYS_HANDLE_TYPE_FD:
       type = amdgpu_bo_handle_type_dma_buf_fd;
       break;
    default:
       return NULL;
    }
 
+   if (stride)
+      *stride = whandle->stride;
+   if (offset)
+      *offset = whandle->offset;
+
    r = amdgpu_bo_import(ws->dev, type, whandle->handle, &result);
    if (r)
-      goto error;
+      return NULL;
+
+   simple_mtx_lock(&ws->bo_export_table_lock);
+   bo = util_hash_table_get(ws->bo_export_table, result.buf_handle);
+
+   /* If the amdgpu_winsys_bo instance already exists, bump the reference
+    * counter and return it.
+    */
+   if (bo) {
+      p_atomic_inc(&bo->base.reference.count);
+      simple_mtx_unlock(&ws->bo_export_table_lock);
+
+      /* Release the buffer handle, because we don't need it anymore.
+       * This function is returning an existing buffer, which has its own
+       * handle.
+       */
+      amdgpu_bo_free(result.buf_handle);
+      return &bo->base;
+   }
 
    /* Get initial domains. */
    r = amdgpu_bo_query_info(result.buf_handle, &info);
    if (r)
-      goto error_query;
+      goto error;
 
    r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general,
                              result.alloc_size, 1 << 20, 0, &va, &va_handle,
 			     AMDGPU_VA_RANGE_HIGH);
    if (r)
-      goto error_query;
+      goto error;
+
+   bo = CALLOC_STRUCT(amdgpu_winsys_bo);
+   if (!bo)
+      goto error;
 
    r = amdgpu_bo_va_op(result.buf_handle, 0, result.alloc_size, va, 0, AMDGPU_VA_OP_MAP);
    if (r)
-      goto error_va_map;
+      goto error;
 
    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_VRAM)
       initial |= RADEON_DOMAIN_VRAM;
    if (info.preferred_heap & AMDGPU_GEM_DOMAIN_GTT)
       initial |= RADEON_DOMAIN_GTT;
 
-
+   /* Initialize the structure. */
    pipe_reference_init(&bo->base.reference, 1);
    bo->base.alignment = info.phys_alignment;
    bo->bo = result.buf_handle;
@@ -1324,11 +1354,6 @@
    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
    bo->is_shared = true;
 
-   if (stride)
-      *stride = whandle->stride;
-   if (offset)
-      *offset = whandle->offset;
-
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       ws->allocated_vram += align64(bo->base.size, ws->info.gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
@@ -1336,16 +1361,18 @@
 
    amdgpu_add_buffer_to_global_list(bo);
 
+   util_hash_table_set(ws->bo_export_table, bo->bo, bo);
+   simple_mtx_unlock(&ws->bo_export_table_lock);
+
    return &bo->base;
 
-error_va_map:
-   amdgpu_va_range_free(va_handle);
-
-error_query:
-   amdgpu_bo_free(result.buf_handle);
-
 error:
-   FREE(bo);
+   simple_mtx_unlock(&ws->bo_export_table_lock);
+   if (bo)
+      FREE(bo);
+   if (va_handle)
+      amdgpu_va_range_free(va_handle);
+   amdgpu_bo_free(result.buf_handle);
    return NULL;
 }
 
@@ -1355,6 +1382,7 @@
                                  struct winsys_handle *whandle)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
+   struct amdgpu_winsys *ws = bo->ws;
    enum amdgpu_bo_handle_type type;
    int r;
 
@@ -1365,13 +1393,13 @@
    bo->u.real.use_reusable_pool = false;
 
    switch (whandle->type) {
-   case DRM_API_HANDLE_TYPE_SHARED:
+   case WINSYS_HANDLE_TYPE_SHARED:
       type = amdgpu_bo_handle_type_gem_flink_name;
       break;
-   case DRM_API_HANDLE_TYPE_FD:
+   case WINSYS_HANDLE_TYPE_FD:
       type = amdgpu_bo_handle_type_dma_buf_fd;
       break;
-   case DRM_API_HANDLE_TYPE_KMS:
+   case WINSYS_HANDLE_TYPE_KMS:
       type = amdgpu_bo_handle_type_kms;
       break;
    default:
@@ -1382,6 +1410,10 @@
    if (r)
       return false;
 
+   simple_mtx_lock(&ws->bo_export_table_lock);
+   util_hash_table_set(ws->bo_export_table, bo->bo, bo);
+   simple_mtx_unlock(&ws->bo_export_table_lock);
+
    whandle->stride = stride;
    whandle->offset = offset;
    whandle->offset += slice_size * whandle->layer;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index eb050b8..ac7160a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -260,7 +260,7 @@
 }
 
 static struct pipe_fence_handle *
-amdgpu_cs_get_next_fence(struct radeon_winsys_cs *rcs)
+amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct pipe_fence_handle *fence = NULL;
@@ -608,7 +608,7 @@
    return idx;
 }
 
-static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs,
+static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer *buf,
                                     enum radeon_bo_usage usage,
                                     enum radeon_bo_domain domains,
@@ -629,7 +629,7 @@
     */
    if (bo == cs->last_added_bo &&
        (usage & cs->last_added_bo_usage) == usage &&
-       (1ull << priority) & cs->last_added_bo_priority_usage)
+       (1u << priority) & cs->last_added_bo_priority_usage)
       return cs->last_added_bo_index;
 
    if (!bo->sparse) {
@@ -658,7 +658,7 @@
       buffer = &cs->sparse_buffers[index];
    }
 
-   buffer->u.real.priority_usage |= 1ull << priority;
+   buffer->u.real.priority_usage |= 1u << priority;
    buffer->usage |= usage;
 
    cs->last_added_bo = bo;
@@ -902,9 +902,7 @@
 static void amdgpu_destroy_cs_context(struct amdgpu_cs_context *cs)
 {
    amdgpu_cs_context_cleanup(cs);
-   FREE(cs->flags);
    FREE(cs->real_buffers);
-   FREE(cs->handles);
    FREE(cs->slab_buffers);
    FREE(cs->sparse_buffers);
    FREE(cs->fence_dependencies);
@@ -912,7 +910,7 @@
 }
 
 
-static struct radeon_winsys_cs *
+static struct radeon_cmdbuf *
 amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  enum ring_type ring_type,
                  void (*flush)(void *ctx, unsigned flags,
@@ -967,12 +965,12 @@
    return &cs->main.base;
 }
 
-static bool amdgpu_cs_validate(struct radeon_winsys_cs *rcs)
+static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
 {
    return true;
 }
 
-static bool amdgpu_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
+static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
 {
    struct amdgpu_ib *ib = amdgpu_ib(rcs);
    struct amdgpu_cs *cs = amdgpu_cs_from_ib(ib);
@@ -996,7 +994,7 @@
    /* Allocate a new chunk */
    if (rcs->num_prev >= rcs->max_prev) {
       unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
-      struct radeon_winsys_cs_chunk *new_prev;
+      struct radeon_cmdbuf_chunk *new_prev;
 
       new_prev = REALLOC(rcs->prev,
                          sizeof(*new_prev) * rcs->max_prev,
@@ -1053,7 +1051,7 @@
    return true;
 }
 
-static unsigned amdgpu_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                           struct radeon_bo_list_item *list)
 {
     struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
@@ -1102,7 +1100,7 @@
    return amdgpu_fence_wait((void *)fence, 0, false);
 }
 
-static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
+static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rws,
                                            struct pipe_fence_handle *pfence)
 {
    struct amdgpu_cs *acs = amdgpu_cs(rws);
@@ -1236,7 +1234,7 @@
    return idx;
 }
 
-static void amdgpu_cs_add_syncobj_signal(struct radeon_winsys_cs *rws,
+static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
                                          struct pipe_fence_handle *fence)
 {
    struct amdgpu_cs *acs = amdgpu_cs(rws);
@@ -1302,14 +1300,7 @@
       unsigned num = 0;
 
       simple_mtx_lock(&ws->global_bo_list_lock);
-
-      handles = malloc(sizeof(handles[0]) * ws->num_buffers);
-      if (!handles) {
-         simple_mtx_unlock(&ws->global_bo_list_lock);
-         amdgpu_cs_context_cleanup(cs);
-         cs->error_code = -ENOMEM;
-         return;
-      }
+      handles = alloca(sizeof(handles[0]) * ws->num_buffers);
 
       LIST_FOR_EACH_ENTRY(bo, &ws->global_bo_list, u.real.global_list_item) {
          assert(num < ws->num_buffers);
@@ -1318,29 +1309,22 @@
 
       r = amdgpu_bo_list_create(ws->dev, ws->num_buffers,
                                 handles, NULL, &bo_list);
-      free(handles);
       simple_mtx_unlock(&ws->global_bo_list_lock);
+      if (r) {
+         fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
+         goto cleanup;
+      }
    } else {
       unsigned num_handles;
 
       if (!amdgpu_add_sparse_backing_buffers(cs)) {
+         fprintf(stderr, "amdgpu: amdgpu_add_sparse_backing_buffers failed\n");
          r = -ENOMEM;
-         goto bo_list_error;
+         goto cleanup;
       }
 
-      if (cs->max_real_submit < cs->num_real_buffers) {
-         FREE(cs->handles);
-         FREE(cs->flags);
-
-         cs->handles = MALLOC(sizeof(*cs->handles) * cs->num_real_buffers);
-         cs->flags = MALLOC(sizeof(*cs->flags) * cs->num_real_buffers);
-
-         if (!cs->handles || !cs->flags) {
-            cs->max_real_submit = 0;
-            r = -ENOMEM;
-            goto bo_list_error;
-         }
-      }
+      amdgpu_bo_handle *handles = alloca(sizeof(*handles) * cs->num_real_buffers);
+      uint8_t *flags = alloca(sizeof(*flags) * cs->num_real_buffers);
 
       num_handles = 0;
       for (i = 0; i < cs->num_real_buffers; ++i) {
@@ -1351,29 +1335,23 @@
 
          assert(buffer->u.real.priority_usage != 0);
 
-         cs->handles[num_handles] = buffer->bo->bo;
-         cs->flags[num_handles] = (util_last_bit64(buffer->u.real.priority_usage) - 1) / 4;
+         handles[num_handles] = buffer->bo->bo;
+         flags[num_handles] = (util_last_bit(buffer->u.real.priority_usage) - 1) / 2;
 	 ++num_handles;
       }
 
-      if (acs->ring_type == RING_GFX)
-         ws->gfx_bo_list_counter += cs->num_real_buffers;
-
       if (num_handles) {
          r = amdgpu_bo_list_create(ws->dev, num_handles,
-                                   cs->handles, cs->flags, &bo_list);
-      } else {
-         r = 0;
+                                   handles, flags, &bo_list);
+         if (r) {
+            fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
+            goto cleanup;
+         }
       }
    }
-bo_list_error:
 
-   if (r) {
-      fprintf(stderr, "amdgpu: buffer list creation failed (%d)\n", r);
-      amdgpu_fence_signalled(cs->fence);
-      cs->error_code = r;
-      goto cleanup;
-   }
+   if (acs->ring_type == RING_GFX)
+      ws->gfx_bo_list_counter += cs->num_real_buffers;
 
    if (acs->ctx->num_rejected_cs) {
       r = -ECANCELED;
@@ -1475,7 +1453,6 @@
                                num_chunks, chunks, &seq_no);
    }
 
-   cs->error_code = r;
    if (r) {
       if (r == -ENOMEM)
          fprintf(stderr, "amdgpu: Not enough memory for command submission.\n");
@@ -1485,8 +1462,6 @@
          fprintf(stderr, "amdgpu: The CS has been rejected, "
                  "see dmesg for more information (%i).\n", r);
 
-      amdgpu_fence_signalled(cs->fence);
-
       acs->ctx->num_rejected_cs++;
       ws->num_total_rejected_cs++;
    } else {
@@ -1503,6 +1478,13 @@
       amdgpu_bo_list_destroy(bo_list);
 
 cleanup:
+   /* If there was an error, signal the fence, because it won't be signalled
+    * by the hardware. */
+   if (r)
+      amdgpu_fence_signalled(cs->fence);
+
+   cs->error_code = r;
+
    for (i = 0; i < cs->num_real_buffers; i++)
       p_atomic_dec(&cs->real_buffers[i].bo->num_active_ioctls);
    for (i = 0; i < cs->num_slab_buffers; i++)
@@ -1514,7 +1496,7 @@
 }
 
 /* Make sure the previous submission is completed. */
-void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs)
+void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
 
@@ -1522,7 +1504,7 @@
    util_queue_fence_wait(&cs->flush_completed);
 }
 
-static int amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
+static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
                            unsigned flags,
                            struct pipe_fence_handle **fence)
 {
@@ -1639,7 +1621,7 @@
    return error_code;
 }
 
-static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs)
+static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
 
@@ -1654,7 +1636,7 @@
    FREE(cs);
 }
 
-static bool amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs,
+static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer *_buf,
                                     enum radeon_bo_usage usage)
 {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index 80acb7c..9f5a4fd 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -46,7 +46,7 @@
    struct amdgpu_winsys_bo *bo;
    union {
       struct {
-         uint64_t priority_usage;
+         uint32_t priority_usage;
       } real;
       struct {
          uint32_t real_idx; /* index of underlying real BO */
@@ -61,7 +61,7 @@
 };
 
 struct amdgpu_ib {
-   struct radeon_winsys_cs base;
+   struct radeon_cmdbuf base;
 
    /* A buffer out of which new IBs are allocated. */
    struct pb_buffer        *big_ib_buffer;
@@ -81,10 +81,6 @@
    unsigned                    num_real_buffers;
    struct amdgpu_cs_buffer     *real_buffers;
 
-   unsigned                    max_real_submit;
-   amdgpu_bo_handle            *handles;
-   uint8_t                     *flags;
-
    unsigned                    num_slab_buffers;
    unsigned                    max_slab_buffers;
    struct amdgpu_cs_buffer     *slab_buffers;
@@ -98,7 +94,7 @@
    struct amdgpu_winsys_bo     *last_added_bo;
    unsigned                    last_added_bo_index;
    unsigned                    last_added_bo_usage;
-   uint64_t                    last_added_bo_priority_usage;
+   uint32_t                    last_added_bo_priority_usage;
 
    struct pipe_fence_handle    **fence_dependencies;
    unsigned                    num_fence_dependencies;
@@ -193,13 +189,13 @@
 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
 
 static inline struct amdgpu_ib *
-amdgpu_ib(struct radeon_winsys_cs *base)
+amdgpu_ib(struct radeon_cmdbuf *base)
 {
    return (struct amdgpu_ib *)base;
 }
 
 static inline struct amdgpu_cs *
-amdgpu_cs(struct radeon_winsys_cs *base)
+amdgpu_cs(struct radeon_cmdbuf *base)
 {
    assert(amdgpu_ib(base)->ib_type == IB_MAIN);
    return (struct amdgpu_cs*)base;
@@ -261,7 +257,7 @@
 void amdgpu_add_fences(struct amdgpu_winsys_bo *bo,
                        unsigned num_fences,
                        struct pipe_fence_handle **fences);
-void amdgpu_cs_sync_flush(struct radeon_winsys_cs *rcs);
+void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs);
 void amdgpu_cs_init_functions(struct amdgpu_winsys *ws);
 void amdgpu_cs_submit_ib(void *job, int thread_index);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index b5a1ebb..2f0d0f2 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -85,6 +85,7 @@
    config.info.depth = tex->depth0;
    config.info.array_size = tex->array_size;
    config.info.samples = tex->nr_samples;
+   config.info.storage_samples = tex->nr_storage_samples;
    config.info.levels = tex->last_level + 1;
    config.info.num_channels = util_format_get_nr_components(tex->format);
    config.is_3d = !!(tex->target == PIPE_TEXTURE_3D);
@@ -94,15 +95,12 @@
     * always use consecutive surface indices when FMASK is allocated between
     * them.
     */
-   if (flags & RADEON_SURF_FMASK)
-      config.info.surf_index = &ws->surf_index_fmask;
-   else if (!(flags & RADEON_SURF_Z_OR_SBUFFER))
-      config.info.surf_index = &ws->surf_index_color;
-   else
-      config.info.surf_index = NULL;
-
+   config.info.surf_index = &ws->surf_index_color;
    config.info.fmask_surf_index = &ws->surf_index_fmask;
 
+   if (flags & RADEON_SURF_Z_OR_SBUFFER)
+      config.info.surf_index = NULL;
+
    return ac_compute_surface(ws->addrlib, &ws->info, &config, mode, surf);
 }
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 84d8ca6..5d1c410 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -31,6 +31,7 @@
 #include "amdgpu_public.h"
 
 #include "util/u_hash_table.h"
+#include "util/hash_table.h"
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
 #include <stdio.h>
@@ -53,13 +54,6 @@
    if (!ac_query_gpu_info(fd, ws->dev, &ws->info, &ws->amdinfo))
       goto fail;
 
-   /* LLVM 5.0 is required for GFX9. */
-   if (ws->info.chip_class >= GFX9 && HAVE_LLVM < 0x0500) {
-      fprintf(stderr, "amdgpu: LLVM 5.0 is required, got LLVM %i.%i\n",
-              HAVE_LLVM >> 8, HAVE_LLVM & 255);
-      goto fail;
-   }
-
    ws->addrlib = amdgpu_addr_create(&ws->info, &ws->amdinfo, &ws->info.max_alignment);
    if (!ws->addrlib) {
       fprintf(stderr, "amdgpu: Cannot create addrlib.\n");
@@ -69,6 +63,7 @@
    ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
    ws->debug_all_bos = debug_get_option_all_bos();
    ws->reserve_vmid = strstr(debug_get_option("R600_DEBUG", ""), "reserve_vmid") != NULL;
+   ws->zero_all_vram_allocs = strstr(debug_get_option("R600_DEBUG", ""), "zerovram") != NULL;
 
    return true;
 
@@ -97,7 +92,9 @@
    simple_mtx_destroy(&ws->bo_fence_lock);
    pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
+   util_hash_table_destroy(ws->bo_export_table);
    simple_mtx_destroy(&ws->global_bo_list_lock);
+   simple_mtx_destroy(&ws->bo_export_table_lock);
    do_winsys_deinit(ws);
    FREE(rws);
 }
@@ -108,7 +105,7 @@
    *info = ((struct amdgpu_winsys *)rws)->info;
 }
 
-static bool amdgpu_cs_request_feature(struct radeon_winsys_cs *rcs,
+static bool amdgpu_cs_request_feature(struct radeon_cmdbuf *rcs,
                                       enum radeon_feature_id fid,
                                       bool enable)
 {
@@ -193,16 +190,12 @@
                                    0xffffffff, 0, out) == 0;
 }
 
-static unsigned hash_dev(void *key)
+static unsigned hash_pointer(void *key)
 {
-#if defined(PIPE_ARCH_X86_64)
-   return pointer_to_intptr(key) ^ (pointer_to_intptr(key) >> 32);
-#else
-   return pointer_to_intptr(key);
-#endif
+   return _mesa_hash_pointer(key);
 }
 
-static int compare_dev(void *key1, void *key2)
+static int compare_pointers(void *key1, void *key2)
 {
    return key1 != key2;
 }
@@ -258,7 +251,7 @@
    /* Look up the winsys from the dev table. */
    simple_mtx_lock(&dev_tab_mutex);
    if (!dev_tab)
-      dev_tab = util_hash_table_create(hash_dev, compare_dev);
+      dev_tab = util_hash_table_create(hash_pointer, compare_pointers);
 
    /* Initialize the amdgpu device. This should always return the same pointer
     * for the same fd. */
@@ -274,6 +267,12 @@
    if (ws) {
       pipe_reference(NULL, &ws->reference);
       simple_mtx_unlock(&dev_tab_mutex);
+
+      /* Release the device handle, because we don't need it anymore.
+       * This function is returning an existing winsys instance, which
+       * has its own device handle.
+       */
+      amdgpu_device_deinitialize(dev);
       return &ws->base;
    }
 
@@ -323,10 +322,13 @@
    amdgpu_surface_init_functions(ws);
 
    LIST_INITHEAD(&ws->global_bo_list);
+   ws->bo_export_table = util_hash_table_create(hash_pointer, compare_pointers);
+
    (void) simple_mtx_init(&ws->global_bo_list_lock, mtx_plain);
    (void) simple_mtx_init(&ws->bo_fence_lock, mtx_plain);
+   (void) simple_mtx_init(&ws->bo_export_table_lock, mtx_plain);
 
-   if (!util_queue_init(&ws->cs_queue, "amdgpu_cs", 8, 1,
+   if (!util_queue_init(&ws->cs_queue, "cs", 8, 1,
                         UTIL_QUEUE_INIT_RESIZE_IF_FULL)) {
       amdgpu_winsys_destroy(&ws->base);
       simple_mtx_unlock(&dev_tab_mutex);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index a6784e8..c355eff 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -79,11 +79,17 @@
    bool check_vm;
    bool debug_all_bos;
    bool reserve_vmid;
+   bool zero_all_vram_allocs;
 
    /* List of all allocated buffers */
    simple_mtx_t global_bo_list_lock;
    struct list_head global_bo_list;
    unsigned num_buffers;
+
+   /* For returning the same amdgpu_winsys_bo instance for exported
+    * and re-imported buffers. */
+   struct util_hash_table *bo_export_table;
+   simple_mtx_t bo_export_table_lock;
 };
 
 static inline struct amdgpu_winsys *
diff --git a/src/gallium/winsys/i915/drm/i915_drm_buffer.c b/src/gallium/winsys/i915/drm/i915_drm_buffer.c
index 890f7dc..509984a 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_buffer.c
+++ b/src/gallium/winsys/i915/drm/i915_drm_buffer.c
@@ -98,7 +98,7 @@
    struct i915_drm_buffer *buf;
    uint32_t tile = 0, swizzle = 0;
 
-   if ((whandle->type != DRM_API_HANDLE_TYPE_SHARED) && (whandle->type != DRM_API_HANDLE_TYPE_FD))
+   if ((whandle->type != WINSYS_HANDLE_TYPE_SHARED) && (whandle->type != WINSYS_HANDLE_TYPE_FD))
       return NULL;
 
    if (whandle->offset != 0)
@@ -110,9 +110,9 @@
 
    buf->magic = 0xDEAD1337;
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED)
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED)
        buf->bo = drm_intel_bo_gem_create_from_name(idws->gem_manager, "gallium3d_from_handle", whandle->handle);
-   else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
        int fd = (int) whandle->handle;
        buf->bo = drm_intel_bo_gem_create_from_prime(idws->gem_manager, fd, height * whandle->stride);
    }
@@ -143,7 +143,7 @@
 {
    struct i915_drm_buffer *buf = i915_drm_buffer(buffer);
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
       if (!buf->flinked) {
          if (drm_intel_bo_flink(buf->bo, &buf->flink))
             return FALSE;
@@ -151,9 +151,9 @@
       }
 
       whandle->handle = buf->flink;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
       whandle->handle = buf->bo->handle;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       int fd;
 
       if (drm_intel_bo_gem_export_to_prime(buf->bo, &fd))
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 6652977..07a9b2d 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -497,7 +497,7 @@
 }
 
 static void *radeon_bo_map(struct pb_buffer *buf,
-                           struct radeon_winsys_cs *rcs,
+                           struct radeon_cmdbuf *rcs,
                            enum pipe_transfer_usage usage)
 {
     struct radeon_bo *bo = (struct radeon_bo*)buf;
@@ -1157,10 +1157,10 @@
      * The list of pairs is guarded by a mutex, of course. */
     mtx_lock(&ws->bo_handles_mutex);
 
-    if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+    if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
         /* First check if there already is an existing bo for the handle. */
         bo = util_hash_table_get(ws->bo_names, (void*)(uintptr_t)whandle->handle);
-    } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+    } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
         /* We must first get the GEM handle, as fds are unreliable keys */
         r = drmPrimeFDToHandle(ws->fd, whandle->handle, &handle);
         if (r)
@@ -1184,7 +1184,7 @@
         goto fail;
     }
 
-    if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+    if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
         struct drm_gem_open open_arg = {};
         memset(&open_arg, 0, sizeof(open_arg));
         /* Open the BO. */
@@ -1196,7 +1196,7 @@
         handle = open_arg.handle;
         size = open_arg.size;
         bo->flink_name = whandle->handle;
-    } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+    } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
         size = lseek(whandle->handle, 0, SEEK_END);
         /* 
          * Could check errno to determine whether the kernel is new enough, but
@@ -1301,7 +1301,7 @@
 
     bo->u.real.use_reusable_pool = false;
 
-    if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+    if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
         if (!bo->flink_name) {
             flink.handle = bo->handle;
 
@@ -1316,9 +1316,9 @@
             mtx_unlock(&ws->bo_handles_mutex);
         }
         whandle->handle = bo->flink_name;
-    } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+    } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
         whandle->handle = bo->handle;
-    } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+    } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
         if (drmPrimeHandleToFD(ws->fd, bo->handle, DRM_CLOEXEC, (int*)&whandle->handle))
             return false;
     }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 9070464..798be78 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -67,7 +67,7 @@
 #define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))
 
 static struct pipe_fence_handle *
-radeon_cs_create_fence(struct radeon_winsys_cs *rcs);
+radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
 static void radeon_fence_reference(struct pipe_fence_handle **dst,
                                    struct pipe_fence_handle *src);
 
@@ -145,7 +145,7 @@
 }
 
 
-static struct radeon_winsys_cs *
+static struct radeon_cmdbuf *
 radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
@@ -329,7 +329,7 @@
     return idx;
 }
 
-static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
+static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer *buf,
                                         enum radeon_bo_usage usage,
                                         enum radeon_bo_domain domains,
@@ -366,7 +366,7 @@
     reloc->read_domains |= rd;
     reloc->write_domain |= wd;
     reloc->flags = MAX2(reloc->flags, priority);
-    cs->csc->relocs_bo[index].u.real.priority_usage |= 1ull << priority;
+    cs->csc->relocs_bo[index].u.real.priority_usage |= 1u << priority;
 
     if (added_domains & RADEON_DOMAIN_VRAM)
         cs->base.used_vram += bo->base.size;
@@ -376,7 +376,7 @@
     return index;
 }
 
-static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
+static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer *buf)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
@@ -384,7 +384,7 @@
     return radeon_lookup_buffer(cs->csc, (struct radeon_bo*)buf);
 }
 
-static bool radeon_drm_cs_validate(struct radeon_winsys_cs *rcs)
+static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     bool status =
@@ -423,13 +423,13 @@
     return status;
 }
 
-static bool radeon_drm_cs_check_space(struct radeon_winsys_cs *rcs, unsigned dw)
+static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
 {
    assert(rcs->current.cdw <= rcs->current.max_dw);
    return rcs->current.max_dw - rcs->current.cdw >= dw;
 }
 
-static unsigned radeon_drm_cs_get_buffer_list(struct radeon_winsys_cs *rcs,
+static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                               struct radeon_bo_list_item *list)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
@@ -480,7 +480,7 @@
 /*
  * Make sure previous submission of this cs are completed
  */
-void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs)
+void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
@@ -541,7 +541,7 @@
 
 DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", false)
 
-static int radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
+static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                                unsigned flags,
                                struct pipe_fence_handle **pfence)
 {
@@ -700,7 +700,7 @@
     return 0;
 }
 
-static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
+static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
@@ -715,7 +715,7 @@
     FREE(cs);
 }
 
-static bool radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
+static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer *_buf,
                                     enum radeon_bo_usage usage)
 {
@@ -744,7 +744,7 @@
 /* FENCES */
 
 static struct pipe_fence_handle *
-radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
+radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct pb_buffer *fence;
@@ -777,7 +777,7 @@
 }
 
 static struct pipe_fence_handle *
-radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
+radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
 {
    struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
    struct pipe_fence_handle *fence = NULL;
@@ -796,7 +796,7 @@
 }
 
 static void
-radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
+radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                    struct pipe_fence_handle *fence)
 {
    /* TODO: Handle the following unlikely multi-threaded scenario:
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index f9b26af..f4c6cbe 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -33,7 +33,7 @@
     struct radeon_bo    *bo;
     union {
         struct {
-            uint64_t    priority_usage;
+            uint32_t    priority_usage;
         } real;
         struct {
             unsigned    real_idx;
@@ -65,7 +65,7 @@
 };
 
 struct radeon_drm_cs {
-    struct radeon_winsys_cs base;
+    struct radeon_cmdbuf base;
     enum ring_type          ring_type;
 
     /* We flip between these two CS. While one is being consumed
@@ -92,7 +92,7 @@
 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo);
 
 static inline struct radeon_drm_cs *
-radeon_drm_cs(struct radeon_winsys_cs *base)
+radeon_drm_cs(struct radeon_cmdbuf *base)
 {
     return (struct radeon_drm_cs*)base;
 }
@@ -131,7 +131,7 @@
     return bo->num_cs_references != 0;
 }
 
-void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
+void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs);
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws);
 void radeon_drm_cs_emit_ioctl_oneshot(void *job, int thread_index);
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_surface.c b/src/gallium/winsys/radeon/drm/radeon_drm_surface.c
index 77fb775..20cfc86 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_surface.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_surface.c
@@ -220,6 +220,66 @@
 			      surf_ws->micro_tile_mode == RADEON_MICRO_MODE_ROTATED;
 }
 
+static void si_compute_cmask(const struct radeon_info *info,
+		      const struct ac_surf_config *config,
+		      struct radeon_surf *surf)
+{
+	unsigned pipe_interleave_bytes = info->pipe_interleave_bytes;
+	unsigned num_pipes = info->num_tile_pipes;
+	unsigned cl_width, cl_height;
+	/* Fill in surf's CMASK size, alignment and slice_tile_max; Z/S surfaces are skipped. */
+	if (surf->flags & RADEON_SURF_Z_OR_SBUFFER)
+		return;
+
+	assert(info->chip_class <= VI);
+	/* cl_width/cl_height depend only on the pipe count (units not shown here -- TODO confirm). */
+	switch (num_pipes) {
+	case 2:
+		cl_width = 32;
+		cl_height = 16;
+		break;
+	case 4:
+		cl_width = 32;
+		cl_height = 32;
+		break;
+	case 8:
+		cl_width = 64;
+		cl_height = 32;
+		break;
+	case 16: /* Hawaii */
+		cl_width = 64;
+		cl_height = 64;
+		break;
+	default: /* any other pipe count: bail out without writing surf */
+		assert(0);
+		return;
+	}
+
+	unsigned base_align = num_pipes * pipe_interleave_bytes;
+	/* Pad level 0 to whole cl_width*8 x cl_height*8 units; one CMASK element per 8x8 block. */
+	unsigned width = align(surf->u.legacy.level[0].nblk_x, cl_width*8);
+	unsigned height = align(surf->u.legacy.level[0].nblk_y, cl_height*8);
+	unsigned slice_elements = (width * height) / (8*8);
+
+	/* Each element of CMASK is a nibble. */
+	unsigned slice_bytes = slice_elements / 2;
+	/* Count of 128x128 blocks per slice, stored minus one unless it is already zero. */
+	surf->u.legacy.cmask_slice_tile_max = (width * height) / (128*128);
+	if (surf->u.legacy.cmask_slice_tile_max)
+		surf->u.legacy.cmask_slice_tile_max -= 1;
+	/* Layers: depth for 3D, 6 for cubes, array_size otherwise. */
+	unsigned num_layers;
+	if (config->is_3d)
+		num_layers = config->info.depth;
+	else if (config->is_cube)
+		num_layers = 6;
+	else
+		num_layers = config->info.array_size;
+
+	surf->cmask_alignment = MAX2(256, base_align);
+	surf->cmask_size = align(slice_bytes, base_align) * num_layers;
+}
+
 static int radeon_winsys_surface_init(struct radeon_winsys *rws,
                                       const struct pipe_resource *tex,
                                       unsigned flags, unsigned bpe,
@@ -243,6 +303,67 @@
         return r;
 
     surf_drm_to_winsys(ws, surf_ws, &surf_drm);
+
+    /* Compute FMASK. */
+    if (ws->gen == DRV_SI &&
+        tex->nr_samples >= 2 &&
+        !(flags & (RADEON_SURF_Z_OR_SBUFFER | RADEON_SURF_FMASK))) {
+        /* FMASK is allocated like an ordinary texture. */
+        struct pipe_resource templ = *tex;
+        struct radeon_surf fmask = {};
+        unsigned fmask_flags, bpe;
+
+        templ.nr_samples = 1;
+        fmask_flags = flags | RADEON_SURF_FMASK;
+
+        switch (tex->nr_samples) {
+        case 2:
+        case 4:
+            bpe = 1;
+            break;
+        case 8:
+            bpe = 4;
+            break;
+        default:
+            fprintf(stderr, "radeon: Invalid sample count for FMASK allocation.\n");
+            return -1;
+        }
+
+        if (radeon_winsys_surface_init(rws, &templ, fmask_flags, bpe,
+                                       RADEON_SURF_MODE_2D, &fmask)) {
+            fprintf(stderr, "Got error in surface_init while allocating FMASK.\n");
+            return -1;
+        }
+
+        assert(fmask.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
+
+        surf_ws->fmask_size = fmask.surf_size;
+        surf_ws->fmask_alignment = MAX2(256, fmask.surf_alignment);
+        surf_ws->fmask_tile_swizzle = fmask.tile_swizzle;
+
+        surf_ws->u.legacy.fmask.slice_tile_max =
+            (fmask.u.legacy.level[0].nblk_x * fmask.u.legacy.level[0].nblk_y) / 64;
+        if (surf_ws->u.legacy.fmask.slice_tile_max)
+            surf_ws->u.legacy.fmask.slice_tile_max -= 1;
+
+        surf_ws->u.legacy.fmask.tiling_index = fmask.u.legacy.tiling_index[0];
+        surf_ws->u.legacy.fmask.bankh = fmask.u.legacy.bankh;
+        surf_ws->u.legacy.fmask.pitch_in_pixels = fmask.u.legacy.level[0].nblk_x;
+    }
+
+    if (ws->gen == DRV_SI) {
+	    struct ac_surf_config config;
+
+	    /* Only these fields need to be set for the CMASK computation. */
+	    config.info.width = tex->width0;
+	    config.info.height = tex->height0;
+	    config.info.depth = tex->depth0;
+	    config.info.array_size = tex->array_size;
+	    config.is_3d = !!(tex->target == PIPE_TEXTURE_3D);
+	    config.is_cube = !!(tex->target == PIPE_TEXTURE_CUBE);
+
+	    si_compute_cmask(&ws->info, &config, surf_ws);
+    }
     return 0;
 }
 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 6e3162d..491e8e1 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -471,6 +471,32 @@
     radeon_get_drm_value(ws->fd, RADEON_INFO_MAX_SE, NULL,
                          &ws->info.max_se);
 
+    switch (ws->info.family) {
+    case CHIP_HAINAN:
+    case CHIP_KABINI:
+    case CHIP_MULLINS:
+        ws->info.num_tcc_blocks = 2;
+        break;
+    case CHIP_VERDE:
+    case CHIP_OLAND:
+    case CHIP_BONAIRE:
+    case CHIP_KAVERI:
+        ws->info.num_tcc_blocks = 4;
+        break;
+    case CHIP_PITCAIRN:
+        ws->info.num_tcc_blocks = 8;
+        break;
+    case CHIP_TAHITI:
+        ws->info.num_tcc_blocks = 12;
+        break;
+    case CHIP_HAWAII:
+        ws->info.num_tcc_blocks = 16;
+        break;
+    default:
+        ws->info.num_tcc_blocks = 0;
+        break;
+    }
+
     if (!ws->info.max_se) {
         switch (ws->info.family) {
         default:
@@ -529,6 +555,28 @@
     ws->info.tcc_cache_line_size = 64; /* TC L2 line size on GCN */
     ws->info.ib_start_alignment = 4096;
     ws->info.kernel_flushes_hdp_before_ib = ws->info.drm_minor >= 40;
+    /* HTILE is broken with 1D tiling on old kernels and CIK. */
+    ws->info.htile_cmask_support_1d_tiling = ws->info.chip_class != CIK ||
+                                             ws->info.drm_minor >= 38;
+    ws->info.si_TA_CS_BC_BASE_ADDR_allowed = ws->info.drm_minor >= 48;
+    ws->info.has_bo_metadata = false;
+    ws->info.has_gpu_reset_status_query = false;
+    ws->info.has_gpu_reset_counter_query = ws->info.drm_minor >= 43;
+    ws->info.has_eqaa_surface_allocator = false;
+    ws->info.has_format_bc1_through_bc7 = ws->info.drm_minor >= 31;
+    ws->info.kernel_flushes_tc_l2_after_ib = true;
+    /* Old kernels disallowed register writes via COPY_DATA
+     * that are used for indirect compute dispatches. */
+    ws->info.has_indirect_compute_dispatch = ws->info.chip_class == CIK ||
+                                             (ws->info.chip_class == SI &&
+                                              ws->info.drm_minor >= 45);
+    /* SI doesn't support unaligned loads. */
+    ws->info.has_unaligned_shader_loads = ws->info.chip_class == CIK &&
+                                          ws->info.drm_minor >= 50;
+    ws->info.has_sparse_vm_mappings = false;
+    /* 2D tiling on CIK is supported since DRM 2.35.0 */
+    ws->info.has_2d_tiling = ws->info.chip_class <= SI || ws->info.drm_minor >= 35;
+    ws->info.has_read_registers_query = ws->info.drm_minor >= 42;
 
     ws->check_vm = strstr(debug_get_option("R600_DEBUG", ""), "check_vm") != NULL;
 
@@ -573,7 +621,7 @@
     *info = ((struct radeon_drm_winsys *)rws)->info;
 }
 
-static bool radeon_cs_request_feature(struct radeon_winsys_cs *rcs,
+static bool radeon_cs_request_feature(struct radeon_cmdbuf *rcs,
                                       enum radeon_feature_id fid,
                                       bool enable)
 {
@@ -859,7 +907,7 @@
     ws->info.gart_page_size = sysconf(_SC_PAGESIZE);
 
     if (ws->num_cpus > 1 && debug_get_option_thread())
-        util_queue_init(&ws->cs_queue, "radeon_cs", 8, 1, 0);
+        util_queue_init(&ws->cs_queue, "rcs", 8, 1, 0);
 
     /* Create the screen at the end. The winsys must be initialized
      * completely.
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 2a0ac7b..76f29e2 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -234,11 +234,11 @@
     }
 
     switch (whandle->type) {
-    case DRM_API_HANDLE_TYPE_SHARED:
-    case DRM_API_HANDLE_TYPE_KMS:
+    case WINSYS_HANDLE_TYPE_SHARED:
+    case WINSYS_HANDLE_TYPE_KMS:
        handle = whandle->handle;
        break;
-    case DRM_API_HANDLE_TYPE_FD:
+    case WINSYS_HANDLE_TYPE_FD:
        ret = drmPrimeFDToHandle(vws->ioctl.drm_fd, whandle->handle,
                                 &handle);
        if (ret) {
@@ -263,7 +263,7 @@
     /*
      * Need to close the handle we got from prime.
      */
-    if (whandle->type == DRM_API_HANDLE_TYPE_FD)
+    if (whandle->type == WINSYS_HANDLE_TYPE_FD)
        vmw_ioctl_surface_destroy(vws, handle);
 
     if (ret) {
@@ -340,11 +340,11 @@
     whandle->offset = 0;
 
     switch (whandle->type) {
-    case DRM_API_HANDLE_TYPE_SHARED:
-    case DRM_API_HANDLE_TYPE_KMS:
+    case WINSYS_HANDLE_TYPE_SHARED:
+    case WINSYS_HANDLE_TYPE_KMS:
        whandle->handle = vsrf->sid;
        break;
-    case DRM_API_HANDLE_TYPE_FD:
+    case WINSYS_HANDLE_TYPE_FD:
        ret = drmPrimeHandleToFD(vws->ioctl.drm_fd, vsrf->sid, DRM_CLOEXEC,
 				(int *)&whandle->handle);
        if (ret) {
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
index 62a2af6..16dd5c8 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c
@@ -290,13 +290,13 @@
    int ret;
 
    switch(whandle->type) {
-   case DRM_API_HANDLE_TYPE_SHARED:
-   case DRM_API_HANDLE_TYPE_KMS:
+   case WINSYS_HANDLE_TYPE_SHARED:
+   case WINSYS_HANDLE_TYPE_KMS:
       *needs_unref = FALSE;
       req->handle_type = DRM_VMW_HANDLE_LEGACY;
       req->sid = whandle->handle;
       break;
-   case DRM_API_HANDLE_TYPE_FD:
+   case WINSYS_HANDLE_TYPE_FD:
       if (!vws->ioctl.have_drm_2_6) {
          uint32_t handle;
 
diff --git a/src/gallium/winsys/sw/dri/dri_sw_winsys.c b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
index 0084998..d519bcf 100644
--- a/src/gallium/winsys/sw/dri/dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/dri/dri_sw_winsys.c
@@ -26,6 +26,13 @@
  *
  **************************************************************************/
 
+#if !defined(ANDROID) || ANDROID_API_LEVEL >= 26
+/* Android's libc began supporting shm in Oreo */
+#define HAVE_SHM
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+
 #include "pipe/p_compiler.h"
 #include "pipe/p_format.h"
 #include "util/u_inlines.h"
@@ -45,6 +52,7 @@
    unsigned stride;
 
    unsigned map_flags;
+   int shmid;
    void *data;
    void *mapped;
    const void *front_private;
@@ -79,6 +87,27 @@
    return TRUE;
 }
 
+#ifdef HAVE_SHM
+/* Attach a fresh SysV shm segment of 'size' bytes; returns NULL with shmid < 0 on failure. */
+static char *
+alloc_shm(struct dri_sw_displaytarget *dri_sw_dt, unsigned size)
+{
+   char *addr;
+
+   dri_sw_dt->shmid = shmget(IPC_PRIVATE, size, IPC_CREAT|0600); /* owner-only access */
+   if (dri_sw_dt->shmid < 0)
+      return NULL;
+   addr = (char *) shmat(dri_sw_dt->shmid, 0, 0);
+   /* mark the segment immediately for deletion to avoid leaks */
+   shmctl(dri_sw_dt->shmid, IPC_RMID, 0);
+   if (addr == (char *) -1) {
+      dri_sw_dt->shmid = -1; /* callers test shmid to choose shm vs malloc paths */
+      return NULL;
+   }
+   return addr;
+}
+#endif
+
 static struct sw_displaytarget *
 dri_sw_displaytarget_create(struct sw_winsys *winsys,
                             unsigned tex_usage,
@@ -88,6 +117,7 @@
                             const void *front_private,
                             unsigned *stride)
 {
+   struct dri_sw_winsys *ws = dri_sw_winsys(winsys);
    struct dri_sw_displaytarget *dri_sw_dt;
    unsigned nblocksy, size, format_stride;
 
@@ -106,7 +136,16 @@
    nblocksy = util_format_get_nblocksy(format, height);
    size = dri_sw_dt->stride * nblocksy;
 
-   dri_sw_dt->data = align_malloc(size, alignment);
+   dri_sw_dt->shmid = -1;
+
+#ifdef HAVE_SHM
+   if (ws->lf->put_image_shm)
+      dri_sw_dt->data = alloc_shm(dri_sw_dt, size);
+#endif
+
+   if(!dri_sw_dt->data)
+      dri_sw_dt->data = align_malloc(size, alignment);
+
    if(!dri_sw_dt->data)
       goto no_data;
 
@@ -125,7 +164,14 @@
 {
    struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
 
-   align_free(dri_sw_dt->data);
+   if (dri_sw_dt->shmid >= 0) {
+#ifdef HAVE_SHM
+      shmdt(dri_sw_dt->data);
+      shmctl(dri_sw_dt->shmid, IPC_RMID, 0);
+#endif
+   } else {
+      align_free(dri_sw_dt->data);
+   }
 
    FREE(dri_sw_dt);
 }
@@ -174,7 +220,15 @@
                                 struct sw_displaytarget *dt,
                                 struct winsys_handle *whandle)
 {
-   assert(0);
+   struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
+
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHMID) {
+      if (dri_sw_dt->shmid < 0)
+         return FALSE;
+      whandle->handle = dri_sw_dt->shmid;
+      return TRUE;
+   }
+
    return FALSE;
 }
 
@@ -187,25 +241,38 @@
    struct dri_sw_winsys *dri_sw_ws = dri_sw_winsys(ws);
    struct dri_sw_displaytarget *dri_sw_dt = dri_sw_displaytarget(dt);
    struct dri_drawable *dri_drawable = (struct dri_drawable *)context_private;
-   unsigned width, height;
+   unsigned width, height, x = 0, y = 0;
    unsigned blsize = util_format_get_blocksize(dri_sw_dt->format);
+   unsigned offset = 0;
+   char *data = dri_sw_dt->data;
 
    /* Set the width to 'stride / cpp'.
     *
     * PutImage correctly clips to the width of the dst drawable.
     */
-   width = dri_sw_dt->stride / blsize;
-
-   height = dri_sw_dt->height;
-
    if (box) {
-       void *data;
-       data = (char *)dri_sw_dt->data + (dri_sw_dt->stride * box->y) + box->x * blsize;
-       dri_sw_ws->lf->put_image2(dri_drawable, data,
-                                 box->x, box->y, box->width, box->height, dri_sw_dt->stride);
+      offset = (dri_sw_dt->stride * box->y) + box->x * blsize;
+      data += offset;
+      x = box->x;
+      y = box->y;
+      width = box->width;
+      height = box->height;
    } else {
-       dri_sw_ws->lf->put_image(dri_drawable, dri_sw_dt->data, width, height);
+      width = dri_sw_dt->stride / blsize;
+      height = dri_sw_dt->height;
    }
+
+   if (dri_sw_dt->shmid != -1) {
+      dri_sw_ws->lf->put_image_shm(dri_drawable, dri_sw_dt->shmid, dri_sw_dt->data, offset,
+                                   x, y, width, height, dri_sw_dt->stride);
+      return;
+   }
+
+   if (box)
+      dri_sw_ws->lf->put_image2(dri_drawable, data,
+                                x, y, width, height, dri_sw_dt->stride);
+   else
+      dri_sw_ws->lf->put_image(dri_drawable, data, width, height);
 }
 
 static void
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 34f0aa8..9564d94 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -393,11 +393,11 @@
    struct kms_sw_plane *kms_sw_pl;
 
 
-   assert(whandle->type == DRM_API_HANDLE_TYPE_KMS ||
-          whandle->type == DRM_API_HANDLE_TYPE_FD);
+   assert(whandle->type == WINSYS_HANDLE_TYPE_KMS ||
+          whandle->type == WINSYS_HANDLE_TYPE_FD);
 
    switch(whandle->type) {
-   case DRM_API_HANDLE_TYPE_FD:
+   case WINSYS_HANDLE_TYPE_FD:
       kms_sw_pl = kms_sw_displaytarget_add_from_prime(kms_sw, whandle->handle,
                                                       templ->format,
                                                       templ->width0,
@@ -407,7 +407,7 @@
       if (kms_sw_pl)
          *stride = kms_sw_pl->stride;
       return sw_displaytarget(kms_sw_pl);
-   case DRM_API_HANDLE_TYPE_KMS:
+   case WINSYS_HANDLE_TYPE_KMS:
       kms_sw_dt = kms_sw_displaytarget_find_and_ref(kms_sw, whandle->handle);
       if (kms_sw_dt) {
          struct kms_sw_plane *plane;
@@ -438,12 +438,12 @@
    struct kms_sw_displaytarget *kms_sw_dt = plane->dt;
 
    switch(whandle->type) {
-   case DRM_API_HANDLE_TYPE_KMS:
+   case WINSYS_HANDLE_TYPE_KMS:
       whandle->handle = kms_sw_dt->handle;
       whandle->stride = plane->stride;
       whandle->offset = plane->offset;
       return TRUE;
-   case DRM_API_HANDLE_TYPE_FD:
+   case WINSYS_HANDLE_TYPE_FD:
       if (!drmPrimeHandleToFD(kms_sw->fd, kms_sw_dt->handle,
                              DRM_CLOEXEC, (int*)&whandle->handle)) {
          whandle->stride = plane->stride;
diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
index 47cf7f0..c5370c7 100644
--- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
+++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c
@@ -92,7 +92,7 @@
    struct wrapper_sw_winsys *wsw = wrapper_sw_winsys(ws);
 
    return wsw->screen->is_format_supported(wsw->screen, format,
-                                           PIPE_TEXTURE_2D, 0,
+                                           PIPE_TEXTURE_2D, 0, 0,
                                            PIPE_BIND_RENDER_TARGET |
                                            PIPE_BIND_DISPLAY_TARGET);
 }
diff --git a/src/gallium/winsys/vc5/drm/Android.mk b/src/gallium/winsys/v3d/drm/Android.mk
similarity index 97%
rename from src/gallium/winsys/vc5/drm/Android.mk
rename to src/gallium/winsys/v3d/drm/Android.mk
index 3b1523b..4cdd969 100644
--- a/src/gallium/winsys/vc5/drm/Android.mk
+++ b/src/gallium/winsys/v3d/drm/Android.mk
@@ -27,7 +27,7 @@
 
 LOCAL_SRC_FILES := $(C_SOURCES)
 
-LOCAL_MODULE := libmesa_winsys_vc5
+LOCAL_MODULE := libmesa_winsys_v3d
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/vc5/drm/Makefile.am b/src/gallium/winsys/v3d/drm/Makefile.am
similarity index 94%
rename from src/gallium/winsys/vc5/drm/Makefile.am
rename to src/gallium/winsys/v3d/drm/Makefile.am
index e858403..ac2ef23 100644
--- a/src/gallium/winsys/vc5/drm/Makefile.am
+++ b/src/gallium/winsys/v3d/drm/Makefile.am
@@ -26,8 +26,8 @@
 	-I$(top_srcdir)/src/gallium/drivers \
 	$(GALLIUM_WINSYS_CFLAGS)
 
-noinst_LTLIBRARIES = libvc5drm.la
+noinst_LTLIBRARIES = libv3ddrm.la
 
-libvc5drm_la_SOURCES = $(C_SOURCES)
+libv3ddrm_la_SOURCES = $(C_SOURCES)
 
 EXTRA_DIST = meson.build
diff --git a/src/gallium/winsys/v3d/drm/Makefile.sources b/src/gallium/winsys/v3d/drm/Makefile.sources
new file mode 100644
index 0000000..1fdeefb
--- /dev/null
+++ b/src/gallium/winsys/v3d/drm/Makefile.sources
@@ -0,0 +1,3 @@
+C_SOURCES := \
+	v3d_drm_public.h \
+	v3d_drm_winsys.c
diff --git a/src/gallium/winsys/vc5/drm/meson.build b/src/gallium/winsys/v3d/drm/meson.build
similarity index 94%
rename from src/gallium/winsys/vc5/drm/meson.build
rename to src/gallium/winsys/v3d/drm/meson.build
index d859301..e5d7b12 100644
--- a/src/gallium/winsys/vc5/drm/meson.build
+++ b/src/gallium/winsys/v3d/drm/meson.build
@@ -18,9 +18,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-libvc5winsys = static_library(
-  'vc5winsys',
-  files('vc5_drm_winsys.c'),
+libv3dwinsys = static_library(
+  'v3dwinsys',
+  files('v3d_drm_winsys.c'),
   include_directories : [
     inc_src, inc_include,
     inc_gallium, inc_gallium_aux, inc_gallium_drivers,
diff --git a/src/gallium/winsys/vc5/drm/vc5_drm_public.h b/src/gallium/winsys/v3d/drm/v3d_drm_public.h
similarity index 95%
rename from src/gallium/winsys/vc5/drm/vc5_drm_public.h
rename to src/gallium/winsys/v3d/drm/v3d_drm_public.h
index 6e19848..46aed9d 100644
--- a/src/gallium/winsys/vc5/drm/vc5_drm_public.h
+++ b/src/gallium/winsys/v3d/drm/v3d_drm_public.h
@@ -26,6 +26,6 @@
 
 struct pipe_screen;
 
-struct pipe_screen *vc5_drm_screen_create(int drmFD);
+struct pipe_screen *v3d_drm_screen_create(int drmFD);
 
 #endif /* __VC5_DRM_PUBLIC_H__ */
diff --git a/src/gallium/winsys/vc5/drm/vc5_drm_winsys.c b/src/gallium/winsys/v3d/drm/v3d_drm_winsys.c
similarity index 89%
rename from src/gallium/winsys/vc5/drm/vc5_drm_winsys.c
rename to src/gallium/winsys/v3d/drm/v3d_drm_winsys.c
index d089291..63b7a57 100644
--- a/src/gallium/winsys/vc5/drm/vc5_drm_winsys.c
+++ b/src/gallium/winsys/v3d/drm/v3d_drm_winsys.c
@@ -24,12 +24,12 @@
 #include <unistd.h>
 #include <fcntl.h>
 
-#include "vc5_drm_public.h"
+#include "v3d_drm_public.h"
 
-#include "vc5/vc5_screen.h"
+#include "v3d/v3d_screen.h"
 
 struct pipe_screen *
-vc5_drm_screen_create(int fd)
+v3d_drm_screen_create(int fd)
 {
-	return vc5_screen_create(fcntl(fd, F_DUPFD_CLOEXEC, 3));
+	return v3d_screen_create(fcntl(fd, F_DUPFD_CLOEXEC, 3));
 }
diff --git a/src/gallium/winsys/vc5/drm/Makefile.sources b/src/gallium/winsys/vc5/drm/Makefile.sources
deleted file mode 100644
index ea7566f..0000000
--- a/src/gallium/winsys/vc5/drm/Makefile.sources
+++ /dev/null
@@ -1,3 +0,0 @@
-C_SOURCES := \
-	vc5_drm_public.h \
-	vc5_drm_winsys.c
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index 03ef74a..d55e4c7 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -398,7 +398,7 @@
 
    mtx_lock(&qdws->bo_handles_mutex);
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
       res = util_hash_table_get(qdws->bo_names, (void*)(uintptr_t)handle);
       if (res) {
          struct virgl_hw_res *r = NULL;
@@ -407,7 +407,7 @@
       }
    }
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       int r;
       r = drmPrimeFDToHandle(qdws->fd, whandle->handle, &handle);
       if (r) {
@@ -427,7 +427,7 @@
    if (!res)
       goto done;
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       res->bo_handle = handle;
    } else {
       memset(&open_arg, 0, sizeof(open_arg));
@@ -476,7 +476,7 @@
    if (!res)
        return FALSE;
 
-   if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
+   if (whandle->type == WINSYS_HANDLE_TYPE_SHARED) {
       if (!res->flinked) {
          memset(&flink, 0, sizeof(flink));
          flink.handle = res->bo_handle;
@@ -492,9 +492,9 @@
          mtx_unlock(&qdws->bo_handles_mutex);
       }
       whandle->handle = res->flink;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_KMS) {
       whandle->handle = res->bo_handle;
-   } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) {
+   } else if (whandle->type == WINSYS_HANDLE_TYPE_FD) {
       if (drmPrimeHandleToFD(qdws->fd, res->bo_handle, DRM_CLOEXEC, (int*)&whandle->handle))
             return FALSE;
       mtx_lock(&qdws->bo_handles_mutex);
@@ -811,8 +811,15 @@
 {
    struct virgl_drm_winsys *qdws;
    int ret;
+   int gl = 0;
    struct drm_virtgpu_getparam getparam = {0};
 
+   getparam.param = VIRTGPU_PARAM_3D_FEATURES;
+   getparam.value = (uint64_t)(uintptr_t)&gl;
+   ret = drmIoctl(drmFD, DRM_IOCTL_VIRTGPU_GETPARAM, &getparam);
+   if (ret < 0 || !gl)
+      return NULL;
+
    qdws = CALLOC_STRUCT(virgl_drm_winsys);
    if (!qdws)
       return NULL;
@@ -925,6 +932,10 @@
       int dup_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 
       vws = virgl_drm_winsys_create(dup_fd);
+      if (!vws) {
+         close(dup_fd);
+         goto unlock;
+      }
 
       pscreen = virgl_create_screen(vws);
       if (pscreen) {
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c
index adec26b..2134920 100644
--- a/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_socket.c
@@ -129,12 +129,14 @@
 int virgl_vtest_send_get_caps(struct virgl_vtest_winsys *vws,
                               struct virgl_drm_caps *caps)
 {
-   uint32_t get_caps_buf[VTEST_HDR_SIZE];
+   uint32_t get_caps_buf[VTEST_HDR_SIZE * 2];
    uint32_t resp_buf[VTEST_HDR_SIZE];
-
+   uint32_t caps_size = sizeof(struct virgl_caps_v2);
    int ret;
    get_caps_buf[VTEST_CMD_LEN] = 0;
-   get_caps_buf[VTEST_CMD_ID] = VCMD_GET_CAPS;
+   get_caps_buf[VTEST_CMD_ID] = VCMD_GET_CAPS2;
+   get_caps_buf[VTEST_CMD_LEN + 2] = 0;
+   get_caps_buf[VTEST_CMD_ID + 2] = VCMD_GET_CAPS;
 
    virgl_block_write(vws->sock_fd, &get_caps_buf, sizeof(get_caps_buf));
 
@@ -142,7 +144,27 @@
    if (ret <= 0)
       return 0;
 
-   ret = virgl_block_read(vws->sock_fd, &caps->caps, sizeof(struct virgl_caps_v1));
+   if (resp_buf[1] == 2) {
+       struct virgl_caps_v1 dummy;
+       uint32_t resp_size = resp_buf[0] ? resp_buf[0] - 1 : 0; /* no wrap on 0 */
+       uint32_t dummy_size = 0;
+       if (resp_size > caps_size) {
+	   dummy_size = resp_size - caps_size;
+	   resp_size = caps_size;
+       }
+       ret = virgl_block_read(vws->sock_fd, &caps->caps, resp_size);
+       /* Drain the surplus in chunks: it can exceed sizeof(dummy). */
+       for (uint32_t n; dummy_size && ret > 0; dummy_size -= n) {
+	   n = dummy_size < sizeof(dummy) ? dummy_size : (uint32_t)sizeof(dummy);
+	   ret = virgl_block_read(vws->sock_fd, &dummy, n);
+       }
+       /* now read back the pointless caps v1 we requested */
+       ret = virgl_block_read(vws->sock_fd, resp_buf, sizeof(resp_buf));
+       if (ret <= 0)
+	   return 0;
+       ret = virgl_block_read(vws->sock_fd, &dummy, sizeof(struct virgl_caps_v1));
+   } else
+       ret = virgl_block_read(vws->sock_fd, &caps->caps, sizeof(struct virgl_caps_v1));
 
    return 0;
 }
@@ -221,8 +243,10 @@
    vtest_hdr[VTEST_CMD_LEN] = VCMD_TRANSFER_HDR_SIZE;
    vtest_hdr[VTEST_CMD_ID] = vcmd;
 
+   /* The host expects the size in dwords so calculate the rounded up
+    * value here. */
    if (vcmd == VCMD_TRANSFER_PUT)
-      vtest_hdr[VTEST_CMD_LEN] += data_size + 3 / 4;
+      vtest_hdr[VTEST_CMD_LEN] += (data_size + 3) / 4;
 
    cmd[0] = handle;
    cmd[1] = level;
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
index 9a96c6e..d1fd605 100644
--- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
@@ -536,7 +536,7 @@
    res = virgl_vtest_winsys_resource_cache_create(vws,
                                                 PIPE_BUFFER,
                                                 PIPE_FORMAT_R8_UNORM,
-                                                PIPE_BIND_CUSTOM,
+                                                VIRGL_BIND_CUSTOM,
                                                 8, 1, 1, 0, 0, 0, 8);
 
    return (struct pipe_fence_handle *)res;
diff --git a/src/gallium/winsys/virgl/vtest/vtest_protocol.h b/src/gallium/winsys/virgl/vtest/vtest_protocol.h
index 86d197f..95bd8c1 100644
--- a/src/gallium/winsys/virgl/vtest/vtest_protocol.h
+++ b/src/gallium/winsys/virgl/vtest/vtest_protocol.h
@@ -47,6 +47,8 @@
 
 /* pass the process cmd line for debugging */
 #define VCMD_CREATE_RENDERER 8
+
+#define VCMD_GET_CAPS2 9
 /* get caps */
 /* 0 length cmd */
 /* resp VCMD_GET_CAPS + caps */
diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
index df20db4..b3d6ceb 100644
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -580,6 +580,14 @@
      GBM_FORMAT_ARGB2101010, __DRI_IMAGE_FORMAT_ARGB2101010,
      { 0x3ff00000, 0x000ffc00, 0x000003ff, 0xc0000000 },
    },
+   {
+     GBM_FORMAT_XBGR2101010, __DRI_IMAGE_FORMAT_XBGR2101010,
+     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0x00000000 },
+   },
+   {
+     GBM_FORMAT_ABGR2101010, __DRI_IMAGE_FORMAT_ABGR2101010,
+     { 0x000003ff, 0x000ffc00, 0x3ff00000, 0xc0000000 },
+   },
 };
 
 /* The two GBM_BO_FORMAT_[XA]RGB8888 formats alias the GBM_FORMAT_*
diff --git a/src/gbm/meson.build b/src/gbm/meson.build
index 7a60868..719f9c1 100644
--- a/src/gbm/meson.build
+++ b/src/gbm/meson.build
@@ -32,7 +32,6 @@
 deps_gbm = []
 incs_gbm = [
   include_directories('main'), inc_include, inc_src, inc_loader,
-  inc_wayland_drm,
 ]
 
 if with_dri2
@@ -45,8 +44,6 @@
   incs_gbm += inc_wayland_drm
 endif
 
-# TODO: wayland support (requires egl)
-
 libgbm = shared_library(
   'gbm',
   files_gbm,
diff --git a/src/glx/Makefile.am b/src/glx/Makefile.am
index 8f9d80c..d06ae29 100644
--- a/src/glx/Makefile.am
+++ b/src/glx/Makefile.am
@@ -24,10 +24,6 @@
 
 EXTRA_DIST = SConscript meson.build
 
-if HAVE_XF86VIDMODE
-EXTRA_DEFINES_XF86VIDMODE = -DXF86VIDMODE
-endif
-
 AM_CFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/include/GL/internal \
@@ -38,7 +34,6 @@
 	-I$(top_builddir)/src/mapi/glapi \
 	-I$(top_srcdir)/src/mapi/glapi \
 	$(VISIBILITY_CFLAGS) \
-	$(EXTRA_DEFINES_XF86VIDMODE) \
 	-D_REENTRANT \
 	-DDEFAULT_DRIVER_DIR=\"$(DRI_DRIVER_SEARCH_DIR)\" \
 	$(DEFINES) \
diff --git a/src/glx/SConscript b/src/glx/SConscript
index 8ce1771..051f55b 100644
--- a/src/glx/SConscript
+++ b/src/glx/SConscript
@@ -36,10 +36,7 @@
 env.PkgUseModules('X11')
 env.PkgUseModules('XCB')
 env.PkgUseModules('DRM')
-
-if env['HAVE_XF86VIDMODE']:
-    env.Append(CPPDEFINES = ['XF86VIDMODE'])
-    env.PkgUseModules('XF86VIDMODE')
+env.PkgUseModules('XF86VIDMODE')
 
 sources = [
     'clientattrib.c',
diff --git a/src/glx/drisw_glx.c b/src/glx/drisw_glx.c
index df2467a..a277710 100644
--- a/src/glx/drisw_glx.c
+++ b/src/glx/drisw_glx.c
@@ -28,10 +28,12 @@
 #include <dlfcn.h>
 #include "dri_common.h"
 #include "drisw_priv.h"
+#include <X11/extensions/shmproto.h>
+#include <assert.h>
 
 static Bool
-XCreateDrawable(struct drisw_drawable * pdp,
-                Display * dpy, XID drawable, int visualid)
+XCreateGCs(struct drisw_drawable * pdp,
+           Display * dpy, XID drawable, int visualid)
 {
    XGCValues gcvalues;
    long visMask;
@@ -56,15 +58,78 @@
    if (!pdp->visinfo || num_visuals == 0)
       return False;
 
-   /* create XImage */
-   pdp->ximage = XCreateImage(dpy,
-                              pdp->visinfo->visual,
-                              pdp->visinfo->depth,
-                              ZPixmap, 0,             /* format, offset */
-                              NULL,                   /* data */
-                              0, 0,                   /* width, height */
-                              32,                     /* bitmap_pad */
-                              0);                     /* bytes_per_line */
+   return True;
+}
+
+static int xshm_error = 0;
+static int xshm_opcode = -1;
+
+/**
+ * Xlib error handler: flags xshm_error when the failing request is an
+ * X_ShmAttach of the extension whose major opcode is xshm_opcode.
+ */
+static int
+handle_xerror(Display *dpy, XErrorEvent *event)
+{
+   (void) dpy;
+
+   assert(xshm_opcode != -1);
+
+   /* Every other error is ignored; the handler returns 0 either way. */
+   if (event->request_code == xshm_opcode && event->minor_code == X_ShmAttach)
+      xshm_error = 1;
+   return 0;
+}
+
+static Bool
+XCreateDrawable(struct drisw_drawable * pdp, int shmid, Display * dpy)
+{
+   if (pdp->ximage) {
+      XDestroyImage(pdp->ximage);
+      pdp->ximage = NULL;
+   }
+
+   if (!xshm_error && shmid >= 0) {
+      pdp->shminfo.shmid = shmid;
+      pdp->ximage = XShmCreateImage(dpy,
+                                    pdp->visinfo->visual,
+                                    pdp->visinfo->depth,
+                                    ZPixmap,              /* format */
+                                    NULL,                 /* data */
+                                    &pdp->shminfo,        /* shminfo */
+                                    0, 0);                /* width, height */
+      if (pdp->ximage != NULL) {
+         int (*old_handler)(Display *, XErrorEvent *);
+
+         /* dispatch pending errors */
+         XSync(dpy, False);
+
+         old_handler = XSetErrorHandler(handle_xerror);
+         /* This may trigger the X protocol error we're ready to catch: */
+         XShmAttach(dpy, &pdp->shminfo);
+         XSync(dpy, False);
+
+         if (xshm_error) {
+         /* we are on a remote display, this error is normal, don't print it */
+            XDestroyImage(pdp->ximage);
+            pdp->ximage = NULL;
+         }
+
+         (void) XSetErrorHandler(old_handler);
+      }
+   }
+
+   if (pdp->ximage == NULL) {
+      pdp->shminfo.shmid = -1;
+      pdp->ximage = XCreateImage(dpy,
+                                 pdp->visinfo->visual,
+                                 pdp->visinfo->depth,
+                                 ZPixmap, 0,             /* format, offset */
+                                 NULL,                   /* data */
+                                 0, 0,                   /* width, height */
+                                 32,                     /* bitmap_pad */
+                                 0);                     /* bytes_per_line */
+   }
 
   /**
    * swrast does not handle 24-bit depth with 24 bpp, so let X do the
@@ -79,7 +144,9 @@
 static void
 XDestroyDrawable(struct drisw_drawable * pdp, Display * dpy, XID drawable)
 {
-   XDestroyImage(pdp->ximage);
+   if (pdp->ximage)
+      XDestroyImage(pdp->ximage);
+
    free(pdp->visinfo);
 
    XFreeGC(dpy, pdp->gc);
@@ -133,9 +200,9 @@
 }
 
 static void
-swrastPutImage2(__DRIdrawable * draw, int op,
+swrastXPutImage(__DRIdrawable * draw, int op,
                 int x, int y, int w, int h, int stride,
-                char *data, void *loaderPrivate)
+                int shmid, char *data, void *loaderPrivate)
 {
    struct drisw_drawable *pdp = loaderPrivate;
    __GLXDRIdrawable *pdraw = &(pdp->base);
@@ -144,6 +211,11 @@
    XImage *ximage;
    GC gc;
 
+   if (!pdp->ximage || shmid != pdp->shminfo.shmid) {
+      if (!XCreateDrawable(pdp, shmid, dpy))
+         return;
+   }
+
    switch (op) {
    case __DRI_SWRAST_IMAGE_OP_DRAW:
       gc = pdp->gc;
@@ -156,24 +228,52 @@
    }
 
    drawable = pdraw->xDrawable;
-
    ximage = pdp->ximage;
-   ximage->data = data;
-   ximage->width = w;
-   ximage->height = h;
    ximage->bytes_per_line = stride ? stride : bytes_per_line(w * ximage->bits_per_pixel, 32);
+   ximage->data = data;
 
-   XPutImage(dpy, drawable, gc, ximage, 0, 0, x, y, w, h);
-
+   if (pdp->shminfo.shmid >= 0) {
+      ximage->width = ximage->bytes_per_line / ((ximage->bits_per_pixel + 7)/ 8);
+      ximage->height = h;
+      XShmPutImage(dpy, drawable, gc, ximage, 0, 0, x, y, w, h, False);
+      XSync(dpy, False);
+   } else {
+      ximage->width = w;
+      ximage->height = h;
+      XPutImage(dpy, drawable, gc, ximage, 0, 0, x, y, w, h);
+   }
    ximage->data = NULL;
 }
 
 static void
+swrastPutImageShm(__DRIdrawable * draw, int op,
+                  int x, int y, int w, int h, int stride,
+                  int shmid, char *shmaddr, unsigned offset,
+                  void *loaderPrivate)
+{
+   struct drisw_drawable *pdp = loaderPrivate;
+
+   pdp->shminfo.shmaddr = shmaddr;
+   swrastXPutImage(draw, op, x, y, w, h, stride, shmid,
+                   shmaddr + offset, loaderPrivate);
+}
+
+static void
+swrastPutImage2(__DRIdrawable * draw, int op,
+                int x, int y, int w, int h, int stride,
+                char *data, void *loaderPrivate)
+{
+   swrastXPutImage(draw, op, x, y, w, h, stride, -1,
+                   data, loaderPrivate);
+}
+
+static void
 swrastPutImage(__DRIdrawable * draw, int op,
                int x, int y, int w, int h,
                char *data, void *loaderPrivate)
 {
-   swrastPutImage2(draw, op, x, y, w, h, 0, data, loaderPrivate);
+   swrastXPutImage(draw, op, x, y, w, h, 0, -1,
+                   data, loaderPrivate);
 }
 
 static void
@@ -187,6 +287,11 @@
    Drawable readable;
    XImage *ximage;
 
+   if (!prp->ximage || prp->shminfo.shmid >= 0) {
+      if (!XCreateDrawable(prp, -1, dpy))
+         return;
+   }
+
    readable = pread->xDrawable;
 
    ximage = prp->ximage;
@@ -208,6 +313,49 @@
    swrastGetImage2(read, x, y, w, h, 0, data, loaderPrivate);
 }
 
+static void
+swrastGetImageShm(__DRIdrawable * read,
+                  int x, int y, int w, int h,
+                  int shmid, void *loaderPrivate)
+{
+   struct drisw_drawable *drawable_priv = loaderPrivate;
+   __GLXDRIdrawable *glx_draw = &drawable_priv->base;
+   Display *display = glx_draw->psc->dpy;
+   XImage *img;
+
+   /* (Re)create the XImage when none exists or it is tied to another shmid. */
+   if ((!drawable_priv->ximage || drawable_priv->shminfo.shmid != shmid) &&
+       !XCreateDrawable(drawable_priv, shmid, display))
+      return;
+
+   /* Describe a w x h image at the start of the segment (no offset). */
+   img = drawable_priv->ximage;
+   img->data = drawable_priv->shminfo.shmaddr;
+   img->width = w;
+   img->height = h;
+   img->bytes_per_line = bytes_per_line(w * img->bits_per_pixel, 32);
+
+   /* Fetch the rectangle at (x, y) from the X drawable, all planes (~0L). */
+   XShmGetImage(display, glx_draw->xDrawable, img, x, y, ~0L);
+}
+
+static const __DRIswrastLoaderExtension swrastLoaderExtension_shm = {
+   .base = {__DRI_SWRAST_LOADER, 4 },
+
+   .getDrawableInfo     = swrastGetDrawableInfo,
+   .putImage            = swrastPutImage,
+   .getImage            = swrastGetImage,
+   .putImage2           = swrastPutImage2,
+   .getImage2           = swrastGetImage2,
+   .putImageShm         = swrastPutImageShm,
+   .getImageShm         = swrastGetImageShm,
+};
+
+static const __DRIextension *loader_extensions_shm[] = {
+   &swrastLoaderExtension_shm.base,
+   NULL
+};
+
 static const __DRIswrastLoaderExtension swrastLoaderExtension = {
    .base = {__DRI_SWRAST_LOADER, 3 },
 
@@ -218,7 +366,7 @@
    .getImage2           = swrastGetImage2,
 };
 
-static const __DRIextension *loader_extensions[] = {
+static const __DRIextension *loader_extensions_noshm[] = {
    &swrastLoaderExtension.base,
    NULL
 };
@@ -527,7 +675,7 @@
    pdp->base.drawable = drawable;
    pdp->base.psc = &psc->base;
 
-   ret = XCreateDrawable(pdp, psc->base.dpy, xDrawable, modes->visualID);
+   ret = XCreateGCs(pdp, psc->base.dpy, xDrawable, modes->visualID);
    if (!ret) {
       free(pdp);
       return NULL;
@@ -661,6 +809,14 @@
    }
 }
 
+static int
+check_xshm(Display *dpy)
+{
+   int unused; /* scratch for the two trailing out-parameters */
+   /* Stores the "MIT-SHM" major opcode in xshm_opcode; callers treat 0 as absent. */
+   return XQueryExtension(dpy, "MIT-SHM", &xshm_opcode, &unused, &unused);
+}
+
 static struct glx_screen *
 driswCreateScreen(int screen, struct glx_display *priv)
 {
@@ -670,6 +826,7 @@
    struct drisw_screen *psc;
    struct glx_config *configs = NULL, *visuals = NULL;
    int i;
+   const __DRIextension **loader_extensions_local;
 
    psc = calloc(1, sizeof *psc);
    if (psc == NULL)
@@ -688,6 +845,11 @@
    if (extensions == NULL)
       goto handle_error;
 
+   if (!check_xshm(psc->base.dpy))
+      loader_extensions_local = loader_extensions_noshm;
+   else
+      loader_extensions_local = loader_extensions_shm;
+
    for (i = 0; extensions[i]; i++) {
       if (strcmp(extensions[i]->name, __DRI_CORE) == 0)
 	 psc->core = (__DRIcoreExtension *) extensions[i];
@@ -704,12 +866,12 @@
 
    if (psc->swrast->base.version >= 4) {
       psc->driScreen =
-         psc->swrast->createNewScreen2(screen, loader_extensions,
+         psc->swrast->createNewScreen2(screen, loader_extensions_local,
                                        extensions,
                                        &driver_configs, psc);
    } else {
       psc->driScreen =
-         psc->swrast->createNewScreen(screen, loader_extensions,
+         psc->swrast->createNewScreen(screen, loader_extensions_local,
                                       &driver_configs, psc);
    }
    if (psc->driScreen == NULL) {
diff --git a/src/glx/drisw_priv.h b/src/glx/drisw_priv.h
index 5d47900..a670da2 100644
--- a/src/glx/drisw_priv.h
+++ b/src/glx/drisw_priv.h
@@ -23,6 +23,8 @@
  * SOFTWARE.
  */
 
+#include <X11/extensions/XShm.h>
+
 struct drisw_display
 {
    __GLXDRIdisplay base;
@@ -62,6 +64,7 @@
    __DRIdrawable *driDrawable;
    XVisualInfo *visinfo;
    XImage *ximage;
+   XShmSegmentInfo shminfo;
 };
 
 _X_HIDDEN int
diff --git a/src/glx/glxcmds.c b/src/glx/glxcmds.c
index e8485ac..79e3503 100644
--- a/src/glx/glxcmds.c
+++ b/src/glx/glxcmds.c
@@ -46,11 +46,9 @@
 #include "util/debug.h"
 #else
 #include <sys/time.h>
-#ifdef XF86VIDMODE
 #include <X11/extensions/xf86vmode.h>
 #endif
 #endif
-#endif
 
 #include <X11/Xlib-xcb.h>
 #include <xcb/xcb.h>
@@ -937,6 +935,7 @@
    config->fbconfigID = (GLXFBConfigID) (GLX_DONT_CARE);
 
    config->swapMethod = GLX_DONT_CARE;
+   config->sRGBCapable = GLX_DONT_CARE;
 }
 
 #define MATCH_DONT_CARE( param )        \
@@ -2070,7 +2069,6 @@
 __glxGetMscRate(struct glx_screen *psc,
 		int32_t * numerator, int32_t * denominator)
 {
-#ifdef XF86VIDMODE
    XF86VidModeModeLine mode_line;
    int dot_clock;
    int i;
@@ -2117,8 +2115,6 @@
 
       return True;
    }
-   else
-#endif
 
    return False;
 }
@@ -2144,7 +2140,7 @@
 __glXGetMscRateOML(Display * dpy, GLXDrawable drawable,
                    int32_t * numerator, int32_t * denominator)
 {
-#if defined( GLX_DIRECT_RENDERING ) && defined( XF86VIDMODE )
+#if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)
    __GLXDRIdrawable *draw = GetGLXDRIDrawable(dpy, drawable);
 
    if (draw == NULL)
diff --git a/src/glx/glxext.c b/src/glx/glxext.c
index 5f23d37..ca3bf9d 100644
--- a/src/glx/glxext.c
+++ b/src/glx/glxext.c
@@ -402,8 +402,6 @@
 #endif
    }
 
-   config->sRGBCapable = GL_FALSE;
-
    /*
     ** Additional properties may be in a list at the end
     ** of the reply.  They are in pairs of property type
@@ -660,6 +658,8 @@
        */
       m->drawableType = GLX_WINDOW_BIT | GLX_PIXMAP_BIT | GLX_PBUFFER_BIT;
 #endif
+      /* Older X servers don't send this so we default it here. */
+      m->sRGBCapable = GL_FALSE;
        __glXInitializeVisualConfigFromTags(m, nprops, props,
                                           tagged_only, GL_TRUE);
       m->screen = screen;
diff --git a/src/glx/meson.build b/src/glx/meson.build
index 90ab552..f3bbcb4 100644
--- a/src/glx/meson.build
+++ b/src/glx/meson.build
@@ -137,10 +137,6 @@
   '-DDEFAULT_DRIVER_DIR="@0@"'.format(dri_search_path),
 ]
 
-if dep_xxf86vm.found()
-  gl_lib_cargs += '-DHAVE_XF86VIDMODE'
-endif
-
 libglx = static_library(
   'glx',
   [files_libglx, glx_generated],
@@ -157,24 +153,17 @@
   build_by_default : false,
 )
 
-# workaround for bug #2180
-dummy_c = custom_target(
-  'dummy_c',
-  output : 'dummy.c',
-  command : [prog_touch, '@OUTPUT@'],
-)
-
 if with_glx == 'dri'
   libgl = shared_library(
     gl_lib_name,
-    dummy_c,  # workaround for bug #2180
+    [],
     include_directories : [inc_common, inc_glapi, inc_loader, inc_gl_internal],
     link_with : [libglapi_static, libglapi],
     link_whole : libglx,
     link_args : [ld_args_bsymbolic, ld_args_gc_sections, extra_ld_args_libgl],
     dependencies : [
       dep_libdrm, dep_dl, dep_m, dep_thread, dep_x11, dep_xcb_glx, dep_xcb,
-      dep_x11_xcb, dep_xcb_dri2, dep_xext, dep_xfixes, dep_xdamage,
+      dep_x11_xcb, dep_xcb_dri2, dep_xext, dep_xfixes, dep_xdamage, dep_xxf86vm,
       extra_deps_libgl,
     ],
     version : gl_lib_version,
diff --git a/src/intel/Makefile.am b/src/intel/Makefile.am
index 3e098a7..8448640 100644
--- a/src/intel/Makefile.am
+++ b/src/intel/Makefile.am
@@ -71,6 +71,8 @@
 	isl/meson.build \
 	tools/intel_sanitize_gpu.c \
 	tools/intel_sanitize_gpu.in \
+	tools/intel_dump_gpu.c \
+	tools/intel_dump_gpu.in \
 	tools/meson.build \
 	vulkan/meson.build \
 	meson.build
diff --git a/src/intel/Makefile.compiler.am b/src/intel/Makefile.compiler.am
index af30a58..46711fe 100644
--- a/src/intel/Makefile.compiler.am
+++ b/src/intel/Makefile.compiler.am
@@ -50,8 +50,8 @@
 	common/libintel_common.la \
 	dev/libintel_dev.la \
 	$(top_builddir)/src/compiler/nir/libnir.la \
-	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/intel/isl/libisl.la \
+	$(top_builddir)/src/util/libmesautil.la \
 	$(PTHREAD_LIBS) \
 	$(DLOPEN_LIBS)
 
diff --git a/src/intel/Makefile.isl.am b/src/intel/Makefile.isl.am
index 52a71cf..f512944 100644
--- a/src/intel/Makefile.isl.am
+++ b/src/intel/Makefile.isl.am
@@ -82,6 +82,7 @@
 isl_tests_isl_surf_get_image_offset_test_LDADD = \
 	dev/libintel_dev.la \
 	isl/libisl.la \
+	$(top_builddir)/src/util/libmesautil.la \
 	-lm
 
 # ----------------------------------------------------------------------------
diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 91c71a8..5f6cd96 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -9,11 +9,15 @@
 
 COMMON_FILES = \
 	common/gen_clflush.h \
+	common/gen_batch_decoder.c \
 	common/gen_debug.c \
 	common/gen_debug.h \
 	common/gen_decoder.c \
 	common/gen_decoder.h \
+	common/gen_disasm.c \
+	common/gen_disasm.h \
 	common/gen_defines.h \
+	common/gen_gem.h \
 	common/gen_l3_config.c \
 	common/gen_l3_config.h \
 	common/gen_urb_config.c \
@@ -250,6 +254,9 @@
 VULKAN_WSI_X11_FILES := \
 	vulkan/anv_wsi_x11.c
 
+VULKAN_WSI_DISPLAY_FILES := \
+	vulkan/anv_wsi_display.c
+
 VULKAN_GEM_FILES := \
 	vulkan/anv_gem.c
 
diff --git a/src/intel/Makefile.tools.am b/src/intel/Makefile.tools.am
index a8685c2..0062408 100644
--- a/src/intel/Makefile.tools.am
+++ b/src/intel/Makefile.tools.am
@@ -21,13 +21,12 @@
 
 noinst_PROGRAMS += \
 	tools/aubinator \
-	tools/aubinator_error_decode
+	tools/aubinator_error_decode \
+	tools/error2aub
+
 
 tools_aubinator_SOURCES = \
 	tools/aubinator.c \
-	tools/disasm.c \
-	tools/gen_batch_decoder.c \
-	tools/gen_disasm.h \
 	tools/intel_aub.h
 
 tools_aubinator_CFLAGS = \
@@ -48,10 +47,7 @@
 
 
 tools_aubinator_error_decode_SOURCES = \
-	tools/aubinator_error_decode.c \
-	tools/disasm.c \
-	tools/gen_batch_decoder.c \
-	tools/gen_disasm.h
+	tools/aubinator_error_decode.c
 
 tools_aubinator_error_decode_LDADD = \
 	common/libintel_common.la \
@@ -65,3 +61,23 @@
 tools_aubinator_error_decode_CFLAGS = \
 	$(AM_CFLAGS) \
 	$(ZLIB_CFLAGS)
+
+
+tools_error2aub_SOURCES = \
+	tools/gen_context.h \
+	tools/gen8_context.h \
+	tools/gen10_context.h \
+	tools/aub_write.h \
+	tools/aub_write.c \
+	tools/error2aub.c
+
+tools_error2aub_CFLAGS = \
+	$(AM_CFLAGS) \
+	$(ZLIB_CFLAGS)
+
+tools_error2aub_LDADD = \
+	dev/libintel_dev.la \
+	$(PTHREAD_LIBS) \
+	$(DLOPEN_LIBS) \
+	$(ZLIB_LIBS) \
+	-lm
diff --git a/src/intel/Makefile.vulkan.am b/src/intel/Makefile.vulkan.am
index 0a84074..d511263 100644
--- a/src/intel/Makefile.vulkan.am
+++ b/src/intel/Makefile.vulkan.am
@@ -195,6 +195,21 @@
 VULKAN_LIB_DEPS += $(WAYLAND_CLIENT_LIBS)
 endif
 
+if HAVE_PLATFORM_DRM
+VULKAN_CPPFLAGS += \
+	-DVK_USE_PLATFORM_DISPLAY_KHR
+
+VULKAN_SOURCES += $(VULKAN_WSI_DISPLAY_FILES)
+endif
+
+if HAVE_XLIB_LEASE
+VULKAN_CPPFLAGS += \
+	-DVK_USE_PLATFORM_XLIB_XRANDR_EXT \
+	$(XCB_RANDR_CFLAGS) \
+	$(XLIB_RANDR_CFLAGS)
+VULKAN_LIB_DEPS += $(XCB_RANDR_LIBS)
+endif
+
 noinst_LTLIBRARIES += vulkan/libvulkan_common.la
 vulkan_libvulkan_common_la_SOURCES = $(VULKAN_SOURCES)
 vulkan_libvulkan_common_la_CFLAGS = $(VULKAN_CFLAGS)
diff --git a/src/intel/blorp/blorp.c b/src/intel/blorp/blorp.c
index 73f8c67..ffe2607 100644
--- a/src/intel/blorp/blorp.c
+++ b/src/intel/blorp/blorp.c
@@ -75,18 +75,6 @@
    if (format == ISL_FORMAT_UNSUPPORTED)
       format = surf->surf->format;
 
-   if (format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
-      /* Unfortunately, ISL_FORMAT_R24_UNORM_X8_TYPELESS it isn't supported as
-       * a render target, which would prevent us from blitting to 24-bit
-       * depth.  The miptree consists of 32 bits per pixel, arranged as 24-bit
-       * depth values interleaved with 8 "don't care" bits.  Since depth
-       * values don't require any blending, it doesn't matter how we interpret
-       * the bit pattern as long as we copy the right amount of data, so just
-       * map it as 8-bit BGRA.
-       */
-      format = ISL_FORMAT_B8G8R8A8_UNORM;
-   }
-
    info->surf = *surf->surf;
    info->addr = surf->addr;
 
@@ -217,7 +205,7 @@
 
    const unsigned *program =
       brw_compile_fs(compiler, blorp->driver_ctx, mem_ctx, wm_key,
-                     wm_prog_data, nir, NULL, -1, -1, false, use_repclear,
+                     wm_prog_data, nir, NULL, -1, -1, -1, false, use_repclear,
                      NULL, NULL);
 
    return program;
diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h
index 69c1b52..ee343a4 100644
--- a/src/intel/blorp/blorp.h
+++ b/src/intel/blorp/blorp.h
@@ -72,6 +72,11 @@
 
    /* This flag indicates that the blorp call should be predicated. */
    BLORP_BATCH_PREDICATE_ENABLE      = (1 << 1),
+
+   /* This flag indicates that blorp should *not* update the indirect clear
+    * color buffer.
+    */
+   BLORP_BATCH_NO_UPDATE_CLEAR_COLOR = (1 << 2),
 };
 
 struct blorp_batch {
@@ -114,6 +119,14 @@
    uint32_t tile_x_sa, tile_y_sa;
 };
 
+enum blorp_filter {
+   BLORP_FILTER_NONE,
+   BLORP_FILTER_NEAREST,
+   BLORP_FILTER_BILINEAR,
+   BLORP_FILTER_SAMPLE_0,
+   BLORP_FILTER_AVERAGE,
+};
+
 void
 blorp_blit(struct blorp_batch *batch,
            const struct blorp_surf *src_surf,
@@ -126,7 +139,8 @@
            float src_x1, float src_y1,
            float dst_x0, float dst_y0,
            float dst_x1, float dst_y1,
-           uint32_t filter, bool mirror_x, bool mirror_y);
+           enum blorp_filter filter,
+           bool mirror_x, bool mirror_y);
 
 void
 blorp_copy(struct blorp_batch *batch,
diff --git a/src/intel/blorp/blorp_blit.c b/src/intel/blorp/blorp_blit.c
index d3bf64c..7cc580a 100644
--- a/src/intel/blorp/blorp_blit.c
+++ b/src/intel/blorp/blorp_blit.c
@@ -22,9 +22,11 @@
  */
 
 #include "blorp_nir_builder.h"
+#include "compiler/nir/nir_format_convert.h"
 
 #include "blorp_priv.h"
 
+#include "util/format_rgb9e5.h"
 /* header-only include needed for _mesa_unorm_to_float and friends. */
 #include "mesa/main/format_utils.h"
 
@@ -68,9 +70,6 @@
 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
                          const struct brw_blorp_blit_prog_key *key)
 {
-    /* Blended and scaled blits never use pixel discard. */
-    assert(!key->use_kill || !(key->blend && key->blit_scaled));
-
 #define LOAD_INPUT(name, type)\
    v->v_##name = BLORP_CREATE_NIR_INPUT(b->shader, name, type);
 
@@ -176,8 +175,6 @@
    tex->is_shadow = false;
 
    /* Blorp only has one texture and it's bound at unit 0 */
-   tex->texture = NULL;
-   tex->sampler = NULL;
    tex->texture_index = 0;
    tex->sampler_index = 0;
 
@@ -282,25 +279,6 @@
    return &tex->dest.ssa;
 }
 
-static nir_ssa_def *
-nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
-                  uint32_t src_mask, int src_left_shift)
-{
-   nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
-
-   nir_ssa_def *shifted;
-   if (src_left_shift > 0) {
-      shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
-   } else if (src_left_shift < 0) {
-      shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
-   } else {
-      assert(src_left_shift == 0);
-      shifted = masked;
-   }
-
-   return nir_ior(b, dst, shifted);
-}
-
 /**
  * Emit code to compensate for the difference between Y and W tiling.
  *
@@ -898,49 +876,150 @@
 {
    assert(key->texture_data_type == nir_type_uint);
 
-   if (key->dst_bpc > key->src_bpc) {
-      nir_ssa_def *u = nir_ssa_undef(b, 1, 32);
-      nir_ssa_def *dst_chan[2] = { u, u };
-      unsigned shift = 0;
-      unsigned dst_idx = 0;
-      for (unsigned i = 0; i < 4; i++) {
-         nir_ssa_def *shifted = nir_ishl(b, nir_channel(b, color, i),
-                                            nir_imm_int(b, shift));
-         if (shift == 0) {
-            dst_chan[dst_idx] = shifted;
-         } else {
-            dst_chan[dst_idx] = nir_ior(b, dst_chan[dst_idx], shifted);
-         }
+   if (key->src_format == key->dst_format)
+      return color;
 
-         shift += key->src_bpc;
-         if (shift >= key->dst_bpc) {
-            dst_idx++;
-            shift = 0;
-         }
-      }
+   const struct isl_format_layout *src_fmtl =
+      isl_format_get_layout(key->src_format);
+   const struct isl_format_layout *dst_fmtl =
+      isl_format_get_layout(key->dst_format);
 
-      return nir_vec4(b, dst_chan[0], dst_chan[1], u, u);
+   /* They must be uint formats with the same bit size */
+   assert(src_fmtl->bpb == dst_fmtl->bpb);
+   assert(src_fmtl->channels.r.type == ISL_UINT);
+   assert(dst_fmtl->channels.r.type == ISL_UINT);
+
+   /* They must be in regular color formats (no luminance or alpha) */
+   assert(src_fmtl->channels.r.bits > 0);
+   assert(dst_fmtl->channels.r.bits > 0);
+
+   /* They must be in RGBA order (possibly with channels missing) */
+   assert(src_fmtl->channels.r.start_bit == 0);
+   assert(dst_fmtl->channels.r.start_bit == 0);
+
+   if (src_fmtl->bpb <= 32) {
+      const unsigned src_channels =
+         isl_format_get_num_channels(key->src_format);
+      const unsigned src_bits[4] = {
+         src_fmtl->channels.r.bits,
+         src_fmtl->channels.g.bits,
+         src_fmtl->channels.b.bits,
+         src_fmtl->channels.a.bits,
+      };
+      const unsigned dst_channels =
+         isl_format_get_num_channels(key->dst_format);
+      const unsigned dst_bits[4] = {
+         dst_fmtl->channels.r.bits,
+         dst_fmtl->channels.g.bits,
+         dst_fmtl->channels.b.bits,
+         dst_fmtl->channels.a.bits,
+      };
+      nir_ssa_def *packed =
+         nir_format_pack_uint_unmasked(b, color, src_bits, src_channels);
+      color = nir_format_unpack_uint(b, packed, dst_bits, dst_channels);
    } else {
-      assert(key->dst_bpc < key->src_bpc);
+      const unsigned src_bpc = src_fmtl->channels.r.bits;
+      const unsigned dst_bpc = dst_fmtl->channels.r.bits;
 
-      nir_ssa_def *mask = nir_imm_int(b, ~0u >> (32 - key->dst_bpc));
+      assert(src_fmtl->channels.g.bits == 0 ||
+             src_fmtl->channels.g.bits == src_fmtl->channels.r.bits);
+      assert(src_fmtl->channels.b.bits == 0 ||
+             src_fmtl->channels.b.bits == src_fmtl->channels.r.bits);
+      assert(src_fmtl->channels.a.bits == 0 ||
+             src_fmtl->channels.a.bits == src_fmtl->channels.r.bits);
+      assert(dst_fmtl->channels.g.bits == 0 ||
+             dst_fmtl->channels.g.bits == dst_fmtl->channels.r.bits);
+      assert(dst_fmtl->channels.b.bits == 0 ||
+             dst_fmtl->channels.b.bits == dst_fmtl->channels.r.bits);
+      assert(dst_fmtl->channels.a.bits == 0 ||
+             dst_fmtl->channels.a.bits == dst_fmtl->channels.r.bits);
 
-      nir_ssa_def *dst_chan[4];
-      unsigned src_idx = 0;
-      unsigned shift = 0;
-      for (unsigned i = 0; i < 4; i++) {
-         dst_chan[i] = nir_iand(b, nir_ushr(b, nir_channel(b, color, src_idx),
-                                               nir_imm_int(b, shift)),
-                                   mask);
-         shift += key->dst_bpc;
-         if (shift >= key->src_bpc) {
-            src_idx++;
-            shift = 0;
-         }
-      }
+      /* Restrict to only the channels we actually have */
+      const unsigned src_channels =
+         isl_format_get_num_channels(key->src_format);
+      color = nir_channels(b, color, (1 << src_channels) - 1);
 
-      return nir_vec4(b, dst_chan[0], dst_chan[1], dst_chan[2], dst_chan[3]);
+      color = nir_format_bitcast_uint_vec_unmasked(b, color, src_bpc, dst_bpc);
    }
+
+   /* Blorp likes to assume that colors are vec4s */
+   nir_ssa_def *u = nir_ssa_undef(b, 1, 32);
+   nir_ssa_def *chans[4] = { u, u, u, u };
+   for (unsigned i = 0; i < color->num_components; i++)
+      chans[i] = nir_channel(b, color, i);
+   return nir_vec4(b, chans[0], chans[1], chans[2], chans[3]);
+}
+
+static nir_ssa_def *
+select_color_channel(struct nir_builder *b, nir_ssa_def *color,
+                     nir_alu_type data_type,
+                     enum isl_channel_select chan)
+{
+   if (chan == ISL_CHANNEL_SELECT_ZERO) {
+      return nir_imm_int(b, 0);
+   } else if (chan == ISL_CHANNEL_SELECT_ONE) {
+      switch (data_type) {
+      case nir_type_int:
+      case nir_type_uint:
+         return nir_imm_int(b, 1);
+      case nir_type_float:
+         return nir_imm_float(b, 1);
+      default:
+         unreachable("Invalid data type");
+      }
+   } else {
+      assert((unsigned)(chan - ISL_CHANNEL_SELECT_RED) < 4);
+      return nir_channel(b, color, chan - ISL_CHANNEL_SELECT_RED);
+   }
+}
+
+static nir_ssa_def *
+swizzle_color(struct nir_builder *b, nir_ssa_def *color,
+              struct isl_swizzle swizzle, nir_alu_type data_type)
+{
+   return nir_vec4(b,
+                   select_color_channel(b, color, data_type, swizzle.r),
+                   select_color_channel(b, color, data_type, swizzle.g),
+                   select_color_channel(b, color, data_type, swizzle.b),
+                   select_color_channel(b, color, data_type, swizzle.a));
+}
+
+static nir_ssa_def *
+convert_color(struct nir_builder *b, nir_ssa_def *color,
+              const struct brw_blorp_blit_prog_key *key)
+{
+   /* All of our color conversions end up generating a single-channel color
+    * value that we need to write out.
+    */
+   nir_ssa_def *value;
+
+   if (key->dst_format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
+      /* The destination image is bound as R32_UINT but the data needs to be
+       * in R24_UNORM_X8_TYPELESS.  The bottom 24 are the actual data and the
+       * top 8 need to be zero.  We can accomplish this by simply multiplying
+       * by a factor to scale things down.
+       */
+      unsigned factor = (1 << 24) - 1;
+      value = nir_fsat(b, nir_channel(b, color, 0));
+      value = nir_f2i32(b, nir_fmul(b, value, nir_imm_float(b, factor)));
+   } else if (key->dst_format == ISL_FORMAT_L8_UNORM_SRGB) {
+      value = nir_format_linear_to_srgb(b, nir_channel(b, color, 0));
+   } else if (key->dst_format == ISL_FORMAT_R8G8B8_UNORM_SRGB) {
+      value = nir_format_linear_to_srgb(b, color);
+   } else if (key->dst_format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
+      value = nir_format_pack_r9g9b9e5(b, color);
+   } else {
+      unreachable("Unsupported format conversion");
+   }
+
+   nir_ssa_def *out_comps[4];
+   for (unsigned i = 0; i < 4; i++) {
+      if (i < value->num_components)
+         out_comps[i] = nir_channel(b, value, i);
+      else
+         out_comps[i] = nir_ssa_undef(b, 1, 32);
+   }
+   return nir_vec(b, out_comps, 4);
 }
 
 /**
@@ -1100,18 +1179,6 @@
       assert(key->persample_msaa_dispatch);
    }
 
-   if (key->blend) {
-      /* We are blending, which means we won't have an opportunity to
-       * translate the tiling and sample count for the texture surface.  So
-       * the surface state for the texture must be configured with the correct
-       * tiling and sample count.
-       */
-      assert(!key->src_tiled_w);
-      assert(key->tex_samples == key->src_samples);
-      assert(key->tex_layout == key->src_layout);
-      assert(key->tex_samples > 0);
-   }
-
    if (key->persample_msaa_dispatch) {
       /* It only makes sense to do persample dispatch if the render target is
        * configured as multisampled.
@@ -1187,10 +1254,8 @@
     * If we need to kill pixels that are outside the destination rectangle,
     * now is the time to do it.
     */
-   if (key->use_kill) {
-      assert(!(key->blend && key->blit_scaled));
+   if (key->use_kill)
       blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
-   }
 
    src_pos = blorp_blit_apply_transform(&b, nir_i2f32(&b, dst_pos), &v);
    if (dst_pos->num_components == 3) {
@@ -1213,7 +1278,82 @@
     * that we want to texture from.  Exception: if we are blending, then S is
     * irrelevant, because we are going to fetch all samples.
     */
-   if (key->blend && !key->blit_scaled) {
+   switch (key->filter) {
+   case BLORP_FILTER_NONE:
+   case BLORP_FILTER_NEAREST:
+   case BLORP_FILTER_SAMPLE_0:
+      /* We're going to use texelFetch, so we need integers */
+      if (src_pos->num_components == 2) {
+         src_pos = nir_f2i32(&b, src_pos);
+      } else {
+         assert(src_pos->num_components == 3);
+         src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i32(&b, src_pos), 0),
+                                nir_channel(&b, nir_f2i32(&b, src_pos), 1),
+                                nir_channel(&b, src_pos, 2));
+      }
+
+      /* We aren't blending, which means we just want to fetch a single
+       * sample from the source surface.  The address that we want to fetch
+       * from is related to the X, Y and S values according to the formula:
+       *
+       * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
+       *
+       * If the actual tiling and sample count of the source surface are
+       * not the same as the configuration of the texture, then we need to
+       * adjust the coordinates to compensate for the difference.
+       */
+      if (tex_tiled_w != key->src_tiled_w ||
+          key->tex_samples != key->src_samples ||
+          key->tex_layout != key->src_layout) {
+         src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
+                                         key->src_layout);
+         /* Now (X, Y, S) = detile(src_tiling, offset) */
+         if (tex_tiled_w != key->src_tiled_w)
+            src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
+         /* Now (X, Y, S) = detile(tex_tiling, offset) */
+         src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
+                                         key->tex_layout);
+      }
+
+      if (key->need_src_offset)
+         src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset));
+
+      /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
+       *
+       * In other words: X, Y, and S now contain values which, when passed to
+       * the texturing unit, will cause data to be read from the correct
+       * memory location.  So we can fetch the texel now.
+       */
+      if (key->src_samples == 1) {
+         color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
+      } else {
+         nir_ssa_def *mcs = NULL;
+         if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
+            mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos);
+
+         color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
+      }
+      break;
+
+   case BLORP_FILTER_BILINEAR:
+      assert(!key->src_tiled_w);
+      assert(key->tex_samples == key->src_samples);
+      assert(key->tex_layout == key->src_layout);
+
+      if (key->src_samples == 1) {
+         color = blorp_nir_tex(&b, &v, key, src_pos);
+      } else {
+         assert(!key->use_kill);
+         color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples,
+                                                 key, &v);
+      }
+      break;
+
+   case BLORP_FILTER_AVERAGE:
+      assert(!key->src_tiled_w);
+      assert(key->tex_samples == key->src_samples);
+      assert(key->tex_layout == key->src_layout);
+
       /* Resolves (effecively) use texelFetch, so we need integers and we
        * don't care about the sample index if we got one.
        */
@@ -1238,69 +1378,29 @@
                                                 key->tex_aux_usage,
                                                 key->texture_data_type);
       }
-   } else if (key->blend && key->blit_scaled) {
-      assert(!key->use_kill);
-      color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
-   } else {
-      if (key->bilinear_filter) {
-         color = blorp_nir_tex(&b, &v, key, src_pos);
-      } else {
-         /* We're going to use texelFetch, so we need integers */
-         if (src_pos->num_components == 2) {
-            src_pos = nir_f2i32(&b, src_pos);
-         } else {
-            assert(src_pos->num_components == 3);
-            src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i32(&b, src_pos), 0),
-                                   nir_channel(&b, nir_f2i32(&b, src_pos), 1),
-                                   nir_channel(&b, src_pos, 2));
-         }
+      break;
 
-         /* We aren't blending, which means we just want to fetch a single
-          * sample from the source surface.  The address that we want to fetch
-          * from is related to the X, Y and S values according to the formula:
-          *
-          * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
-          *
-          * If the actual tiling and sample count of the source surface are
-          * not the same as the configuration of the texture, then we need to
-          * adjust the coordinates to compensate for the difference.
-          */
-         if (tex_tiled_w != key->src_tiled_w ||
-             key->tex_samples != key->src_samples ||
-             key->tex_layout != key->src_layout) {
-            src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
-                                            key->src_layout);
-            /* Now (X, Y, S) = detile(src_tiling, offset) */
-            if (tex_tiled_w != key->src_tiled_w)
-               src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
-            /* Now (X, Y, S) = detile(tex_tiling, offset) */
-            src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
-                                            key->tex_layout);
-         }
-
-         if (key->need_src_offset)
-            src_pos = nir_iadd(&b, src_pos, nir_load_var(&b, v.v_src_offset));
-
-         /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
-          *
-          * In other words: X, Y, and S now contain values which, when passed to
-          * the texturing unit, will cause data to be read from the correct
-          * memory location.  So we can fetch the texel now.
-          */
-         if (key->src_samples == 1) {
-            color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
-         } else {
-            nir_ssa_def *mcs = NULL;
-            if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
-               mcs = blorp_blit_txf_ms_mcs(&b, &v, src_pos);
-
-            color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
-         }
-      }
+   default:
+      unreachable("Invalid blorp filter");
    }
 
-   if (key->dst_bpc != key->src_bpc)
+   if (!isl_swizzle_is_identity(key->src_swizzle)) {
+      color = swizzle_color(&b, color, key->src_swizzle,
+                            key->texture_data_type);
+   }
+
+   if (!isl_swizzle_is_identity(key->dst_swizzle)) {
+      color = swizzle_color(&b, color, isl_swizzle_invert(key->dst_swizzle),
+                            nir_type_int);
+   }
+
+   if (key->format_bit_cast) {
+      assert(isl_swizzle_is_identity(key->src_swizzle));
+      assert(isl_swizzle_is_identity(key->dst_swizzle));
       color = bit_cast_color(&b, color, key);
+   } else if (key->dst_format) {
+      color = convert_color(&b, color, key);
+   }
 
    if (key->dst_rgb) {
       /* The destination image is bound as a red texture three times as wide
@@ -1569,6 +1669,56 @@
    struct blt_axis x, y;
 };
 
+/* Return the R-only format with the red channel type and bits of format. */
+static enum isl_format
+get_red_format_for_rgb_format(enum isl_format format)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+   switch (fmtl->channels.r.bits) {
+   case 8:
+      switch (fmtl->channels.r.type) {
+      case ISL_UNORM:
+         return ISL_FORMAT_R8_UNORM;
+      case ISL_SNORM:
+         return ISL_FORMAT_R8_SNORM;
+      case ISL_UINT:
+         return ISL_FORMAT_R8_UINT;
+      case ISL_SINT:
+         return ISL_FORMAT_R8_SINT;
+      default:
+         unreachable("Invalid 8-bit RGB channel type");
+      }
+   case 16:
+      switch (fmtl->channels.r.type) {
+      case ISL_UNORM:
+         return ISL_FORMAT_R16_UNORM;
+      case ISL_SNORM:
+         return ISL_FORMAT_R16_SNORM;
+      case ISL_SFLOAT:
+         return ISL_FORMAT_R16_FLOAT;
+      case ISL_UINT:
+         return ISL_FORMAT_R16_UINT;
+      case ISL_SINT:
+         return ISL_FORMAT_R16_SINT;
+      default:
+         unreachable("Invalid 16-bit RGB channel type");
+      }
+   case 32:
+      switch (fmtl->channels.r.type) {
+      case ISL_SFLOAT:
+         return ISL_FORMAT_R32_FLOAT;
+      case ISL_UINT:
+         return ISL_FORMAT_R32_UINT;
+      case ISL_SINT:
+         return ISL_FORMAT_R32_SINT;
+      default:
+         unreachable("Invalid 32-bit RGB channel type");
+      }
+   default:
+      unreachable("Invalid number of red channel bits");
+   }
+}
+
 void
 surf_fake_rgb_with_red(const struct isl_device *isl_dev,
                        struct brw_blorp_surface_info *info)
@@ -1579,26 +1729,9 @@
    info->surf.phys_level0_sa.width *= 3;
    info->tile_x_sa *= 3;
 
-   enum isl_format red_format;
-   switch (info->view.format) {
-   case ISL_FORMAT_R8G8B8_UNORM:
-      red_format = ISL_FORMAT_R8_UNORM;
-      break;
-   case ISL_FORMAT_R8G8B8_UINT:
-      red_format = ISL_FORMAT_R8_UINT;
-      break;
-   case ISL_FORMAT_R16G16B16_UNORM:
-      red_format = ISL_FORMAT_R16_UNORM;
-      break;
-   case ISL_FORMAT_R16G16B16_UINT:
-      red_format = ISL_FORMAT_R16_UINT;
-      break;
-   case ISL_FORMAT_R32G32B32_UINT:
-      red_format = ISL_FORMAT_R32_UINT;
-      break;
-   default:
-      unreachable("Invalid RGB copy destination format");
-   }
+   enum isl_format red_format =
+      get_red_format_for_rgb_format(info->view.format);
+
    assert(isl_format_get_layout(red_format)->channels.r.type ==
           isl_format_get_layout(info->view.format)->channels.r.type);
    assert(isl_format_get_layout(red_format)->channels.r.bits ==
@@ -1817,8 +1950,8 @@
 
    params->num_samples = params->dst.surf.samples;
 
-   if ((wm_prog_key->bilinear_filter ||
-        (wm_prog_key->blend && !wm_prog_key->blit_scaled)) &&
+   if ((wm_prog_key->filter == BLORP_FILTER_AVERAGE ||
+        wm_prog_key->filter == BLORP_FILTER_BILINEAR) &&
        batch->blorp->isl_dev->info->gen <= 6) {
       /* Gen4-5 don't support non-normalized texture coordinates */
       wm_prog_key->src_coords_normalized = true;
@@ -1839,10 +1972,47 @@
       params->x0 *= 3;
       params->x1 *= 3;
 
+      /* If it happens to be sRGB, we need to force a conversion */
+      if (params->dst.view.format == ISL_FORMAT_R8G8B8_UNORM_SRGB)
+         wm_prog_key->dst_format = ISL_FORMAT_R8G8B8_UNORM_SRGB;
+
       surf_fake_rgb_with_red(batch->blorp->isl_dev, &params->dst);
 
       wm_prog_key->dst_rgb = true;
       wm_prog_key->need_dst_offset = true;
+   } else if (isl_format_is_rgbx(params->dst.view.format)) {
+      /* We can handle RGBX formats easily enough by treating them as RGBA */
+      params->dst.view.format =
+         isl_format_rgbx_to_rgba(params->dst.view.format);
+   } else if (params->dst.view.format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
+      wm_prog_key->dst_format = params->dst.view.format;
+      params->dst.view.format = ISL_FORMAT_R32_UINT;
+   } else if (params->dst.view.format == ISL_FORMAT_A4B4G4R4_UNORM) {
+      params->dst.view.swizzle =
+         isl_swizzle_compose(params->dst.view.swizzle,
+                             ISL_SWIZZLE(ALPHA, RED, GREEN, BLUE));
+      params->dst.view.format = ISL_FORMAT_B4G4R4A4_UNORM;
+   } else if (params->dst.view.format == ISL_FORMAT_L8_UNORM_SRGB) {
+      wm_prog_key->dst_format = params->dst.view.format;
+      params->dst.view.format = ISL_FORMAT_R8_UNORM;
+   } else if (params->dst.view.format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
+      wm_prog_key->dst_format = params->dst.view.format;
+      params->dst.view.format = ISL_FORMAT_R32_UINT;
+   }
+
+   if (devinfo->gen <= 7 && !devinfo->is_haswell &&
+       !isl_swizzle_is_identity(params->src.view.swizzle)) {
+      wm_prog_key->src_swizzle = params->src.view.swizzle;
+      params->src.view.swizzle = ISL_SWIZZLE_IDENTITY;
+   } else {
+      wm_prog_key->src_swizzle = ISL_SWIZZLE_IDENTITY;
+   }
+
+   if (!isl_swizzle_supports_rendering(devinfo, params->dst.view.swizzle)) {
+      wm_prog_key->dst_swizzle = params->dst.view.swizzle;
+      params->dst.view.swizzle = ISL_SWIZZLE_IDENTITY;
+   } else {
+      wm_prog_key->dst_swizzle = ISL_SWIZZLE_IDENTITY;
    }
 
    if (params->src.tile_x_sa || params->src.tile_y_sa) {
@@ -2059,7 +2229,8 @@
            float src_x1, float src_y1,
            float dst_x0, float dst_y0,
            float dst_x1, float dst_y1,
-           GLenum filter, bool mirror_x, bool mirror_y)
+           enum blorp_filter filter,
+           bool mirror_x, bool mirror_y)
 {
    struct blorp_params params;
    blorp_params_init(&params);
@@ -2079,6 +2250,17 @@
       }
    }
 
+   /* ISL_FORMAT_R24_UNORM_X8_TYPELESS isn't supported as a render target,
+    * so rendering to it requires shader math.  Blitting Z24X8 to Z24X8 is
+    * fairly common though, so we'd like to avoid that.  Since we don't need
+    * to blend depth values, we can simply pick a renderable format with the
+    * right number of bits-per-pixel, like 8-bit BGRA.
+    */
+   if (dst_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS &&
+       src_surf->surf->format == ISL_FORMAT_R24_UNORM_X8_TYPELESS) {
+      src_format = dst_format = ISL_FORMAT_B8G8R8A8_UNORM;
+   }
+
    brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
                                src_layer, src_format, false);
    brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
@@ -2088,14 +2270,10 @@
    params.dst.view.swizzle = dst_swizzle;
 
    struct brw_blorp_blit_prog_key wm_prog_key = {
-      .shader_type = BLORP_SHADER_TYPE_BLIT
+      .shader_type = BLORP_SHADER_TYPE_BLIT,
+      .filter = filter,
    };
 
-   /* Scaled blitting or not. */
-   wm_prog_key.blit_scaled =
-      ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
-       (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
-
    /* Scaling factors used for bilinear filtering in multisample scaled
     * blits.
     */
@@ -2105,27 +2283,6 @@
       wm_prog_key.x_scale = 2.0f;
    wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale;
 
-   if (filter == GL_LINEAR &&
-       params.src.surf.samples <= 1 && params.dst.surf.samples <= 1) {
-      wm_prog_key.bilinear_filter = true;
-   }
-
-   if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 &&
-       (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 &&
-       !isl_format_has_int_channel(params.src.surf.format) &&
-       params.src.surf.samples > 1 && params.dst.surf.samples <= 1) {
-      /* We are downsampling a non-integer color buffer, so blend.
-       *
-       * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
-       *
-       *    "If the source formats are integer types or stencil values, a
-       *    single sample's value is selected for each pixel."
-       *
-       * This implies we should not blend in that case.
-       */
-      wm_prog_key.blend = true;
-   }
-
    params.wm_inputs.rect_grid.x1 =
       minify(params.src.surf.logical_level0_px.width, src_level) *
       wm_prog_key.x_scale - 1.0f;
@@ -2269,80 +2426,17 @@
    case ISL_FORMAT_R32_SNORM:
       return ISL_FORMAT_R32_UINT;
 
+   case ISL_FORMAT_B10G10R10A2_UNORM:
+   case ISL_FORMAT_B10G10R10A2_UNORM_SRGB:
+   case ISL_FORMAT_R10G10B10A2_UNORM:
+   case ISL_FORMAT_R10G10B10A2_UINT:
+      return ISL_FORMAT_R10G10B10A2_UINT;
+
    default:
       unreachable("Not a compressible format");
    }
 }
 
-/* Takes an isl_color_value and returns a color value that is the original
- * color value only bit-casted to a UINT format.  This value, together with
- * the format from get_ccs_compatible_uint_format, will yield the same bit
- * value as the original color and format.
- */
-static union isl_color_value
-bitcast_color_value_to_uint(union isl_color_value color,
-                            const struct isl_format_layout *fmtl)
-{
-   /* All CCS formats have the same number of bits in each channel */
-   const struct isl_channel_layout *chan = &fmtl->channels.r;
-
-   union isl_color_value bits;
-   switch (chan->type) {
-   case ISL_UINT:
-   case ISL_SINT:
-      /* Hardware will ignore the high bits so there's no need to cast */
-      bits = color;
-      break;
-
-   case ISL_UNORM:
-      for (unsigned i = 0; i < 4; i++)
-         bits.u32[i] = _mesa_float_to_unorm(color.f32[i], chan->bits);
-      break;
-
-   case ISL_SNORM:
-      for (unsigned i = 0; i < 4; i++)
-         bits.i32[i] = _mesa_float_to_snorm(color.f32[i], chan->bits);
-      break;
-
-   case ISL_SFLOAT:
-      switch (chan->bits) {
-      case 16:
-         for (unsigned i = 0; i < 4; i++)
-            bits.u32[i] = _mesa_float_to_half(color.f32[i]);
-         break;
-
-      case 32:
-         bits = color;
-         break;
-
-      default:
-         unreachable("Invalid float format size");
-      }
-      break;
-
-   default:
-      unreachable("Invalid channel type");
-   }
-
-   switch (fmtl->format) {
-   case ISL_FORMAT_B8G8R8A8_UNORM:
-   case ISL_FORMAT_B8G8R8A8_UNORM_SRGB:
-   case ISL_FORMAT_B8G8R8X8_UNORM:
-   case ISL_FORMAT_B8G8R8X8_UNORM_SRGB: {
-      /* If it's a BGRA format, we need to swap blue and red */
-      uint32_t tmp = bits.u32[0];
-      bits.u32[0] = bits.u32[2];
-      bits.u32[2] = tmp;
-      break;
-   }
-
-   default:
-      break; /* Nothing to do */
-   }
-
-   return bits;
-}
-
 void
 blorp_surf_convert_to_uncompressed(const struct isl_device *isl_dev,
                                    struct brw_blorp_surface_info *info,
@@ -2424,6 +2518,7 @@
 
    struct brw_blorp_blit_prog_key wm_prog_key = {
       .shader_type = BLORP_SHADER_TYPE_BLIT,
+      .filter = BLORP_FILTER_NONE,
       .need_src_offset = src_surf->tile_x_sa || src_surf->tile_y_sa,
       .need_dst_offset = dst_surf->tile_x_sa || dst_surf->tile_y_sa,
    };
@@ -2478,8 +2573,11 @@
       assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info,
                                               linear_src_format,
                                               params.src.view.format));
-      params.src.clear_color =
-         bitcast_color_value_to_uint(params.src.clear_color, src_fmtl);
+      uint32_t packed[4];
+      isl_color_value_pack(&params.src.clear_color,
+                           linear_src_format, packed);
+      isl_color_value_unpack(&params.src.clear_color,
+                             params.src.view.format, packed);
    }
 
    if (params.dst.aux_usage == ISL_AUX_USAGE_CCS_E) {
@@ -2489,14 +2587,33 @@
       assert(isl_formats_are_ccs_e_compatible(batch->blorp->isl_dev->info,
                                               linear_dst_format,
                                               params.dst.view.format));
-      params.dst.clear_color =
-         bitcast_color_value_to_uint(params.dst.clear_color, dst_fmtl);
+      uint32_t packed[4];
+      isl_color_value_pack(&params.dst.clear_color,
+                           linear_dst_format, packed);
+      isl_color_value_unpack(&params.dst.clear_color,
+                             params.dst.view.format, packed);
    }
 
-   wm_prog_key.src_bpc =
-      isl_format_get_layout(params.src.view.format)->channels.r.bits;
-   wm_prog_key.dst_bpc =
-      isl_format_get_layout(params.dst.view.format)->channels.r.bits;
+   if (params.src.view.format != params.dst.view.format) {
+      enum isl_format src_cast_format = params.src.view.format;
+      enum isl_format dst_cast_format = params.dst.view.format;
+
+      /* The BLORP bitcast code gets confused by RGB formats.  Just treat them
+       * as RGBA and then everything will be happy.  This is perfectly safe
+       * because BLORP likes to treat things as if they have vec4 colors all
+       * the time anyway.
+       */
+      if (isl_format_is_rgb(src_cast_format))
+         src_cast_format = isl_format_rgb_to_rgba(src_cast_format);
+      if (isl_format_is_rgb(dst_cast_format))
+         dst_cast_format = isl_format_rgb_to_rgba(dst_cast_format);
+
+      if (src_cast_format != dst_cast_format) {
+         wm_prog_key.format_bit_cast = true;
+         wm_prog_key.src_format = src_cast_format;
+         wm_prog_key.dst_format = dst_cast_format;
+      }
+   }
 
    if (src_fmtl->bw > 1 || src_fmtl->bh > 1) {
       blorp_surf_convert_to_uncompressed(batch->blorp->isl_dev, &params.src,
diff --git a/src/intel/blorp/blorp_genX_exec.h b/src/intel/blorp/blorp_genX_exec.h
index c5ca0c7..6da2485 100644
--- a/src/intel/blorp/blorp_genX_exec.h
+++ b/src/intel/blorp/blorp_genX_exec.h
@@ -59,6 +59,10 @@
 static void *
 blorp_alloc_vertex_buffer(struct blorp_batch *batch, uint32_t size,
                           struct blorp_address *addr);
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs);
 
 #if GEN_GEN >= 8
 static struct blorp_address
@@ -334,19 +338,22 @@
    uint32_t num_vbs = 2;
    memset(vb, 0, sizeof(vb));
 
-   struct blorp_address addr;
+   struct blorp_address addrs[2] = {};
    uint32_t size;
-   blorp_emit_vertex_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 0, addr, size, 3 * sizeof(float));
+   blorp_emit_vertex_data(batch, params, &addrs[0], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 0, addrs[0], size,
+                                  3 * sizeof(float));
 
-   blorp_emit_input_varying_data(batch, params, &addr, &size);
-   blorp_fill_vertex_buffer_state(batch, vb, 1, addr, size, 0);
+   blorp_emit_input_varying_data(batch, params, &addrs[1], &size);
+   blorp_fill_vertex_buffer_state(batch, vb, 1, addrs[1], size, 0);
 
    const unsigned num_dwords = 1 + num_vbs * GENX(VERTEX_BUFFER_STATE_length);
    uint32_t *dw = blorp_emitn(batch, GENX(3DSTATE_VERTEX_BUFFERS), num_dwords);
    if (!dw)
       return;
 
+   blorp_vf_invalidate_for_vb_48b_transitions(batch, addrs, num_vbs);
+
    for (unsigned i = 0; i < num_vbs; i++) {
       GENX(VERTEX_BUFFER_STATE_pack)(batch, dw, &vb[i]);
       dw += GENX(VERTEX_BUFFER_STATE_length);
@@ -755,18 +762,45 @@
          ps.BindingTableEntryCount = 1;
       }
 
-      if (prog_data) {
-         ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
+      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests
+       * disabling prefetching of binding tables on A0 and B0 steppings.
+       * TODO: Revisit this WA on C0 stepping.
+       */
+      if (GEN_GEN == 11)
+         ps.BindingTableEntryCount = 0;
 
+      if (prog_data) {
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
 
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
+         /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+          *
+          *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+          *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+          *
+          * Since 16x MSAA is first introduced on SKL, we don't need to apply
+          * the workaround on any older hardware.
+          */
+         if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+             params->num_samples == 16) {
+            assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+            ps._32PixelDispatchEnable = false;
+         }
+
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
       }
 
       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
@@ -860,17 +894,23 @@
 #endif
 
       if (prog_data) {
-         ps.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         ps.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
-
-         ps.KernelStartPointer0 = params->wm_prog_kernel;
-         ps.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-
          ps._8PixelDispatchEnable = prog_data->dispatch_8;
          ps._16PixelDispatchEnable = prog_data->dispatch_16;
+         ps._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         ps.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+         ps.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+         ps.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+         ps.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+         ps.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 1);
+         ps.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
          ps.AttributeEnable = prog_data->num_varying_inputs > 0;
       } else {
@@ -922,17 +962,23 @@
       if (prog_data) {
          wm.ThreadDispatchEnable = true;
 
-         wm.DispatchGRFStartRegisterForConstantSetupData0 =
-            prog_data->base.dispatch_grf_start_reg;
-         wm.DispatchGRFStartRegisterForConstantSetupData2 =
-            prog_data->dispatch_grf_start_reg_2;
-
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-
          wm._8PixelDispatchEnable = prog_data->dispatch_8;
          wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
+
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData1 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 1);
+         wm.DispatchGRFStartRegisterForConstantSetupData2 =
+            brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm, 2);
+
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
 
          wm.NumberofSFOutputAttributes = prog_data->num_varying_inputs;
       }
@@ -1582,6 +1628,29 @@
     */
    blorp_emit_3dstate_multisample(batch, params);
 
+   /* According to the SKL PRM formula for WM_INT::ThreadDispatchEnable, the
+    * 3DSTATE_WM::ForceThreadDispatchEnable field can force WM thread dispatch
+    * even when WM_HZ_OP is active.  However, WM thread dispatch is normally
+    * disabled for HiZ ops and it appears that force-enabling it can lead to
+    * GPU hangs on at least Skylake.  Since we don't know the current state of
+    * the 3DSTATE_WM packet, just emit a dummy one prior to 3DSTATE_WM_HZ_OP.
+    */
+   blorp_emit(batch, GENX(3DSTATE_WM), wm);
+
+   /* From the BDW PRM Volume 7, Depth Buffer Clear:
+    *
+    *    The clear value must be between the min and max depth values
+    *    (inclusive) defined in the CC_VIEWPORT. If the depth buffer format is
+    *    D32_FLOAT, then +/-DENORM values are also allowed.
+    *
+    * Set the bounds to match our hardware limits, [0.0, 1.0].
+    */
+   if (params->depth.enabled && params->hiz_op == ISL_AUX_OP_FAST_CLEAR) {
+      assert(params->depth.clear_color.f32[0] >= 0.0f);
+      assert(params->depth.clear_color.f32[0] <= 1.0f);
+      blorp_emit_cc_viewport(batch);
+   }
+
    /* If we can't alter the depth stencil config and multiple layers are
     * involved, the HiZ op will fail. This is because the op requires that a
     * new config is emitted for each additional layer.
@@ -1697,8 +1766,10 @@
 static void
 blorp_exec(struct blorp_batch *batch, const struct blorp_params *params)
 {
-   blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
-   blorp_update_clear_color(batch, &params->depth, params->hiz_op);
+   if (!(batch->flags & BLORP_BATCH_NO_UPDATE_CLEAR_COLOR)) {
+      blorp_update_clear_color(batch, &params->dst, params->fast_clear_op);
+      blorp_update_clear_color(batch, &params->depth, params->hiz_op);
+   }
 
 #if GEN_GEN >= 8
    if (params->hiz_op != ISL_AUX_OP_NONE) {
diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h
index 7adad5d..a6aa2aa 100644
--- a/src/intel/blorp/blorp_priv.h
+++ b/src/intel/blorp/blorp_priv.h
@@ -246,8 +246,14 @@
    /* Actual MSAA layout used by the source image. */
    enum isl_msaa_layout src_layout;
 
-   /* Number of bits per channel in the source image. */
-   uint8_t src_bpc;
+   /* The swizzle to apply to the source in the shader */
+   struct isl_swizzle src_swizzle;
+
+   /* The format of the source if format-specific workarounds are needed
+    * and 0 (ISL_FORMAT_R32G32B32A32_FLOAT) if the destination is natively
+    * renderable.
+    */
+   enum isl_format src_format;
 
    /* True if the source requires normalized coordinates */
    bool src_coords_normalized;
@@ -266,8 +272,17 @@
    /* Actual MSAA layout used by the destination image. */
    enum isl_msaa_layout dst_layout;
 
-   /* Number of bits per channel in the destination image. */
-   uint8_t dst_bpc;
+   /* The swizzle to apply to the destination in the shader */
+   struct isl_swizzle dst_swizzle;
+
+   /* The format of the destination if format-specific workarounds are needed
+    * and 0 (ISL_FORMAT_R32G32B32A32_FLOAT) if the destination is natively
+    * renderable.
+    */
+   enum isl_format dst_format;
+
+   /* Whether or not the format workarounds are a bitcast operation */
+   bool format_bit_cast;
 
    /* Type of the data to be read from the texture (one of
     * nir_type_(int|uint|float)).
@@ -292,11 +307,7 @@
     */
    bool dst_rgb;
 
-   /* True if all source samples should be blended together to produce each
-    * destination pixel.  If true, src_tiled_w must be false, tex_samples must
-    * equal src_samples, and tex_samples must be nonzero.
-    */
-   bool blend;
+   enum blorp_filter filter;
 
    /* True if the rectangle being sent through the rendering pipeline might be
     * larger than the destination rectangle, so the WM program should kill any
@@ -310,9 +321,6 @@
     */
    bool persample_msaa_dispatch;
 
-   /* True for scaled blitting. */
-   bool blit_scaled;
-
    /* True if this blit operation may involve intratile offsets on the source.
     * In this case, we need to add the offset before texturing.
     */
@@ -328,9 +336,6 @@
     */
    float x_scale;
    float y_scale;
-
-   /* True for blits with filter = GL_LINEAR. */
-   bool bilinear_filter;
 };
 
 /**
diff --git a/src/intel/tools/gen_batch_decoder.c b/src/intel/common/gen_batch_decoder.c
similarity index 76%
rename from src/intel/tools/gen_batch_decoder.c
rename to src/intel/common/gen_batch_decoder.c
index c6b9087..f2510e2 100644
--- a/src/intel/tools/gen_batch_decoder.c
+++ b/src/intel/common/gen_batch_decoder.c
@@ -33,14 +33,17 @@
                           const char *xml_path,
                           struct gen_batch_decode_bo (*get_bo)(void *,
                                                                uint64_t),
+                          unsigned (*get_state_size)(void *, uint32_t),
                           void *user_data)
 {
    memset(ctx, 0, sizeof(*ctx));
 
    ctx->get_bo = get_bo;
+   ctx->get_state_size = get_state_size;
    ctx->user_data = user_data;
    ctx->fp = fp;
    ctx->flags = flags;
+   ctx->max_vbo_decoded_lines = -1; /* No limit! */
 
    if (xml_path == NULL)
       ctx->spec = gen_spec_load(devinfo);
@@ -103,17 +106,35 @@
    return bo;
 }
 
+/* Element count for the state at offset_from_dsba: the size reported by the
+ * optional get_state_size callback (treated as bytes) divided by the size of
+ * one element (element_dwords dwords), or `guess` if no size is reported. */
+static int
+update_count(struct gen_batch_decode_ctx *ctx,
+             uint32_t offset_from_dsba,
+             unsigned element_dwords,
+             unsigned guess)
+{
+   unsigned size = 0;
+   if (ctx->get_state_size)
+      size = ctx->get_state_size(ctx->user_data, offset_from_dsba);
+   if (size > 0)
+      return size / (sizeof(uint32_t) * element_dwords);
+   /* In the absence of any information, just guess arbitrarily. */
+   return guess;
+}
+
 static void
 ctx_disassemble_program(struct gen_batch_decode_ctx *ctx,
                         uint32_t ksp, const char *type)
 {
-   if (!ctx->instruction_base.map)
+   uint64_t addr = ctx->instruction_base + ksp;
+   struct gen_batch_decode_bo bo = ctx_get_bo(ctx, addr);
+   if (!bo.map)
       return;
 
-   printf("\nReferenced %s:\n", type);
-   gen_disasm_disassemble(ctx->disasm,
-                          (void *)ctx->instruction_base.map, ksp,
-                          ctx->fp);
+   fprintf(ctx->fp, "\nReferenced %s:\n", type);
+   gen_disasm_disassemble(ctx->disasm, bo.map, 0, ctx->fp);
 }
 
 /* Heuristic to determine whether a uint32_t is probably actually a float
@@ -145,24 +166,29 @@
 ctx_print_buffer(struct gen_batch_decode_ctx *ctx,
                  struct gen_batch_decode_bo bo,
                  uint32_t read_length,
-                 uint32_t pitch)
+                 uint32_t pitch,
+                 int max_lines)
 {
    const uint32_t *dw_end = bo.map + MIN2(bo.size, read_length);
 
-   unsigned line_count = 0;
+   int column_count = 0, line_count = -1;
    for (const uint32_t *dw = bo.map; dw < dw_end; dw++) {
-      if (line_count * 4 == pitch || line_count == 8) {
+      if (column_count * 4 == pitch || column_count == 8) {
          fprintf(ctx->fp, "\n");
-         line_count = 0;
+         column_count = 0;
+         line_count++;
+
+         if (max_lines >= 0 && line_count >= max_lines)
+            break;
       }
-      fprintf(ctx->fp, line_count == 0 ? "  " : " ");
+      fprintf(ctx->fp, column_count == 0 ? "  " : " ");
 
       if ((ctx->flags & GEN_BATCH_DECODE_FLOATS) && probably_float(*dw))
          fprintf(ctx->fp, "  %8.2f", *(float *) dw);
       else
          fprintf(ctx->fp, "  0x%08x", *dw);
 
-      line_count++;
+      column_count++;
    }
    fprintf(ctx->fp, "\n");
 }
@@ -175,15 +201,15 @@
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
 
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (strcmp(iter.name, "Surface State Base Address") == 0) {
-         ctx->surface_base = ctx_get_bo(ctx, iter.raw_value);
+         ctx->surface_base = iter.raw_value;
       } else if (strcmp(iter.name, "Dynamic State Base Address") == 0) {
-         ctx->dynamic_base = ctx_get_bo(ctx, iter.raw_value);
+         ctx->dynamic_base = iter.raw_value;
       } else if (strcmp(iter.name, "Instruction Base Address") == 0) {
-         ctx->instruction_base = ctx_get_bo(ctx, iter.raw_value);
+         ctx->instruction_base = iter.raw_value;
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 }
 
 static void
@@ -196,35 +222,39 @@
       return;
    }
 
-   /* If we don't know the actual count, guess. */
    if (count < 0)
-      count = 8;
+      count = update_count(ctx, offset, 1, 8);
 
-   if (ctx->surface_base.map == NULL) {
-      fprintf(ctx->fp, "  binding table unavailable\n");
-      return;
-   }
-
-   if (offset % 32 != 0 || offset >= UINT16_MAX ||
-       offset >= ctx->surface_base.size) {
+   if (offset % 32 != 0 || offset >= UINT16_MAX) {
       fprintf(ctx->fp, "  invalid binding table pointer\n");
       return;
    }
 
-   const uint32_t *pointers = ctx->surface_base.map + offset;
+   struct gen_batch_decode_bo bind_bo =
+      ctx_get_bo(ctx, ctx->surface_base + offset);
+
+   if (bind_bo.map == NULL) {
+      fprintf(ctx->fp, "  binding table unavailable\n");
+      return;
+   }
+
+   const uint32_t *pointers = bind_bo.map;
    for (int i = 0; i < count; i++) {
       if (pointers[i] == 0)
          continue;
 
+      uint64_t addr = ctx->surface_base + pointers[i];
+      struct gen_batch_decode_bo bo = ctx_get_bo(ctx, addr);
+      uint32_t size = strct->dw_length * 4;
+
       if (pointers[i] % 32 != 0 ||
-          (pointers[i] + strct->dw_length * 4) >= ctx->surface_base.size) {
+          addr < bo.addr || addr + size >= bo.addr + bo.size) {
          fprintf(ctx->fp, "pointer %u: %08x <not valid>\n", i, pointers[i]);
          continue;
       }
 
       fprintf(ctx->fp, "pointer %u: %08x\n", i, pointers[i]);
-      ctx_print_group(ctx, strct, ctx->surface_base.addr + pointers[i],
-                      ctx->surface_base.map + pointers[i]);
+      ctx_print_group(ctx, strct, addr, bo.map + (addr - bo.addr));
    }
 }
 
@@ -233,22 +263,23 @@
 {
    struct gen_group *strct = gen_spec_find_struct(ctx->spec, "SAMPLER_STATE");
 
-   /* If we don't know the actual count, guess. */
    if (count < 0)
-      count = 4;
+      count = update_count(ctx, offset, strct->dw_length, 4);
 
-   if (ctx->dynamic_base.map == NULL) {
+   uint64_t state_addr = ctx->dynamic_base + offset;
+   struct gen_batch_decode_bo bo = ctx_get_bo(ctx, state_addr);
+   const void *state_map = bo.map;
+
+   if (state_map == NULL) {
       fprintf(ctx->fp, "  samplers unavailable\n");
       return;
    }
 
-   if (offset % 32 != 0 || offset >= ctx->dynamic_base.size) {
+   if (offset % 32 != 0 || state_addr - bo.addr >= bo.size) {
       fprintf(ctx->fp, "  invalid sampler state pointer\n");
       return;
    }
 
-   uint64_t state_addr = ctx->dynamic_base.addr + offset;
-   const void *state_map = ctx->dynamic_base.map + offset;
    for (int i = 0; i < count; i++) {
       fprintf(ctx->fp, "sampler state %d\n", i);
       ctx_print_group(ctx, strct, state_addr, state_map);
@@ -261,9 +292,6 @@
 handle_media_interface_descriptor_load(struct gen_batch_decode_ctx *ctx,
                                        const uint32_t *p)
 {
-   if (ctx->dynamic_base.map == NULL)
-      return;
-
    struct gen_group *inst = gen_spec_find_instruction(ctx->spec, p);
    struct gen_group *desc =
       gen_spec_find_struct(ctx->spec, "INTERFACE_DESCRIPTOR_DATA");
@@ -272,27 +300,34 @@
    gen_field_iterator_init(&iter, inst, p, 0, false);
    uint32_t descriptor_offset = 0;
    int descriptor_count = 0;
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (strcmp(iter.name, "Interface Descriptor Data Start Address") == 0) {
          descriptor_offset = strtol(iter.value, NULL, 16);
       } else if (strcmp(iter.name, "Interface Descriptor Total Length") == 0) {
          descriptor_count =
             strtol(iter.value, NULL, 16) / (desc->dw_length * 4);
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 
-   uint64_t desc_addr = ctx->dynamic_base.addr + descriptor_offset;
-   const uint32_t *desc_map = ctx->dynamic_base.map + descriptor_offset;
+   uint64_t desc_addr = ctx->dynamic_base + descriptor_offset;
+   struct gen_batch_decode_bo bo = ctx_get_bo(ctx, desc_addr);
+   const void *desc_map = bo.map;
+
+   if (desc_map == NULL) {
+      fprintf(ctx->fp, "  interface descriptors unavailable\n");
+      return;
+   }
+
    for (int i = 0; i < descriptor_count; i++) {
       fprintf(ctx->fp, "descriptor %d: %08x\n", i, descriptor_offset);
 
       ctx_print_group(ctx, desc, desc_addr, desc_map);
 
       gen_field_iterator_init(&iter, desc, desc_map, 0, false);
-      uint64_t ksp;
-      uint32_t sampler_offset, sampler_count;
-      uint32_t binding_table_offset, binding_entry_count;
-      do {
+      uint64_t ksp = 0;
+      uint32_t sampler_offset = 0, sampler_count = 0;
+      uint32_t binding_table_offset = 0, binding_entry_count = 0;
+      while (gen_field_iterator_next(&iter)) {
          if (strcmp(iter.name, "Kernel Start Pointer") == 0) {
             ksp = strtoll(iter.value, NULL, 16);
          } else if (strcmp(iter.name, "Sampler State Pointer") == 0) {
@@ -304,7 +339,7 @@
          } else if (strcmp(iter.name, "Binding Table Entry Count") == 0) {
             binding_entry_count = strtol(iter.value, NULL, 10);
          }
-      } while (gen_field_iterator_next(&iter));
+      }
 
       ctx_disassemble_program(ctx, ksp, "compute shader");
       printf("\n");
@@ -322,6 +357,7 @@
                               const uint32_t *p)
 {
    struct gen_group *inst = gen_spec_find_instruction(ctx->spec, p);
+   struct gen_group *vbs = gen_spec_find_struct(ctx->spec, "VERTEX_BUFFER_STATE");
 
    struct gen_batch_decode_bo vb = {};
    uint32_t vb_size = 0;
@@ -331,45 +367,52 @@
 
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
-      if (strcmp(iter.name, "Vertex Buffer Index") == 0) {
-         index = iter.raw_value;
-      } else if (strcmp(iter.name, "Buffer Pitch") == 0) {
-         pitch = iter.raw_value;
-      } else if (strcmp(iter.name, "Buffer Starting Address") == 0) {
-         vb = ctx_get_bo(ctx, iter.raw_value);
-      } else if (strcmp(iter.name, "Buffer Size") == 0) {
-         vb_size = iter.raw_value;
-         ready = true;
-      } else if (strcmp(iter.name, "End Address") == 0) {
-         if (vb.map && iter.raw_value >= vb.addr)
-            vb_size = iter.raw_value - vb.addr;
-         else
-            vb_size = 0;
-         ready = true;
+   while (gen_field_iterator_next(&iter)) {
+      if (iter.struct_desc != vbs)
+         continue;
+
+      struct gen_field_iterator vbs_iter;
+      gen_field_iterator_init(&vbs_iter, vbs, &iter.p[iter.start_bit / 32], 0, false);
+      while (gen_field_iterator_next(&vbs_iter)) {
+         if (strcmp(vbs_iter.name, "Vertex Buffer Index") == 0) {
+            index = vbs_iter.raw_value;
+         } else if (strcmp(vbs_iter.name, "Buffer Pitch") == 0) {
+            pitch = vbs_iter.raw_value;
+         } else if (strcmp(vbs_iter.name, "Buffer Starting Address") == 0) {
+            vb = ctx_get_bo(ctx, vbs_iter.raw_value);
+         } else if (strcmp(vbs_iter.name, "Buffer Size") == 0) {
+            vb_size = vbs_iter.raw_value;
+            ready = true;
+         } else if (strcmp(vbs_iter.name, "End Address") == 0) {
+            if (vb.map && vbs_iter.raw_value >= vb.addr)
+               vb_size = vbs_iter.raw_value - vb.addr;
+            else
+               vb_size = 0;
+            ready = true;
+         }
+
+         if (!ready)
+            continue;
+
+         fprintf(ctx->fp, "vertex buffer %d, size %d\n", index, vb_size);
+
+         if (vb.map == NULL) {
+            fprintf(ctx->fp, "  buffer contents unavailable\n");
+            continue;
+         }
+
+         if (vb.map == 0 || vb_size == 0)
+            continue;
+
+         ctx_print_buffer(ctx, vb, vb_size, pitch, ctx->max_vbo_decoded_lines);
+
+         vb.map = NULL;
+         vb_size = 0;
+         index = -1;
+         pitch = -1;
+         ready = false;
       }
-
-      if (!ready)
-         continue;
-
-      fprintf(ctx->fp, "vertex buffer %d, size %d\n", index, vb_size);
-
-      if (vb.map == NULL) {
-         fprintf(ctx->fp, "  buffer contents unavailable\n");
-         continue;
-      }
-
-      if (vb.map == 0 || vb_size == 0)
-         continue;
-
-      ctx_print_buffer(ctx, vb, vb_size, pitch);
-
-      vb.map = NULL;
-      vb_size = 0;
-      index = -1;
-      pitch = -1;
-      ready = false;
-   } while (gen_field_iterator_next(&iter));
+   }
 }
 
 static void
@@ -384,7 +427,7 @@
 
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (strcmp(iter.name, "Index Format") == 0) {
          format = iter.raw_value;
       } else if (strcmp(iter.name, "Buffer Starting Address") == 0) {
@@ -392,7 +435,7 @@
       } else if (strcmp(iter.name, "Buffer Size") == 0) {
          ib_size = iter.raw_value;
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 
    if (ib.map == NULL) {
       fprintf(ctx->fp, "  buffer contents unavailable\n");
@@ -434,7 +477,7 @@
 
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (strcmp(iter.name, "Kernel Start Pointer") == 0) {
          ksp = iter.raw_value;
       } else if (strcmp(iter.name, "SIMD8 Dispatch Enable") == 0) {
@@ -446,7 +489,7 @@
       } else if (strcmp(iter.name, "Enable") == 0) {
          is_enabled = iter.raw_value;
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 
    const char *type =
       strcmp(inst->name,   "VS_STATE") == 0 ? "vertex shader" :
@@ -475,7 +518,7 @@
 
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (strncmp(iter.name, "Kernel Start Pointer ",
                   strlen("Kernel Start Pointer ")) == 0) {
          int idx = iter.name[strlen("Kernel Start Pointer ")] - '0';
@@ -487,7 +530,7 @@
       } else if (strcmp(iter.name, "32 Pixel Dispatch Enable") == 0) {
          enabled[2] = strcmp(iter.value, "true") == 0;
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 
    /* Reorder KSPs to be [8, 16, 32] instead of the hardware order. */
    if (enabled[0] + enabled[1] + enabled[2] == 1) {
@@ -517,31 +560,46 @@
 decode_3dstate_constant(struct gen_batch_decode_ctx *ctx, const uint32_t *p)
 {
    struct gen_group *inst = gen_spec_find_instruction(ctx->spec, p);
+   struct gen_group *body =
+      gen_spec_find_struct(ctx->spec, "3DSTATE_CONSTANT_BODY");
 
-   uint32_t read_length[4];
-   struct gen_batch_decode_bo buffer[4];
-   memset(buffer, 0, sizeof(buffer));
+   uint32_t read_length[4] = {0};
+   uint64_t read_addr[4];
 
-   int rlidx = 0, bidx = 0;
-
-   struct gen_field_iterator iter;
-   gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
-      if (strcmp(iter.name, "Read Length") == 0) {
-         read_length[rlidx++] = iter.raw_value;
-      } else if (strcmp(iter.name, "Buffer") == 0) {
-         buffer[bidx++] = ctx_get_bo(ctx, iter.raw_value);
-      }
-   } while (gen_field_iterator_next(&iter));
-
-   for (int i = 0; i < 4; i++) {
-      if (read_length[i] == 0 || buffer[i].map == NULL)
+   struct gen_field_iterator outer;
+   gen_field_iterator_init(&outer, inst, p, 0, false);
+   while (gen_field_iterator_next(&outer)) {
+      if (outer.struct_desc != body)
          continue;
 
-      unsigned size = read_length[i] * 32;
-      fprintf(ctx->fp, "constant buffer %d, size %u\n", i, size);
+      struct gen_field_iterator iter;
+      gen_field_iterator_init(&iter, body, &outer.p[outer.start_bit / 32],
+                              0, false);
 
-      ctx_print_buffer(ctx, buffer[i], size, 0);
+      while (gen_field_iterator_next(&iter)) {
+         int idx;
+         if (sscanf(iter.name, "Read Length[%d]", &idx) == 1) {
+            read_length[idx] = iter.raw_value;
+         } else if (sscanf(iter.name, "Buffer[%d]", &idx) == 1) {
+            read_addr[idx] = iter.raw_value;
+         }
+      }
+
+      for (int i = 0; i < 4; i++) {
+         if (read_length[i] == 0)
+            continue;
+
+         struct gen_batch_decode_bo buffer = ctx_get_bo(ctx, read_addr[i]);
+         if (!buffer.map) {
+            fprintf(ctx->fp, "constant buffer %d unavailable\n", i);
+            continue;
+         }
+
+         unsigned size = read_length[i] * 32;
+         fprintf(ctx->fp, "constant buffer %d, size %u\n", i, size);
+
+         ctx_print_buffer(ctx, buffer, size, 0, -1);
+      }
    }
 }
 
@@ -583,33 +641,50 @@
                               const char *struct_type, const uint32_t *p,
                               int count)
 {
-   if (ctx->dynamic_base.map == NULL) {
-      fprintf(ctx->fp, "  dynamic %s state unavailable\n", struct_type);
-      return;
-   }
-
    struct gen_group *inst = gen_spec_find_instruction(ctx->spec, p);
-   struct gen_group *state = gen_spec_find_struct(ctx->spec, struct_type);
 
-   uint32_t state_offset;
+   uint32_t state_offset = 0;
 
    struct gen_field_iterator iter;
    gen_field_iterator_init(&iter, inst, p, 0, false);
-   do {
+   while (gen_field_iterator_next(&iter)) {
       if (str_ends_with(iter.name, "Pointer")) {
          state_offset = iter.raw_value;
          break;
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 
-   uint32_t state_addr = ctx->dynamic_base.addr + state_offset;
-   const uint32_t *state_map = ctx->dynamic_base.map + state_offset;
-   for (int i = 0; i < count; i++) {
-      fprintf(ctx->fp, "%s %d\n", struct_type, i);
-      ctx_print_group(ctx, state, state_offset, state_map);
+   uint64_t state_addr = ctx->dynamic_base + state_offset;
+   struct gen_batch_decode_bo bo = ctx_get_bo(ctx, state_addr);
+   const void *state_map = bo.map;
+
+   if (state_map == NULL) {
+      fprintf(ctx->fp, "  dynamic %s state unavailable\n", struct_type);
+      return;
+   }
+
+   struct gen_group *state = gen_spec_find_struct(ctx->spec, struct_type);
+   if (strcmp(struct_type, "BLEND_STATE") == 0) {
+      /* Blend states are different from the others because they have a header
+       * struct called BLEND_STATE which is followed by a variable number of
+       * BLEND_STATE_ENTRY structs.
+       */
+      fprintf(ctx->fp, "%s\n", struct_type);
+      ctx_print_group(ctx, state, state_addr, state_map);
 
       state_addr += state->dw_length * 4;
-      state_map += state->dw_length;
+      state_map += state->dw_length * 4;
+
+      struct_type = "BLEND_STATE_ENTRY";
+      state = gen_spec_find_struct(ctx->spec, struct_type);
+   }
+
+   for (int i = 0; i < count; i++) {
+      fprintf(ctx->fp, "%s %d\n", struct_type, i);
+      ctx_print_group(ctx, state, state_addr, state_map);
+
+      state_addr += state->dw_length * 4;
+      state_map += state->dw_length * 4;
    }
 }
 
@@ -784,48 +859,39 @@
       }
 
       if (strcmp(inst_name, "MI_BATCH_BUFFER_START") == 0) {
-         struct gen_batch_decode_bo next_batch;
+         struct gen_batch_decode_bo next_batch = {};
          bool second_level;
          struct gen_field_iterator iter;
          gen_field_iterator_init(&iter, inst, p, 0, false);
-         do {
+         while (gen_field_iterator_next(&iter)) {
             if (strcmp(iter.name, "Batch Buffer Start Address") == 0) {
                next_batch = ctx_get_bo(ctx, iter.raw_value);
             } else if (strcmp(iter.name, "Second Level Batch Buffer") == 0) {
                second_level = iter.raw_value;
             }
-         } while (gen_field_iterator_next(&iter));
-
-         if (next_batch.map == NULL) {
-            fprintf(ctx->fp, "Secondary batch at 0x%08"PRIx64" unavailable",
-                    next_batch.addr);
          }
 
+         if (next_batch.map == NULL) {
+            fprintf(ctx->fp, "Secondary batch at 0x%08"PRIx64" unavailable\n",
+                    next_batch.addr);
+         } else {
+            gen_print_batch(ctx, next_batch.map, next_batch.size,
+                            next_batch.addr);
+         }
          if (second_level) {
             /* MI_BATCH_BUFFER_START with "2nd Level Batch Buffer" set acts
              * like a subroutine call.  Commands that come afterwards get
              * processed once the 2nd level batch buffer returns with
              * MI_BATCH_BUFFER_END.
              */
-            if (next_batch.map) {
-               gen_print_batch(ctx, next_batch.map, next_batch.size,
-                               next_batch.addr);
-            }
+            continue;
          } else {
             /* MI_BATCH_BUFFER_START with "2nd Level Batch Buffer" unset acts
              * like a goto.  Nothing after it will ever get processed.  In
              * order to prevent the recursion from growing, we just reset the
              * loop and continue;
              */
-            if (next_batch.map) {
-               p = next_batch.map;
-               end = next_batch.map + next_batch.size;
-               length = 0;
-               continue;
-            } else {
-               /* Nothing we can do */
-               break;
-            }
+            break;
          }
       } else if (strcmp(inst_name, "MI_BATCH_BUFFER_END") == 0) {
          break;
diff --git a/src/intel/common/gen_debug.h b/src/intel/common/gen_debug.h
index 3db7f3b..72d7ca2 100644
--- a/src/intel/common/gen_debug.h
+++ b/src/intel/common/gen_debug.h
@@ -94,7 +94,11 @@
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"
+#if ANDROID_API_LEVEL >= 26
+#include <log/log.h>
+#else
 #include <cutils/log.h>
+#endif /* log/log.h replaces cutils/log.h from Android 8 (API level 26) on */
 #ifndef ALOGW
 #define ALOGW LOGW
 #endif
diff --git a/src/intel/common/gen_decoder.c b/src/intel/common/gen_decoder.c
index 1b8123b..04de7b7 100644
--- a/src/intel/common/gen_decoder.c
+++ b/src/intel/common/gen_decoder.c
@@ -151,7 +151,8 @@
 create_group(struct parser_context *ctx,
              const char *name,
              const char **atts,
-             struct gen_group *parent)
+             struct gen_group *parent,
+             bool fixed_length)
 {
    struct gen_group *group;
 
@@ -161,6 +162,7 @@
 
    group->spec = ctx->spec;
    group->variable = false;
+   group->fixed_length = fixed_length;
 
    for (int i = 0; atts[i]; i += 2) {
       char *p;
@@ -370,18 +372,19 @@
          minor = 0;
 
       ctx->spec->gen = gen_make_gen(major, minor);
-   } else if (strcmp(element_name, "instruction") == 0 ||
-              strcmp(element_name, "struct") == 0) {
-      ctx->group = create_group(ctx, name, atts, NULL);
+   } else if (strcmp(element_name, "instruction") == 0) {
+      ctx->group = create_group(ctx, name, atts, NULL, false);
+   } else if (strcmp(element_name, "struct") == 0) {
+      ctx->group = create_group(ctx, name, atts, NULL, true);
    } else if (strcmp(element_name, "register") == 0) {
-      ctx->group = create_group(ctx, name, atts, NULL);
+      ctx->group = create_group(ctx, name, atts, NULL, true);
       get_register_offset(atts, &ctx->group->register_offset);
    } else if (strcmp(element_name, "group") == 0) {
       struct gen_group *previous_group = ctx->group;
       while (previous_group->next)
          previous_group = previous_group->next;
 
-      struct gen_group *group = create_group(ctx, "", atts, ctx->group);
+      struct gen_group *group = create_group(ctx, "", atts, ctx->group, false);
       previous_group->next = group;
       ctx->group = group;
    } else if (strcmp(element_name, "field") == 0) {
@@ -713,6 +716,9 @@
 int
 gen_group_get_length(struct gen_group *group, const uint32_t *p)
 {
+   if (group && group->fixed_length)
+      return group->dw_length;
+
    uint32_t h = p[0];
    uint32_t type = field_value(h, 29, 31);
 
@@ -806,6 +812,18 @@
 }
 
 static void
+iter_start_field(struct gen_field_iterator *iter, struct gen_field *field)
+{
+   iter->field = field;
+
+   int group_member_offset = iter_group_offset_bits(iter, iter->group_iter);
+
+   iter->start_bit = group_member_offset + iter->field->start;
+   iter->end_bit = group_member_offset + iter->field->end;
+   iter->struct_desc = NULL;
+}
+
+static void
 iter_advance_group(struct gen_field_iterator *iter)
 {
    if (iter->group->variable)
@@ -819,32 +837,20 @@
       }
    }
 
-   iter->field = iter->group->fields;
+   iter_start_field(iter, iter->group->fields);
 }
 
 static bool
 iter_advance_field(struct gen_field_iterator *iter)
 {
    if (iter_more_fields(iter)) {
-      iter->field = iter->field->next;
+      iter_start_field(iter, iter->field->next);
    } else {
       if (!iter_more_groups(iter))
          return false;
 
       iter_advance_group(iter);
    }
-
-   if (iter->field->name)
-      strncpy(iter->name, iter->field->name, sizeof(iter->name));
-   else
-      memset(iter->name, 0, sizeof(iter->name));
-
-   int group_member_offset = iter_group_offset_bits(iter, iter->group_iter);
-
-   iter->start_bit = group_member_offset + iter->field->start;
-   iter->end_bit = group_member_offset + iter->field->end;
-   iter->struct_desc = NULL;
-
    return true;
 }
 
@@ -888,7 +894,7 @@
    } v;
 
    if (iter->field->name)
-      strncpy(iter->name, iter->field->name, sizeof(iter->name));
+      snprintf(iter->name, sizeof(iter->name), "%s", iter->field->name);
    else
       memset(iter->name, 0, sizeof(iter->name));
 
@@ -981,25 +987,35 @@
    memset(iter, 0, sizeof(*iter));
 
    iter->group = group;
-   if (group->fields)
-      iter->field = group->fields;
-   else
-      iter->field = group->next->fields;
    iter->p = p;
    iter->p_bit = p_bit;
 
    int length = gen_group_get_length(iter->group, iter->p);
-   iter->p_end = length > 0 ? &p[length] : NULL;
+   iter->p_end = length >= 0 ? &p[length] : NULL;
    iter->print_colors = print_colors;
-
-   bool result = iter_decode_field(iter);
-   if (length >= 0)
-      assert(result);
 }
 
 bool
 gen_field_iterator_next(struct gen_field_iterator *iter)
 {
+   /* Initial condition */
+   if (!iter->field) {
+      if (iter->group->fields)
+         iter_start_field(iter, iter->group->fields);
+      else
+         iter_start_field(iter, iter->group->next->fields);
+
+      bool result = iter_decode_field(iter);
+      if (!result && iter->p_end) {
+         /* We're dealing with a non-empty struct of length=0 (BLEND_STATE on
+          * Gen 7.5)
+          */
+         assert(iter->group->dw_length == 0);
+      }
+
+      return result;
+   }
+
    if (!iter_advance_field(iter))
       return false;
 
@@ -1040,7 +1056,7 @@
    int last_dword = -1;
 
    gen_field_iterator_init(&iter, group, p, p_bit, color);
-   do {
+   while (gen_field_iterator_next(&iter)) {
       int iter_dword = iter.end_bit / 32;
       if (last_dword != iter_dword) {
          for (int i = last_dword + 1; i <= iter_dword; i++)
@@ -1050,10 +1066,11 @@
       if (!gen_field_is_header(iter.field)) {
          fprintf(outfile, "    %s: %s\n", iter.name, iter.value);
          if (iter.struct_desc) {
-            uint64_t struct_offset = offset + 4 * iter_dword;
+            int struct_dword = iter.start_bit / 32;
+            uint64_t struct_offset = offset + 4 * struct_dword;
             gen_print_group(outfile, iter.struct_desc, struct_offset,
-                            &p[iter_dword], iter.start_bit % 32, color);
+                            &p[struct_dword], iter.start_bit % 32, color);
          }
       }
-   } while (gen_field_iterator_next(&iter));
+   }
 }
diff --git a/src/intel/common/gen_decoder.h b/src/intel/common/gen_decoder.h
index f28ac7d..d8add4c 100644
--- a/src/intel/common/gen_decoder.h
+++ b/src/intel/common/gen_decoder.h
@@ -103,7 +103,8 @@
    uint32_t dw_length;
    uint32_t group_offset, group_count;
    uint32_t group_size;
-   bool variable;
+   bool variable; /* <group> specific */
+   bool fixed_length; /* True for <struct> & <register> */
 
    struct gen_group *parent;
    struct gen_group *next;
@@ -111,8 +112,7 @@
    uint32_t opcode_mask;
    uint32_t opcode;
 
-   /* Register specific */
-   uint32_t register_offset;
+   uint32_t register_offset; /* <register> specific */
 };
 
 struct gen_value {
@@ -205,8 +205,15 @@
 struct gen_disasm *disasm;
 
 struct gen_batch_decode_ctx {
-   struct gen_batch_decode_bo (*get_bo)(void *user_data,
-                                        uint64_t base_address);
+   /**
+    * Return information about the buffer containing the given address.
+    *
+    * If the given address is inside a buffer, the map pointer should be
+    * offset accordingly so it points at the data corresponding to address.
+    */
+   struct gen_batch_decode_bo (*get_bo)(void *user_data, uint64_t address);
+   unsigned (*get_state_size)(void *user_data,
+                              uint32_t offset_from_dynamic_state_base_addr);
    void *user_data;
 
    FILE *fp;
@@ -215,9 +222,11 @@
 
    struct gen_disasm *disasm;
 
-   struct gen_batch_decode_bo surface_base;
-   struct gen_batch_decode_bo dynamic_base;
-   struct gen_batch_decode_bo instruction_base;
+   uint64_t surface_base;
+   uint64_t dynamic_base;
+   uint64_t instruction_base;
+
+   int max_vbo_decoded_lines;
 };
 
 void gen_batch_decode_ctx_init(struct gen_batch_decode_ctx *ctx,
@@ -226,6 +235,8 @@
                                const char *xml_path,
                                struct gen_batch_decode_bo (*get_bo)(void *,
                                                                     uint64_t),
+
+                               unsigned (*get_state_size)(void *, uint32_t),
                                void *user_data);
 void gen_batch_decode_ctx_finish(struct gen_batch_decode_ctx *ctx);
 
diff --git a/src/intel/tools/disasm.c b/src/intel/common/gen_disasm.c
similarity index 93%
rename from src/intel/tools/disasm.c
rename to src/intel/common/gen_disasm.c
index 1de20f5..4f835c1 100644
--- a/src/intel/tools/disasm.c
+++ b/src/intel/common/gen_disasm.c
@@ -44,14 +44,15 @@
 }
 
 static int
-gen_disasm_find_end(struct gen_disasm *disasm, void *assembly, int start)
+gen_disasm_find_end(struct gen_disasm *disasm,
+                    const void *assembly, int start)
 {
    struct gen_device_info *devinfo = &disasm->devinfo;
    int offset = start;
 
    /* This loop exits when send-with-EOT or when opcode is 0 */
    while (true) {
-      brw_inst *insn = assembly + offset;
+      const brw_inst *insn = assembly + offset;
 
       if (brw_inst_cmpt_control(devinfo, insn)) {
          offset += 8;
@@ -70,7 +71,7 @@
 }
 
 void
-gen_disasm_disassemble(struct gen_disasm *disasm, void *assembly,
+gen_disasm_disassemble(struct gen_disasm *disasm, const void *assembly,
                        int start, FILE *out)
 {
    struct gen_device_info *devinfo = &disasm->devinfo;
diff --git a/src/intel/tools/gen_disasm.h b/src/intel/common/gen_disasm.h
similarity index 95%
rename from src/intel/tools/gen_disasm.h
rename to src/intel/common/gen_disasm.h
index c8c18b2..d979114 100644
--- a/src/intel/tools/gen_disasm.h
+++ b/src/intel/common/gen_disasm.h
@@ -34,7 +34,7 @@
 
 struct gen_disasm *gen_disasm_create(const struct gen_device_info *devinfo);
 void gen_disasm_disassemble(struct gen_disasm *disasm,
-                            void *assembly, int start, FILE *out);
+                            const void *assembly, int start, FILE *out);
 
 void gen_disasm_destroy(struct gen_disasm *disasm);
 
diff --git a/src/intel/common/gen_gem.h b/src/intel/common/gen_gem.h
new file mode 100644
index 0000000..7dd9ae6
--- /dev/null
+++ b/src/intel/common/gen_gem.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GEN_GEM_H
+#define GEN_GEM_H
+
+static inline uint64_t
+gen_canonical_address(uint64_t v)
+{
+   /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
+    *
+    *    "This field specifies the address of the memory location where the
+    *    register value specified in the DWord above will read from. The
+    *    address specifies the DWord location of the data. Range =
+    *    GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
+    *    [63:48] are ignored by the HW and assumed to be in correct
+    *    canonical form [63:48] == [47]."
+    */
+   const int shift = 63 - 47;
+   return (int64_t)(v << shift) >> shift;
+}
+
+/**
+ * This returns a 48-bit address with the high 16 bits zeroed.
+ *
+ * It's the opposite of gen_canonical_address.
+ */
+static inline uint64_t
+gen_48b_address(uint64_t v)
+{
+   const int shift = 63 - 47;
+   return (uint64_t)(v << shift) >> shift;
+}
+
+#endif /* GEN_GEM_H */
diff --git a/src/intel/common/meson.build b/src/intel/common/meson.build
index 5e0394a..332e978 100644
--- a/src/intel/common/meson.build
+++ b/src/intel/common/meson.build
@@ -22,10 +22,14 @@
 
 files_libintel_common = files(
   'gen_clflush.h',
+  'gen_batch_decoder.c',
   'gen_debug.c',
   'gen_debug.h',
   'gen_decoder.c',
   'gen_decoder.h',
+  'gen_disasm.c',
+  'gen_disasm.h',
+  'gen_gem.h',
   'gen_l3_config.c',
   'gen_l3_config.h',
   'gen_urb_config.c',
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
index d5f4837..6df9621 100644
--- a/src/intel/compiler/brw_compiler.c
+++ b/src/intel/compiler/brw_compiler.c
@@ -45,7 +45,8 @@
    .lower_device_index_to_zero = true,                                        \
    .native_integers = true,                                                   \
    .use_interpolated_input_intrinsics = true,                                 \
-   .vertex_id_zero_based = true
+   .vertex_id_zero_based = true,                                              \
+   .lower_base_vertex = true
 
 #define COMMON_SCALAR_OPTIONS                                                 \
    .lower_pack_half_2x16 = true,                                              \
@@ -180,6 +181,33 @@
    return compiler;
 }
 
+static void
+insert_u64_bit(uint64_t *val, bool add)
+{
+   *val = (*val << 1) | !!add;
+}
+
+uint64_t
+brw_get_compiler_config_value(const struct brw_compiler *compiler)
+{
+   uint64_t config = 0;
+   insert_u64_bit(&config, compiler->precise_trig);
+   if (compiler->devinfo->gen >= 8 && compiler->devinfo->gen < 10) {
+      insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_VERTEX]);
+      insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_TESS_CTRL]);
+      insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_TESS_EVAL]);
+      insert_u64_bit(&config, compiler->scalar_stage[MESA_SHADER_GEOMETRY]);
+   }
+   uint64_t debug_bits = INTEL_DEBUG;
+   uint64_t mask = DEBUG_DISK_CACHE_MASK;
+   while (mask != 0) {
+      const uint64_t bit = 1ULL << (ffsll(mask) - 1);
+      insert_u64_bit(&config, (debug_bits & bit) != 0);
+      mask &= ~bit;
+   }
+   return config;
+}
+
 unsigned
 brw_prog_data_size(gl_shader_stage stage)
 {
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 2419624..c510d34 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -112,6 +112,12 @@
     * will attempt to push everything.
     */
    bool supports_pull_constants;
+
+   /**
+    * Whether or not the driver supports NIR shader constants.  This controls
+    * whether nir_opt_large_constants will be run.
+    */
+   bool supports_shader_constants;
 };
 
 /**
@@ -397,6 +403,7 @@
    bool force_dual_color_blend:1;
    bool coherent_fb_fetch:1;
 
+   uint8_t color_outputs_valid;
    uint64_t input_slots_valid;
    unsigned program_string_id;
    GLenum alpha_test_func;          /* < For Gen4/5 MRT alpha test */
@@ -683,11 +690,14 @@
 
    GLuint num_varying_inputs;
 
-   uint8_t reg_blocks_0;
-   uint8_t reg_blocks_2;
+   uint8_t reg_blocks_8;
+   uint8_t reg_blocks_16;
+   uint8_t reg_blocks_32;
 
-   uint8_t dispatch_grf_start_reg_2;
-   uint32_t prog_offset_2;
+   uint8_t dispatch_grf_start_reg_16;
+   uint8_t dispatch_grf_start_reg_32;
+   uint32_t prog_offset_16;
+   uint32_t prog_offset_32;
 
    struct {
       /** @{
@@ -705,6 +715,7 @@
    bool inner_coverage;
    bool dispatch_8;
    bool dispatch_16;
+   bool dispatch_32;
    bool dual_src_blend;
    bool persample_dispatch;
    bool uses_pos_offset;
@@ -745,6 +756,91 @@
    int urb_setup[VARYING_SLOT_MAX];
 };
 
+/** Returns the SIMD width corresponding to a given KSP index
+ *
+ * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
+ * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
+ * kernel start pointer (KSP) indices that is based on what dispatch widths
+ * are enabled.  This function provides, effectively, the reverse mapping.
+ *
+ * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
+ * width of 8, 16, or 32 is returned.  If the KSP is invalid, 0 is returned.
+ */
+static inline unsigned
+brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
+                          bool simd16_enabled, bool simd32_enabled)
+{
+   /* This function strictly ignores contiguous dispatch */
+   switch (ksp_idx) {
+   case 0:
+      return simd8_enabled ? 8 :
+             (simd16_enabled && !simd32_enabled) ? 16 :
+             (simd32_enabled && !simd16_enabled) ? 32 : 0;
+   case 1:
+      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;
+   case 2:
+      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;
+   default:
+      unreachable("Invalid KSP index");
+   }
+}
+
+#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
+   brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
+                             (wm_state)._16PixelDispatchEnable, \
+                             (wm_state)._32PixelDispatchEnable)
+
+#define brw_wm_state_has_ksp(wm_state, ksp_idx) \
+   (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)
+
+static inline uint32_t
+_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data,
+                              unsigned simd_width)
+{
+   switch (simd_width) {
+   case 8: return 0;
+   case 16: return prog_data->prog_offset_16;
+   case 32: return prog_data->prog_offset_32;
+   default: return 0;
+   }
+}
+
+#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
+   _brw_wm_prog_data_prog_offset(prog_data, \
+      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
+
+static inline uint8_t
+_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data,
+                                         unsigned simd_width)
+{
+   switch (simd_width) {
+   case 8: return prog_data->base.dispatch_grf_start_reg;
+   case 16: return prog_data->dispatch_grf_start_reg_16;
+   case 32: return prog_data->dispatch_grf_start_reg_32;
+   default: return 0;
+   }
+}
+
+#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
+   _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \
+      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
+
+static inline uint8_t
+_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data,
+                             unsigned simd_width)
+{
+   switch (simd_width) {
+   case 8: return prog_data->reg_blocks_8;
+   case 16: return prog_data->reg_blocks_16;
+   case 32: return prog_data->reg_blocks_32;
+   default: return 0;
+   }
+}
+
+#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \
+   _brw_wm_prog_data_reg_blocks(prog_data, \
+      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))
+
 struct brw_push_const_block {
    unsigned dwords;     /* Dword count, not reg aligned */
    unsigned regs;
@@ -977,7 +1073,7 @@
 
    bool uses_vertexid;
    bool uses_instanceid;
-   bool uses_basevertex;
+   bool uses_is_indexed_draw;
    bool uses_firstvertex;
    bool uses_baseinstance;
    bool uses_drawid;
@@ -1118,6 +1214,18 @@
 struct brw_compiler *
 brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
 
+/**
+ * Returns a compiler configuration for use with disk shader cache
+ *
+ * This value only needs to change for settings that can cause different
+ * program generation between two runs on the same hardware.
+ *
+ * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
+ * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
+ */
+uint64_t
+brw_get_compiler_config_value(const struct brw_compiler *compiler);
+
 unsigned
 brw_prog_data_size(gl_shader_stage stage);
 
@@ -1230,6 +1338,7 @@
                struct gl_program *prog,
                int shader_time_index8,
                int shader_time_index16,
+               int shader_time_index32,
                bool allow_spilling,
                bool use_rep_send, struct brw_vue_map *vue_map,
                char **error_str);
diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
index 5375209..6ef0a6a 100644
--- a/src/intel/compiler/brw_eu.c
+++ b/src/intel/compiler/brw_eu.c
@@ -129,91 +129,76 @@
 unsigned
 brw_get_default_exec_size(struct brw_codegen *p)
 {
-   return brw_inst_exec_size(p->devinfo, p->current);
+   return p->current->exec_size;
 }
 
 unsigned
 brw_get_default_group(struct brw_codegen *p)
 {
-   if (p->devinfo->gen >= 6) {
-      unsigned group = brw_inst_qtr_control(p->devinfo, p->current) * 8;
-      if (p->devinfo->gen >= 7)
-         group += brw_inst_nib_control(p->devinfo, p->current) * 4;
-      return group;
-   } else {
-      unsigned qtr_control = brw_inst_qtr_control(p->devinfo, p->current);
-      if (qtr_control == BRW_COMPRESSION_COMPRESSED)
-         return 0;
-      else
-         return qtr_control * 8;
-   }
+   return p->current->group;
 }
 
 unsigned
 brw_get_default_access_mode(struct brw_codegen *p)
 {
-   return brw_inst_access_mode(p->devinfo, p->current);
+   return p->current->access_mode;
 }
 
 void
 brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
 {
-   brw_inst_set_exec_size(p->devinfo, p->current, value);
+   p->current->exec_size = value;
 }
 
 void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc )
 {
-   brw_inst_set_pred_control(p->devinfo, p->current, pc);
+   p->current->predicate = pc;
 }
 
 void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse)
 {
-   brw_inst_set_pred_inv(p->devinfo, p->current, predicate_inverse);
+   p->current->pred_inv = predicate_inverse;
 }
 
 void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
 {
-   if (p->devinfo->gen >= 7)
-      brw_inst_set_flag_reg_nr(p->devinfo, p->current, reg);
-
-   brw_inst_set_flag_subreg_nr(p->devinfo, p->current, subreg);
+   assert(subreg < 2);
+   p->current->flag_subreg = reg * 2 + subreg;
 }
 
 void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode )
 {
-   brw_inst_set_access_mode(p->devinfo, p->current, access_mode);
+   p->current->access_mode = access_mode;
 }
 
 void
 brw_set_default_compression_control(struct brw_codegen *p,
 			    enum brw_compression compression_control)
 {
-   if (p->devinfo->gen >= 6) {
-      /* Since we don't use the SIMD32 support in gen6, we translate
-       * the pre-gen6 compression control here.
+   switch (compression_control) {
+   case BRW_COMPRESSION_NONE:
+      /* This is the "use the first set of bits of dmask/vmask/arf
+       * according to execsize" option.
        */
-      switch (compression_control) {
-      case BRW_COMPRESSION_NONE:
-	 /* This is the "use the first set of bits of dmask/vmask/arf
-	  * according to execsize" option.
-	  */
-         brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_1Q);
-	 break;
-      case BRW_COMPRESSION_2NDHALF:
-	 /* For SIMD8, this is "use the second set of 8 bits." */
-         brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_2Q);
-	 break;
-      case BRW_COMPRESSION_COMPRESSED:
-	 /* For SIMD16 instruction compression, use the first set of 16 bits
-	  * since we don't do SIMD32 dispatch.
-	  */
-         brw_inst_set_qtr_control(p->devinfo, p->current, GEN6_COMPRESSION_1H);
-	 break;
-      default:
-         unreachable("not reached");
-      }
-   } else {
-      brw_inst_set_qtr_control(p->devinfo, p->current, compression_control);
+      p->current->group = 0;
+      break;
+   case BRW_COMPRESSION_2NDHALF:
+      /* For SIMD8, this is "use the second set of 8 bits." */
+      p->current->group = 8;
+      break;
+   case BRW_COMPRESSION_COMPRESSED:
+      /* For SIMD16 instruction compression, use the first set of 16 bits
+       * since we don't do SIMD32 dispatch.
+       */
+      p->current->group = 0;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   if (p->devinfo->gen <= 6) {
+      p->current->compressed =
+         (compression_control == BRW_COMPRESSION_COMPRESSED);
    }
 }
 
@@ -246,7 +231,7 @@
 void
 brw_set_default_compression(struct brw_codegen *p, bool on)
 {
-   brw_inst_set_compression(p->devinfo, p->current, on);
+   p->current->compressed = on;
 }
 
 /**
@@ -283,29 +268,28 @@
 void
 brw_set_default_group(struct brw_codegen *p, unsigned group)
 {
-   brw_inst_set_group(p->devinfo, p->current, group);
+   p->current->group = group;
 }
 
 void brw_set_default_mask_control( struct brw_codegen *p, unsigned value )
 {
-   brw_inst_set_mask_control(p->devinfo, p->current, value);
+   p->current->mask_control = value;
 }
 
 void brw_set_default_saturate( struct brw_codegen *p, bool enable )
 {
-   brw_inst_set_saturate(p->devinfo, p->current, enable);
+   p->current->saturate = enable;
 }
 
 void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
 {
-   if (p->devinfo->gen >= 6)
-      brw_inst_set_acc_wr_control(p->devinfo, p->current, value);
+   p->current->acc_wr_control = value;
 }
 
 void brw_push_insn_state( struct brw_codegen *p )
 {
    assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
-   memcpy(p->current + 1, p->current, sizeof(brw_inst));
+   *(p->current + 1) = *p->current;
    p->current++;
 }
 
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index e3d0c4c..3824ab2 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -46,6 +46,36 @@
 
 #define BRW_EU_MAX_INSN_STACK 5
 
+struct brw_insn_state {
+   /* One of BRW_EXECUTE_* */
+   unsigned exec_size:3;
+
+   /* Group in units of channels */
+   unsigned group:5;
+
+   /* Compression control on gen4-5 */
+   bool compressed:1;
+
+   /* One of BRW_MASK_* */
+   unsigned mask_control:1;
+
+   bool saturate:1;
+
+   /* One of BRW_ALIGN_* */
+   unsigned access_mode:1;
+
+   /* One of BRW_PREDICATE_* */
+   enum brw_predicate predicate:4;
+
+   bool pred_inv:1;
+
+   /* Flag subreg.  Bottom bit is subreg, top bit is reg */
+   unsigned flag_subreg:2;
+
+   bool acc_wr_control:1;
+};
+
+
 /* A helper for accessing the last instruction emitted.  This makes it easy
  * to set various bits on an instruction without having to create temporary
  * variable and assign the emitted instruction to those.
@@ -62,9 +92,8 @@
 
    /* Allow clients to push/pop instruction state:
     */
-   brw_inst stack[BRW_EU_MAX_INSN_STACK];
-   bool compressed_stack[BRW_EU_MAX_INSN_STACK];
-   brw_inst *current;
+   struct brw_insn_state stack[BRW_EU_MAX_INSN_STACK];
+   struct brw_insn_state *current;
 
    /** Whether or not the user wants automatic exec sizes
     *
@@ -216,47 +245,141 @@
 
 /* Helpers for SEND instruction:
  */
-void brw_set_sampler_message(struct brw_codegen *p,
-                             brw_inst *insn,
-                             unsigned binding_table_index,
-                             unsigned sampler,
-                             unsigned msg_type,
-                             unsigned response_length,
-                             unsigned msg_length,
-                             unsigned header_present,
-                             unsigned simd_mode,
-                             unsigned return_format);
 
-void brw_set_message_descriptor(struct brw_codegen *p,
-                                brw_inst *inst,
-                                enum brw_message_target sfid,
-                                unsigned msg_length,
-                                unsigned response_length,
-                                bool header_present,
-                                bool end_of_thread);
+/**
+ * Construct a message descriptor immediate with the specified common
+ * descriptor controls.
+ */
+static inline uint32_t
+brw_message_desc(const struct gen_device_info *devinfo,
+                 unsigned msg_length,
+                 unsigned response_length,
+                 bool header_present)
+{
+   if (devinfo->gen >= 5) {
+      return (SET_BITS(msg_length, 28, 25) |
+              SET_BITS(response_length, 24, 20) |
+              SET_BITS(header_present, 19, 19));
+   } else {
+      return (SET_BITS(msg_length, 23, 20) |
+              SET_BITS(response_length, 19, 16));
+   }
+}
 
-void brw_set_dp_read_message(struct brw_codegen *p,
-			     brw_inst *insn,
-			     unsigned binding_table_index,
-			     unsigned msg_control,
-			     unsigned msg_type,
-			     unsigned target_cache,
-			     unsigned msg_length,
-                             bool header_present,
-			     unsigned response_length);
+/**
+ * Construct a message descriptor immediate with the specified sampler
+ * function controls.
+ */
+static inline uint32_t
+brw_sampler_desc(const struct gen_device_info *devinfo,
+                 unsigned binding_table_index,
+                 unsigned sampler,
+                 unsigned msg_type,
+                 unsigned simd_mode,
+                 unsigned return_format)
+{
+   const unsigned desc = (SET_BITS(binding_table_index, 7, 0) |
+                          SET_BITS(sampler, 11, 8));
+   if (devinfo->gen >= 7)
+      return (desc | SET_BITS(msg_type, 16, 12) |
+              SET_BITS(simd_mode, 18, 17));
+   else if (devinfo->gen >= 5)
+      return (desc | SET_BITS(msg_type, 15, 12) |
+              SET_BITS(simd_mode, 17, 16));
+   else if (devinfo->is_g4x)
+      return desc | SET_BITS(msg_type, 15, 12);
+   else
+      return (desc | SET_BITS(return_format, 13, 12) |
+              SET_BITS(msg_type, 15, 14));
+}
 
-void brw_set_dp_write_message(struct brw_codegen *p,
-			      brw_inst *insn,
-			      unsigned binding_table_index,
-			      unsigned msg_control,
-			      unsigned msg_type,
-                              unsigned target_cache,
-			      unsigned msg_length,
-			      bool header_present,
-			      unsigned last_render_target,
-			      unsigned response_length,
-			      unsigned end_of_thread,
-			      unsigned send_commit_msg);
+/**
+ * Construct a message descriptor immediate with the specified dataport read
+ * function controls.
+ */
+static inline uint32_t
+brw_dp_read_desc(const struct gen_device_info *devinfo,
+                 unsigned binding_table_index,
+                 unsigned msg_control,
+                 unsigned msg_type,
+                 unsigned target_cache)
+{
+   const unsigned desc = SET_BITS(binding_table_index, 7, 0);
+   if (devinfo->gen >= 7)
+      return (desc | SET_BITS(msg_control, 13, 8) |
+              SET_BITS(msg_type, 17, 14));
+   else if (devinfo->gen >= 6)
+      return (desc | SET_BITS(msg_control, 12, 8) |
+              SET_BITS(msg_type, 16, 13));
+   else if (devinfo->gen >= 5 || devinfo->is_g4x)
+      return (desc | SET_BITS(msg_control, 10, 8) |
+              SET_BITS(msg_type, 13, 11) |
+              SET_BITS(target_cache, 15, 14));
+   else
+      return (desc | SET_BITS(msg_control, 11, 8) |
+              SET_BITS(msg_type, 13, 12) |
+              SET_BITS(target_cache, 15, 14));
+}
+
+/**
+ * Construct a message descriptor immediate with the specified dataport write
+ * function controls.
+ */
+static inline uint32_t
+brw_dp_write_desc(const struct gen_device_info *devinfo,
+                  unsigned binding_table_index,
+                  unsigned msg_control,
+                  unsigned msg_type,
+                  unsigned last_render_target,
+                  unsigned send_commit_msg)
+{
+   const unsigned desc = SET_BITS(binding_table_index, 7, 0);
+   if (devinfo->gen >= 7)
+      return (desc | SET_BITS(msg_control, 13, 8) |
+              SET_BITS(last_render_target, 12, 12) |
+              SET_BITS(msg_type, 17, 14));
+   else if (devinfo->gen >= 6)
+      return (desc | SET_BITS(msg_control, 12, 8) |
+              SET_BITS(last_render_target, 12, 12) |
+              SET_BITS(msg_type, 16, 13) |
+              SET_BITS(send_commit_msg, 17, 17));
+   else
+      return (desc | SET_BITS(msg_control, 11, 8) |
+              SET_BITS(last_render_target, 11, 11) |
+              SET_BITS(msg_type, 14, 12) |
+              SET_BITS(send_commit_msg, 15, 15));
+}
+
+/**
+ * Construct a message descriptor immediate with the specified dataport
+ * surface function controls.
+ */
+static inline uint32_t
+brw_dp_surface_desc(const struct gen_device_info *devinfo,
+                    unsigned msg_type,
+                    unsigned msg_control)
+{
+   assert(devinfo->gen >= 7);
+   return (SET_BITS(msg_control, 13, 8) |
+           SET_BITS(msg_type, 17, 14));
+}
+
+/**
+ * Construct a message descriptor immediate with the specified pixel
+ * interpolator function controls.
+ */
+static inline uint32_t
+brw_pixel_interp_desc(const struct gen_device_info *devinfo,
+                      unsigned msg_type,
+                      bool noperspective,
+                      unsigned simd_mode,
+                      unsigned slot_group)
+{
+   return (SET_BITS(slot_group, 11, 11) |
+           SET_BITS(msg_type, 13, 12) |
+           SET_BITS(!!noperspective, 14, 14) |
+           SET_BITS(simd_mode, 16, 16));
+}
 
 void brw_urb_WRITE(struct brw_codegen *p,
 		   struct brw_reg dest,
@@ -271,18 +394,15 @@
 /**
  * Send message to shared unit \p sfid with a possibly indirect descriptor \p
  * desc.  If \p desc is not an immediate it will be transparently loaded to an
- * address register using an OR instruction.  The returned instruction can be
- * passed as argument to the usual brw_set_*_message() functions in order to
- * specify any additional descriptor bits -- If \p desc is an immediate this
- * will be the SEND instruction itself, otherwise it will be the OR
- * instruction.
+ * address register with an OR.  \p desc_imm is ORed into \p desc either way.
  */
-struct brw_inst *
+void
 brw_send_indirect_message(struct brw_codegen *p,
                           unsigned sfid,
                           struct brw_reg dst,
                           struct brw_reg payload,
-                          struct brw_reg desc);
+                          struct brw_reg desc,
+                          unsigned desc_imm);
 
 void brw_ff_sync(struct brw_codegen *p,
 		   struct brw_reg dest,
@@ -299,16 +419,16 @@
                    unsigned binding_table_index,
                    bool   send_commit_msg);
 
-void brw_fb_WRITE(struct brw_codegen *p,
-		   struct brw_reg payload,
-		   struct brw_reg implied_header,
-		   unsigned msg_control,
-		   unsigned binding_table_index,
-		   unsigned msg_length,
-		   unsigned response_length,
-		   bool eot,
-		   bool last_render_target,
-		   bool header_present);
+brw_inst *brw_fb_WRITE(struct brw_codegen *p,
+                       struct brw_reg payload,
+                       struct brw_reg implied_header,
+                       unsigned msg_control,
+                       unsigned binding_table_index,
+                       unsigned msg_length,
+                       unsigned response_length,
+                       bool eot,
+                       bool last_render_target,
+                       bool header_present);
 
 brw_inst *gen9_fb_READ(struct brw_codegen *p,
                        struct brw_reg dst,
@@ -513,7 +633,8 @@
 
 void
 brw_memory_fence(struct brw_codegen *p,
-                 struct brw_reg dst);
+                 struct brw_reg dst,
+                 enum opcode send_op);
 
 void
 brw_pixel_interpolator_query(struct brw_codegen *p,
@@ -570,6 +691,15 @@
 
 void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg);
 
+void brw_set_desc_ex(struct brw_codegen *p, brw_inst *insn,
+                     unsigned desc, unsigned ex_desc);
+
+static inline void
+brw_set_desc(struct brw_codegen *p, brw_inst *insn, unsigned desc)
+{
+   brw_set_desc_ex(p, insn, desc, 0); /* extended descriptor left as 0 */
+}
+
 void brw_set_uip_jip(struct brw_codegen *p, int start_offset);
 
 enum brw_conditional_mod brw_negate_cmod(uint32_t cmod);
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 332d627..2289fc9 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -46,6 +46,13 @@
       fieldval & field ## _MASK;                                        \
    })
 
+#define SET_BITS(value, high, low)                                      \
+   ({ /* Cast first: left-shifting a signed value can be undefined. */  \
+      const uint32_t fieldval = (uint32_t)(value) << (low);             \
+      assert((fieldval & ~INTEL_MASK(high, low)) == 0);                 \
+      fieldval & INTEL_MASK(high, low);                                 \
+   })
+
 #define GET_BITS(data, high, low) ((data & INTEL_MASK((high), (low))) >> (low))
 #define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
 
@@ -200,7 +207,7 @@
    BRW_OPCODE_SHR =	8,
    BRW_OPCODE_SHL =	9,
    BRW_OPCODE_DIM =	10,  /**< Gen7.5 only */ /* Reused */
-   // BRW_OPCODE_SMOV =	10,  /**< Gen8+       */ /* Reused */
+   BRW_OPCODE_SMOV =	10,  /**< Gen8+       */ /* Reused */
    /* Reserved - 11 */
    BRW_OPCODE_ASR =	12,
    /* Reserved - 13-15 */
@@ -216,27 +223,27 @@
    BRW_OPCODE_BFI2 =	26,  /**< Gen7+ */
    /* Reserved - 27-31 */
    BRW_OPCODE_JMPI =	32,
-   // BRW_OPCODE_BRD =	33,  /**< Gen7+ */
+   BRW_OPCODE_BRD =	33,  /**< Gen7+ */
    BRW_OPCODE_IF =	34,
    BRW_OPCODE_IFF =	35,  /**< Pre-Gen6    */ /* Reused */
-   // BRW_OPCODE_BRC =	35,  /**< Gen7+       */ /* Reused */
+   BRW_OPCODE_BRC =	35,  /**< Gen7+       */ /* Reused */
    BRW_OPCODE_ELSE =	36,
    BRW_OPCODE_ENDIF =	37,
    BRW_OPCODE_DO =	38,  /**< Pre-Gen6    */ /* Reused */
-   // BRW_OPCODE_CASE =	38,  /**< Gen6 only   */ /* Reused */
+   BRW_OPCODE_CASE =	38,  /**< Gen6 only   */ /* Reused */
    BRW_OPCODE_WHILE =	39,
    BRW_OPCODE_BREAK =	40,
    BRW_OPCODE_CONTINUE = 41,
    BRW_OPCODE_HALT =	42,
-   // BRW_OPCODE_CALLA =	43,  /**< Gen7.5+     */
-   // BRW_OPCODE_MSAVE =	44,  /**< Pre-Gen6    */ /* Reused */
-   // BRW_OPCODE_CALL =	44,  /**< Gen6+       */ /* Reused */
-   // BRW_OPCODE_MREST =	45,  /**< Pre-Gen6    */ /* Reused */
-   // BRW_OPCODE_RET =	45,  /**< Gen6+       */ /* Reused */
-   // BRW_OPCODE_PUSH =	46,  /**< Pre-Gen6    */ /* Reused */
-   // BRW_OPCODE_FORK =	46,  /**< Gen6 only   */ /* Reused */
-   // BRW_OPCODE_GOTO =	46,  /**< Gen8+       */ /* Reused */
-   // BRW_OPCODE_POP =	47,  /**< Pre-Gen6    */
+   BRW_OPCODE_CALLA =	43,  /**< Gen7.5+     */
+   BRW_OPCODE_MSAVE =	44,  /**< Pre-Gen6    */ /* Reused */
+   BRW_OPCODE_CALL =	44,  /**< Gen6+       */ /* Reused */
+   BRW_OPCODE_MREST =	45,  /**< Pre-Gen6    */ /* Reused */
+   BRW_OPCODE_RET =	45,  /**< Gen6+       */ /* Reused */
+   BRW_OPCODE_PUSH =	46,  /**< Pre-Gen6    */ /* Reused */
+   BRW_OPCODE_FORK =	46,  /**< Gen6 only   */ /* Reused */
+   BRW_OPCODE_GOTO =	46,  /**< Gen8+       */ /* Reused */
+   BRW_OPCODE_POP =	47,  /**< Pre-Gen6    */
    BRW_OPCODE_WAIT =	48,
    BRW_OPCODE_SEND =	49,
    BRW_OPCODE_SENDC =	50,
@@ -273,7 +280,7 @@
    BRW_OPCODE_PLN =	90,  /**< G45+ */
    BRW_OPCODE_MAD =	91,  /**< Gen6+ */
    BRW_OPCODE_LRP =	92,  /**< Gen6+ */
-   // BRW_OPCODE_MADM =	93,  /**< Gen8+ */
+   BRW_OPCODE_MADM =	93,  /**< Gen8+ */
    /* Reserved 94-124 */
    BRW_OPCODE_NENOP =	125, /**< G45 only */
    BRW_OPCODE_NOP =	126,
@@ -480,6 +487,8 @@
 
    SHADER_OPCODE_GET_BUFFER_SIZE,
 
+   SHADER_OPCODE_INTERLOCK,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
@@ -499,7 +508,6 @@
     */
    FS_OPCODE_DDY_COARSE,
    FS_OPCODE_DDY_FINE,
-   FS_OPCODE_CINTERP,
    FS_OPCODE_LINTERP,
    FS_OPCODE_PIXEL_X,
    FS_OPCODE_PIXEL_Y,
@@ -508,7 +516,6 @@
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
-   FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
    FS_OPCODE_DISCARD_JUMP,
    FS_OPCODE_SET_SAMPLE_ID,
    FS_OPCODE_PACK_HALF_2x16_SPLIT,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 0dfe26a..0847d9b 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -360,45 +360,21 @@
 }
 
 /**
- * Set the Message Descriptor and Extended Message Descriptor fields
- * for SEND messages.
- *
- * \note This zeroes out the Function Control bits, so it must be called
- *       \b before filling out any message-specific data.  Callers can
- *       choose not to fill in irrelevant bits; they will be zero.
+ * Specify the descriptor and extended descriptor immediate for a SEND(C)
+ * message instruction.  \p ex_desc is only written on Gen9 and later.
  */
 void
-brw_set_message_descriptor(struct brw_codegen *p,
-			   brw_inst *inst,
-			   enum brw_message_target sfid,
-			   unsigned msg_length,
-			   unsigned response_length,
-			   bool header_present,
-			   bool end_of_thread)
+brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
+                unsigned desc, unsigned ex_desc)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-
-   brw_set_src1(p, inst, brw_imm_d(0));
-
-   /* For indirect sends, `inst` will not be the SEND/SENDC instruction
-    * itself; instead, it will be a MOV/OR into the address register.
-    *
-    * In this case, we avoid setting the extended message descriptor bits,
-    * since they go on the later SEND/SENDC instead and if set here would
-    * instead clobber the conditionalmod bits.
-    */
-   unsigned opcode = brw_inst_opcode(devinfo, inst);
-   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
-      brw_inst_set_sfid(devinfo, inst, sfid);
-   }
-
-   brw_inst_set_mlen(devinfo, inst, msg_length);
-   brw_inst_set_rlen(devinfo, inst, response_length);
-   brw_inst_set_eot(devinfo, inst, end_of_thread);
-
-   if (devinfo->gen >= 5) {
-      brw_inst_set_header_present(devinfo, inst, header_present);
-   }
+   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
+          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
+   brw_inst_set_src1_file_type(devinfo, inst,
+                               BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_D);
+   brw_inst_set_send_desc(devinfo, inst, desc);
+   if (devinfo->gen >= 9)
+      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
 }
 
 static void brw_set_math_message( struct brw_codegen *p,
@@ -436,9 +412,10 @@
       break;
    }
 
+   brw_set_desc(p, inst, brw_message_desc(
+                   devinfo, msg_length, response_length, false));
 
-   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
-			      msg_length, response_length, false, false);
+   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
    brw_inst_set_math_msg_function(devinfo, inst, function);
    brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
    brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
@@ -456,8 +433,11 @@
 {
    const struct gen_device_info *devinfo = p->devinfo;
 
-   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
-			      1, response_length, true, end_of_thread);
+   brw_set_desc(p, insn, brw_message_desc(
+                   devinfo, 1, response_length, true));
+
+   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
+   brw_inst_set_eot(devinfo, insn, end_of_thread);
    brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
    brw_inst_set_urb_allocate(devinfo, insn, allocate);
    /* The following fields are not used by FF_SYNC: */
@@ -481,9 +461,11 @@
    assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
    assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
 
-   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
-			      msg_length, response_length, true,
-                              flags & BRW_URB_WRITE_EOT);
+   brw_set_desc(p, insn, brw_message_desc(
+                   devinfo, msg_length, response_length, true));
+
+   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
+   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
 
    if (flags & BRW_URB_WRITE_OWORD) {
       assert(msg_length == 2); /* header + one OWORD of data */
@@ -508,91 +490,6 @@
    }
 }
 
-void
-brw_set_dp_write_message(struct brw_codegen *p,
-			 brw_inst *insn,
-			 unsigned binding_table_index,
-			 unsigned msg_control,
-			 unsigned msg_type,
-                         unsigned target_cache,
-			 unsigned msg_length,
-			 bool header_present,
-			 unsigned last_render_target,
-			 unsigned response_length,
-			 unsigned end_of_thread,
-			 unsigned send_commit_msg)
-{
-   const struct gen_device_info *devinfo = p->devinfo;
-   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
-                          BRW_SFID_DATAPORT_WRITE);
-
-   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
-			      header_present, end_of_thread);
-
-   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
-   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
-   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
-   brw_inst_set_rt_last(devinfo, insn, last_render_target);
-   if (devinfo->gen < 7) {
-      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
-   }
-
-   if (devinfo->gen >= 11)
-      brw_inst_set_null_rt(devinfo, insn, false);
-}
-
-void
-brw_set_dp_read_message(struct brw_codegen *p,
-			brw_inst *insn,
-			unsigned binding_table_index,
-			unsigned msg_control,
-			unsigned msg_type,
-			unsigned target_cache,
-			unsigned msg_length,
-                        bool header_present,
-			unsigned response_length)
-{
-   const struct gen_device_info *devinfo = p->devinfo;
-   const unsigned sfid = (devinfo->gen >= 6 ? target_cache :
-                          BRW_SFID_DATAPORT_READ);
-
-   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
-			      header_present, false);
-
-   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
-   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
-   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
-   if (devinfo->gen < 6)
-      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
-}
-
-void
-brw_set_sampler_message(struct brw_codegen *p,
-                        brw_inst *inst,
-                        unsigned binding_table_index,
-                        unsigned sampler,
-                        unsigned msg_type,
-                        unsigned response_length,
-                        unsigned msg_length,
-                        unsigned header_present,
-                        unsigned simd_mode,
-                        unsigned return_format)
-{
-   const struct gen_device_info *devinfo = p->devinfo;
-
-   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
-			      response_length, header_present, false);
-
-   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
-   brw_inst_set_sampler(devinfo, inst, sampler);
-   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
-   if (devinfo->gen >= 5) {
-      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
-   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
-      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
-   }
-}
-
 static void
 gen7_set_dp_scratch_message(struct brw_codegen *p,
                             brw_inst *inst,
@@ -611,8 +508,10 @@
    const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                 num_regs - 1);
 
-   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
-                              mlen, rlen, header_present, false);
+   brw_set_desc(p, inst, brw_message_desc(
+                   devinfo, mlen, rlen, header_present));
+
+   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
    brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
    brw_inst_set_scratch_read_write(devinfo, inst, write);
    brw_inst_set_scratch_type(devinfo, inst, dword);
@@ -621,72 +520,6 @@
    brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
 }
 
-struct brw_insn_state {
-   /* One of BRW_EXECUTE_* */
-   unsigned exec_size:3;
-
-   /* Group in units of channels */
-   unsigned group:5;
-
-   /* Compression control on gen4-5 */
-   bool compressed:1;
-
-   /* One of BRW_MASK_* */
-   unsigned mask_control:1;
-
-   bool saturate:1;
-
-   /* One of BRW_ALIGN_* */
-   unsigned access_mode:1;
-
-   /* One of BRW_PREDICATE_* */
-   enum brw_predicate predicate:4;
-
-   bool pred_inv:1;
-
-   /* Flag subreg.  Bottom bit is subreg, top bit is reg */
-   unsigned flag_subreg:2;
-
-   bool acc_wr_control:1;
-};
-
-static struct brw_insn_state
-brw_inst_get_state(const struct gen_device_info *devinfo,
-                   const brw_inst *insn)
-{
-   struct brw_insn_state state = { };
-
-   state.exec_size = brw_inst_exec_size(devinfo, insn);
-   if (devinfo->gen >= 6) {
-      state.group = brw_inst_qtr_control(devinfo, insn) * 8;
-      if (devinfo->gen >= 7)
-         state.group += brw_inst_nib_control(devinfo, insn) * 4;
-   } else {
-      unsigned qtr_control = brw_inst_qtr_control(devinfo, insn);
-      if (qtr_control == BRW_COMPRESSION_COMPRESSED) {
-         state.group = 0;
-         state.compressed = true;
-      } else {
-         state.group = qtr_control * 8;
-         state.compressed = false;
-      }
-   }
-   state.access_mode = brw_inst_access_mode(devinfo, insn);
-   state.mask_control = brw_inst_mask_control(devinfo, insn);
-   state.saturate = brw_inst_saturate(devinfo, insn);
-   state.predicate = brw_inst_pred_control(devinfo, insn);
-   state.pred_inv = brw_inst_pred_inv(devinfo, insn);
-
-   state.flag_subreg = brw_inst_flag_subreg_nr(devinfo, insn);
-   if (devinfo->gen >= 7)
-      state.flag_subreg += brw_inst_flag_reg_nr(devinfo, insn) * 2;
-
-   if (devinfo->gen >= 6)
-      state.acc_wr_control = brw_inst_acc_wr_control(devinfo, insn);
-
-   return state;
-}
-
 static void
 brw_inst_set_state(const struct gen_device_info *devinfo,
                    brw_inst *insn,
@@ -735,8 +568,7 @@
    brw_inst_set_opcode(devinfo, insn, opcode);
 
    /* Apply the default instruction state */
-   struct brw_insn_state current = brw_inst_get_state(devinfo, p->current);
-   brw_inst_set_state(devinfo, insn, &current);
+   brw_inst_set_state(devinfo, insn, p->current);
 
    return insn;
 }
@@ -2077,7 +1909,7 @@
    const unsigned target_cache =
       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_WRITE);
    uint32_t msg_type;
 
    if (devinfo->gen >= 6)
@@ -2118,6 +1950,7 @@
       struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
 					 BRW_REGISTER_TYPE_UW);
 
+      brw_inst_set_sfid(devinfo, insn, target_cache);
       brw_inst_set_compression(devinfo, insn, false);
 
       if (brw_inst_exec_size(devinfo, insn) >= 16)
@@ -2157,18 +1990,12 @@
       else
 	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
 
-      brw_set_dp_write_message(p,
-			       insn,
-                               brw_scratch_surface_idx(p),
-			       BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
-			       msg_type,
-                               target_cache,
-			       mlen,
-			       true, /* header_present */
-			       0, /* not a render target */
-			       send_commit_msg, /* response_length */
-			       0, /* eot */
-			       send_commit_msg);
+      brw_set_desc(p, insn,
+                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
+                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
+                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+                                     msg_type, 0, /* not a render target */
+                                     send_commit_msg));
    }
 }
 
@@ -2210,7 +2037,7 @@
    const unsigned target_cache =
       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_READ);
 
    {
       brw_push_insn_state(p);
@@ -2230,6 +2057,7 @@
    {
       brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
 
+      brw_inst_set_sfid(devinfo, insn, target_cache);
       assert(brw_inst_pred_control(devinfo, insn) == 0);
       brw_inst_set_compression(devinfo, insn, false);
 
@@ -2241,15 +2069,12 @@
          brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
       }
 
-      brw_set_dp_read_message(p,
-			      insn,
-                              brw_scratch_surface_idx(p),
-			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
-			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
-			      target_cache,
-			      1, /* msg_length */
-                              true, /* header_present */
-			      rlen);
+      brw_set_desc(p, insn,
+                   brw_message_desc(devinfo, 1, rlen, true) |
+                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
+                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
+                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
    }
 }
 
@@ -2301,7 +2126,7 @@
    const struct gen_device_info *devinfo = p->devinfo;
    const unsigned target_cache =
       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
-       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
+       BRW_SFID_DATAPORT_READ);
    const unsigned exec_size = 1 << brw_get_default_exec_size(p);
 
    /* On newer hardware, offset is in units of owords. */
@@ -2330,6 +2155,8 @@
 
    brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
 
+   brw_inst_set_sfid(devinfo, insn, target_cache);
+
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
@@ -2341,33 +2168,32 @@
       brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
    }
 
-   brw_set_dp_read_message(p, insn, bind_table_index,
-                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
-			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
-			   target_cache,
-			   1, /* msg_length */
-                           true, /* header_present */
-			   DIV_ROUND_UP(exec_size, 8)); /* response_length */
+   brw_set_desc(p, insn,
+                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
+                brw_dp_read_desc(devinfo, bind_table_index,
+                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
+                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
+                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
 
    brw_pop_insn_state(p);
 }
 
-
-void brw_fb_WRITE(struct brw_codegen *p,
-                  struct brw_reg payload,
-                  struct brw_reg implied_header,
-                  unsigned msg_control,
-                  unsigned binding_table_index,
-                  unsigned msg_length,
-                  unsigned response_length,
-                  bool eot,
-                  bool last_render_target,
-                  bool header_present)
+brw_inst *
+brw_fb_WRITE(struct brw_codegen *p,
+             struct brw_reg payload,
+             struct brw_reg implied_header,
+             unsigned msg_control,
+             unsigned binding_table_index,
+             unsigned msg_length,
+             unsigned response_length,
+             bool eot,
+             bool last_render_target,
+             bool header_present)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    const unsigned target_cache =
       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_WRITE);
    brw_inst *insn;
    unsigned msg_type;
    struct brw_reg dest, src0;
@@ -2382,6 +2208,7 @@
    } else {
       insn = next_insn(p, BRW_OPCODE_SEND);
    }
+   brw_inst_set_sfid(devinfo, insn, target_cache);
    brw_inst_set_compression(devinfo, insn, false);
 
    if (devinfo->gen >= 6) {
@@ -2399,18 +2226,15 @@
 
    brw_set_dest(p, insn, dest);
    brw_set_src0(p, insn, src0);
-   brw_set_dp_write_message(p,
-			    insn,
-			    binding_table_index,
-			    msg_control,
-			    msg_type,
-                            target_cache,
-			    msg_length,
-			    header_present,
-			    last_render_target,
-			    response_length,
-			    eot,
-			    0 /* send_commit_msg */);
+   brw_set_desc(p, insn,
+                brw_message_desc(devinfo, msg_length, response_length,
+                                 header_present) |
+                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
+                                  msg_type, last_render_target,
+                                  0 /* send_commit_msg */));
+   brw_inst_set_eot(devinfo, insn, eot);
+
+   return insn;
 }
 
 brw_inst *
@@ -2428,14 +2252,16 @@
       brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
    brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);
 
+   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
    brw_set_dest(p, insn, dst);
    brw_set_src0(p, insn, payload);
-   brw_set_dp_read_message(p, insn, binding_table_index,
-                           per_sample << 5 | msg_subtype,
-                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
-                           GEN6_SFID_DATAPORT_RENDER_CACHE,
-                           msg_length, true /* header_present */,
-                           response_length);
+   brw_set_desc(
+      p, insn,
+      brw_message_desc(devinfo, msg_length, response_length, true) |
+      brw_dp_read_desc(devinfo, binding_table_index,
+                       per_sample << 5 | msg_subtype,
+                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
+                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
    brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);
 
    return insn;
@@ -2466,6 +2292,7 @@
       gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
    brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
 
    /* From the 965 PRM (volume 4, part 1, section 14.2.41):
@@ -2487,15 +2314,11 @@
 
    brw_set_dest(p, insn, dest);
    brw_set_src0(p, insn, src0);
-   brw_set_sampler_message(p, insn,
-                           binding_table_index,
-                           sampler,
-                           msg_type,
-                           response_length,
-                           msg_length,
-                           header_present,
-                           simd_mode,
-                           return_format);
+   brw_set_desc(p, insn,
+                brw_message_desc(devinfo, msg_length, response_length,
+                                 header_present) |
+                brw_sampler_desc(devinfo, binding_table_index, sampler,
+                                 msg_type, simd_mode, return_format));
 }
 
 /* Adjust the message header's sampler state pointer to
@@ -2596,31 +2419,24 @@
 		       swizzle);
 }
 
-struct brw_inst *
+void
 brw_send_indirect_message(struct brw_codegen *p,
                           unsigned sfid,
                           struct brw_reg dst,
                           struct brw_reg payload,
-                          struct brw_reg desc)
+                          struct brw_reg desc,
+                          unsigned desc_imm)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    struct brw_inst *send;
-   int setup;
 
    dst = retype(dst, BRW_REGISTER_TYPE_UW);
 
    assert(desc.type == BRW_REGISTER_TYPE_UD);
 
-   /* We hold on to the setup instruction (the SEND in the direct case, the OR
-    * in the indirect case) by its index in the instruction store.  The
-    * pointer returned by next_insn() may become invalid if emitting the SEND
-    * in the indirect case reallocs the store.
-    */
-
    if (desc.file == BRW_IMMEDIATE_VALUE) {
-      setup = p->nr_insn;
       send = next_insn(p, BRW_OPCODE_SEND);
-      brw_set_src1(p, send, desc);
+      brw_set_desc(p, send, desc.ud | desc_imm);
 
    } else {
       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
@@ -2632,11 +2448,10 @@
       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
 
       /* Load the indirect descriptor to an address register using OR so the
-       * caller can specify additional descriptor bits with the usual
-       * brw_set_*_message() helper functions.
+       * caller can specify additional descriptor bits with the desc_imm
+       * immediate.
        */
-      setup = p->nr_insn;
-      brw_OR(p, addr, desc, brw_imm_ud(0));
+      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));
 
       brw_pop_insn_state(p);
 
@@ -2650,23 +2465,16 @@
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
    brw_inst_set_sfid(devinfo, send, sfid);
-
-   return &p->store[setup];
 }
 
-static struct brw_inst *
+static void
 brw_send_indirect_surface_message(struct brw_codegen *p,
                                   unsigned sfid,
                                   struct brw_reg dst,
                                   struct brw_reg payload,
                                   struct brw_reg surface,
-                                  unsigned message_len,
-                                  unsigned response_len,
-                                  bool header_present)
+                                  unsigned desc_imm)
 {
-   const struct gen_device_info *devinfo = p->devinfo;
-   struct brw_inst *insn;
-
    if (surface.file != BRW_IMMEDIATE_VALUE) {
       struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
 
@@ -2679,22 +2487,17 @@
       /* Mask out invalid bits from the surface index to avoid hangs e.g. when
        * some surface array is accessed out of bounds.
        */
-      insn = brw_AND(p, addr,
-                     suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
-                               BRW_GET_SWZ(surface.swizzle, 0)),
-                     brw_imm_ud(0xff));
+      brw_AND(p, addr,
+              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
+                        BRW_GET_SWZ(surface.swizzle, 0)),
+              brw_imm_ud(0xff));
 
       brw_pop_insn_state(p);
 
       surface = addr;
    }
 
-   insn = brw_send_indirect_message(p, sfid, dst, payload, surface);
-   brw_inst_set_mlen(devinfo, insn, message_len);
-   brw_inst_set_rlen(devinfo, insn, response_len);
-   brw_inst_set_header_present(devinfo, insn, header_present);
-
-   return insn;
+   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm);
 }
 
 static bool
@@ -2903,26 +2706,22 @@
    const unsigned target_cache =
       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_WRITE);
    brw_inst *insn;
 
    gen6_resolve_implied_move(p, &src0, msg_reg_nr);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, insn, target_cache);
    brw_set_dest(p, insn, dest);
    brw_set_src0(p, insn, src0);
-   brw_set_src1(p, insn, brw_imm_d(0));
-   brw_set_dp_write_message(p, insn,
-                            binding_table_index,
-                            0, /* msg_control: ignored */
-                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
-                            target_cache,
-                            1, /* msg_length */
-                            true, /* header_present */
-                            0, /* last_render_target: ignored */
-                            send_commit_msg, /* response_length */
-                            0, /* end_of_thread */
-                            send_commit_msg); /* send_commit_msg */
+   brw_set_desc(p, insn,
+                brw_message_desc(devinfo, 1, send_commit_msg, true) |
+                brw_dp_write_desc(devinfo, binding_table_index,
+                                  0, /* msg_control: ignored */
+                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
+                                  0, /* last_render_target: ignored */
+                                  send_commit_msg)); /* send_commit_msg */
 }
 
 static unsigned
@@ -2939,37 +2738,34 @@
       return num_channels;
 }
 
-static void
-brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
-                                  brw_inst *insn,
-                                  unsigned atomic_op,
-                                  bool response_expected)
+static uint32_t
+brw_dp_untyped_atomic_desc(struct brw_codegen *p,
+                           unsigned atomic_op,
+                           bool response_expected)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    unsigned msg_control =
       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
       (response_expected ? 1 << 5 : 0); /* Return data expected */
+   unsigned msg_type;
 
    if (devinfo->gen >= 8 || devinfo->is_haswell) {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
          if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
             msg_control |= 1 << 4; /* SIMD8 mode */
 
-         brw_inst_set_dp_msg_type(devinfo, insn,
-                                  HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP);
+         msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP;
       } else {
-         brw_inst_set_dp_msg_type(devinfo, insn,
-            HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2);
+         msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
       }
    } else {
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP);
-
       if (brw_get_default_exec_size(p) != BRW_EXECUTE_16)
          msg_control |= 1 << 4; /* SIMD8 mode */
+
+      msg_type = GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
    }
 
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -2986,6 +2782,11 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN7_SFID_DATAPORT_DATA_CACHE);
+   const unsigned response_length = brw_surface_payload_size(
+      p, response_expected, devinfo->gen >= 8 || devinfo->is_haswell, true);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, header_present) |
+      brw_dp_untyped_atomic_desc(p, atomic_op, response_expected);
    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
    /* Mask out unused components -- This is especially important in Align16
     * mode on generations that don't have native support for SIMD4x2 atomics,
@@ -2994,22 +2795,19 @@
     * uninitialized Y, Z and W coordinates of the payload.
     */
    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
-      brw_surface_payload_size(p, response_expected,
-                               devinfo->gen >= 8 || devinfo->is_haswell, true),
-      header_present);
 
-   brw_set_dp_untyped_atomic_message(
-      p, insn, atomic_op, response_expected);
+   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
+                                     payload, surface, desc);
 }
 
-static void
-brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
-                                        struct brw_inst *insn,
-                                        unsigned num_channels)
+static uint32_t
+brw_dp_untyped_surface_read_desc(struct brw_codegen *p,
+                                 unsigned num_channels)
 {
    const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned msg_type = (devinfo->gen >= 8 || devinfo->is_haswell ?
+                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
+                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ);
    /* Set mask of 32-bit channels to drop. */
    unsigned msg_control = 0xf & (0xf << num_channels);
 
@@ -3020,11 +2818,7 @@
          msg_control |= 2 << 4; /* SIMD8 mode */
    }
 
-   brw_inst_set_dp_msg_type(devinfo, insn,
-                            (devinfo->gen >= 8 || devinfo->is_haswell ?
-                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ :
-                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ));
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -3039,21 +2833,23 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN7_SFID_DATAPORT_DATA_CACHE);
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, dst, payload, surface, msg_length,
-      brw_surface_payload_size(p, num_channels, true, true),
-      false);
+   const unsigned response_length =
+      brw_surface_payload_size(p, num_channels, true, true);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, false) |
+      brw_dp_untyped_surface_read_desc(p, num_channels);
 
-   brw_set_dp_untyped_surface_read_message(
-      p, insn, num_channels);
+   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
 }
 
-static void
-brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
-                                         struct brw_inst *insn,
-                                         unsigned num_channels)
+static uint32_t
+brw_dp_untyped_surface_write_desc(struct brw_codegen *p,
+                                  unsigned num_channels)
 {
    const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned msg_type = (devinfo->gen >= 8 || devinfo->is_haswell ?
+                              HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
+                              GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
    /* Set mask of 32-bit channels to drop. */
    unsigned msg_control = 0xf & (0xf << num_channels);
 
@@ -3069,11 +2865,7 @@
          msg_control |= 2 << 4; /* SIMD8 mode */
    }
 
-   brw_inst_set_dp_msg_type(devinfo, insn,
-                            devinfo->gen >= 8 || devinfo->is_haswell ?
-                             HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE :
-                             GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE);
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -3088,16 +2880,16 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN7_SFID_DATAPORT_DATA_CACHE);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, 0, header_present) |
+      brw_dp_untyped_surface_write_desc(p, num_channels);
    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
    const unsigned mask = devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                           WRITEMASK_X : WRITEMASK_XYZW;
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, brw_writemask(brw_null_reg(), mask),
-      payload, surface, msg_length, 0, header_present);
 
-   brw_set_dp_untyped_surface_write_message(
-      p, insn, num_channels);
+   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
+                                     payload, surface, desc);
 }
 
 static unsigned
@@ -3115,6 +2907,21 @@
    }
 }
 
+/* Return the brw_dp_surface_desc() bits for a byte scattered msg_type. */
+static uint32_t
+brw_dp_byte_scattered_desc(struct brw_codegen *p, unsigned bit_size,
+                           unsigned msg_type)
+{
+   /* msg_control carries the data element size shifted up by two bits;
+    * bit 0 is set for SIMD16 mode and left clear for SIMD8 mode.
+    */
+   const unsigned elem_size =
+      brw_byte_scattered_data_element_from_bit_size(bit_size);
+   const bool simd16 = brw_get_default_exec_size(p) == BRW_EXECUTE_16;
+   const unsigned msg_control = (elem_size << 2) | (simd16 ? 1 : 0);
+
+   return brw_dp_surface_desc(p->devinfo, msg_type, msg_control);
+}
 
 void
 brw_byte_scattered_read(struct brw_codegen *p,
@@ -3127,24 +2934,15 @@
    const struct gen_device_info *devinfo = p->devinfo;
    assert(devinfo->gen > 7 || devinfo->is_haswell);
    assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
-   const unsigned sfid =  GEN7_SFID_DATAPORT_DATA_CACHE;
+   const unsigned response_length =
+      brw_surface_payload_size(p, 1, true, true);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, false) |
+      brw_dp_byte_scattered_desc(p, bit_size,
+                                 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
 
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, dst, payload, surface, msg_length,
-      brw_surface_payload_size(p, 1, true, true),
-      false);
-
-   unsigned msg_control =
-      brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
-
-   if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
-      msg_control |= 1; /* SIMD16 mode */
-   else
-      msg_control |= 0; /* SIMD8 mode */
-
-   brw_inst_set_dp_msg_type(devinfo, insn,
-                            HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ);
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   brw_send_indirect_surface_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
+                                     dst, payload, surface, desc);
 }
 
 void
@@ -3158,57 +2956,46 @@
    const struct gen_device_info *devinfo = p->devinfo;
    assert(devinfo->gen > 7 || devinfo->is_haswell);
    assert(brw_get_default_access_mode(p) == BRW_ALIGN_1);
-   const unsigned sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, 0, header_present) |
+      brw_dp_byte_scattered_desc(p, bit_size,
+                                 HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
 
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, brw_writemask(brw_null_reg(), WRITEMASK_XYZW),
-      payload, surface, msg_length, 0, header_present);
-
-   unsigned msg_control =
-      brw_byte_scattered_data_element_from_bit_size(bit_size) << 2;
-
-   if (brw_get_default_exec_size(p) == BRW_EXECUTE_16)
-      msg_control |= 1;
-   else
-      msg_control |= 0;
-
-   brw_inst_set_dp_msg_type(devinfo, insn,
-                            HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE);
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   brw_send_indirect_surface_message(p, GEN7_SFID_DATAPORT_DATA_CACHE,
+                                     brw_writemask(brw_null_reg(),
+                                                   WRITEMASK_XYZW),
+                                     payload, surface, desc);
 }
 
-static void
-brw_set_dp_typed_atomic_message(struct brw_codegen *p,
-                                struct brw_inst *insn,
-                                unsigned atomic_op,
-                                bool response_expected)
+static uint32_t
+brw_dp_typed_atomic_desc(struct brw_codegen *p,
+                         unsigned atomic_op,
+                         bool response_expected)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    unsigned msg_control =
       atomic_op | /* Atomic Operation Type: BRW_AOP_* */
       (response_expected ? 1 << 5 : 0); /* Return data expected */
+   unsigned msg_type;
 
    if (devinfo->gen >= 8 || devinfo->is_haswell) {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
          if ((brw_get_default_group(p) / 8) % 2 == 1)
             msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
 
-         brw_inst_set_dp_msg_type(devinfo, insn,
-                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP);
+         msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP;
       } else {
-         brw_inst_set_dp_msg_type(devinfo, insn,
-                                  HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2);
+         msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2;
       }
 
    } else {
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               GEN7_DATAPORT_RC_TYPED_ATOMIC_OP);
-
       if ((brw_get_default_group(p) / 8) % 2 == 1)
          msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
+
+      msg_type = GEN7_DATAPORT_RC_TYPED_ATOMIC_OP;
    }
 
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -3224,27 +3011,27 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN6_SFID_DATAPORT_RENDER_CACHE);
+   const unsigned response_length = brw_surface_payload_size(
+      p, response_expected, devinfo->gen >= 8 || devinfo->is_haswell, false);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, header_present) |
+      brw_dp_typed_atomic_desc(p, atomic_op, response_expected);
    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
    const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, brw_writemask(dst, mask), payload, surface, msg_length,
-      brw_surface_payload_size(p, response_expected,
-                               devinfo->gen >= 8 || devinfo->is_haswell, false),
-      header_present);
 
-   brw_set_dp_typed_atomic_message(
-      p, insn, atomic_op, response_expected);
+   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
+                                     payload, surface, desc);
 }
 
-static void
-brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
-                                      struct brw_inst *insn,
-                                      unsigned num_channels)
+static uint32_t
+brw_dp_typed_surface_read_desc(struct brw_codegen *p,
+                               unsigned num_channels)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    /* Set mask of unused channels. */
    unsigned msg_control = 0xf & (0xf << num_channels);
+   unsigned msg_type;
 
    if (devinfo->gen >= 8 || devinfo->is_haswell) {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
@@ -3254,19 +3041,17 @@
             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
       }
 
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ);
+      msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ;
    } else {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
          if ((brw_get_default_group(p) / 8) % 2 == 1)
             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
       }
 
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               GEN7_DATAPORT_RC_TYPED_SURFACE_READ);
+      msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_READ;
    }
 
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -3282,24 +3067,23 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN6_SFID_DATAPORT_RENDER_CACHE);
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, dst, payload, surface, msg_length,
-      brw_surface_payload_size(p, num_channels,
-                               devinfo->gen >= 8 || devinfo->is_haswell, false),
-      header_present);
+   const unsigned response_length = brw_surface_payload_size(
+      p, num_channels, devinfo->gen >= 8 || devinfo->is_haswell, false);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, header_present) |
+      brw_dp_typed_surface_read_desc(p, num_channels);
 
-   brw_set_dp_typed_surface_read_message(
-      p, insn, num_channels);
+   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
 }
 
-static void
-brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
-                                       struct brw_inst *insn,
-                                       unsigned num_channels)
+static uint32_t
+brw_dp_typed_surface_write_desc(struct brw_codegen *p,
+                                unsigned num_channels)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    /* Set mask of unused channels. */
    unsigned msg_control = 0xf & (0xf << num_channels);
+   unsigned msg_type;
 
    if (devinfo->gen >= 8 || devinfo->is_haswell) {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
@@ -3309,8 +3093,7 @@
             msg_control |= 1 << 4; /* Use low 8 slots of the sample mask */
       }
 
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE);
+      msg_type = HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE;
 
    } else {
       if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
@@ -3318,11 +3101,10 @@
             msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
       }
 
-      brw_inst_set_dp_msg_type(devinfo, insn,
-                               GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE);
+      msg_type = GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE;
    }
 
-   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+   return brw_dp_surface_desc(devinfo, msg_type, msg_control);
 }
 
 void
@@ -3337,16 +3119,16 @@
    const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN6_SFID_DATAPORT_RENDER_CACHE);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, 0, header_present) |
+      brw_dp_typed_surface_write_desc(p, num_channels);
    const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
    /* Mask out unused components -- See comment in brw_untyped_atomic(). */
    const unsigned mask = (devinfo->gen == 7 && !devinfo->is_haswell && !align1 ?
                           WRITEMASK_X : WRITEMASK_XYZW);
-   struct brw_inst *insn = brw_send_indirect_surface_message(
-      p, sfid, brw_writemask(brw_null_reg(), mask),
-      payload, surface, msg_length, 0, header_present);
 
-   brw_set_dp_typed_surface_write_message(
-      p, insn, num_channels);
+   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
+                                     payload, surface, desc);
 }
 
 static void
@@ -3357,11 +3139,10 @@
 {
    const struct gen_device_info *devinfo = p->devinfo;
 
-   brw_set_message_descriptor(p, insn, sfid,
-                              1 /* message length */,
-                              (commit_enable ? 1 : 0) /* response length */,
-                              true /* header present */,
-                              false);
+   brw_set_desc(p, insn, brw_message_desc(
+                   devinfo, 1, (commit_enable ? 1 : 0), true));
+
+   brw_inst_set_sfid(devinfo, insn, sfid);
 
    switch (sfid) {
    case GEN6_SFID_DATAPORT_RENDER_CACHE:
@@ -3380,7 +3161,8 @@
 
 void
 brw_memory_fence(struct brw_codegen *p,
-                 struct brw_reg dst)
+                 struct brw_reg dst,
+                 enum opcode send_op)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    const bool commit_enable =
@@ -3396,7 +3178,7 @@
    /* Set dst as destination for dependency tracking, the MEMORY_FENCE
     * message doesn't write anything back.
     */
-   insn = next_insn(p, BRW_OPCODE_SEND);
+   insn = next_insn(p, send_op);
    dst = retype(dst, BRW_REGISTER_TYPE_UW);
    brw_set_dest(p, insn, dst);
    brw_set_src0(p, insn, dst);
@@ -3408,7 +3190,7 @@
        * flush it too.  Use a different register so both flushes can be
        * pipelined by the hardware.
        */
-      insn = next_insn(p, BRW_OPCODE_SEND);
+      insn = next_insn(p, send_op);
       brw_set_dest(p, insn, offset(dst, 1));
       brw_set_src0(p, insn, offset(dst, 1));
       brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
@@ -3436,24 +3218,23 @@
                              unsigned response_length)
 {
    const struct gen_device_info *devinfo = p->devinfo;
-   struct brw_inst *insn;
    const uint16_t exec_size = brw_get_default_exec_size(p);
+   const unsigned slot_group = brw_get_default_group(p) / 16;
+   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
+   const unsigned desc =
+      brw_message_desc(devinfo, msg_length, response_length, false) |
+      brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
+                            slot_group);
 
    /* brw_send_indirect_message will automatically use a direct send message
     * if data is actually immediate.
     */
-   insn = brw_send_indirect_message(p,
-                                    GEN7_SFID_PIXEL_INTERPOLATOR,
-                                    dest,
-                                    mrf,
-                                    vec1(data));
-   brw_inst_set_mlen(devinfo, insn, msg_length);
-   brw_inst_set_rlen(devinfo, insn, response_length);
-
-   brw_inst_set_pi_simd_mode(devinfo, insn, exec_size == BRW_EXECUTE_16);
-   brw_inst_set_pi_slot_group(devinfo, insn, 0); /* zero unless 32/64px dispatch */
-   brw_inst_set_pi_nopersp(devinfo, insn, noperspective);
-   brw_inst_set_pi_message_type(devinfo, insn, mode);
+   brw_send_indirect_message(p,
+                             GEN7_SFID_PIXEL_INTERPOLATOR,
+                             dest,
+                             mrf,
+                             vec1(data),
+                             desc);
 }
 
 void
@@ -3503,9 +3284,8 @@
           */
          inst = brw_FBL(p, vec1(dst), exec_mask);
       } else {
-         const struct brw_reg flag = brw_flag_reg(
-            brw_inst_flag_reg_nr(devinfo, p->current),
-            brw_inst_flag_subreg_nr(devinfo, p->current));
+         const struct brw_reg flag = brw_flag_reg(p->current->flag_subreg / 2,
+                                                  p->current->flag_subreg % 2);
 
          brw_set_default_exec_size(p, BRW_EXECUTE_1);
          brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
@@ -3710,10 +3490,11 @@
                          struct brw_reg payload,
                          uint32_t surf_index)
 {
-   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                           HSW_SFID_DATAPORT_DATA_CACHE_1 :
                           GEN7_SFID_DATAPORT_DATA_CACHE);
-   assert(p->devinfo->gen >= 7);
+   assert(devinfo->gen >= 7);
 
    brw_push_insn_state(p);
    brw_set_default_access_mode(p, BRW_ALIGN_1);
@@ -3728,10 +3509,11 @@
                                       BRW_ARF_NULL, 0));
    brw_set_src0(p, send, brw_vec1_reg(payload.file,
                                       payload.nr, 0));
-   brw_set_src1(p, send, brw_imm_ud(0));
-   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
-   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
-   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
+   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
+                          brw_dp_untyped_atomic_desc(p, BRW_AOP_ADD, false)));
+
+   brw_inst_set_sfid(devinfo, send, sfid);
+   brw_inst_set_binding_table_index(devinfo, send, surf_index);
 
    brw_pop_insn_state(p);
 }
@@ -3754,13 +3536,9 @@
    brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
    brw_set_src0(p, inst, src);
    brw_set_src1(p, inst, brw_null_reg());
+   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
 
-   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
-                              1 /* msg_length */,
-                              0 /* response_length */,
-                              false /* header_present */,
-                              false /* end_of_thread */);
-
+   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
    brw_inst_set_gateway_notify(devinfo, inst, 1);
    brw_inst_set_gateway_subfuncid(devinfo, inst,
                                   BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
index 3ac5fea..852f701 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -474,9 +474,11 @@
       dst_type_size = 8;
 
    if (exec_type_size > dst_type_size) {
-      ERROR_IF(dst_stride * dst_type_size != exec_type_size,
-               "Destination stride must be equal to the ratio of the sizes of "
-               "the execution data type to the destination type");
+      if (!(dst_type_is_byte && inst_is_raw_move(devinfo, inst))) {
+         ERROR_IF(dst_stride * dst_type_size != exec_type_size,
+                  "Destination stride must be equal to the ratio of the sizes "
+                  "of the execution data type to the destination type");
+      }
 
       unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
 
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 29ddf07..1183e7c 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -49,7 +49,7 @@
 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
               const fs_reg *src, unsigned sources)
 {
-   memset(this, 0, sizeof(*this));
+   memset((void*)this, 0, sizeof(*this));
 
    this->src = new fs_reg[MAX2(sources, 3)];
    for (unsigned i = 0; i < sources; i++)
@@ -131,7 +131,7 @@
 
 fs_inst::fs_inst(const fs_inst &that)
 {
-   memcpy(this, &that, sizeof(that));
+   memcpy((void*)this, &that, sizeof(that));
 
    this->src = new fs_reg[MAX2(that.sources, 3)];
 
@@ -191,21 +191,8 @@
                             vec4_result, surf_index, vec4_offset);
    inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
 
-   fs_reg dw = offset(vec4_result, bld, (const_offset & 0xf) / 4);
-   switch (type_sz(dst.type)) {
-   case 2:
-      shuffle_32bit_load_result_to_16bit_data(bld, dst, dw, 0, 1);
-      bld.MOV(dst, subscript(dw, dst.type, (const_offset / 2) & 1));
-      break;
-   case 4:
-      bld.MOV(dst, retype(dw, dst.type));
-      break;
-   case 8:
-      shuffle_32bit_load_result_to_64bit_data(bld, dst, dw, 1);
-      break;
-   default:
-      unreachable("Unsupported bit_size");
-   }
+   shuffle_from_32bit_read(bld, dst, vec4_result,
+                           (const_offset & 0xf) / type_sz(dst.type), 1);
 }
 
 /**
@@ -407,6 +394,25 @@
 }
 
 bool
+fs_inst::can_do_cmod()
+{
+   bool allowed = backend_instruction::can_do_cmod();
+
+   /* Conditional modifiers appear to be generated from the accumulator
+    * result.  Negating a UD value produces a 33rd bit (the sign) in the
+    * accumulator, so checks such as equality against a 32-bit value stop
+    * working.  Reject any negated unsigned-integer source.  See piglit
+    * fs-op-neg-uvec4.
+    */
+   for (unsigned s = 0; allowed && s < sources; s++) {
+      if (type_is_unsigned_int(src[s].type) && src[s].negate)
+         allowed = false;
+   }
+
+   return allowed;
+}
+
+bool
 fs_inst::can_change_types() const
 {
    return dst.type == src[0].type &&
@@ -421,7 +427,7 @@
 void
 fs_reg::init()
 {
-   memset(this, 0, sizeof(*this));
+   memset((void*)this, 0, sizeof(*this));
    type = BRW_REGISTER_TYPE_UD;
    stride = 1;
 }
@@ -818,6 +824,8 @@
       else
          return 1;
    }
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      return (i == 0 ? 2 : 1);
 
    default:
       return 1;
@@ -829,6 +837,15 @@
 {
    switch (opcode) {
    case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_REP_FB_WRITE:
+      if (arg == 0) {
+         if (base_mrf >= 0)
+            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
+         else
+            return mlen * REG_SIZE;
+      }
+      break;
+
    case FS_OPCODE_FB_READ:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
@@ -844,13 +861,17 @@
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
-   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
    case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
    case SHADER_OPCODE_BYTE_SCATTERED_READ:
       if (arg == 0)
          return mlen * REG_SIZE;
       break;
 
+   case FS_OPCODE_SET_SAMPLE_ID:
+      if (arg == 1)
+         return 1;
+      break;
+
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
       /* The payload is actually stored in src1 */
       if (arg == 1)
@@ -960,8 +981,8 @@
                             opcode != BRW_OPCODE_CSEL &&
                             opcode != BRW_OPCODE_IF &&
                             opcode != BRW_OPCODE_WHILE)) ||
-       opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS ||
-       opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL) {
+       opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
+       opcode == FS_OPCODE_FB_WRITE) {
       return flag_mask(this);
    } else {
       return flag_mask(dst, size_written);
@@ -1010,7 +1031,8 @@
    case SHADER_OPCODE_SAMPLEINFO:
       return 1;
    case FS_OPCODE_FB_WRITE:
-      return 2;
+   case FS_OPCODE_REP_FB_WRITE:
+      return inst->src[0].file == BAD_FILE ? 0 : 2;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
       return 1;
@@ -1076,11 +1098,11 @@
 
    /* gl_FragCoord.z */
    if (devinfo->gen >= 6) {
-      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
+      bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
    } else {
       bld.emit(FS_OPCODE_LINTERP, wpos,
-           this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
-           interp_reg(VARYING_SLOT_POS, 2));
+               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
+               component(interp_reg(VARYING_SLOT_POS, 2), 0));
    }
    wpos = offset(wpos, bld, 1);
 
@@ -1214,30 +1236,16 @@
     * The X, Y sample positions come in as bytes in  thread payload. So, read
     * the positions using vstride=16, width=8, hstride=2.
     */
-   struct brw_reg sample_pos_reg =
-      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
-                    BRW_REGISTER_TYPE_B), 16, 8, 2);
+   const fs_reg sample_pos_reg =
+      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
 
-   if (dispatch_width == 8) {
-      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
-   } else {
-      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
-      abld.half(1).MOV(half(int_sample_x, 1),
-                       fs_reg(suboffset(sample_pos_reg, 16)));
-   }
    /* Compute gl_SamplePosition.x */
-   compute_sample_position(pos, int_sample_x);
-   pos = offset(pos, abld, 1);
-   if (dispatch_width == 8) {
-      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
-   } else {
-      abld.half(0).MOV(half(int_sample_y, 0),
-                       fs_reg(suboffset(sample_pos_reg, 1)));
-      abld.half(1).MOV(half(int_sample_y, 1),
-                       fs_reg(suboffset(sample_pos_reg, 17)));
-   }
+   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
+   compute_sample_position(offset(pos, abld, 0), int_sample_x);
+
    /* Compute gl_SamplePosition.y */
-   compute_sample_position(pos, int_sample_y);
+   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
+   compute_sample_position(offset(pos, abld, 1), int_sample_y);
    return reg;
 }
 
@@ -1286,16 +1294,20 @@
        * TODO: These payload bits exist on Gen7 too, but they appear to always
        *       be zero, so this code fails to work.  We should find out why.
        */
-      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
+      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
 
-      abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
-                                         BRW_REGISTER_TYPE_UB), 1, 8, 0)),
-                    brw_imm_v(0x44440000));
+      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
+         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
+         hbld.SHR(offset(tmp, hbld, i),
+                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
+                         1, 8, 0),
+                  brw_imm_v(0x44440000));
+      }
+
       abld.AND(*reg, tmp, brw_imm_w(0xf));
    } else {
-      const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
-                                         BRW_REGISTER_TYPE_UD), 0);
-      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UW);
+      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
+      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
        * 8x multisampling, subspan 0 will represent sample N (where N
@@ -1325,8 +1337,15 @@
                brw_imm_ud(0xc0));
       abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
 
-      /* This works for both SIMD8 and SIMD16 */
-      abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
+      /* This works for SIMD8-SIMD16.  It also works for SIMD32, but only if we
+       * can assume 4x MSAA.  Disallow SIMD32 on IVB+.
+       *
+       * FINISHME: One day, we could come up with a way to do this that
+       * actually works on gen7.
+       */
+      if (devinfo->gen >= 7)
+         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gen7");
+      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));
 
       /* This special instruction takes care of setting vstride=1,
        * width=4, hstride=0 of t2 during an ADD instruction.
@@ -1346,8 +1365,8 @@
 
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
-   fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
-                               BRW_REGISTER_TYPE_D));
+   fs_reg coverage_mask =
+      fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
 
    if (wm_prog_data->persample_dispatch) {
       /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
@@ -1609,14 +1628,26 @@
     * setup regs, now that the location of the constants has been chosen.
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->opcode == FS_OPCODE_LINTERP) {
-	 assert(inst->src[1].file == FIXED_GRF);
-         inst->src[1].nr += urb_start;
-      }
-
-      if (inst->opcode == FS_OPCODE_CINTERP) {
-	 assert(inst->src[0].file == FIXED_GRF);
-         inst->src[0].nr += urb_start;
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == ATTR) {
+            /* ATTR regs in the FS are in units of logical scalar inputs each
+             * of which consumes half of a GRF register.
+             */
+            assert(inst->src[i].offset < REG_SIZE / 2);
+            const unsigned grf = urb_start + inst->src[i].nr / 2;
+            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
+                                    inst->src[i].offset;
+            const unsigned width = inst->src[i].stride == 0 ?
+                                   1 : MIN2(inst->exec_size, 8);
+            struct brw_reg reg = stride(
+               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+                           offset),
+               width * inst->src[i].stride,
+               width, inst->src[i].stride);
+            reg.abs = inst->src[i].abs;
+            reg.negate = inst->src[i].negate;
+            inst->src[i] = reg;
+         }
       }
    }
 
@@ -2445,7 +2476,8 @@
          }
          break;
       case BRW_OPCODE_OR:
-         if (inst->src[0].equals(inst->src[1])) {
+         if (inst->src[0].equals(inst->src[1]) ||
+             inst->src[1].is_zero()) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[1] = reg_undef;
             progress = true;
@@ -2663,7 +2695,7 @@
 {
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
-   if (stage != MESA_SHADER_FRAGMENT)
+   if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16)
       return false;
 
    if (devinfo->gen != 9 && !devinfo->is_cherryview)
@@ -3233,7 +3265,18 @@
       write->mlen = 1;
    } else {
       assume(key->nr_color_regions > 0);
+
+      struct brw_reg header =
+         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
+      bld.exec_all().group(16, 0)
+         .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
       for (int i = 0; i < key->nr_color_regions; ++i) {
+         if (i > 0) {
+            bld.exec_all().group(1, 0)
+               .MOV(component(header, 2), brw_imm_ud(i));
+         }
+
          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
          write->saturate = key->clamp_fragment_color;
          write->base_mrf = base_mrf;
@@ -3243,6 +3286,7 @@
       }
    }
    write->eot = true;
+   write->last_rt = true;
 
    calculate_cfg();
 
@@ -3957,31 +4001,104 @@
    int header_size = 2, payload_header_size;
    unsigned length = 0;
 
-   /* From the Sandy Bridge PRM, volume 4, page 198:
-    *
-    *     "Dispatched Pixel Enables. One bit per pixel indicating
-    *      which pixels were originally enabled when the thread was
-    *      dispatched. This field is only required for the end-of-
-    *      thread message and on all dual-source messages."
-    */
-   if (devinfo->gen >= 6 &&
-       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
-       color1.file == BAD_FILE &&
-       key->nr_color_regions == 1) {
-      header_size = 0;
-   }
+   if (devinfo->gen < 6) {
+      /* TODO: Support SIMD32 on gen4-5 */
+      assert(bld.group() < 16);
 
-   if (header_size != 0) {
-      assert(header_size == 2);
-      /* Allocate 2 registers for a header */
-      length += 2;
-   }
+      /* For gen4-5, we always have a header consisting of g0 and g1.  We have
+       * an implied MOV from g0,g1 to the start of the message.  The MOV from
+       * g0 is handled by the hardware and the MOV from g1 is provided by the
+       * generator.  This is required because, on gen4-5, the generator may
+       * generate two write messages with different message lengths in order
+       * to handle AA data properly.
+       *
+       * Also, since the pixel mask goes in the g0 portion of the message and
+       * since render target writes are the last thing in the shader, we write
+       * the pixel mask directly into g0 and it will get copied as part of the
+       * implied write.
+       */
+      if (prog_data->uses_kill) {
+         bld.exec_all().group(1, 0)
+            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
+                 brw_flag_reg(0, 1));
+      }
 
-   if (payload.aa_dest_stencil_reg) {
+      assert(length == 0);
+      length = 2;
+   } else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
+               prog_data->uses_kill) ||
+              color1.file != BAD_FILE ||
+              key->nr_color_regions > 1) {
+      /* From the Sandy Bridge PRM, volume 4, page 198:
+       *
+       *     "Dispatched Pixel Enables. One bit per pixel indicating
+       *      which pixels were originally enabled when the thread was
+       *      dispatched. This field is only required for the end-of-
+       *      thread message and on all dual-source messages."
+       */
+      const fs_builder ubld = bld.exec_all().group(8, 0);
+
+      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      if (bld.group() < 16) {
+         /* The header starts off as g0 and g1 for the first half */
+         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+                                              BRW_REGISTER_TYPE_UD));
+      } else {
+         /* The header starts off as g0 and g2 for the second half */
+         assert(bld.group() < 32);
+         const fs_reg header_sources[2] = {
+            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
+         };
+         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
+      }
+
+      uint32_t g00_bits = 0;
+
+      /* Set "Source0 Alpha Present to RenderTarget" bit in message
+       * header.
+       */
+      if (inst->target > 0 && key->replicate_alpha)
+         g00_bits |= 1 << 11;
+
+      /* Set computed stencil to render target */
+      if (prog_data->computed_stencil)
+         g00_bits |= 1 << 14;
+
+      if (g00_bits) {
+         /* OR extra bits into g0.0 */
+         ubld.group(1, 0).OR(component(header, 0),
+                             retype(brw_vec1_grf(0, 0),
+                                    BRW_REGISTER_TYPE_UD),
+                             brw_imm_ud(g00_bits));
+      }
+
+      /* Set the render target index for choosing BLEND_STATE. */
+      if (inst->target > 0) {
+         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
+      }
+
+      if (prog_data->uses_kill) {
+         assert(bld.group() < 16);
+         ubld.group(1, 0).MOV(retype(component(header, 15),
+                                     BRW_REGISTER_TYPE_UW),
+                              brw_flag_reg(0, 1));
+      }
+
+      assert(length == 0);
+      sources[0] = header;
+      sources[1] = horiz_offset(header, 8);
+      length = 2;
+   }
+   assert(length == 0 || length == 2);
+   header_size = length;
+
+   if (payload.aa_dest_stencil_reg[0]) {
+      assert(inst->group < 16);
       sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
          .MOV(sources[length],
-              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
       length++;
    }
 
@@ -4001,7 +4118,7 @@
 
       bld.exec_all().annotate("FB write oMask")
          .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
-                           inst->group),
+                           inst->group % 16),
               sample_mask);
       length++;
    }
@@ -4046,7 +4163,7 @@
 
    if (src_stencil.file != BAD_FILE) {
       assert(devinfo->gen >= 9);
-      assert(bld.dispatch_width() != 16);
+      assert(bld.dispatch_width() == 8);
 
       /* XXX: src_stencil is only available on gen9+. dst_depth is never
        * available on gen9+. As such it's impossible to have both enabled at the
@@ -4082,7 +4199,13 @@
       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
          load->dst.nr |= BRW_MRF_COMPR4;
 
-      inst->resize_sources(0);
+      if (devinfo->gen < 6) {
+         /* Set up src[0] for the implied MOV from grf0-1 */
+         inst->resize_sources(1);
+         inst->src[0] = brw_vec8_grf(0, 0);
+      } else {
+         inst->resize_sources(0);
+      }
       inst->base_mrf = 1;
    }
 
@@ -4094,12 +4217,21 @@
 static void
 lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
 {
-   const fs_builder &ubld = bld.exec_all();
+   const fs_builder &ubld = bld.exec_all().group(8, 0);
    const unsigned length = 2;
-   const fs_reg header = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, length);
+   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
 
-   ubld.group(16, 0)
-       .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   if (bld.group() < 16) {
+      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
+                                           BRW_REGISTER_TYPE_UD));
+   } else {
+      assert(bld.group() < 32);
+      const fs_reg header_sources[] = {
+         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
+         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
+      };
+      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
+   }
 
    inst->resize_sources(1);
    inst->src[0] = header;
@@ -4711,7 +4843,7 @@
       inst->src[1] = tmp;
 
       inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
-
+      inst->mlen = inst->exec_size / 8;
    } else {
       const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
                            BRW_REGISTER_TYPE_UD);
@@ -4987,8 +5119,14 @@
             type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
             type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
 
+         /* We check size_read(i) against size_written instead of REG_SIZE
+          * because we want to properly handle SIMD32.  In SIMD32, you can end
+          * up with writes to 4 registers and a source that reads 2 registers
+          * and we may still need to lower all the way to SIMD8 in that case.
+          */
          if (inst->size_written > REG_SIZE &&
-             inst->size_read(i) != 0 && inst->size_read(i) <= REG_SIZE &&
+             inst->size_read(i) != 0 &&
+             inst->size_read(i) < inst->size_written &&
              !is_scalar_exception && !is_packed_word_exception) {
             const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
             max_width = MIN2(max_width, inst->exec_size / reg_count);
@@ -5658,7 +5796,7 @@
              */
             fs_inst split_inst = *inst;
             split_inst.exec_size = lower_width;
-            split_inst.eot = inst->eot && i == n - 1;
+            split_inst.eot = inst->eot && i == int(n - 1);
 
             /* Select the correct channel enables for the i-th group, then
              * transform the sources and destination and emit the lowered
@@ -5936,118 +6074,81 @@
    fprintf(file, "\n");
 }
 
-/**
- * Possibly returns an instruction that set up @param reg.
- *
- * Sometimes we want to take the result of some expression/variable
- * dereference tree and rewrite the instruction generating the result
- * of the tree.  When processing the tree, we know that the
- * instructions generated are all writing temporaries that are dead
- * outside of this tree.  So, if we have some instructions that write
- * a temporary, we're free to point that temp write somewhere else.
- *
- * Note that this doesn't guarantee that the instruction generated
- * only reg -- it might be the size=4 destination of a texture instruction.
- */
-fs_inst *
-fs_visitor::get_instruction_generating_reg(fs_inst *start,
-					   fs_inst *end,
-					   const fs_reg &reg)
-{
-   if (end == start ||
-       end->is_partial_write() ||
-       !reg.equals(end->dst)) {
-      return NULL;
-   } else {
-      return end;
-   }
-}
-
 void
 fs_visitor::setup_fs_payload_gen6()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
-
+   const unsigned payload_width = MIN2(16, dispatch_width);
+   assert(dispatch_width % payload_width == 0);
    assert(devinfo->gen >= 6);
 
-   /* R0-1: masks, pixel X/Y coordinates. */
-   payload.num_regs = 2;
-   /* R2: only for 32-pixel dispatch.*/
-
-   /* R3-26: barycentric interpolation coordinates.  These appear in the
-    * same order that they appear in the brw_barycentric_mode
-    * enum.  Each set of coordinates occupies 2 registers if dispatch width
-    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
-    * appear if they were enabled using the "Barycentric Interpolation
-    * Mode" bits in WM_STATE.
-    */
-   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
-      if (prog_data->barycentric_interp_modes & (1 << i)) {
-         payload.barycentric_coord_reg[i] = payload.num_regs;
-         payload.num_regs += 2;
-         if (dispatch_width == 16) {
-            payload.num_regs += 2;
-         }
-      }
-   }
-
-   /* R27: interpolated depth if uses source depth */
-   prog_data->uses_src_depth =
+   prog_data->uses_src_depth = prog_data->uses_src_w =
       (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
-   if (prog_data->uses_src_depth) {
-      payload.source_depth_reg = payload.num_regs;
-      payload.num_regs++;
-      if (dispatch_width == 16) {
-         /* R28: interpolated depth if not SIMD8. */
-         payload.num_regs++;
-      }
-   }
 
-   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
-   prog_data->uses_src_w =
-      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
-   if (prog_data->uses_src_w) {
-      payload.source_w_reg = payload.num_regs;
-      payload.num_regs++;
-      if (dispatch_width == 16) {
-         /* R30: interpolated W if not SIMD8. */
-         payload.num_regs++;
-      }
-   }
-
-   /* R31: MSAA position offsets. */
-   if (prog_data->persample_dispatch &&
-       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
-      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
-       *
-       *    "MSDISPMODE_PERSAMPLE is required in order to select
-       *    POSOFFSET_SAMPLE"
-       *
-       * So we can only really get sample positions if we are doing real
-       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
-       * persample dispatch, we hard-code it to 0.5.
-       */
-      prog_data->uses_pos_offset = true;
-      payload.sample_pos_reg = payload.num_regs;
-      payload.num_regs++;
-   }
-
-   /* R32: MSAA input coverage mask */
    prog_data->uses_sample_mask =
       (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
-   if (prog_data->uses_sample_mask) {
-      assert(devinfo->gen >= 7);
-      payload.sample_mask_in_reg = payload.num_regs;
-      payload.num_regs++;
-      if (dispatch_width == 16) {
-         /* R33: input coverage mask if not SIMD8. */
-         payload.num_regs++;
-      }
+
+   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+    *
+    *    "MSDISPMODE_PERSAMPLE is required in order to select
+    *    POSOFFSET_SAMPLE"
+    *
+    * So we can only really get sample positions if we are doing real
+    * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+    * persample dispatch, we hard-code it to 0.5.
+    */
+   prog_data->uses_pos_offset = prog_data->persample_dispatch &&
+      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
+
+   /* R0: PS thread payload header. */
+   payload.num_regs++;
+
+   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
+      /* R1: masks, pixel X/Y coordinates. */
+      payload.subspan_coord_reg[j] = payload.num_regs++;
    }
 
-   /* R34-: bary for 32-pixel. */
-   /* R58-59: interp W for 32-pixel. */
+   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
+      /* R3-26: barycentric interpolation coordinates.  These appear in the
+       * same order that they appear in the brw_barycentric_mode enum.  Each
+       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
+       * registers if dispatch width == 16.  Coordinates only appear if they
+       * were enabled using the "Barycentric Interpolation Mode" bits in
+       * WM_STATE.
+       */
+      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+         if (prog_data->barycentric_interp_modes & (1 << i)) {
+            payload.barycentric_coord_reg[i][j] = payload.num_regs;
+            payload.num_regs += payload_width / 4;
+         }
+      }
+
+      /* R27-28: interpolated depth if uses source depth */
+      if (prog_data->uses_src_depth) {
+         payload.source_depth_reg[j] = payload.num_regs;
+         payload.num_regs += payload_width / 8;
+      }
+
+      /* R29-30: interpolated W set if GEN6_WM_USES_SOURCE_W. */
+      if (prog_data->uses_src_w) {
+         payload.source_w_reg[j] = payload.num_regs;
+         payload.num_regs += payload_width / 8;
+      }
+
+      /* R31: MSAA position offsets. */
+      if (prog_data->uses_pos_offset) {
+         payload.sample_pos_reg[j] = payload.num_regs;
+         payload.num_regs++;
+      }
+
+      /* R32-33: MSAA input coverage mask */
+      if (prog_data->uses_sample_mask) {
+         assert(devinfo->gen >= 7);
+         payload.sample_mask_in_reg[j] = payload.num_regs;
+         payload.num_regs += payload_width / 8;
+      }
+   }
 
    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
       source_depth_to_render_target = true;
@@ -6129,44 +6230,6 @@
    }
 }
 
-/**
- * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
- *
- * The needs_unlit_centroid_workaround ends up producing one of these per
- * channel of centroid input, so it's good to clean them up.
- *
- * An assumption here is that nothing ever modifies the dispatched pixels
- * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
- * dictates that anyway.
- */
-bool
-fs_visitor::opt_drop_redundant_mov_to_flags()
-{
-   bool flag_mov_found[4] = {false};
-   bool progress = false;
-
-   /* Instructions removed by this pass can only be added if this were true */
-   if (!devinfo->needs_unlit_centroid_workaround)
-      return false;
-
-   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      if (inst->is_control_flow()) {
-         memset(flag_mov_found, 0, sizeof(flag_mov_found));
-      } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
-         if (!flag_mov_found[inst->flag_subreg]) {
-            flag_mov_found[inst->flag_subreg] = true;
-         } else {
-            inst->remove(block);
-            progress = true;
-         }
-      } else if (inst->flags_written()) {
-         flag_mov_found[inst->flag_subreg] = false;
-      }
-   }
-
-   return progress;
-}
-
 void
 fs_visitor::optimize()
 {
@@ -6224,7 +6287,6 @@
    int iteration = 0;
    int pass_num = 0;
 
-   OPT(opt_drop_redundant_mov_to_flags);
    OPT(remove_extra_rounding_modes);
 
    do {
@@ -6710,8 +6772,11 @@
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
-         discard_init->flag_subreg = 1;
+         const fs_reg dispatch_mask =
+            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
+         bld.exec_all().group(1, 0)
+            .MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW),
+                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
       }
 
       /* Generate FS IR for main().  (the visitor only descends into
@@ -7028,7 +7093,7 @@
                const nir_shader *src_shader,
                struct gl_program *prog,
                int shader_time_index8, int shader_time_index16,
-               bool allow_spilling,
+               int shader_time_index32, bool allow_spilling,
                bool use_rep_send, struct brw_vue_map *vue_map,
                char **error_str)
 {
@@ -7076,9 +7141,7 @@
    prog_data->barycentric_interp_modes =
       brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
 
-   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
-   uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
-   unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
 
    fs_visitor v8(compiler, log_data, mem_ctx, key,
                  &prog_data->base, prog, shader, 8,
@@ -7090,8 +7153,8 @@
       return NULL;
    } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
       simd8_cfg = v8.cfg;
-      simd8_grf_start = v8.payload.num_regs;
-      simd8_grf_used = v8.grf_used;
+      prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+      prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used);
    }
 
    if (v8.max_dispatch_width >= 16 &&
@@ -7107,8 +7170,28 @@
                                    v16.fail_msg);
       } else {
          simd16_cfg = v16.cfg;
-         simd16_grf_start = v16.payload.num_regs;
-         simd16_grf_used = v16.grf_used;
+         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+         prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
+      }
+   }
+
+   /* Currently, the compiler only supports SIMD32 on SNB+ */
+   if (v8.max_dispatch_width >= 32 && !use_rep_send &&
+       compiler->devinfo->gen >= 6 &&
+       unlikely(INTEL_DEBUG & DEBUG_DO32)) {
+      /* Try a SIMD32 compile */
+      fs_visitor v32(compiler, log_data, mem_ctx, key,
+                     &prog_data->base, prog, shader, 32,
+                     shader_time_index32);
+      v32.import_uniforms(&v8);
+      if (!v32.run_fs(allow_spilling, false)) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD32 shader failed to compile: %s",
+                                   v32.fail_msg);
+      } else {
+         simd32_cfg = v32.cfg;
+         prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
+         prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
       }
    }
 
@@ -7121,8 +7204,30 @@
     * Instead, we just give them exactly one shader and we pick the widest one
     * available.
     */
-   if (compiler->devinfo->gen < 5 && simd16_cfg)
-      simd8_cfg = NULL;
+   if (compiler->devinfo->gen < 5) {
+      if (simd32_cfg || simd16_cfg)
+         simd8_cfg = NULL;
+      if (simd32_cfg)
+         simd16_cfg = NULL;
+   }
+
+   /* If computed depth is enabled, SNB only allows SIMD8. */
+   if (compiler->devinfo->gen == 6 &&
+       prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
+      assert(simd16_cfg == NULL && simd32_cfg == NULL);
+
+   if (compiler->devinfo->gen <= 5 && !simd8_cfg) {
+      /* Ironlake and earlier only have one Dispatch GRF start field.  Make
+       * the data available in the base prog data struct for convenience.
+       */
+      if (simd16_cfg) {
+         prog_data->base.dispatch_grf_start_reg =
+            prog_data->dispatch_grf_start_reg_16;
+      } else if (simd32_cfg) {
+         prog_data->base.dispatch_grf_start_reg =
+            prog_data->dispatch_grf_start_reg_32;
+      }
+   }
 
    if (prog_data->persample_dispatch) {
       /* Starting with SandyBridge (where we first get MSAA), the different
@@ -7130,16 +7235,11 @@
        * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
        * generations, the only configurations supporting persample dispatch
        * are are this in which only one dispatch width is enabled.
-       *
-       * If computed depth is enabled, SNB only allows SIMD8 while IVB+
-       * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
        */
-      if (compiler->devinfo->gen == 6 &&
-          prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
-         simd16_cfg = NULL;
-      } else if (simd16_cfg) {
+      if (simd32_cfg || simd16_cfg)
          simd8_cfg = NULL;
-      }
+      if (simd32_cfg)
+         simd16_cfg = NULL;
    }
 
    /* We have to compute the flat inputs after the visitor is finished running
@@ -7148,7 +7248,7 @@
     */
    brw_compute_flat_inputs(prog_data, shader);
 
-   fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                   v8.promoted_constants, v8.runtime_check_aads_emit,
                   MESA_SHADER_FRAGMENT);
 
@@ -7162,20 +7262,16 @@
    if (simd8_cfg) {
       prog_data->dispatch_8 = true;
       g.generate_code(simd8_cfg, 8);
-      prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
-      prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+   }
 
-      if (simd16_cfg) {
-         prog_data->dispatch_16 = true;
-         prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
-         prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
-         prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
-      }
-   } else if (simd16_cfg) {
+   if (simd16_cfg) {
       prog_data->dispatch_16 = true;
-      g.generate_code(simd16_cfg, 16);
-      prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
-      prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
+   }
+
+   if (simd32_cfg) {
+      prog_data->dispatch_32 = true;
+      prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
    }
 
    return g.get_assembly();
@@ -7262,7 +7358,6 @@
 compile_cs_to_nir(const struct brw_compiler *compiler,
                   void *mem_ctx,
                   const struct brw_cs_prog_key *key,
-                  struct brw_cs_prog_data *prog_data,
                   const nir_shader *src_shader,
                   unsigned dispatch_width)
 {
@@ -7303,7 +7398,7 @@
     */
    if (min_dispatch_width <= 8) {
       nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
-                                           prog_data, src_shader, 8);
+                                           src_shader, 8);
       v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
                           NULL, /* Never used in core profile */
                           nir8, 8, shader_time_index);
@@ -7324,7 +7419,7 @@
        !fail_msg && min_dispatch_width <= 16) {
       /* Try a SIMD16 compile */
       nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
-                                            prog_data, src_shader, 16);
+                                            src_shader, 16);
       v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
                            NULL, /* Never used in core profile */
                            nir16, 16, shader_time_index);
@@ -7357,7 +7452,7 @@
    if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
       /* Try a SIMD32 compile */
       nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
-                                            prog_data, src_shader, 32);
+                                            src_shader, 32);
       v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
                            NULL, /* Never used in core profile */
                            nir32, 32, shader_time_index);
@@ -7389,7 +7484,7 @@
       if (error_str)
          *error_str = ralloc_strdup(mem_ctx, fail_msg);
    } else {
-      fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
+      fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                      promoted_constants, false, MESA_SHADER_COMPUTE);
       if (INTEL_DEBUG & DEBUG_CS) {
          char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index e384db8..d56e337 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -83,10 +83,6 @@
    void setup_uniform_clipplane_values();
    void compute_clip_distance();
 
-   fs_inst *get_instruction_generating_reg(fs_inst *start,
-					   fs_inst *end,
-					   const fs_reg &reg);
-
    void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
                                    const fs_reg &dst,
                                    const fs_reg &surf_index,
@@ -235,7 +231,7 @@
    fs_reg get_nir_src(const nir_src &src);
    fs_reg get_nir_src_imm(const nir_src &src);
    fs_reg get_nir_dest(const nir_dest &dest);
-   fs_reg get_nir_image_deref(const nir_deref_var *deref);
+   fs_reg get_nir_image_deref(nir_deref_instr *deref);
    fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
    void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
                      unsigned wr_mask);
@@ -276,7 +272,7 @@
 
    fs_reg get_timestamp(const brw::fs_builder &bld);
 
-   struct brw_reg interp_reg(int location, int channel);
+   fs_reg interp_reg(int location, int channel);
 
    int implied_mrf_writes(fs_inst *inst) const;
 
@@ -338,14 +334,15 @@
 
    /** Register numbers for thread payload fields. */
    struct thread_payload {
-      uint8_t source_depth_reg;
-      uint8_t source_w_reg;
-      uint8_t aa_dest_stencil_reg;
-      uint8_t dest_depth_reg;
-      uint8_t sample_pos_reg;
-      uint8_t sample_mask_in_reg;
-      uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT];
-      uint8_t local_invocation_id_reg;
+      uint8_t subspan_coord_reg[2];
+      uint8_t source_depth_reg[2];
+      uint8_t source_w_reg[2];
+      uint8_t aa_dest_stencil_reg[2];
+      uint8_t dest_depth_reg[2];
+      uint8_t sample_pos_reg[2];
+      uint8_t sample_mask_in_reg[2];
+      uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT][2];
+      uint8_t local_invocation_id_reg[2];
 
       /** The number of thread payload registers the hardware will supply. */
       uint8_t num_regs;
@@ -387,7 +384,6 @@
 public:
    fs_generator(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
-                const void *key,
                 struct brw_stage_prog_data *prog_data,
                 unsigned promoted_constants,
                 bool runtime_check_aads_emit,
@@ -485,7 +481,6 @@
    const struct gen_device_info *devinfo;
 
    struct brw_codegen *p;
-   const void * const key;
    struct brw_stage_prog_data * const prog_data;
 
    unsigned dispatch_width; /**< 8, 16 or 32 */
@@ -499,29 +494,58 @@
    void *mem_ctx;
 };
 
-void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder &bld,
-                                             const fs_reg &dst,
-                                             const fs_reg &src,
-                                             uint32_t components);
+namespace brw {
+   inline fs_reg
+   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
+                     brw_reg_type type = BRW_REGISTER_TYPE_F, unsigned n = 1)
+   {
+      if (!regs[0])
+         return fs_reg();
 
-fs_reg shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
-                                          const fs_reg &src,
-                                          uint32_t components);
+      if (bld.dispatch_width() > 16) {
+         const fs_reg tmp = bld.vgrf(type, n);
+         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
+         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
+         fs_reg *const components = new fs_reg[n * m];
 
-void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder &bld,
-                                             const fs_reg &dst,
-                                             const fs_reg &src,
-                                             uint32_t first_component,
-                                             uint32_t components);
+         for (unsigned c = 0; c < n; c++) {
+            for (unsigned g = 0; g < m; g++) {
+               components[c * m + g] =
+                  offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
+            }
+         }
 
-void shuffle_16bit_data_for_32bit_write(const brw::fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t components);
+         hbld.LOAD_PAYLOAD(tmp, components, n * m, 0);
+
+         delete[] components;
+         return tmp;
+
+      } else {
+         return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
+      }
+   }
+}
+
+void shuffle_from_32bit_read(const brw::fs_builder &bld,
+                             const fs_reg &dst,
+                             const fs_reg &src,
+                             uint32_t first_component,
+                             uint32_t components);
+
+fs_reg shuffle_for_32bit_write(const brw::fs_builder &bld,
+                               const fs_reg &src,
+                               uint32_t first_component,
+                               uint32_t components);
 
 fs_reg setup_imm_df(const brw::fs_builder &bld,
                     double v);
 
+fs_reg setup_imm_b(const brw::fs_builder &bld,
+                   int8_t v);
+
+fs_reg setup_imm_ub(const brw::fs_builder &bld,
+                   uint8_t v);
+
 enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
                                                nir_intrinsic_op op);
 
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
index 4203c8c..0cafaf5 100644
--- a/src/intel/compiler/brw_fs_builder.h
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -235,14 +235,14 @@
       src_reg
       sample_mask_reg() const
       {
-         assert(shader->stage != MESA_SHADER_FRAGMENT ||
-                group() + dispatch_width() <= 16);
          if (shader->stage != MESA_SHADER_FRAGMENT) {
             return brw_imm_d(0xffffffff);
          } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
             return brw_flag_reg(0, 1);
          } else {
-            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
+            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
+            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
+                          BRW_REGISTER_TYPE_UD);
          }
       }
 
@@ -795,7 +795,7 @@
              !gen_device_info_is_9lp(shader->devinfo))
             return false;
 
-         if (type_sz(type > 4))
+         if (type_sz(type) > 4)
             return true;
 
          if (opcode == BRW_OPCODE_MUL &&
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
index 462e51d..5fb522f 100644
--- a/src/intel/compiler/brw_fs_cmod_propagation.cpp
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -49,6 +49,123 @@
  */
 
 static bool
+cmod_propagate_cmp_to_add(const gen_device_info *devinfo, bblock_t *block,
+                          fs_inst *inst)
+{
+   bool read_flag = false;
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (scan_inst->opcode == BRW_OPCODE_ADD &&
+          !scan_inst->is_partial_write() &&
+          scan_inst->exec_size == inst->exec_size) {
+         bool negate;
+
+         /* A CMP is basically a subtraction.  The result of the
+          * subtraction must be the same as the result of the addition.
+          * This means that one of the operands must be negated.  So (a +
+          * b) vs (a == -b) or (a + -b) vs (a == b).
+          */
+         if ((inst->src[0].equals(scan_inst->src[0]) &&
+              inst->src[1].negative_equals(scan_inst->src[1])) ||
+             (inst->src[0].equals(scan_inst->src[1]) &&
+              inst->src[1].negative_equals(scan_inst->src[0]))) {
+            negate = false;
+         } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
+                     inst->src[1].equals(scan_inst->src[1])) ||
+                    (inst->src[0].negative_equals(scan_inst->src[1]) &&
+                     inst->src[1].equals(scan_inst->src[0]))) {
+            negate = true;
+         } else {
+            goto not_match;
+         }
+
+         /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
+          *
+          *    * Note that the [post condition signal] bits generated at
+          *      the output of a compute are before the .sat.
+          *
+          * So we don't have to bail if scan_inst has saturate.
+          */
+         /* Otherwise, try propagating the conditional. */
+         const enum brw_conditional_mod cond =
+            negate ? brw_swap_cmod(inst->conditional_mod)
+            : inst->conditional_mod;
+
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            inst->remove(block);
+            return true;
+         }
+         break;
+      }
+
+   not_match:
+      if (scan_inst->flags_written())
+         break;
+
+      read_flag = read_flag || scan_inst->flags_read(devinfo);
+   }
+
+   return false;
+}
+
+/**
+ * Propagate conditional modifiers from NOT instructions
+ *
+ * Attempt to convert sequences like
+ *
+ *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ *    ...
+ *    not.nz.f0(8)    null            g78<8,8,1>UD
+ *
+ * into
+ *
+ *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
+ */
+static bool
+cmod_propagate_not(const gen_device_info *devinfo, bblock_t *block,
+                   fs_inst *inst)
+{
+   const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
+   bool read_flag = false;
+
+   if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
+      return false;
+
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                          inst->src[0], inst->size_read(0))) {
+         if (scan_inst->opcode != BRW_OPCODE_OR &&
+             scan_inst->opcode != BRW_OPCODE_AND)
+            break;
+
+         if (scan_inst->is_partial_write() ||
+             scan_inst->dst.offset != inst->src[0].offset ||
+             scan_inst->exec_size != inst->exec_size)
+            break;
+
+         if (scan_inst->can_do_cmod() &&
+             ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+              scan_inst->conditional_mod == cond)) {
+            scan_inst->conditional_mod = cond;
+            inst->remove(block);
+            return true;
+         }
+         break;
+      }
+
+      if (scan_inst->flags_written())
+         break;
+
+      read_flag = read_flag || scan_inst->flags_read(devinfo);
+   }
+
+   return false;
+}
+
+static bool
 opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
 {
    bool progress = false;
@@ -59,7 +176,8 @@
 
       if ((inst->opcode != BRW_OPCODE_AND &&
            inst->opcode != BRW_OPCODE_CMP &&
-           inst->opcode != BRW_OPCODE_MOV) ||
+           inst->opcode != BRW_OPCODE_MOV &&
+           inst->opcode != BRW_OPCODE_NOT) ||
           inst->predicate != BRW_PREDICATE_NONE ||
           !inst->dst.is_null() ||
           (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
@@ -90,64 +208,30 @@
           inst->conditional_mod != BRW_CONDITIONAL_NZ)
          continue;
 
+      /* A CMP with a second source of zero can match with anything.  A CMP
+       * with a second source that is not zero can only match with an ADD
+       * instruction.
+       *
+       * Only apply this optimization to floating-point sources.  It can fail
+       * for integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4,
+       * but int(0x80000000) - 4 overflows and results in 0x7ffffffc.  That's
+       * not less than zero, so the flags get set differently than for (a < b).
+       */
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
+         if (brw_reg_type_is_floating_point(inst->src[0].type) &&
+             cmod_propagate_cmp_to_add(devinfo, block, inst))
+            progress = true;
+
+         continue;
+      }
+
+      if (inst->opcode == BRW_OPCODE_NOT) {
+         progress = cmod_propagate_not(devinfo, block, inst) || progress;
+         continue;
+      }
+
       bool read_flag = false;
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
-         /* A CMP with a second source of zero can match with anything.  A CMP
-          * with a second source that is not zero can only match with an ADD
-          * instruction.
-          */
-         if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
-            bool negate;
-
-            if (scan_inst->opcode != BRW_OPCODE_ADD)
-               goto not_match;
-
-            /* A CMP is basically a subtraction.  The result of the
-             * subtraction must be the same as the result of the addition.
-             * This means that one of the operands must be negated.  So (a +
-             * b) vs (a == -b) or (a + -b) vs (a == b).
-             */
-            if ((inst->src[0].equals(scan_inst->src[0]) &&
-                 inst->src[1].negative_equals(scan_inst->src[1])) ||
-                (inst->src[0].equals(scan_inst->src[1]) &&
-                 inst->src[1].negative_equals(scan_inst->src[0]))) {
-               negate = false;
-            } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
-                        inst->src[1].equals(scan_inst->src[1])) ||
-                       (inst->src[0].negative_equals(scan_inst->src[1]) &&
-                        inst->src[1].equals(scan_inst->src[0]))) {
-               negate = true;
-            } else {
-               goto not_match;
-            }
-
-            if (scan_inst->is_partial_write() ||
-                scan_inst->exec_size != inst->exec_size)
-               goto not_match;
-
-            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
-             *
-             *    * Note that the [post condition signal] bits generated at
-             *      the output of a compute are before the .sat.
-             *
-             * So we don't have to bail if scan_inst has saturate.
-             */
-
-            /* Otherwise, try propagating the conditional. */
-            const enum brw_conditional_mod cond =
-               negate ? brw_swap_cmod(inst->conditional_mod)
-                      : inst->conditional_mod;
-
-            if (scan_inst->can_do_cmod() &&
-                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
-                 scan_inst->conditional_mod == cond)) {
-               scan_inst->conditional_mod = cond;
-               inst->remove(block);
-               progress = true;
-            }
-            break;
-         }
-
          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                              inst->src[0], inst->size_read(0))) {
             if (scan_inst->is_partial_write() ||
@@ -242,7 +326,6 @@
             break;
          }
 
-      not_match:
          if (scan_inst->flags_written())
             break;
 
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
index 48220ef..6859733 100644
--- a/src/intel/compiler/brw_fs_cse.cpp
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -75,7 +75,6 @@
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
-   case FS_OPCODE_CINTERP:
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 7f0dc13..e265d59 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -175,14 +175,13 @@
 
 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            void *mem_ctx,
-                           const void *key,
                            struct brw_stage_prog_data *prog_data,
                            unsigned promoted_constants,
                            bool runtime_check_aads_emit,
                            gl_shader_stage stage)
 
    : compiler(compiler), log_data(log_data),
-     devinfo(compiler->devinfo), key(key),
+     devinfo(compiler->devinfo),
      prog_data(prog_data),
      promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
@@ -267,21 +266,35 @@
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
       brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
+      brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
+              offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
       brw_pop_insn_state(p);
    }
 
-   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+   if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
+      assert(inst->group == 0 && inst->exec_size == 16);
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
-   else if (prog_data->dual_src_blend) {
-      if (!inst->group)
+
+   } else if (prog_data->dual_src_blend) {
+      assert(inst->exec_size == 8);
+
+      if (inst->group % 16 == 0)
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
-      else
+      else if (inst->group % 16 == 8)
          msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
-   } else if (inst->exec_size == 16)
-      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
-   else
-      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+      else
+         unreachable("Invalid dual-source FB write instruction group");
+
+   } else {
+      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
+
+      if (inst->exec_size == 16)
+         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+      else if (inst->exec_size == 8)
+         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+      else
+         unreachable("Invalid FB write execution size");
+   }
 
    /* We assume render targets start at 0, because headerless FB write
     * messages set "Render Target Index" to 0.  Using a different binding
@@ -289,20 +302,19 @@
     */
    const uint32_t surf_index = inst->target;
 
-   bool last_render_target = inst->eot ||
-                             (prog_data->dual_src_blend && dispatch_width == 16);
+   brw_inst *insn = brw_fb_WRITE(p,
+                                 payload,
+                                 retype(implied_header, BRW_REGISTER_TYPE_UW),
+                                 msg_control,
+                                 surf_index,
+                                 nr,
+                                 0,
+                                 inst->eot,
+                                 inst->last_rt,
+                                 inst->header_size != 0);
 
-
-   brw_fb_WRITE(p,
-                payload,
-                implied_header,
-                msg_control,
-                surf_index,
-                nr,
-                0,
-                inst->eot,
-                last_render_target,
-                inst->header_size != 0);
+   if (devinfo->gen >= 6)
+      brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
 
    brw_mark_surface_used(&prog_data->base, surf_index);
 }
@@ -310,86 +322,16 @@
 void
 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
 {
-   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
-   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
-   struct brw_reg implied_header;
-
    if (devinfo->gen < 8 && !devinfo->is_haswell) {
       brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
    }
 
+   const struct brw_reg implied_header =
+      devinfo->gen < 6 ? payload : brw_null_reg();
+
    if (inst->base_mrf >= 0)
       payload = brw_message_reg(inst->base_mrf);
 
-   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
-    * move, here's g1.
-    */
-   if (inst->header_size != 0) {
-      brw_push_insn_state(p);
-      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-      brw_set_default_exec_size(p, BRW_EXECUTE_1);
-      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
-      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_set_default_flag_reg(p, 0, 0);
-
-      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
-       * present.
-       */
-      if (prog_data->uses_kill) {
-         struct brw_reg pixel_mask;
-
-         if (devinfo->gen >= 6)
-            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
-         else
-            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
-      }
-
-      if (devinfo->gen >= 6) {
-         brw_push_insn_state(p);
-         brw_set_default_exec_size(p, BRW_EXECUTE_16);
-	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-	 brw_MOV(p,
-		 retype(payload, BRW_REGISTER_TYPE_UD),
-		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
-         brw_pop_insn_state(p);
-
-         if (inst->target > 0 && key->replicate_alpha) {
-            /* Set "Source0 Alpha Present to RenderTarget" bit in message
-             * header.
-             */
-            brw_OR(p,
-		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
-		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
-		   brw_imm_ud(0x1 << 11));
-         }
-
-	 if (inst->target > 0) {
-	    /* Set the render target index for choosing BLEND_STATE. */
-	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
-                              BRW_REGISTER_TYPE_UD),
-		    brw_imm_ud(inst->target));
-	 }
-
-         /* Set computes stencil to render target */
-         if (prog_data->computed_stencil) {
-            brw_OR(p,
-                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
-                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
-                   brw_imm_ud(0x1 << 14));
-         }
-
-	 implied_header = brw_null_reg();
-      } else {
-	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
-      }
-
-      brw_pop_insn_state(p);
-   } else {
-      implied_header = brw_null_reg();
-   }
-
    if (!runtime_check_aads_emit) {
       fire_fb_write(inst, payload, implied_header, inst->mlen);
    } else {
@@ -740,7 +682,7 @@
 }
 
 void
-fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
 {
    brw_barrier(p, src);
    brw_WAIT(p);
@@ -781,6 +723,7 @@
       struct brw_reg dwQ = suboffset(interp, 1);
       struct brw_reg dwR = suboffset(interp, 3);
 
+      brw_push_insn_state(p);
       brw_set_default_exec_size(p, BRW_EXECUTE_8);
 
       if (inst->exec_size == 8) {
@@ -795,16 +738,14 @@
           */
          brw_inst_set_saturate(p->devinfo, i[0], false);
       } else {
-         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+         brw_set_default_group(p, inst->group);
          i[0] = brw_MAD(p,            acc, dwR, offset(delta_x, 0), dwP);
          i[1] = brw_MAD(p, offset(dst, 0), acc, offset(delta_x, 1), dwQ);
 
-         brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+         brw_set_default_group(p, inst->group + 8);
          i[2] = brw_MAD(p,            acc, dwR, offset(delta_y, 0), dwP);
          i[3] = brw_MAD(p, offset(dst, 1), acc, offset(delta_y, 1), dwQ);
 
-         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
-
          brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
          brw_inst_set_cond_modifier(p->devinfo, i[3], inst->conditional_mod);
 
@@ -816,12 +757,65 @@
          brw_inst_set_saturate(p->devinfo, i[2], false);
       }
 
-      return true;
-   } else if (devinfo->has_pln &&
-              (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
-      brw_PLN(p, dst, interp, delta_x);
+      brw_pop_insn_state(p);
 
-      return false;
+      return true;
+   } else if (devinfo->has_pln) {
+      if (devinfo->gen <= 6 && (delta_x.nr & 1) != 0) {
+         /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
+          *
+          *    "[DevSNB]:<src1> must be even register aligned."
+          *
+          * This restriction is lifted on Ivy Bridge.
+          *
+          * This means that we need to split PLN into LINE+MAC on-the-fly.
+          * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
+          * we have to split into SIMD8 pieces.  For gen4 (!has_pln), the
+          * coordinate registers are laid out differently so we leave it as a
+          * SIMD16 instruction.
+          */
+         assert(inst->exec_size == 8 || inst->exec_size == 16);
+         assert(inst->group % 16 == 0);
+
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+         /* Thanks to two accumulators, we can emit all the LINEs and then all
+          * the MACs.  This improves parallelism a bit.
+          */
+         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
+            brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
+                                      offset(delta_x, g * 2));
+            brw_inst_set_group(devinfo, line, inst->group + g * 8);
+
+            /* LINE writes the accumulator automatically on gen4-5.  On Sandy
+             * Bridge and later, we have to explicitly enable it.
+             */
+            if (devinfo->gen >= 6)
+               brw_inst_set_acc_wr_control(p->devinfo, line, true);
+
+            /* brw_set_default_saturate() is called before emitting
+             * instructions, so the saturate bit is set in each instruction,
+             * so we need to unset it on the LINE instructions.
+             */
+            brw_inst_set_saturate(p->devinfo, line, false);
+         }
+
+         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
+            brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
+                                    offset(delta_x, g * 2 + 1));
+            brw_inst_set_group(devinfo, mac, inst->group + g * 8);
+            brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
+         }
+
+         brw_pop_insn_state(p);
+
+         return true;
+      } else {
+         brw_PLN(p, dst, interp, delta_x);
+
+         return false;
+      }
    } else {
       i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
       i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
@@ -893,6 +887,11 @@
    uint32_t return_format;
    bool is_combined_send = inst->eot;
 
+   /* Sampler EOT message of less than the dispatch width would kill the
+    * thread prematurely.
+    */
+   assert(!is_combined_send || inst->exec_size == dispatch_width);
+
    switch (dst.type) {
    case BRW_REGISTER_TYPE_D:
       return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
@@ -1180,17 +1179,16 @@
       brw_pop_insn_state(p);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              msg_type,
-                              inst->size_written / REG_SIZE,
-                              inst->mlen /* mlen */,
-                              inst->header_size != 0 /* header */,
-                              simd_mode,
-                              return_format);
+      brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, src, addr,
+         brw_message_desc(devinfo, inst->mlen, inst->size_written / REG_SIZE,
+                          inst->header_size) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          msg_type,
+                          simd_mode,
+                          return_format));
 
       /* visitor knows more than we do about the surface limit required,
        * so has already done marking.
@@ -1323,7 +1321,7 @@
 }
 
 void
-fs_generator::generate_discard_jump(fs_inst *inst)
+fs_generator::generate_discard_jump(fs_inst *)
 {
    assert(devinfo->gen >= 6);
 
@@ -1423,15 +1421,16 @@
       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
       brw_pop_insn_state(p);
 
+      brw_inst_set_sfid(devinfo, send, GEN6_SFID_DATAPORT_CONSTANT_CACHE);
       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
       brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
-      brw_set_dp_read_message(p, send, surf_index,
-                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
-                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
-                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
-                              1, /* mlen */
-                              true, /* header */
-                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+      brw_set_desc(p, send,
+                   brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
+                                                             REG_SIZE), true) |
+                   brw_dp_read_desc(devinfo, surf_index,
+                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                                    GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                                    BRW_DATAPORT_READ_TARGET_DATA_CACHE));
 
    } else {
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
@@ -1447,17 +1446,16 @@
       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
 
       /* dst = send(payload, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
+      brw_send_indirect_message(
          p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
          retype(dst, BRW_REGISTER_TYPE_UD),
-         retype(payload, BRW_REGISTER_TYPE_UD), addr);
-      brw_set_dp_read_message(p, insn, 0 /* surface */,
-                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
-                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
-                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
-                              1, /* mlen */
-                              true, /* header */
-                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+         retype(payload, BRW_REGISTER_TYPE_UD), addr,
+         brw_message_desc(devinfo, 1,
+                          DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
+         brw_dp_read_desc(devinfo, 0 /* surface */,
+                          BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                          GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                          BRW_DATAPORT_READ_TARGET_DATA_CACHE));
 
       brw_pop_insn_state(p);
    }
@@ -1504,6 +1502,7 @@
 
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_inst_set_compression(devinfo, send, false);
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
    brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
@@ -1513,15 +1512,11 @@
     * stored in it.
     */
    uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
-   brw_set_sampler_message(p, send,
-                           surf_index,
-                           0, /* sampler (unused) */
-                           msg_type,
-                           rlen,
-                           inst->mlen,
-                           inst->header_size != 0,
-                           simd_mode,
-                           return_format);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
+                brw_sampler_desc(devinfo, surf_index,
+                                 0, /* sampler (unused) */
+                                 msg_type, simd_mode, return_format));
 }
 
 void
@@ -1535,17 +1530,15 @@
     * gen7, so the fact that it's a send message is hidden at the IR level.
     */
    assert(inst->header_size == 0);
-   assert(!inst->mlen);
+   assert(inst->mlen);
    assert(index.type == BRW_REGISTER_TYPE_UD);
 
-   uint32_t simd_mode, rlen, mlen;
+   uint32_t simd_mode, rlen;
    if (inst->exec_size == 16) {
-      mlen = 2;
       rlen = 8;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
    } else {
       assert(inst->exec_size == 8);
-      mlen = 1;
       rlen = 4;
       simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
    }
@@ -1555,17 +1548,15 @@
       uint32_t surf_index = index.ud;
 
       brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
       brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
       brw_set_src0(p, send, offset);
-      brw_set_sampler_message(p, send,
-                              surf_index,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              rlen,
-                              mlen,
-                              false, /* no header */
-                              simd_mode,
-                              0);
+      brw_set_desc(p, send,
+                   brw_message_desc(devinfo, inst->mlen, rlen, false) |
+                   brw_sampler_desc(devinfo, surf_index,
+                                    0, /* LD message ignores sampler unit */
+                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                                    simd_mode, 0));
 
    } else {
 
@@ -1584,45 +1575,19 @@
       brw_pop_insn_state(p);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
+      brw_send_indirect_message(
          p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
-         offset, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              rlen /* rlen */,
-                              mlen /* mlen */,
-                              false /* header */,
-                              simd_mode,
-                              0);
+         offset, addr,
+         brw_message_desc(devinfo, inst->mlen, rlen, false) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                          simd_mode,
+                          0));
    }
 }
 
-/**
- * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
- * into the flags register (f0.0).
- *
- * Used only on Gen6 and above.
- */
-void
-fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
-{
-   struct brw_reg flags = brw_flag_subreg(inst->flag_subreg);
-   struct brw_reg dispatch_mask;
-
-   if (devinfo->gen >= 6)
-      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
-   else
-      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-   brw_set_default_exec_size(p, BRW_EXECUTE_1);
-   brw_MOV(p, flags, dispatch_mask);
-   brw_pop_insn_state(p);
-}
-
 void
 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                 struct brw_reg dst,
@@ -1630,16 +1595,18 @@
                                                 struct brw_reg msg_data,
                                                 unsigned msg_type)
 {
-   assert(inst->size_written % REG_SIZE == 0);
+   const bool has_payload = inst->src[0].file != BAD_FILE;
    assert(msg_data.type == BRW_REGISTER_TYPE_UD);
+   assert(inst->size_written % REG_SIZE == 0);
 
    brw_pixel_interpolator_query(p,
          retype(dst, BRW_REGISTER_TYPE_UW),
-         src,
+         /* If we don't have a payload, what we send doesn't matter */
+         has_payload ? src : brw_vec8_grf(0, 0),
          inst->pi_noperspective,
          msg_type,
          msg_data,
-         inst->mlen,
+         has_payload ? 2 * inst->exec_size / 8 : 1,
          inst->size_written / REG_SIZE);
 }
 
@@ -1657,22 +1624,24 @@
    assert(src0.type == BRW_REGISTER_TYPE_D ||
           src0.type == BRW_REGISTER_TYPE_UD);
 
-   struct brw_reg reg = stride(src1, 1, 4, 0);
-   if (devinfo->gen >= 8 || inst->exec_size == 8) {
-      brw_ADD(p, dst, src0, reg);
-   } else if (inst->exec_size == 16) {
-      brw_push_insn_state(p);
-      brw_set_default_exec_size(p, BRW_EXECUTE_8);
-      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
-      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
-      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
-      brw_pop_insn_state(p);
+   const struct brw_reg reg = stride(src1, 1, 4, 0);
+   const unsigned lower_size = MIN2(inst->exec_size,
+                                    devinfo->gen >= 8 ? 16 : 8);
+
+   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
+      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
+                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
+                                             (i * lower_size / (1 << src0.width))) *
+                                            type_sz(src0.type) / REG_SIZE),
+                               suboffset(reg, i * lower_size / 4));
+      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
+      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
+      brw_inst_set_compression(devinfo, insn, lower_size > 8);
    }
 }
 
 void
-fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+fs_generator::generate_pack_half_2x16_split(fs_inst *,
                                             struct brw_reg dst,
                                             struct brw_reg x,
                                             struct brw_reg y)
@@ -1740,7 +1709,7 @@
 }
 
 void
-fs_generator::generate_shader_time_add(fs_inst *inst,
+fs_generator::generate_shader_time_add(fs_inst *,
                                        struct brw_reg payload,
                                        struct brw_reg offset,
                                        struct brw_reg value)
@@ -1864,7 +1833,13 @@
       brw_set_default_access_mode(p, BRW_ALIGN_1);
       brw_set_default_predicate_control(p, inst->predicate);
       brw_set_default_predicate_inverse(p, inst->predicate_inverse);
-      brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
+      /* On gen7 and above, hardware automatically adds the group onto the
+       * flag subregister number.  On Sandy Bridge and older, we have to do it
+       * ourselves.
+       */
+      const unsigned flag_subreg = inst->flag_subreg +
+         (devinfo->gen >= 7 ? 0 : inst->group / 16);
+      brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
       brw_set_default_saturate(p, inst->saturate);
       brw_set_default_mask_control(p, inst->force_writemask_all);
       brw_set_default_acc_write_control(p, inst->writes_accumulator);
@@ -2105,9 +2080,6 @@
                       BRW_MATH_PRECISION_FULL);
 	 }
 	 break;
-      case FS_OPCODE_CINTERP:
-	 brw_MOV(p, dst, src[0]);
-	 break;
       case FS_OPCODE_LINTERP:
 	 multiple_instructions_emitted = generate_linterp(inst, dst, src);
 	 break;
@@ -2209,10 +2181,6 @@
          generate_fb_read(inst, dst, src[0]);
          break;
 
-      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
-         generate_mov_dispatch_to_flags(inst);
-         break;
-
       case FS_OPCODE_DISCARD_JUMP:
          generate_discard_jump(inst);
          break;
@@ -2277,7 +2245,12 @@
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst);
+         brw_memory_fence(p, dst, BRW_OPCODE_SEND);
+         break;
+
+      case SHADER_OPCODE_INTERLOCK:
+         /* The interlock is basically a memory fence issued via sendc */
+         brw_memory_fence(p, dst, BRW_OPCODE_SENDC);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
diff --git a/src/intel/compiler/brw_fs_lower_conversions.cpp b/src/intel/compiler/brw_fs_lower_conversions.cpp
index 663c967..145fb55 100644
--- a/src/intel/compiler/brw_fs_lower_conversions.cpp
+++ b/src/intel/compiler/brw_fs_lower_conversions.cpp
@@ -43,6 +43,24 @@
    }
 }
 
+/* From the SKL PRM Vol 2a, "Move":
+ *
+ *    "A mov with the same source and destination type, no source modifier,
+ *     and no saturation is a raw move. A packed byte destination region (B
+ *     or UB type with HorzStride == 1 and ExecSize > 1) can only be written
+ *     using raw move."
+ */
+static bool
+is_byte_raw_mov (const fs_inst *inst)
+{
+   return type_sz(inst->dst.type) == 1 &&
+          inst->opcode == BRW_OPCODE_MOV &&
+          inst->src[0].type == inst->dst.type &&
+          !inst->saturate &&
+          !inst->src[0].negate &&
+          !inst->src[0].abs;
+}
+
 bool
 fs_visitor::lower_conversions()
 {
@@ -54,7 +72,8 @@
       bool saturate = inst->saturate;
 
       if (supports_type_conversion(inst)) {
-         if (get_exec_type_size(inst) == 8 && type_sz(inst->dst.type) < 8) {
+         if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+             !is_byte_raw_mov(inst)) {
             /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float to
              * Single Precision Float":
              *
@@ -64,6 +83,9 @@
              * So we need to allocate a temporary that's two registers, and then do
              * a strided MOV to get the lower DWord of every Qword that has the
              * result.
+             *
+             * This restriction applies, in general, whenever we convert to
+             * a type with a smaller bit-size.
              */
             fs_reg temp = ibld.vgrf(get_exec_type(inst));
             fs_reg strided_temp = subscript(temp, dst.type, 0);
@@ -76,7 +98,10 @@
              * size_written accordingly.
              */
             inst->size_written = inst->dst.component_size(inst->exec_size);
-            ibld.at(block, inst->next).MOV(dst, strided_temp)->saturate = saturate;
+
+            fs_inst *mov = ibld.at(block, inst->next).MOV(dst, strided_temp);
+            mov->saturate = saturate;
+            mov->predicate = inst->predicate;
 
             progress = true;
          }
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 8eb69b9..6e9a582 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -123,10 +123,11 @@
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       switch (intrin->intrinsic) {
       case nir_intrinsic_load_vertex_id:
-         unreachable("should be lowered by lower_vertex_id().");
+      case nir_intrinsic_load_base_vertex:
+         unreachable("should be lowered by nir_lower_system_values().");
 
       case nir_intrinsic_load_vertex_id_zero_base:
-      case nir_intrinsic_load_base_vertex:
+      case nir_intrinsic_load_is_indexed_draw:
       case nir_intrinsic_load_first_vertex:
       case nir_intrinsic_load_instance_id:
       case nir_intrinsic_load_base_instance:
@@ -195,11 +196,15 @@
              * masks for 2 and 3) in SIMD16.
              */
             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
-            abld.SHR(shifted,
-                     stride(byte_offset(retype(brw_vec1_grf(1, 0),
-                                               BRW_REGISTER_TYPE_UB), 28),
-                            1, 8, 0),
-                     brw_imm_v(0x76543210));
+
+            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
+               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
+               hbld.SHR(offset(shifted, hbld, i),
+                        stride(retype(brw_vec1_grf(1 + i, 7),
+                                      BRW_REGISTER_TYPE_UB),
+                               1, 8, 0),
+                        brw_imm_v(0x76543210));
+            }
 
             /* A set bit in the pixel mask means the channel is enabled, but
              * that is the opposite of gl_HelperInvocation so we need to invert
@@ -298,10 +303,13 @@
       default:
          unreachable("Invalid bit size");
       }
+   case BRW_REGISTER_TYPE_B:
    case BRW_REGISTER_TYPE_W:
    case BRW_REGISTER_TYPE_D:
    case BRW_REGISTER_TYPE_Q:
       switch(bit_size) {
+      case 8:
+         return BRW_REGISTER_TYPE_B;
       case 16:
          return BRW_REGISTER_TYPE_W;
       case 32:
@@ -311,10 +319,13 @@
       default:
          unreachable("Invalid bit size");
       }
+   case BRW_REGISTER_TYPE_UB:
    case BRW_REGISTER_TYPE_UW:
    case BRW_REGISTER_TYPE_UD:
    case BRW_REGISTER_TYPE_UQ:
       switch(bit_size) {
+      case 8:
+         return BRW_REGISTER_TYPE_UB;
       case 16:
          return BRW_REGISTER_TYPE_UW;
       case 32:
@@ -395,6 +406,10 @@
    nir_emit_cf_list(&if_stmt->else_list);
 
    bld.emit(BRW_OPCODE_ENDIF);
+
+   if (devinfo->gen < 7)
+      limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                           "in SIMD32 mode.");
 }
 
 void
@@ -405,6 +420,10 @@
    nir_emit_cf_list(&loop->body);
 
    bld.emit(BRW_OPCODE_WHILE);
+
+   if (devinfo->gen < 7)
+      limit_dispatch_width(16, "Non-uniform control flow unsupported "
+                           "in SIMD32 mode.");
 }
 
 void
@@ -425,6 +444,10 @@
       nir_emit_alu(abld, nir_instr_as_alu(instr));
       break;
 
+   case nir_instr_type_deref:
+      /* Derefs can exist for images but they do nothing */
+      break;
+
    case nir_instr_type_intrinsic:
       switch (stage) {
       case MESA_SHADER_VERTEX:
@@ -764,20 +787,10 @@
        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
        */
 
-   case nir_op_f2f16_undef:
-   case nir_op_i2i16:
-   case nir_op_u2u16: {
-      /* TODO: Fixing aligment rules for conversions from 32-bits to
-       * 16-bit types should be moved to lower_conversions
-       */
-      fs_reg tmp = bld.vgrf(op[0].type, 1);
-      tmp = subscript(tmp, result.type, 0);
-      inst = bld.MOV(tmp, op[0]);
-      inst->saturate = instr->dest.saturate;
-      inst = bld.MOV(result, tmp);
+   case nir_op_f2f16:
+      inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
-   }
 
    case nir_op_f2f64:
    case nir_op_f2i64:
@@ -795,12 +808,12 @@
        *        the same qword.
        *     (...)"
        *
-       * This means that 32-bit to 64-bit conversions need to have the 32-bit
-       * data elements aligned to 64-bit. This restriction does not apply to
-       * BDW and later.
+       * This means that conversions from bit-sizes smaller than 64-bit to
+       * 64-bit need to have the source data elements aligned to 64-bit.
+       * This restriction does not apply to BDW and later.
        */
       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
-          nir_src_bit_size(instr->src[0].src) == 32 &&
+          nir_src_bit_size(instr->src[0].src) < 64 &&
           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
          fs_reg tmp = bld.vgrf(result.type, 1);
          tmp = subscript(tmp, op[0].type, 0);
@@ -813,8 +826,16 @@
    case nir_op_f2f32:
    case nir_op_f2i32:
    case nir_op_f2u32:
+   case nir_op_f2i16:
+   case nir_op_f2u16:
    case nir_op_i2i32:
    case nir_op_u2u32:
+   case nir_op_i2i16:
+   case nir_op_u2u16:
+   case nir_op_i2f16:
+   case nir_op_u2f16:
+   case nir_op_i2i8:
+   case nir_op_u2u8:
       inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
@@ -884,17 +905,24 @@
       break;
    }
 
-   case nir_op_isign:
+   case nir_op_isign: {
       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
        *               -> non-negative val generates 0x00000000.
        *  Predicated OR sets 1 if val is positive.
        */
-      assert(nir_dest_bit_size(instr->dest.dest) < 64);
-      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
-      bld.ASR(result, op[0], brw_imm_d(31));
-      inst = bld.OR(result, result, brw_imm_d(1));
+      uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
+      assert(bit_size == 32 || bit_size == 16);
+
+      fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
+      fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
+      fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
+
+      bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
+      bld.ASR(result, op[0], shift);
+      inst = bld.OR(result, result, one);
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
+   }
 
    case nir_op_frcp:
       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
@@ -1034,9 +1062,11 @@
    case nir_op_feq:
    case nir_op_fne: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
-      }
+
+      const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
+
       brw_conditional_mod cond;
       switch (instr->op) {
       case nir_op_flt:
@@ -1054,9 +1084,19 @@
       default:
          unreachable("bad opcode");
       }
+
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if(bit_size < 32) {
+         /* Converting the result to 32-bit must be done as a signed
+          * conversion so that sign extension produces a 32-bit true value.
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -1068,9 +1108,10 @@
    case nir_op_ieq:
    case nir_op_ine: {
       fs_reg dest = result;
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
-         dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
-      }
+
+      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size != 32)
+         dest = bld.vgrf(op[0].type, 1);
 
       brw_conditional_mod cond;
       switch (instr->op) {
@@ -1092,8 +1133,17 @@
          unreachable("bad opcode");
       }
       bld.CMP(dest, op[0], op[1], cond);
-      if (nir_src_bit_size(instr->src[0].src) > 32) {
+
+      if (bit_size > 32) {
          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      } else if (bit_size < 32) {
+         /* Converting the result to 32-bit must be done as a signed
+          * conversion so that sign extension produces a 32-bit true value.
+          */
+         const brw_reg_type src_type =
+            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
+
+         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
       }
       break;
    }
@@ -1180,8 +1230,9 @@
       break;
 
    case nir_op_i2b:
-   case nir_op_f2b:
-      if (nir_src_bit_size(instr->src[0].src) == 64) {
+   case nir_op_f2b: {
+      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
+      if (bit_size == 64) {
          /* two-argument instructions can't take 64-bit immediates */
          fs_reg zero;
          fs_reg tmp;
@@ -1203,13 +1254,18 @@
          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
       } else {
-         if (instr->op == nir_op_f2b) {
-            bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+         fs_reg zero;
+         if (bit_size == 32) {
+            zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0);
          } else {
-            bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+            assert(bit_size == 16);
+            zero = instr->op == nir_op_f2b ?
+               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
          }
+         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
       }
       break;
+   }
 
    case nir_op_ftrunc:
       inst = bld.RNDZ(result, op[0]);
@@ -1302,6 +1358,7 @@
       break;
 
    case nir_op_pack_64_2x32_split:
+   case nir_op_pack_32_2x16_split:
       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
       break;
 
@@ -1314,6 +1371,15 @@
       break;
    }
 
+   case nir_op_unpack_32_2x16_split_x:
+   case nir_op_unpack_32_2x16_split_y: {
+      if (instr->op == nir_op_unpack_32_2x16_split_x)
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
+      else
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
+      break;
+   }
+
    case nir_op_fpow:
       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
@@ -1521,6 +1587,16 @@
    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
 
    switch (instr->def.bit_size) {
+   case 8:
+      for (unsigned i = 0; i < instr->def.num_components; i++)
+         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value.i8[i]));
+      break;
+
+   case 16:
+      for (unsigned i = 0; i < instr->def.num_components; i++)
+         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
+      break;
+
    case 32:
       for (unsigned i = 0; i < instr->def.num_components; i++)
          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
@@ -1603,7 +1679,10 @@
 {
    if (dest.is_ssa) {
       const brw_reg_type reg_type =
-         brw_reg_type_from_bit_size(dest.ssa.bit_size, BRW_REGISTER_TYPE_F);
+         brw_reg_type_from_bit_size(dest.ssa.bit_size,
+                                    dest.ssa.bit_size == 8 ?
+                                    BRW_REGISTER_TYPE_D :
+                                    BRW_REGISTER_TYPE_F);
       nir_ssa_values[dest.ssa.index] =
          bld.vgrf(reg_type, dest.ssa.num_components);
       return nir_ssa_values[dest.ssa.index];
@@ -1616,51 +1695,56 @@
 }
 
 fs_reg
-fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
+fs_visitor::get_nir_image_deref(nir_deref_instr *deref)
 {
-   fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
-                BRW_REGISTER_TYPE_UD);
-   fs_reg indirect;
-   unsigned indirect_max = 0;
+   fs_reg arr_offset = brw_imm_ud(0);
+   unsigned array_size = BRW_IMAGE_PARAM_SIZE * 4;
+   nir_deref_instr *head = deref;
+   while (head->deref_type != nir_deref_type_var) {
+      assert(head->deref_type == nir_deref_type_array);
 
-   for (const nir_deref *tail = &deref->deref; tail->child;
-        tail = tail->child) {
-      const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
-      assert(tail->child->deref_type == nir_deref_type_array);
-      const unsigned size = glsl_get_length(tail->type);
-      const unsigned element_size = type_size_scalar(deref_array->deref.type);
-      const unsigned base = MIN2(deref_array->base_offset, size - 1);
-      image = offset(image, bld, base * element_size);
+      /* This level's element size is the previous level's array size */
+      const unsigned elem_size = array_size;
 
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         fs_reg tmp = vgrf(glsl_type::uint_type);
-
-         /* Accessing an invalid surface index with the dataport can result
-          * in a hang.  According to the spec "if the index used to
-          * select an individual element is negative or greater than or
-          * equal to the size of the array, the results of the operation
-          * are undefined but may not lead to termination" -- which is one
-          * of the possible outcomes of the hang.  Clamp the index to
-          * prevent access outside of the array bounds.
-          */
-         bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
-                                     BRW_REGISTER_TYPE_UD),
-                         brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
-
-         indirect_max += element_size * (tail->type->length - 1);
-
-         bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
-         if (indirect.file == BAD_FILE) {
-            indirect = tmp;
-         } else {
-            bld.ADD(indirect, indirect, tmp);
-         }
+      fs_reg index = retype(get_nir_src_imm(head->arr.index),
+                            BRW_REGISTER_TYPE_UD);
+      if (arr_offset.file == BRW_IMMEDIATE_VALUE &&
+          index.file == BRW_IMMEDIATE_VALUE) {
+         arr_offset.ud += index.ud * elem_size;
+      } else if (index.file == BRW_IMMEDIATE_VALUE) {
+         bld.ADD(arr_offset, arr_offset, brw_imm_ud(index.ud * elem_size));
+      } else {
+         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
+         bld.MUL(tmp, index, brw_imm_ud(elem_size));
+         bld.ADD(tmp, tmp, arr_offset);
+         arr_offset = tmp;
       }
+
+      head = nir_deref_instr_parent(head);
+      assert(glsl_type_is_array(head->type));
+      array_size = elem_size * glsl_get_length(head->type);
    }
 
-   if (indirect.file == BAD_FILE) {
-      return image;
+   assert(head->deref_type == nir_deref_type_var);
+   const unsigned max_arr_offset = array_size - (BRW_IMAGE_PARAM_SIZE * 4);
+   fs_reg image(UNIFORM, head->var->data.driver_location / 4,
+                BRW_REGISTER_TYPE_UD);
+
+   if (arr_offset.file == BRW_IMMEDIATE_VALUE) {
+      /* The offset is in bytes but we want it in dwords */
+      return offset(image, bld, MIN2(arr_offset.ud, max_arr_offset) / 4);
    } else {
+      /* Accessing an invalid surface index with the dataport can result
+       * in a hang.  According to the spec "if the index used to
+       * select an individual element is negative or greater than or
+       * equal to the size of the array, the results of the operation
+       * are undefined but may not lead to termination" -- which is one
+       * of the possible outcomes of the hang.  Clamp the index to
+       * prevent access outside of the array bounds.
+       */
+      bld.emit_minmax(arr_offset, arr_offset, brw_imm_ud(max_arr_offset),
+                      BRW_CONDITIONAL_L);
+
       /* Emit a pile of MOVs to load the uniform into a temporary.  The
        * dead-code elimination pass will get rid of what we don't use.
        */
@@ -1668,7 +1752,7 @@
       for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
          bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                   offset(tmp, bld, j), offset(image, bld, j),
-                  indirect, brw_imm_ud((indirect_max + 1) * 4));
+                  arr_offset, brw_imm_ud(max_arr_offset + 4));
       }
       return tmp;
    }
@@ -1718,23 +1802,23 @@
 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
 {
    switch (op) {
-   case nir_intrinsic_image_var_atomic_add:
+   case nir_intrinsic_image_deref_atomic_add:
       return BRW_AOP_ADD;
-   case nir_intrinsic_image_var_atomic_min:
+   case nir_intrinsic_image_deref_atomic_min:
       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
               BRW_AOP_IMIN : BRW_AOP_UMIN);
-   case nir_intrinsic_image_var_atomic_max:
+   case nir_intrinsic_image_deref_atomic_max:
       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
               BRW_AOP_IMAX : BRW_AOP_UMAX);
-   case nir_intrinsic_image_var_atomic_and:
+   case nir_intrinsic_image_deref_atomic_and:
       return BRW_AOP_AND;
-   case nir_intrinsic_image_var_atomic_or:
+   case nir_intrinsic_image_deref_atomic_or:
       return BRW_AOP_OR;
-   case nir_intrinsic_image_var_atomic_xor:
+   case nir_intrinsic_image_deref_atomic_xor:
       return BRW_AOP_XOR;
-   case nir_intrinsic_image_var_atomic_exchange:
+   case nir_intrinsic_image_deref_atomic_exchange:
       return BRW_AOP_MOV;
-   case nir_intrinsic_image_var_atomic_comp_swap:
+   case nir_intrinsic_image_deref_atomic_comp_swap:
       return BRW_AOP_CMPWR;
    default:
       unreachable("Not reachable.");
@@ -1751,21 +1835,8 @@
 {
    struct brw_wm_prog_data *wm_prog_data =
       brw_wm_prog_data(bld.shader->stage_prog_data);
-   fs_inst *inst;
-   fs_reg payload;
-   int mlen;
 
-   if (src.file == BAD_FILE) {
-      /* Dummy payload */
-      payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
-      mlen = 1;
-   } else {
-      payload = src;
-      mlen = 2 * bld.dispatch_width() / 8;
-   }
-
-   inst = bld.emit(opcode, dst, payload, desc);
-   inst->mlen = mlen;
+   fs_inst *inst = bld.emit(opcode, dst, src, desc);
    /* 2 floats per slot returned */
    inst->size_written = 2 * dst.component_size(inst->exec_size);
    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
@@ -2278,11 +2349,11 @@
       }
 
       if (type_sz(dst.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
-
-         for (unsigned c = 0; c < num_components; c++)
-            bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+         shuffle_from_32bit_read(bld,
+                                 offset(dst, bld, iter * 2),
+                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
+                                 0,
+                                 num_components);
       }
 
       if (num_iterations > 1) {
@@ -2345,10 +2416,8 @@
                               1 /* dims */,
                               num_components_32bit,
                               BRW_PREDICATE_NONE);
-         shuffle_32bit_load_result_to_16bit_data(bld,
-               retype(dest, BRW_REGISTER_TYPE_W),
-               retype(read_result, BRW_REGISTER_TYPE_D),
-               first_component, num_components);
+         shuffle_from_32bit_read(bld, dest, read_result, first_component,
+                                 num_components);
       } else {
          fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
          for (unsigned i = 0; i < num_components; i++) {
@@ -2409,16 +2478,8 @@
                                                 BRW_PREDICATE_NONE);
 
          /* Shuffle the 32-bit load result into valid 64-bit data */
-         const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
-         shuffle_32bit_load_result_to_64bit_data(
-            bld, packed_result, read_result, iter_components);
-
-         /* Move each component to its destination */
-         read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
-         for (int c = 0; c < iter_components; c++) {
-            bld.MOV(offset(dest, bld, it * 2 + c),
-                    offset(packed_result, bld, c));
-         }
+         shuffle_from_32bit_read(bld, offset(dest, bld, it * 2),
+                                 read_result, 0, iter_components);
 
          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
       }
@@ -2439,20 +2500,8 @@
 
    switch (instr->intrinsic) {
    case nir_intrinsic_load_vertex_id:
-      unreachable("should be lowered by lower_vertex_id()");
-
-   case nir_intrinsic_load_vertex_id_zero_base:
    case nir_intrinsic_load_base_vertex:
-   case nir_intrinsic_load_instance_id:
-   case nir_intrinsic_load_base_instance:
-   case nir_intrinsic_load_draw_id: {
-      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
-      fs_reg val = nir_system_values[sv];
-      assert(val.file != BAD_FILE);
-      dest.type = val.type;
-      bld.MOV(dest, val);
-      break;
-   }
+      unreachable("should be lowered by nir_lower_system_values()");
 
    case nir_intrinsic_load_input: {
       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
@@ -2466,20 +2515,20 @@
       if (type_sz(dest.type) == 8)
          first_component /= 2;
 
-      for (unsigned j = 0; j < num_components; j++) {
-         bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
-      }
-
-      if (type_sz(dest.type) == 8) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, BRW_REGISTER_TYPE_F),
-                                                 instr->num_components);
-      }
+      /* For 16-bit support maybe a temporary will be needed to copy from
+       * the ATTR file.
+       */
+      shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
+                              first_component, num_components);
       break;
    }
 
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id:
    case nir_intrinsic_load_first_vertex:
+   case nir_intrinsic_load_is_indexed_draw:
       unreachable("lowered by brw_nir_lower_vs_inputs");
 
    default:
@@ -2652,13 +2701,10 @@
           * or SSBOs.
           */
          if (type_sz(dst.type) == 8) {
-            shuffle_32bit_load_result_to_64bit_data(
-               bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
-
-            for (unsigned c = 0; c < num_components; c++) {
-               bld.MOV(offset(orig_dst, bld, iter * 2 + c),
-                       offset(dst, bld, c));
-            }
+            shuffle_from_32bit_read(bld,
+                                    offset(orig_dst, bld, iter * 2),
+                                    retype(dst, BRW_REGISTER_TYPE_D),
+                                    0, num_components);
          }
 
          /* Copy the temporary to the destination to deal with writemasking.
@@ -2832,8 +2878,7 @@
                 * for that.
                 */
                unsigned channel = iter * 2 + i;
-               fs_reg dest = shuffle_64bit_data_for_32bit_write(bld,
-                  offset(value, bld, channel), 1);
+               fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
 
                srcs[header_regs + (i + first_component) * 2] = dest;
                srcs[header_regs + (i + first_component) * 2 + 1] =
@@ -3001,13 +3046,10 @@
              * or SSBOs.
              */
             if (type_sz(dest.type) == 8) {
-               shuffle_32bit_load_result_to_64bit_data(
-                  bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
-
-               for (unsigned c = 0; c < num_components; c++) {
-                  bld.MOV(offset(orig_dest, bld, iter * 2 + c),
-                          offset(dest, bld, c));
-               }
+               shuffle_from_32bit_read(bld,
+                                       offset(orig_dest, bld, iter * 2),
+                                       retype(dest, BRW_REGISTER_TYPE_D),
+                                       0, num_components);
             }
 
             /* If we are loading double data and we need a second read message
@@ -3338,21 +3380,24 @@
       if (devinfo->gen >= 6) {
          emit_discard_jump();
       }
+
+      limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
       break;
    }
 
    case nir_intrinsic_load_input: {
       /* load_input is only used for flat inputs */
       unsigned base = nir_intrinsic_base(instr);
-      unsigned component = nir_intrinsic_component(instr);
+      unsigned comp = nir_intrinsic_component(instr);
       unsigned num_components = instr->num_components;
+      fs_reg orig_dest = dest;
       enum brw_reg_type type = dest.type;
 
       /* Special case fields in the VUE header */
       if (base == VARYING_SLOT_LAYER)
-         component = 1;
+         comp = 1;
       else if (base == VARYING_SLOT_VIEWPORT)
-         component = 2;
+         comp = 2;
 
       if (nir_dest_bit_size(instr->dest) == 64) {
          /* const_index is in 32-bit type size units that could not be aligned
@@ -3361,20 +3406,17 @@
           */
          type = BRW_REGISTER_TYPE_F;
          num_components *= 2;
+         dest = bld.vgrf(type, num_components);
       }
 
       for (unsigned int i = 0; i < num_components; i++) {
-         struct brw_reg interp = interp_reg(base, component + i);
-         interp = suboffset(interp, 3);
-         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
-                  retype(fs_reg(interp), type));
+         bld.MOV(offset(retype(dest, type), bld, i),
+                 retype(component(interp_reg(base, comp + i), 3), type));
       }
 
       if (nir_dest_bit_size(instr->dest) == 64) {
-         shuffle_32bit_load_result_to_64bit_data(bld,
-                                                 dest,
-                                                 retype(dest, type),
-                                                 instr->num_components);
+         shuffle_from_32bit_read(bld, orig_dest, dest, 0,
+                                 instr->num_components);
       }
       break;
    }
@@ -3441,7 +3483,7 @@
                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
                                             dest,
                                             fs_reg(), /* src */
-                                            msg_data,
+                                            component(msg_data, 0),
                                             interpolation);
             set_predicate(BRW_PREDICATE_NORMAL, inst);
 
@@ -3540,8 +3582,8 @@
 
       for (unsigned int i = 0; i < instr->num_components; i++) {
          fs_reg interp =
-            fs_reg(interp_reg(nir_intrinsic_base(instr),
-                              nir_intrinsic_component(instr) + i));
+            component(interp_reg(nir_intrinsic_base(instr),
+                                 nir_intrinsic_component(instr) + i), 0);
          interp.type = BRW_REGISTER_TYPE_F;
          dest.type = BRW_REGISTER_TYPE_F;
 
@@ -3692,8 +3734,8 @@
       unsigned type_size = 4;
       if (nir_src_bit_size(instr->src[0]) == 64) {
          type_size = 8;
-         val_reg = shuffle_64bit_data_for_32bit_write(bld,
-            val_reg, instr->num_components);
+         val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
+                                           instr->num_components);
       }
 
       unsigned type_slots = type_size / 4;
@@ -3819,24 +3861,25 @@
       dest = get_nir_dest(instr->dest);
 
    switch (instr->intrinsic) {
-   case nir_intrinsic_image_var_load:
-   case nir_intrinsic_image_var_store:
-   case nir_intrinsic_image_var_atomic_add:
-   case nir_intrinsic_image_var_atomic_min:
-   case nir_intrinsic_image_var_atomic_max:
-   case nir_intrinsic_image_var_atomic_and:
-   case nir_intrinsic_image_var_atomic_or:
-   case nir_intrinsic_image_var_atomic_xor:
-   case nir_intrinsic_image_var_atomic_exchange:
-   case nir_intrinsic_image_var_atomic_comp_swap: {
+   case nir_intrinsic_image_deref_load:
+   case nir_intrinsic_image_deref_store:
+   case nir_intrinsic_image_deref_atomic_add:
+   case nir_intrinsic_image_deref_atomic_min:
+   case nir_intrinsic_image_deref_atomic_max:
+   case nir_intrinsic_image_deref_atomic_and:
+   case nir_intrinsic_image_deref_atomic_or:
+   case nir_intrinsic_image_deref_atomic_xor:
+   case nir_intrinsic_image_deref_atomic_exchange:
+   case nir_intrinsic_image_deref_atomic_comp_swap: {
       using namespace image_access;
 
       if (stage == MESA_SHADER_FRAGMENT &&
-          instr->intrinsic != nir_intrinsic_image_var_load)
+          instr->intrinsic != nir_intrinsic_image_deref_load)
          brw_wm_prog_data(prog_data)->has_side_effects = true;
 
       /* Get the referenced image variable and type. */
-      const nir_variable *var = instr->variables[0]->var;
+      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+      const nir_variable *var = nir_deref_instr_get_variable(deref);
       const glsl_type *type = var->type->without_array();
       const brw_reg_type base_type = get_image_base_type(type);
 
@@ -3848,22 +3891,22 @@
       const unsigned dest_components = nir_intrinsic_dest_components(instr);
 
       /* Get the arguments of the image intrinsic. */
-      const fs_reg image = get_nir_image_deref(instr->variables[0]);
-      const fs_reg addr = retype(get_nir_src(instr->src[0]),
+      const fs_reg image = get_nir_image_deref(deref);
+      const fs_reg addr = retype(get_nir_src(instr->src[1]),
                                  BRW_REGISTER_TYPE_UD);
-      const fs_reg src0 = (info->num_srcs >= 3 ?
-                           retype(get_nir_src(instr->src[2]), base_type) :
-                           fs_reg());
-      const fs_reg src1 = (info->num_srcs >= 4 ?
+      const fs_reg src0 = (info->num_srcs >= 4 ?
                            retype(get_nir_src(instr->src[3]), base_type) :
                            fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 5 ?
+                           retype(get_nir_src(instr->src[4]), base_type) :
+                           fs_reg());
       fs_reg tmp;
 
       /* Emit an image load, store or atomic op. */
-      if (instr->intrinsic == nir_intrinsic_image_var_load)
+      if (instr->intrinsic == nir_intrinsic_image_deref_load)
          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
 
-      else if (instr->intrinsic == nir_intrinsic_image_var_store)
+      else if (instr->intrinsic == nir_intrinsic_image_deref_store)
          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
                           var->data.image.write_only ? GL_NONE : format);
 
@@ -3880,6 +3923,8 @@
       break;
    }
 
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier_shared:
    case nir_intrinsic_memory_barrier_atomic_counter:
    case nir_intrinsic_memory_barrier_buffer:
    case nir_intrinsic_memory_barrier_image:
@@ -3891,29 +3936,6 @@
       break;
    }
 
-   case nir_intrinsic_group_memory_barrier:
-   case nir_intrinsic_memory_barrier_shared:
-      /* We treat these workgroup-level barriers as no-ops.  This should be
-       * safe at present and as long as:
-       *
-       *  - Memory access instructions are not subsequently reordered by the
-       *    compiler back-end.
-       *
-       *  - All threads from a given compute shader workgroup fit within a
-       *    single subslice and therefore talk to the same HDC shared unit
-       *    what supposedly guarantees ordering and coherency between threads
-       *    from the same workgroup.  This may change in the future when we
-       *    start splitting workgroups across multiple subslices.
-       *
-       *  - The context is not in fault-and-stream mode, which could cause
-       *    memory transactions (including to SLM) prior to the barrier to be
-       *    replayed after the barrier if a pagefault occurs.  This shouldn't
-       *    be a problem up to and including SKL because fault-and-stream is
-       *    not usable due to hardware issues, but that's likely to change in
-       *    the future.
-       */
-      break;
-
    case nir_intrinsic_shader_clock: {
       /* We cannot do anything if there is an event, so ignore it for now */
       const fs_reg shader_clock = get_timestamp(bld);
@@ -3923,13 +3945,14 @@
       break;
    }
 
-   case nir_intrinsic_image_var_size: {
+   case nir_intrinsic_image_deref_size: {
       /* Get the referenced image variable and type. */
-      const nir_variable *var = instr->variables[0]->var;
+      nir_deref_instr *deref = nir_src_as_deref(instr->src[0]);
+      const nir_variable *var = nir_deref_instr_get_variable(deref);
       const glsl_type *type = var->type->without_array();
 
       /* Get the size of the image. */
-      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg image = get_nir_image_deref(deref);
       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
 
       /* For 1DArray image types, the array index is stored in the Z component.
@@ -3967,7 +3990,7 @@
       break;
    }
 
-   case nir_intrinsic_image_var_samples:
+   case nir_intrinsic_image_deref_samples:
       /* The driver does not support multi-sampled images. */
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
       break;
@@ -4234,10 +4257,9 @@
              * iteration handle the rest.
              */
             num_components = MIN2(2, num_components);
-            write_src = shuffle_64bit_data_for_32bit_write(bld, write_src,
-                                                           num_components);
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
          } else if (type_size < 4) {
-            assert(type_size == 2);
             /* For 16-bit types we pack two consecutive values into a 32-bit
              * word and use an untyped write message. For single values or not
              * 32-bit-aligned we need to use byte-scattered writes because
@@ -4261,23 +4283,23 @@
                 * being aligned to 32-bit.
                 */
                num_components = 1;
-            } else if (num_components > 2 && (num_components % 2)) {
-               /* If there is an odd number of consecutive components we left
-                * the not paired component for a following emit of length == 1
-                * with byte_scattered_write.
+            } else if (num_components * type_size > 4 &&
+                       (num_components * type_size % 4)) {
+               /* If the size of the pending components is not a multiple of
+                * 4 bytes, we leave the unaligned components for following
+                * emits of length == 1 with byte_scattered_write.
                 */
-               num_components --;
+               num_components -= (num_components * type_size % 4) / type_size;
+            } else if (num_components * type_size < 4) {
+               num_components = 1;
             }
             /* For num_components == 1 we are also shuffling the component
              * because byte scattered writes of 16-bit need values to be dword
              * aligned. Shuffling only one component would be the same as
              * striding it.
              */
-            fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D,
-                                  DIV_ROUND_UP(num_components, 2));
-            shuffle_16bit_data_for_32bit_write(bld, tmp, write_src,
-                                               num_components);
-            write_src = tmp;
+            write_src = shuffle_for_32bit_write(bld, write_src, 0,
+                                                num_components);
          }
 
          fs_reg offset_reg;
@@ -4293,7 +4315,6 @@
          }
 
          if (type_size < 4 && num_components == 1) {
-            assert(type_size == 2);
             /* Untyped Surface messages have a fixed 32-bit size, so we need
              * to rely on byte scattered in order to write 16-bit elements.
              * The byte_scattered_write message needs that every written 16-bit
@@ -4334,7 +4355,7 @@
       unsigned num_components = instr->num_components;
       unsigned first_component = nir_intrinsic_component(instr);
       if (nir_src_bit_size(instr->src[0]) == 64) {
-         src = shuffle_64bit_data_for_32bit_write(bld, src, num_components);
+         src = shuffle_for_32bit_write(bld, src, 0, num_components);
          num_components *= 2;
       }
 
@@ -4797,6 +4818,21 @@
       break;
    }
 
+   case nir_intrinsic_begin_invocation_interlock: {
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+
+      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
+         REG_SIZE;
+
+      break;
+   }
+
+   case nir_intrinsic_end_invocation_interlock: {
+      /* We don't need to do anything here */
+      break;
+   }
+
    default:
       unreachable("unknown intrinsic");
    }
@@ -5174,153 +5210,150 @@
    }
 }
 
-/**
- * This helper takes the result of a load operation that reads 32-bit elements
- * in this format:
+/*
+ * This helper takes a source register and un/shuffles it into the destination
+ * register.
  *
- * x x x x x x x x
- * y y y y y y y y
- * z z z z z z z z
- * w w w w w w w w
+ * If the source type size is smaller than the destination type size, the
+ * operation needed is a component shuffle. The opposite case is an
+ * unshuffle. If the source and destination type sizes are equal, the
+ * operation is equivalent to a simple MOV per component.
  *
- * and shuffles the data to get this:
+ * For example, if source is a 16-bit type and destination is 32-bit, a
+ * 3-component .xyz 16-bit vector on SIMD8 would be:
  *
- * x y x y x y x y
- * x y x y x y x y
- * z w z w z w z w
- * z w z w z w z w
+ *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
+ *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
  *
- * Which is exactly what we want if the load is reading 64-bit components
- * like doubles, where x represents the low 32-bit of the x double component
- * and y represents the high 32-bit of the x double component (likewise with
- * z and w for double component y). The parameter @components represents
- * the number of 64-bit components present in @src. This would typically be
- * 2 at most, since we can only fit 2 double elements in the result of a
- * vec4 load.
+ * This helper will return the following 2 32-bit components with the 16-bit
+ * values shuffled:
  *
- * Notice that @dst and @src can be the same register.
+ *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
+ *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
+ *
+ * For unshuffle, the example would be the opposite, a 64-bit type source
+ * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
+ * would be:
+ *
+ *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
+ *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
+ *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
+ *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
+ *
+ * The returned result would be the following 4 32-bit components unshuffled:
+ *
+ *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
+ *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
+ *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
+ *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
+ *
+ * - The source and destination registers must not overlap.
+ * - components and first_component are measured in units of the smaller
+ *   type between source and destination, because we are un/shuffling the
+ *   smaller components from/into the bigger ones.
+ * - first_component allows skipping leading source components.
  */
 void
-shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t components)
+shuffle_src_to_dst(const fs_builder &bld,
+                   const fs_reg &dst,
+                   const fs_reg &src,
+                   uint32_t first_component,
+                   uint32_t components)
 {
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 8);
+   if (type_sz(src.type) == type_sz(dst.type)) {
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
+      for (unsigned i = 0; i < components; i++) {
+         bld.MOV(retype(offset(dst, bld, i), src.type),
+                 offset(src, bld, i + first_component));
+      }
+   } else if (type_sz(src.type) < type_sz(dst.type)) {
+      /* Source is shuffled into destination */
+      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components, size_ratio),
+         offset(src, bld, first_component),
+         type_sz(src.type) * bld.dispatch_width() * components));
 
-   /* A temporary that we will use to shuffle the 32-bit data of each
-    * component in the vector into valid 64-bit data. We can't write directly
-    * to dst because dst can be (and would usually be) the same as src
-    * and in that case the first MOV in the loop below would overwrite the
-    * data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(src.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(dst, bld, i / size_ratio),
+                      shuffle_type, i % size_ratio);
+         bld.MOV(shuffle_component_i,
+                 retype(offset(src, bld, i + first_component), shuffle_type));
+      }
+   } else {
+      /* Source is unshuffled into destination */
+      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
+      assert(!regions_overlap(dst,
+         type_sz(dst.type) * bld.dispatch_width() * components,
+         offset(src, bld, first_component / size_ratio),
+         type_sz(src.type) * bld.dispatch_width() *
+         DIV_ROUND_UP(components + (first_component % size_ratio),
+                      size_ratio)));
 
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, 2 * i);
-
-      bld.MOV(subscript(tmp, src.type, 0), component_i);
-      bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
-
-      bld.MOV(offset(dst, bld, i), tmp);
-   }
-}
-
-void
-shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
-                                        const fs_reg &dst,
-                                        const fs_reg &src,
-                                        uint32_t first_component,
-                                        uint32_t components)
-{
-   assert(type_sz(src.type) == 4);
-   assert(type_sz(dst.type) == 2);
-
-   /* A temporary is used to un-shuffle the 32-bit data of each component in
-    * into a valid 16-bit vector. We can't write directly to dst because it
-    * can be the same register as src and in that case the first MOV in the
-    * loop below would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i =
-         subscript(offset(src, bld, (first_component + i) / 2), dst.type,
-                   (first_component + i) % 2);
-
-      bld.MOV(offset(tmp, bld, i % 2), component_i);
-
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
-         bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
+      brw_reg_type shuffle_type =
+         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
+                                    BRW_REGISTER_TYPE_D);
+      for (unsigned i = 0; i < components; i++) {
+         fs_reg shuffle_component_i =
+            subscript(offset(src, bld, (first_component + i) / size_ratio),
+                      shuffle_type, (first_component + i) % size_ratio);
+         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
+                 shuffle_component_i);
       }
    }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components - 1), tmp);
-   }
 }
 
-/**
- * This helper does the inverse operation of
- * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
- *
- * We need to do this when we are going to use untyped write messsages that
- * operate with 32-bit components in order to arrange our 64-bit data to be
- * in the expected layout.
- *
- * Notice that callers of this function, unlike in the case of the inverse
- * operation, would typically need to call this with dst and src being
- * different registers, since they would otherwise corrupt the original
- * 64-bit data they are about to write. Because of this the function checks
- * that the src and dst regions involved in the operation do not overlap.
- */
-fs_reg
-shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &src,
-                                   uint32_t components)
+void
+shuffle_from_32bit_read(const fs_builder &bld,
+                        const fs_reg &dst,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
 {
-   assert(type_sz(src.type) == 8);
+   assert(type_sz(src.type) == 4);
 
-   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D, 2 * components);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, i);
-      bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
-      bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
+   /* This function takes components in units of the destination type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(dst.type) > 4) {
+      assert(type_sz(dst.type) == 8);
+      first_component *= 2;
+      components *= 2;
    }
 
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+}
+
+fs_reg
+shuffle_for_32bit_write(const fs_builder &bld,
+                        const fs_reg &src,
+                        uint32_t first_component,
+                        uint32_t components)
+{
+   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
+                         DIV_ROUND_UP (components * type_sz(src.type), 4));
+   /* This function takes components in units of the source type while
+    * shuffle_src_to_dst takes components in units of the smallest type
+    */
+   if (type_sz(src.type) > 4) {
+      assert(type_sz(src.type) == 8);
+      first_component *= 2;
+      components *= 2;
+   }
+
+   shuffle_src_to_dst(bld, dst, src, first_component, components);
+
    return dst;
 }
 
-void
-shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
-                                   const fs_reg &dst,
-                                   const fs_reg &src,
-                                   uint32_t components)
-{
-   assert(type_sz(src.type) == 2);
-   assert(type_sz(dst.type) == 4);
-
-   /* A temporary is used to shuffle the 16-bit data of each component in the
-    * 32-bit data vector. We can't write directly to dst because it can be the
-    * same register as src and in that case the first MOV in the loop below
-    * would overwrite the data read in the second MOV.
-    */
-   fs_reg tmp = bld.vgrf(dst.type);
-
-   for (unsigned i = 0; i < components; i++) {
-      const fs_reg component_i = offset(src, bld, i);
-      bld.MOV(subscript(tmp, src.type, i % 2), component_i);
-      if (i % 2) {
-         bld.MOV(offset(dst, bld, i / 2), tmp);
-      }
-   }
-   if (components % 2) {
-      bld.MOV(offset(dst, bld, components / 2), tmp);
-   }
-}
-
 fs_reg
 setup_imm_df(const fs_builder &bld, double v)
 {
@@ -5368,3 +5401,19 @@
 
    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
 }
+
+fs_reg
+setup_imm_b(const fs_builder &bld, int8_t v)
+{
+   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
+   bld.MOV(tmp, brw_imm_w(v));
+   return tmp;
+}
+
+fs_reg
+setup_imm_ub(const fs_builder &bld, uint8_t v)
+{
+   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
+   bld.MOV(tmp, brw_imm_uw(v));
+   return tmp;
+}
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
index 7a5f645..cd2abbb 100644
--- a/src/intel/compiler/brw_fs_visitor.cpp
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -103,6 +103,7 @@
    fs_inst *write;
    write = bld.emit(FS_OPCODE_FB_WRITE);
    write->eot = true;
+   write->last_rt = true;
    if (devinfo->gen >= 6) {
       write->base_mrf = 2;
       write->mlen = 4 * reg_width;
@@ -125,7 +126,8 @@
    stage_prog_data->nr_pull_params = 0;
    stage_prog_data->curb_read_length = 0;
    stage_prog_data->dispatch_grf_start_reg = 2;
-   wm_prog_data->dispatch_grf_start_reg_2 = 2;
+   wm_prog_data->dispatch_grf_start_reg_16 = 2;
+   wm_prog_data->dispatch_grf_start_reg_32 = 2;
    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
 
    calculate_cfg();
@@ -135,17 +137,15 @@
  * data.  It will get adjusted to be a real location before
  * generate_code() time.
  */
-struct brw_reg
+fs_reg
 fs_visitor::interp_reg(int location, int channel)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
-   int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
-   int stride = (channel & 1) * 4;
-
+   int regnr = prog_data->urb_setup[location] * 4 + channel;
    assert(prog_data->urb_setup[location] != -1);
 
-   return brw_vec1_grf(regnr, stride);
+   return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
 }
 
 /** Emits the interpolation for the varying inputs. */
@@ -192,7 +192,7 @@
     */
    this->wpos_w = vgrf(glsl_type::float_type);
    abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
-             interp_reg(VARYING_SLOT_POS, 3));
+             component(interp_reg(VARYING_SLOT_POS, 3), 0));
    /* Compute the pixel 1/W value from wpos.w. */
    this->pixel_w = vgrf(glsl_type::float_type);
    abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
@@ -202,101 +202,106 @@
 void
 fs_visitor::emit_interpolation_setup_gen6()
 {
-   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
-
    fs_builder abld = bld.annotate("compute pixel centers");
-   if (devinfo->gen >= 8 || dispatch_width == 8) {
-      /* The "Register Region Restrictions" page says for BDW (and newer,
-       * presumably):
-       *
-       *     "When destination spans two registers, the source may be one or
-       *      two registers. The destination elements must be evenly split
-       *      between the two registers."
-       *
-       * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
-       * compute our pixel centers.
-       */
-      fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8),
-                          BRW_REGISTER_TYPE_UW);
 
-      const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
-      dbld.ADD(int_pixel_xy,
-               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
-               fs_reg(brw_imm_v(0x11001010)));
+   this->pixel_x = vgrf(glsl_type::float_type);
+   this->pixel_y = vgrf(glsl_type::float_type);
 
-      this->pixel_x = vgrf(glsl_type::float_type);
-      this->pixel_y = vgrf(glsl_type::float_type);
-      abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
-      abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
-   } else {
-      /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
-       *
-       *     "When destination spans two registers, the source MUST span two
-       *      registers."
-       *
-       * Since the GRF source of the ADD will only read a single register, we
-       * must do two separate ADDs in SIMD16.
-       */
-      fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
-      fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
-      int_pixel_x.type = BRW_REGISTER_TYPE_UW;
-      int_pixel_y.type = BRW_REGISTER_TYPE_UW;
-      abld.ADD(int_pixel_x,
-               fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x10101010)));
-      abld.ADD(int_pixel_y,
-               fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x11001100)));
+   for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
+      const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
+      struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW);
 
-      /* As of gen6, we can no longer mix float and int sources.  We have
-       * to turn the integer pixel centers into floats for their actual
-       * use.
-       */
-      this->pixel_x = vgrf(glsl_type::float_type);
-      this->pixel_y = vgrf(glsl_type::float_type);
-      abld.MOV(this->pixel_x, int_pixel_x);
-      abld.MOV(this->pixel_y, int_pixel_y);
+      if (devinfo->gen >= 8 || dispatch_width == 8) {
+         /* The "Register Region Restrictions" page says for BDW (and newer,
+          * presumably):
+          *
+          *     "When destination spans two registers, the source may be one or
+          *      two registers. The destination elements must be evenly split
+          *      between the two registers."
+          *
+          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
+          * to compute our pixel centers.
+          */
+         const fs_builder dbld =
+            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
+         fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);
+
+         dbld.ADD(int_pixel_xy,
+                  fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
+                  fs_reg(brw_imm_v(0x11001010)));
+
+         hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy);
+         hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy);
+      } else {
+         /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
+          *
+          *     "When destination spans two registers, the source MUST span
+          *      two registers."
+          *
+          * Since the GRF source of the ADD will only read a single register,
+          * we must do two separate ADDs in SIMD16.
+          */
+         const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW);
+         const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW);
+
+         hbld.ADD(int_pixel_x,
+                  fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
+                  fs_reg(brw_imm_v(0x10101010)));
+         hbld.ADD(int_pixel_y,
+                  fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
+                  fs_reg(brw_imm_v(0x11001100)));
+
+         /* As of gen6, we can no longer mix float and int sources.  We have
+          * to turn the integer pixel centers into floats for their actual
+          * use.
+          */
+         hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
+         hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
+      }
    }
 
    abld = bld.annotate("compute pos.w");
-   this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
+   this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
    this->wpos_w = vgrf(glsl_type::float_type);
    abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
 
    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
+
+   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+      this->delta_xy[i] = fetch_payload_reg(
+         bld, payload.barycentric_coord_reg[i], BRW_REGISTER_TYPE_F, 2);
+   }
+
    uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
       (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
        1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
 
-   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
-      uint8_t reg = payload.barycentric_coord_reg[i];
-      this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
+   if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
+      /* Get the pixel/sample mask into f0 so that we know which
+       * pixels are lit.  Then, for each channel that is unlit,
+       * replace the centroid data with non-centroid data.
+       */
+      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
+         bld.exec_all().group(1, 0)
+            .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW),
+                 retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW));
+      }
 
-      if (devinfo->needs_unlit_centroid_workaround &&
-          (centroid_modes & (1 << i))) {
-         /* Get the pixel/sample mask into f0 so that we know which
-          * pixels are lit.  Then, for each channel that is unlit,
-          * replace the centroid data with non-centroid data.
-          */
-         bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+         if (!(centroid_modes & (1 << i)))
+            continue;
 
-         uint8_t pixel_reg = payload.barycentric_coord_reg[i - 1];
+         const fs_reg &pixel_delta_xy = delta_xy[i - 1];
 
-         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
-                           bld.half(0).MOV(brw_vec8_grf(reg, 0),
-                                           brw_vec8_grf(pixel_reg, 0)));
-         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
-                           bld.half(0).MOV(brw_vec8_grf(reg + 1, 0),
-                                           brw_vec8_grf(pixel_reg + 1, 0)));
-         if (dispatch_width == 16) {
-            set_predicate_inv(BRW_PREDICATE_NORMAL, true,
-                              bld.half(1).MOV(brw_vec8_grf(reg + 2, 0),
-                                              brw_vec8_grf(pixel_reg + 2, 0)));
-            set_predicate_inv(BRW_PREDICATE_NORMAL, true,
-                              bld.half(1).MOV(brw_vec8_grf(reg + 3, 0),
-                                              brw_vec8_grf(pixel_reg + 3, 0)));
+         for (unsigned q = 0; q < dispatch_width / 8; q++) {
+            for (unsigned c = 0; c < 2; c++) {
+               const unsigned idx = c + (q & 2) + (q & 1) * dispatch_width / 8;
+               set_predicate_inv(
+                  BRW_PREDICATE_NORMAL, true,
+                  bld.half(q).MOV(horiz_offset(delta_xy[i], idx * 8),
+                                  horiz_offset(pixel_delta_xy, idx * 8)));
+            }
          }
-         assert(dispatch_width != 32); /* not implemented yet */
       }
    }
 }
@@ -364,16 +369,14 @@
    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
 
    /* Hand over gl_FragDepth or the payload depth. */
-   const fs_reg dst_depth = (payload.dest_depth_reg ?
-                             fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0)) :
-                             fs_reg());
+   const fs_reg dst_depth = fetch_payload_reg(bld, payload.dest_depth_reg);
    fs_reg src_depth, src_stencil;
 
    if (source_depth_to_render_target) {
       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
          src_depth = frag_depth;
       else
-         src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
+         src_depth = fetch_payload_reg(bld, payload.source_depth_reg);
    }
 
    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
@@ -441,7 +444,8 @@
       inst->target = target;
    }
 
-   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE);
+   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
+                                this->outputs[0].file != BAD_FILE);
    assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
 
    if (inst == NULL) {
@@ -461,6 +465,7 @@
       inst->target = 0;
    }
 
+   inst->last_rt = true;
    inst->eot = true;
 }
 
@@ -597,7 +602,7 @@
          per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
                                        gs_vertex_count.ud);
       } else {
-         per_slot_offsets = vgrf(glsl_type::int_type);
+         per_slot_offsets = vgrf(glsl_type::uint_type);
          bld.MUL(per_slot_offsets, gs_vertex_count,
                  brw_imm_ud(output_vertex_size_owords));
       }
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
index 8663c1b..438a473 100644
--- a/src/intel/compiler/brw_inst.h
+++ b/src/intel/compiler/brw_inst.h
@@ -459,6 +459,84 @@
 #define MD(x) ((x) + 96)
 
 /**
+ * Set the SEND(C) message descriptor immediate.
+ *
+ * This doesn't include the SFID or EOT fields that were considered to be
+ * part of the message descriptor by ancient versions of the BSpec, because
+ * they are present in the instruction even if the message descriptor is
+ * provided indirectly in the address register, so we want to specify them
+ * separately.
+ */
+static inline void
+brw_inst_set_send_desc(const struct gen_device_info *devinfo,
+                       brw_inst *inst, uint32_t value)
+{
+   if (devinfo->gen >= 9) {
+      brw_inst_set_bits(inst, 126, 96, value);
+      assert(value >> 31 == 0);
+   } else if (devinfo->gen >= 5) {
+      brw_inst_set_bits(inst, 124, 96, value);
+      assert(value >> 29 == 0);
+   } else {
+      brw_inst_set_bits(inst, 119, 96, value);
+      assert(value >> 24 == 0);
+   }
+}
+
+/**
+ * Get the SEND(C) message descriptor immediate.
+ *
+ * \sa brw_inst_set_send_desc().
+ */
+static inline uint32_t
+brw_inst_send_desc(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (devinfo->gen >= 9)
+      return brw_inst_bits(inst, 126, 96);
+   else if (devinfo->gen >= 5)
+      return brw_inst_bits(inst, 124, 96);
+   else
+      return brw_inst_bits(inst, 119, 96);
+}
+
+/**
+ * Set the SEND(C) message extended descriptor immediate.
+ *
+ * This doesn't include the SFID or EOT fields that were considered to be
+ * part of the extended message descriptor by some versions of the BSpec,
+ * because they are present in the instruction even if the extended message
+ * descriptor is provided indirectly in a register, so we want to specify them
+ * separately.
+ */
+static inline void
+brw_inst_set_send_ex_desc(const struct gen_device_info *devinfo,
+                          brw_inst *inst, uint32_t value)
+{
+   assert(devinfo->gen >= 9);
+   brw_inst_set_bits(inst, 94, 91, (value >> 28) & ((1u << 4) - 1));
+   brw_inst_set_bits(inst, 88, 85, (value >> 24) & ((1u << 4) - 1));
+   brw_inst_set_bits(inst, 83, 80, (value >> 20) & ((1u << 4) - 1));
+   brw_inst_set_bits(inst, 67, 64, (value >> 16) & ((1u << 4) - 1));
+   assert((value & ((1u << 16) - 1)) == 0);
+}
+
+/**
+ * Get the SEND(C) message extended descriptor immediate.
+ *
+ * \sa brw_inst_set_send_ex_desc().
+ */
+static inline uint32_t
+brw_inst_send_ex_desc(const struct gen_device_info *devinfo,
+                      const brw_inst *inst)
+{
+   assert(devinfo->gen >= 9);
+   return (brw_inst_bits(inst, 94, 91) << 28 |
+           brw_inst_bits(inst, 88, 85) << 24 |
+           brw_inst_bits(inst, 83, 80) << 20 |
+           brw_inst_bits(inst, 67, 64) << 16);
+}
+
+/**
  * Fields for SEND messages:
  *  @{
  */
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index f06a33c..07e7224 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -354,6 +354,7 @@
    unsigned components_read(unsigned i) const;
    unsigned size_read(int arg) const;
    bool can_do_source_mods(const struct gen_device_info *devinfo);
+   bool can_do_cmod();
    bool can_change_types() const;
    bool has_source_and_destination_hazard() const;
 
@@ -374,6 +375,7 @@
 
    uint8_t sources; /**< Number of fs_reg sources. */
 
+   bool last_rt:1;
    bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */
 };
 
diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h
index e401d8b..65b1e4f 100644
--- a/src/intel/compiler/brw_ir_vec4.h
+++ b/src/intel/compiler/brw_ir_vec4.h
@@ -291,6 +291,7 @@
                       int swizzle, int swizzle_mask);
    void reswizzle(int dst_writemask, int swizzle);
    bool can_do_source_mods(const struct gen_device_info *devinfo);
+   bool can_do_cmod();
    bool can_do_writemask(const struct gen_device_info *devinfo);
    bool can_change_types() const;
    bool has_source_and_destination_hazard() const;
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 16b0d86..29ad68f 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -238,8 +238,7 @@
     */
    const bool has_sgvs =
       nir->info.system_values_read &
-      (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
-       BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) |
+      (BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) |
        BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
        BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID));
@@ -261,11 +260,11 @@
             nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
             switch (intrin->intrinsic) {
-            case nir_intrinsic_load_base_vertex:
             case nir_intrinsic_load_first_vertex:
             case nir_intrinsic_load_base_instance:
             case nir_intrinsic_load_vertex_id_zero_base:
             case nir_intrinsic_load_instance_id:
+            case nir_intrinsic_load_is_indexed_draw:
             case nir_intrinsic_load_draw_id: {
                b.cursor = nir_after_instr(&intrin->instr);
 
@@ -279,7 +278,6 @@
 
                nir_intrinsic_set_base(load, num_inputs);
                switch (intrin->intrinsic) {
-               case nir_intrinsic_load_base_vertex:
                case nir_intrinsic_load_first_vertex:
                   nir_intrinsic_set_component(load, 0);
                   break;
@@ -293,11 +291,15 @@
                   nir_intrinsic_set_component(load, 3);
                   break;
                case nir_intrinsic_load_draw_id:
-                  /* gl_DrawID is stored right after gl_VertexID and friends
-                   * if any of them exist.
+               case nir_intrinsic_load_is_indexed_draw:
+                  /* gl_DrawID and IsIndexedDraw are stored right after
+                   * gl_VertexID and friends if any of them exist.
                    */
                   nir_intrinsic_set_base(load, num_inputs + has_sgvs);
-                  nir_intrinsic_set_component(load, 0);
+                  if (intrin->intrinsic == nir_intrinsic_load_draw_id)
+                     nir_intrinsic_set_component(load, 0);
+                  else
+                     nir_intrinsic_set_component(load, 1);
                   break;
                default:
                   unreachable("Invalid system value intrinsic");
@@ -458,8 +460,7 @@
 }
 
 void
-brw_nir_lower_vue_outputs(nir_shader *nir,
-                          bool is_scalar)
+brw_nir_lower_vue_outputs(nir_shader *nir)
 {
    nir_foreach_variable(var, &nir->outputs) {
       var->data.driver_location = var->data.location;
@@ -584,12 +585,31 @@
                              nir_lower_dfract |
                              nir_lower_dround_even |
                              nir_lower_dmod);
-      OPT(nir_lower_64bit_pack);
+      OPT(nir_lower_pack);
    } while (progress);
 
    return nir;
 }
 
+static unsigned
+lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data)
+{
+   assert(alu->dest.dest.is_ssa);
+   if (alu->dest.dest.ssa.bit_size != 16)
+      return 0;
+
+   switch (alu->op) {
+   case nir_op_idiv:
+   case nir_op_imod:
+   case nir_op_irem:
+   case nir_op_udiv:
+   case nir_op_umod:
+      return 32;
+   default:
+      return 0;
+   }
+}
+
 /* Does some simple lowering and runs the standard suite of optimizations
  *
  * This is intended to be called more-or-less directly after you get the
@@ -643,6 +663,15 @@
 
    nir = brw_nir_optimize(nir, compiler, is_scalar);
 
+   /* This needs to be run after the first optimization pass but before we
+    * lower indirect derefs away
+    */
+   if (compiler->supports_shader_constants) {
+      OPT(nir_opt_large_constants, NULL, 32);
+   }
+
+   nir_lower_bit_size(nir, lower_bit_size_callback, NULL);
+
    if (is_scalar) {
       OPT(nir_lower_load_const_to_scalar);
    }
@@ -680,6 +709,10 @@
 brw_nir_link_shaders(const struct brw_compiler *compiler,
                      nir_shader **producer, nir_shader **consumer)
 {
+   nir_lower_io_arrays_to_elements(*producer, *consumer);
+   nir_validate_shader(*producer);
+   nir_validate_shader(*consumer);
+
    NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out);
    NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in);
 
@@ -701,7 +734,7 @@
       *producer = brw_nir_optimize(*producer, compiler, p_is_scalar);
 
       const bool c_is_scalar =
-         compiler->scalar_stage[(*producer)->info.stage];
+         compiler->scalar_stage[(*consumer)->info.stage];
       *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar);
    }
 }
@@ -764,6 +797,8 @@
       OPT(nir_lower_vec_to_movs);
    }
 
+   OPT(nir_opt_dce);
+
    /* This is the last pass we run before we start emitting stuff.  It
     * determines when we need to insert boolean resolves on Gen <= 5.  We
     * run it last because it stashes data in instr->pass_flags and we don't
@@ -856,6 +891,10 @@
       return BRW_REGISTER_TYPE_W;
    case nir_type_uint16:
       return BRW_REGISTER_TYPE_UW;
+   case nir_type_int8:
+      return BRW_REGISTER_TYPE_B;
+   case nir_type_uint8:
+      return BRW_REGISTER_TYPE_UB;
    default:
       unreachable("unknown type");
    }
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
index 5e417e3..00b6173 100644
--- a/src/intel/compiler/brw_nir.h
+++ b/src/intel/compiler/brw_nir.h
@@ -109,7 +109,7 @@
 void brw_nir_lower_fs_inputs(nir_shader *nir,
                              const struct gen_device_info *devinfo,
                              const struct brw_wm_prog_key *key);
-void brw_nir_lower_vue_outputs(nir_shader *nir, bool is_scalar);
+void brw_nir_lower_vue_outputs(nir_shader *nir);
 void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
                                GLenum tes_primitive_mode);
 void brw_nir_lower_fs_outputs(nir_shader *nir);
@@ -144,8 +144,6 @@
                                 struct gl_program *prog,
                                 struct brw_stage_prog_data *stage_prog_data);
 
-void brw_nir_lower_patch_vertices_in_to_uniform(nir_shader *nir);
-
 void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler,
                                 nir_shader *nir,
                                 const struct brw_vs_prog_key *vs_key,
diff --git a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
index 48bedab..cfa5316 100644
--- a/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
+++ b/src/intel/compiler/brw_nir_analyze_ubo_ranges.c
@@ -126,17 +126,17 @@
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
       switch (intrin->intrinsic) {
       case nir_intrinsic_load_uniform:
-      case nir_intrinsic_image_var_load:
-      case nir_intrinsic_image_var_store:
-      case nir_intrinsic_image_var_atomic_add:
-      case nir_intrinsic_image_var_atomic_min:
-      case nir_intrinsic_image_var_atomic_max:
-      case nir_intrinsic_image_var_atomic_and:
-      case nir_intrinsic_image_var_atomic_or:
-      case nir_intrinsic_image_var_atomic_xor:
-      case nir_intrinsic_image_var_atomic_exchange:
-      case nir_intrinsic_image_var_atomic_comp_swap:
-      case nir_intrinsic_image_var_size:
+      case nir_intrinsic_image_deref_load:
+      case nir_intrinsic_image_deref_store:
+      case nir_intrinsic_image_deref_atomic_add:
+      case nir_intrinsic_image_deref_atomic_min:
+      case nir_intrinsic_image_deref_atomic_max:
+      case nir_intrinsic_image_deref_atomic_and:
+      case nir_intrinsic_image_deref_atomic_or:
+      case nir_intrinsic_image_deref_atomic_xor:
+      case nir_intrinsic_image_deref_atomic_exchange:
+      case nir_intrinsic_image_deref_atomic_comp_swap:
+      case nir_intrinsic_image_deref_size:
          state->uses_regular_uniforms = true;
          continue;
 
@@ -154,14 +154,26 @@
          const int block = block_const->u32[0];
          const int offset = offset_const->u32[0] / 32;
 
-         /* Won't fit in our bitfield */
+         /* Avoid shifting by the width of our bitfield or more, as this
+          * is undefined in C.  Even if we require multiple bits to represent
+          * the entire value, it's OK to record a partial value - the backend
+          * is capable of falling back to pull loads for later components of
+          * vectors, as it has to shrink ranges for other reasons anyway.
+          */
          if (offset >= 64)
             continue;
 
+         /* The value might span multiple 32-byte chunks. */
+         const int bytes = nir_intrinsic_dest_components(intrin) *
+                           (nir_dest_bit_size(intrin->dest) / 8);
+         const int start = ROUND_DOWN_TO(offset_const->u32[0], 32);
+         const int end = ALIGN(offset_const->u32[0] + bytes, 32);
+         const int chunks = (end - start) / 32;
+
          /* TODO: should we count uses in loops as higher benefit? */
 
          struct ubo_block_info *info = get_block_info(state, block);
-         info->offsets |= 1ull << offset;
+         info->offsets |= ((1ull << chunks) - 1) << offset;
          info->uses[offset]++;
       }
    }
diff --git a/src/intel/compiler/brw_nir_trig_workarounds.py b/src/intel/compiler/brw_nir_trig_workarounds.py
index 3d08b9a..d60e094 100644
--- a/src/intel/compiler/brw_nir_trig_workarounds.py
+++ b/src/intel/compiler/brw_nir_trig_workarounds.py
@@ -31,6 +31,8 @@
 # amplitude slightly.  Apparently this also minimizes the error function,
 # reducing the maximum error from 0.00006 to about 0.00003.
 
+from __future__ import print_function
+
 import argparse
 import sys
 
@@ -51,9 +53,9 @@
 def run():
     import nir_algebraic  # pylint: disable=import-error
 
-    print '#include "brw_nir.h"'
-    print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
-                                      TRIG_WORKAROUNDS).render()
+    print('#include "brw_nir.h"')
+    print(nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
+                                      TRIG_WORKAROUNDS).render())
 
 
 if __name__ == '__main__':
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
index ac12ab3..46d6619 100644
--- a/src/intel/compiler/brw_reg.h
+++ b/src/intel/compiler/brw_reg.h
@@ -376,6 +376,15 @@
    }
 }
 
+static inline bool
+type_is_unsigned_int(enum brw_reg_type tp)
+{
+   return tp == BRW_REGISTER_TYPE_UB ||
+          tp == BRW_REGISTER_TYPE_UW ||
+          tp == BRW_REGISTER_TYPE_UD ||
+          tp == BRW_REGISTER_TYPE_UQ;
+}
+
 /**
  * Construct a brw_reg.
  * \param file      one of the BRW_x_REGISTER_FILE values
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index 74d7c8e..7b7a0d7 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -763,22 +763,22 @@
 }
 
 void
-vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *)
 {
 }
 
 void
-vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
+vec4_instruction_scheduler::setup_liveness(cfg_t *)
 {
 }
 
 void
-vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *)
 {
 }
 
 int
-vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *)
 {
    return 0;
 }
@@ -1560,7 +1560,7 @@
 }
 
 int
-vec4_instruction_scheduler::issue_time(backend_instruction *inst)
+vec4_instruction_scheduler::issue_time(backend_instruction *)
 {
    /* We always execute as two vec4s in parallel. */
    return 2;
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
index cc37712..804573f 100644
--- a/src/intel/compiler/brw_shader.cpp
+++ b/src/intel/compiler/brw_shader.cpp
@@ -296,6 +296,9 @@
       return "typed_surface_write_logical";
    case SHADER_OPCODE_MEMORY_FENCE:
       return "memory_fence";
+   case SHADER_OPCODE_INTERLOCK:
+      /* For an interlock we actually issue a memory fence via sendc. */
+      return "interlock";
 
    case SHADER_OPCODE_BYTE_SCATTERED_READ:
       return "byte_scattered_read";
@@ -378,8 +381,6 @@
    case FS_OPCODE_DDY_FINE:
       return "ddy_fine";
 
-   case FS_OPCODE_CINTERP:
-      return "cinterp";
    case FS_OPCODE_LINTERP:
       return "linterp";
 
@@ -399,8 +400,6 @@
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
       return "varying_pull_const_logical";
 
-   case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
-      return "mov_dispatch_to_flags";
    case FS_OPCODE_DISCARD_JUMP:
       return "discard_jump";
 
@@ -605,7 +604,8 @@
    case BRW_REGISTER_TYPE_V:
       assert(!"unimplemented: negate UV/V immediate");
    case BRW_REGISTER_TYPE_HF:
-      assert(!"unimplemented: negate HF immediate");
+      reg->ud ^= 0x80008000;
+      return true;
    case BRW_REGISTER_TYPE_NF:
       unreachable("no NF immediates");
    }
@@ -651,7 +651,8 @@
    case BRW_REGISTER_TYPE_V:
       assert(!"unimplemented: abs V immediate");
    case BRW_REGISTER_TYPE_HF:
-      assert(!"unimplemented: abs HF immediate");
+      reg->ud &= ~0x80008000;
+      return true;
    case BRW_REGISTER_TYPE_NF:
       unreachable("no NF immediates");
    }
@@ -958,7 +959,6 @@
    case BRW_OPCODE_SHR:
    case BRW_OPCODE_SUBB:
    case BRW_OPCODE_XOR:
-   case FS_OPCODE_CINTERP:
    case FS_OPCODE_LINTERP:
       return true;
    default:
@@ -985,9 +985,9 @@
    return writes_accumulator ||
           (devinfo->gen < 6 &&
            ((opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP) ||
-            (opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP &&
-             opcode != FS_OPCODE_CINTERP))) ||
-          (opcode == FS_OPCODE_LINTERP && !devinfo->has_pln);
+            (opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP))) ||
+          (opcode == FS_OPCODE_LINTERP &&
+           (!devinfo->has_pln || devinfo->gen <= 6));
 }
 
 bool
@@ -1006,12 +1006,14 @@
    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
    case SHADER_OPCODE_MEMORY_FENCE:
+   case SHADER_OPCODE_INTERLOCK:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case FS_OPCODE_FB_WRITE:
    case FS_OPCODE_FB_WRITE_LOGICAL:
+   case FS_OPCODE_REP_FB_WRITE:
    case SHADER_OPCODE_BARRIER:
    case TCS_OPCODE_URB_WRITE:
    case TCS_OPCODE_RELEASE_INPUT:
@@ -1193,7 +1195,7 @@
 
    nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
    brw_nir_lower_tes_inputs(nir, input_vue_map);
-   brw_nir_lower_vue_outputs(nir, is_scalar);
+   brw_nir_lower_vue_outputs(nir);
    nir = brw_postprocess_nir(nir, compiler, is_scalar);
 
    brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
@@ -1281,7 +1283,7 @@
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
-      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+      fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants, false,
                      MESA_SHADER_TESS_EVAL);
       if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
index d662a73..e2fa585 100644
--- a/src/intel/compiler/brw_vec4.cpp
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -41,7 +41,7 @@
 void
 src_reg::init()
 {
-   memset(this, 0, sizeof(*this));
+   memset((void*)this, 0, sizeof(*this));
    this->file = BAD_FILE;
    this->type = BRW_REGISTER_TYPE_UD;
 }
@@ -83,7 +83,7 @@
 void
 dst_reg::init()
 {
-   memset(this, 0, sizeof(*this));
+   memset((void*)this, 0, sizeof(*this));
    this->file = BAD_FILE;
    this->type = BRW_REGISTER_TYPE_UD;
    this->writemask = WRITEMASK_XYZW;
@@ -257,6 +257,26 @@
 }
 
 bool
+vec4_instruction::can_do_cmod()
+{
+   if (!backend_instruction::can_do_cmod())
+      return false;
+
+   /* The accumulator result appears to get used for the conditional modifier
+    * generation.  When negating a UD value, there is a 33rd bit generated for
+    * the sign in the accumulator value, so now you can't check, for example,
+    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
+    */
+   for (unsigned i = 0; i < 3; i++) {
+      if (src[i].file != BAD_FILE &&
+          type_is_unsigned_int(src[i].type) && src[i].negate)
+         return false;
+   }
+
+   return true;
+}
+
+bool
 vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
 {
    switch (opcode) {
@@ -819,6 +839,14 @@
          }
          break;
 
+      case BRW_OPCODE_OR:
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+
       case VEC4_OPCODE_UNPACK_UNIFORM:
          if (inst->src[0].file != UNIFORM) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -1294,6 +1322,15 @@
                }
             }
 
+            /* VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1)
+             * instructions, and this optimization pass is not capable of
+             * handling that.  Bail on these instructions and hope that some
+             * later optimization pass can do the right thing after they are
+             * expanded.
+             */
+            if (scan_inst->opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
+               break;
+
             /* This doesn't handle saturation on the instruction we
              * want to coalesce away if the register types do not match.
              * But if scan_inst is a non type-converting 'mov', we can fix
@@ -2819,7 +2856,7 @@
    prog_data->double_inputs_read = shader->info.vs.double_inputs;
 
    brw_nir_lower_vs_inputs(shader, key->gl_attrib_wa_flags);
-   brw_nir_lower_vue_outputs(shader, is_scalar);
+   brw_nir_lower_vue_outputs(shader);
    shader = brw_postprocess_nir(shader, compiler, is_scalar);
 
    prog_data->base.clip_distance_mask =
@@ -2834,17 +2871,23 @@
     * incoming vertex attribute.  So, add an extra slot.
     */
    if (shader->info.system_values_read &
-       (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
-        BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) |
+       (BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) |
         BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
         BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
         BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
       nr_attribute_slots++;
    }
 
+   /* gl_DrawID and IsIndexedDraw share their very own vec4 */
    if (shader->info.system_values_read &
-       BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX))
-      prog_data->uses_basevertex = true;
+       (BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID) |
+        BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW))) {
+      nr_attribute_slots++;
+   }
+
+   if (shader->info.system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_IS_INDEXED_DRAW))
+      prog_data->uses_is_indexed_draw = true;
 
    if (shader->info.system_values_read &
        BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX))
@@ -2862,12 +2905,9 @@
        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))
       prog_data->uses_instanceid = true;
 
-   /* gl_DrawID has its very own vec4 */
    if (shader->info.system_values_read &
-       BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
-      prog_data->uses_drawid = true;
-      nr_attribute_slots++;
-   }
+       BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID))
+          prog_data->uses_drawid = true;
 
    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
@@ -2921,7 +2961,7 @@
 
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
 
-      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+      fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants,
                      v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
       if (INTEL_DEBUG & DEBUG_VS) {
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 8ff34a5..888cb35 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -324,17 +324,15 @@
          gen6_resolve_implied_move(p, &src, inst->base_mrf);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, src, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              msg_type,
-                              1 /* rlen */,
-                              inst->mlen /* mlen */,
-                              inst->header_size != 0 /* header */,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              return_format);
+      brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, src, addr,
+         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          msg_type,
+                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                          return_format));
 
       /* visitor knows more than we do about the surface limit required,
        * so has already done marking.
@@ -777,10 +775,9 @@
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, brw_null_reg());
    brw_set_src0(p, send, urb_header);
+   brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true));
 
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              inst->mlen /* mlen */, 0 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
    brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
    if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
@@ -966,9 +963,9 @@
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
 
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              1 /* mlen */, 1 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true));
+
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
    brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
@@ -1002,9 +999,9 @@
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, brw_null_reg());
    brw_set_src0(p, send, header);
-   brw_set_message_descriptor(p, send, BRW_SFID_URB,
-                              1 /* mlen */, 0 /* rlen */,
-                              true /* header */, false /* eot */);
+   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true));
+
+   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
    brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
    brw_inst_set_urb_complete(devinfo, send, 1);
    brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
@@ -1171,23 +1168,23 @@
    const unsigned target_cache =
       devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+      BRW_SFID_DATAPORT_READ;
 
    /* Each of the 8 channel enables is considered for whether each
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
-   brw_set_dp_read_message(p, send,
-                           brw_scratch_surface_idx(p),
-			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-			   msg_type, target_cache,
-			   2, /* mlen */
-                           true, /* header_present */
-			   1 /* rlen */);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 2, 1, true) |
+                brw_dp_read_desc(devinfo,
+                                 brw_scratch_surface_idx(p),
+                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                 msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
 }
 
 static void
@@ -1201,7 +1198,7 @@
    const unsigned target_cache =
       (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
        devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
-       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+       BRW_SFID_DATAPORT_WRITE);
    struct brw_reg header = brw_vec8_grf(0, 0);
    bool write_commit;
 
@@ -1253,21 +1250,19 @@
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(p->devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
-   brw_set_dp_write_message(p, send,
-                            brw_scratch_surface_idx(p),
-			    BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-			    msg_type,
-                            target_cache,
-			    3, /* mlen */
-			    true, /* header present */
-			    false, /* not a render target write */
-			    write_commit, /* rlen */
-			    false, /* eot */
-			    write_commit);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 3, write_commit, true) |
+                brw_dp_write_desc(devinfo,
+                                  brw_scratch_surface_idx(p),
+                                  BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                  msg_type,
+                                  false, /* not a render target write */
+                                  write_commit));
 }
 
 static void
@@ -1281,7 +1276,7 @@
    const struct gen_device_info *devinfo = p->devinfo;
    const unsigned target_cache =
       (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
-       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
+       BRW_SFID_DATAPORT_READ);
    assert(index.file == BRW_IMMEDIATE_VALUE &&
 	  index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.ud;
@@ -1319,18 +1314,17 @@
     * dword is written.
     */
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_sfid(devinfo, send, target_cache);
    brw_set_dest(p, send, dst);
    brw_set_src0(p, send, header);
    if (devinfo->gen < 6)
       brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
-   brw_set_dp_read_message(p, send,
-			   surf_index,
-			   BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-			   msg_type,
-                           target_cache,
-			   2, /* mlen */
-                           true, /* header_present */
-			   1 /* rlen */);
+   brw_set_desc(p, send,
+                brw_message_desc(devinfo, 2, 1, true) |
+                brw_dp_read_desc(devinfo, surf_index,
+                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                                 msg_type,
+                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
 }
 
 static void
@@ -1369,22 +1363,21 @@
                                  struct brw_reg surf_index,
                                  struct brw_reg offset)
 {
+   const struct gen_device_info *devinfo = p->devinfo;
    assert(surf_index.type == BRW_REGISTER_TYPE_UD);
 
    if (surf_index.file == BRW_IMMEDIATE_VALUE) {
 
       brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
       brw_set_dest(p, insn, dst);
       brw_set_src0(p, insn, offset);
-      brw_set_sampler_message(p, insn,
-                              surf_index.ud,
-                              0, /* LD message ignores sampler unit */
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1, /* rlen */
-                              inst->mlen,
-                              inst->header_size != 0,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
+      brw_set_desc(p, insn,
+                   brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+                   brw_sampler_desc(devinfo, surf_index.ud,
+                                    0, /* LD message ignores sampler unit */
+                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                                    BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0));
 
       brw_mark_surface_used(&prog_data->base, surf_index.ud);
 
@@ -1398,7 +1391,7 @@
 
       /* a0.0 = surf_index & 0xff */
       brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
-      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+      brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1);
       brw_set_dest(p, insn_and, addr);
       brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
       brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
@@ -1406,23 +1399,21 @@
       brw_pop_insn_state(p);
 
       /* dst = send(offset, a0.0 | <descriptor>) */
-      brw_inst *insn = brw_send_indirect_message(
-         p, BRW_SFID_SAMPLER, dst, offset, addr);
-      brw_set_sampler_message(p, insn,
-                              0 /* surface */,
-                              0 /* sampler */,
-                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
-                              1 /* rlen */,
-                              inst->mlen,
-                              inst->header_size != 0,
-                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
-                              0);
+      brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, offset, addr,
+         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
+         brw_sampler_desc(devinfo,
+                          0 /* surface */,
+                          0 /* sampler */,
+                          GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                          0));
    }
 }
 
 static void
 generate_set_simd4x2_header_gen9(struct brw_codegen *p,
-                                 vec4_instruction *inst,
+                                 vec4_instruction *,
                                  struct brw_reg dst)
 {
    brw_push_insn_state(p);
@@ -1440,9 +1431,9 @@
 
 static void
 generate_mov_indirect(struct brw_codegen *p,
-                      vec4_instruction *inst,
+                      vec4_instruction *,
                       struct brw_reg dst, struct brw_reg reg,
-                      struct brw_reg indirect, struct brw_reg length)
+                      struct brw_reg indirect)
 {
    assert(indirect.type == BRW_REGISTER_TYPE_UD);
    assert(p->devinfo->gen >= 6);
@@ -1917,7 +1908,7 @@
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst);
+         brw_memory_fence(p, dst, BRW_OPCODE_SEND);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
@@ -2154,7 +2145,7 @@
          break;
 
       case SHADER_OPCODE_MOV_INDIRECT:
-         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+         generate_mov_indirect(p, inst, dst, src[0], src[1]);
          break;
 
       case BRW_OPCODE_DIM:
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
index fb4c125..c4c876a 100644
--- a/src/intel/compiler/brw_vec4_gs_visitor.cpp
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -642,7 +642,7 @@
 
    shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
    brw_nir_lower_vue_inputs(shader, &c.input_vue_map);
-   brw_nir_lower_vue_outputs(shader, is_scalar);
+   brw_nir_lower_vue_outputs(shader);
    shader = brw_postprocess_nir(shader, compiler, is_scalar);
 
    prog_data->base.clip_distance_mask =
@@ -856,7 +856,7 @@
          prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
          prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
 
-         fs_generator g(compiler, log_data, mem_ctx, &c.key,
+         fs_generator g(compiler, log_data, mem_ctx,
                         &prog_data->base.base, v.promoted_constants,
                         false, MESA_SHADER_GEOMETRY);
          if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
index 34034ca..4c3a2d2 100644
--- a/src/intel/compiler/brw_vec4_nir.cpp
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -169,8 +169,7 @@
       break;
 
    default:
-      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
-      break;
+      unreachable("VS instruction not yet implemented by NIR->vec4");
    }
 }
 
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
index 8024011..1f0a69f 100644
--- a/src/intel/compiler/brw_vec4_tcs.cpp
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -474,7 +474,7 @@
       prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
       prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
 
-      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+      fs_generator g(compiler, log_data, mem_ctx,
                      &prog_data->base.base, v.promoted_constants, false,
                      MESA_SHADER_TESS_CTRL);
       if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
diff --git a/src/intel/compiler/brw_wm_iz.cpp b/src/intel/compiler/brw_wm_iz.cpp
index fead165..b9b7e70 100644
--- a/src/intel/compiler/brw_wm_iz.cpp
+++ b/src/intel/compiler/brw_wm_iz.cpp
@@ -122,9 +122,10 @@
 void fs_visitor::setup_fs_payload_gen4()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
+   assert(dispatch_width <= 16);
    struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   GLuint reg = 2;
+   GLuint reg = 1;
    bool kill_stats_promoted_workaround = false;
    int lookup = key->iz_lookup;
 
@@ -141,11 +142,13 @@
       kill_stats_promoted_workaround = true;
    }
 
+   payload.subspan_coord_reg[0] = reg++;
+
    prog_data->uses_src_depth =
       (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
    if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
        kill_stats_promoted_workaround) {
-      payload.source_depth_reg = reg;
+      payload.source_depth_reg[0] = reg;
       reg += 2;
    }
 
@@ -153,14 +156,14 @@
       source_depth_to_render_target = true;
 
    if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_WM_AA_NEVER) {
-      payload.aa_dest_stencil_reg = reg;
+      payload.aa_dest_stencil_reg[0] = reg;
       runtime_check_aads_emit =
          !wm_iz_table[lookup].ds_present && key->line_aa == BRW_WM_AA_SOMETIMES;
       reg++;
    }
 
    if (wm_iz_table[lookup].dd_present) {
-      payload.dest_depth_reg = reg;
+      payload.dest_depth_reg[0] = reg;
       reg+=2;
    }
 
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
index b132b87..73300b2 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -40,8 +40,10 @@
    { "skl", },
    { "bxt", },
    { "kbl", },
+   { "aml", },
    { "glk", },
    { "cfl", },
+   { "whl", },
    { "cnl", },
    { "icl", },
 };
diff --git a/src/intel/dev/gen_device_info.c b/src/intel/dev/gen_device_info.c
index c7462a5..0f12d17 100644
--- a/src/intel/dev/gen_device_info.c
+++ b/src/intel/dev/gen_device_info.c
@@ -57,8 +57,10 @@
       { "skl", 0x1912 },
       { "bxt", 0x5A85 },
       { "kbl", 0x5912 },
+      { "aml", 0x591C },
       { "glk", 0x3185 },
       { "cfl", 0x3E9B },
+      { "whl", 0x3EA1 },
       { "cnl", 0x5a52 },
       { "icl", 0x8a52 },
    };
@@ -105,6 +107,7 @@
       .size = 256,
    },
    .timestamp_frequency = 12500000,
+   .simulator_id = -1,
 };
 
 static const struct gen_device_info gen_device_info_g4x = {
@@ -124,6 +127,7 @@
       .size = 384,
    },
    .timestamp_frequency = 12500000,
+   .simulator_id = -1,
 };
 
 static const struct gen_device_info gen_device_info_ilk = {
@@ -142,6 +146,7 @@
       .size = 1024,
    },
    .timestamp_frequency = 12500000,
+   .simulator_id = -1,
 };
 
 static const struct gen_device_info gen_device_info_snb_gt1 = {
@@ -170,6 +175,7 @@
       },
    },
    .timestamp_frequency = 12500000,
+   .simulator_id = -1,
 };
 
 static const struct gen_device_info gen_device_info_snb_gt2 = {
@@ -198,6 +204,7 @@
       },
    },
    .timestamp_frequency = 12500000,
+   .simulator_id = -1,
 };
 
 #define GEN7_FEATURES                               \
@@ -236,6 +243,7 @@
          [MESA_SHADER_GEOMETRY]  = 192,
       },
    },
+   .simulator_id = 7,
 };
 
 static const struct gen_device_info gen_device_info_ivb_gt2 = {
@@ -265,6 +273,7 @@
          [MESA_SHADER_GEOMETRY]  = 320,
       },
    },
+   .simulator_id = 7,
 };
 
 static const struct gen_device_info gen_device_info_byt = {
@@ -294,6 +303,7 @@
          [MESA_SHADER_GEOMETRY]  = 192,
       },
    },
+   .simulator_id = 10,
 };
 
 #define HSW_FEATURES             \
@@ -328,6 +338,7 @@
          [MESA_SHADER_GEOMETRY]  = 256,
       },
    },
+   .simulator_id = 9,
 };
 
 static const struct gen_device_info gen_device_info_hsw_gt2 = {
@@ -356,6 +367,7 @@
          [MESA_SHADER_GEOMETRY]  = 640,
       },
    },
+   .simulator_id = 9,
 };
 
 static const struct gen_device_info gen_device_info_hsw_gt3 = {
@@ -384,6 +396,7 @@
          [MESA_SHADER_GEOMETRY]  = 640,
       },
    },
+   .simulator_id = 9,
 };
 
 /* It's unclear how well supported sampling from the hiz buffer is on GEN8,
@@ -429,7 +442,8 @@
          [MESA_SHADER_TESS_EVAL] = 1536,
          [MESA_SHADER_GEOMETRY]  = 960,
       },
-   }
+   },
+   .simulator_id = 11,
 };
 
 static const struct gen_device_info gen_device_info_bdw_gt2 = {
@@ -453,7 +467,8 @@
          [MESA_SHADER_TESS_EVAL] = 1536,
          [MESA_SHADER_GEOMETRY]  = 960,
       },
-   }
+   },
+   .simulator_id = 11,
 };
 
 static const struct gen_device_info gen_device_info_bdw_gt3 = {
@@ -477,7 +492,8 @@
          [MESA_SHADER_TESS_EVAL] = 1536,
          [MESA_SHADER_GEOMETRY]  = 960,
       },
-   }
+   },
+   .simulator_id = 11,
 };
 
 static const struct gen_device_info gen_device_info_chv = {
@@ -507,7 +523,8 @@
          [MESA_SHADER_TESS_EVAL] = 384,
          [MESA_SHADER_GEOMETRY]  = 256,
       },
-   }
+   },
+   .simulator_id = 13,
 };
 
 #define GEN9_HW_INFO                                \
@@ -603,6 +620,7 @@
    .num_eu_per_subslice = 6,
    .l3_banks = 2,
    .urb.size = 192,
+   .simulator_id = 12,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt2 = {
@@ -612,6 +630,7 @@
    .num_subslices = { 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 4,
+   .simulator_id = 12,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt3 = {
@@ -621,6 +640,7 @@
    .num_subslices = { 3, 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 8,
+   .simulator_id = 12,
 };
 
 static const struct gen_device_info gen_device_info_skl_gt4 = {
@@ -639,18 +659,21 @@
     * only 1008KB of this will be used."
     */
    .urb.size = 1008 / 3,
+   .simulator_id = 12,
 };
 
 static const struct gen_device_info gen_device_info_bxt = {
    GEN9_LP_FEATURES_3X6,
    .is_broxton = true,
    .l3_banks = 2,
+   .simulator_id = 14,
 };
 
 static const struct gen_device_info gen_device_info_bxt_2x6 = {
    GEN9_LP_FEATURES_2X6,
    .is_broxton = true,
    .l3_banks = 1,
+   .simulator_id = 14,
 };
 /*
  * Note: for all KBL SKUs, the PRM says SKL for GS entries, not SKL+.
@@ -668,6 +691,7 @@
    .num_subslices = { 2, },
    .num_eu_per_subslice = 6,
    .l3_banks = 2,
+   .simulator_id = 16,
 };
 
 static const struct gen_device_info gen_device_info_kbl_gt1_5 = {
@@ -680,6 +704,7 @@
    .num_subslices = { 3, },
    .num_eu_per_subslice = 6,
    .l3_banks = 4,
+   .simulator_id = 16,
 };
 
 static const struct gen_device_info gen_device_info_kbl_gt2 = {
@@ -691,6 +716,7 @@
    .num_subslices = { 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 4,
+   .simulator_id = 16,
 };
 
 static const struct gen_device_info gen_device_info_kbl_gt3 = {
@@ -702,6 +728,7 @@
    .num_subslices = { 3, 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 8,
+   .simulator_id = 16,
 };
 
 static const struct gen_device_info gen_device_info_kbl_gt4 = {
@@ -724,18 +751,21 @@
    .num_subslices = { 3, 3, 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 12,
+   .simulator_id = 16,
 };
 
 static const struct gen_device_info gen_device_info_glk = {
    GEN9_LP_FEATURES_3X6,
    .is_geminilake = true,
    .l3_banks = 2,
+   .simulator_id = 17,
 };
 
 static const struct gen_device_info gen_device_info_glk_2x6 = {
    GEN9_LP_FEATURES_2X6,
    .is_geminilake = true,
    .l3_banks = 2,
+   .simulator_id = 17,
 };
 
 static const struct gen_device_info gen_device_info_cfl_gt1 = {
@@ -747,6 +777,7 @@
    .num_subslices = { 2, },
    .num_eu_per_subslice = 6,
    .l3_banks = 2,
+   .simulator_id = 24,
 };
 static const struct gen_device_info gen_device_info_cfl_gt2 = {
    GEN9_FEATURES,
@@ -757,6 +788,7 @@
    .num_subslices = { 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 4,
+   .simulator_id = 24,
 };
 
 static const struct gen_device_info gen_device_info_cfl_gt3 = {
@@ -768,6 +800,7 @@
    .num_subslices = { 3, 3, },
    .num_eu_per_subslice = 8,
    .l3_banks = 8,
+   .simulator_id = 24,
 };
 
 #define GEN10_HW_INFO                               \
@@ -809,24 +842,28 @@
    /* GT0.5 */
    GEN10_FEATURES(1, 1, subslices(2), 2),
    .is_cannonlake = true,
+   .simulator_id = 15,
 };
 
 static const struct gen_device_info gen_device_info_cnl_3x8 = {
    /* GT1 */
    GEN10_FEATURES(1, 1, subslices(3), 3),
    .is_cannonlake = true,
+   .simulator_id = 15,
 };
 
 static const struct gen_device_info gen_device_info_cnl_4x8 = {
    /* GT 1.5 */
    GEN10_FEATURES(1, 2, subslices(2, 2), 6),
    .is_cannonlake = true,
+   .simulator_id = 15,
 };
 
 static const struct gen_device_info gen_device_info_cnl_5x8 = {
    /* GT2 */
    GEN10_FEATURES(2, 2, subslices(3, 2), 6),
    .is_cannonlake = true,
+   .simulator_id = 15,
 };
 
 #define GEN11_HW_INFO                               \
@@ -863,18 +900,22 @@
 
 static const struct gen_device_info gen_device_info_icl_8x8 = {
    GEN11_FEATURES(2, 1, subslices(8), 8),
+   .simulator_id = 19,
 };
 
 static const struct gen_device_info gen_device_info_icl_6x8 = {
    GEN11_FEATURES(1, 1, subslices(6), 6),
+   .simulator_id = 19,
 };
 
 static const struct gen_device_info gen_device_info_icl_4x8 = {
    GEN11_FEATURES(1, 1, subslices(4), 6),
+   .simulator_id = 19,
 };
 
 static const struct gen_device_info gen_device_info_icl_1x8 = {
    GEN11_FEATURES(1, 1, subslices(1), 6),
+   .simulator_id = 19,
 };
 
 static void
@@ -1034,6 +1075,7 @@
       }
       n_subslices += devinfo->num_subslices[s];
    }
+   assert(n_subslices > 0);
 
    uint32_t eu_mask_len =
       topology->eu_stride * topology->max_subslices * topology->max_slices;
diff --git a/src/intel/dev/gen_device_info.h b/src/intel/dev/gen_device_info.h
index 40b7238..291a3cc 100644
--- a/src/intel/dev/gen_device_info.h
+++ b/src/intel/dev/gen_device_info.h
@@ -241,6 +241,11 @@
     */
    uint64_t timestamp_frequency;
 
+   /**
+    * ID to put into the .aub files.
+    */
+   int simulator_id;
+
    /** @} */
 };
 
diff --git a/src/intel/genxml/gen10.xml b/src/intel/genxml/gen10.xml
index 60238ae..541e440 100644
--- a/src/intel/genxml/gen10.xml
+++ b/src/intel/genxml/gen10.xml
@@ -2949,7 +2949,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
     <field name="Stereoscopic 3D Mode" start="63" end="63" type="bool"/>
     <field name="Display Buffer Pitch" start="38" end="47" type="uint"/>
-    <field name="Tile Parameter" start="32" end="34" type="bool"/>
+    <field name="Tile Parameter" start="32" end="34" type="uint"/>
     <field name="Display Buffer Base Address" start="76" end="95" type="address"/>
     <field name="VRR Master Flip" start="75" end="75" type="uint"/>
     <field name="Flip Type" start="64" end="65" type="uint">
@@ -3021,7 +3021,7 @@
       <value name="Display Plane 1 B" value="1"/>
       <value name="Display Plane 1 C" value="4"/>
     </field>
-    <field name="Scan Line Event Done Forward" start="17" end="18" type="bool"/>
+    <field name="Scan Line Event Done Forward" start="17" end="18" type="uint"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
     <field name="Start Scan Line Number" start="48" end="60" type="uint"/>
     <field name="End Scan Line Number" start="32" end="44" type="uint"/>
@@ -3631,6 +3631,10 @@
     <field name="3D Rendering Instruction Disable" start="0" end="0" type="bool"/>
     <field name="Media Instruction Disable" start="1" end="1" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="4" end="4" type="bool"/>
+
+    <field name="3D Rendering Instruction Disable Mask" start="16" end="16" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="20" end="20" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen11.xml b/src/intel/genxml/gen11.xml
index 893c5b2..bd3800e 100644
--- a/src/intel/genxml/gen11.xml
+++ b/src/intel/genxml/gen11.xml
@@ -2934,7 +2934,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
     <field name="Stereoscopic 3D Mode" start="63" end="63" type="bool"/>
     <field name="Display Buffer Pitch" start="38" end="47" type="uint"/>
-    <field name="Tile Parameter" start="32" end="34" type="bool"/>
+    <field name="Tile Parameter" start="32" end="34" type="uint"/>
     <field name="Display Buffer Base Address" start="76" end="95" type="address"/>
     <field name="VRR Master Flip" start="75" end="75" type="uint"/>
     <field name="Flip Type" start="64" end="65" type="uint">
@@ -3016,7 +3016,7 @@
       <value name="Display Plane 1 C" value="4"/>
       <value name="Display Plane 1 D" value="5"/>
     </field>
-    <field name="Scan Line Event Done Forward" start="17" end="18" type="bool"/>
+    <field name="Scan Line Event Done Forward" start="17" end="18" type="uint"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
     <field name="Start Scan Line Number" start="48" end="60" type="uint"/>
     <field name="End Scan Line Number" start="32" end="44" type="uint"/>
@@ -3250,9 +3250,9 @@
   <instruction name="MI_WAIT_FOR_EVENT_2" bias="1" length="1">
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="4"/>
-    <field name="Display Pipe Scan Line Wait Enable" start="12" end="14" type="bool"/>
-    <field name="Display Pipe Vertical Blank Wait Enable" start="8" end="10" type="bool"/>
-    <field name="Display Plane Flip Pending Wait Enable" start="0" end="5" type="bool"/>
+    <field name="Display Pipe Scan Line Wait Enable" start="12" end="14" type="uint"/>
+    <field name="Display Pipe Vertical Blank Wait Enable" start="8" end="10" type="uint"/>
+    <field name="Display Plane Flip Pending Wait Enable" start="0" end="5" type="uint"/>
   </instruction>
 
   <instruction name="PIPELINE_SELECT" bias="1" length="1">
@@ -3629,6 +3629,10 @@
     <field name="3D Rendering Instruction Disable" start="0" end="0" type="bool"/>
     <field name="Media Instruction Disable" start="1" end="1" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="4" end="4" type="bool"/>
+
+    <field name="3D Rendering Instruction Disable Mask" start="16" end="16" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="20" end="20" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen4.xml b/src/intel/genxml/gen4.xml
index 6f513c58..cd50a10 100644
--- a/src/intel/genxml/gen4.xml
+++ b/src/intel/genxml/gen4.xml
@@ -961,12 +961,12 @@
     <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
     <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
     <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
-    <field name="CLIP Fence" start="52" end="61" type="bool"/>
-    <field name="GS Fence" start="42" end="51" type="bool"/>
-    <field name="VS Fence" start="32" end="41" type="bool"/>
-    <field name="CS Fence" start="84" end="94" type="bool"/>
-    <field name="VFE Fence" start="74" end="83" type="bool"/>
-    <field name="SF Fence" start="64" end="73" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/>
+    <field name="GS Fence" start="42" end="51" type="uint"/>
+    <field name="VS Fence" start="32" end="41" type="uint"/>
+    <field name="CS Fence" start="84" end="94" type="uint"/>
+    <field name="VFE Fence" start="74" end="83" type="uint"/>
+    <field name="SF Fence" start="64" end="73" type="uint"/>
   </instruction>
 
   <instruction name="XY_COLOR_BLT" bias="2" length="6">
diff --git a/src/intel/genxml/gen45.xml b/src/intel/genxml/gen45.xml
index fbd57a0..4d2c1534 100644
--- a/src/intel/genxml/gen45.xml
+++ b/src/intel/genxml/gen45.xml
@@ -994,12 +994,12 @@
     <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
     <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
     <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
-    <field name="CLIP Fence" start="52" end="61" type="bool"/>
-    <field name="GS Fence" start="42" end="51" type="bool"/>
-    <field name="VS Fence" start="32" end="41" type="bool"/>
-    <field name="CS Fence" start="84" end="94" type="bool"/>
-    <field name="VFE Fence" start="74" end="83" type="bool"/>
-    <field name="SF Fence" start="64" end="73" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/>
+    <field name="GS Fence" start="42" end="51" type="uint"/>
+    <field name="VS Fence" start="32" end="41" type="uint"/>
+    <field name="CS Fence" start="84" end="94" type="uint"/>
+    <field name="VFE Fence" start="74" end="83" type="uint"/>
+    <field name="SF Fence" start="64" end="73" type="uint"/>
   </instruction>
 
   <instruction name="XY_COLOR_BLT" bias="2" length="6">
diff --git a/src/intel/genxml/gen5.xml b/src/intel/genxml/gen5.xml
index 5c93ecd..5bb5a2c 100644
--- a/src/intel/genxml/gen5.xml
+++ b/src/intel/genxml/gen5.xml
@@ -1086,12 +1086,12 @@
     <field name="CLIP Unit URB Reallocation Request" start="10" end="10" type="bool"/>
     <field name="GS Unit URB Reallocation Request" start="9" end="9" type="bool"/>
     <field name="VS Unit URB Reallocation Request" start="8" end="8" type="bool"/>
-    <field name="CLIP Fence" start="52" end="61" type="bool"/>
-    <field name="GS Fence" start="42" end="51" type="bool"/>
-    <field name="VS Fence" start="32" end="41" type="bool"/>
-    <field name="CS Fence" start="84" end="94" type="bool"/>
-    <field name="VFE Fence" start="74" end="83" type="bool"/>
-    <field name="SF Fence" start="64" end="73" type="bool"/>
+    <field name="CLIP Fence" start="52" end="61" type="uint"/>
+    <field name="GS Fence" start="42" end="51" type="uint"/>
+    <field name="VS Fence" start="32" end="41" type="uint"/>
+    <field name="CS Fence" start="84" end="94" type="uint"/>
+    <field name="VFE Fence" start="74" end="83" type="uint"/>
+    <field name="SF Fence" start="64" end="73" type="uint"/>
   </instruction>
 
   <instruction name="XY_COLOR_BLT" bias="2" length="6">
diff --git a/src/intel/genxml/gen6.xml b/src/intel/genxml/gen6.xml
index 0493221..62d2574 100644
--- a/src/intel/genxml/gen6.xml
+++ b/src/intel/genxml/gen6.xml
@@ -622,6 +622,17 @@
     <field name="Maximum VP Index" start="96" end="99" type="uint"/>
   </instruction>
 
+  <struct name="3DSTATE_CONSTANT_BODY" length="4">
+    <field name="Pointer to Constant Buffer 0" start="5" end="31" type="offset"/>
+    <field name="Constant Buffer 0 Read Length" start="0" end="4" type="uint"/>
+    <field name="Pointer to Constant Buffer 1" start="37" end="63" type="address"/>
+    <field name="Constant Buffer 1 Read Length" start="32" end="36" type="uint"/>
+    <field name="Pointer to Constant Buffer 2" start="69" end="95" type="address"/>
+    <field name="Constant Buffer 2 Read Length" start="64" end="68" type="uint"/>
+    <field name="Pointer to Constant Buffer 3" start="101" end="127" type="address"/>
+    <field name="Constant Buffer 3 Read Length" start="96" end="100" type="uint"/>
+  </struct>
+
   <instruction name="3DSTATE_CONSTANT_GS" bias="2" length="5">
     <field name="Command Type" start="29" end="31" type="uint" default="3"/>
     <field name="Command SubType" start="27" end="28" type="uint" default="3"/>
@@ -633,14 +644,7 @@
     <field name="Buffer 0 Valid" start="12" end="12" type="bool"/>
     <field name="Constant Buffer Object Control State" start="8" end="11" type="MEMORY_OBJECT_CONTROL_STATE"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <field name="Pointer to GS Constant Buffer 0" start="37" end="63" type="offset"/>
-    <field name="GS Constant Buffer 0 Read Length" start="32" end="36" type="uint"/>
-    <field name="Pointer to GS Constant Buffer 1" start="69" end="95" type="address"/>
-    <field name="GS Constant Buffer 1 Read Length" start="64" end="68" type="uint"/>
-    <field name="Pointer to GS Constant Buffer 2" start="101" end="127" type="address"/>
-    <field name="GS Constant Buffer 2 Read Length" start="96" end="100" type="uint"/>
-    <field name="Pointer to GS Constant Buffer 3" start="133" end="159" type="address"/>
-    <field name="GS Constant Buffer 3 Read Length" start="128" end="132" type="uint"/>
+    <field name="Constant Body" start="32" end="159" type="3DSTATE_CONSTANT_BODY"/>
   </instruction>
 
   <instruction name="3DSTATE_CONSTANT_PS" bias="2" length="5">
@@ -654,14 +658,7 @@
     <field name="Buffer 0 Valid" start="12" end="12" type="bool"/>
     <field name="Constant Buffer Object Control State" start="8" end="11" type="MEMORY_OBJECT_CONTROL_STATE"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <field name="Pointer to PS Constant Buffer 0" start="37" end="63" type="offset"/>
-    <field name="PS Constant Buffer 0 Read Length" start="32" end="36" type="uint"/>
-    <field name="Pointer to PS Constant Buffer 1" start="69" end="95" type="address"/>
-    <field name="PS Constant Buffer 1 Read Length" start="64" end="68" type="uint"/>
-    <field name="Pointer to PS Constant Buffer 2" start="101" end="127" type="address"/>
-    <field name="PS Constant Buffer 2 Read Length" start="96" end="100" type="uint"/>
-    <field name="Pointer to PS Constant Buffer 3" start="133" end="159" type="address"/>
-    <field name="PS Constant Buffer 3 Read Length" start="128" end="132" type="uint"/>
+    <field name="Constant Body" start="32" end="159" type="3DSTATE_CONSTANT_BODY"/>
   </instruction>
 
   <instruction name="3DSTATE_CONSTANT_VS" bias="2" length="5">
@@ -675,14 +672,7 @@
     <field name="Buffer 0 Valid" start="12" end="12" type="bool"/>
     <field name="Constant Buffer Object Control State" start="8" end="11" type="MEMORY_OBJECT_CONTROL_STATE"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
-    <field name="Pointer to VS Constant Buffer 0" start="37" end="63" type="offset"/>
-    <field name="VS Constant Buffer 0 Read Length" start="32" end="36" type="uint"/>
-    <field name="Pointer to VS Constant Buffer 1" start="69" end="95" type="address"/>
-    <field name="VS Constant Buffer 1 Read Length" start="64" end="68" type="uint"/>
-    <field name="Pointer to VS Constant Buffer 2" start="101" end="127" type="address"/>
-    <field name="VS Constant Buffer 2 Read Length" start="96" end="100" type="uint"/>
-    <field name="Pointer to VS Constant Buffer 3" start="133" end="159" type="address"/>
-    <field name="VS Constant Buffer 3 Read Length" start="128" end="132" type="uint"/>
+    <field name="Constant Body" start="32" end="159" type="3DSTATE_CONSTANT_BODY"/>
   </instruction>
 
   <instruction name="3DSTATE_DEPTH_BUFFER" bias="2" length="7">
@@ -1888,7 +1878,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -1904,7 +1894,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -1920,7 +1910,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -1972,6 +1962,11 @@
     <field name="3D Rendering Instruction Disable" start="2" end="2" type="bool"/>
     <field name="Media Instruction Disable" start="3" end="3" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="6" end="6" type="bool"/>
+
+    <field name="3D State Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="3D Rendering Instruction Disable Mask" start="18" end="18" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="19" end="19" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="22" end="22" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml
index baf42a7..6dde797 100644
--- a/src/intel/genxml/gen7.xml
+++ b/src/intel/genxml/gen7.xml
@@ -2537,7 +2537,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -2553,7 +2553,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -2569,7 +2569,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -2621,6 +2621,11 @@
     <field name="3D Rendering Instruction Disable" start="2" end="2" type="bool"/>
     <field name="Media Instruction Disable" start="3" end="3" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="6" end="6" type="bool"/>
+
+    <field name="3D State Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="3D Rendering Instruction Disable Mask" start="18" end="18" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="19" end="19" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="22" end="22" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml
index 7b635b2..dfc3d89 100644
--- a/src/intel/genxml/gen75.xml
+++ b/src/intel/genxml/gen75.xml
@@ -2314,9 +2314,9 @@
   <instruction name="MI_BATCH_BUFFER_START" bias="2" length="2">
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="49"/>
-    <field name="2nd Level Batch Buffer" start="22" end="22" type="uint">
-      <value name="1st level batch" value="0"/>
-      <value name="2nd level batch" value="1"/>
+    <field name="Second Level Batch Buffer" start="22" end="22" type="uint">
+      <value name="First level batch" value="0"/>
+      <value name="Second level batch" value="1"/>
     </field>
     <field name="Add Offset Enable" start="16" end="16" type="bool"/>
     <field name="Predication Enable" start="15" end="15" type="bool"/>
@@ -2546,7 +2546,7 @@
   <instruction name="MI_SET_PREDICATE" bias="1" length="1">
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="1"/>
-    <field name="PREDICATE ENABLE" start="0" end="1" type="bool" default="6"/>
+    <field name="PREDICATE ENABLE" start="0" end="1" type="uint"/>
   </instruction>
 
   <instruction name="MI_STORE_DATA_IMM" bias="2" length="4">
@@ -3021,7 +3021,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -3037,7 +3037,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -3053,7 +3053,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -3069,7 +3069,7 @@
       <value name="Invalid and Unloaded PD fault" value="3"/>
     </field>
     <field name= "SRCID of Fault" start="3" end="10" type="uint"/>
-    <field name="GTTSEL" start="11" end="1" type="uint">
+    <field name="GTTSEL" start="11" end="11" type="uint">
       <value name="PPGTT" value="0"/>
       <value name="GGTT" value="1"/>
     </field>
@@ -3135,6 +3135,11 @@
     <field name="3D Rendering Instruction Disable" start="2" end="2" type="bool"/>
     <field name="Media Instruction Disable" start="3" end="3" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="6" end="6" type="bool"/>
+
+    <field name="3D State Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="3D Rendering Instruction Disable Mask" start="18" end="18" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="19" end="19" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="22" end="22" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml
index 0f37570..330366b 100644
--- a/src/intel/genxml/gen8.xml
+++ b/src/intel/genxml/gen8.xml
@@ -2553,9 +2553,9 @@
   <instruction name="MI_BATCH_BUFFER_START" bias="2" length="3">
     <field name="Command Type" start="29" end="31" type="uint" default="0"/>
     <field name="MI Command Opcode" start="23" end="28" type="uint" default="49"/>
-    <field name="2nd Level Batch Buffer" start="22" end="22" type="uint">
-      <value name="1st level batch" value="0"/>
-      <value name="2nd level batch" value="1"/>
+    <field name="Second Level Batch Buffer" start="22" end="22" type="uint">
+      <value name="First level batch" value="0"/>
+      <value name="Second level batch" value="1"/>
     </field>
     <field name="Add Offset Enable" start="16" end="16" type="bool"/>
     <field name="Predication Enable" start="15" end="15" type="bool"/>
@@ -2652,7 +2652,7 @@
       <value name="Display Plane B" value="1"/>
       <value name="Display Plane C" value="4"/>
     </field>
-    <field name="Scan Line Event Done Forward" start="17" end="18" type="bool"/>
+    <field name="Scan Line Event Done Forward" start="17" end="18" type="uint"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
     <field name="Start Scan Line Number" start="48" end="60" type="uint"/>
     <field name="End Scan Line Number" start="32" end="44" type="uint"/>
@@ -3374,6 +3374,11 @@
     <field name="3D Rendering Instruction Disable" start="2" end="2" type="bool"/>
     <field name="Media Instruction Disable" start="3" end="3" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="6" end="6" type="bool"/>
+
+    <field name="3D State Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="3D Rendering Instruction Disable Mask" start="18" end="18" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="19" end="19" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="22" end="22" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml
index 7d3c74d..318ae89 100644
--- a/src/intel/genxml/gen9.xml
+++ b/src/intel/genxml/gen9.xml
@@ -2868,7 +2868,7 @@
     <field name="DWord Length" start="0" end="7" type="uint" default="1"/>
     <field name="Stereoscopic 3D Mode" start="63" end="63" type="bool"/>
     <field name="Display Buffer Pitch" start="38" end="47" type="uint"/>
-    <field name="Tile Parameter" start="32" end="34" type="bool"/>
+    <field name="Tile Parameter" start="32" end="34" type="uint"/>
     <field name="Display Buffer Base Address" start="76" end="95" type="address"/>
     <field name="Flip Type" start="64" end="65" type="uint">
       <value name="Sync Flip" value="0"/>
@@ -2939,7 +2939,7 @@
       <value name="Display Plane 1 B" value="1"/>
       <value name="Display Plane 1 C" value="4"/>
     </field>
-    <field name="Scan Line Event Done Forward" start="17" end="18" type="bool"/>
+    <field name="Scan Line Event Done Forward" start="17" end="18" type="uint"/>
     <field name="DWord Length" start="0" end="5" type="uint" default="0"/>
     <field name="Start Scan Line Number" start="48" end="60" type="uint"/>
     <field name="End Scan Line Number" start="32" end="44" type="uint"/>
@@ -3703,6 +3703,10 @@
     <field name="3D Rendering Instruction Disable" start="0" end="0" type="bool"/>
     <field name="Media Instruction Disable" start="1" end="1" type="bool"/>
     <field name="CONSTANT_BUFFER Address Offset Disable" start="4" end="4" type="bool"/>
+
+    <field name="3D Rendering Instruction Disable Mask" start="16" end="16" type="bool"/>
+    <field name="Media Instruction Disable Mask" start="17" end="17" type="bool"/>
+    <field name="CONSTANT_BUFFER Address Offset Disable Mask" start="20" end="20" type="bool"/>
   </register>
 
 </genxml>
diff --git a/src/intel/genxml/gen_bits_header.py b/src/intel/genxml/gen_bits_header.py
index faba79d..dcd6ccb 100644
--- a/src/intel/genxml/gen_bits_header.py
+++ b/src/intel/genxml/gen_bits_header.py
@@ -108,13 +108,13 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-% for _, container in sorted(containers.iteritems(), key=itemgetter(0)):
+% for _, container in sorted(containers.items(), key=itemgetter(0)):
 
 /* ${container.name} */
 
 ${emit_per_gen_prop_func(container, 'length')}
 
-% for _, field in sorted(container.fields.iteritems(), key=itemgetter(0)):
+% for _, field in sorted(container.fields.items(), key=itemgetter(0)):
 
 /* ${container.name}::${field.name} */
 
@@ -220,7 +220,7 @@
 
     def iter_prop(self, prop):
         if prop == 'length':
-            return self.length_by_gen.iteritems()
+            return self.length_by_gen.items()
         else:
             raise ValueError('Invalid property: "{0}"'.format(prop))
 
@@ -253,9 +253,9 @@
 
     def iter_prop(self, prop):
         if prop == 'bits':
-            return self.bits_by_gen.iteritems()
+            return self.bits_by_gen.items()
         elif prop == 'start':
-            return self.start_by_gen.iteritems()
+            return self.start_by_gen.items()
         else:
             raise ValueError('Invalid property: "{0}"'.format(prop))
 
@@ -282,7 +282,7 @@
         self.container = None
 
     def parse(self, filename):
-        with open(filename) as f:
+        with open(filename, 'rb') as f:
             self.parser.ParseFile(f)
 
     def start_element(self, name, attrs):
diff --git a/src/intel/genxml/gen_pack_header.py b/src/intel/genxml/gen_pack_header.py
index 8989f62..c3d712c 100644
--- a/src/intel/genxml/gen_pack_header.py
+++ b/src/intel/genxml/gen_pack_header.py
@@ -220,7 +220,7 @@
     if num_str.lower().startswith('0x'):
         return int(num_str, base=16)
     else:
-        assert(not num_str.startswith('0') and 'octals numbers not allowed')
+        assert not num_str.startswith('0'), 'octals numbers not allowed'
         return int(num_str)
 
 class Field(object):
@@ -235,13 +235,21 @@
         self.end = int(attrs["end"])
         self.type = attrs["type"]
 
+        assert self.start <= self.end, \
+               'field {} has end ({}) < start ({})'.format(self.name, self.end,
+                                                           self.start)
+        if self.type == 'bool':
+            assert self.end == self.start, \
+                   'bool field ({}) is too wide'.format(self.name)
+
         if "prefix" in attrs:
             self.prefix = attrs["prefix"]
         else:
             self.prefix = None
 
         if "default" in attrs:
-            self.default = int(attrs["default"])
+            # Base 0 recognizes 0x, 0o, 0b prefixes in addition to decimal ints.
+            self.default = int(attrs["default"], base=0)
         else:
             self.default = None
 
diff --git a/src/intel/genxml/gen_zipped_file.py b/src/intel/genxml/gen_zipped_file.py
index af2008b..6164091 100644
--- a/src/intel/genxml/gen_zipped_file.py
+++ b/src/intel/genxml/gen_zipped_file.py
@@ -42,10 +42,10 @@
     print("} genxml_files_table[] = {")
 
     xml_offset = 0
-    compressed_data = ''
+    compressed_data = b''
     for i in range(1, len(sys.argv)):
         filename = sys.argv[i]
-        xml = open(filename).read()
+        xml = open(filename, "rb").read()
         xml_length = len(xml)
         root = et.fromstring(xml)
 
@@ -62,8 +62,8 @@
     print("")
     print("static const uint8_t compress_genxmls[] = {")
     print("   ", end='')
-    for i, c in enumerate(compressed_data, start=1):
-        print("0x%.2x, " % ord(c), end='\n   ' if not i % 12 else '')
+    for i, c in enumerate(bytearray(compressed_data), start=1):
+        print("0x%.2x, " % c, end='\n   ' if not i % 12 else '')
     print('\n};')
 
 
diff --git a/src/intel/isl/gen_format_layout.py b/src/intel/isl/gen_format_layout.py
index 353d0c4..fe90a12 100755
--- a/src/intel/isl/gen_format_layout.py
+++ b/src/intel/isl/gen_format_layout.py
@@ -81,7 +81,7 @@
     % for mask in ['r', 'g', 'b', 'a', 'l', 'i', 'p']:
       <% channel = getattr(format, mask, None) %>\\
       % if channel.type is not None:
-        .${mask} = { ISL_${channel.type}, ${channel.size} },
+        .${mask} = { ISL_${channel.type}, ${channel.start}, ${channel.size} },
       % else:
         .${mask} = {},
       % endif
@@ -152,7 +152,10 @@
         else:
             grouped = self._splitter.match(line)
             self.type = self._types[grouped.group('type')].upper()
-            self.size = grouped.group('size')
+            self.size = int(grouped.group('size'))
+
+        # Default the start bit to -1
+        self.start = -1;
 
 
 class Format(object):
@@ -174,13 +177,21 @@
         self.i = Channel(line[10])
         self.p = Channel(line[11])
 
+        # Set the start bit value for each channel
+        self.order = line[12].strip()
+        bit = 0
+        for c in self.order:
+            chan = getattr(self, c)
+            chan.start = bit;
+            bit = bit + chan.size
+
         # alpha doesn't have a colorspace of it's own.
-        self.colorspace = line[12].strip().upper()
-        if self.colorspace in ['', 'ALPHA']:
+        self.colorspace = line[13].strip().upper()
+        if self.colorspace in ['']:
             self.colorspace = 'NONE'
 
         # This sets it to the line value, or if it's an empty string 'NONE'
-        self.txc = line[13].strip().upper() or 'NONE'
+        self.txc = line[14].strip().upper() or 'NONE'
 
 
 def reader(csvfile):
diff --git a/src/intel/isl/isl.c b/src/intel/isl/isl.c
index fda1c08..f164ad8 100644
--- a/src/intel/isl/isl.c
+++ b/src/intel/isl/isl.c
@@ -1642,6 +1642,8 @@
                       const struct isl_surf *surf,
                       struct isl_surf *mcs_surf)
 {
+   assert(ISL_DEV_GEN(dev) >= 7);
+
    /* It must be multisampled with an array layout */
    assert(surf->samples > 1 && surf->msaa_layout == ISL_MSAA_LAYOUT_ARRAY);
 
@@ -2308,3 +2310,121 @@
       return 5; /* D16_UNORM */
    }
 }
+
+bool
+isl_swizzle_supports_rendering(const struct gen_device_info *devinfo,
+                               struct isl_swizzle swizzle)
+{
+   if (devinfo->is_haswell) {
+      /* From the Haswell PRM,
+       * RENDER_SURFACE_STATE::Shader Channel Select Red
+       *
+       *    "The Shader channel selects also define which shader channels are
+       *    written to which surface channel. If the Shader channel select is
+       *    SCS_ZERO or SCS_ONE then it is not written to the surface. If the
+       *    shader channel select is SCS_RED it is written to the surface red
+       *    channel and so on. If more than one shader channel select is set
+       *    to the same surface channel only the first shader channel in RGBA
+       *    order will be written."
+       */
+      return true;
+   } else if (devinfo->gen <= 7) {
+      /* Ivy Bridge and earlier have no swizzling; identity only */
+      return isl_swizzle_is_identity(swizzle);
+   } else {
+      /* From the Sky Lake PRM Vol. 2d,
+       * RENDER_SURFACE_STATE::Shader Channel Select Red
+       *
+       *    "For Render Target, Red, Green and Blue Shader Channel Selects
+       *    MUST be such that only valid components can be swapped i.e. only
+       *    change the order of components in the pixel. Any other values for
+       *    these Shader Channel Select fields are not valid for Render
+       *    Targets. This also means that there MUST not be multiple shader
+       *    channels mapped to the same RT channel."
+       *
+       * From the Sky Lake PRM Vol. 2d,
+       * RENDER_SURFACE_STATE::Shader Channel Select Alpha
+       *
+       *    "For Render Target, this field MUST be programmed to
+       *    value = SCS_ALPHA."
+       */
+      return (swizzle.r == ISL_CHANNEL_SELECT_RED ||
+              swizzle.r == ISL_CHANNEL_SELECT_GREEN ||
+              swizzle.r == ISL_CHANNEL_SELECT_BLUE) &&
+             (swizzle.g == ISL_CHANNEL_SELECT_RED ||
+              swizzle.g == ISL_CHANNEL_SELECT_GREEN ||
+              swizzle.g == ISL_CHANNEL_SELECT_BLUE) &&
+             (swizzle.b == ISL_CHANNEL_SELECT_RED ||
+              swizzle.b == ISL_CHANNEL_SELECT_GREEN ||
+              swizzle.b == ISL_CHANNEL_SELECT_BLUE) &&
+             swizzle.r != swizzle.g &&
+             swizzle.r != swizzle.b &&
+             swizzle.g != swizzle.b &&
+             swizzle.a == ISL_CHANNEL_SELECT_ALPHA;
+   }
+}
+
+static enum isl_channel_select
+swizzle_select(enum isl_channel_select chan, struct isl_swizzle swizzle)
+{
+   switch (chan) {
+   case ISL_CHANNEL_SELECT_ZERO:
+   case ISL_CHANNEL_SELECT_ONE:
+      return chan; /* constant selects do not depend on the swizzle */
+   case ISL_CHANNEL_SELECT_RED:
+      return swizzle.r;
+   case ISL_CHANNEL_SELECT_GREEN:
+      return swizzle.g;
+   case ISL_CHANNEL_SELECT_BLUE:
+      return swizzle.b;
+   case ISL_CHANNEL_SELECT_ALPHA:
+      return swizzle.a;
+   default:
+      unreachable("Invalid swizzle component");
+   }
+}
+
+/**
+ * Returns the single swizzle that is equivalent to applying the two given
+ * swizzles in sequence: each select in first is remapped through second.
+ */
+struct isl_swizzle
+isl_swizzle_compose(struct isl_swizzle first, struct isl_swizzle second)
+{
+   return (struct isl_swizzle) {
+      .r = swizzle_select(first.r, second),
+      .g = swizzle_select(first.g, second),
+      .b = swizzle_select(first.b, second),
+      .a = swizzle_select(first.a, second),
+   };
+}
+
+/**
+ * Returns a swizzle that is the pseudo-inverse of this swizzle.
+ */
+struct isl_swizzle
+isl_swizzle_invert(struct isl_swizzle swizzle)
+{
+   /* Default to zero for channels which do not show up in the swizzle */
+   enum isl_channel_select chans[4] = {
+      ISL_CHANNEL_SELECT_ZERO,
+      ISL_CHANNEL_SELECT_ZERO,
+      ISL_CHANNEL_SELECT_ZERO,
+      ISL_CHANNEL_SELECT_ZERO,
+   };
+
+   /* Go in ABGR order so that, on duplicates, the first one in RGBA order
+    * wins, as Haswell hardware does for render target swizzles.  The unsigned
+    * compare skips selects outside [RED, RED + 4), which have no chans slot.
+    */
+   if ((unsigned)(swizzle.a - ISL_CHANNEL_SELECT_RED) < 4)
+      chans[swizzle.a - ISL_CHANNEL_SELECT_RED] = ISL_CHANNEL_SELECT_ALPHA;
+   if ((unsigned)(swizzle.b - ISL_CHANNEL_SELECT_RED) < 4)
+      chans[swizzle.b - ISL_CHANNEL_SELECT_RED] = ISL_CHANNEL_SELECT_BLUE;
+   if ((unsigned)(swizzle.g - ISL_CHANNEL_SELECT_RED) < 4)
+      chans[swizzle.g - ISL_CHANNEL_SELECT_RED] = ISL_CHANNEL_SELECT_GREEN;
+   if ((unsigned)(swizzle.r - ISL_CHANNEL_SELECT_RED) < 4)
+      chans[swizzle.r - ISL_CHANNEL_SELECT_RED] = ISL_CHANNEL_SELECT_RED;
+
+   return (struct isl_swizzle) { chans[0], chans[1], chans[2], chans[3] };
+}
diff --git a/src/intel/isl/isl.h b/src/intel/isl/isl.h
index 6d963d4..6800b1d 100644
--- a/src/intel/isl/isl.h
+++ b/src/intel/isl/isl.h
@@ -1008,6 +1008,7 @@
 
 struct isl_channel_layout {
    enum isl_base_type type;
+   uint8_t start_bit; /**< Bit at which this channel starts */
    uint8_t bits; /**< Size in bits */
 };
 
@@ -1569,8 +1570,28 @@
           fmtl->channels.a.bits == 0;
 }
 
+static inline bool
+isl_format_is_rgbx(enum isl_format fmt)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(fmt);
+
+   return fmtl->channels.r.bits > 0 &&
+          fmtl->channels.g.bits > 0 &&
+          fmtl->channels.b.bits > 0 &&
+          fmtl->channels.a.bits > 0 &&
+          fmtl->channels.a.type == ISL_VOID; /* alpha has bits but is void */
+}
+
 enum isl_format isl_format_rgb_to_rgba(enum isl_format rgb) ATTRIBUTE_CONST;
 enum isl_format isl_format_rgb_to_rgbx(enum isl_format rgb) ATTRIBUTE_CONST;
+enum isl_format isl_format_rgbx_to_rgba(enum isl_format rgb) ATTRIBUTE_CONST;
+
+void isl_color_value_pack(const union isl_color_value *value,
+                          enum isl_format format,
+                          uint32_t *data_out);
+void isl_color_value_unpack(union isl_color_value *value,
+                            enum isl_format format,
+                            const uint32_t *data_in);
 
 bool isl_is_storage_image_format(enum isl_format fmt);
 
@@ -1739,6 +1760,24 @@
 bool isl_color_value_is_zero_one(union isl_color_value value,
                                  enum isl_format format);
 
+static inline bool
+isl_swizzle_is_identity(struct isl_swizzle swizzle)
+{
+   return swizzle.r == ISL_CHANNEL_SELECT_RED && /* all map to themselves */
+          swizzle.g == ISL_CHANNEL_SELECT_GREEN &&
+          swizzle.b == ISL_CHANNEL_SELECT_BLUE &&
+          swizzle.a == ISL_CHANNEL_SELECT_ALPHA;
+}
+
+bool
+isl_swizzle_supports_rendering(const struct gen_device_info *devinfo,
+                               struct isl_swizzle swizzle);
+
+struct isl_swizzle
+isl_swizzle_compose(struct isl_swizzle first, struct isl_swizzle second);
+struct isl_swizzle
+isl_swizzle_invert(struct isl_swizzle swizzle);
+
 #define isl_surf_init(dev, surf, ...) \
    isl_surf_init_s((dev), (surf), \
                    &(struct isl_surf_init_info) {  __VA_ARGS__ });
diff --git a/src/intel/isl/isl_format.c b/src/intel/isl/isl_format.c
index a9b11df..968f981 100644
--- a/src/intel/isl/isl_format.c
+++ b/src/intel/isl/isl_format.c
@@ -24,8 +24,17 @@
 #include <assert.h>
 
 #include "isl.h"
+#include "isl_priv.h"
 #include "dev/gen_device_info.h"
 
+#include "main/macros.h" /* Needed for MAX3 and MAX2 for format_rgb9e5 */
+#include "util/format_srgb.h"
+#include "util/format_rgb9e5.h"
+#include "util/format_r11g11b10f.h"
+
+/* Header-only format conversion include */
+#include "main/format_utils.h"
+
 struct surface_format_info {
    bool exists;
    uint8_t sampling;
@@ -108,8 +117,8 @@
    SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_SSCALED)
    SF(  x,   x,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R32G32B32_USCALED)
    SF(  x,   x,   x,   x,   x,   x,  75,   x,   x,   x,   x,   x,   R32G32B32_SFIXED)
-   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,  60,  70,   x,  90,   R16G16B16A16_UNORM)
-   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R16G16B16A16_SNORM)
+   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,  60,  70, 110,  90,   R16G16B16A16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70, 110,  90,   R16G16B16A16_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R16G16B16A16_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R16G16B16A16_UINT)
    SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,  90,   R16G16B16A16_FLOAT)
@@ -141,13 +150,13 @@
    SF(  Y,   Y,   x,   x,   x,   x,   x,   x,  60,   x,   x,   x,   R10G10B10A2_UNORM_SRGB)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,   x, 100,   R10G10B10A2_UINT)
    SF(  Y,   Y,   x,   x,   x,   x,   Y,   x,   x,   x,   x,   x,   R10G10B10_SNORM_A2_UNORM)
-   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,  60,  70,   x,  90,   R8G8B8A8_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,  60,  70, 110,  90,   R8G8B8A8_UNORM)
    SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,  60,   x,   x, 100,   R8G8B8A8_UNORM_SRGB)
-   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R8G8B8A8_SNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70, 110,  90,   R8G8B8A8_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R8G8B8A8_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R8G8B8A8_UINT)
-   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,   x,  70,   x,  90,   R16G16_UNORM)
-   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,  90,   R16G16_SNORM)
+   SF(  Y,   Y,   x,   x,   Y,  45,   Y,   x,   x,  70, 110,  90,   R16G16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70, 110,  90,   R16G16_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,  90,   R16G16_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,  90,   R16G16_UINT)
    SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,  90,   R16G16_FLOAT)
@@ -189,12 +198,12 @@
    SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B5G5R5A1_UNORM_SRGB)
    SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   B4G4R4A4_UNORM)
    SF(  Y,   Y,   x,   x,   Y,   Y,   x,   x,   x,   x,   x,   x,   B4G4R4A4_UNORM_SRGB)
-   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,   x,   x,   R8G8_UNORM)
-   SF(  Y,   Y,   x,   Y,   Y,  60,   Y,   x,   x,  70,   x,   x,   R8G8_SNORM)
+   SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70, 110,   x,   R8G8_UNORM)
+   SF(  Y,   Y,   x,   Y,   Y,  60,   Y,   x,   x,  70, 110,   x,   R8G8_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R8G8_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R8G8_UINT)
-   SF(  Y,   Y,   Y,   x,   Y,  45,   Y,   x,  70,  70,   x,   x,   R16_UNORM)
-   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,   x,   R16_SNORM)
+   SF(  Y,   Y,   Y,   x,   Y,  45,   Y,   x,  70,  70, 110,   x,   R16_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70, 110,   x,   R16_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R16_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R16_UINT)
    SF(  Y,   Y,   x,   x,   Y,   Y,   Y,   x,   x,  70,  90,   x,   R16_FLOAT)
@@ -226,11 +235,11 @@
    SF( 80,  80,   x,   x,  90,   x,   x,   x,   x,   x,   x,   x,   A4B4G4R4_UNORM)
    SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_UINT)
    SF( 90,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   L8A8_SINT)
-   SF(  Y,   Y,   x,  45,   Y,   Y,   Y,   x,   x,  70,   x,   x,   R8_UNORM)
-   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70,   x,   x,   R8_SNORM)
+   SF(  Y,   Y,   x,  45,   Y,   Y,   Y,   x,   x,  70, 110,   x,   R8_UNORM)
+   SF(  Y,   Y,   x,   x,   Y,  60,   Y,   x,   x,  70, 110,   x,   R8_SNORM)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  90,   x,   R8_SINT)
    SF(  Y,   x,   x,   x,   Y,   x,   Y,   x,   x,  70,  75,   x,   R8_UINT)
-   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70,   x,   x,   A8_UNORM)
+   SF(  Y,   Y,   x,   Y,   Y,   Y,   x,   x,   x,  70, 110,   x,   A8_UNORM)
    SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   I8_UNORM)
    SF(  Y,   Y,   x,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   L8_UNORM)
    SF(  Y,   Y,   x,   x,   x,   x,   x,   x,   x,   x,   x,   x,   P4A4_UNORM_PALETTE0)
@@ -557,17 +566,6 @@
    if (format == ISL_FORMAT_R11G11B10_FLOAT)
       return false;
 
-   /* blorp_copy currently doesn't support formats with different bit-widths
-    * per-channel. Until that support is added, report that these formats don't
-    * support CCS_E. FIXME: Add support for these formats.
-    */
-   if (format == ISL_FORMAT_B10G10R10A2_UNORM ||
-       format == ISL_FORMAT_B10G10R10A2_UNORM_SRGB ||
-       format == ISL_FORMAT_R10G10B10A2_UNORM ||
-       format == ISL_FORMAT_R10G10B10A2_UINT) {
-      return false;
-   }
-
    return format_gen(devinfo) >= format_info[format].ccs_e;
 }
 
@@ -782,3 +780,240 @@
       return ISL_FORMAT_UNSUPPORTED;
    }
 }
+
+/* Return the RGBA counterpart of an RGBX format: the format whose name
+ * differs only in the X channel being A.  The argument must satisfy
+ * isl_format_is_rgbx(); an RGBX format missing from the table trips an
+ * assert and, when asserts are compiled out, is returned unchanged. */
+enum isl_format
+isl_format_rgbx_to_rgba(enum isl_format rgbx)
+{
+   static const enum isl_format pairs[][2] = {
+      { ISL_FORMAT_R32G32B32X32_FLOAT,  ISL_FORMAT_R32G32B32A32_FLOAT },
+      { ISL_FORMAT_R16G16B16X16_UNORM,  ISL_FORMAT_R16G16B16A16_UNORM },
+      { ISL_FORMAT_R16G16B16X16_FLOAT,  ISL_FORMAT_R16G16B16A16_FLOAT },
+      { ISL_FORMAT_B8G8R8X8_UNORM,      ISL_FORMAT_B8G8R8A8_UNORM },
+      { ISL_FORMAT_B8G8R8X8_UNORM_SRGB, ISL_FORMAT_B8G8R8A8_UNORM_SRGB },
+      { ISL_FORMAT_R8G8B8X8_UNORM,      ISL_FORMAT_R8G8B8A8_UNORM },
+      { ISL_FORMAT_R8G8B8X8_UNORM_SRGB, ISL_FORMAT_R8G8B8A8_UNORM_SRGB },
+      { ISL_FORMAT_B10G10R10X2_UNORM,   ISL_FORMAT_B10G10R10A2_UNORM },
+      { ISL_FORMAT_B5G5R5X1_UNORM,      ISL_FORMAT_B5G5R5A1_UNORM },
+      { ISL_FORMAT_B5G5R5X1_UNORM_SRGB, ISL_FORMAT_B5G5R5A1_UNORM_SRGB },
+   };
+
+   assert(isl_format_is_rgbx(rgbx));
+
+   for (unsigned i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++) {
+      if (pairs[i][0] == rgbx)
+         return pairs[i][1];
+   }
+
+   /* Not a format this table knows how to convert. */
+   assert(!"Invalid RGBX format");
+   return rgbx;
+}
+
+/* Convert component i of value to the representation described by layout
+ * and OR the resulting bits into data_out at layout->start_bit.  Because the
+ * bits are OR'd in, the destination is expected to be zero beforehand. */
+static inline void
+pack_channel(const union isl_color_value *value, unsigned i,
+             const struct isl_channel_layout *layout,
+             enum isl_colorspace colorspace,
+             uint32_t data_out[4])
+{
+   if (layout->type == ISL_VOID)
+      return;
+
+   if (colorspace == ISL_COLORSPACE_SRGB)
+      assert(layout->type == ISL_UNORM);
+
+   uint32_t packed;
+   switch (layout->type) {
+   case ISL_UNORM:
+      if (colorspace != ISL_COLORSPACE_SRGB) {
+         packed = _mesa_float_to_unorm(value->f32[i], layout->bits);
+      } else if (layout->bits == 8) {
+         packed = util_format_linear_float_to_srgb_8unorm(value->f32[i]);
+      } else {
+         float srgb = util_format_linear_to_srgb_float(value->f32[i]);
+         packed = _mesa_float_to_unorm(srgb, layout->bits);
+      }
+      break;
+   case ISL_SNORM:
+      packed = _mesa_float_to_snorm(value->f32[i], layout->bits);
+      break;
+   case ISL_SFLOAT:
+      assert(layout->bits == 16 || layout->bits == 32);
+      packed = layout->bits == 16 ? _mesa_float_to_half(value->f32[i])
+                                  : value->u32[i];
+      break;
+   case ISL_UINT:
+      packed = MIN(value->u32[i], MAX_UINT(layout->bits));
+      break;
+   case ISL_SINT: {
+      /* Clamp in the signed domain: mixing the unsigned u32 with the negative
+       * lower bound makes the comparison unsigned and defeats the clamp. */
+      const int32_t sint = (int32_t)value->u32[i];
+      packed = MIN(MAX(sint, MIN_INT(layout->bits)), MAX_INT(layout->bits));
+      break;
+   }
+   default:
+      unreachable("Invalid channel type");
+   }
+
+   const unsigned dword = layout->start_bit / 32;
+   const unsigned bit = layout->start_bit % 32;
+   assert(bit + layout->bits <= 32);
+   data_out[dword] |= (packed & MAX_UINT(layout->bits)) << bit;
+}
+
+/**
+ * Pack one isl_color_value into the raw bits that isl_format stores.  Far
+ * too slow for bulk conversion, but fine for a single pixel worth of data.
+ */
+void
+isl_color_value_pack(const union isl_color_value *value,
+                     enum isl_format format,
+                     uint32_t *data_out)
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+   const enum isl_colorspace cs = fmtl->colorspace;
+   assert(cs == ISL_COLORSPACE_LINEAR || cs == ISL_COLORSPACE_SRGB);
+   assert(!isl_format_is_compressed(format));
+
+   memset(data_out, 0, isl_align(fmtl->bpb, 32) / 8);
+
+   if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
+      data_out[0] = float3_to_rgb9e5(value->f32);
+      return;
+   }
+   if (format == ISL_FORMAT_R11G11B10_FLOAT) {
+      data_out[0] = float3_to_r11g11b10f(value->f32);
+      return;
+   }
+
+   pack_channel(value, 0, &fmtl->channels.r, cs, data_out);
+   pack_channel(value, 1, &fmtl->channels.g, cs, data_out);
+   pack_channel(value, 2, &fmtl->channels.b, cs, data_out);
+   pack_channel(value, 3, &fmtl->channels.a, ISL_COLORSPACE_LINEAR, data_out);
+   pack_channel(value, 0, &fmtl->channels.l, cs, data_out);
+   pack_channel(value, 0, &fmtl->channels.i, ISL_COLORSPACE_LINEAR, data_out);
+   assert(fmtl->channels.p.bits == 0);
+}
+
+/** Sign-extend the low `bits` bits of x to 32 bits (1 <= bits <= 32). */
+static inline int32_t
+sign_extend(int32_t x, unsigned bits)
+{
+   if (bits >= 32)
+      return x;
+   /* Shift left as unsigned: shifting a signed value into or past the sign
+    * bit is undefined.  The signed right shift then copies bit (bits - 1). */
+   const unsigned shift = 32 - bits;
+   return (int32_t)((uint32_t)x << shift) >> shift;
+}
+
+/* Extract the channel described by layout from data_in, convert it to the
+ * 32-bit representation used by isl_color_value, and store the result in
+ * count consecutive components of value beginning at index start.  Channels
+ * of type ISL_VOID leave value untouched. */
+static inline void
+unpack_channel(union isl_color_value *value,
+               unsigned start, unsigned count,
+               const struct isl_channel_layout *layout,
+               enum isl_colorspace colorspace,
+               const uint32_t *data_in)
+{
+   if (layout->type == ISL_VOID)
+      return;
+
+   unsigned dword = layout->start_bit / 32;
+   unsigned bit = layout->start_bit % 32;
+   assert(bit + layout->bits <= 32);
+   uint32_t packed = (data_in[dword] >> bit) & MAX_UINT(layout->bits);
+
+   union {
+      uint32_t u32;
+      float f32;
+   } unpacked;
+
+   if (colorspace == ISL_COLORSPACE_SRGB)
+      assert(layout->type == ISL_UNORM);
+
+   switch (layout->type) {
+   case ISL_UNORM:
+      if (colorspace != ISL_COLORSPACE_SRGB) {
+         unpacked.f32 = _mesa_unorm_to_float(packed, layout->bits);
+      } else if (layout->bits == 8) {
+         unpacked.f32 = util_format_srgb_8unorm_to_linear_float(packed);
+      } else {
+         float srgb = _mesa_unorm_to_float(packed, layout->bits);
+         unpacked.f32 = util_format_srgb_to_linear_float(srgb);
+      }
+      break;
+   case ISL_SNORM:
+      unpacked.f32 = _mesa_snorm_to_float(sign_extend(packed, layout->bits),
+                                          layout->bits);
+      break;
+   case ISL_SFLOAT:
+      assert(layout->bits == 16 || layout->bits == 32);
+      if (layout->bits == 16) {
+         unpacked.f32 = _mesa_half_to_float(packed);
+      } else {
+         unpacked.u32 = packed;
+      }
+      break;
+   case ISL_UINT:
+      unpacked.u32 = packed;
+      break;
+   case ISL_SINT:
+      unpacked.u32 = sign_extend(packed, layout->bits);
+      break;
+   default:
+      unreachable("Invalid channel type");
+   }
+
+   for (unsigned i = 0; i < count; i++)
+      value->u32[start + i] = unpacked.u32;
+}
+
+/**
+ * Unpack an isl_color_value from the raw bits stored by isl_format.  Far too
+ * slow for bulk conversion, but fine for a single pixel worth of data.
+ */
+void
+isl_color_value_unpack(union isl_color_value *value,
+                       enum isl_format format,
+                       const uint32_t data_in[4])
+{
+   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
+   const enum isl_colorspace cs = fmtl->colorspace;
+   assert(cs == ISL_COLORSPACE_LINEAR || cs == ISL_COLORSPACE_SRGB);
+   assert(!isl_format_is_compressed(format));
+
+   /* Start from opaque black so components the format does not provide read
+    * as (0, 0, 0, 1); the 1 is an integer or a float to match the format. */
+   memset(value, 0, sizeof(*value));
+   if (isl_format_has_int_channel(format))
+      value->u32[3] = 1u;
+   else
+      value->f32[3] = 1.0f;
+
+   if (format == ISL_FORMAT_R9G9B9E5_SHAREDEXP) {
+      rgb9e5_to_float3(data_in[0], value->f32);
+      return;
+   }
+   if (format == ISL_FORMAT_R11G11B10_FLOAT) {
+      r11g11b10f_to_float3(data_in[0], value->f32);
+      return;
+   }
+
+   unpack_channel(value, 0, 1, &fmtl->channels.r, cs, data_in);
+   unpack_channel(value, 1, 1, &fmtl->channels.g, cs, data_in);
+   unpack_channel(value, 2, 1, &fmtl->channels.b, cs, data_in);
+   unpack_channel(value, 3, 1, &fmtl->channels.a, ISL_COLORSPACE_LINEAR, data_in);
+   unpack_channel(value, 0, 3, &fmtl->channels.l, cs, data_in);
+   unpack_channel(value, 0, 4, &fmtl->channels.i, ISL_COLORSPACE_LINEAR, data_in);
+   assert(fmtl->channels.p.bits == 0);
+}
diff --git a/src/intel/isl/isl_format_layout.csv b/src/intel/isl/isl_format_layout.csv
index ebb3d22..0b9421e 100644
--- a/src/intel/isl/isl_format_layout.csv
+++ b/src/intel/isl/isl_format_layout.csv
@@ -63,282 +63,282 @@
 #     :AlignCtrl lr+ p8000000000000P1
 #     /^# name/,$ Align,
 
-# name                      , bpb, bw, bh, bd,    r,    g,    b,    a,    l,    i,   p,  space,   txc
-R32G32B32A32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32, sf32,     ,     ,    , linear,
-R32G32B32A32_SINT           , 128,  1,  1,  1, si32, si32, si32, si32,     ,     ,    , linear,
-R32G32B32A32_UINT           , 128,  1,  1,  1, ui32, ui32, ui32, ui32,     ,     ,    , linear,
-R32G32B32A32_UNORM          , 128,  1,  1,  1, un32, un32, un32, un32,     ,     ,    , linear,
-R32G32B32A32_SNORM          , 128,  1,  1,  1, sn32, sn32, sn32, sn32,     ,     ,    , linear,
-R64G64_FLOAT                , 128,  1,  1,  1, sf64, sf64,     ,     ,     ,     ,    , linear,
-R32G32B32X32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32,  x32,     ,     ,    , linear,
-R32G32B32A32_SSCALED        , 128,  1,  1,  1, ss32, ss32, ss32, ss32,     ,     ,    , linear,
-R32G32B32A32_USCALED        , 128,  1,  1,  1, us32, us32, us32, us32,     ,     ,    , linear,
-R32G32B32A32_SFIXED         , 128,  1,  1,  1, sx32, sx32, sx32, sx32,     ,     ,    , linear,
-R64G64_PASSTHRU             , 128,  1,  1,  1,  r64,  r64,     ,     ,     ,     ,    ,       ,
-R32G32B32_FLOAT             ,  96,  1,  1,  1, sf32, sf32, sf32,     ,     ,     ,    , linear,
-R32G32B32_SINT              ,  96,  1,  1,  1, si32, si32, si32,     ,     ,     ,    , linear,
-R32G32B32_UINT              ,  96,  1,  1,  1, ui32, ui32, ui32,     ,     ,     ,    , linear,
-R32G32B32_UNORM             ,  96,  1,  1,  1, un32, un32, un32,     ,     ,     ,    , linear,
-R32G32B32_SNORM             ,  96,  1,  1,  1, sn32, sn32, sn32,     ,     ,     ,    , linear,
-R32G32B32_SSCALED           ,  96,  1,  1,  1, ss32, ss32, ss32,     ,     ,     ,    , linear,
-R32G32B32_USCALED           ,  96,  1,  1,  1, us32, us32, us32,     ,     ,     ,    , linear,
-R32G32B32_SFIXED            ,  96,  1,  1,  1, sx32, sx32, sx32,     ,     ,     ,    , linear,
-R16G16B16A16_UNORM          ,  64,  1,  1,  1, un16, un16, un16, un16,     ,     ,    , linear,
-R16G16B16A16_SNORM          ,  64,  1,  1,  1, sn16, sn16, sn16, sn16,     ,     ,    , linear,
-R16G16B16A16_SINT           ,  64,  1,  1,  1, si16, si16, si16, si16,     ,     ,    , linear,
-R16G16B16A16_UINT           ,  64,  1,  1,  1, ui16, ui16, ui16, ui16,     ,     ,    , linear,
-R16G16B16A16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,
-R32G32_FLOAT                ,  64,  1,  1,  1, sf32, sf32,     ,     ,     ,     ,    , linear,
-R32G32_SINT                 ,  64,  1,  1,  1, si32, si32,     ,     ,     ,     ,    , linear,
-R32G32_UINT                 ,  64,  1,  1,  1, ui32, ui32,     ,     ,     ,     ,    , linear,
-R32_FLOAT_X8X24_TYPELESS    ,  64,  1,  1,  1, sf32,   x8,  x24,     ,     ,     ,    , linear,
-X32_TYPELESS_G8X24_UINT     ,  64,  1,  1,  1,  x32,  ui8,  x24,     ,     ,     ,    , linear,
-L32A32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32, sf32,     ,    , linear,
-R32G32_UNORM                ,  64,  1,  1,  1, un32, un32,     ,     ,     ,     ,    , linear,
-R32G32_SNORM                ,  64,  1,  1,  1, sn32, sn32,     ,     ,     ,     ,    , linear,
-R64_FLOAT                   ,  64,  1,  1,  1, sf64,     ,     ,     ,     ,     ,    , linear,
-R16G16B16X16_UNORM          ,  64,  1,  1,  1, un16, un16, un16,  x16,     ,     ,    , linear,
-R16G16B16X16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16,  x16,     ,     ,    , linear,
-A32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32,  x32,     ,    ,  alpha,
-L32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32, sf32,     ,    , linear,
-I32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32,     , sf32,    , linear,
-R16G16B16A16_SSCALED        ,  64,  1,  1,  1, ss16, ss16, ss16, ss16,     ,     ,    , linear,
-R16G16B16A16_USCALED        ,  64,  1,  1,  1, us16, us16, us16, us16,     ,     ,    , linear,
-R32G32_SSCALED              ,  64,  1,  1,  1, ss32, ss32,     ,     ,     ,     ,    , linear,
-R32G32_USCALED              ,  64,  1,  1,  1, us32, us32,     ,     ,     ,     ,    , linear,
-R32G32_FLOAT_LD             ,  64,  1,  1,  1, sf32, sf32,     ,     ,     ,     ,    , linear,
-R32G32_SFIXED               ,  64,  1,  1,  1, sx32, sx32,     ,     ,     ,     ,    , linear,
-R64_PASSTHRU                ,  64,  1,  1,  1,  r64,     ,     ,     ,     ,     ,    ,       ,
-B8G8R8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,
-B8G8R8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,
-R10G10B10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    , linear,
-R10G10B10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,   srgb,
-R10G10B10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    , linear,
-R10G10B10_SNORM_A2_UNORM    ,  32,  1,  1,  1, sn10, sn10, sn10,  un2,     ,     ,    , linear,
-R8G8B8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,
-R8G8B8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,
-R8G8B8A8_SNORM              ,  32,  1,  1,  1,  sn8,  sn8,  sn8,  sn8,     ,     ,    , linear,
-R8G8B8A8_SINT               ,  32,  1,  1,  1,  si8,  si8,  si8,  si8,     ,     ,    , linear,
-R8G8B8A8_UINT               ,  32,  1,  1,  1,  ui8,  ui8,  ui8,  ui8,     ,     ,    , linear,
-R16G16_UNORM                ,  32,  1,  1,  1, un16, un16,     ,     ,     ,     ,    , linear,
-R16G16_SNORM                ,  32,  1,  1,  1, sn16, sn16,     ,     ,     ,     ,    , linear,
-R16G16_SINT                 ,  32,  1,  1,  1, si16, si16,     ,     ,     ,     ,    , linear,
-R16G16_UINT                 ,  32,  1,  1,  1, ui16, ui16,     ,     ,     ,     ,    , linear,
-R16G16_FLOAT                ,  32,  1,  1,  1, sf16, sf16,     ,     ,     ,     ,    , linear,
-B10G10R10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    , linear,
-B10G10R10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,   srgb,
-R11G11B10_FLOAT             ,  32,  1,  1,  1, uf11, uf11, uf10,     ,     ,     ,    , linear,
-R32_SINT                    ,  32,  1,  1,  1, si32,     ,     ,     ,     ,     ,    , linear,
-R32_UINT                    ,  32,  1,  1,  1, ui32,     ,     ,     ,     ,     ,    , linear,
-R32_FLOAT                   ,  32,  1,  1,  1, sf32,     ,     ,     ,     ,     ,    , linear,
-R24_UNORM_X8_TYPELESS       ,  32,  1,  1,  1, un24,   x8,     ,     ,     ,     ,    , linear,
-X24_TYPELESS_G8_UINT        ,  32,  1,  1,  1,  x24,  ui8,     ,     ,     ,     ,    , linear,
-L32_UNORM                   ,  32,  1,  1,  1,     ,     ,     ,     , un32,     ,    , linear,
-A32_UNORM                   ,  32,  1,  1,  1,     ,     ,     , un32,     ,     ,    ,  alpha,
-L16A16_UNORM                ,  32,  1,  1,  1,     ,     ,     , un16, un16,     ,    , linear,
-I24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8,     , un24,    , linear,
-L24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8, un24,     ,    , linear,
-A24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     , un24,   x8,     ,    ,  alpha,
-I32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     ,     , sf32,    , linear,
-L32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     , sf32,     ,    , linear,
-A32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     , sf32,     ,     ,    ,  alpha,
-X8B8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    , linear,
-A8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,   x8,  un8,     ,     ,    , linear,
-B8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    , linear,
-B8G8R8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    , linear,
-B8G8R8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,   srgb,
-R8G8B8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    , linear,
-R8G8B8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,   srgb,
-R9G9B9E5_SHAREDEXP          ,  32,  1,  1,  1,  uf9,  uf9,  uf9,     ,     ,     ,    , linear,
-B10G10R10X2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,   x2,     ,     ,    , linear,
-L16A16_FLOAT                ,  32,  1,  1,  1,     ,     ,     , sf16, sf16,     ,    , linear,
-R32_UNORM                   ,  32,  1,  1,  1, un32,     ,     ,     ,     ,     ,    , linear,
-R32_SNORM                   ,  32,  1,  1,  1, sn32,     ,     ,     ,     ,     ,    , linear,
-R10G10B10X2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,   x2,     ,     ,    , linear,
-R8G8B8A8_SSCALED            ,  32,  1,  1,  1,  ss8,  ss8,  ss8,  ss8,     ,     ,    , linear,
-R8G8B8A8_USCALED            ,  32,  1,  1,  1,  us8,  us8,  us8,  us8,     ,     ,    , linear,
-R16G16_SSCALED              ,  32,  1,  1,  1, ss16,  ss6,     ,     ,     ,     ,    , linear,
-R16G16_USCALED              ,  32,  1,  1,  1, us16, us16,     ,     ,     ,     ,    , linear,
-R32_SSCALED                 ,  32,  1,  1,  1, ss32,     ,     ,     ,     ,     ,    , linear,
-R32_USCALED                 ,  32,  1,  1,  1, us32,     ,     ,     ,     ,     ,    , linear,
-B5G6R5_UNORM                ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    , linear,
-B5G6R5_UNORM_SRGB           ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    ,   srgb,
-B5G5R5A1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    , linear,
-B5G5R5A1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    ,   srgb,
-B4G4R4A4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,
-B4G4R4A4_UNORM_SRGB         ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,
-R8G8_UNORM                  ,  16,  1,  1,  1,  un8,  un8,     ,     ,     ,     ,    , linear,
-R8G8_SNORM                  ,  16,  1,  1,  1,  sn8,  sn8,     ,     ,     ,     ,    , linear,
-R8G8_SINT                   ,  16,  1,  1,  1,  si8,  si8,     ,     ,     ,     ,    , linear,
-R8G8_UINT                   ,  16,  1,  1,  1,  ui8,  ui8,     ,     ,     ,     ,    , linear,
-R16_UNORM                   ,  16,  1,  1,  1, un16,     ,     ,     ,     ,     ,    , linear,
-R16_SNORM                   ,  16,  1,  1,  1, sn16,     ,     ,     ,     ,     ,    , linear,
-R16_SINT                    ,  16,  1,  1,  1, si16,     ,     ,     ,     ,     ,    , linear,
-R16_UINT                    ,  16,  1,  1,  1, ui16,     ,     ,     ,     ,     ,    , linear,
-R16_FLOAT                   ,  16,  1,  1,  1, sf16,     ,     ,     ,     ,     ,    , linear,
-A8P8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
-A8P8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
-I16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , un16,    , linear,
-L16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     , un16,     ,    , linear,
-A16_UNORM                   ,  16,  1,  1,  1,     ,     ,     , un16,     ,     ,    ,  alpha,
-L8A8_UNORM                  ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    , linear,
-I16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , sf16,    , linear,
-L16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     , sf16,     ,    , linear,
-A16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     , sf16,     ,     ,    ,  alpha,
-L8A8_UNORM_SRGB             ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    ,   srgb,
-R5G5_SNORM_B6_UNORM         ,  16,  1,  1,  1,  sn5,  sn5,  un6,     ,     ,     ,    , linear,
-B5G5R5X1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    , linear,
-B5G5R5X1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    ,   srgb,
-R8G8_SSCALED                ,  16,  1,  1,  1,  ss8,  ss8,     ,     ,     ,     ,    , linear,
-R8G8_USCALED                ,  16,  1,  1,  1,  us8,  us8,     ,     ,     ,     ,    , linear,
-R16_SSCALED                 ,  16,  1,  1,  1, ss16,     ,     ,     ,     ,     ,    , linear,
-R16_USCALED                 ,  16,  1,  1,  1, us16,     ,     ,     ,     ,     ,    , linear,
-P8A8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
-P8A8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8, linear,
-A1B5G5R5_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    , linear,
-A4B4G4R4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,
-L8A8_UINT                   ,  16,  1,  1,  1,     ,     ,     ,  ui8,  ui8,     ,    , linear,
-L8A8_SINT                   ,  16,  1,  1,  1,     ,     ,     ,  si8,  si8,     ,    , linear,
-R8_UNORM                    ,   8,  1,  1,  1,  un8,     ,     ,     ,     ,     ,    , linear,
-R8_SNORM                    ,   8,  1,  1,  1,  sn8,     ,     ,     ,     ,     ,    , linear,
-R8_SINT                     ,   8,  1,  1,  1,  si8,     ,     ,     ,     ,     ,    , linear,
-R8_UINT                     ,   8,  1,  1,  1,  ui8,     ,     ,     ,     ,     ,    , linear,
-A8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,  un8,     ,     ,    ,  alpha,
-I8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  un8,    , linear,
-L8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    , linear,
-P4A4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
-A4P4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
-R8_SSCALED                  ,   8,  1,  1,  1,  ss8,     ,     ,     ,     ,     ,    , linear,
-R8_USCALED                  ,   8,  1,  1,  1,  us8,     ,     ,     ,     ,     ,    , linear,
-P8_UNORM_PALETTE0           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8, linear,
-L8_UNORM_SRGB               ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    , linear,
-P8_UNORM_PALETTE1           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8, linear,
-P4A4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
-A4P4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4, linear,
-Y8_UNORM                    ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
-L8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  ui8,     ,    , linear,
-L8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  si8,     ,    , linear,
-I8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  ui8,    , linear,
-I8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  si8,    , linear,
-DXT1_RGB_SRGB               ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    ,   srgb,  dxt1
-R1_UNORM                    ,   1,  1,  1,  1,  un1,     ,     ,     ,     ,     ,    , linear,
-YCRCB_NORMAL                ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,    yuv,
-YCRCB_SWAPUVY               ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,    yuv,
-P2_UNORM_PALETTE0           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2, linear,
-P2_UNORM_PALETTE1           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2, linear,
-BC1_UNORM                   ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt1
-BC2_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt3
-BC3_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    , linear,  dxt5
-BC4_UNORM                   ,  64,  4,  4,  1,  un8,     ,     ,     ,     ,     ,    , linear, rgtc1
-BC5_UNORM                   , 128,  4,  4,  1,  un8,  un8,     ,     ,     ,     ,    , linear, rgtc2
-BC1_UNORM_SRGB              ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt1
-BC2_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt3
-BC3_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,   srgb,  dxt5
-MONO8                       ,   1,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,
-YCRCB_SWAPUV                ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,    yuv,
-YCRCB_SWAPY                 ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,    yuv,
-DXT1_RGB                    ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    , linear,  dxt1
-FXT1                        , 128,  8,  4,  1,  un4,  un4,  un4,     ,     ,     ,    , linear,  fxt1
-R8G8B8_UNORM                ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,
-R8G8B8_SNORM                ,  24,  1,  1,  1,  sn8,  sn8,  sn8,     ,     ,     ,    , linear,
-R8G8B8_SSCALED              ,  24,  1,  1,  1,  ss8,  ss8,  ss8,     ,     ,     ,    , linear,
-R8G8B8_USCALED              ,  24,  1,  1,  1,  us8,  us8,  us8,     ,     ,     ,    , linear,
-R64G64B64A64_FLOAT          , 256,  1,  1,  1, sf64, sf64, sf64, sf64,     ,     ,    , linear,
-R64G64B64_FLOAT             , 196,  1,  1,  1, sf64, sf64, sf64,     ,     ,     ,    , linear,
-BC4_SNORM                   ,  64,  4,  4,  1,  sn8,     ,     ,     ,     ,     ,    , linear, rgtc1
-BC5_SNORM                   , 128,  4,  4,  1,  sn8,  sn8,     ,     ,     ,     ,    , linear, rgtc2
-R16G16B16_FLOAT             ,  48,  1,  1,  1, sf16, sf16, sf16,     ,     ,     ,    , linear,
-R16G16B16_UNORM             ,  48,  1,  1,  1, un16, un16, un16,     ,     ,     ,    , linear,
-R16G16B16_SNORM             ,  48,  1,  1,  1, sn16, sn16, sn16,     ,     ,     ,    , linear,
-R16G16B16_SSCALED           ,  48,  1,  1,  1, ss16, ss16, ss16,     ,     ,     ,    , linear,
-R16G16B16_USCALED           ,  48,  1,  1,  1, us16, us16, us16,     ,     ,     ,    , linear,
-BC6H_SF16                   , 128,  4,  4,  1, sf16, sf16, sf16,     ,     ,     ,    , linear,  bptc
-BC7_UNORM                   , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,  bptc
-BC7_UNORM_SRGB              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  bptc
-BC6H_UF16                   , 128,  4,  4,  1, uf16, uf16, uf16,     ,     ,     ,    , linear,  bptc
-PLANAR_420_8                ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,    yuv,
-R8G8B8_UNORM_SRGB           ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,   srgb,
-ETC1_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,  etc1
-ETC2_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    , linear,  etc2
-EAC_R11                     ,  64,  4,  4,  1, un11,     ,     ,     ,     ,     ,    , linear,  etc2
-EAC_RG11                    , 128,  4,  4,  1, un11, un11,     ,     ,     ,     ,    , linear,  etc2
-EAC_SIGNED_R11              ,  64,  4,  4,  1, sn11,     ,     ,     ,     ,     ,    , linear,  etc2
-EAC_SIGNED_RG11             , 128,  4,  4,  1, sn11, sn11,     ,     ,     ,     ,    , linear,  etc2
-ETC2_SRGB8                  ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    ,   srgb,  etc2
-R16G16B16_UINT              ,  48,  1,  1,  1, ui16, ui16, ui16,     ,     ,     ,    , linear,
-R16G16B16_SINT              ,  48,  1,  1,  1, si16, si16, si16,     ,     ,     ,    , linear,
-R32_SFIXED                  ,  32,  1,  1,  1, sx16,     ,     ,     ,     ,     ,    , linear,
-R10G10B10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    , linear,
-R10G10B10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    , linear,
-R10G10B10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    , linear,
-R10G10B10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    , linear,
-B10G10R10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    , linear,
-B10G10R10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    , linear,
-B10G10R10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    , linear,
-B10G10R10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    , linear,
-B10G10R10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    , linear,
-R64G64B64A64_PASSTHRU       , 256,  1,  1,  1,  r64,  r64,  r64,  r64,     ,     ,    ,       ,
-R64G64B64_PASSTHRU          , 192,  1,  1,  1,  r64,  r64,  r64,     ,     ,     ,    ,       ,
-ETC2_RGB8_PTA               ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    , linear,  etc2
-ETC2_SRGB8_PTA              ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    ,   srgb,  etc2
-ETC2_EAC_RGBA8              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    , linear,  etc2
-ETC2_EAC_SRGB8_A8           , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  etc2
-R8G8B8_UINT                 ,  24,  1,  1,  1,  ui8,  ui8,  ui8,     ,     ,     ,    , linear,
-R8G8B8_SINT                 ,  24,  1,  1,  1,  si8,  si8,  si8,     ,     ,     ,    , linear,
-RAW                         ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,       ,
-ASTC_LDR_2D_4X4_U8SRGB      , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_5X4_U8SRGB      , 128,  5,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_5X5_U8SRGB      , 128,  5,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_6X5_U8SRGB      , 128,  6,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_6X6_U8SRGB      , 128,  6,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_8X5_U8SRGB      , 128,  8,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_8X6_U8SRGB      , 128,  8,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_8X8_U8SRGB      , 128,  8,  8,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_10X5_U8SRGB     , 128, 10,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_10X6_U8SRGB     , 128, 10,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_10X8_U8SRGB     , 128, 10,  8,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_10X10_U8SRGB    , 128, 10, 10,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_12X10_U8SRGB    , 128, 12, 10,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_12X12_U8SRGB    , 128, 12, 12,  1,  un8,  un8,  un8,  un8,     ,     ,    ,   srgb,  astc
-ASTC_LDR_2D_4X4_FLT16       , 128,  4,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_5X4_FLT16       , 128,  5,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_5X5_FLT16       , 128,  5,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_6X5_FLT16       , 128,  6,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_6X6_FLT16       , 128,  6,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_8X5_FLT16       , 128,  8,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_8X6_FLT16       , 128,  8,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_8X8_FLT16       , 128,  8,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_10X5_FLT16      , 128, 10,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_10X6_FLT16      , 128, 10,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_10X8_FLT16      , 128, 10,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_LDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_4X4_FLT16       , 128,  4,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_5X4_FLT16       , 128,  5,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_5X5_FLT16       , 128,  5,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_6X5_FLT16       , 128,  6,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_6X6_FLT16       , 128,  6,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_8X5_FLT16       , 128,  8,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_8X6_FLT16       , 128,  8,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_8X8_FLT16       , 128,  8,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_10X5_FLT16      , 128, 10,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_10X6_FLT16      , 128, 10,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_10X8_FLT16      , 128, 10,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-ASTC_HDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    , linear,  astc
-HIZ                         , 128,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   hiz
-MCS_2X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
-MCS_4X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
-MCS_8X                      ,  32,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
-MCS_16X                     ,  64,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,       ,   mcs
-GEN7_CCS_32BPP_X            ,   1, 16,  2,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN7_CCS_64BPP_X            ,   1,  8,  2,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN7_CCS_128BPP_X           ,   1,  4,  2,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN7_CCS_32BPP_Y            ,   1,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN7_CCS_64BPP_Y            ,   1,  4,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN7_CCS_128BPP_Y           ,   1,  2,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN9_CCS_32BPP              ,   2,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN9_CCS_64BPP              ,   2,  4,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
-GEN9_CCS_128BPP             ,   2,  2,  4,  1,     ,     ,     ,     ,     ,     ,    ,       ,   ccs
+# name                      , bpb, bw, bh, bd,    r,    g,    b,    a,    l,    i,   p, order,  space,   txc
+R32G32B32A32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32, sf32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_SINT           , 128,  1,  1,  1, si32, si32, si32, si32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_UINT           , 128,  1,  1,  1, ui32, ui32, ui32, ui32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_UNORM          , 128,  1,  1,  1, un32, un32, un32, un32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_SNORM          , 128,  1,  1,  1, sn32, sn32, sn32, sn32,     ,     ,    ,  rgba, linear,
+R64G64_FLOAT                , 128,  1,  1,  1, sf64, sf64,     ,     ,     ,     ,    ,    rg, linear,
+R32G32B32X32_FLOAT          , 128,  1,  1,  1, sf32, sf32, sf32,  x32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_SSCALED        , 128,  1,  1,  1, ss32, ss32, ss32, ss32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_USCALED        , 128,  1,  1,  1, us32, us32, us32, us32,     ,     ,    ,  rgba, linear,
+R32G32B32A32_SFIXED         , 128,  1,  1,  1, sx32, sx32, sx32, sx32,     ,     ,    ,  rgba, linear,
+R64G64_PASSTHRU             , 128,  1,  1,  1,  r64,  r64,     ,     ,     ,     ,    ,    rg,       ,
+R32G32B32_FLOAT             ,  96,  1,  1,  1, sf32, sf32, sf32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_SINT              ,  96,  1,  1,  1, si32, si32, si32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_UINT              ,  96,  1,  1,  1, ui32, ui32, ui32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_UNORM             ,  96,  1,  1,  1, un32, un32, un32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_SNORM             ,  96,  1,  1,  1, sn32, sn32, sn32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_SSCALED           ,  96,  1,  1,  1, ss32, ss32, ss32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_USCALED           ,  96,  1,  1,  1, us32, us32, us32,     ,     ,     ,    ,   rgb, linear,
+R32G32B32_SFIXED            ,  96,  1,  1,  1, sx32, sx32, sx32,     ,     ,     ,    ,   rgb, linear,
+R16G16B16A16_UNORM          ,  64,  1,  1,  1, un16, un16, un16, un16,     ,     ,    ,  rgba, linear,
+R16G16B16A16_SNORM          ,  64,  1,  1,  1, sn16, sn16, sn16, sn16,     ,     ,    ,  rgba, linear,
+R16G16B16A16_SINT           ,  64,  1,  1,  1, si16, si16, si16, si16,     ,     ,    ,  rgba, linear,
+R16G16B16A16_UINT           ,  64,  1,  1,  1, ui16, ui16, ui16, ui16,     ,     ,    ,  rgba, linear,
+R16G16B16A16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16, sf16,     ,     ,    ,  rgba, linear,
+R32G32_FLOAT                ,  64,  1,  1,  1, sf32, sf32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_SINT                 ,  64,  1,  1,  1, si32, si32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_UINT                 ,  64,  1,  1,  1, ui32, ui32,     ,     ,     ,     ,    ,    rg, linear,
+R32_FLOAT_X8X24_TYPELESS    ,  64,  1,  1,  1, sf32,   x8,  x24,     ,     ,     ,    ,   rgb, linear,
+X32_TYPELESS_G8X24_UINT     ,  64,  1,  1,  1,  x32,  ui8,  x24,     ,     ,     ,    ,   rgb, linear,
+L32A32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32, sf32,     ,    ,    la, linear,
+R32G32_UNORM                ,  64,  1,  1,  1, un32, un32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_SNORM                ,  64,  1,  1,  1, sn32, sn32,     ,     ,     ,     ,    ,    rg, linear,
+R64_FLOAT                   ,  64,  1,  1,  1, sf64,     ,     ,     ,     ,     ,    ,     r, linear,
+R16G16B16X16_UNORM          ,  64,  1,  1,  1, un16, un16, un16,  x16,     ,     ,    ,  rgba, linear,
+R16G16B16X16_FLOAT          ,  64,  1,  1,  1, sf16, sf16, sf16,  x16,     ,     ,    ,  rgba, linear,
+A32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     , sf32,  x32,     ,    ,    al, linear,
+L32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32, sf32,     ,    ,    la, linear,
+I32X32_FLOAT                ,  64,  1,  1,  1,     ,     ,     ,  x32,     , sf32,    ,    ia, linear,
+R16G16B16A16_SSCALED        ,  64,  1,  1,  1, ss16, ss16, ss16, ss16,     ,     ,    ,  rgba, linear,
+R16G16B16A16_USCALED        ,  64,  1,  1,  1, us16, us16, us16, us16,     ,     ,    ,  rgba, linear,
+R32G32_SSCALED              ,  64,  1,  1,  1, ss32, ss32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_USCALED              ,  64,  1,  1,  1, us32, us32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_FLOAT_LD             ,  64,  1,  1,  1, sf32, sf32,     ,     ,     ,     ,    ,    rg, linear,
+R32G32_SFIXED               ,  64,  1,  1,  1, sx32, sx32,     ,     ,     ,     ,    ,    rg, linear,
+R64_PASSTHRU                ,  64,  1,  1,  1,  r64,     ,     ,     ,     ,     ,    ,     r,       ,
+B8G8R8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,  bgra, linear,
+B8G8R8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,  bgra,   srgb,
+R10G10B10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,  rgba, linear,
+R10G10B10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,  rgba,   srgb,
+R10G10B10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    ,  rgba, linear,
+R10G10B10_SNORM_A2_UNORM    ,  32,  1,  1,  1, sn10, sn10, sn10,  un2,     ,     ,    ,  rgba, linear,
+R8G8B8A8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,  rgba, linear,
+R8G8B8A8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,  un8,     ,     ,    ,  rgba,   srgb,
+R8G8B8A8_SNORM              ,  32,  1,  1,  1,  sn8,  sn8,  sn8,  sn8,     ,     ,    ,  rgba, linear,
+R8G8B8A8_SINT               ,  32,  1,  1,  1,  si8,  si8,  si8,  si8,     ,     ,    ,  rgba, linear,
+R8G8B8A8_UINT               ,  32,  1,  1,  1,  ui8,  ui8,  ui8,  ui8,     ,     ,    ,  rgba, linear,
+R16G16_UNORM                ,  32,  1,  1,  1, un16, un16,     ,     ,     ,     ,    ,    rg, linear,
+R16G16_SNORM                ,  32,  1,  1,  1, sn16, sn16,     ,     ,     ,     ,    ,    rg, linear,
+R16G16_SINT                 ,  32,  1,  1,  1, si16, si16,     ,     ,     ,     ,    ,    rg, linear,
+R16G16_UINT                 ,  32,  1,  1,  1, ui16, ui16,     ,     ,     ,     ,    ,    rg, linear,
+R16G16_FLOAT                ,  32,  1,  1,  1, sf16, sf16,     ,     ,     ,     ,    ,    rg, linear,
+B10G10R10A2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,  bgra, linear,
+B10G10R10A2_UNORM_SRGB      ,  32,  1,  1,  1, un10, un10, un10,  un2,     ,     ,    ,  bgra,   srgb,
+R11G11B10_FLOAT             ,  32,  1,  1,  1, uf11, uf11, uf10,     ,     ,     ,    ,   rgb, linear,
+R32_SINT                    ,  32,  1,  1,  1, si32,     ,     ,     ,     ,     ,    ,     r, linear,
+R32_UINT                    ,  32,  1,  1,  1, ui32,     ,     ,     ,     ,     ,    ,     r, linear,
+R32_FLOAT                   ,  32,  1,  1,  1, sf32,     ,     ,     ,     ,     ,    ,     r, linear,
+R24_UNORM_X8_TYPELESS       ,  32,  1,  1,  1, un24,   x8,     ,     ,     ,     ,    ,    rg, linear,
+X24_TYPELESS_G8_UINT        ,  32,  1,  1,  1,  x24,  ui8,     ,     ,     ,     ,    ,    rg, linear,
+L32_UNORM                   ,  32,  1,  1,  1,     ,     ,     ,     , un32,     ,    ,     l, linear,
+A32_UNORM                   ,  32,  1,  1,  1,     ,     ,     , un32,     ,     ,    ,     a, linear,
+L16A16_UNORM                ,  32,  1,  1,  1,     ,     ,     , un16, un16,     ,    ,    la, linear,
+I24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8,     , un24,    ,    ia, linear,
+L24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     ,   x8, un24,     ,    ,    la, linear,
+A24X8_UNORM                 ,  32,  1,  1,  1,     ,     ,     , un24,   x8,     ,    ,    al, linear,
+I32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     ,     , sf32,    ,     i, linear,
+L32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     ,     , sf32,     ,    ,     l, linear,
+A32_FLOAT                   ,  32,  1,  1,  1,     ,     ,     , sf32,     ,     ,    ,     a, linear,
+X8B8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    ,  abgr, linear,
+A8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,   x8,  un8,     ,     ,    ,  abgr, linear,
+B8X8_UNORM_G8R8_SNORM       ,  32,  1,  1,  1,  sn8,  sn8,  un8,   x8,     ,     ,    ,  bagr, linear,
+B8G8R8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,  bgra, linear,
+B8G8R8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,  bgra,   srgb,
+R8G8B8X8_UNORM              ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,  rgba, linear,
+R8G8B8X8_UNORM_SRGB         ,  32,  1,  1,  1,  un8,  un8,  un8,   x8,     ,     ,    ,  rgba,   srgb,
+R9G9B9E5_SHAREDEXP          ,  32,  1,  1,  1,  uf9,  uf9,  uf9,     ,     ,     ,    ,   rgb, linear,
+B10G10R10X2_UNORM           ,  32,  1,  1,  1, un10, un10, un10,   x2,     ,     ,    ,  bgra, linear,
+L16A16_FLOAT                ,  32,  1,  1,  1,     ,     ,     , sf16, sf16,     ,    ,    la, linear,
+R32_UNORM                   ,  32,  1,  1,  1, un32,     ,     ,     ,     ,     ,    ,     r, linear,
+R32_SNORM                   ,  32,  1,  1,  1, sn32,     ,     ,     ,     ,     ,    ,     r, linear,
+R10G10B10X2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,   x2,     ,     ,    ,  rgba, linear,
+R8G8B8A8_SSCALED            ,  32,  1,  1,  1,  ss8,  ss8,  ss8,  ss8,     ,     ,    ,  rgba, linear,
+R8G8B8A8_USCALED            ,  32,  1,  1,  1,  us8,  us8,  us8,  us8,     ,     ,    ,  rgba, linear,
+R16G16_SSCALED              ,  32,  1,  1,  1, ss16, ss16,     ,     ,     ,     ,    ,    rg, linear,
+R16G16_USCALED              ,  32,  1,  1,  1, us16, us16,     ,     ,     ,     ,    ,    rg, linear,
+R32_SSCALED                 ,  32,  1,  1,  1, ss32,     ,     ,     ,     ,     ,    ,     r, linear,
+R32_USCALED                 ,  32,  1,  1,  1, us32,     ,     ,     ,     ,     ,    ,     r, linear,
+B5G6R5_UNORM                ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    ,   bgr, linear,
+B5G6R5_UNORM_SRGB           ,  16,  1,  1,  1,  un5,  un6,  un5,     ,     ,     ,    ,   bgr,   srgb,
+B5G5R5A1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    ,  bgra, linear,
+B5G5R5A1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    ,  bgra,   srgb,
+B4G4R4A4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    ,  bgra, linear,
+B4G4R4A4_UNORM_SRGB         ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    ,  bgra,   srgb,
+R8G8_UNORM                  ,  16,  1,  1,  1,  un8,  un8,     ,     ,     ,     ,    ,    rg, linear,
+R8G8_SNORM                  ,  16,  1,  1,  1,  sn8,  sn8,     ,     ,     ,     ,    ,    rg, linear,
+R8G8_SINT                   ,  16,  1,  1,  1,  si8,  si8,     ,     ,     ,     ,    ,    rg, linear,
+R8G8_UINT                   ,  16,  1,  1,  1,  ui8,  ui8,     ,     ,     ,     ,    ,    rg, linear,
+R16_UNORM                   ,  16,  1,  1,  1, un16,     ,     ,     ,     ,     ,    ,     r, linear,
+R16_SNORM                   ,  16,  1,  1,  1, sn16,     ,     ,     ,     ,     ,    ,     r, linear,
+R16_SINT                    ,  16,  1,  1,  1, si16,     ,     ,     ,     ,     ,    ,     r, linear,
+R16_UINT                    ,  16,  1,  1,  1, ui16,     ,     ,     ,     ,     ,    ,     r, linear,
+R16_FLOAT                   ,  16,  1,  1,  1, sf16,     ,     ,     ,     ,     ,    ,     r, linear,
+A8P8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8,    ap, linear,
+A8P8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8,    ap, linear,
+I16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , un16,    ,     i, linear,
+L16_UNORM                   ,  16,  1,  1,  1,     ,     ,     ,     , un16,     ,    ,     l, linear,
+A16_UNORM                   ,  16,  1,  1,  1,     ,     ,     , un16,     ,     ,    ,     a, linear,
+L8A8_UNORM                  ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    ,    la, linear,
+I16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     ,     , sf16,    ,     i, linear,
+L16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     ,     , sf16,     ,    ,     l, linear,
+A16_FLOAT                   ,  16,  1,  1,  1,     ,     ,     , sf16,     ,     ,    ,     a, linear,
+L8A8_UNORM_SRGB             ,  16,  1,  1,  1,     ,     ,     ,  un8,  un8,     ,    ,    la,   srgb,
+R5G5_SNORM_B6_UNORM         ,  16,  1,  1,  1,  sn5,  sn5,  un6,     ,     ,     ,    ,   rgb, linear,
+B5G5R5X1_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    ,  bgra, linear,
+B5G5R5X1_UNORM_SRGB         ,  16,  1,  1,  1,  un5,  un5,  un5,   x1,     ,     ,    ,  bgra,   srgb,
+R8G8_SSCALED                ,  16,  1,  1,  1,  ss8,  ss8,     ,     ,     ,     ,    ,    rg, linear,
+R8G8_USCALED                ,  16,  1,  1,  1,  us8,  us8,     ,     ,     ,     ,    ,    rg, linear,
+R16_SSCALED                 ,  16,  1,  1,  1, ss16,     ,     ,     ,     ,     ,    ,     r, linear,
+R16_USCALED                 ,  16,  1,  1,  1, us16,     ,     ,     ,     ,     ,    ,     r, linear,
+P8A8_UNORM_PALETTE0         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8,    pa, linear,
+P8A8_UNORM_PALETTE1         ,  16,  1,  1,  1,     ,     ,     ,  un8,     ,     , un8,    pa, linear,
+A1B5G5R5_UNORM              ,  16,  1,  1,  1,  un5,  un5,  un5,  un1,     ,     ,    ,  abgr, linear,
+A4B4G4R4_UNORM              ,  16,  1,  1,  1,  un4,  un4,  un4,  un4,     ,     ,    ,  abgr, linear,
+L8A8_UINT                   ,  16,  1,  1,  1,     ,     ,     ,  ui8,  ui8,     ,    ,    la, linear,
+L8A8_SINT                   ,  16,  1,  1,  1,     ,     ,     ,  si8,  si8,     ,    ,    la, linear,
+R8_UNORM                    ,   8,  1,  1,  1,  un8,     ,     ,     ,     ,     ,    ,     r, linear,
+R8_SNORM                    ,   8,  1,  1,  1,  sn8,     ,     ,     ,     ,     ,    ,     r, linear,
+R8_SINT                     ,   8,  1,  1,  1,  si8,     ,     ,     ,     ,     ,    ,     r, linear,
+R8_UINT                     ,   8,  1,  1,  1,  ui8,     ,     ,     ,     ,     ,    ,     r, linear,
+A8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,  un8,     ,     ,    ,     a, linear,
+I8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  un8,    ,     i, linear,
+L8_UNORM                    ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    ,     l, linear,
+P4A4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4,    pa, linear,
+A4P4_UNORM_PALETTE0         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4,    ap, linear,
+R8_SSCALED                  ,   8,  1,  1,  1,  ss8,     ,     ,     ,     ,     ,    ,     r, linear,
+R8_USCALED                  ,   8,  1,  1,  1,  us8,     ,     ,     ,     ,     ,    ,     r, linear,
+P8_UNORM_PALETTE0           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8,     p, linear,
+L8_UNORM_SRGB               ,   8,  1,  1,  1,     ,     ,     ,     ,  un8,     ,    ,     l, linear,
+P8_UNORM_PALETTE1           ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     , un8,     p, linear,
+P4A4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4,    pa, linear,
+A4P4_UNORM_PALETTE1         ,   8,  1,  1,  1,     ,     ,     ,  un4,     ,     , un4,    ap, linear,
+Y8_UNORM                    ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,      ,    yuv,
+L8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  ui8,     ,    ,     l, linear,
+L8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,  si8,     ,    ,     l, linear,
+I8_UINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  ui8,    ,     i, linear,
+I8_SINT                     ,   8,  1,  1,  1,     ,     ,     ,     ,     ,  si8,    ,     i, linear,
+DXT1_RGB_SRGB               ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    ,      ,   srgb,  dxt1
+R1_UNORM                    ,   1,  1,  1,  1,  un1,     ,     ,     ,     ,     ,    ,     r, linear,
+YCRCB_NORMAL                ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,      ,    yuv,
+YCRCB_SWAPUVY               ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,      ,    yuv,
+P2_UNORM_PALETTE0           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2,     p, linear,
+P2_UNORM_PALETTE1           ,   2,  1,  1,  1,     ,     ,     ,     ,     ,     , un2,     p, linear,
+BC1_UNORM                   ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      , linear,  dxt1
+BC2_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      , linear,  dxt3
+BC3_UNORM                   , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      , linear,  dxt5
+BC4_UNORM                   ,  64,  4,  4,  1,  un8,     ,     ,     ,     ,     ,    ,      , linear, rgtc1
+BC5_UNORM                   , 128,  4,  4,  1,  un8,  un8,     ,     ,     ,     ,    ,      , linear, rgtc2
+BC1_UNORM_SRGB              ,  64,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      ,   srgb,  dxt1
+BC2_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      ,   srgb,  dxt3
+BC3_UNORM_SRGB              , 128,  4,  4,  1,  un4,  un4,  un4,  un4,     ,     ,    ,      ,   srgb,  dxt5
+MONO8                       ,   1,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,
+YCRCB_SWAPUV                ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,      ,    yuv,
+YCRCB_SWAPY                 ,  16,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,      ,    yuv,
+DXT1_RGB                    ,  64,  4,  4,  1,  un4,  un4,  un4,     ,     ,     ,    ,      , linear,  dxt1
+FXT1                        , 128,  8,  4,  1,  un4,  un4,  un4,     ,     ,     ,    ,      , linear,  fxt1
+R8G8B8_UNORM                ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,   rgb, linear,
+R8G8B8_SNORM                ,  24,  1,  1,  1,  sn8,  sn8,  sn8,     ,     ,     ,    ,   rgb, linear,
+R8G8B8_SSCALED              ,  24,  1,  1,  1,  ss8,  ss8,  ss8,     ,     ,     ,    ,   rgb, linear,
+R8G8B8_USCALED              ,  24,  1,  1,  1,  us8,  us8,  us8,     ,     ,     ,    ,   rgb, linear,
+R64G64B64A64_FLOAT          , 256,  1,  1,  1, sf64, sf64, sf64, sf64,     ,     ,    ,  rgba, linear,
+R64G64B64_FLOAT             , 192,  1,  1,  1, sf64, sf64, sf64,     ,     ,     ,    ,   rgb, linear,
+BC4_SNORM                   ,  64,  4,  4,  1,  sn8,     ,     ,     ,     ,     ,    ,      , linear, rgtc1
+BC5_SNORM                   , 128,  4,  4,  1,  sn8,  sn8,     ,     ,     ,     ,    ,      , linear, rgtc2
+R16G16B16_FLOAT             ,  48,  1,  1,  1, sf16, sf16, sf16,     ,     ,     ,    ,   rgb, linear,
+R16G16B16_UNORM             ,  48,  1,  1,  1, un16, un16, un16,     ,     ,     ,    ,   rgb, linear,
+R16G16B16_SNORM             ,  48,  1,  1,  1, sn16, sn16, sn16,     ,     ,     ,    ,   rgb, linear,
+R16G16B16_SSCALED           ,  48,  1,  1,  1, ss16, ss16, ss16,     ,     ,     ,    ,   rgb, linear,
+R16G16B16_USCALED           ,  48,  1,  1,  1, us16, us16, us16,     ,     ,     ,    ,   rgb, linear,
+BC6H_SF16                   , 128,  4,  4,  1, sf16, sf16, sf16,     ,     ,     ,    ,      , linear,  bptc
+BC7_UNORM                   , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      , linear,  bptc
+BC7_UNORM_SRGB              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  bptc
+BC6H_UF16                   , 128,  4,  4,  1, uf16, uf16, uf16,     ,     ,     ,    ,      , linear,  bptc
+PLANAR_420_8                ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,      ,    yuv,
+R8G8B8_UNORM_SRGB           ,  24,  1,  1,  1,  un8,  un8,  un8,     ,     ,     ,    ,   rgb,   srgb,
+ETC1_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    ,      , linear,  etc1
+ETC2_RGB8                   ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    ,      , linear,  etc2
+EAC_R11                     ,  64,  4,  4,  1, un11,     ,     ,     ,     ,     ,    ,      , linear,  etc2
+EAC_RG11                    , 128,  4,  4,  1, un11, un11,     ,     ,     ,     ,    ,      , linear,  etc2
+EAC_SIGNED_R11              ,  64,  4,  4,  1, sn11,     ,     ,     ,     ,     ,    ,      , linear,  etc2
+EAC_SIGNED_RG11             , 128,  4,  4,  1, sn11, sn11,     ,     ,     ,     ,    ,      , linear,  etc2
+ETC2_SRGB8                  ,  64,  4,  4,  1,  un8,  un8,  un8,     ,     ,     ,    ,      ,   srgb,  etc2
+R16G16B16_UINT              ,  48,  1,  1,  1, ui16, ui16, ui16,     ,     ,     ,    ,   rgb, linear,
+R16G16B16_SINT              ,  48,  1,  1,  1, si16, si16, si16,     ,     ,     ,    ,   rgb, linear,
+R32_SFIXED                  ,  32,  1,  1,  1, sx16,     ,     ,     ,     ,     ,    ,     r, linear,
+R10G10B10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    ,  rgba, linear,
+R10G10B10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    ,  rgba, linear,
+R10G10B10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    ,  rgba, linear,
+R10G10B10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    ,  rgba, linear,
+B10G10R10A2_SNORM           ,  32,  1,  1,  1, sn10, sn10, sn10,  sn2,     ,     ,    ,  rgba, linear,
+B10G10R10A2_USCALED         ,  32,  1,  1,  1, us10, us10, us10,  us2,     ,     ,    ,  rgba, linear,
+B10G10R10A2_SSCALED         ,  32,  1,  1,  1, ss10, ss10, ss10,  ss2,     ,     ,    ,  rgba, linear,
+B10G10R10A2_UINT            ,  32,  1,  1,  1, ui10, ui10, ui10,  ui2,     ,     ,    ,  rgba, linear,
+B10G10R10A2_SINT            ,  32,  1,  1,  1, si10, si10, si10,  si2,     ,     ,    ,  rgba, linear,
+R64G64B64A64_PASSTHRU       , 256,  1,  1,  1,  r64,  r64,  r64,  r64,     ,     ,    ,  rgba,       ,
+R64G64B64_PASSTHRU          , 192,  1,  1,  1,  r64,  r64,  r64,     ,     ,     ,    ,   rgb,       ,
+ETC2_RGB8_PTA               ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    ,      , linear,  etc2
+ETC2_SRGB8_PTA              ,  64,  4,  4,  1,  un8,  un8,  un8,  un1,     ,     ,    ,      ,   srgb,  etc2
+ETC2_EAC_RGBA8              , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      , linear,  etc2
+ETC2_EAC_SRGB8_A8           , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  etc2
+R8G8B8_UINT                 ,  24,  1,  1,  1,  ui8,  ui8,  ui8,     ,     ,     ,    ,   rgb, linear,
+R8G8B8_SINT                 ,  24,  1,  1,  1,  si8,  si8,  si8,     ,     ,     ,    ,   rgb, linear,
+RAW                         ,   0,  0,  0,  0,     ,     ,     ,     ,     ,     ,    ,      ,       ,
+ASTC_LDR_2D_4X4_U8SRGB      , 128,  4,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_5X4_U8SRGB      , 128,  5,  4,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_5X5_U8SRGB      , 128,  5,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_6X5_U8SRGB      , 128,  6,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_6X6_U8SRGB      , 128,  6,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_8X5_U8SRGB      , 128,  8,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_8X6_U8SRGB      , 128,  8,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_8X8_U8SRGB      , 128,  8,  8,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_10X5_U8SRGB     , 128, 10,  5,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_10X6_U8SRGB     , 128, 10,  6,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_10X8_U8SRGB     , 128, 10,  8,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_10X10_U8SRGB    , 128, 10, 10,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_12X10_U8SRGB    , 128, 12, 10,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_12X12_U8SRGB    , 128, 12, 12,  1,  un8,  un8,  un8,  un8,     ,     ,    ,      ,   srgb,  astc
+ASTC_LDR_2D_4X4_FLT16       , 128,  4,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_5X4_FLT16       , 128,  5,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_5X5_FLT16       , 128,  5,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_6X5_FLT16       , 128,  6,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_6X6_FLT16       , 128,  6,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_8X5_FLT16       , 128,  8,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_8X6_FLT16       , 128,  8,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_8X8_FLT16       , 128,  8,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_10X5_FLT16      , 128, 10,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_10X6_FLT16      , 128, 10,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_10X8_FLT16      , 128, 10,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_LDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_4X4_FLT16       , 128,  4,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_5X4_FLT16       , 128,  5,  4,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_5X5_FLT16       , 128,  5,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_6X5_FLT16       , 128,  6,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_6X6_FLT16       , 128,  6,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_8X5_FLT16       , 128,  8,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_8X6_FLT16       , 128,  8,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_8X8_FLT16       , 128,  8,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_10X5_FLT16      , 128, 10,  5,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_10X6_FLT16      , 128, 10,  6,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_10X8_FLT16      , 128, 10,  8,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_10X10_FLT16     , 128, 10, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_12X10_FLT16     , 128, 12, 10,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+ASTC_HDR_2D_12X12_FLT16     , 128, 12, 12,  1, sf16, sf16, sf16, sf16,     ,     ,    ,      , linear,  astc
+HIZ                         , 128,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   hiz
+MCS_2X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   mcs
+MCS_4X                      ,   8,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   mcs
+MCS_8X                      ,  32,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   mcs
+MCS_16X                     ,  64,  1,  1,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   mcs
+GEN7_CCS_32BPP_X            ,   1, 16,  2,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN7_CCS_64BPP_X            ,   1,  8,  2,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN7_CCS_128BPP_X           ,   1,  4,  2,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN7_CCS_32BPP_Y            ,   1,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN7_CCS_64BPP_Y            ,   1,  4,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN7_CCS_128BPP_Y           ,   1,  2,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN9_CCS_32BPP              ,   2,  8,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN9_CCS_64BPP              ,   2,  4,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
+GEN9_CCS_128BPP             ,   2,  2,  4,  1,     ,     ,     ,     ,     ,     ,    ,      ,       ,   ccs
diff --git a/src/intel/isl/isl_gen4.c b/src/intel/isl/isl_gen4.c
index 14706c8..a212d0e 100644
--- a/src/intel/isl/isl_gen4.c
+++ b/src/intel/isl/isl_gen4.c
@@ -51,8 +51,15 @@
       /* From the g35 PRM Vol. 2, 3DSTATE_DEPTH_BUFFER::Tile Walk:
        *
        *    "The Depth Buffer, if tiled, must use Y-Major tiling"
+       *
+       *    Errata   Description    Project
+       *    BWT014   The Depth Buffer Must be Tiled, it cannot be linear. This
+       *    field must be set to 1 on DevBW-A.  [DevBW -A,B]
+       *
+       * In testing, the linear configuration doesn't seem to work on gen4.
        */
-      *flags &= (ISL_TILING_LINEAR_BIT | ISL_TILING_Y0_BIT);
+      *flags &= (ISL_DEV_GEN(dev) == 4 && !ISL_DEV_IS_G4X(dev)) ?
+                ISL_TILING_Y0_BIT : (ISL_TILING_Y0_BIT | ISL_TILING_LINEAR_BIT);
    }
 
    if (info->usage & (ISL_SURF_USAGE_DISPLAY_ROTATE_90_BIT |
diff --git a/src/intel/isl/isl_storage_image.c b/src/intel/isl/isl_storage_image.c
index e98fcf9..e2bd4dd 100644
--- a/src/intel/isl/isl_storage_image.c
+++ b/src/intel/isl/isl_storage_image.c
@@ -161,32 +161,36 @@
    /* No normalized fixed-point formats are supported by the hardware. */
    case ISL_FORMAT_R16G16B16A16_UNORM:
    case ISL_FORMAT_R16G16B16A16_SNORM:
-      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+      return (devinfo->gen >= 11 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
               ISL_FORMAT_R16G16B16A16_UINT :
               ISL_FORMAT_R32G32_UINT);
 
    case ISL_FORMAT_R8G8B8A8_UNORM:
    case ISL_FORMAT_R8G8B8A8_SNORM:
-      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+      return (devinfo->gen >= 11 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
               ISL_FORMAT_R8G8B8A8_UINT : ISL_FORMAT_R32_UINT);
 
    case ISL_FORMAT_R16G16_UNORM:
    case ISL_FORMAT_R16G16_SNORM:
-      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+      return (devinfo->gen >= 11 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
               ISL_FORMAT_R16G16_UINT : ISL_FORMAT_R32_UINT);
 
    case ISL_FORMAT_R8G8_UNORM:
    case ISL_FORMAT_R8G8_SNORM:
-      return (devinfo->gen >= 8 || devinfo->is_haswell ?
+      return (devinfo->gen >= 11 ? format :
+              devinfo->gen >= 8 || devinfo->is_haswell ?
               ISL_FORMAT_R8G8_UINT : ISL_FORMAT_R16_UINT);
 
    case ISL_FORMAT_R16_UNORM:
    case ISL_FORMAT_R16_SNORM:
-      return ISL_FORMAT_R16_UINT;
+      return (devinfo->gen >= 11 ? format : ISL_FORMAT_R16_UINT);
 
    case ISL_FORMAT_R8_UNORM:
    case ISL_FORMAT_R8_SNORM:
-      return ISL_FORMAT_R8_UINT;
+      return (devinfo->gen >= 11 ? format : ISL_FORMAT_R8_UINT);
 
    default:
       assert(!"Unknown image format");
diff --git a/src/intel/isl/isl_surface_state.c b/src/intel/isl/isl_surface_state.c
index bff9693..f181c3d 100644
--- a/src/intel/isl/isl_surface_state.c
+++ b/src/intel/isl/isl_surface_state.c
@@ -470,42 +470,15 @@
 #endif
 
 #if (GEN_GEN >= 8 || GEN_IS_HASWELL)
-   if (info->view->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) {
-      /* From the Sky Lake PRM Vol. 2d,
-       * RENDER_SURFACE_STATE::Shader Channel Select Red
-       *
-       *    "For Render Target, Red, Green and Blue Shader Channel Selects
-       *    MUST be such that only valid components can be swapped i.e. only
-       *    change the order of components in the pixel. Any other values for
-       *    these Shader Channel Select fields are not valid for Render
-       *    Targets. This also means that there MUST not be multiple shader
-       *    channels mapped to the same RT channel."
-       */
-      assert(info->view->swizzle.r == ISL_CHANNEL_SELECT_RED ||
-             info->view->swizzle.r == ISL_CHANNEL_SELECT_GREEN ||
-             info->view->swizzle.r == ISL_CHANNEL_SELECT_BLUE);
-      assert(info->view->swizzle.g == ISL_CHANNEL_SELECT_RED ||
-             info->view->swizzle.g == ISL_CHANNEL_SELECT_GREEN ||
-             info->view->swizzle.g == ISL_CHANNEL_SELECT_BLUE);
-      assert(info->view->swizzle.b == ISL_CHANNEL_SELECT_RED ||
-             info->view->swizzle.b == ISL_CHANNEL_SELECT_GREEN ||
-             info->view->swizzle.b == ISL_CHANNEL_SELECT_BLUE);
-      assert(info->view->swizzle.r != info->view->swizzle.g);
-      assert(info->view->swizzle.r != info->view->swizzle.b);
-      assert(info->view->swizzle.g != info->view->swizzle.b);
+   if (info->view->usage & ISL_SURF_USAGE_RENDER_TARGET_BIT)
+      assert(isl_swizzle_supports_rendering(dev->info, info->view->swizzle));
 
-      /* From the Sky Lake PRM Vol. 2d,
-       * RENDER_SURFACE_STATE::Shader Channel Select Alpha
-       *
-       *    "For Render Target, this field MUST be programmed to
-       *    value = SCS_ALPHA."
-       */
-      assert(info->view->swizzle.a == ISL_CHANNEL_SELECT_ALPHA);
-   }
    s.ShaderChannelSelectRed = (enum GENX(ShaderChannelSelect)) info->view->swizzle.r;
    s.ShaderChannelSelectGreen = (enum GENX(ShaderChannelSelect)) info->view->swizzle.g;
    s.ShaderChannelSelectBlue = (enum GENX(ShaderChannelSelect)) info->view->swizzle.b;
    s.ShaderChannelSelectAlpha = (enum GENX(ShaderChannelSelect)) info->view->swizzle.a;
+#else
+   assert(isl_swizzle_is_identity(info->view->swizzle));
 #endif
 
    s.SurfaceBaseAddress = info->address;
diff --git a/src/intel/isl/meson.build b/src/intel/isl/meson.build
index b4d707f..5b57188 100644
--- a/src/intel/isl/meson.build
+++ b/src/intel/isl/meson.build
@@ -95,7 +95,7 @@
       'tests/isl_surf_get_image_offset_test.c',
       dependencies : dep_m,
       include_directories : [inc_common, inc_intel],
-      link_with : [libisl, libintel_dev],
+      link_with : [libisl, libintel_dev, libmesa_util],
     )
   )
 endif
diff --git a/src/intel/tools/aub_write.c b/src/intel/tools/aub_write.c
new file mode 100644
index 0000000..e92bdaf
--- /dev/null
+++ b/src/intel/tools/aub_write.c
@@ -0,0 +1,635 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "aub_write.h"
+
+#include <inttypes.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "i915_drm.h"
+#include "intel_aub.h"
+#include "gen_context.h"
+
+#ifndef ALIGN
+#define ALIGN(x, y) (((x) + (y)-1) & ~((y)-1))
+#endif
+
+#define MI_BATCH_NON_SECURE_I965 (1 << 8)
+/* min()/max() evaluate each argument exactly once (GNU statement expressions). */
+#define min(a, b) ({                            \
+         __typeof(a) _a = (a);                  \
+         __typeof(b) _b = (b);                  \
+         _a < _b ? _a : _b;                     \
+      })
+
+#define max(a, b) ({                            \
+         __typeof(a) _a = (a);                  \
+         __typeof(b) _b = (b);                  \
+         _a > _b ? _a : _b;                     \
+      })
+
+
+enum gen_ring {   /* values index the per-engine tables in get_context_init() */
+   GEN_RING_RENDER,
+   GEN_RING_BLITTER,
+   GEN_RING_VIDEO,
+};
+
+/* Return the canned context image for `ring`: gen8 tables up to gen10, gen10 tables after. */
+static const uint32_t *
+get_context_init(const struct gen_device_info *devinfo, enum gen_ring ring)
+{
+   static const uint32_t *gen8_images[] = {
+      [GEN_RING_RENDER] = gen8_render_context_init,
+      [GEN_RING_BLITTER] = gen8_blitter_context_init,
+      [GEN_RING_VIDEO] = gen8_video_context_init,
+   };
+   static const uint32_t *gen10_images[] = {
+      [GEN_RING_RENDER] = gen10_render_context_init,
+      [GEN_RING_BLITTER] = gen10_blitter_context_init,
+      [GEN_RING_VIDEO] = gen10_video_context_init,
+   };
+   const uint32_t **images;
+
+   assert(devinfo->gen >= 8);
+   images = devinfo->gen > 10 ? gen10_images : gen8_images;
+   return images[ring];
+}
+
+/* When cond is non-zero, print a printf-style message to stderr and raise SIGTRAP. */
+static void __attribute__ ((format(__printf__, 2, 3)))
+fail_if(int cond, const char *format, ...)
+{
+   if (cond) {
+      va_list ap;
+
+      va_start(ap, format);
+      vfprintf(stderr, format, ap);
+      va_end(ap);
+
+      raise(SIGTRAP);
+   }
+}
+
+static inline uint32_t
+align_u32(uint32_t v, uint32_t a)
+{
+   return (v + (a - 1)) & -a;   /* -a == ~(a - 1); exact only for power-of-two a */
+}
+
+static void
+aub_ppgtt_table_finish(struct aub_ppgtt_table *table, int level)
+{
+   if (level == 1)   /* level-1 slots hold page addresses, not heap tables */
+      return;
+   for (unsigned idx = 0; idx < ARRAY_SIZE(table->subtables); idx++) {
+      struct aub_ppgtt_table *child = table->subtables[idx];
+      if (child == NULL)
+         continue;
+      aub_ppgtt_table_finish(child, level - 1);
+      free(child);
+   }
+}
+
+void
+aub_file_init(struct aub_file *aub, FILE *file, uint16_t pci_id)
+{
+   bool known_chipset;
+   memset(aub, 0, sizeof(*aub));
+   aub->file = file;
+   aub->pci_id = pci_id;
+   aub->pml4.phys_addr = PML4_PHYS_ADDR;
+   /* The device info decides the address width: 48 bits on gen8+, 32 otherwise. */
+   known_chipset = gen_get_device_info(pci_id, &aub->devinfo);
+   fail_if(!known_chipset, "failed to identify chipset=0x%x\n", pci_id);
+   aub->addr_bits = aub->devinfo.gen >= 8 ? 48 : 32;
+}
+
+void
+aub_file_finish(struct aub_file *aub)
+{
+   aub_ppgtt_table_finish(&aub->pml4, 4);   /* frees the sub-tables; pml4 itself is embedded in aub */
+   fclose(aub->file);   /* closes the stream that was handed to aub_file_init() */
+}
+
+uint32_t
+aub_gtt_size(struct aub_file *aub)
+{
+   return NUM_PT_ENTRIES * (aub->addr_bits > 32 ? GEN8_PTE_SIZE : PTE_SIZE);   /* entry count times per-entry size */
+}
+
+/* Append raw bytes to the AUB stream; a short or failed write trips fail_if(). */
+static void
+data_out(struct aub_file *aub, const void *data, size_t size)
+{
+   if (size == 0)
+      return;
+   fail_if(fwrite(data, 1, size, aub->file) != size,
+           "Writing to output failed\n");
+}
+
+static void
+dword_out(struct aub_file *aub, uint32_t data)
+{
+   data_out(aub, &data, sizeof(data));   /* the four bytes go out in host byte order */
+}
+
+static void
+mem_trace_memory_write_header_out(struct aub_file *aub, uint64_t addr,
+                                  uint32_t len, uint32_t addr_space,
+                                  const char *desc)
+{
+   const uint32_t body_dw = ALIGN(len, sizeof(uint32_t)) / sizeof(uint32_t);
+   const uint32_t header[] = {
+      CMD_MEM_TRACE_MEMORY_WRITE | (5 + body_dw - 1),
+      (uint32_t)(addr & 0xFFFFFFFF), (uint32_t)(addr >> 32),   /* addr lo, addr hi */
+      addr_space, len,                                         /* gtt, byte length */
+   };
+
+   if (aub->verbose_log_file)
+      fprintf(aub->verbose_log_file,
+              "  MEM WRITE (0x%016" PRIx64 "-0x%016" PRIx64 ") %s\n",
+              addr, addr + len, desc);
+   for (unsigned i = 0; i < sizeof(header) / sizeof(header[0]); i++)
+      dword_out(aub, header[i]);
+}
+
+static void
+register_write_out(struct aub_file *aub, uint32_t addr, uint32_t value)
+{
+   const uint32_t packet[] = {
+      CMD_MEM_TRACE_REGISTER_WRITE | (5 + 1 - 1),   /* one data dword */
+      addr,
+      AUB_MEM_TRACE_REGISTER_SIZE_DWORD | AUB_MEM_TRACE_REGISTER_SPACE_MMIO,
+      0xFFFFFFFF, 0x00000000,   /* mask lo, mask hi */
+      value,
+   };
+   for (unsigned i = 0; i < sizeof(packet) / sizeof(packet[0]); i++)
+      dword_out(aub, packet[i]);
+}
+
+/* Make sure slots [start, end] of `table` exist and write the new entries to the trace. */
+static void
+populate_ppgtt_table(struct aub_file *aub, struct aub_ppgtt_table *table,
+                     int start, int end, int level)
+{
+   static uint64_t phys_addrs_allocator = (PML4_PHYS_ADDR >> 12) + 1;
+   uint64_t entries[512] = {0};
+   int dirty_start = 512, dirty_end = 0;
+
+   if (aub->verbose_log_file) {
+      fprintf(aub->verbose_log_file,
+              "  PPGTT (0x%016" PRIx64 "), lvl %d, start: %x, end: %x\n",
+              table->phys_addr, level, start, end);
+   }
+
+   for (int i = start; i <= end; i++) {
+      if (!table->subtables[i]) {
+         /* Every new slot consumes the next page number from the allocator. */
+         const uint64_t phys_addr = phys_addrs_allocator++ << 12;
+
+         dirty_start = min(dirty_start, i);
+         dirty_end = max(dirty_end, i);
+         if (level == 1) {
+            /* Leaf slots hold the page address itself, not a heap table. */
+            table->subtables[i] = (void *)(uintptr_t)phys_addr;
+         } else {
+            table->subtables[i] = calloc(1, sizeof(*table->subtables[i]));
+            fail_if(table->subtables[i] == NULL,
+                    "failed to allocate a PPGTT level %d table\n", level - 1);
+            table->subtables[i]->phys_addr = phys_addr;
+         }
+         if (aub->verbose_log_file) {
+            fprintf(aub->verbose_log_file,
+                    "   Adding entry: %x, phys_addr: 0x%016" PRIx64 "\n",
+                    i, phys_addr);
+         }
+      }
+      entries[i] = 3 /* read/write | present */ |
+         (level == 1 ? (uint64_t)(uintptr_t)table->subtables[i] :
+          table->subtables[i]->phys_addr);
+   }
+
+   /* Only the span from the first to the last newly created slot is written. */
+   if (dirty_start <= dirty_end) {
+      uint64_t write_addr = table->phys_addr + dirty_start *
+         sizeof(uint64_t);
+      uint64_t write_size = (dirty_end - dirty_start + 1) *
+         sizeof(uint64_t);
+      mem_trace_memory_write_header_out(aub, write_addr, write_size,
+                                        AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_PHYSICAL,
+                                        "PPGTT update");
+      data_out(aub, entries + dirty_start, write_size);
+   }
+}
+
+void
+aub_map_ppgtt(struct aub_file *aub, uint64_t start, uint64_t size)
+{
+   uint64_t l4_start = start & 0xff8000000000;
+   uint64_t l4_end = ((start + size - 1) | 0x007fffffffff) & 0xffffffffffff;
+   /* Each level is indexed by 9 address bits: L4 = 47:39, L3 = 38:30, L2 = 29:21, L1 = 20:12. */
+#define L4_index(addr) (((addr) >> 39) & 0x1ff)
+#define L3_index(addr) (((addr) >> 30) & 0x1ff)
+#define L2_index(addr) (((addr) >> 21) & 0x1ff)
+#define L1_index(addr) (((addr) >> 12) & 0x1ff)
+   /* Walk down from the root table (aub->pml4) to the table that covers addr. */
+#define L3_table(addr) (aub->pml4.subtables[L4_index(addr)])
+#define L2_table(addr) (L3_table(addr)->subtables[L3_index(addr)])
+#define L1_table(addr) (L2_table(addr)->subtables[L2_index(addr)])
+
+   if (aub->verbose_log_file) {
+      fprintf(aub->verbose_log_file,
+              " Mapping PPGTT address: 0x%" PRIx64 ", size: %" PRIu64"\n",
+              start, size);
+   }
+   /* Populate the top level first, then descend, clamping each range to its parent entry. */
+   populate_ppgtt_table(aub, &aub->pml4, L4_index(l4_start), L4_index(l4_end), 4);
+
+   for (uint64_t l4 = l4_start; l4 < l4_end; l4 += (1ULL << 39)) {
+      uint64_t l3_start = max(l4, start & 0xffffc0000000);
+      uint64_t l3_end = min(l4 + (1ULL << 39) - 1,
+                            ((start + size - 1) | 0x00003fffffff) & 0xffffffffffff);
+      uint64_t l3_start_idx = L3_index(l3_start);
+      uint64_t l3_end_idx = L3_index(l3_end);
+
+      populate_ppgtt_table(aub, L3_table(l4), l3_start_idx, l3_end_idx, 3);
+
+      for (uint64_t l3 = l3_start; l3 < l3_end; l3 += (1ULL << 30)) {
+         uint64_t l2_start = max(l3, start & 0xffffffe00000);
+         uint64_t l2_end = min(l3 + (1ULL << 30) - 1,
+                               ((start + size - 1) | 0x0000001fffff) & 0xffffffffffff);
+         uint64_t l2_start_idx = L2_index(l2_start);
+         uint64_t l2_end_idx = L2_index(l2_end);
+
+         populate_ppgtt_table(aub, L2_table(l3), l2_start_idx, l2_end_idx, 2);
+
+         for (uint64_t l2 = l2_start; l2 < l2_end; l2 += (1ULL << 21)) {
+            uint64_t l1_start = max(l2, start & 0xfffffffff000);
+            uint64_t l1_end = min(l2 + (1ULL << 21) - 1,
+                                  ((start + size - 1) | 0x000000000fff) & 0xffffffffffff);
+            uint64_t l1_start_idx = L1_index(l1_start);
+            uint64_t l1_end_idx = L1_index(l1_end);
+
+            populate_ppgtt_table(aub, L1_table(l2), l1_start_idx, l1_end_idx, 1);
+         }
+      }
+   }
+}
+
+static uint64_t
+ppgtt_lookup(struct aub_file *aub, uint64_t ppgtt_addr)
+{
+   return (uint64_t)L1_table(ppgtt_addr)->subtables[L1_index(ppgtt_addr)];   /* leaf slot = page address; the tables for ppgtt_addr must already exist */
+}
+
+/* Emit the memtrace header: version packet, static GGTT, per-engine rings/contexts, execlist enable. */
+static void
+write_execlists_header(struct aub_file *aub, const char *name)
+{
+   char app_name[8 * 4] = {0};   /* zeroed so the alignment padding is defined */
+   int app_name_len, dwords;
+
+   app_name_len = snprintf(app_name, sizeof(app_name), "PCI-ID=0x%X %s",
+                           aub->pci_id, name);
+   /* snprintf() returns the untruncated length (negative on error); clamp it
+    * to what the buffer really holds so data_out() never reads past it. */
+   app_name_len = max(0, min(app_name_len, (int)sizeof(app_name) - 1));
+   app_name_len = ALIGN(app_name_len, sizeof(uint32_t));
+
+   dwords = 5 + app_name_len / sizeof(uint32_t);
+   dword_out(aub, CMD_MEM_TRACE_VERSION | (dwords - 1));
+   dword_out(aub, AUB_MEM_TRACE_VERSION_FILE_VERSION);
+   dword_out(aub, aub->devinfo.simulator_id << AUB_MEM_TRACE_VERSION_DEVICE_SHIFT);
+   dword_out(aub, 0);      /* version */
+   dword_out(aub, 0);      /* version */
+   data_out(aub, app_name, app_name_len);
+
+   /* GGTT PT */
+   uint32_t ggtt_ptes = STATIC_GGTT_MAP_SIZE >> 12;
+
+   mem_trace_memory_write_header_out(aub, STATIC_GGTT_MAP_START >> 12,
+                                     ggtt_ptes * GEN8_PTE_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY,
+                                     "GGTT PT");
+   for (uint32_t i = 0; i < ggtt_ptes; i++) {
+      dword_out(aub, 1 + 0x1000 * i + STATIC_GGTT_MAP_START);
+      dword_out(aub, 0);
+   }
+
+   /* RENDER_RING */
+   mem_trace_memory_write_header_out(aub, RENDER_RING_ADDR, RING_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "RENDER RING");
+   for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* RENDER_PPHWSP */
+   mem_trace_memory_write_header_out(aub, RENDER_CONTEXT_ADDR,
+                                     PPHWSP_SIZE + CONTEXT_RENDER_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "RENDER PPHWSP");
+   for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* RENDER_CONTEXT */
+   data_out(aub, get_context_init(&aub->devinfo, GEN_RING_RENDER), CONTEXT_RENDER_SIZE);
+
+   /* BLITTER_RING */
+   mem_trace_memory_write_header_out(aub, BLITTER_RING_ADDR, RING_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "BLITTER RING");
+   for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* BLITTER_PPHWSP */
+   mem_trace_memory_write_header_out(aub, BLITTER_CONTEXT_ADDR,
+                                     PPHWSP_SIZE + CONTEXT_OTHER_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "BLITTER PPHWSP");
+   for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* BLITTER_CONTEXT */
+   data_out(aub, get_context_init(&aub->devinfo, GEN_RING_BLITTER), CONTEXT_OTHER_SIZE);
+
+   /* VIDEO_RING */
+   mem_trace_memory_write_header_out(aub, VIDEO_RING_ADDR, RING_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "VIDEO RING");
+   for (uint32_t i = 0; i < RING_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* VIDEO_PPHWSP */
+   mem_trace_memory_write_header_out(aub, VIDEO_CONTEXT_ADDR,
+                                     PPHWSP_SIZE + CONTEXT_OTHER_SIZE,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "VIDEO PPHWSP");
+   for (uint32_t i = 0; i < PPHWSP_SIZE; i += sizeof(uint32_t))
+      dword_out(aub, 0);
+
+   /* VIDEO_CONTEXT */
+   data_out(aub, get_context_init(&aub->devinfo, GEN_RING_VIDEO), CONTEXT_OTHER_SIZE);
+
+   register_write_out(aub, HWS_PGA_RCSUNIT, RENDER_CONTEXT_ADDR);
+   register_write_out(aub, HWS_PGA_VCSUNIT0, VIDEO_CONTEXT_ADDR);
+   register_write_out(aub, HWS_PGA_BCSUNIT, BLITTER_CONTEXT_ADDR);
+
+   register_write_out(aub, GFX_MODE_RCSUNIT, 0x80008000 /* execlist enable */);
+   register_write_out(aub, GFX_MODE_VCSUNIT0, 0x80008000 /* execlist enable */);
+   register_write_out(aub, GFX_MODE_BCSUNIT, 0x80008000 /* execlist enable */);
+}
+
+/* Emit the legacy AUB version/header packet followed by the GTT entries. */
+static void write_legacy_header(struct aub_file *aub, const char *name)
+{
+   char app_name[8 * 4];
+   char comment[16] = {0};   /* zeroed: the dword padding after the text is written too */
+   int comment_len, comment_dwords, dwords;
+   uint32_t entry = 0x200003;
+
+   comment_len = snprintf(comment, sizeof(comment), "PCI-ID=0x%x", aub->pci_id);
+   comment_dwords = ((comment_len + 3) / 4);
+
+   /* Start with a (required) version packet. */
+   dwords = 13 + comment_dwords;
+   dword_out(aub, CMD_AUB_HEADER | (dwords - 2));
+   dword_out(aub, (4 << AUB_HEADER_MAJOR_SHIFT) | (0 << AUB_HEADER_MINOR_SHIFT));
+
+   /* Next comes a 32-byte application name; strncpy() zero-pads the field. */
+   strncpy(app_name, name, sizeof(app_name));
+   app_name[sizeof(app_name) - 1] = 0;
+   data_out(aub, app_name, sizeof(app_name));
+
+   dword_out(aub, 0); /* timestamp */
+   dword_out(aub, 0); /* timestamp */
+   dword_out(aub, comment_len);
+   data_out(aub, comment, comment_dwords * 4);
+
+   /* Set up the GTT. The max we can handle is 64M */
+   dword_out(aub, CMD_AUB_TRACE_HEADER_BLOCK |
+                  ((aub->addr_bits > 32 ? 6 : 5) - 2));
+   dword_out(aub, AUB_TRACE_MEMTYPE_GTT_ENTRY |
+                  AUB_TRACE_TYPE_NOTYPE | AUB_TRACE_OP_DATA_WRITE);
+   dword_out(aub, 0); /* subtype */
+   dword_out(aub, 0); /* offset */
+   dword_out(aub, aub_gtt_size(aub)); /* size */
+   if (aub->addr_bits > 32)
+      dword_out(aub, 0);
+   for (uint32_t i = 0; i < NUM_PT_ENTRIES; i++) {
+      dword_out(aub, entry + 0x1000 * i);
+      if (aub->addr_bits > 32)
+         dword_out(aub, 0);
+   }
+}
+
+void
+aub_write_header(struct aub_file *aub, const char *app_name)
+{
+   void (*emit_header)(struct aub_file *, const char *) =
+      aub_use_execlists(aub) ? write_execlists_header : write_legacy_header;
+
+   emit_header(aub, app_name);   /* exactly one of the two header formats */
+}
+
+/**
+ * Write `size` bytes at `gtt_offset` in chunks (at most 32 KiB, or 4 KiB with
+ * execlists) so a large object, e.g. a 128kb VBO, cannot overflow the 16 bits
+ * of size field in the packet header.  A NULL `virtual` writes zeroes instead.
+ */
+void
+aub_write_trace_block(struct aub_file *aub,
+                      uint32_t type, void *virtual,
+                      uint32_t size, uint64_t gtt_offset)
+{
+   uint32_t block_size;
+   uint32_t subtype = 0;
+   static const char null_block[8 * 4096];
+
+   for (uint32_t offset = 0; offset < size; offset += block_size) {
+      block_size = min(8 * 4096, size - offset);
+      /* NOTE(review): each execlists chunk is translated on its own; presumably gtt_offset is page aligned - confirm. */
+      if (aub_use_execlists(aub)) {
+         block_size = min(4096, block_size);
+         mem_trace_memory_write_header_out(aub,
+                                           ppgtt_lookup(aub, gtt_offset + offset),
+                                           block_size,
+                                           AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_PHYSICAL,
+                                           "legacy");
+      } else {
+         dword_out(aub, CMD_AUB_TRACE_HEADER_BLOCK |
+                        ((aub->addr_bits > 32 ? 6 : 5) - 2));
+         dword_out(aub, AUB_TRACE_MEMTYPE_GTT |
+                        type | AUB_TRACE_OP_DATA_WRITE);
+         dword_out(aub, subtype);
+         dword_out(aub, gtt_offset + offset);
+         dword_out(aub, align_u32(block_size, 4));
+         if (aub->addr_bits > 32)
+            dword_out(aub, (gtt_offset + offset) >> 32);
+      }
+
+      if (virtual)
+         data_out(aub, ((char *) virtual) + offset, block_size);
+      else
+         data_out(aub, null_block, block_size);
+
+      /* Pad to a multiple of 4 bytes. */
+      data_out(aub, null_block, -block_size & 3);
+   }
+}
+
+static void
+aub_dump_execlist(struct aub_file *aub, uint64_t batch_offset, int ring_flag)
+{
+   uint32_t ring_addr;
+   uint64_t descriptor;
+   uint32_t elsp_reg;
+   uint32_t elsq_reg;
+   uint32_t status_reg;
+   uint32_t control_reg;
+   /* Pick the ring address, context descriptor and registers of the requested engine. */
+   switch (ring_flag) {
+   case I915_EXEC_DEFAULT:
+   case I915_EXEC_RENDER:
+      ring_addr = RENDER_RING_ADDR;
+      descriptor = RENDER_CONTEXT_DESCRIPTOR;
+      elsp_reg = EXECLIST_SUBMITPORT_RCSUNIT;
+      elsq_reg = EXECLIST_SQ_CONTENTS0_RCSUNIT;
+      status_reg = EXECLIST_STATUS_RCSUNIT;
+      control_reg = EXECLIST_CONTROL_RCSUNIT;
+      break;
+   case I915_EXEC_BSD:
+      ring_addr = VIDEO_RING_ADDR;
+      descriptor = VIDEO_CONTEXT_DESCRIPTOR;
+      elsp_reg = EXECLIST_SUBMITPORT_VCSUNIT0;
+      elsq_reg = EXECLIST_SQ_CONTENTS0_VCSUNIT0;
+      status_reg = EXECLIST_STATUS_VCSUNIT0;
+      control_reg = EXECLIST_CONTROL_VCSUNIT0;
+      break;
+   case I915_EXEC_BLT:
+      ring_addr = BLITTER_RING_ADDR;
+      descriptor = BLITTER_CONTEXT_DESCRIPTOR;
+      elsp_reg = EXECLIST_SUBMITPORT_BCSUNIT;
+      elsq_reg = EXECLIST_SQ_CONTENTS0_BCSUNIT;
+      status_reg = EXECLIST_STATUS_BCSUNIT;
+      control_reg = EXECLIST_CONTROL_BCSUNIT;
+      break;
+   default:
+      unreachable("unknown ring");
+   }
+
+   mem_trace_memory_write_header_out(aub, ring_addr, 16,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "RING MI_BATCH_BUFFER_START user");
+   dword_out(aub, AUB_MI_BATCH_BUFFER_START | MI_BATCH_NON_SECURE_I965 | (3 - 2));
+   dword_out(aub, batch_offset & 0xFFFFFFFF);
+   dword_out(aub, batch_offset >> 32);
+   dword_out(aub, 0 /* MI_NOOP */);
+   /* Reset the ring head and set the tail to the 16 bytes written above. */
+   mem_trace_memory_write_header_out(aub, ring_addr + 8192 + 20, 4,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "RING BUFFER HEAD");
+   dword_out(aub, 0); /* RING_BUFFER_HEAD */
+   mem_trace_memory_write_header_out(aub, ring_addr + 8192 + 28, 4,
+                                     AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT,
+                                     "RING BUFFER TAIL");
+   dword_out(aub, 16); /* RING_BUFFER_TAIL */
+   /* Submit the descriptor: submit queue + control register on gen11+, submit port before. */
+   if (aub->devinfo.gen >= 11) {
+      register_write_out(aub, elsq_reg, descriptor & 0xFFFFFFFF);
+      register_write_out(aub, elsq_reg + sizeof(uint32_t), descriptor >> 32);
+      register_write_out(aub, control_reg, 1);
+   } else {
+      register_write_out(aub, elsp_reg, 0);
+      register_write_out(aub, elsp_reg, 0);
+      register_write_out(aub, elsp_reg, descriptor >> 32);
+      register_write_out(aub, elsp_reg, descriptor & 0xFFFFFFFF);
+   }
+   /* Emit a register-poll packet on the engine's execlist status register. */
+   dword_out(aub, CMD_MEM_TRACE_REGISTER_POLL | (5 + 1 - 1));
+   dword_out(aub, status_reg);
+   dword_out(aub, AUB_MEM_TRACE_REGISTER_SIZE_DWORD |
+                  AUB_MEM_TRACE_REGISTER_SPACE_MMIO);
+   if (aub->devinfo.gen >= 11) {
+      dword_out(aub, 0x00000001);   /* mask lo */
+      dword_out(aub, 0x00000000);   /* mask hi */
+      dword_out(aub, 0x00000001);
+   } else {
+      dword_out(aub, 0x00000010);   /* mask lo */
+      dword_out(aub, 0x00000000);   /* mask hi */
+      dword_out(aub, 0x00000000);
+   }
+}
+
+static void
+aub_dump_ringbuffer(struct aub_file *aub, uint64_t batch_offset,
+                    uint64_t offset, int ring_flag)
+{
+   uint32_t ringbuffer[4096];
+   unsigned aub_mi_bbs_len;
+   int ring = AUB_TRACE_TYPE_RING_PRB0; /* The default ring */
+   int ring_count = 0;
+   /* Map the execbuf ring flag onto the AUB ring type; anything else keeps the default. */
+   if (ring_flag == I915_EXEC_BSD)
+      ring = AUB_TRACE_TYPE_RING_PRB1;
+   else if (ring_flag == I915_EXEC_BLT)
+      ring = AUB_TRACE_TYPE_RING_PRB2;
+
+   /* Make a ring buffer to execute our batchbuffer. */
+   memset(ringbuffer, 0, sizeof(ringbuffer));
+
+   aub_mi_bbs_len = aub->addr_bits > 32 ? 3 : 2;
+   ringbuffer[ring_count] = AUB_MI_BATCH_BUFFER_START | (aub_mi_bbs_len - 2);
+   aub_write_reloc(&aub->devinfo, &ringbuffer[ring_count + 1], batch_offset);
+   ring_count += aub_mi_bbs_len;
+
+   /* Write out the ring.  This appears to trigger execution of
+    * the ring in the simulator.
+    */
+   dword_out(aub, CMD_AUB_TRACE_HEADER_BLOCK |
+                  ((aub->addr_bits > 32 ? 6 : 5) - 2));
+   dword_out(aub, AUB_TRACE_MEMTYPE_GTT | ring | AUB_TRACE_OP_COMMAND_WRITE);
+   dword_out(aub, 0); /* general/surface subtype */
+   dword_out(aub, offset);
+   dword_out(aub, ring_count * 4);
+   if (aub->addr_bits > 32)
+      dword_out(aub, offset >> 32);
+
+   data_out(aub, ringbuffer, ring_count * 4);
+}
+
+void
+aub_write_exec(struct aub_file *aub, uint64_t batch_addr,
+               uint64_t offset, int ring_flag)
+{
+   /* `offset` is only consumed by the legacy ring-buffer path. */
+   if (aub_use_execlists(aub))
+      aub_dump_execlist(aub, batch_addr, ring_flag);
+   else
+      aub_dump_ringbuffer(aub, batch_addr, offset, ring_flag);
+
+   fflush(aub->file);   /* push the submission out to the file right away */
+}
diff --git a/src/intel/tools/aub_write.h b/src/intel/tools/aub_write.h
new file mode 100644
index 0000000..2e42e3d
--- /dev/null
+++ b/src/intel/tools/aub_write.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2007-2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef INTEL_AUB_WRITE
+#define INTEL_AUB_WRITE
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "dev/gen_device_info.h"
+
+struct aub_ppgtt_table {
+   uint64_t phys_addr;
+   struct aub_ppgtt_table *subtables[512];
+};
+
+struct aub_file {
+   FILE *file;
+
+   /* Set if you want extra logging */
+   FILE *verbose_log_file;
+
+   uint16_t pci_id;
+   struct gen_device_info devinfo;
+
+   int addr_bits;
+
+   struct aub_ppgtt_table pml4;
+};
+
+void aub_file_init(struct aub_file *aub, FILE *file, uint16_t pci_id);
+void aub_file_finish(struct aub_file *aub);
+
+static inline bool aub_use_execlists(const struct aub_file *aub)
+{
+   return aub->devinfo.gen >= 8;
+}
+
+uint32_t aub_gtt_size(struct aub_file *aub);
+
+static inline void
+aub_write_reloc(const struct gen_device_info *devinfo, void *p, uint64_t v)
+{
+   if (devinfo->gen >= 8) {
+      /* From the Broadwell PRM Vol. 2a,
+       * MI_LOAD_REGISTER_MEM::MemoryAddress:
+       *
+       *   "This field specifies the address of the memory
+       *   location where the register value specified in the
+       *   DWord above will read from.  The address specifies
+       *   the DWord location of the data. Range =
+       *   GraphicsVirtualAddress[63:2] for a DWord register
+       *   GraphicsAddress [63:48] are ignored by the HW and
+       *   assumed to be in correct canonical form [63:48] ==
+       *   [47]."
+       *
+       * In practice, this will always mean the top bits are zero
+       * because of the GTT size limitation of the aubdump tool.
+       */
+      const int shift = 63 - 47; /* copy bit 47 into bits 63:48 */
+      *(uint64_t *)p = (uint64_t)((int64_t)(v << shift) >> shift);
+   } else {
+      *(uint32_t *)p = v; /* only the low 32 bits are stored */
+   }
+}
+
+void aub_write_header(struct aub_file *aub, const char *app_name);
+void aub_map_ppgtt(struct aub_file *aub, uint64_t start, uint64_t size);
+void aub_write_trace_block(struct aub_file *aub,
+                           uint32_t type, void *virtual,
+                           uint32_t size, uint64_t gtt_offset);
+void aub_write_exec(struct aub_file *aub, uint64_t batch_addr,
+                    uint64_t offset, int ring_flag);
+
+#endif /* INTEL_AUB_WRITE */
diff --git a/src/intel/tools/aubinator.c b/src/intel/tools/aubinator.c
index 2a72efa..cd5d6d2 100644
--- a/src/intel/tools/aubinator.c
+++ b/src/intel/tools/aubinator.c
@@ -37,11 +37,24 @@
 #include <sys/wait.h>
 #include <sys/mman.h>
 
+#include "util/list.h"
 #include "util/macros.h"
+#include "util/rb_tree.h"
 
 #include "common/gen_decoder.h"
+#include "common/gen_disasm.h"
+#include "common/gen_gem.h"
 #include "intel_aub.h"
-#include "gen_disasm.h"
+
+#ifndef HAVE_MEMFD_CREATE
+#include <sys/syscall.h>
+
+static inline int
+memfd_create(const char *name, unsigned int flags)
+{
+   return syscall(SYS_memfd_create, name, flags);
+}
+#endif
 
 /* Below is the only command missing from intel_aub.h in libdrm
  * So, reuse intel_aub.h from libdrm and #define the
@@ -56,8 +69,9 @@
 
 /* options */
 
-static bool option_full_decode = true;
-static bool option_print_offsets = true;
+static int option_full_decode = true;
+static int option_print_offsets = true;
+static int max_vbo_lines = -1;
 static enum { COLOR_AUTO, COLOR_ALWAYS, COLOR_NEVER } option_color;
 
 /* state */
@@ -67,32 +81,312 @@
 struct gen_device_info devinfo;
 struct gen_batch_decode_ctx batch_ctx;
 
-uint64_t gtt_size, gtt_end;
-void *gtt;
-uint64_t general_state_base;
-uint64_t surface_state_base;
-uint64_t dynamic_state_base;
-uint64_t instruction_base;
-uint64_t instruction_bound;
+struct bo_map {
+   struct list_head link;
+   struct gen_batch_decode_bo bo;
+   bool unmap_after_use;
+};
+
+struct ggtt_entry {
+   struct rb_node node;
+   uint64_t virt_addr;
+   uint64_t phys_addr;
+};
+
+struct phys_mem {
+   struct rb_node node;
+   uint64_t fd_offset;
+   uint64_t phys_addr;
+   uint8_t *data;
+};
+
+static struct list_head maps;
+static struct rb_tree ggtt = {NULL};
+static struct rb_tree mem = {NULL};
+int mem_fd = -1;
+off_t mem_fd_len = 0;
 
 FILE *outfile;
 
-static inline uint32_t
-field(uint32_t value, int start, int end)
-{
-   uint32_t mask;
-
-   mask = ~0U >> (31 - end + start);
-
-   return (value >> start) & mask;
-}
-
 struct brw_instruction;
 
-static inline int
-valid_offset(uint32_t offset)
+static void
+add_gtt_bo_map(struct gen_batch_decode_bo bo, bool unmap_after_use)
 {
-   return offset < gtt_end;
+   struct bo_map *m = calloc(1, sizeof(*m));
+
+   m->bo = bo;
+   m->unmap_after_use = unmap_after_use;
+   list_add(&m->link, &maps);
+}
+
+static void
+clear_bo_maps(void)
+{
+   list_for_each_entry_safe(struct bo_map, i, &maps, link) {
+      if (i->unmap_after_use)
+         munmap((void *)i->bo.map, i->bo.size);
+      list_del(&i->link);
+      free(i);
+   }
+}
+
+static inline struct ggtt_entry *
+ggtt_entry_next(struct ggtt_entry *entry)
+{
+   if (!entry)
+      return NULL;
+   struct rb_node *node = rb_node_next(&entry->node);
+   if (!node)
+      return NULL;
+   return rb_node_data(struct ggtt_entry, node, node);
+}
+
+static inline int
+cmp_uint64(uint64_t a, uint64_t b)
+{
+   if (a < b)
+      return -1;
+   if (a > b)
+      return 1;
+   return 0;
+}
+
+static inline int
+cmp_ggtt_entry(const struct rb_node *node, const void *addr)
+{
+   struct ggtt_entry *entry = rb_node_data(struct ggtt_entry, node, node);
+   return cmp_uint64(entry->virt_addr, *(const uint64_t *)addr);
+}
+
+static struct ggtt_entry *
+ensure_ggtt_entry(struct rb_tree *tree, uint64_t virt_addr)
+{
+   /* Find virt_addr in tree; insert a new calloc'ed entry when missing. */
+   struct rb_node *node = rb_tree_search_sloppy(tree, &virt_addr, cmp_ggtt_entry);
+   int cmp = 0;
+   if (!node || (cmp = cmp_ggtt_entry(node, &virt_addr))) {
+      struct ggtt_entry *new_entry = calloc(1, sizeof(*new_entry));
+      new_entry->virt_addr = virt_addr;
+      rb_tree_insert_at(tree, node, &new_entry->node, cmp > 0);
+      node = &new_entry->node;
+   }
+
+   return rb_node_data(struct ggtt_entry, node, node);
+}
+
+static struct ggtt_entry *
+search_ggtt_entry(uint64_t virt_addr)
+{
+   virt_addr &= ~0xfff;
+
+   struct rb_node *node = rb_tree_search(&ggtt, &virt_addr, cmp_ggtt_entry);
+
+   if (!node)
+      return NULL;
+
+   return rb_node_data(struct ggtt_entry, node, node);
+}
+
+static inline int
+cmp_phys_mem(const struct rb_node *node, const void *addr)
+{
+   struct phys_mem *page = rb_node_data(struct phys_mem, node, node);
+   return cmp_uint64(page->phys_addr, *(const uint64_t *)addr);
+}
+
+static struct phys_mem *
+ensure_phys_mem(uint64_t phys_addr)
+{
+   struct rb_node *node = rb_tree_search_sloppy(&mem, &phys_addr, cmp_phys_mem);
+   int cmp = 0;
+   if (!node || (cmp = cmp_phys_mem(node, &phys_addr))) {
+      struct phys_mem *new_mem = calloc(1, sizeof(*new_mem));
+      new_mem->phys_addr = phys_addr;
+      new_mem->fd_offset = mem_fd_len;
+
+      int ftruncate_res = ftruncate(mem_fd, mem_fd_len += 4096);
+      assert(ftruncate_res == 0);
+
+      new_mem->data = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
+                           mem_fd, new_mem->fd_offset);
+      assert(new_mem->data != MAP_FAILED);
+
+      rb_tree_insert_at(&mem, node, &new_mem->node, cmp > 0);
+      node = &new_mem->node;
+   }
+
+   return rb_node_data(struct phys_mem, node, node);
+}
+
+static struct phys_mem *
+search_phys_mem(uint64_t phys_addr)
+{
+   phys_addr &= ~0xfff;
+
+   struct rb_node *node = rb_tree_search(&mem, &phys_addr, cmp_phys_mem);
+
+   if (!node)
+      return NULL;
+
+   return rb_node_data(struct phys_mem, node, node);
+}
+
+static void
+handle_ggtt_entry_write(uint64_t address, const void *_data, uint32_t _size)
+{
+   uint64_t virt_addr = (address / sizeof(uint64_t)) << 12;
+   const uint64_t *data = _data;
+   size_t size = _size / sizeof(*data);
+   for (const uint64_t *entry = data;
+        entry < data + size;
+        entry++, virt_addr += 4096) {
+      struct ggtt_entry *pt = ensure_ggtt_entry(&ggtt, virt_addr);
+      pt->phys_addr = *entry;
+   }
+}
+
+static void
+handle_physical_write(uint64_t phys_address, const void *data, uint32_t size)
+{
+   uint32_t to_write = size;
+   for (uint64_t page = phys_address & ~0xfff; page < phys_address + size; page += 4096) {
+      struct phys_mem *mem = ensure_phys_mem(page);
+      uint64_t offset = MAX2(page, phys_address) - page;
+      uint32_t size_this_page = MIN2(to_write, 4096 - offset);
+      to_write -= size_this_page;
+      memcpy(mem->data + offset, data, size_this_page);
+      data = (const uint8_t *)data + size_this_page;
+   }
+}
+
+static void
+handle_ggtt_write(uint64_t virt_address, const void *data, uint32_t size)
+{
+   uint32_t to_write = size;
+   for (uint64_t page = virt_address & ~0xfff; page < virt_address + size; page += 4096) {
+      struct ggtt_entry *entry = search_ggtt_entry(page);
+      assert(entry && entry->phys_addr & 0x1);
+
+      uint64_t offset = MAX2(page, virt_address) - page;
+      uint32_t size_this_page = MIN2(to_write, 4096 - offset);
+      to_write -= size_this_page;
+
+      uint64_t phys_page = entry->phys_addr & ~0xfff; /* Clear the validity bits. */
+      handle_physical_write(phys_page + offset, data, size_this_page);
+      data = (const uint8_t *)data + size_this_page;
+   }
+}
+
+static struct gen_batch_decode_bo
+get_ggtt_batch_bo(void *user_data, uint64_t address)
+{
+   struct gen_batch_decode_bo bo = {0};
+
+   list_for_each_entry(struct bo_map, i, &maps, link)
+      if (i->bo.addr <= address && i->bo.addr + i->bo.size > address)
+         return i->bo;
+
+   address &= ~0xfff;
+
+   struct ggtt_entry *start =
+      (struct ggtt_entry *)rb_tree_search_sloppy(&ggtt, &address,
+                                                 cmp_ggtt_entry);
+   if (start && start->virt_addr < address)
+      start = ggtt_entry_next(start);
+   if (!start)
+      return bo;
+
+   struct ggtt_entry *last = start;
+   for (struct ggtt_entry *i = ggtt_entry_next(last);
+        i && last->virt_addr + 4096 == i->virt_addr;
+        last = i, i = ggtt_entry_next(last))
+      ;
+
+   bo.addr = MIN2(address, start->virt_addr);
+   bo.size = last->virt_addr - bo.addr + 4096;
+   bo.map = mmap(NULL, bo.size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+   assert(bo.map != MAP_FAILED);
+
+   for (struct ggtt_entry *i = start;
+        i;
+        i = i == last ? NULL : ggtt_entry_next(i)) {
+      uint64_t phys_addr = i->phys_addr & ~0xfff;
+      struct phys_mem *phys_mem = search_phys_mem(phys_addr);
+
+      if (!phys_mem)
+         continue;
+
+      uint32_t map_offset = i->virt_addr - address;
+      void *res = mmap((uint8_t *)bo.map + map_offset, 4096, PROT_READ,
+                       MAP_SHARED | MAP_FIXED, mem_fd, phys_mem->fd_offset);
+      assert(res != MAP_FAILED);
+   }
+
+   add_gtt_bo_map(bo, true);
+
+   return bo;
+}
+
+static struct phys_mem *
+ppgtt_walk(uint64_t pml4, uint64_t address)
+{
+   uint64_t shift = 39;
+   uint64_t addr = pml4;
+   for (int level = 4; level > 0; level--) {
+      struct phys_mem *table = search_phys_mem(addr);
+      if (!table)
+         return NULL;
+      int index = (address >> shift) & 0x1ff;
+      uint64_t entry = ((uint64_t *)table->data)[index];
+      if (!(entry & 1))
+         return NULL;
+      addr = entry & ~0xfff;
+      shift -= 9;
+   }
+   return search_phys_mem(addr);
+}
+
+static bool
+ppgtt_mapped(uint64_t pml4, uint64_t address)
+{
+   return ppgtt_walk(pml4, address) != NULL;
+}
+
+static struct gen_batch_decode_bo
+get_ppgtt_batch_bo(void *user_data, uint64_t address)
+{
+   struct gen_batch_decode_bo bo = {0};
+   uint64_t pml4 = *(uint64_t *)user_data;
+
+   address &= ~0xfff;
+
+   if (!ppgtt_mapped(pml4, address))
+      return bo;
+
+   /* Map everything until the first gap since we don't know how much the
+    * decoder actually needs.
+    */
+   uint64_t end = address;
+   while (ppgtt_mapped(pml4, end))
+      end += 4096;
+
+   bo.addr = address;
+   bo.size = end - address;
+   bo.map = mmap(NULL, bo.size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+   assert(bo.map != MAP_FAILED);
+
+   for (uint64_t page = address; page < end; page += 4096) {
+      struct phys_mem *phys_mem = ppgtt_walk(pml4, page);
+
+      void *res = mmap((uint8_t *)bo.map + (page - bo.addr), 4096, PROT_READ,
+                       MAP_SHARED | MAP_FIXED, mem_fd, phys_mem->fd_offset);
+      assert(res != MAP_FAILED);
+   }
+
+   add_gtt_bo_map(bo, true);
+
+   return bo;
 }
 
 #define GEN_ENGINE_RENDER 1
@@ -104,26 +398,23 @@
    int operation = p[1] & AUB_TRACE_OPERATION_MASK;
    int type = p[1] & AUB_TRACE_TYPE_MASK;
    int address_space = p[1] & AUB_TRACE_ADDRESS_SPACE_MASK;
-   uint64_t offset = p[3];
-   uint32_t size = p[4];
    int header_length = p[0] & 0xffff;
-   uint32_t *data = p + header_length + 2;
    int engine = GEN_ENGINE_RENDER;
-
-   if (devinfo.gen >= 8)
-      offset += (uint64_t) p[5] << 32;
+   struct gen_batch_decode_bo bo = {
+      .map = p + header_length + 2,
+      /* Addresses written by aubdump here are in canonical form but the batch
+       * decoder always gives us addresses with the top 16bits zeroed, so do
+       * the same here.
+       */
+      .addr = gen_48b_address((devinfo.gen >= 8 ? ((uint64_t) p[5] << 32) : 0) |
+                              ((uint64_t) p[3])),
+      .size = p[4],
+   };
 
    switch (operation) {
    case AUB_TRACE_OP_DATA_WRITE:
-      if (address_space != AUB_TRACE_MEMTYPE_GTT)
-         break;
-      if (gtt_size < offset + size) {
-         fprintf(stderr, "overflow gtt space: %s\n", strerror(errno));
-         exit(EXIT_FAILURE);
-      }
-      memcpy((char *) gtt + offset, data, size);
-      if (gtt_end < offset + size)
-         gtt_end = offset + size;
+      if (address_space == AUB_TRACE_MEMTYPE_GTT)
+         add_gtt_bo_map(bo, false);
       break;
    case AUB_TRACE_OP_COMMAND_WRITE:
       switch (type) {
@@ -139,27 +430,14 @@
       }
 
       (void)engine; /* TODO */
-      gen_print_batch(&batch_ctx, data, size, 0);
+      batch_ctx.get_bo = get_ggtt_batch_bo;
+      gen_print_batch(&batch_ctx, bo.map, bo.size, 0);
 
-      gtt_end = 0;
+      clear_bo_maps();
       break;
    }
 }
 
-static struct gen_batch_decode_bo
-get_gen_batch_bo(void *user_data, uint64_t address)
-{
-   if (address > gtt_end)
-      return (struct gen_batch_decode_bo) { .map = NULL };
-
-   /* We really only have one giant address range */
-   return (struct gen_batch_decode_bo) {
-      .addr = 0,
-      .map = gtt,
-      .size = gtt_size
-   };
-}
-
 static void
 aubinator_init(uint16_t aub_pci_id, const char *app_name)
 {
@@ -178,7 +456,8 @@
    batch_flags |= GEN_BATCH_DECODE_FLOATS;
 
    gen_batch_decode_ctx_init(&batch_ctx, &devinfo, outfile, batch_flags,
-                             xml_path, get_gen_batch_bo, NULL);
+                             xml_path, NULL, NULL, NULL);
+   batch_ctx.max_vbo_decoded_lines = max_vbo_lines;
 
    char *color = GREEN_HEADER, *reset_color = NORMAL;
    if (option_color == COLOR_NEVER)
@@ -243,73 +522,128 @@
 static void
 handle_memtrace_reg_write(uint32_t *p)
 {
+   static struct execlist_regs {
+      uint32_t render_elsp[4];
+      int render_elsp_index;
+      uint32_t blitter_elsp[4];
+      int blitter_elsp_index;
+   } state = {};
+
    uint32_t offset = p[1];
    uint32_t value = p[5];
+
    int engine;
-   static int render_elsp_writes = 0;
-   static int blitter_elsp_writes = 0;
-   static int render_elsq0 = 0;
-   static int blitter_elsq0 = 0;
-   uint8_t *pphwsp;
+   uint64_t context_descriptor;
 
-   if (offset == 0x2230) {
-      render_elsp_writes++;
+   switch (offset) {
+   case 0x2230: /* render elsp */
+      state.render_elsp[state.render_elsp_index++] = value;
+      if (state.render_elsp_index < 4)
+         return;
+
+      state.render_elsp_index = 0;
       engine = GEN_ENGINE_RENDER;
-   } else if (offset == 0x22230) {
-      blitter_elsp_writes++;
+      context_descriptor = (uint64_t)state.render_elsp[2] << 32 |
+         state.render_elsp[3];
+      break;
+   case 0x22230: /* blitter elsp */
+      state.blitter_elsp[state.blitter_elsp_index++] = value;
+      if (state.blitter_elsp_index < 4)
+         return;
+
+      state.blitter_elsp_index = 0;
       engine = GEN_ENGINE_BLITTER;
-   } else if (offset == 0x2510) {
-      render_elsq0 = value;
-   } else if (offset == 0x22510) {
-      blitter_elsq0 = value;
-   } else if (offset == 0x2550 || offset == 0x22550) {
-      /* nothing */;
-   } else {
+      context_descriptor = (uint64_t)state.blitter_elsp[2] << 32 |
+         state.blitter_elsp[3];
+      break;
+   case 0x2510: /* render elsq0 lo */
+      state.render_elsp[3] = value;
       return;
-   }
-
-   if (render_elsp_writes > 3 || blitter_elsp_writes > 3) {
-      render_elsp_writes = blitter_elsp_writes = 0;
-      pphwsp = (uint8_t*)gtt + (value & 0xfffff000);
-   } else if (offset == 0x2550) {
+      break;
+   case 0x2514: /* render elsq0 hi */
+      state.render_elsp[2] = value;
+      return;
+      break;
+   case 0x22510: /* blitter elsq0 lo */
+      state.blitter_elsp[3] = value;
+      return;
+      break;
+   case 0x22514: /* blitter elsq0 hi */
+      state.blitter_elsp[2] = value;
+      return;
+      break;
+   case 0x2550: /* render elsc */
       engine = GEN_ENGINE_RENDER;
-      pphwsp = (uint8_t*)gtt + (render_elsq0 & 0xfffff000);
-   } else if (offset == 0x22550) {
+      context_descriptor = (uint64_t)state.render_elsp[2] << 32 |
+         state.render_elsp[3];
+      break;
+   case 0x22550: /* blitter elsc */
       engine = GEN_ENGINE_BLITTER;
-      pphwsp = (uint8_t*)gtt + (blitter_elsq0 & 0xfffff000);
-   } else {
+      context_descriptor = (uint64_t)state.blitter_elsp[2] << 32 |
+         state.blitter_elsp[3];
+      break;
+   default:
       return;
    }
 
    const uint32_t pphwsp_size = 4096;
-   uint32_t *context = (uint32_t*)(pphwsp + pphwsp_size);
+   uint32_t pphwsp_addr = context_descriptor & 0xfffff000;
+   struct gen_batch_decode_bo pphwsp_bo = get_ggtt_batch_bo(NULL, pphwsp_addr);
+   assert(pphwsp_bo.size > 0); /* an empty bo has a NULL map */
+   uint32_t *context = (uint32_t *)((uint8_t *)pphwsp_bo.map +
+                                    (pphwsp_addr - pphwsp_bo.addr) +
+                                    pphwsp_size);
    uint32_t ring_buffer_head = context[5];
    uint32_t ring_buffer_tail = context[7];
    uint32_t ring_buffer_start = context[9];
-   uint32_t *commands = (uint32_t*)((uint8_t*)gtt + ring_buffer_start + ring_buffer_head);
+   uint64_t pml4 = (uint64_t)context[49] << 32 | context[51];
+
+   struct gen_batch_decode_bo ring_bo = get_ggtt_batch_bo(NULL,
+                                                          ring_buffer_start);
+   assert(ring_bo.size > 0);
+   void *commands = (uint8_t *)ring_bo.map + (ring_buffer_start - ring_bo.addr);
+
+   if (context_descriptor & 0x100 /* ppgtt */) {
+      batch_ctx.get_bo = get_ppgtt_batch_bo;
+      batch_ctx.user_data = &pml4;
+   } else {
+      batch_ctx.get_bo = get_ggtt_batch_bo;
+   }
+
    (void)engine; /* TODO */
-   gen_print_batch(&batch_ctx, commands, ring_buffer_tail - ring_buffer_head, 0);
+   gen_print_batch(&batch_ctx, commands, ring_buffer_tail - ring_buffer_head,
+                   0);
+   clear_bo_maps();
 }
 
 static void
 handle_memtrace_mem_write(uint32_t *p)
 {
-   uint64_t address = *(uint64_t*)&p[1];
+   struct gen_batch_decode_bo bo = {
+      .map = p + 5,
+      /* Addresses written by aubdump here are in canonical form but the batch
+       * decoder always gives us addresses with the top 16bits zeroed, so do
+       * the same here.
+       */
+      .addr = gen_48b_address(*(uint64_t*)&p[1]),
+      .size = p[4],
+   };
    uint32_t address_space = p[3] >> 28;
-   uint32_t size = p[4];
-   uint32_t *data = p + 5;
 
-   if (address_space != 1)
-      return;
-
-   if (gtt_size < address + size) {
-      fprintf(stderr, "overflow gtt space: %s\n", strerror(errno));
-      exit(EXIT_FAILURE);
+   switch (address_space) {
+   case 0: /* GGTT */
+      handle_ggtt_write(bo.addr, bo.map, bo.size);
+      break;
+   case 1: /* Local */
+      add_gtt_bo_map(bo, false);
+      break;
+   case 2: /* Physical */
+      handle_physical_write(bo.addr, bo.map, bo.size);
+      break;
+   case 4: /* GGTT Entry */
+      handle_ggtt_entry_write(bo.addr, bo.map, bo.size);
+      break;
    }
-
-   memcpy((char *) gtt + address, data, size);
-   if (gtt_end < address + size)
-      gtt_end = address + size;
 }
 
 struct aub_file {
@@ -353,17 +687,6 @@
    return file;
 }
 
-static struct aub_file *
-aub_file_stdin(void)
-{
-   struct aub_file *file;
-
-   file = calloc(1, sizeof *file);
-   file->stream = stdin;
-
-   return file;
-}
-
 #define TYPE(dw)       (((dw) >> 29) & 7)
 #define OPCODE(dw)     (((dw) >> 23) & 0x3f)
 #define SUBOPCODE(dw)  (((dw) >> 16) & 0x7f)
@@ -401,8 +724,7 @@
    uint32_t *p, h, *new_cursor;
    int header_length, bias;
 
-   if (file->end - file->cursor < 1)
-      return AUB_ITEM_DECODE_NEED_MORE_DATA;
+   assert(file->cursor < file->end);
 
    p = file->cursor;
    h = *p;
@@ -424,13 +746,11 @@
 
    new_cursor = p + header_length + bias;
    if ((h & 0xffff0000) == MAKE_HEADER(TYPE_AUB, OPCODE_AUB, SUBOPCODE_BLOCK)) {
-      if (file->end - file->cursor < 4)
-         return AUB_ITEM_DECODE_NEED_MORE_DATA;
+      assert(file->end - file->cursor >= 4);
       new_cursor += p[4] / 4;
    }
 
-   if (new_cursor > file->end)
-      return AUB_ITEM_DECODE_NEED_MORE_DATA;
+   assert(new_cursor <= file->end);
 
    switch (h & 0xffff0000) {
    case MAKE_HEADER(TYPE_AUB, OPCODE_AUB, SUBOPCODE_HEADER):
@@ -471,48 +791,6 @@
    return file->cursor < file->end || (file->stream && !feof(file->stream));
 }
 
-#define AUB_READ_BUFFER_SIZE (4096)
-#define MAX(a, b) ((a) < (b) ? (b) : (a))
-
-static void
-aub_file_data_grow(struct aub_file *file)
-{
-   size_t old_size = (file->mem_end - file->map) * 4;
-   size_t new_size = MAX(old_size * 2, AUB_READ_BUFFER_SIZE);
-   uint32_t *new_start = realloc(file->map, new_size);
-
-   file->cursor = new_start + (file->cursor - file->map);
-   file->end = new_start + (file->end - file->map);
-   file->map = new_start;
-   file->mem_end = file->map + (new_size / 4);
-}
-
-static bool
-aub_file_data_load(struct aub_file *file)
-{
-   size_t r;
-
-   if (file->stream == NULL)
-      return false;
-
-   /* First remove any consumed data */
-   if (file->cursor > file->map) {
-      memmove(file->map, file->cursor,
-              (file->end - file->cursor) * 4);
-      file->end -= file->cursor - file->map;
-      file->cursor = file->map;
-   }
-
-   /* Then load some new data in */
-   if ((file->mem_end - file->end) < (AUB_READ_BUFFER_SIZE / 4))
-      aub_file_data_grow(file);
-
-   r = fread(file->end, 1, (file->mem_end - file->end) * 4, file->stream);
-   file->end += r / 4;
-
-   return r != 0;
-}
-
 static void
 setup_pager(void)
 {
@@ -544,17 +822,17 @@
 print_help(const char *progname, FILE *file)
 {
    fprintf(file,
-           "Usage: %s [OPTION]... [FILE]\n"
-           "Decode aub file contents from either FILE or the standard input.\n\n"
-           "A valid --gen option must be provided.\n\n"
-           "      --help          display this help and exit\n"
-           "      --gen=platform  decode for given platform (3 letter platform name)\n"
-           "      --headers       decode only command headers\n"
-           "      --color[=WHEN]  colorize the output; WHEN can be 'auto' (default\n"
-           "                        if omitted), 'always', or 'never'\n"
-           "      --no-pager      don't launch pager\n"
-           "      --no-offsets    don't print instruction offsets\n"
-           "      --xml=DIR       load hardware xml description from directory DIR\n",
+           "Usage: %s [OPTION]... FILE\n"
+           "Decode aub file contents from FILE.\n\n"
+           "      --help             display this help and exit\n"
+           "      --gen=platform     decode for given platform (3 letter platform name)\n"
+           "      --headers          decode only command headers\n"
+           "      --color[=WHEN]     colorize the output; WHEN can be 'auto' (default\n"
+           "                         if omitted), 'always', or 'never'\n"
+           "      --max-vbo-lines=N  limit the number of decoded VBO lines\n"
+           "      --no-pager         don't launch pager\n"
+           "      --no-offsets       don't print instruction offsets\n"
+           "      --xml=DIR          load hardware xml description from directory DIR\n",
            progname);
 }
 
@@ -564,14 +842,15 @@
    int c, i;
    bool help = false, pager = true;
    const struct option aubinator_opts[] = {
-      { "help",       no_argument,       (int *) &help,                 true },
-      { "no-pager",   no_argument,       (int *) &pager,                false },
-      { "no-offsets", no_argument,       (int *) &option_print_offsets, false },
-      { "gen",        required_argument, NULL,                          'g' },
-      { "headers",    no_argument,       (int *) &option_full_decode,   false },
-      { "color",      required_argument, NULL,                          'c' },
-      { "xml",        required_argument, NULL,                          'x' },
-      { NULL,         0,                 NULL,                          0 }
+      { "help",          no_argument,       (int *) &help,                 true },
+      { "no-pager",      no_argument,       (int *) &pager,                false },
+      { "no-offsets",    no_argument,       (int *) &option_print_offsets, false },
+      { "gen",           required_argument, NULL,                          'g' },
+      { "headers",       no_argument,       (int *) &option_full_decode,   false },
+      { "color",         required_argument, NULL,                          'c' },
+      { "xml",           required_argument, NULL,                          'x' },
+      { "max-vbo-lines", required_argument, NULL,                          'v' },
+      { NULL,            0,                 NULL,                          0 }
    };
 
    outfile = stdout;
@@ -582,8 +861,9 @@
       case 'g': {
          const int id = gen_device_name_to_pci_device_id(optarg);
          if (id < 0) {
-            fprintf(stderr, "can't parse gen: '%s', expected ivb, byt, hsw, "
-                                   "bdw, chv, skl, kbl or bxt\n", optarg);
+            fprintf(stderr, "can't parse gen: '%s', expected brw, g4x, ilk, "
+                            "snb, ivb, hsw, byt, bdw, chv, skl, bxt, kbl, "
+                            "aml, glk, cfl, whl, cnl, icl\n", optarg);
             exit(EXIT_FAILURE);
          } else {
             pci_id = id;
@@ -605,19 +885,22 @@
       case 'x':
          xml_path = strdup(optarg);
          break;
+      case 'v':
+         max_vbo_lines = atoi(optarg);
+         break;
       default:
          break;
       }
    }
 
-   if (help || argc == 1) {
+   if (optind < argc)
+      input_file = argv[optind];
+
+   if (help || !input_file) {
       print_help(argv[0], stderr);
       exit(0);
    }
 
-   if (optind < argc)
-      input_file = argv[optind];
-
    /* Do this before we redirect stdout to pager. */
    if (option_color == COLOR_AUTO)
       option_color = isatty(1) ? COLOR_ALWAYS : COLOR_NEVER;
@@ -625,40 +908,14 @@
    if (isatty(1) && pager)
       setup_pager();
 
-   if (input_file == NULL)
-      file = aub_file_stdin();
-   else
-      file = aub_file_open(input_file);
+   mem_fd = memfd_create("phys memory", 0);
 
-   /* mmap a terabyte for our gtt space. */
-   gtt_size = 1ull << 40;
-   gtt = mmap(NULL, gtt_size, PROT_READ | PROT_WRITE,
-              MAP_PRIVATE | MAP_ANONYMOUS |  MAP_NORESERVE, -1, 0);
-   if (gtt == MAP_FAILED) {
-      fprintf(stderr, "failed to alloc gtt space: %s\n", strerror(errno));
-      exit(EXIT_FAILURE);
-   }
+   list_inithead(&maps);
 
-   while (aub_file_more_stuff(file)) {
-      switch (aub_file_decode_batch(file)) {
-      case AUB_ITEM_DECODE_OK:
-         break;
-      case AUB_ITEM_DECODE_NEED_MORE_DATA:
-         if (!file->stream) {
-            file->cursor = file->end;
-            break;
-         }
-         if (aub_file_more_stuff(file) && !aub_file_data_load(file)) {
-            fprintf(stderr, "failed to load data from stdin\n");
-            exit(EXIT_FAILURE);
-         }
-         break;
-      default:
-         fprintf(stderr, "failed to parse aubdump data\n");
-         exit(EXIT_FAILURE);
-      }
-   }
+   file = aub_file_open(input_file);
 
+   while (aub_file_more_stuff(file) &&
+          aub_file_decode_batch(file) == AUB_ITEM_DECODE_OK);
 
    fflush(stdout);
    /* close the stdout which is opened to write the output */
diff --git a/src/intel/tools/aubinator_error_decode.c b/src/intel/tools/aubinator_error_decode.c
index 0234c59..4e3359b 100644
--- a/src/intel/tools/aubinator_error_decode.c
+++ b/src/intel/tools/aubinator_error_decode.c
@@ -295,7 +295,8 @@
    int count;
 };
 
-#define MAX_SECTIONS 30
+#define MAX_SECTIONS 256
+static unsigned num_sections;
 static struct section sections[MAX_SECTIONS];
 
 static int zlib_inflate(uint32_t **ptr, int len)
@@ -386,7 +387,7 @@
 static struct gen_batch_decode_bo
 get_gen_batch_bo(void *user_data, uint64_t address)
 {
-   for (int s = 0; s < MAX_SECTIONS; s++) {
+   for (int s = 0; s < num_sections; s++) {
       if (sections[s].gtt_offset <= address &&
           address < sections[s].gtt_offset + sections[s].count * 4) {
          return (struct gen_batch_decode_bo) {
@@ -411,7 +412,6 @@
    uint32_t offset, value;
    char *ring_name = NULL;
    struct gen_device_info devinfo;
-   int sect_num = 0;
 
    while (getline(&line, &line_size, file) > 0) {
       char *new_ring_name = NULL;
@@ -429,9 +429,10 @@
             fprintf(stderr, "ASCII85 decode failed.\n");
             exit(EXIT_FAILURE);
          }
-         sections[sect_num].data = data;
-         sections[sect_num].count = count;
-         sect_num++;
+         assert(num_sections < MAX_SECTIONS);
+         sections[num_sections].data = data;
+         sections[num_sections].count = count;
+         num_sections++;
          continue;
       }
 
@@ -465,13 +466,14 @@
                break;
          }
 
-         sections[sect_num].buffer_name = b->name;
-         sections[sect_num].ring_name = strdup(ring_name);
+         assert(num_sections < MAX_SECTIONS);
+         sections[num_sections].buffer_name = b->name;
+         sections[num_sections].ring_name = strdup(ring_name);
 
          uint32_t hi, lo;
          dashes = strchr(dashes, '=');
          if (dashes && sscanf(dashes, "= 0x%08x %08x\n", &hi, &lo))
-            sections[sect_num].gtt_offset = ((uint64_t) hi) << 32 | lo;
+            sections[num_sections].gtt_offset = ((uint64_t) hi) << 32 | lo;
 
          continue;
       }
@@ -595,10 +597,10 @@
 
    struct gen_batch_decode_ctx batch_ctx;
    gen_batch_decode_ctx_init(&batch_ctx, &devinfo, stdout, batch_flags,
-                             xml_path, get_gen_batch_bo, NULL);
+                             xml_path, get_gen_batch_bo, NULL, NULL);
 
 
-   for (int s = 0; s < sect_num; s++) {
+   for (int s = 0; s < num_sections; s++) {
       printf("--- %s (%s) at 0x%08x %08x\n",
              sections[s].buffer_name, sections[s].ring_name,
              (unsigned) (sections[s].gtt_offset >> 32),
@@ -615,7 +617,7 @@
 
    gen_batch_decode_ctx_finish(&batch_ctx);
 
-   for (int s = 0; s < sect_num; s++) {
+   for (int s = 0; s < num_sections; s++) {
       free(sections[s].ring_name);
       free(sections[s].data);
    }
diff --git a/src/intel/tools/error2aub.c b/src/intel/tools/error2aub.c
new file mode 100644
index 0000000..68a5b96
--- /dev/null
+++ b/src/intel/tools/error2aub.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <assert.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <zlib.h>
+
+#include "aub_write.h"
+#include "i915_drm.h"
+#include "intel_aub.h"
+
+/* When cond is non-zero, print a printf-style message to stderr and trap. */
+static void __attribute__ ((format(__printf__, 2, 3)))
+fail_if(int cond, const char *format, ...)
+{
+   if (cond) {
+      va_list ap;
+      va_start(ap, format);
+      vfprintf(stderr, format, ap);
+      va_end(ap);
+
+      /* Trap instead of calling exit(); returns if the signal is ignored. */
+      raise(SIGTRAP);
+   }
+}
+
+#define fail(...) fail_if(true, __VA_ARGS__)
+
+/* Inflate the zlib stream in *ptr (len dwords).  On success the input buffer
+ * is freed, *ptr takes the inflated data and its size in dwords is returned;
+ * on failure 0 is returned and *ptr is left untouched. */
+static int zlib_inflate(uint32_t **ptr, int len)
+{
+   struct z_stream_s zstream;
+   const uint32_t out_size = 128*4096;  /* approximate obj size */
+   void *out, *grown;
+
+   memset(&zstream, 0, sizeof(zstream));
+   zstream.next_in = (unsigned char *)*ptr;
+   zstream.avail_in = 4*len;
+   if (inflateInit(&zstream) != Z_OK)
+      return 0;
+
+   out = malloc(out_size);
+   if (out == NULL)
+      goto fail;
+   zstream.next_out = out;
+   zstream.avail_out = out_size;
+   for (;;) {
+      int ret = inflate(&zstream, Z_SYNC_FLUSH);
+      if (ret == Z_STREAM_END)
+         break;
+      if (ret != Z_OK)
+         goto fail;
+      if (zstream.avail_out)   /* output not full: nothing left to consume */
+         break;
+      /* Output full: double the buffer, keeping it alive if realloc fails. */
+      grown = realloc(out, 2*zstream.total_out);
+      if (grown == NULL)
+         goto fail;
+      out = grown;
+      zstream.next_out = (unsigned char *)out + zstream.total_out;
+      zstream.avail_out = zstream.total_out;
+   }
+
+   inflateEnd(&zstream);
+   free(*ptr);
+   *ptr = out;
+   return zstream.total_out / 4;
+
+ fail:
+   inflateEnd(&zstream);
+   free(out);
+   return 0;
+}
+
+/* Decode ASCII85 text (ending at the first char outside '!'..'z') into *out,
+ * optionally zlib-inflating it.  Returns the dword count, or 0 on failure. */
+static int ascii85_decode(const char *in, uint32_t **out, bool inflate)
+{
+   int len = 0, size = 1024;
+   uint32_t *buf = realloc(*out, sizeof(uint32_t)*size);
+   if (buf == NULL)
+      goto oom;
+   *out = buf;
+
+   while (*in >= '!' && *in <= 'z') {
+      uint32_t v = 0;
+      if (len == size) {
+         size *= 2;
+         buf = realloc(*out, sizeof(uint32_t)*size);
+         if (buf == NULL)
+            goto oom;
+         *out = buf;
+      }
+      if (*in == 'z') {   /* 'z' stands for an all-zero dword */
+         in++;
+      } else {   /* validate each digit: a short group must not over-read */
+         for (int i = 0; i < 5; i++, in++) {
+            if (*in < '!' || *in > 'u')
+               return 0;
+            v = v * 85 + (*in - 33);
+         }
+      }
+      (*out)[len++] = v;
+   }
+   return inflate ? zlib_inflate(out, len) : len;
+ oom:
+   free(*out);   /* release the old buffer instead of leaking it */
+   *out = NULL;
+   return 0;
+}
+
+/* Write the usage summary for progname to the given stream. */
+static void
+print_help(const char *progname, FILE *file)
+{
+   fprintf(file, "Usage: %s [OPTION]... [FILE]\n", progname);
+   fputs("Convert an Intel GPU i915 error state to an aub file.\n"
+         "  -h, --help          display this help and exit\n"
+         "  -o, --output=FILE   the output aub file (default FILE.aub)\n",
+         file);
+}
+
+/* Convert the i915 error state file named on the command line to an aub file:
+ * BOs listed in the active ring's table go to aub_map_ppgtt(), batch and user
+ * BO contents to aub_write_trace_block(), and the batch to aub_write_exec(). */
+int
+main(int argc, char *argv[])
+{
+   int i = 0, c;
+   bool help = false;
+   char *out_filename = NULL, *in_filename = NULL;
+   const struct option aubinator_opts[] = {
+      { "help",       no_argument,       NULL,     'h' },
+      { "output",     required_argument, NULL,     'o' },
+      { NULL,         0,                 NULL,     0 }
+   };
+
+   while ((c = getopt_long(argc, argv, "ho:", aubinator_opts, &i)) != -1) {
+      switch (c) {
+      case 'h':
+         help = true;
+         break;
+      case 'o':
+         out_filename = strdup(optarg);
+         break;
+      default:
+         break;
+      }
+   }
+
+   if (optind < argc)
+      in_filename = argv[optind++];
+
+   if (help || argc == 1 || !in_filename) {
+      print_help(argv[0], stderr);
+      return help ? EXIT_SUCCESS : EXIT_FAILURE; /* -h alone is not an error */
+   }
+
+   if (out_filename == NULL) {
+      int out_filename_size = strlen(in_filename) + 5;
+      out_filename = malloc(out_filename_size);
+      snprintf(out_filename, out_filename_size, "%s.aub", in_filename);
+   }
+
+   FILE *err_file = fopen(in_filename, "r");
+   fail_if(!err_file, "Failed to open error file \"%s\": %m\n", in_filename);
+
+   FILE *aub_file = fopen(out_filename, "w");
+   fail_if(!aub_file, "Failed to open aub file \"%s\": %m\n", out_filename);
+
+   struct aub_file aub = {};
+   uint32_t active_ring = 0;
+   int num_ring_bos = 0;
+   uint64_t batch_addr = 0;
+
+   enum bo_type {
+      BO_TYPE_UNKNOWN = 0,
+      BO_TYPE_BATCH,
+      BO_TYPE_USER,
+   } bo_type = BO_TYPE_UNKNOWN;
+   uint64_t bo_addr = 0;
+
+   char *line = NULL;
+   size_t line_size = 0;
+   while (getline(&line, &line_size, err_file) > 0) {
+      const char *pci_id_start = strstr(line, "PCI ID");
+      if (pci_id_start) {
+         unsigned pci_id;
+         int matched = sscanf(pci_id_start, "PCI ID: 0x%04x\n", &pci_id);
+         fail_if(matched != 1, "Invalid error state file!\n");
+
+         aub_file_init(&aub, aub_file, pci_id);
+         fail_if(!aub_use_execlists(&aub),
+                 "%s currently only works on gen8+\n", argv[0]);
+
+         aub_write_header(&aub, "error state");
+         continue;
+      }
+
+      const char *active_start = "Active (";
+      if (strncmp(line, active_start, strlen(active_start)) == 0) {
+         fail_if(active_ring != 0, "TODO: Handle multiple active rings\n");
+
+         char *ring = line + strlen(active_start);
+
+         const struct {
+            const char *match;
+            uint32_t ring;
+         } rings[] = {
+            { "rcs", I915_EXEC_RENDER },
+            { "vcs", I915_EXEC_BSD },
+            { "bcs", I915_EXEC_BLT },
+            { NULL, 0 },
+         }, *r;
+
+         for (r = rings; r->match; r++) {
+            if (strncasecmp(ring, r->match, strlen(r->match)) == 0) {
+               active_ring = r->ring;
+               break;
+            }
+         }
+
+         char *count = strchr(ring, '[');
+         fail_if(!count || sscanf(count, "[%d]:", &num_ring_bos) < 1,
+                 "Failed to parse BO table header\n");
+         continue;
+      }
+
+      if (num_ring_bos > 0) {   /* still inside the active BO table */
+         unsigned hi, lo, size;
+         if (sscanf(line, " %x_%x %u", &hi, &lo, &size) == 3) {
+            assert(aub_use_execlists(&aub));
+            aub_map_ppgtt(&aub, ((uint64_t)hi) << 32 | lo, size);
+            num_ring_bos--;
+         } else {
+            fail("Not enough BO entries in the active table\n");
+         }
+         continue;
+      }
+
+      if (line[0] == ':' || line[0] == '~') {
+         if (bo_type == BO_TYPE_UNKNOWN)
+            continue;
+         /* ':' marks zlib-compressed ASCII85 data, '~' uncompressed data. */
+         uint32_t *data = NULL;
+         int count = ascii85_decode(line+1, &data, line[0] == ':');
+         fail_if(count == 0, "ASCII85 decode failed.\n");
+         uint64_t bo_size = count * 4;
+
+         if (bo_type == BO_TYPE_BATCH) {
+            aub_write_trace_block(&aub, AUB_TRACE_TYPE_BATCH,
+                                  data, bo_size, bo_addr);
+            batch_addr = bo_addr;
+         } else {
+            assert(bo_type == BO_TYPE_USER);
+            aub_write_trace_block(&aub, AUB_TRACE_TYPE_NOTYPE,
+                                  data, bo_size, bo_addr);
+         }
+
+         continue;
+      }
+
+      char *dashes = strstr(line, "---");
+      if (dashes) {
+         dashes += 4;   /* step past the "---" marker and the char after it */
+
+         const struct {
+            const char *match;
+            enum bo_type type;
+         } bo_types[] = {
+            { "gtt_offset", BO_TYPE_BATCH },
+            { "user", BO_TYPE_USER },
+            { NULL, BO_TYPE_UNKNOWN },
+         }, *b;
+
+         bo_type = BO_TYPE_UNKNOWN;
+         for (b = bo_types; b->match; b++) {
+            if (strncasecmp(dashes, b->match, strlen(b->match)) == 0) {
+               bo_type = b->type;
+               break;
+            }
+         }
+
+         if (bo_type != BO_TYPE_UNKNOWN) {
+            uint32_t hi, lo;
+            dashes = strchr(dashes, '=');
+            if (dashes && sscanf(dashes, "= 0x%08x %08x\n", &hi, &lo) == 2) {
+               bo_addr = ((uint64_t) hi) << 32 | lo;
+            } else {
+               fail("User BO does not have an address\n");
+            }
+         }
+         continue;
+      }
+   }
+
+   fail_if(!batch_addr, "Failed to find batch buffer.\n");
+
+   aub_write_exec(&aub, batch_addr, aub_gtt_size(&aub), I915_EXEC_RENDER);
+
+   return EXIT_SUCCESS;
+}
+
+/* vim: set ts=8 sw=8 tw=0 cino=:0,(0 noet :*/
diff --git a/src/intel/tools/gen10_context.h b/src/intel/tools/gen10_context.h
new file mode 100644
index 0000000..8b1973c
--- /dev/null
+++ b/src/intel/tools/gen10_context.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GEN10_CONTEXT_H
+#define GEN10_CONTEXT_H
+
+static const uint32_t gen10_render_context_init[CONTEXT_RENDER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(14) | MI_LRI_FORCE_POSTED, /* NOTE(review): 15 register/value pairs follow (CCID is the 15th) - confirm the count */
+   0x2244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x2034 /* RING_HEAD */,         0,
+   0x2030 /* RING_TAIL */,         0,
+   0x2038 /* RING_BUFFER_START */,      RENDER_RING_ADDR,
+   0x203C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x2168 /* BB_HEAD_U */,         0,
+   0x2140 /* BB_HEAD_L */,         0,
+   0x2110 /* BB_STATE */,         0,
+   0x211C /* SECOND_BB_HEAD_U */,      0,
+   0x2114 /* SECOND_BB_HEAD_L */,      0,
+   0x2118 /* SECOND_BB_STATE */,      0,
+   0x21C0 /* BB_PER_CTX_PTR */,      0,
+   0x21C4 /* RCS_INDIRECT_CTX */,      0,
+   0x21C8 /* RCS_INDIRECT_CTX_OFFSET */,   0,
+   0x2180 /* CCID */,		0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x23A8 /* CTX_TIMESTAMP */,   0,
+   0x228C /* PDP3_UDW */,      0,
+   0x2288 /* PDP3_LDW */,      0,
+   0x2284 /* PDP2_UDW */,      0,
+   0x2280 /* PDP2_LDW */,      0,
+   0x227C /* PDP1_UDW */,      0,
+   0x2278 /* PDP1_LDW */,      0,
+   0x2274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x2270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(1),
+   0x20C8 /* R_PWR_CLK_STATE */, 0x7FFFFFFF,
+   0, 0, 0 /* GPGPU_CSR_BASE_ADDRESS ? */,
+   0, 0, 0, 0, 0, 0, 0, 0, 0 /* MI_NOOP */,
+
+   MI_BATCH_BUFFER_END | 1 /* End Context */
+};
+
+static const uint32_t gen10_blitter_context_init[CONTEXT_OTHER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(14) | MI_LRI_FORCE_POSTED,
+   0x22244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x22034 /* RING_HEAD */,      0,
+   0x22030 /* RING_TAIL */,      0,
+   0x22038 /* RING_BUFFER_START */,   BLITTER_RING_ADDR,
+   0x2203C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x22168 /* BB_HEAD_U */,      0,
+   0x22140 /* BB_HEAD_L */,      0,
+   0x22110 /* BB_STATE */,         0,
+   0x2211C /* SECOND_BB_HEAD_U */,      0,
+   0x22114 /* SECOND_BB_HEAD_L */,      0,
+   0x22118 /* SECOND_BB_STATE */,      0,
+   0x221C0 /* BB_PER_CTX_PTR */,	0,
+   0x221C4 /* INDIRECT_CTX */,	0,
+   0x221C8 /* INDIRECT_CTX_OFFSET */, 0,
+   0, 0 /* MI_NOOP */,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x223A8 /* CTX_TIMESTAMP */, 0,
+   0x2228C /* PDP3_UDW */,      0,
+   0x22288 /* PDP3_LDW */,      0,
+   0x22284 /* PDP2_UDW */,      0,
+   0x22280 /* PDP2_LDW */,      0,
+   0x2227C /* PDP1_UDW */,      0,
+   0x22278 /* PDP1_LDW */,      0,
+   0x22274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x22270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(1),
+   0x22200 /* BCS_SWCTRL */,	0,
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* MI_NOOP */,
+
+   MI_BATCH_BUFFER_END | 1 /* End Context */
+};
+
+static const uint32_t gen10_video_context_init[CONTEXT_OTHER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(11) | MI_LRI_FORCE_POSTED,
+   0x1C244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x1C034 /* RING_HEAD */,      0,
+   0x1C030 /* RING_TAIL */,      0,
+   0x1C038 /* RING_BUFFER_START */,   VIDEO_RING_ADDR,
+   0x1C03C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x1C168 /* BB_HEAD_U */,      0,
+   0x1C140 /* BB_HEAD_L */,      0,
+   0x1C110 /* BB_STATE */,         0,
+   0x1C11C /* SECOND_BB_HEAD_U */,      0,
+   0x1C114 /* SECOND_BB_HEAD_L */,      0,
+   0x1C118 /* SECOND_BB_STATE */,      0,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x1C3A8 /* CTX_TIMESTAMP */,   0,
+   0x1C28C /* PDP3_UDW */,      0,
+   0x1C288 /* PDP3_LDW */,      0,
+   0x1C284 /* PDP2_UDW */,      0,
+   0x1C280 /* PDP2_LDW */,      0,
+   0x1C27C /* PDP1_UDW */,      0,
+   0x1C278 /* PDP1_LDW */,      0,
+   0x1C274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x1C270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+   MI_BATCH_BUFFER_END | 1  /* End Context */
+};
+
+#endif /* GEN10_CONTEXT_H */
diff --git a/src/intel/tools/gen8_context.h b/src/intel/tools/gen8_context.h
new file mode 100644
index 0000000..d01c3c8
--- /dev/null
+++ b/src/intel/tools/gen8_context.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GEN8_CONTEXT_H
+#define GEN8_CONTEXT_H
+
+static const uint32_t gen8_render_context_init[CONTEXT_RENDER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(14) | MI_LRI_FORCE_POSTED,
+   0x2244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x2034 /* RING_HEAD */,         0,
+   0x2030 /* RING_TAIL */,         0,
+   0x2038 /* RING_BUFFER_START */,      RENDER_RING_ADDR,
+   0x203C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x2168 /* BB_HEAD_U */,         0,
+   0x2140 /* BB_HEAD_L */,         0,
+   0x2110 /* BB_STATE */,         0,
+   0x211C /* SECOND_BB_HEAD_U */,      0,
+   0x2114 /* SECOND_BB_HEAD_L */,      0,
+   0x2118 /* SECOND_BB_STATE */,      0,
+   0x21C0 /* BB_PER_CTX_PTR */,      0,
+   0x21C4 /* RCS_INDIRECT_CTX */,      0,
+   0x21C8 /* RCS_INDIRECT_CTX_OFFSET */,   0,
+   /* MI_NOOP */
+   0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x23A8 /* CTX_TIMESTAMP */,   0,
+   0x228C /* PDP3_UDW */,      0,
+   0x2288 /* PDP3_LDW */,      0,
+   0x2284 /* PDP2_UDW */,      0,
+   0x2280 /* PDP2_LDW */,      0,
+   0x227C /* PDP1_UDW */,      0,
+   0x2278 /* PDP1_LDW */,      0,
+   0x2274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x2270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(1),
+   0x20C8 /* R_PWR_CLK_STATE */, 0x7FFFFFFF,
+   MI_BATCH_BUFFER_END
+};
+
+static const uint32_t gen8_blitter_context_init[CONTEXT_OTHER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(11) | MI_LRI_FORCE_POSTED,
+   0x22244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x22034 /* RING_HEAD */,      0,
+   0x22030 /* RING_TAIL */,      0,
+   0x22038 /* RING_BUFFER_START */,   BLITTER_RING_ADDR,
+   0x2203C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x22168 /* BB_HEAD_U */,      0,
+   0x22140 /* BB_HEAD_L */,      0,
+   0x22110 /* BB_STATE */,         0,
+   0x2211C /* SECOND_BB_HEAD_U */,      0,
+   0x22114 /* SECOND_BB_HEAD_L */,      0,
+   0x22118 /* SECOND_BB_STATE */,      0,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x223A8 /* CTX_TIMESTAMP */,   0,
+   0x2228C /* PDP3_UDW */,      0,
+   0x22288 /* PDP3_LDW */,      0,
+   0x22284 /* PDP2_UDW */,      0,
+   0x22280 /* PDP2_LDW */,      0,
+   0x2227C /* PDP1_UDW */,      0,
+   0x22278 /* PDP1_LDW */,      0,
+   0x22274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x22270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+   MI_BATCH_BUFFER_END
+};
+
+static const uint32_t gen8_video_context_init[CONTEXT_OTHER_SIZE / sizeof(uint32_t)] = {
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(11) | MI_LRI_FORCE_POSTED,
+   0x1C244 /* CONTEXT_CONTROL */,      0x90009 /* Inhibit Synchronous Context Switch | Engine Context Restore Inhibit */,
+   0x1C034 /* RING_HEAD */,      0,
+   0x1C030 /* RING_TAIL */,      0,
+   0x1C038 /* RING_BUFFER_START */,   VIDEO_RING_ADDR,
+   0x1C03C /* RING_BUFFER_CONTROL */,   (RING_SIZE - 4096) | 1 /* Buffer Length | Ring Buffer Enable */,
+   0x1C168 /* BB_HEAD_U */,      0,
+   0x1C140 /* BB_HEAD_L */,      0,
+   0x1C110 /* BB_STATE */,         0,
+   0x1C11C /* SECOND_BB_HEAD_U */,      0,
+   0x1C114 /* SECOND_BB_HEAD_L */,      0,
+   0x1C118 /* SECOND_BB_STATE */,      0,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0,
+
+   0 /* MI_NOOP */,
+   MI_LOAD_REGISTER_IMM_n(9) | MI_LRI_FORCE_POSTED,
+   0x1C3A8 /* CTX_TIMESTAMP */,   0,
+   0x1C28C /* PDP3_UDW */,      0,
+   0x1C288 /* PDP3_LDW */,      0,
+   0x1C284 /* PDP2_UDW */,      0,
+   0x1C280 /* PDP2_LDW */,      0,
+   0x1C27C /* PDP1_UDW */,      0,
+   0x1C278 /* PDP1_LDW */,      0,
+   0x1C274 /* PDP0_UDW */,      PML4_PHYS_ADDR >> 32,
+   0x1C270 /* PDP0_LDW */,      PML4_PHYS_ADDR,
+   /* MI_NOOP */
+   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+   MI_BATCH_BUFFER_END
+};
+
+#endif /* GEN8_CONTEXT_H */
diff --git a/src/intel/tools/gen_context.h b/src/intel/tools/gen_context.h
new file mode 100644
index 0000000..3f488c0
--- /dev/null
+++ b/src/intel/tools/gen_context.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef GEN_CONTEXT_H
+#define GEN_CONTEXT_H
+
+#include <stdint.h>
+
+#define RING_SIZE         (1 * 4096)
+#define PPHWSP_SIZE         (1 * 4096)
+
+#define GEN11_LR_CONTEXT_RENDER_SIZE    (14 * 4096)
+#define GEN10_LR_CONTEXT_RENDER_SIZE    (19 * 4096)
+#define GEN9_LR_CONTEXT_RENDER_SIZE     (22 * 4096)
+#define GEN8_LR_CONTEXT_RENDER_SIZE     (20 * 4096)
+#define GEN8_LR_CONTEXT_OTHER_SIZE      (2 * 4096)
+
+#define CONTEXT_RENDER_SIZE GEN9_LR_CONTEXT_RENDER_SIZE /* largest size */
+#define CONTEXT_OTHER_SIZE GEN8_LR_CONTEXT_OTHER_SIZE
+
+#define MI_LOAD_REGISTER_IMM_n(n) ((0x22 << 23) | (2 * (n) - 1))
+#define MI_LRI_FORCE_POSTED       (1<<12)
+
+#define MI_BATCH_BUFFER_END (0xA << 23)
+
+#define HWS_PGA_RCSUNIT      0x02080
+#define HWS_PGA_VCSUNIT0   0x12080
+#define HWS_PGA_BCSUNIT      0x22080
+
+#define GFX_MODE_RCSUNIT   0x0229c
+#define GFX_MODE_VCSUNIT0   0x1229c
+#define GFX_MODE_BCSUNIT   0x2229c
+
+#define EXECLIST_SUBMITPORT_RCSUNIT   0x02230
+#define EXECLIST_SUBMITPORT_VCSUNIT0   0x12230
+#define EXECLIST_SUBMITPORT_BCSUNIT   0x22230
+
+#define EXECLIST_STATUS_RCSUNIT      0x02234
+#define EXECLIST_STATUS_VCSUNIT0   0x12234
+#define EXECLIST_STATUS_BCSUNIT      0x22234
+
+#define EXECLIST_SQ_CONTENTS0_RCSUNIT   0x02510
+#define EXECLIST_SQ_CONTENTS0_VCSUNIT0   0x12510
+#define EXECLIST_SQ_CONTENTS0_BCSUNIT   0x22510
+
+#define EXECLIST_CONTROL_RCSUNIT   0x02550
+#define EXECLIST_CONTROL_VCSUNIT0   0x12550
+#define EXECLIST_CONTROL_BCSUNIT   0x22550
+
+#define MEMORY_MAP_SIZE (64 /* MiB */ * 1024 * 1024)
+
+#define PTE_SIZE 4
+#define GEN8_PTE_SIZE 8
+
+#define NUM_PT_ENTRIES (ALIGN(MEMORY_MAP_SIZE, 4096) / 4096)
+#define PT_SIZE ALIGN(NUM_PT_ENTRIES * GEN8_PTE_SIZE, 4096)
+
+#define STATIC_GGTT_MAP_START 0
+
+#define RENDER_RING_ADDR STATIC_GGTT_MAP_START
+#define RENDER_CONTEXT_ADDR (RENDER_RING_ADDR + RING_SIZE)
+
+#define BLITTER_RING_ADDR (RENDER_CONTEXT_ADDR + PPHWSP_SIZE + GEN9_LR_CONTEXT_RENDER_SIZE)
+#define BLITTER_CONTEXT_ADDR (BLITTER_RING_ADDR + RING_SIZE)
+
+#define VIDEO_RING_ADDR (BLITTER_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
+#define VIDEO_CONTEXT_ADDR (VIDEO_RING_ADDR + RING_SIZE)
+
+#define STATIC_GGTT_MAP_END (VIDEO_CONTEXT_ADDR + PPHWSP_SIZE + GEN8_LR_CONTEXT_OTHER_SIZE)
+#define STATIC_GGTT_MAP_SIZE (STATIC_GGTT_MAP_END - STATIC_GGTT_MAP_START)
+
+#define PML4_PHYS_ADDR ((uint64_t)(STATIC_GGTT_MAP_END))
+
+#define CONTEXT_FLAGS (0x339)   /* Normal Priority | L3-LLC Coherency |
+                                 * PPGTT Enabled |
+                                 * Legacy Context with 64 bit VA support |
+                                 * Valid
+                                 */
+
+#define RENDER_CONTEXT_DESCRIPTOR  ((uint64_t)1 << 62 | RENDER_CONTEXT_ADDR  | CONTEXT_FLAGS)
+#define BLITTER_CONTEXT_DESCRIPTOR ((uint64_t)2 << 62 | BLITTER_CONTEXT_ADDR | CONTEXT_FLAGS)
+#define VIDEO_CONTEXT_DESCRIPTOR   ((uint64_t)3 << 62 | VIDEO_CONTEXT_ADDR   | CONTEXT_FLAGS)
+
+#include "gen8_context.h"
+#include "gen10_context.h"
+
+#endif /* GEN_CONTEXT_H */
diff --git a/src/intel/tools/intel_aub.h b/src/intel/tools/intel_aub.h
index 5f0aba8..74ca26a 100644
--- a/src/intel/tools/intel_aub.h
+++ b/src/intel/tools/intel_aub.h
@@ -49,6 +49,12 @@
 #define CMD_AUB			(7 << 29)
 
 #define CMD_AUB_HEADER		(CMD_AUB | (1 << 23) | (0x05 << 16))
+
+#define CMD_MEM_TRACE_REGISTER_POLL	(CMD_AUB | (0x2e << 23) | (0x02 << 16))
+#define CMD_MEM_TRACE_REGISTER_WRITE	(CMD_AUB | (0x2e << 23) | (0x03 << 16))
+#define CMD_MEM_TRACE_MEMORY_WRITE	(CMD_AUB | (0x2e << 23) | (0x06 << 16))
+#define CMD_MEM_TRACE_VERSION		(CMD_AUB | (0x2e << 23) | (0x0e << 16))
+
 /* DW1 */
 # define AUB_HEADER_MAJOR_SHIFT		24
 # define AUB_HEADER_MINOR_SHIFT		16
@@ -92,8 +98,29 @@
 #define AUB_TRACE_MEMTYPE_PCI		(3 << 16)
 #define AUB_TRACE_MEMTYPE_GTT_ENTRY     (4 << 16)
 
+#define AUB_MEM_TRACE_VERSION_FILE_VERSION	1
+
 /* DW2 */
 
+#define AUB_MEM_TRACE_VERSION_DEVICE_MASK	0x0000ff00
+#define AUB_MEM_TRACE_VERSION_DEVICE_SHIFT      8
+
+#define AUB_MEM_TRACE_VERSION_METHOD_MASK	0x000c0000
+#define AUB_MEM_TRACE_VERSION_METHOD_PHY	(1 << 18)
+
+#define AUB_MEM_TRACE_REGISTER_SIZE_MASK	0x000f0000
+#define AUB_MEM_TRACE_REGISTER_SIZE_DWORD	(2 << 16)
+
+#define AUB_MEM_TRACE_REGISTER_SPACE_MASK	0xf0000000
+#define AUB_MEM_TRACE_REGISTER_SPACE_MMIO	(0 << 28)
+
+/* DW3 */
+
+#define AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_MASK		0xf0000000
+#define AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT		(0 << 28)
+#define AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_PHYSICAL	(2 << 28)
+#define AUB_MEM_TRACE_MEMORY_ADDRESS_SPACE_GGTT_ENTRY	(4 << 28)
+
 /**
  * aub_state_struct_type enum values are encoded with the top 16 bits
  * representing the type to be delivered to the .aub file, and the bottom 16
diff --git a/src/intel/tools/intel_dump_gpu.c b/src/intel/tools/intel_dump_gpu.c
new file mode 100644
index 0000000..a71103f
--- /dev/null
+++ b/src/intel/tools/intel_dump_gpu.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <dlfcn.h>
+#include <i915_drm.h>
+#include <inttypes.h>
+
+#include "intel_aub.h"
+#include "aub_write.h"
+
+#include "dev/gen_device_info.h"
+#include "util/macros.h"
+
+static int close_init_helper(int fd);
+static int ioctl_init_helper(int fd, unsigned long request, ...);
+
+static int (*libc_close)(int fd) = close_init_helper;
+static int (*libc_ioctl)(int fd, unsigned long request, ...) = ioctl_init_helper;
+
+static int drm_fd = -1;
+static char *output_filename = NULL;
+static FILE *output_file = NULL;
+static int verbose = 0;
+static bool device_override;
+
+#define MAX_BO_COUNT (64 * 1024) /* number of slots in the handle-indexed bos[] table */
+
+struct bo {
+   uint32_t size;
+   uint64_t offset;
+   void *map;
+};
+
+static struct bo *bos;
+
+#define DRM_MAJOR 226
+
+/* We set bit 0 in the map pointer for userptr BOs so we know not to
+ * munmap them on DRM_IOCTL_GEM_CLOSE; GET_PTR() strips that tag bit again.
+ */
+#define USERPTR_FLAG 1
+#define IS_USERPTR(p) ((uintptr_t) (p) & USERPTR_FLAG)
+#define GET_PTR(p) ( (void *) ((uintptr_t) (p) & ~(uintptr_t) USERPTR_FLAG) )
+
+/* Print a prefixed message to stderr and raise SIGTRAP when cond is set. */
+static void __attribute__ ((format(__printf__, 2, 3)))
+fail_if(int cond, const char *format, ...)
+{
+   if (cond) {
+      va_list ap;
+
+      va_start(ap, format);
+      fputs("intel_dump_gpu: ", stderr);
+      vfprintf(stderr, format, ap);
+      va_end(ap);
+
+      raise(SIGTRAP);
+   }
+}
+
+/* Return the bos[] slot for a GEM handle; handles index the table directly
+ * and anything at or beyond MAX_BO_COUNT is reported through fail_if().
+ */
+static struct bo *
+get_bo(uint32_t handle)
+{
+   fail_if(handle >= MAX_BO_COUNT, "bo handle too large\n");
+
+   return bos + handle;
+}
+
+static inline uint32_t
+align_u32(uint32_t v, uint32_t a)
+{
+   return ((a - 1) + v) & ~(a - 1); /* round v up; a must be a power of 2 */
+}
+
+static struct gen_device_info devinfo = {0};
+static uint32_t device = 0;
+static struct aub_file aub_file;
+
+/* Return a malloc'ed copy of the BO contents in which every relocation of
+ * obj has been patched with the target BO's offset; the caller frees it.
+ */
+static void *
+relocate_bo(struct bo *bo, const struct drm_i915_gem_execbuffer2 *execbuffer2,
+            const struct drm_i915_gem_exec_object2 *obj)
+{
+   const struct drm_i915_gem_exec_object2 *objects =
+      (struct drm_i915_gem_exec_object2 *) (uintptr_t) execbuffer2->buffers_ptr;
+   const struct drm_i915_gem_relocation_entry *reloc =
+      (const struct drm_i915_gem_relocation_entry *) (uintptr_t) obj->relocs_ptr;
+   const bool use_lut = (execbuffer2->flags & I915_EXEC_HANDLE_LUT) != 0;
+   char *copy = malloc(bo->size);
+
+   fail_if(copy == NULL, "out of memory\n");
+   memcpy(copy, GET_PTR(bo->map), bo->size);
+
+   for (size_t i = 0; i < obj->relocation_count; i++, reloc++) {
+      fail_if(reloc->offset >= bo->size, "reloc outside bo\n");
+
+      /* With HANDLE_LUT, target_handle indexes the exec object array. */
+      int handle = use_lut ? objects[reloc->target_handle].handle :
+                             reloc->target_handle;
+      aub_write_reloc(&devinfo, copy + reloc->offset,
+                      get_bo(handle)->offset + reloc->delta);
+   }
+   return copy;
+}
+
+/* Forward to the real ioctl, retrying while it fails with EINTR or EAGAIN. */
+static int
+gem_ioctl(int fd, unsigned long request, void *argp)
+{
+   for (;;) {
+      int ret = libc_ioctl(fd, request, argp);
+
+      if (ret != -1 || (errno != EINTR && errno != EAGAIN))
+         return ret;
+   }
+}
+
+/* Map size bytes of a BO, starting at offset, via the i915 GEM_MMAP ioctl.
+ * Returns MAP_FAILED when the ioctl fails.
+ */
+static void *
+gem_mmap(int fd, uint32_t handle, uint64_t offset, uint64_t size)
+{
+   struct drm_i915_gem_mmap args = { 0 };
+   args.handle = handle;
+   args.offset = offset;
+   args.size = size;
+
+   return gem_ioctl(fd, DRM_IOCTL_I915_GEM_MMAP, &args) == -1 ?
+      MAP_FAILED : (void *)(uintptr_t) args.addr_ptr;
+}
+
+/* Query a single i915 GETPARAM value; yields 0 when the ioctl fails. */
+static int
+gem_get_param(int fd, uint32_t param)
+{
+   int result;
+   drm_i915_getparam_t query = {
+      .param = param,
+      .value = &result
+   };
+
+   if (gem_ioctl(fd, DRM_IOCTL_I915_GETPARAM, &query) != -1)
+      return result;
+   return 0;
+}
+
+/* Assign an address to every BO of the execbuffer, write each BO's
+ * (relocated) contents to the AUB file, then record the batch execution.
+ */
+static void
+dump_execbuffer2(int fd, struct drm_i915_gem_execbuffer2 *execbuffer2)
+{
+   struct drm_i915_gem_exec_object2 *exec_objects =
+      (struct drm_i915_gem_exec_object2 *) (uintptr_t) execbuffer2->buffers_ptr;
+   uint32_t ring_flag = execbuffer2->flags & I915_EXEC_RING_MASK;
+   uint32_t offset;
+   struct drm_i915_gem_exec_object2 *obj;
+   struct bo *bo, *batch_bo;
+   int batch_index;
+   void *data;
+
+   /* We can't do this at open time as we're not yet authenticated. */
+   if (device == 0) {
+      device = gem_get_param(fd, I915_PARAM_CHIPSET_ID);
+      fail_if(device == 0, "failed to identify chipset\n");
+   }
+   if (devinfo.gen == 0) {
+      fail_if(!gen_get_device_info(device, &devinfo),
+              "failed to identify chipset=0x%x\n", device);
+
+      aub_file_init(&aub_file, output_file, device);
+      if (verbose == 2)
+         aub_file.verbose_log_file = stdout;
+      aub_write_header(&aub_file, program_invocation_short_name);
+
+      if (verbose)
+         printf("[running, output file %s, chipset id 0x%04x, gen %d]\n",
+                output_filename, device, devinfo.gen);
+   }
+
+   if (aub_use_execlists(&aub_file))
+      offset = 0x1000;
+   else
+      offset = aub_gtt_size(&aub_file);
+
+   if (verbose)
+      printf("Dumping execbuffer2:\n");
+
+   for (uint32_t i = 0; i < execbuffer2->buffer_count; i++) {
+      obj = &exec_objects[i];
+      bo = get_bo(obj->handle);
+
+      /* If bo->size == 0, this means they passed us an invalid
+       * buffer.  The kernel will reject it and so should we.
+       */
+      if (bo->size == 0) {
+         if (verbose)
+            printf("BO #%u is invalid!\n", obj->handle);
+         return;
+      }
+
+      if (obj->flags & EXEC_OBJECT_PINNED) {
+         bo->offset = obj->offset;
+         if (verbose)
+            printf("BO #%u (%uB) pinned @ 0x%" PRIx64 "\n",
+                   obj->handle, bo->size, bo->offset);
+      } else {
+         if (obj->alignment != 0)
+            offset = align_u32(offset, obj->alignment);
+         bo->offset = offset;
+         if (verbose)
+            printf("BO #%u (%uB) @ 0x%" PRIx64 "\n", obj->handle,
+                   bo->size, bo->offset);
+         offset = align_u32(offset + bo->size + 4095, 4096);
+      }
+
+      if (bo->map == NULL && bo->size > 0)
+         bo->map = gem_mmap(fd, obj->handle, 0, bo->size);
+      fail_if(bo->map == MAP_FAILED, "bo mmap failed\n");
+
+      if (aub_use_execlists(&aub_file))
+         aub_map_ppgtt(&aub_file, bo->offset, bo->size);
+   }
+
+   batch_index = (execbuffer2->flags & I915_EXEC_BATCH_FIRST) ? 0 :
+      execbuffer2->buffer_count - 1;
+   batch_bo = get_bo(exec_objects[batch_index].handle);
+   for (uint32_t i = 0; i < execbuffer2->buffer_count; i++) {
+      obj = &exec_objects[i];
+      bo = get_bo(obj->handle);
+
+      if (obj->relocation_count > 0)
+         data = relocate_bo(bo, execbuffer2, obj);
+      else
+         data = bo->map;
+
+      aub_write_trace_block(&aub_file,
+                            bo == batch_bo ? AUB_TRACE_TYPE_BATCH :
+                                             AUB_TRACE_TYPE_NOTYPE,
+                            GET_PTR(data), bo->size, bo->offset);
+
+      if (data != bo->map)
+         free(data);
+   }
+
+   aub_write_exec(&aub_file,
+                  batch_bo->offset + execbuffer2->batch_start_offset,
+                  offset, ring_flag);
+
+   if (device_override &&
+       (execbuffer2->flags & I915_EXEC_FENCE_ARRAY) != 0) {
+      struct drm_i915_gem_exec_fence *fences =
+         (void*)(uintptr_t)execbuffer2->cliprects_ptr;
+      for (uint32_t i = 0; i < execbuffer2->num_cliprects; i++) {
+         if ((fences[i].flags & I915_EXEC_FENCE_SIGNAL) != 0) {
+            struct drm_syncobj_array arg = {
+               .handles = (uintptr_t)&fences[i].handle,
+               .count_handles = 1,
+               .pad = 0,
+            };
+            libc_ioctl(fd, DRM_IOCTL_SYNCOBJ_SIGNAL, &arg);
+         }
+      }
+   }
+}
+
+/* Record a newly created or imported BO in the handle-indexed bos[] table. */
+static void
+add_new_bo(int handle, uint64_t size, void *map)
+{
+   fail_if(handle < 0 || handle >= MAX_BO_COUNT, "bo handle out of range\n");
+   fail_if(size == 0, "bo size is invalid\n");
+
+   struct bo *bo = &bos[handle];
+   bo->size = size;
+   bo->map = map;
+}
+
+/* Forget a BO: unmap it unless it is a userptr BO, then clear its slot. */
+static void
+remove_bo(int handle)
+{
+   struct bo *slot = get_bo(handle);
+   if (slot->map != NULL && !IS_USERPTR(slot->map))
+      munmap(slot->map, slot->size);
+   slot->map = NULL;
+   slot->size = 0;
+}
+
+__attribute__ ((visibility ("default"))) int
+close(int fd)
+{
+   /* Forget the tracked DRM fd when it is the one being closed. */
+   if (drm_fd == fd)
+      drm_fd = -1;
+   return libc_close(fd);
+}
+
+/* One-time setup: parse key=value lines from $INTEL_DUMP_GPU_CONFIG and
+ * allocate the BO table.  Parsing stops at the first malformed line. */
+static void
+maybe_init(void)
+{
+   static bool initialized = false;
+   char *key, *value;
+
+   if (initialized)
+      return;
+   initialized = true;
+
+   const char *config_path = getenv("INTEL_DUMP_GPU_CONFIG");
+   FILE *config = config_path ? fopen(config_path, "r") : NULL;
+   fail_if(config == NULL, "failed to open config file\n");
+   while (fscanf(config, "%m[^=]=%m[^\n]\n", &key, &value) == 2) {
+      if (!strcmp(key, "verbose")) {
+         if (!strcmp(value, "1")) {
+            verbose = 1;
+         } else if (!strcmp(value, "2")) {
+            verbose = 2;
+         }
+      } else if (!strcmp(key, "device")) {
+         fail_if(sscanf(value, "%i", &device) != 1,
+                 "failed to parse device id '%s'\n", value);
+         device_override = true;
+      } else if (!strcmp(key, "file")) {
+         output_filename = strdup(value);
+         output_file = fopen(output_filename, "w+");
+         fail_if(output_file == NULL,
+                 "failed to open file '%s'\n", output_filename);
+      } else {
+         fprintf(stderr, "unknown option '%s'\n", key);
+      }
+
+      free(key);
+      free(value);
+   }
+   fclose(config);
+
+   bos = calloc(MAX_BO_COUNT, sizeof(bos[0]));
+   fail_if(bos == NULL, "out of memory\n");
+}
+
+__attribute__ ((visibility ("default"))) int
+ioctl(int fd, unsigned long request, ...) /* interposes libc's ioctl() */
+{
+   va_list args;
+   void *argp;
+   int ret;
+   struct stat buf;
+
+   va_start(args, request);
+   argp = va_arg(args, void *); /* the optional argument is read as a pointer */
+   va_end(args);
+
+   if (_IOC_TYPE(request) == DRM_IOCTL_BASE &&
+       drm_fd != fd && fstat(fd, &buf) == 0 &&
+       (buf.st_mode & S_IFMT) == S_IFCHR && major(buf.st_rdev) == DRM_MAJOR) {
+      drm_fd = fd; /* start tracking this fd as the DRM device */
+      if (verbose)
+         printf("[intercept drm ioctl on fd %d]\n", fd);
+   }
+
+   if (fd == drm_fd) {
+      maybe_init();
+
+      switch (request) {
+      case DRM_IOCTL_I915_GETPARAM: {
+         struct drm_i915_getparam *getparam = argp;
+
+         if (device_override && getparam->param == I915_PARAM_CHIPSET_ID) {
+            *getparam->value = device;
+            return 0; /* answered locally; the real ioctl is not called */
+         }
+
+         ret = libc_ioctl(fd, request, argp);
+
+         /* Applications typically query the chipset id
+          * themselves; piggy-back on that ioctl and keep
+          * the result so dump_execbuffer2() need not ask
+          * again. */
+         if (ret == 0 && getparam->param == I915_PARAM_CHIPSET_ID)
+            device = *getparam->value;
+
+         return ret;
+      }
+
+      case DRM_IOCTL_I915_GEM_EXECBUFFER: {
+         static bool once;
+         if (!once) {
+            fprintf(stderr,
+                    "application uses DRM_IOCTL_I915_GEM_EXECBUFFER, not handled\n");
+            once = true;
+         }
+         return libc_ioctl(fd, request, argp);
+      }
+
+      case DRM_IOCTL_I915_GEM_EXECBUFFER2:
+      case DRM_IOCTL_I915_GEM_EXECBUFFER2_WR: {
+         dump_execbuffer2(fd, argp);
+         if (device_override)
+            return 0; /* overridden device: the batch is dumped but never submitted */
+
+         return libc_ioctl(fd, request, argp);
+      }
+
+      case DRM_IOCTL_I915_GEM_CREATE: {
+         struct drm_i915_gem_create *create = argp;
+
+         ret = libc_ioctl(fd, request, argp);
+         if (ret == 0)
+            add_new_bo(create->handle, create->size, NULL);
+
+         return ret;
+      }
+
+      case DRM_IOCTL_I915_GEM_USERPTR: {
+         struct drm_i915_gem_userptr *userptr = argp;
+
+         ret = libc_ioctl(fd, request, argp);
+         if (ret == 0)
+            add_new_bo(userptr->handle, userptr->user_size,
+                       (void *) (uintptr_t) (userptr->user_ptr | USERPTR_FLAG));
+         return ret;
+      }
+
+      case DRM_IOCTL_GEM_CLOSE: {
+         struct drm_gem_close *close = argp;
+
+         remove_bo(close->handle); /* drop our bookkeeping before the handle is closed */
+
+         return libc_ioctl(fd, request, argp);
+      }
+
+      case DRM_IOCTL_GEM_OPEN: {
+         struct drm_gem_open *open = argp;
+
+         ret = libc_ioctl(fd, request, argp);
+         if (ret == 0)
+            add_new_bo(open->handle, open->size, NULL);
+
+         return ret;
+      }
+
+      case DRM_IOCTL_PRIME_FD_TO_HANDLE: {
+         struct drm_prime_handle *prime = argp;
+
+         ret = libc_ioctl(fd, request, argp);
+         if (ret == 0) {
+            off_t size;
+
+            size = lseek(prime->fd, 0, SEEK_END); /* end offset of the prime fd is used as the BO size */
+            fail_if(size == -1, "failed to get prime bo size\n");
+            add_new_bo(prime->handle, size, NULL);
+         }
+
+         return ret;
+      }
+
+      default:
+         return libc_ioctl(fd, request, argp);
+      }
+   } else {
+      return libc_ioctl(fd, request, argp);
+   }
+}
+
+/* Resolve the next "close" and "ioctl" symbols that our wrappers forward to. */
+static void
+init(void)
+{
+   libc_ioctl = dlsym(RTLD_NEXT, "ioctl");
+   libc_close = dlsym(RTLD_NEXT, "close");
+   fail_if(!libc_close || !libc_ioctl, "failed to get libc ioctl or close\n");
+}
+
+/* Initial libc_close target: resolve the real symbols, then forward. */
+static int
+close_init_helper(int fd)
+{
+   return init(), libc_close(fd);
+}
+
+/* Initial libc_ioctl target: resolve the real symbols, then forward. */
+static int
+ioctl_init_helper(int fd, unsigned long request, ...)
+{
+   va_list ap;
+
+   va_start(ap, request);
+   void *const arg = va_arg(ap, void *);
+   va_end(ap);
+
+   init();
+   return libc_ioctl(fd, request, arg);
+}
+
+static void __attribute__ ((destructor))
+fini(void) /* destructor: releases the file-scope state of this library */
+{
+   free(output_filename);
+   aub_file_finish(&aub_file); /* NOTE(review): runs even if aub_file_init() never did - confirm that is safe */
+   free(bos);
+}
diff --git a/src/intel/tools/intel_dump_gpu.in b/src/intel/tools/intel_dump_gpu.in
new file mode 100755
index 0000000..aa187ba
--- /dev/null
+++ b/src/intel/tools/intel_dump_gpu.in
@@ -0,0 +1,106 @@
+#!/bin/bash
+# -*- mode: sh -*-
+
+function show_help() {
+    cat <<EOF
+Usage: intel_dump_gpu [OPTION]... [--] COMMAND ARGUMENTS
+
+Run COMMAND with ARGUMENTS and dump an AUB file that captures buffer
+contents and execution of the GEM application.
+
+  -o, --output=FILE  Name of AUB file. Defaults to intel.aub
+
+      --device=ID    Override PCI ID of the reported device
+
+  -v                 Enable verbose output
+
+  -vv                Enable extra verbosity - dumps gtt mappings
+
+      --help         Display this help message and exit
+
+EOF
+
+    exit 0
+}
+
+ld_preload="@install_libexecdir@/libintel_dump_gpu.so${LD_PRELOAD:+:$LD_PRELOAD}"
+args=""    # accumulated key=value config lines, separated by a literal \n
+file=""
+gdb=""
+
+function add_arg() {
+    local arg=$1    # keep the temporary out of the script's global scope
+    args="$args$arg\n"    # literal "\n"; expanded later by echo -e
+}
+
+while true; do    # consume leading options; what remains is COMMAND ARGUMENTS
+    case "$1" in
+        -o)
+            file=$2
+            add_arg "file=${file:-$(basename ${file}).aub}"
+            shift 2
+            ;;
+        -v)
+            add_arg "verbose=1"
+            shift 1
+            ;;
+        -vv)
+            add_arg "verbose=2"
+            shift 1
+            ;;
+        -o*)
+            file=${1##-o}
+            add_arg "file=${file:-$(basename ${file}).aub}"
+            shift
+            ;;
+        --output=*)
+            file=${1##--output=}
+            add_arg "file=${file:-$(basename ${file}).aub}"
+            shift
+            ;;
+        --device=*)
+            add_arg "device=${1##--device=}"
+            shift
+            ;;
+        --gdb)
+            gdb=1
+            shift
+            ;;
+        -g)
+            gdb=1
+            shift
+            ;;
+        --help)
+            show_help
+            ;;
+        --)
+            shift
+            break
+            ;;
+        -*)
+            echo "intel_dump_gpu: invalid option: $1"
+            echo
+            show_help
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+[ -z "$1" ] && show_help
+
+[ -z "$file" ] && add_arg "file=intel.aub"
+
+tmp_file=`mktemp`
+echo -e "$args" > "$tmp_file"
+
+if [ -z "$gdb" ]; then
+    LD_PRELOAD="$ld_preload" INTEL_DUMP_GPU_CONFIG="$tmp_file" "$@"
+else
+    gdb -iex "set exec-wrapper env LD_PRELOAD=$ld_preload INTEL_DUMP_GPU_CONFIG=$tmp_file" --args "$@"
+fi
+
+ret=$?
+rm "$tmp_file"
+exit $ret
diff --git a/src/intel/tools/meson.build b/src/intel/tools/meson.build
index e19de5a..bef0af0 100644
--- a/src/intel/tools/meson.build
+++ b/src/intel/tools/meson.build
@@ -20,8 +20,7 @@
 
 aubinator = executable(
   'aubinator',
-  files('aubinator.c', 'disasm.c', 'gen_batch_decoder.c',
-        'gen_disasm.h', 'intel_aub.h'),
+  files('aubinator.c', 'intel_aub.h'),
   dependencies : [dep_expat, dep_zlib, dep_dl, dep_thread, dep_m],
   include_directories : [inc_common, inc_intel],
   link_with : [libintel_common, libintel_compiler, libintel_dev, libmesa_util],
@@ -32,8 +31,7 @@
 
 aubinator_error_decode = executable(
   'aubinator_error_decode',
-  files('aubinator_error_decode.c', 'disasm.c', 'gen_disasm.h',
-        'gen_batch_decoder.c'),
+  files('aubinator_error_decode.c'),
   dependencies : [dep_zlib, dep_thread],
   include_directories : [inc_common, inc_intel],
   link_with : [libintel_common, libintel_compiler, libintel_dev, libmesa_util],
@@ -42,6 +40,17 @@
   install : with_tools.contains('intel'),
 )
 
+error2aub = executable(
+  'intel_error2aub',
+  files('aub_write.h', 'aub_write.c', 'error2aub.c'),
+  dependencies : [dep_zlib, dep_dl, dep_thread, dep_m],
+  include_directories : [inc_common, inc_intel, inc_drm_uapi],
+  link_with : [libintel_dev],
+  c_args : [c_vis_args, no_override_init_args],
+  build_by_default : with_tools.contains('intel'),
+  install : with_tools.contains('intel'),
+)
+
 if with_tools.contains('intel')
   sanitize_data = configuration_data()
   sanitize_data.set(
@@ -61,8 +70,27 @@
     dependencies : [dep_dl, dep_thread],
     include_directories : [inc_common, inc_intel, inc_drm_uapi],
     link_with : [libintel_common, libmesa_util],
-    c_args : [c_vis_args, no_override_init_args],
+    c_args : [c_vis_args, no_override_init_args, c_sse2_args],
     install_dir: get_option('libexecdir'),
     install: true,
   )
+
+  configure_file(
+    input : 'intel_dump_gpu.in',
+    output : '@BASENAME@',
+    install_dir: get_option('bindir'),
+    configuration: sanitize_data,
+  )
+
+  libintel_dump_gpu = shared_library(
+    'intel_dump_gpu',
+    files('gen_context.h', 'gen8_context.h', 'gen10_context.h',
+          'intel_aub.h', 'aub_write.h', 'aub_write.c', 'intel_dump_gpu.c'),
+    dependencies : dep_dl,
+    include_directories : [inc_common, inc_intel, inc_drm_uapi],
+    link_with : libintel_dev,
+    c_args : [c_vis_args, no_override_init_args],
+    install_dir: get_option('libexecdir'),
+    install : true
+  )
 endif
diff --git a/src/intel/vulkan/.gitignore b/src/intel/vulkan/.gitignore
index 4ea978d..b84b171 100644
--- a/src/intel/vulkan/.gitignore
+++ b/src/intel/vulkan/.gitignore
@@ -1,5 +1,6 @@
 # Generated source files
 /anv_extensions.c
+/anv_extensions.h
 /anv_entrypoints.c
 /anv_entrypoints.h
 /anv_timestamp.h
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c
index 2c72f02..7192776 100644
--- a/src/intel/vulkan/anv_allocator.c
+++ b/src/intel/vulkan/anv_allocator.c
@@ -248,6 +248,7 @@
 VkResult
 anv_block_pool_init(struct anv_block_pool *pool,
                     struct anv_device *device,
+                    uint64_t start_address,
                     uint32_t initial_size,
                     uint64_t bo_flags)
 {
@@ -255,6 +256,8 @@
 
    pool->device = device;
    pool->bo_flags = bo_flags;
+   pool->start_address = gen_canonical_address(start_address);
+
    anv_bo_init(&pool->bo, 0, 0);
 
    // Start with a large (2GB) size, assuming that the kernel won't commit pages
@@ -389,6 +392,10 @@
     * hard work for us.
     */
    anv_bo_init(&pool->bo, pool->bo.gem_handle, size);
+   if (pool->bo_flags & EXEC_OBJECT_PINNED) {
+      pool->bo.offset = pool->start_address + BLOCK_POOL_MEMFD_CENTER -
+         center_bo_offset;
+   }
    pool->bo.flags = pool->bo_flags;
    pool->bo.map = map;
    pool->bo.start_offset = BLOCK_POOL_MEMFD_CENTER - center_bo_offset;
@@ -598,10 +605,12 @@
 VkResult
 anv_state_pool_init(struct anv_state_pool *pool,
                     struct anv_device *device,
+                    uint64_t start_address,
                     uint32_t block_size,
                     uint64_t bo_flags)
 {
    VkResult result = anv_block_pool_init(&pool->block_pool, device,
+                                         start_address,
                                          block_size * 16,
                                          bo_flags);
    if (result != VK_SUCCESS)
@@ -964,6 +973,7 @@
          struct bo_pool_bo_link link_copy = VG_NOACCESS_READ(link);
 
          anv_gem_munmap(pool->device, link_copy.bo.gem_handle, link_copy.bo.map, link_copy.bo.size);
+         anv_vma_free(pool->device, &link_copy.bo);
          anv_gem_close(pool->device, link_copy.bo.gem_handle);
          link = link_copy.next;
       }
@@ -1003,11 +1013,15 @@
 
    new_bo.flags = pool->bo_flags;
 
+   if (!anv_vma_alloc(pool->device, &new_bo))
+      return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY);
+
    assert(new_bo.size == pow2_size);
 
    new_bo.map = anv_gem_mmap(pool->device, new_bo.gem_handle, 0, pow2_size, 0);
    if (new_bo.map == MAP_FAILED) {
       anv_gem_close(pool->device, new_bo.gem_handle);
+      anv_vma_free(pool->device, &new_bo);
       return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
    }
 
@@ -1051,8 +1065,10 @@
    for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
       for (unsigned i = 0; i < 16; i++) {
          struct anv_scratch_bo *bo = &pool->bos[i][s];
-         if (bo->exists > 0)
+         if (bo->exists > 0) {
+            anv_vma_free(device, &bo->bo);
             anv_gem_close(device, bo->bo.gem_handle);
+         }
       }
    }
 }
@@ -1150,6 +1166,11 @@
    if (device->instance->physicalDevice.has_exec_async)
       bo->bo.flags |= EXEC_OBJECT_ASYNC;
 
+   if (device->instance->physicalDevice.use_softpin)
+      bo->bo.flags |= EXEC_OBJECT_PINNED;
+
+   anv_vma_alloc(device, &bo->bo);
+
    /* Set the exists last because it may be read by other threads */
    __sync_synchronize();
    bo->exists = true;
@@ -1216,11 +1237,21 @@
    return bo ? &bo->bo : NULL;
 }
 
+#define ANV_BO_CACHE_SUPPORTED_FLAGS \
+   (EXEC_OBJECT_WRITE | \
+    EXEC_OBJECT_ASYNC | \
+    EXEC_OBJECT_SUPPORTS_48B_ADDRESS | \
+    EXEC_OBJECT_PINNED | \
+    ANV_BO_EXTERNAL)
+
 VkResult
 anv_bo_cache_alloc(struct anv_device *device,
                    struct anv_bo_cache *cache,
-                   uint64_t size, struct anv_bo **bo_out)
+                   uint64_t size, uint64_t bo_flags,
+                   struct anv_bo **bo_out)
 {
+   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+
    struct anv_cached_bo *bo =
       vk_alloc(&device->alloc, sizeof(struct anv_cached_bo), 8,
                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
@@ -1238,6 +1269,16 @@
       return result;
    }
 
+   bo->bo.flags = bo_flags;
+
+   if (!anv_vma_alloc(device, &bo->bo)) {
+      anv_gem_close(device, bo->bo.gem_handle);
+      vk_free(&device->alloc, bo);
+      return vk_errorf(device->instance, NULL,
+                       VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                       "failed to allocate virtual address for BO");
+   }
+
    assert(bo->bo.gem_handle);
 
    pthread_mutex_lock(&cache->mutex);
@@ -1255,12 +1296,58 @@
 VkResult anv_bo_cache_import_buffer_handle(struct anv_device* device,
                                            struct anv_bo_cache* cache,
                                            anv_buffer_handle_t gem_handle,
+                                           uint64_t bo_flags,
                                            uint64_t import_size,
                                            struct anv_bo** bo_out)
 {
+   assert(bo_flags == (bo_flags & ANV_BO_CACHE_SUPPORTED_FLAGS));
+   assert(bo_flags & ANV_BO_EXTERNAL);
+
    pthread_mutex_lock(&cache->mutex);
    struct anv_cached_bo *bo = anv_bo_cache_lookup_locked(cache, gem_handle);
    if (bo) {
+      /* We have to be careful how we combine flags so that it makes sense.
+       * Really, though, if we get to this case and it actually matters, the
+       * client has imported a BO twice in different ways and they get what
+       * they have coming.
+       */
+      uint64_t new_flags = ANV_BO_EXTERNAL;
+      new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_WRITE;
+      new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_ASYNC;
+      new_flags |= (bo->bo.flags & bo_flags) & EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+      new_flags |= (bo->bo.flags | bo_flags) & EXEC_OBJECT_PINNED;
+
+      /* It's theoretically possible for a BO to get imported such that it's
+       * both pinned and not pinned.  The only way this can happen is if it
+       * gets imported as both a semaphore and a memory object and that would
+       * be an application error.  Just fail out in that case.
+       */
+      if ((bo->bo.flags & EXEC_OBJECT_PINNED) !=
+          (bo_flags & EXEC_OBJECT_PINNED)) {
+         pthread_mutex_unlock(&cache->mutex);
+         return vk_errorf(device->instance, NULL,
+                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
+                          "The same BO was imported two different ways");
+      }
+
+      /* It's also theoretically possible that someone could export a BO from
+       * one heap and import it into another or to import the same BO into two
+       * different heaps.  If this happens, we could potentially end up both
+       * allowing and disallowing 48-bit addresses.  There's not much we can
+       * do about it if we're pinning so we just throw an error and hope no
+       * app is actually that stupid.
+       */
+      if ((new_flags & EXEC_OBJECT_PINNED) &&
+          (bo->bo.flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) !=
+          (bo_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS)) {
+         pthread_mutex_unlock(&cache->mutex);
+         return vk_errorf(device->instance, NULL,
+                          VK_ERROR_INVALID_EXTERNAL_HANDLE,
+                          "The same BO was imported on two different heaps");
+      }
+
+      bo->bo.flags = new_flags;
+
       __sync_fetch_and_add(&bo->refcount, 1);
    } else {
       off_t size = import_size;
@@ -1281,6 +1368,16 @@
       bo->refcount = 1;
 
       anv_bo_init(&bo->bo, gem_handle, size);
+      bo->bo.flags = bo_flags;
+
+      if (!anv_vma_alloc(device, &bo->bo)) {
+         anv_gem_close(device, bo->bo.gem_handle);
+         pthread_mutex_unlock(&cache->mutex);
+         vk_free(&device->alloc, bo);
+         return vk_errorf(device->instance, NULL,
+                          VK_ERROR_OUT_OF_DEVICE_MEMORY,
+                          "failed to allocate virtual address for BO");
+      }
 
       _mesa_hash_table_insert(cache->bo_map, (void *)(uintptr_t)gem_handle, bo);
    }
@@ -1294,14 +1391,16 @@
 VkResult
 anv_bo_cache_import(struct anv_device* device, 
                     struct anv_bo_cache* cache,
-                    int fd, struct anv_bo** bo_out)
+                    int fd, uint64_t bo_flags,
+                    struct anv_bo** bo_out)
 {
    anv_buffer_handle_t gem_handle = anv_gem_fd_to_handle(device, fd);
    if (!gem_handle) {
       return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
    }
 
-   return anv_bo_cache_import_buffer_handle(device, cache, gem_handle, lseek(fd, 0, SEEK_END), bo_out);
+   return anv_bo_cache_import_buffer_handle(device, cache, gem_handle, bo_flags, 
+      lseek(fd, 0, SEEK_END), bo_out);
 }
 
 VkResult
@@ -1312,6 +1411,12 @@
    assert(anv_bo_cache_lookup(cache, bo_in->gem_handle) == bo_in);
    struct anv_cached_bo *bo = (struct anv_cached_bo *)bo_in;
 
+   /* This BO must have been flagged external in order for us to be able
+    * to export it.  This is done based on external options passed into
+    * anv_AllocateMemory.
+    */
+   assert(bo->bo.flags & ANV_BO_EXTERNAL);
+
    int fd = anv_gem_handle_to_fd(device, bo->bo.gem_handle);
    if (fd < 0)
       return vk_error(VK_ERROR_TOO_MANY_OBJECTS);
@@ -1376,6 +1481,8 @@
    if (bo->bo.map)
       anv_gem_munmap(device, bo->bo.gem_handle, bo->bo.map, bo->bo.size);
 
+   anv_vma_free(device, &bo->bo);
+
    anv_gem_close(device, bo->bo.gem_handle);
 
    /* Don't unlock until we've actually closed the BO.  The whole point of
diff --git a/src/intel/vulkan/anv_android.c b/src/intel/vulkan/anv_android.c
index d453fb8c..bb67a3a 100644
--- a/src/intel/vulkan/anv_android.c
+++ b/src/intel/vulkan/anv_android.c
@@ -128,7 +128,13 @@
     */
    int dma_buf = gralloc_info->handle->data[0];
 
-   result = anv_bo_cache_import(device, &device->bo_cache, dma_buf, &bo);
+   uint64_t bo_flags = ANV_BO_EXTERNAL;
+   if (device->instance->physicalDevice.supports_48bit_addresses)
+      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+   if (device->instance->physicalDevice.use_softpin)
+      bo_flags |= EXEC_OBJECT_PINNED;
+
+   result = anv_bo_cache_import(device, &device->bo_cache, dma_buf, bo_flags, &bo);
    if (result != VK_SUCCESS) {
       return vk_errorf(device->instance, device, result,
                        "failed to import dma-buf from VkNativeBufferANDROID");
@@ -183,9 +189,9 @@
    }
 
    assert(image->n_planes == 1);
-   assert(image->planes[0].bo_offset == 0);
+   assert(image->planes[0].address.offset == 0);
 
-   image->planes[0].bo = bo;
+   image->planes[0].address.bo = bo;
    image->planes[0].bo_is_owned = true;
 
    /* We need to set the WRITE flag on window system buffers so that GEM will
diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c
index 2e60a40..5e21a38 100644
--- a/src/intel/vulkan/anv_batch_chain.c
+++ b/src/intel/vulkan/anv_batch_chain.c
@@ -75,11 +75,24 @@
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
    }
 
+   list->deps = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                 _mesa_key_pointer_equal);
+
+   if (!list->deps) {
+      vk_free(alloc, list->relocs);
+      vk_free(alloc, list->reloc_bos);
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+   }
+
    if (other_list) {
       memcpy(list->relocs, other_list->relocs,
              list->array_length * sizeof(*list->relocs));
       memcpy(list->reloc_bos, other_list->reloc_bos,
              list->array_length * sizeof(*list->reloc_bos));
+      struct set_entry *entry;
+      set_foreach(other_list->deps, entry) {
+         _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key);
+      }
    }
 
    return VK_SUCCESS;
@@ -98,6 +111,7 @@
 {
    vk_free(alloc, list->relocs);
    vk_free(alloc, list->reloc_bos);
+   _mesa_set_destroy(list->deps, NULL);
 }
 
 static VkResult
@@ -148,6 +162,11 @@
    struct drm_i915_gem_relocation_entry *entry;
    int index;
 
+   if (target_bo->flags & EXEC_OBJECT_PINNED) {
+      _mesa_set_add(list->deps, target_bo);
+      return VK_SUCCESS;
+   }
+
    VkResult result = anv_reloc_list_grow(list, alloc, 1);
    if (result != VK_SUCCESS)
       return result;
@@ -185,6 +204,12 @@
       list->relocs[i + list->num_relocs].offset += offset;
 
    list->num_relocs += other->num_relocs;
+
+   struct set_entry *entry;
+   set_foreach(other->deps, entry) {
+      _mesa_set_add_pre_hashed(list->deps, entry->hash, entry->key);
+   }
+
    return VK_SUCCESS;
 }
 
@@ -338,6 +363,7 @@
    batch->end = bbo->bo.map + bbo->bo.size - batch_padding;
    batch->relocs = &bbo->relocs;
    bbo->relocs.num_relocs = 0;
+   _mesa_set_clear(bbo->relocs.deps, NULL);
 }
 
 static void
@@ -390,6 +416,39 @@
 }
 
 static void
+anv_batch_bo_link(struct anv_cmd_buffer *cmd_buffer,
+                  struct anv_batch_bo *prev_bbo,
+                  struct anv_batch_bo *next_bbo,
+                  uint32_t next_bbo_offset)
+{
+   MAYBE_UNUSED const uint32_t bb_start_offset =
+      prev_bbo->length - GEN8_MI_BATCH_BUFFER_START_length * 4;
+   MAYBE_UNUSED const uint32_t *bb_start = prev_bbo->bo.map + bb_start_offset;
+
+   /* Make sure we're looking at a MI_BATCH_BUFFER_START */
+   assert(((*bb_start >> 29) & 0x07) == 0);
+   assert(((*bb_start >> 23) & 0x3f) == 49);
+
+   if (cmd_buffer->device->instance->physicalDevice.use_softpin) {
+      assert(prev_bbo->bo.flags & EXEC_OBJECT_PINNED);
+      assert(next_bbo->bo.flags & EXEC_OBJECT_PINNED);
+
+      write_reloc(cmd_buffer->device,
+                  prev_bbo->bo.map + bb_start_offset + 4,
+                  next_bbo->bo.offset + next_bbo_offset, true);
+   } else {
+      uint32_t reloc_idx = prev_bbo->relocs.num_relocs - 1;
+      assert(prev_bbo->relocs.relocs[reloc_idx].offset == bb_start_offset + 4);
+
+      prev_bbo->relocs.reloc_bos[reloc_idx] = &next_bbo->bo;
+      prev_bbo->relocs.relocs[reloc_idx].delta = next_bbo_offset;
+
+      /* Use a bogus presumed offset to force a relocation */
+      prev_bbo->relocs.relocs[reloc_idx].presumed_offset = -1;
+   }
+}
+
+static void
 anv_batch_bo_destroy(struct anv_batch_bo *bbo,
                      struct anv_cmd_buffer *cmd_buffer)
 {
@@ -415,16 +474,8 @@
          break;
       list_addtail(&new_bbo->link, new_list);
 
-      if (prev_bbo) {
-         /* As we clone this list of batch_bo's, they chain one to the
-          * other using MI_BATCH_BUFFER_START commands.  We need to fix up
-          * those relocations as we go.  Fortunately, this is pretty easy
-          * as it will always be the last relocation in the list.
-          */
-         uint32_t last_idx = prev_bbo->relocs.num_relocs - 1;
-         assert(prev_bbo->relocs.reloc_bos[last_idx] == &bbo->bo);
-         prev_bbo->relocs.reloc_bos[last_idx] = &new_bbo->bo;
-      }
+      if (prev_bbo)
+         anv_batch_bo_link(cmd_buffer, prev_bbo, new_bbo, 0);
 
       prev_bbo = new_bbo;
    }
@@ -452,7 +503,7 @@
 {
    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    return (struct anv_address) {
-      .bo = &cmd_buffer->device->surface_state_pool.block_pool.bo,
+      .bo = &anv_binding_table_pool(cmd_buffer->device)->block_pool.bo,
       .offset = bt_block->offset,
    };
 }
@@ -480,7 +531,7 @@
    anv_batch_emit(&cmd_buffer->batch, GEN8_MI_BATCH_BUFFER_START, bbs) {
       bbs.DWordLength               = cmd_buffer->device->info.gen < 8 ?
                                       gen7_length : gen8_length;
-      bbs._2ndLevelBatchBuffer      = _1stlevelbatch;
+      bbs.SecondLevelBatchBuffer    = Firstlevelbatch;
       bbs.AddressSpaceIndicator     = ASI_PPGTT;
       bbs.BatchBufferStartAddress   = (struct anv_address) { bo, offset };
    }
@@ -619,7 +670,8 @@
 anv_cmd_buffer_alloc_binding_table(struct anv_cmd_buffer *cmd_buffer,
                                    uint32_t entries, uint32_t *state_offset)
 {
-   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
+   struct anv_device *device = cmd_buffer->device;
+   struct anv_state_pool *state_pool = &device->surface_state_pool;
    struct anv_state *bt_block = u_vector_head(&cmd_buffer->bt_block_states);
    struct anv_state state;
 
@@ -629,12 +681,19 @@
       return (struct anv_state) { 0 };
 
    state.offset = cmd_buffer->bt_next;
-   state.map = state_pool->block_pool.map + bt_block->offset + state.offset;
+   state.map = anv_binding_table_pool(device)->block_pool.map +
+      bt_block->offset + state.offset;
 
    cmd_buffer->bt_next += state.alloc_size;
 
-   assert(bt_block->offset < 0);
-   *state_offset = -bt_block->offset;
+   if (device->instance->physicalDevice.use_softpin) {
+      assert(bt_block->offset >= 0);
+      *state_offset = device->surface_state_pool.block_pool.start_address -
+         device->binding_table_pool.block_pool.start_address - bt_block->offset;
+   } else {
+      assert(bt_block->offset < 0);
+      *state_offset = -bt_block->offset;
+   }
 
    return state;
 }
@@ -658,15 +717,13 @@
 VkResult
 anv_cmd_buffer_new_binding_table_block(struct anv_cmd_buffer *cmd_buffer)
 {
-   struct anv_state_pool *state_pool = &cmd_buffer->device->surface_state_pool;
-
    struct anv_state *bt_block = u_vector_add(&cmd_buffer->bt_block_states);
    if (bt_block == NULL) {
       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
    }
 
-   *bt_block = anv_state_pool_alloc_back(state_pool);
+   *bt_block = anv_binding_table_pool_alloc(cmd_buffer->device);
    cmd_buffer->bt_next = 0;
 
    return VK_SUCCESS;
@@ -740,7 +797,7 @@
 {
    struct anv_state *bt_block;
    u_vector_foreach(bt_block, &cmd_buffer->bt_block_states)
-      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
+      anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
    u_vector_finish(&cmd_buffer->bt_block_states);
 
    anv_reloc_list_finish(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc);
@@ -772,12 +829,13 @@
 
    while (u_vector_length(&cmd_buffer->bt_block_states) > 1) {
       struct anv_state *bt_block = u_vector_remove(&cmd_buffer->bt_block_states);
-      anv_state_pool_free(&cmd_buffer->device->surface_state_pool, *bt_block);
+      anv_binding_table_pool_free(cmd_buffer->device, *bt_block);
    }
    assert(u_vector_length(&cmd_buffer->bt_block_states) == 1);
    cmd_buffer->bt_next = 0;
 
    cmd_buffer->surface_relocs.num_relocs = 0;
+   _mesa_set_clear(cmd_buffer->surface_relocs.deps, NULL);
    cmd_buffer->last_ss_pool_center = 0;
 
    /* Reset the list of seen buffers */
@@ -810,20 +868,18 @@
          anv_batch_emit(&cmd_buffer->batch, GEN8_MI_NOOP, noop);
 
       cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_PRIMARY;
-   }
-
-   anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
-
-   if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+   } else {
+      assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
       /* If this is a secondary command buffer, we need to determine the
        * mode in which it will be executed with vkExecuteCommands.  We
        * determine this statically here so that this stays in sync with the
        * actual ExecuteCommands implementation.
        */
+      const uint32_t length = cmd_buffer->batch.next - cmd_buffer->batch.start;
       if (!cmd_buffer->device->can_chain_batches) {
          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_GROW_AND_EMIT;
       } else if ((cmd_buffer->batch_bos.next == cmd_buffer->batch_bos.prev) &&
-          (batch_bo->length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) {
+                 (length < ANV_CMD_BUFFER_BATCH_SIZE / 2)) {
          /* If the secondary has exactly one batch buffer in its list *and*
           * that batch buffer is less than half of the maximum size, we're
           * probably better of simply copying it into our batch.
@@ -833,17 +889,28 @@
                    VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_CHAIN;
 
-         /* When we chain, we need to add an MI_BATCH_BUFFER_START command
-          * with its relocation.  In order to handle this we'll increment here
-          * so we can unconditionally decrement right before adding the
-          * MI_BATCH_BUFFER_START command.
+         /* In order to chain, we need this command buffer to contain an
+          * MI_BATCH_BUFFER_START which will jump back to the calling batch.
+          * It doesn't matter where it points now so long as it has a valid
+          * relocation.  We'll adjust it later as part of the chaining
+          * process.
+          *
+          * We set the end of the batch a little short so we would be sure we
+          * have room for the chaining command.  Since we're about to emit the
+          * chaining command, let's set it back where it should go.
           */
-         batch_bo->relocs.num_relocs++;
-         cmd_buffer->batch.next += GEN8_MI_BATCH_BUFFER_START_length * 4;
+         cmd_buffer->batch.end += GEN8_MI_BATCH_BUFFER_START_length * 4;
+         assert(cmd_buffer->batch.start == batch_bo->bo.map);
+         assert(cmd_buffer->batch.end == batch_bo->bo.map + batch_bo->bo.size);
+
+         emit_batch_buffer_start(cmd_buffer, &batch_bo->bo, 0);
+         assert(cmd_buffer->batch.start == batch_bo->bo.map);
       } else {
          cmd_buffer->exec_mode = ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN;
       }
    }
+
+   anv_batch_bo_finish(batch_bo, &cmd_buffer->batch);
 }
 
 static VkResult
@@ -888,33 +955,13 @@
       struct anv_batch_bo *this_bbo = anv_cmd_buffer_current_batch_bo(primary);
       assert(primary->batch.start == this_bbo->bo.map);
       uint32_t offset = primary->batch.next - primary->batch.start;
-      const uint32_t inst_size = GEN8_MI_BATCH_BUFFER_START_length * 4;
 
-      /* Roll back the previous MI_BATCH_BUFFER_START and its relocation so we
-       * can emit a new command and relocation for the current splice.  In
-       * order to handle the initial-use case, we incremented next and
-       * num_relocs in end_batch_buffer() so we can alyways just subtract
-       * here.
+      /* Make the tail of the secondary point back to right after the
+       * MI_BATCH_BUFFER_START in the primary batch.
        */
-      last_bbo->relocs.num_relocs--;
-      secondary->batch.next -= inst_size;
-      emit_batch_buffer_start(secondary, &this_bbo->bo, offset);
+      anv_batch_bo_link(primary, last_bbo, this_bbo, offset);
+
       anv_cmd_buffer_add_seen_bbos(primary, &secondary->batch_bos);
-
-      /* After patching up the secondary buffer, we need to clflush the
-       * modified instruction in case we're on a !llc platform. We use a
-       * little loop to handle the case where the instruction crosses a cache
-       * line boundary.
-       */
-      if (!primary->device->info.has_llc) {
-         void *inst = secondary->batch.next - inst_size;
-         void *p = (void *) (((uintptr_t) inst) & ~CACHELINE_MASK);
-         __builtin_ia32_mfence();
-         while (p < secondary->batch.next) {
-            __builtin_ia32_clflush(p);
-            p += CACHELINE_SIZE;
-         }
-      }
       break;
    }
    case ANV_CMD_BUFFER_EXEC_MODE_COPY_AND_CHAIN: {
@@ -958,6 +1005,8 @@
    /* Allocated length of the 'objects' and 'bos' arrays */
    uint32_t                                  array_length;
 
+   bool                                      has_relocs;
+
    uint32_t                                  fence_count;
    uint32_t                                  fence_array_length;
    struct drm_i915_gem_exec_fence *          fences;
@@ -980,6 +1029,15 @@
    vk_free(alloc, exec->syncobjs);
 }
 
+static int
+_compare_bo_handles(const void *_bo1, const void *_bo2)
+{
+   const struct anv_bo *a = *(struct anv_bo * const *)_bo1;
+   const struct anv_bo *b = *(struct anv_bo * const *)_bo2;
+   /* qsort() comparator: compare, don't subtract, so the sign can't wrap. */
+   return (a->gem_handle > b->gem_handle) - (a->gem_handle < b->gem_handle);
+}
+
 static VkResult
 anv_execbuf_add_bo(struct anv_execbuf *exec,
                    struct anv_bo *bo,
@@ -1039,26 +1097,60 @@
       obj->relocs_ptr = 0;
       obj->alignment = 0;
       obj->offset = bo->offset;
-      obj->flags = bo->flags | extra_flags;
+      obj->flags = (bo->flags & ~ANV_BO_FLAG_MASK) | extra_flags;
       obj->rsvd1 = bo->start_offset;
       obj->rsvd2 = bo->size;
    }
 
-   if (relocs != NULL && obj->relocation_count == 0) {
-      /* This is the first time we've ever seen a list of relocations for
-       * this BO.  Go ahead and set the relocations and then walk the list
-       * of relocations and add them all.
-       */
-      obj->relocation_count = relocs->num_relocs;
-      obj->relocs_ptr = (uintptr_t) relocs->relocs;
+   if (relocs != NULL) {
+      assert(obj->relocation_count == 0);
 
-      for (size_t i = 0; i < relocs->num_relocs; i++) {
-         VkResult result;
+      if (relocs->num_relocs > 0) {
+         /* This is the first time we've ever seen a list of relocations for
+          * this BO.  Go ahead and set the relocations and then walk the list
+          * of relocations and add them all.
+          */
+         exec->has_relocs = true;
+         obj->relocation_count = relocs->num_relocs;
+         obj->relocs_ptr = (uintptr_t) relocs->relocs;
 
-         /* A quick sanity check on relocations */
-         assert(relocs->relocs[i].offset < bo->size);
-         result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL,
-                                     extra_flags, alloc);
+         for (size_t i = 0; i < relocs->num_relocs; i++) {
+            VkResult result;
+
+            /* A quick sanity check on relocations */
+            assert(relocs->relocs[i].offset < bo->size);
+            result = anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL,
+                                        extra_flags, alloc);
+
+            if (result != VK_SUCCESS)
+               return result;
+         }
+      }
+
+      if (relocs->deps && relocs->deps->entries > 0) {
+         const uint32_t entries = relocs->deps->entries;
+         struct anv_bo **bos =
+            vk_alloc(alloc, entries * sizeof(*bos),
+                     8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
+         if (bos == NULL)
+            return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+         struct set_entry *entry;
+         struct anv_bo **bo = bos;
+         set_foreach(relocs->deps, entry) {
+            *bo++ = (void *)entry->key;
+         }
+
+         qsort(bos, entries, sizeof(struct anv_bo*), _compare_bo_handles);
+
+         VkResult result = VK_SUCCESS;
+         for (bo = bos; bo < bos + entries; bo++) {
+            result = anv_execbuf_add_bo(exec, *bo, NULL, extra_flags, alloc);
+            if (result != VK_SUCCESS)
+               break;
+         }
+
+         vk_free(alloc, bos);
 
          if (result != VK_SUCCESS)
             return result;
@@ -1106,32 +1198,6 @@
 }
 
 static void
-write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
-{
-   unsigned reloc_size = 0;
-   if (device->info.gen >= 8) {
-      /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress:
-       *
-       *    "This field specifies the address of the memory location where the
-       *    register value specified in the DWord above will read from. The
-       *    address specifies the DWord location of the data. Range =
-       *    GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress
-       *    [63:48] are ignored by the HW and assumed to be in correct
-       *    canonical form [63:48] == [47]."
-       */
-      const int shift = 63 - 47;
-      reloc_size = sizeof(uint64_t);
-      *(uint64_t *)p = (((int64_t)v) << shift) >> shift;
-   } else {
-      reloc_size = sizeof(uint32_t);
-      *(uint32_t *)p = v;
-   }
-
-   if (flush && !device->info.has_llc)
-      gen_flush_range(p, reloc_size);
-}
-
-static void
 adjust_relocations_from_state_pool(struct anv_state_pool *pool,
                                    struct anv_reloc_list *relocs,
                                    uint32_t last_pool_center_bo_offset)
@@ -1246,6 +1312,9 @@
 relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer,
                     struct anv_execbuf *exec)
 {
+   if (!exec->has_relocs)
+      return true;
+
    static int userspace_relocs = -1;
    if (userspace_relocs < 0)
       userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true);
@@ -1349,14 +1418,20 @@
       first_batch_bo->bo.index = last_idx;
    }
 
+   /* If we are pinning our BOs, we shouldn't have to relocate anything */
+   if (cmd_buffer->device->instance->physicalDevice.use_softpin)
+      assert(!execbuf->has_relocs);
+
    /* Now we go through and fixup all of the relocation lists to point to
     * the correct indices in the object array.  We have to do this after we
     * reorder the list above as some of the indices may have changed.
     */
-   u_vector_foreach(bbo, &cmd_buffer->seen_bbos)
-      anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);
+   if (execbuf->has_relocs) {
+      u_vector_foreach(bbo, &cmd_buffer->seen_bbos)
+         anv_cmd_buffer_process_relocs(cmd_buffer, &(*bbo)->relocs);
 
-   anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);
+      anv_cmd_buffer_process_relocs(cmd_buffer, &cmd_buffer->surface_relocs);
+   }
 
    if (!cmd_buffer->device->info.has_llc) {
       __builtin_ia32_mfence();
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 520f371..1cc632b 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -30,11 +30,11 @@
 {
    struct anv_device *device = blorp->driver_ctx;
 
-   /* The blorp cache must be a real cache */
-   assert(device->blorp_shader_cache.cache);
+   /* The default cache must be a real cache */
+   assert(device->default_pipeline_cache.cache);
 
    struct anv_shader_bin *bin =
-      anv_pipeline_cache_search(&device->blorp_shader_cache, key, key_size);
+      anv_pipeline_cache_search(&device->default_pipeline_cache, key, key_size);
    if (!bin)
       return false;
 
@@ -60,7 +60,7 @@
    struct anv_device *device = blorp->driver_ctx;
 
    /* The blorp cache must be a real cache */
-   assert(device->blorp_shader_cache.cache);
+   assert(device->default_pipeline_cache.cache);
 
    struct anv_pipeline_bind_map bind_map = {
       .surface_count = 0,
@@ -68,8 +68,9 @@
    };
 
    struct anv_shader_bin *bin =
-      anv_pipeline_cache_upload_kernel(&device->blorp_shader_cache,
+      anv_pipeline_cache_upload_kernel(&device->default_pipeline_cache,
                                        key, key_size, kernel, kernel_size,
+                                       NULL, 0,
                                        prog_data, prog_data_size, &bind_map);
 
    if (!bin)
@@ -89,7 +90,6 @@
 void
 anv_device_init_blorp(struct anv_device *device)
 {
-   anv_pipeline_cache_init(&device->blorp_shader_cache, device, true);
    blorp_init(&device->blorp, device, &device->isl_dev);
    device->blorp.compiler = device->instance->physicalDevice.compiler;
    device->blorp.lookup_shader = lookup_blorp_shader;
@@ -123,7 +123,6 @@
 anv_device_finish_blorp(struct anv_device *device)
 {
    blorp_finish(&device->blorp);
-   anv_pipeline_cache_finish(&device->blorp_shader_cache);
 }
 
 static void
@@ -154,9 +153,9 @@
    *blorp_surf = (struct blorp_surf) {
       .surf = isl_surf,
       .addr = {
-         .buffer = buffer->bo,
-         .offset = buffer->offset + offset,
-         .mocs = device->default_mocs,
+         .buffer = buffer->address.bo,
+         .offset = buffer->address.offset + offset,
+         .mocs = anv_mocs_for_bo(device, buffer->address.bo),
       },
    };
 
@@ -207,9 +206,9 @@
    *blorp_surf = (struct blorp_surf) {
       .surf = &surface->isl,
       .addr = {
-         .buffer = image->planes[plane].bo,
-         .offset = image->planes[plane].bo_offset + surface->offset,
-         .mocs = device->default_mocs,
+         .buffer = image->planes[plane].address.bo,
+         .offset = image->planes[plane].address.offset + surface->offset,
+         .mocs = anv_mocs_for_bo(device, image->planes[plane].address.bo),
       },
    };
 
@@ -217,9 +216,9 @@
       const struct anv_surface *aux_surface = &image->planes[plane].aux_surface;
       blorp_surf->aux_surf = &aux_surface->isl,
       blorp_surf->aux_addr = (struct blorp_address) {
-         .buffer = image->planes[plane].bo,
-         .offset = image->planes[plane].bo_offset + aux_surface->offset,
-         .mocs = device->default_mocs,
+         .buffer = image->planes[plane].address.bo,
+         .offset = image->planes[plane].address.offset + aux_surface->offset,
+         .mocs = anv_mocs_for_bo(device, image->planes[plane].address.bo),
       };
       blorp_surf->aux_usage = aux_usage;
 
@@ -514,13 +513,13 @@
 
    struct blorp_surf src, dst;
 
-   uint32_t gl_filter;
+   enum blorp_filter blorp_filter;
    switch (filter) {
    case VK_FILTER_NEAREST:
-      gl_filter = 0x2600; /* GL_NEAREST */
+      blorp_filter = BLORP_FILTER_NEAREST;
       break;
    case VK_FILTER_LINEAR:
-      gl_filter = 0x2601; /* GL_LINEAR */
+      blorp_filter = BLORP_FILTER_BILINEAR;
       break;
    default:
       unreachable("Invalid filter");
@@ -610,7 +609,7 @@
                        dst_format.isl_format, dst_format.swizzle,
                        src_x0, src_y0, src_x1, src_y1,
                        dst_x0, dst_y0, dst_x1, dst_y1,
-                       gl_filter, flip_x, flip_y);
+                       blorp_filter, flip_x, flip_y);
          }
       }
    }
@@ -667,14 +666,14 @@
 
    for (unsigned r = 0; r < regionCount; r++) {
       struct blorp_address src = {
-         .buffer = src_buffer->bo,
-         .offset = src_buffer->offset + pRegions[r].srcOffset,
-         .mocs = cmd_buffer->device->default_mocs,
+         .buffer = src_buffer->address.bo,
+         .offset = src_buffer->address.offset + pRegions[r].srcOffset,
+         .mocs = anv_mocs_for_bo(cmd_buffer->device, src_buffer->address.bo),
       };
       struct blorp_address dst = {
-         .buffer = dst_buffer->bo,
-         .offset = dst_buffer->offset + pRegions[r].dstOffset,
-         .mocs = cmd_buffer->device->default_mocs,
+         .buffer = dst_buffer->address.bo,
+         .offset = dst_buffer->address.offset + pRegions[r].dstOffset,
+         .mocs = anv_mocs_for_bo(cmd_buffer->device, dst_buffer->address.bo),
       };
 
       blorp_buffer_copy(&batch, src, dst, pRegions[r].size);
@@ -725,9 +724,9 @@
          .mocs = cmd_buffer->device->default_mocs,
       };
       struct blorp_address dst = {
-         .buffer = dst_buffer->bo,
-         .offset = dst_buffer->offset + dstOffset,
-         .mocs = cmd_buffer->device->default_mocs,
+         .buffer = dst_buffer->address.bo,
+         .offset = dst_buffer->address.offset + dstOffset,
+         .mocs = anv_mocs_for_bo(cmd_buffer->device, dst_buffer->address.bo),
       };
 
       blorp_buffer_copy(&batch, src, dst, copy_size);
@@ -1066,7 +1065,7 @@
 {
    static const union isl_color_value color_value = { .u32 = { 0, } };
    const struct anv_subpass *subpass = cmd_buffer->state.subpass;
-   const uint32_t att_idx = subpass->depth_stencil_attachment.attachment;
+   const uint32_t att_idx = subpass->depth_stencil_attachment->attachment;
 
    if (att_idx == VK_ATTACHMENT_UNUSED)
       return;
@@ -1177,7 +1176,8 @@
                 struct blorp_surf *dst_surf,
                 uint32_t dst_level, uint32_t dst_layer,
                 uint32_t src_x, uint32_t src_y, uint32_t dst_x, uint32_t dst_y,
-                uint32_t width, uint32_t height)
+                uint32_t width, uint32_t height,
+                enum blorp_filter filter)
 {
    blorp_blit(batch,
               src_surf, src_level, src_layer,
@@ -1186,7 +1186,7 @@
               ISL_FORMAT_UNSUPPORTED, ISL_SWIZZLE_IDENTITY,
               src_x, src_y, src_x + width, src_y + height,
               dst_x, dst_y, dst_x + width, dst_y + height,
-              0x2600 /* GL_NEAREST */, false, false);
+              filter, false, false);
 }
 
 static void
@@ -1225,13 +1225,22 @@
                                         dst_surf.aux_usage,
                                         dst_level, dst_layer, 1);
 
+      enum blorp_filter filter;
+      if ((src_surf.surf->usage & ISL_SURF_USAGE_DEPTH_BIT) ||
+          (src_surf.surf->usage & ISL_SURF_USAGE_STENCIL_BIT) ||
+          isl_format_has_int_channel(src_surf.surf->format)) {
+         filter = BLORP_FILTER_SAMPLE_0;
+      } else {
+         filter = BLORP_FILTER_AVERAGE;
+      }
+
       assert(!src_image->format->can_ycbcr);
       assert(!dst_image->format->can_ycbcr);
 
       resolve_surface(batch,
                       &src_surf, src_level, src_layer,
                       &dst_surf, dst_level, dst_layer,
-                      src_x, src_y, dst_x, dst_y, width, height);
+                      src_x, src_y, dst_x, dst_y, width, height, filter);
    }
 }
 
@@ -1346,6 +1355,13 @@
          assert(src_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT &&
                 dst_iview->aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT);
 
+         enum blorp_filter filter;
+         if (isl_format_has_int_channel(src_iview->planes[0].isl.format)) {
+            filter = BLORP_FILTER_SAMPLE_0;
+         } else {
+            filter = BLORP_FILTER_AVERAGE;
+         }
+
          struct blorp_surf src_surf, dst_surf;
          get_blorp_surf_for_anv_image(cmd_buffer->device, src_iview->image,
                                       VK_IMAGE_ASPECT_COLOR_BIT,
@@ -1387,7 +1403,8 @@
                             base_dst_layer + i,
                             render_area.offset.x, render_area.offset.y,
                             render_area.offset.x, render_area.offset.y,
-                            render_area.extent.width, render_area.extent.height);
+                            render_area.extent.width, render_area.extent.height,
+                            filter);
          }
       }
 
@@ -1416,10 +1433,11 @@
    struct blorp_surf shadow_surf = {
       .surf = &image->planes[0].shadow_surface.isl,
       .addr = {
-         .buffer = image->planes[0].bo,
-         .offset = image->planes[0].bo_offset +
+         .buffer = image->planes[0].address.bo,
+         .offset = image->planes[0].address.offset +
                    image->planes[0].shadow_surface.offset,
-         .mocs = cmd_buffer->device->default_mocs,
+         .mocs = anv_mocs_for_bo(cmd_buffer->device,
+                                 image->planes[0].address.bo),
       },
    };
 
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 67c4f8c..8ef71b0 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -916,11 +916,11 @@
    const struct anv_subpass *subpass = cmd_buffer->state.subpass;
    const struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
 
-   if (subpass->depth_stencil_attachment.attachment == VK_ATTACHMENT_UNUSED)
+   if (subpass->depth_stencil_attachment == NULL)
       return NULL;
 
    const struct anv_image_view *iview =
-      fb->attachments[subpass->depth_stencil_attachment.attachment];
+      fb->attachments[subpass->depth_stencil_attachment->attachment];
 
    assert(iview->aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT |
                                 VK_IMAGE_ASPECT_STENCIL_BIT));
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 8b200f8..66ed282 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -780,9 +780,8 @@
          &set->buffer_views[bind_layout->buffer_index + element];
 
       bview->format = anv_isl_format_for_descriptor_type(type);
-      bview->bo = buffer->bo;
-      bview->offset = buffer->offset + offset;
       bview->range = anv_buffer_get_range(buffer, offset, range);
+      bview->address = anv_address_add(buffer->address, offset);
 
       /* If we're writing descriptors through a push command, we need to
        * allocate the surface state from the command buffer. Otherwise it will
@@ -793,7 +792,7 @@
 
       anv_fill_buffer_surface_state(device, bview->surface_state,
                                     bview->format,
-                                    bview->offset, bview->range, 1);
+                                    bview->address, bview->range, 1);
 
       *desc = (struct anv_descriptor) {
          .type = type,
@@ -903,15 +902,9 @@
                                   const struct anv_descriptor_update_template *template,
                                   const void *data)
 {
-   const struct anv_descriptor_set_layout *layout = set->layout;
-
    for (uint32_t i = 0; i < template->entry_count; i++) {
       const struct anv_descriptor_template_entry *entry =
          &template->entries[i];
-      const struct anv_descriptor_set_binding_layout *bind_layout =
-         &layout->binding[entry->binding];
-      struct anv_descriptor *desc = &set->descriptors[bind_layout->descriptor_index];
-      desc += entry->array_element;
 
       switch (entry->type) {
       case VK_DESCRIPTOR_TYPE_SAMPLER:
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index edf96d6..de6dfdf 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -35,6 +35,7 @@
 #include "util/strtod.h"
 #include "util/debug.h"
 #include "util/build_id.h"
+#include "util/disk_cache.h"
 #include "util/mesa-sha1.h"
 #include "vk_util.h"
 #include "common/gen_defines.h"
@@ -58,23 +59,8 @@
 }
 
 static VkResult
-anv_compute_heap_size(int fd, uint64_t *heap_size)
+anv_compute_heap_size(int fd, uint64_t gtt_size, uint64_t *heap_size)
 {
-   uint64_t gtt_size;
-   if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE,
-                                 &gtt_size) == -1) {
-      /* If, for whatever reason, we can't actually get the GTT size from the
-       * kernel (too old?) fall back to the aperture size.
-       */
-      anv_perf_warn(NULL, NULL,
-                    "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m");
-
-      if (anv_gem_get_aperture(fd, &gtt_size) == -1) {
-         return vk_errorf(NULL, NULL, VK_ERROR_INITIALIZATION_FAILED,
-                          "failed to get aperture size: %m");
-      }
-   }
-
    /* Query the total ram from the system */
    // struct sysinfo info;
    // sysinfo(&info);
@@ -104,15 +90,26 @@
 static VkResult
 anv_physical_device_init_heaps(struct anv_physical_device *device, int fd)
 {
-   /* The kernel query only tells us whether or not the kernel supports the
-    * EXEC_OBJECT_SUPPORTS_48B_ADDRESS flag and not whether or not the
-    * hardware has actual 48bit address support.
-    */
-   device->supports_48bit_addresses =
-      (device->info.gen >= 8) && anv_gem_supports_48b_addresses(fd);
+   uint64_t gtt_size;
+   if (anv_gem_get_context_param(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE,
+                                 &gtt_size) == -1) {
+      /* If, for whatever reason, we can't actually get the GTT size from the
+       * kernel (too old?) fall back to the aperture size.
+       */
+      anv_perf_warn(NULL, NULL,
+                    "Failed to get I915_CONTEXT_PARAM_GTT_SIZE: %m");
+
+      if (anv_gem_get_aperture(fd, &gtt_size) == -1) {
+         return vk_errorf(NULL, NULL, VK_ERROR_INITIALIZATION_FAILED,
+                          "failed to get aperture size: %m");
+      }
+   }
+
+   device->supports_48bit_addresses = (device->info.gen >= 8) &&
+      gtt_size > (4ULL << 30 /* GiB */);
 
    uint64_t heap_size = 0;
-   VkResult result = anv_compute_heap_size(fd, &heap_size);
+   VkResult result = anv_compute_heap_size(fd, gtt_size, &heap_size);
    if (result != VK_SUCCESS)
       return result;
 
@@ -236,6 +233,8 @@
    //                     "build-id too short.  It needs to be a SHA");
    // }
 
+   //memcpy(device->driver_build_sha1, build_id_data(note), 20);
+
    struct mesa_sha1 sha1_ctx;
    uint8_t sha1[20];
    STATIC_ASSERT(VK_UUID_SIZE <= sizeof(sha1));
@@ -274,13 +273,46 @@
    return VK_SUCCESS;
 }
 
+static void
+anv_physical_device_init_disk_cache(struct anv_physical_device *device)
+{ /* Sets device->disk_cache: result of disk_cache_create(), or NULL without ENABLE_SHADER_CACHE. */
+#ifdef ENABLE_SHADER_CACHE
+   char renderer[10]; /* room for the anv_ prefix + 4 hex digits + NUL, one byte spare */
+   MAYBE_UNUSED int len = snprintf(renderer, sizeof(renderer), "anv_%04x",
+                                   device->chipset_id);
+   assert(len == sizeof(renderer) - 2); /* i.e. exactly 8 characters were formatted */
+
+   char timestamp[41]; /* presumably 40 hex digits of a SHA-1 + NUL -- TODO confirm */
+   _mesa_sha1_format(timestamp, device->driver_build_sha1); /* NOTE(review): the memcpy filling driver_build_sha1 is commented out earlier in this file -- confirm it is set */
+
+   const uint64_t driver_flags =
+      brw_get_compiler_config_value(device->compiler);
+   device->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
+#else /* !ENABLE_SHADER_CACHE */
+   device->disk_cache = NULL;
+#endif /* ENABLE_SHADER_CACHE */
+}
+
+static void
+anv_physical_device_free_disk_cache(struct anv_physical_device *device)
+{ /* Counterpart of anv_physical_device_init_disk_cache(): destroys device->disk_cache if non-NULL. */
+#ifdef ENABLE_SHADER_CACHE
+   if (device->disk_cache)
+      disk_cache_destroy(device->disk_cache);
+#else /* !ENABLE_SHADER_CACHE: init stored NULL, so there is nothing to destroy */
+   assert(device->disk_cache == NULL);
+#endif /* ENABLE_SHADER_CACHE */
+}
+
 static VkResult
 anv_physical_device_init(struct anv_physical_device *device,
                          struct anv_instance *instance,
+                         const char *primary_path,
                          const char *path)
 {
    VkResult result;
    int fd;
+   int master_fd = -1;
 
    brw_process_intel_debug_variable();
 
@@ -322,6 +354,8 @@
       intel_logw("Bay Trail Vulkan support is incomplete");
    } else if (device->info.gen >= 8 && device->info.gen <= 10) {
       /* Gen8-10 fully supported */
+   } else if (device->info.gen == 11) {
+      intel_logw("Vulkan is not yet fully supported on gen11.");
    } else {
       result = vk_errorf(device->instance, device,
                          VK_ERROR_INCOMPATIBLE_DRIVER,
@@ -375,6 +409,12 @@
                               anv_gem_supports_syncobj_wait(fd);
    device->has_context_priority = anv_gem_has_context_priority(fd);
 
+   device->use_softpin = anv_gem_get_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN)
+      && device->supports_48bit_addresses;
+
+   device->has_context_isolation =
+      anv_gem_get_param(fd, I915_PARAM_HAS_CONTEXT_ISOLATION);
+
    bool swizzled = anv_gem_get_bit6_swizzle(fd, I915_TILING_X);
 
    /* Starting with Gen10, the timestamp frequency of the command streamer may
@@ -427,7 +467,9 @@
    device->compiler->shader_debug_log = compiler_debug_log;
    device->compiler->shader_perf_log = compiler_perf_log;
    device->compiler->supports_pull_constants = false;
-   device->compiler->constant_buffer_0_is_relative = true;
+   device->compiler->constant_buffer_0_is_relative =
+      device->info.gen < 8 || !device->has_context_isolation;
+   device->compiler->supports_shader_constants = true;
 
    isl_device_init(&device->isl_dev, &device->info, swizzled);
 
@@ -435,20 +477,41 @@
    if (result != VK_SUCCESS)
       goto fail;
 
+   anv_physical_device_init_disk_cache(device);
+
+   if (instance->enabled_extensions.KHR_display) {
+      master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
+      if (master_fd >= 0) {
+         /* prod the device with a GETPARAM call which will fail if
+          * we don't have permission to even render on this device
+          */
+         if (anv_gem_get_param(master_fd, I915_PARAM_CHIPSET_ID) == 0) {
+            close(master_fd);
+            master_fd = -1;
+         }
+      }
+   }
+   device->master_fd = master_fd;
+
    result = anv_init_wsi(device);
    if (result != VK_SUCCESS) {
       ralloc_free(device->compiler);
+      anv_physical_device_free_disk_cache(device);
       goto fail;
    }
 
    anv_physical_device_get_supported_extensions(device,
                                                 &device->supported_extensions);
 
+
    device->local_fd = fd;
+
    return VK_SUCCESS;
 
 fail:
    close(fd);
+   if (master_fd != -1)
+      close(master_fd);
    return result;
 }
 
@@ -456,8 +519,11 @@
 anv_physical_device_finish(struct anv_physical_device *device)
 {
    anv_finish_wsi(device);
+   anv_physical_device_free_disk_cache(device);
    ralloc_free(device->compiler);
    close(device->local_fd);
+   if (device->master_fd >= 0)
+      close(device->master_fd);
 }
 
 static void *
@@ -545,20 +611,33 @@
    else
       instance->alloc = default_alloc;
 
-   if (pCreateInfo->pApplicationInfo &&
-       pCreateInfo->pApplicationInfo->apiVersion != 0) {
-      instance->apiVersion = pCreateInfo->pApplicationInfo->apiVersion;
-   } else {
-      anv_EnumerateInstanceVersion(&instance->apiVersion);
+   instance->app_info = (struct anv_app_info) { .api_version = 0 };
+   if (pCreateInfo->pApplicationInfo) {
+      const VkApplicationInfo *app = pCreateInfo->pApplicationInfo;
+
+      instance->app_info.app_name =
+         vk_strdup(&instance->alloc, app->pApplicationName,
+                   VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+      instance->app_info.app_version = app->applicationVersion;
+
+      instance->app_info.engine_name =
+         vk_strdup(&instance->alloc, app->pEngineName,
+                   VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+      instance->app_info.engine_version = app->engineVersion;
+
+      instance->app_info.api_version = app->apiVersion;
    }
 
+   if (instance->app_info.api_version == 0)
+      anv_EnumerateInstanceVersion(&instance->app_info.api_version);
+
    instance->enabled_extensions = enabled_extensions;
 
    for (unsigned i = 0; i < ARRAY_SIZE(instance->dispatch.entrypoints); i++) {
       /* Vulkan requires that entrypoints for extensions which have not been
        * enabled must not be advertised.
        */
-      if (!anv_entrypoint_is_enabled(i, instance->apiVersion,
+      if (!anv_entrypoint_is_enabled(i, instance->app_info.api_version,
                                      &instance->enabled_extensions, NULL)) {
          instance->dispatch.entrypoints[i] = NULL;
       } else if (anv_dispatch_table.entrypoints[i] != NULL) {
@@ -577,6 +656,9 @@
       return vk_error(result);
    }
 
+   instance->pipeline_cache_enabled =
+      env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
+
    _mesa_locale_init();
 
    VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
@@ -601,6 +683,9 @@
       anv_physical_device_finish(&instance->physicalDevice);
    }
 
+   vk_free(&instance->alloc, instance->app_info.app_name);
+   vk_free(&instance->alloc, instance->app_info.engine_name);
+
    VG(VALGRIND_DESTROY_MEMPOOL(instance));
 
    vk_debug_report_instance_destroy(&instance->debug_report_callbacks);
@@ -623,27 +708,28 @@
    if (!dir) {
       printf("Error opening %s\n", DEV_GPU);
       return VK_ERROR_INCOMPATIBLE_DRIVER;
-    }
+   }
 
-    while ((de = readdir(dir)) != NULL) {
-        // extra +1 ensures space for null termination
-        char name[sizeof(DEV_GPU) + sizeof('/') + (NAME_MAX + 1) + 1];
-        snprintf(name, sizeof(name), "%s/%s", DEV_GPU, de->d_name);
+   while ((de = readdir(dir)) != NULL) {
+      // extra +1 ensures space for null termination
+      char name[sizeof(DEV_GPU) + sizeof('/') + (NAME_MAX + 1) + 1];
+      snprintf(name, sizeof(name), "%s/%s", DEV_GPU, de->d_name);
 
-        struct stat path_stat;
-        stat(name, &path_stat);
-        if (!S_ISDIR(path_stat.st_mode)) {
-            result = anv_physical_device_init(&instance->physicalDevice, instance, name);
-            if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
-               break;
-        }
-    }
-    closedir(dir);
+      struct stat path_stat;
+      stat(name, &path_stat);
+      if (!S_ISDIR(path_stat.st_mode)) {
+         result = anv_physical_device_init(&instance->physicalDevice, instance, 
+            name, name);
+         if (result != VK_ERROR_INCOMPATIBLE_DRIVER)
+            break;
+      }
+   }
+   closedir(dir);
 
-    if (result == VK_SUCCESS)
-       instance->physicalDeviceCount = 1;
+   if (result == VK_SUCCESS)
+      instance->physicalDeviceCount = 1;
 
-    return result;
+   return result;
 }
 
 static VkResult
@@ -760,11 +846,13 @@
       .shaderStorageImageArrayDynamicIndexing   = true,
       .shaderClipDistance                       = true,
       .shaderCullDistance                       = true,
-      .shaderFloat64                            = pdevice->info.gen >= 8,
-      .shaderInt64                              = pdevice->info.gen >= 8,
-      .shaderInt16                              = false,
+      .shaderFloat64                            = pdevice->info.gen >= 8 &&
+                                                  pdevice->info.has_64bit_types,
+      .shaderInt64                              = pdevice->info.gen >= 8 &&
+                                                  pdevice->info.has_64bit_types,
+      .shaderInt16                              = pdevice->info.gen >= 8,
       .shaderResourceMinLod                     = false,
-      .variableMultisampleRate                  = false,
+      .variableMultisampleRate                  = true,
       .inheritedQueries                         = true,
    };
 
@@ -772,6 +860,15 @@
    pFeatures->vertexPipelineStoresAndAtomics =
       pdevice->compiler->scalar_stage[MESA_SHADER_VERTEX] &&
       pdevice->compiler->scalar_stage[MESA_SHADER_GEOMETRY];
+
+   struct anv_app_info *app_info = &pdevice->instance->app_info;
+
+   /* The new DOOM and Wolfenstein games require depthBounds without
+    * checking for it.  They seem to run fine without it so just claim it's
+    * there and accept the consequences.
+    */
+   if (app_info->engine_name && strcmp(app_info->engine_name, "idTech") == 0)
+      pFeatures->depthBounds = true;
 }
 
 void anv_GetPhysicalDeviceFeatures2(
@@ -829,6 +926,25 @@
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR: {
+         VkPhysicalDevice8BitStorageFeaturesKHR *features =
+            (VkPhysicalDevice8BitStorageFeaturesKHR *)ext;
+         ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+         features->storageBuffer8BitAccess = pdevice->info.gen >= 8;
+         features->uniformAndStorageBuffer8BitAccess = pdevice->info.gen >= 8;
+         features->storagePushConstant8 = pdevice->info.gen >= 8;
+         break;
+      }
+
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_FEATURES_EXT: {
+         VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *features =
+            (VkPhysicalDeviceVertexAttributeDivisorFeaturesEXT *)ext;
+         features->vertexAttributeInstanceRateDivisor = VK_TRUE;
+         features->vertexAttributeInstanceRateZeroDivisor = VK_TRUE;
+         break;
+      }
+
       default:
          anv_debug_ignored_stype(ext->sType);
          break;
@@ -901,7 +1017,7 @@
       .maxGeometryOutputComponents              = 128,
       .maxGeometryOutputVertices                = 256,
       .maxGeometryTotalOutputComponents         = 1024,
-      .maxFragmentInputComponents               = 128,
+      .maxFragmentInputComponents               = 112, /* 128 components - (POS, PSIZ, CLIP_DIST0, CLIP_DIST1) */
       .maxFragmentOutputAttachments             = 8,
       .maxFragmentDualSrcAttachments            = 1,
       .maxFragmentCombinedOutputResources       = 8,
@@ -1062,6 +1178,21 @@
          break;
       }
 
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VERTEX_ATTRIBUTE_DIVISOR_PROPERTIES_EXT: {
+         VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *props =
+            (VkPhysicalDeviceVertexAttributeDivisorPropertiesEXT *)ext;
+         /* We have to restrict this a bit for multiview */
+         props->maxVertexAttribDivisor = UINT32_MAX / 16;
+         break;
+      }
+
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROTECTED_MEMORY_PROPERTIES: {
+         VkPhysicalDeviceProtectedMemoryProperties *props =
+            (VkPhysicalDeviceProtectedMemoryProperties *)ext;
+         props->protectedNoFault = false;
+         break;
+      }
+
       default:
          anv_debug_ignored_stype(ext->sType);
          break;
@@ -1271,7 +1402,6 @@
 {
    queue->_loader_data.loaderMagic = ICD_LOADER_MAGIC;
    queue->device = device;
-   queue->pool = &device->surface_state_pool;
    queue->flags = 0;
 }
 
@@ -1327,6 +1457,11 @@
    if (device->instance->physicalDevice.has_exec_async)
       device->trivial_batch_bo.flags |= EXEC_OBJECT_ASYNC;
 
+   if (device->instance->physicalDevice.use_softpin)
+      device->trivial_batch_bo.flags |= EXEC_OBJECT_PINNED;
+
+   anv_vma_alloc(device, &device->trivial_batch_bo);
+
    void *map = anv_gem_mmap(device, device->trivial_batch_bo.gem_handle,
                             0, 4096, 0);
 
@@ -1397,7 +1532,7 @@
       /* Vulkan requires that entrypoints for extensions which have not been
        * enabled must not be advertised.
        */
-      if (!anv_entrypoint_is_enabled(i, device->instance->apiVersion,
+      if (!anv_entrypoint_is_enabled(i, device->instance->app_info.api_version,
                                      &device->instance->enabled_extensions,
                                      &device->enabled_extensions)) {
          device->dispatch.entrypoints[i] = NULL;
@@ -1536,6 +1671,27 @@
       goto fail_fd;
    }
 
+   if (physical_device->use_softpin) {
+      if (pthread_mutex_init(&device->vma_mutex, NULL) != 0) {
+         result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
+         goto fail_fd;
+      }
+
+      /* keep the page with address zero out of the allocator */
+      util_vma_heap_init(&device->vma_lo, LOW_HEAP_MIN_ADDRESS, LOW_HEAP_SIZE);
+      device->vma_lo_available =
+         physical_device->memory.heaps[physical_device->memory.heap_count - 1].size;
+
+      /* Leave the last 4GiB out of the high vma range, so that no state base
+       * address + size can overflow 48 bits. For more information see the
+       * comment about Wa32bitGeneralStateOffset in anv_allocator.c
+       */
+      util_vma_heap_init(&device->vma_hi, HIGH_HEAP_MIN_ADDRESS,
+                         HIGH_HEAP_SIZE);
+      device->vma_hi_available = physical_device->memory.heap_count == 1 ? 0 :
+         physical_device->memory.heaps[0].size;
+   }
+
    /* As per spec, the driver implementation may deny requests to acquire
     * a priority above the default priority (MEDIUM) if the caller does not
     * have sufficient privileges. In this scenario VK_ERROR_NOT_PERMITTED_EXT
@@ -1592,7 +1748,8 @@
    uint64_t bo_flags =
       (physical_device->supports_48bit_addresses ? EXEC_OBJECT_SUPPORTS_48B_ADDRESS : 0) |
       (physical_device->has_exec_async ? EXEC_OBJECT_ASYNC : 0) |
-      (physical_device->has_exec_capture ? EXEC_OBJECT_CAPTURE : 0);
+      (physical_device->has_exec_capture ? EXEC_OBJECT_CAPTURE : 0) |
+      (physical_device->use_softpin ? EXEC_OBJECT_PINNED : 0);
 
    anv_bo_pool_init(&device->batch_bo_pool, device, bo_flags);
 
@@ -1600,28 +1757,48 @@
    if (result != VK_SUCCESS)
       goto fail_batch_bo_pool;
 
-   /* For the state pools we explicitly disable 48bit. */
-   bo_flags = (physical_device->has_exec_async ? EXEC_OBJECT_ASYNC : 0) |
-              (physical_device->has_exec_capture ? EXEC_OBJECT_CAPTURE : 0);
+   if (!physical_device->use_softpin)
+      bo_flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
 
-   result = anv_state_pool_init(&device->dynamic_state_pool, device, 16384,
+   result = anv_state_pool_init(&device->dynamic_state_pool, device,
+                                DYNAMIC_STATE_POOL_MIN_ADDRESS,
+                                16384,
                                 bo_flags);
    if (result != VK_SUCCESS)
       goto fail_bo_cache;
 
-   result = anv_state_pool_init(&device->instruction_state_pool, device, 16384,
+   result = anv_state_pool_init(&device->instruction_state_pool, device,
+                                INSTRUCTION_STATE_POOL_MIN_ADDRESS,
+                                16384,
                                 bo_flags);
    if (result != VK_SUCCESS)
       goto fail_dynamic_state_pool;
 
-   result = anv_state_pool_init(&device->surface_state_pool, device, 4096,
+   result = anv_state_pool_init(&device->surface_state_pool, device,
+                                SURFACE_STATE_POOL_MIN_ADDRESS,
+                                4096,
                                 bo_flags);
    if (result != VK_SUCCESS)
       goto fail_instruction_state_pool;
 
+   if (physical_device->use_softpin) {
+      result = anv_state_pool_init(&device->binding_table_pool, device,
+                                   BINDING_TABLE_POOL_MIN_ADDRESS,
+                                   4096,
+                                   bo_flags);
+      if (result != VK_SUCCESS)
+         goto fail_surface_state_pool;
+   }
+
    result = anv_bo_init_new(&device->workaround_bo, device, 1024);
    if (result != VK_SUCCESS)
-      goto fail_surface_state_pool;
+      goto fail_binding_table_pool;
+
+   if (physical_device->use_softpin)
+      device->workaround_bo.flags |= EXEC_OBJECT_PINNED;
+
+   if (!anv_vma_alloc(device, &device->workaround_bo))
+      goto fail_workaround_bo;
 
    anv_device_init_trivial_batch(device);
 
@@ -1659,6 +1836,8 @@
    if (result != VK_SUCCESS)
       goto fail_workaround_bo;
 
+   anv_pipeline_cache_init(&device->default_pipeline_cache, device, true);
+
    anv_device_init_blorp(device);
 
    anv_device_init_border_colors(device);
@@ -1672,6 +1851,9 @@
    anv_scratch_pool_finish(device, &device->scratch_pool);
    anv_gem_munmap(device, device->workaround_bo.gem_handle, device->workaround_bo.map, device->workaround_bo.size);
    anv_gem_close(device, device->workaround_bo.gem_handle);
+ fail_binding_table_pool:
+   if (physical_device->use_softpin)
+      anv_state_pool_finish(&device->binding_table_pool);
  fail_surface_state_pool:
    anv_state_pool_finish(&device->surface_state_pool);
  fail_instruction_state_pool:
@@ -1700,12 +1882,17 @@
     const VkAllocationCallbacks*                pAllocator)
 {
    ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_physical_device *physical_device;
 
    if (!device)
       return;
 
+   physical_device = &device->instance->physicalDevice;
+
    anv_device_finish_blorp(device);
 
+   anv_pipeline_cache_finish(&device->default_pipeline_cache);
+
    anv_queue_finish(&device->queue);
 
 #ifdef HAVE_VALGRIND
@@ -1718,12 +1905,16 @@
    anv_scratch_pool_finish(device, &device->scratch_pool);
 
    anv_gem_munmap(device, device->workaround_bo.gem_handle, device->workaround_bo.map, device->workaround_bo.size);
+   anv_vma_free(device, &device->workaround_bo);
    anv_gem_close(device, device->workaround_bo.gem_handle);
 
+   anv_vma_free(device, &device->trivial_batch_bo);
    anv_gem_close(device, device->trivial_batch_bo.gem_handle);
    if (device->info.gen >= 10)
       anv_gem_close(device, device->hiz_clear_bo.gem_handle);
 
+   if (physical_device->use_softpin)
+      anv_state_pool_finish(&device->binding_table_pool);
    anv_state_pool_finish(&device->surface_state_pool);
    anv_state_pool_finish(&device->instruction_state_pool);
    anv_state_pool_finish(&device->dynamic_state_pool);
@@ -1897,6 +2088,66 @@
    return anv_device_submit_simple_batch(device, &batch);
 }
 
+bool
+anv_vma_alloc(struct anv_device *device, struct anv_bo *bo)
+{ /* Picks a GPU address for a pinned BO and stores it in bo->offset; returns false if neither heap yields one. */
+   if (!(bo->flags & EXEC_OBJECT_PINNED))
+      return true; /* non-pinned BOs are left untouched and reported as success */
+
+   pthread_mutex_lock(&device->vma_mutex); /* guards both heaps and the *_available counters */
+
+   bo->offset = 0; /* 0 doubles as the not-yet-placed marker tested below */
+
+   if (bo->flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS &&
+       device->vma_hi_available >= bo->size) { /* 48-bit capable BOs try the high heap first */
+      uint64_t addr = util_vma_heap_alloc(&device->vma_hi, bo->size, 4096);
+      if (addr) {
+         bo->offset = gen_canonical_address(addr);
+         assert(addr == gen_48b_address(bo->offset)); /* the two conversions must round-trip */
+         device->vma_hi_available -= bo->size;
+      }
+   }
+
+   if (bo->offset == 0 && device->vma_lo_available >= bo->size) { /* fall back to (or start with) the low heap */
+      uint64_t addr = util_vma_heap_alloc(&device->vma_lo, bo->size, 4096);
+      if (addr) {
+         bo->offset = gen_canonical_address(addr);
+         assert(addr == gen_48b_address(bo->offset)); /* the two conversions must round-trip */
+         device->vma_lo_available -= bo->size;
+      }
+   }
+
+   pthread_mutex_unlock(&device->vma_mutex);
+
+   return bo->offset != 0; /* still 0 means no address was obtained */
+}
+
+void
+anv_vma_free(struct anv_device *device, struct anv_bo *bo)
+{ /* Gives a pinned BO's address range back to the heap containing it and resets bo->offset to 0. */
+   if (!(bo->flags & EXEC_OBJECT_PINNED))
+      return; /* anv_vma_alloc() never assigns an address to non-pinned BOs */
+
+   const uint64_t addr_48b = gen_48b_address(bo->offset); /* presumably undoes gen_canonical_address() -- TODO confirm */
+
+   pthread_mutex_lock(&device->vma_mutex); /* same lock anv_vma_alloc() takes */
+
+   if (addr_48b >= LOW_HEAP_MIN_ADDRESS &&
+       addr_48b <= LOW_HEAP_MAX_ADDRESS) { /* the address selects the heap; no per-BO heap tag is kept */
+      util_vma_heap_free(&device->vma_lo, addr_48b, bo->size);
+      device->vma_lo_available += bo->size;
+   } else { /* outside the low range: must lie in the high heap (asserted) */
+      assert(addr_48b >= HIGH_HEAP_MIN_ADDRESS &&
+             addr_48b <= HIGH_HEAP_MAX_ADDRESS);
+      util_vma_heap_free(&device->vma_hi, addr_48b, bo->size);
+      device->vma_hi_available += bo->size;
+   }
+
+   pthread_mutex_unlock(&device->vma_mutex);
+
+   bo->offset = 0; /* matches the not-yet-placed value anv_vma_alloc() starts from */
+}
+
 VkResult
 anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size)
 {
@@ -1940,6 +2191,27 @@
    mem->map = NULL;
    mem->map_size = 0;
 
+   uint64_t bo_flags = 0;
+
+   assert(mem->type->heapIndex < pdevice->memory.heap_count);
+   if (pdevice->memory.heaps[mem->type->heapIndex].supports_48bit_addresses)
+      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+   const struct wsi_memory_allocate_info *wsi_info =
+      vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA);
+   if (wsi_info && wsi_info->implicit_sync) {
+      /* We need to set the WRITE flag on window system buffers so that GEM
+       * will know we're writing to them and synchronize uses on other rings
+       * (eg if the display server uses the blitter ring).
+       */
+      bo_flags |= EXEC_OBJECT_WRITE;
+   } else if (pdevice->has_exec_async) {
+      bo_flags |= EXEC_OBJECT_ASYNC;
+   }
+
+   if (pdevice->use_softpin)
+      bo_flags |= EXEC_OBJECT_PINNED;
+
    const VkImportMemoryFdInfoKHR *fd_info =
       vk_find_struct_const(pAllocateInfo->pNext, IMPORT_MEMORY_FD_INFO_KHR);
 
@@ -1958,8 +2230,8 @@
              fd_info->handleType ==
                VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);
 
-      result = anv_bo_cache_import(device, &device->bo_cache,
-                                   fd_info->fd, &mem->bo);
+      result = anv_bo_cache_import(device, &device->bo_cache, fd_info->fd,
+                                   bo_flags | ANV_BO_EXTERNAL, &mem->bo);
       if (result != VK_SUCCESS)
          goto fail;
 
@@ -2010,7 +2282,8 @@
       // then clients will be able to import a buffer more than once.
       anv_buffer_handle_t buffer;
       uint64_t import_size;
-      int status = anv_gem_import_fuchsia_buffer(device, fuchsia_info->handle, &buffer, &import_size);
+      int status = anv_gem_import_fuchsia_buffer(
+         device, fuchsia_info->handle, &buffer, &import_size);
       if (status != 0)
          return vk_error(VK_ERROR_INVALID_EXTERNAL_HANDLE_KHR);
       if (import_size < aligned_alloc_size) {
@@ -2025,14 +2298,20 @@
       }
 
       VkResult result = anv_bo_cache_import_buffer_handle(
-          device, &device->bo_cache, buffer, aligned_alloc_size, &mem->bo);
+          device, &device->bo_cache, buffer, bo_flags | ANV_BO_EXTERNAL,
+          aligned_alloc_size, &mem->bo);
       if (result != VK_SUCCESS)
          goto fail;
 #endif // VK_USE_PLATFORM_MAGMA_KHR
 
    } else {
+      const VkExportMemoryAllocateInfoKHR *fd_info =
+         vk_find_struct_const(pAllocateInfo->pNext, EXPORT_MEMORY_ALLOCATE_INFO_KHR);
+      if (fd_info && fd_info->handleTypes)
+         bo_flags |= ANV_BO_EXTERNAL;
+
       result = anv_bo_cache_alloc(device, &device->bo_cache,
-                                  pAllocateInfo->allocationSize,
+                                  pAllocateInfo->allocationSize, bo_flags,
                                   &mem->bo);
       if (result != VK_SUCCESS)
          goto fail;
@@ -2061,22 +2340,6 @@
       }
    }
 
-   assert(mem->type->heapIndex < pdevice->memory.heap_count);
-   if (pdevice->memory.heaps[mem->type->heapIndex].supports_48bit_addresses)
-      mem->bo->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
-
-   const struct wsi_memory_allocate_info *wsi_info =
-      vk_find_struct_const(pAllocateInfo->pNext, WSI_MEMORY_ALLOCATE_INFO_MESA);
-   if (wsi_info && wsi_info->implicit_sync) {
-      /* We need to set the WRITE flag on window system buffers so that GEM
-       * will know we're writing to them and synchronize uses on other rings
-       * (eg if the display server uses the blitter ring).
-       */
-      mem->bo->flags |= EXEC_OBJECT_WRITE;
-   } else if (pdevice->has_exec_async) {
-      mem->bo->flags |= EXEC_OBJECT_ASYNC;
-   }
-
    *pMem = anv_device_memory_to_handle(mem);
 
    return VK_SUCCESS;
@@ -2483,11 +2746,12 @@
 
    if (mem) {
       assert((buffer->usage & mem->type->valid_buffer_usage) == buffer->usage);
-      buffer->bo = mem->bo;
-      buffer->offset = pBindInfo->memoryOffset;
+      buffer->address = (struct anv_address) {
+         .bo = mem->bo,
+         .offset = pBindInfo->memoryOffset,
+      };
    } else {
-      buffer->bo = NULL;
-      buffer->offset = 0;
+      buffer->address = ANV_NULL_ADDRESS;
    }
 }
 
@@ -2653,8 +2917,7 @@
 
    buffer->size = pCreateInfo->size;
    buffer->usage = pCreateInfo->usage;
-   buffer->bo = NULL;
-   buffer->offset = 0;
+   buffer->address = ANV_NULL_ADDRESS;
 
    *pBuffer = anv_buffer_to_handle(buffer);
 
@@ -2678,10 +2941,11 @@
 void
 anv_fill_buffer_surface_state(struct anv_device *device, struct anv_state state,
                               enum isl_format format,
-                              uint32_t offset, uint32_t range, uint32_t stride)
+                              struct anv_address address,
+                              uint32_t range, uint32_t stride)
 {
    isl_buffer_fill_state(&device->isl_dev, state.map,
-                         .address = offset,
+                         .address = anv_address_physical(address),
                          .mocs = device->default_mocs,
                          .size = range,
                          .format = format,
diff --git a/src/intel/vulkan/anv_entrypoints_gen.py b/src/intel/vulkan/anv_entrypoints_gen.py
index a6c5d12..2471636 100755
--- a/src/intel/vulkan/anv_entrypoints_gen.py
+++ b/src/intel/vulkan/anv_entrypoints_gen.py
@@ -24,7 +24,6 @@
 #
 
 import argparse
-import functools
 import math
 import os
 import xml.etree.cElementTree as et
@@ -36,7 +35,7 @@
 from collections import OrderedDict, namedtuple
 from mako.template import Template
 
-from anv_extensions import *
+from anv_extensions import VkVersion, MAX_API_VERSION, EXTENSIONS
 
 # We generate a static hash table for entry point lookup
 # (vkGetProcAddress). We use a linear congruential generator for our hash
@@ -150,7 +149,7 @@
 /* Hash table stats:
  * size ${len(strmap.sorted_strings)} entries
  * collisions entries:
-% for i in xrange(10):
+% for i in range(10):
  *     ${i}${'+' if i == 9 else ' '}     ${strmap.collisions[i]}
 % endfor
  */
@@ -500,9 +499,6 @@
         if ext_name not in supported_exts:
             continue
 
-        if extension.attrib['supported'] != 'vulkan':
-            continue
-
         ext = supported_exts[ext_name]
         ext.type = extension.attrib['type']
 
@@ -512,7 +508,7 @@
             assert e.core_version is None
             e.extensions.append(ext)
 
-    return [e for e in entrypoints.itervalues() if e.enabled]
+    return [e for e in entrypoints.values() if e.enabled]
 
 
 def get_entrypoints_defines(doc):
@@ -521,7 +517,10 @@
 
     for extension in doc.findall('./extensions/extension[@platform]'):
         platform = extension.attrib['platform']
-        define = 'VK_USE_PLATFORM_' + platform.upper() + '_KHR'
+        ext = '_KHR'
+        if platform.upper() == 'XLIB_XRANDR':
+            ext = '_EXT'
+        define = 'VK_USE_PLATFORM_' + platform.upper() + ext
         if 'protect' in extension.attrib:
           define = extension.attrib['protect']
 
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index 96134d2..4d56497 100755
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -36,11 +36,11 @@
 
 def _bool_to_c_expr(b):
     if b is True:
-        return 'true';
+        return 'true'
     elif b is False:
-        return 'false';
+        return 'false'
     else:
-        return b;
+        return b
 
 class Extension:
     def __init__(self, name, ext_version, enable):
@@ -49,20 +49,22 @@
         self.enable = _bool_to_c_expr(enable)
 
 class ApiVersion:
-    def __init__(self, max_patch_version, enable):
-        self.max_patch_version = max_patch_version
+    def __init__(self, version, enable):
+        self.version = version
         self.enable = _bool_to_c_expr(enable)
 
+API_PATCH_VERSION = 80
+
 # Supported API versions.  Each one is the maximum patch version for the given
 # version.  Version come in increasing order and each version is available if
 # it's provided "enable" condition is true and all previous versions are
 # available.
 API_VERSIONS = [
-    ApiVersion('1.0.57',    True),
+    ApiVersion('1.0',   True),
 
     # DRM_IOCTL_SYNCOBJ_WAIT is required for VK_KHR_external_fence which is a
     # required core feature in Vulkan 1.1
-    ApiVersion('1.1.0',     'device->has_syncobj_wait'),
+    ApiVersion('1.1',   'device->has_syncobj_wait'),
 ]
 
 MAX_API_VERSION = None # Computed later
@@ -75,7 +77,9 @@
 EXTENSIONS = [
     Extension('VK_ANDROID_native_buffer',                 5, 'ANDROID'),
     Extension('VK_KHR_16bit_storage',                     1, 'device->info.gen >= 8'),
+    Extension('VK_KHR_8bit_storage',                      1, 'device->info.gen >= 8'),
     Extension('VK_KHR_bind_memory2',                      1, True),
+    Extension('VK_KHR_create_renderpass2',                1, True),
     Extension('VK_KHR_dedicated_allocation',              1, True),
     Extension('VK_KHR_descriptor_update_template',        1, True),
     Extension('VK_KHR_device_group',                      1, True),
@@ -90,6 +94,7 @@
     Extension('VK_KHR_external_semaphore',                1, True),
     Extension('VK_KHR_external_semaphore_capabilities',   1, True),
     Extension('VK_KHR_external_semaphore_fd',             1, False),
+    Extension('VK_KHR_get_display_properties2',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_KHR_get_memory_requirements2',          1, True),
     Extension('VK_KHR_get_physical_device_properties2',   1, True),
     Extension('VK_KHR_get_surface_capabilities2',         1, 'ANV_HAS_SURFACE'),
@@ -111,10 +116,19 @@
     Extension('VK_KHR_xcb_surface',                       6, 'VK_USE_PLATFORM_XCB_KHR'),
     Extension('VK_KHR_xlib_surface',                      6, 'VK_USE_PLATFORM_XLIB_KHR'),
     Extension('VK_KHR_multiview',                         1, True),
+    Extension('VK_KHR_display',                          23, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_acquire_xlib_display',              1, 'VK_USE_PLATFORM_XLIB_XRANDR_EXT'),
     Extension('VK_EXT_debug_report',                      8, True),
+    Extension('VK_EXT_direct_mode_display',               1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_display_control',                   1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
+    Extension('VK_EXT_display_surface_counter',           1, 'VK_USE_PLATFORM_DISPLAY_KHR'),
     Extension('VK_EXT_external_memory_dma_buf',           1, True),
     Extension('VK_EXT_global_priority',                   1,
               'device->has_context_priority'),
+    Extension('VK_EXT_shader_viewport_index_layer',       1, True),
+    Extension('VK_EXT_shader_stencil_export',             1, 'device->info.gen >= 9'),
+    Extension('VK_EXT_vertex_attribute_divisor',          3, True),
+    Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
     Extension('VK_KHR_external_memory_fuchsia',           1, 'VK_USE_PLATFORM_MAGMA_KHR'),
     Extension('VK_KHR_external_semaphore_fuchsia',        1, 'VK_USE_PLATFORM_MAGMA_KHR'),
     Extension('VK_GOOGLE_image_usage_scanout',            1, 'VK_USE_PLATFORM_MAGMA_KHR'),
@@ -165,6 +179,7 @@
 
 MAX_API_VERSION = VkVersion('0.0.0')
 for version in API_VERSIONS:
-    version.max_patch_version = VkVersion(version.max_patch_version)
-    assert version.max_patch_version > MAX_API_VERSION
-    MAX_API_VERSION = version.max_patch_version
+    version.version = VkVersion(version.version)
+    version.version.patch = API_PATCH_VERSION
+    assert version.version > MAX_API_VERSION
+    MAX_API_VERSION = version.version
diff --git a/src/intel/vulkan/anv_extensions_gen.py b/src/intel/vulkan/anv_extensions_gen.py
index 3c8cd8a..454eedf 100755
--- a/src/intel/vulkan/anv_extensions_gen.py
+++ b/src/intel/vulkan/anv_extensions_gen.py
@@ -108,12 +108,12 @@
 #include "vk_util.h"
 
 /* Convert the VK_USE_PLATFORM_* defines to booleans */
-%for platform in ['ANDROID', 'WAYLAND', 'XCB', 'XLIB']:
-#ifdef VK_USE_PLATFORM_${platform}_KHR
-#   undef VK_USE_PLATFORM_${platform}_KHR
-#   define VK_USE_PLATFORM_${platform}_KHR true
+%for platform in ['ANDROID_KHR', 'WAYLAND_KHR', 'XCB_KHR', 'XLIB_KHR', 'DISPLAY_KHR', 'XLIB_XRANDR_EXT']:
+#ifdef VK_USE_PLATFORM_${platform}
+#   undef VK_USE_PLATFORM_${platform}
+#   define VK_USE_PLATFORM_${platform} true
 #else
-#   define VK_USE_PLATFORM_${platform}_KHR false
+#   define VK_USE_PLATFORM_${platform} false
 #endif
 %endfor
 
@@ -127,7 +127,8 @@
 
 #define ANV_HAS_SURFACE (VK_USE_PLATFORM_WAYLAND_KHR || \\
                          VK_USE_PLATFORM_XCB_KHR || \\
-                         VK_USE_PLATFORM_XLIB_KHR)
+                         VK_USE_PLATFORM_XLIB_KHR || \\
+                         VK_USE_PLATFORM_DISPLAY_KHR)
 
 static const uint32_t MAX_API_VERSION = ${MAX_API_VERSION.c_vk_version()};
 
@@ -162,7 +163,7 @@
 %for version in API_VERSIONS:
     if (!(${version.enable}))
         return version;
-    version = ${version.max_patch_version.c_vk_version()};
+    version = ${version.version.c_vk_version()};
 
 %endfor
     return version;
diff --git a/src/intel/vulkan/anv_formats.c b/src/intel/vulkan/anv_formats.c
index 8337290..9501e2f 100644
--- a/src/intel/vulkan/anv_formats.c
+++ b/src/intel/vulkan/anv_formats.c
@@ -519,8 +519,7 @@
       return 0;
 
    if (isl_format_supports_sampling(devinfo, plane_format.isl_format)) {
-      flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT |
-               VK_FORMAT_FEATURE_BLIT_SRC_BIT;
+      flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT;
 
       if (isl_format_supports_filtering(devinfo, plane_format.isl_format))
          flags |= VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT;
@@ -532,8 +531,7 @@
     */
    if (isl_format_supports_rendering(devinfo, plane_format.isl_format) &&
        plane_format.swizzle.a == ISL_CHANNEL_SELECT_ALPHA) {
-      flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT |
-               VK_FORMAT_FEATURE_BLIT_DST_BIT;
+      flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT;
 
       if (isl_format_supports_alpha_blending(devinfo, plane_format.isl_format))
          flags |= VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT;
@@ -550,7 +548,9 @@
       flags |= VK_FORMAT_FEATURE_STORAGE_IMAGE_ATOMIC_BIT;
 
    if (flags) {
-      flags |= VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
+      flags |= VK_FORMAT_FEATURE_BLIT_SRC_BIT |
+               VK_FORMAT_FEATURE_BLIT_DST_BIT |
+               VK_FORMAT_FEATURE_TRANSFER_SRC_BIT |
                VK_FORMAT_FEATURE_TRANSFER_DST_BIT;
    }
 
@@ -724,7 +724,8 @@
                                          &pFormatProperties->formatProperties);
 
    vk_foreach_struct(ext, pFormatProperties->pNext) {
-      switch (ext->sType) {
+      /* Use unsigned since some cases are not in the VkStructureType enum. */
+      switch ((unsigned)ext->sType) {
       case VK_STRUCTURE_TYPE_WSI_FORMAT_MODIFIER_PROPERTIES_LIST_MESA:
          get_wsi_format_modifier_properties_list(physical_device, format,
                                                  (void *)ext);
diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c
index 2a8f8b1..c43b5ef 100644
--- a/src/intel/vulkan/anv_gem.c
+++ b/src/intel/vulkan/anv_gem.c
@@ -377,24 +377,6 @@
    return 0;
 }
 
-bool
-anv_gem_supports_48b_addresses(int fd)
-{
-   struct drm_i915_gem_exec_object2 obj = {
-      .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
-   };
-
-   struct drm_i915_gem_execbuffer2 execbuf = {
-      .buffers_ptr = (uintptr_t)&obj,
-      .buffer_count = 1,
-      .rsvd1 = 0xffffffu,
-   };
-
-   int ret = anv_ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
-
-   return ret == -1 && errno == ENOENT;
-}
-
 int
 anv_gem_gpu_get_reset_stats(struct anv_device *device,
                             uint32_t *active, uint32_t *pending)
@@ -462,12 +444,11 @@
 int
 anv_gem_sync_file_merge(struct anv_device *device, int fd1, int fd2)
 {
-   const char name[] = "anv merge fence";
    struct sync_merge_data args = {
+      .name = "anv merge fence",
       .fd2 = fd2,
       .fence = -1,
    };
-   memcpy(args.name, name, sizeof(name));
 
    int ret = anv_ioctl(fd1, SYNC_IOC_MERGE, &args);
    if (ret == -1)
diff --git a/src/intel/vulkan/anv_gem_stubs.c b/src/intel/vulkan/anv_gem_stubs.c
index 0f4a3f5..5093bd5 100644
--- a/src/intel/vulkan/anv_gem_stubs.c
+++ b/src/intel/vulkan/anv_gem_stubs.c
@@ -170,12 +170,6 @@
    unreachable("Unused");
 }
 
-bool
-anv_gem_supports_48b_addresses(int fd)
-{
-   unreachable("Unused");
-}
-
 int
 anv_gem_gpu_get_reset_stats(struct anv_device *device,
                             uint32_t *active, uint32_t *pending)
diff --git a/src/intel/vulkan/anv_icd.py b/src/intel/vulkan/anv_icd.py
index 31bb068..73cc645 100644
--- a/src/intel/vulkan/anv_icd.py
+++ b/src/intel/vulkan/anv_icd.py
@@ -22,8 +22,9 @@
 
 import json
 import os.path
+import argparse
 
-from anv_extensions import *
+from anv_extensions import MAX_API_VERSION
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
@@ -44,4 +45,4 @@
     }
 
     with open(args.out, 'w') as f:
-        json.dump(json_data, f, indent = 4, sort_keys=True)
+        json.dump(json_data, f, indent=4, sort_keys=True, separators=(',', ': '))
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index b002df3..c63c72b 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -307,7 +307,7 @@
              VkImageAspectFlagBits aspect)
 {
    const VkImageCreateInfo *vk_info = anv_info->vk_info;
-   bool ok UNUSED;
+   bool ok;
 
    static const enum isl_surf_dim vk_to_isl_surf_dim[] = {
       [VK_IMAGE_TYPE_1D] = ISL_SURF_DIM_1D,
@@ -662,8 +662,9 @@
 
    for (uint32_t p = 0; p < image->n_planes; ++p) {
       if (image->planes[p].bo_is_owned) {
-         assert(image->planes[p].bo != NULL);
-         anv_bo_cache_release(device, &device->bo_cache, image->planes[p].bo);
+         assert(image->planes[p].address.bo != NULL);
+         anv_bo_cache_release(device, &device->bo_cache,
+                              image->planes[p].address.bo);
       }
    }
 
@@ -679,13 +680,14 @@
    assert(!image->planes[plane].bo_is_owned);
 
    if (!memory) {
-      image->planes[plane].bo = NULL;
-      image->planes[plane].bo_offset = 0;
+      image->planes[plane].address = ANV_NULL_ADDRESS;
       return;
    }
 
-   image->planes[plane].bo = memory->bo;
-   image->planes[plane].bo_offset = memory_offset;
+   image->planes[plane].address = (struct anv_address) {
+      .bo = memory->bo,
+      .offset = memory_offset,
+   };
 }
 
 VkResult anv_BindImageMemory(
@@ -766,17 +768,26 @@
 
    assert(__builtin_popcount(subresource->aspectMask) == 1);
 
-   /* If we are on a non-zero mip level or array slice, we need to
-    * calculate a real offset.
-    */
-   anv_assert(subresource->mipLevel == 0);
-   anv_assert(subresource->arrayLayer == 0);
-
    layout->offset = surface->offset;
    layout->rowPitch = surface->isl.row_pitch;
    layout->depthPitch = isl_surf_get_array_pitch(&surface->isl);
    layout->arrayPitch = isl_surf_get_array_pitch(&surface->isl);
-   layout->size = surface->isl.size;
+
+   if (subresource->mipLevel > 0 || subresource->arrayLayer > 0) {
+      assert(surface->isl.tiling == ISL_TILING_LINEAR);
+
+      uint32_t offset_B;
+      isl_surf_get_image_offset_B_tile_sa(&surface->isl,
+                                          subresource->mipLevel,
+                                          subresource->arrayLayer,
+                                          0 /* logical_z_offset_px */,
+                                          &offset_B, NULL, NULL);
+      layout->offset += offset_B;
+      layout->size = layout->rowPitch * anv_minify(image->extent.height,
+                                                   subresource->mipLevel);
+   } else {
+      layout->size = surface->isl.size;
+   }
 }
 
 /**
@@ -1071,20 +1082,8 @@
    if (!clear_color)
       clear_color = &default_clear_color;
 
-   const uint64_t address = image->planes[plane].bo_offset + surface->offset;
-   const uint64_t aux_address = aux_usage == ISL_AUX_USAGE_NONE ?
-      0 : (image->planes[plane].bo_offset + aux_surface->offset);
-
-   struct anv_address clear_address = { .bo = NULL };
-   state_inout->clear_address = 0;
-
-   if (device->info.gen >= 10 && aux_usage != ISL_AUX_USAGE_NONE) {
-      if (aux_usage == ISL_AUX_USAGE_HIZ) {
-         clear_address = (struct anv_address) { .bo = &device->hiz_clear_bo };
-      } else {
-         clear_address = anv_image_get_clear_color_addr(device, image, aspect);
-      }
-   }
+   const struct anv_address address =
+      anv_address_add(image->planes[plane].address, surface->offset);
 
    if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
        !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY) &&
@@ -1096,14 +1095,14 @@
        */
       assert(aux_usage == ISL_AUX_USAGE_NONE);
       isl_buffer_fill_state(&device->isl_dev, state_inout->state.map,
-                            .address = address,
+                            .address = anv_address_physical(address),
                             .size = surface->isl.size,
                             .format = ISL_FORMAT_RAW,
                             .stride = 1,
-                            .mocs = device->default_mocs);
+                            .mocs = anv_mocs_for_bo(device, address.bo));
       state_inout->address = address,
-      state_inout->aux_address = 0;
-      state_inout->clear_address = 0;
+      state_inout->aux_address = ANV_NULL_ADDRESS;
+      state_inout->clear_address = ANV_NULL_ADDRESS;
    } else {
       if (view_usage == ISL_SURF_USAGE_STORAGE_BIT &&
           !(flags & ANV_IMAGE_VIEW_STATE_STORAGE_WRITE_ONLY)) {
@@ -1169,23 +1168,44 @@
          }
       }
 
+      state_inout->address = anv_address_add(address, offset_B);
+
+      struct anv_address aux_address = ANV_NULL_ADDRESS;
+      if (aux_usage != ISL_AUX_USAGE_NONE) {
+         aux_address = anv_address_add(image->planes[plane].address,
+                                       aux_surface->offset);
+      }
+      state_inout->aux_address = aux_address;
+
+      struct anv_address clear_address = ANV_NULL_ADDRESS;
+      if (device->info.gen >= 10 && aux_usage != ISL_AUX_USAGE_NONE) {
+         if (aux_usage == ISL_AUX_USAGE_HIZ) {
+            clear_address = (struct anv_address) {
+               .bo = &device->hiz_clear_bo,
+               .offset = 0,
+            };
+         } else {
+            clear_address = anv_image_get_clear_color_addr(device, image, aspect);
+         }
+      }
+      state_inout->clear_address = clear_address;
+
       uint32_t mocs = (view_usage == ISL_SURF_USAGE_RENDER_TARGET_BIT) && 
          (image->usage & VK_IMAGE_USAGE_SCANOUT_BIT_GOOGLE) ? 
-         device->uncached_mocs : device->default_mocs;
+         device->uncached_mocs : anv_mocs_for_bo(device, state_inout->address.bo);
       isl_surf_fill_state(&device->isl_dev, state_inout->state.map,
                           .surf = isl_surf,
                           .view = &view,
-                          .address = address + offset_B,
+                          .address = anv_address_physical(state_inout->address),
                           .clear_color = *clear_color,
                           .aux_surf = &aux_surface->isl,
                           .aux_usage = aux_usage,
-                          .aux_address = aux_address,
-                          .clear_address = clear_address.offset,
-                          .use_clear_address = clear_address.bo != NULL,
+                          .aux_address = anv_address_physical(aux_address),
+                          .clear_address = anv_address_physical(clear_address),
+                          .use_clear_address = !anv_address_is_null(clear_address),
                           .mocs = mocs,
                           .x_offset_sa = tile_x_sa,
                           .y_offset_sa = tile_y_sa);
-      state_inout->address = address + offset_B;
 
       /* With the exception of gen8, the bottom 12 bits of the MCS base address
        * are used to store other information.  This should be ok, however,
@@ -1193,15 +1213,14 @@
        */
       uint32_t *aux_addr_dw = state_inout->state.map +
          device->isl_dev.ss.aux_addr_offset;
-      assert((aux_address & 0xfff) == 0);
-      assert(aux_address == (*aux_addr_dw & 0xfffff000));
-      state_inout->aux_address = *aux_addr_dw;
+      assert((aux_address.offset & 0xfff) == 0);
+      state_inout->aux_address.offset |= *aux_addr_dw & 0xfff;
 
       if (device->info.gen >= 10 && clear_address.bo) {
          uint32_t *clear_addr_dw = state_inout->state.map +
                                    device->isl_dev.ss.clear_color_state_offset;
          assert((clear_address.offset & 0x3f) == 0);
-         state_inout->clear_address = *clear_addr_dw;
+         state_inout->clear_address.offset |= *clear_addr_dw & 0x3f;
       }
    }
 
@@ -1463,18 +1482,18 @@
                                      VK_IMAGE_ASPECT_COLOR_BIT,
                                      VK_IMAGE_TILING_LINEAR);
    const uint32_t format_bs = isl_format_get_layout(view->format)->bpb / 8;
-   view->bo = buffer->bo;
-   view->offset = buffer->offset + pCreateInfo->offset;
    view->range = anv_buffer_get_range(buffer, pCreateInfo->offset,
                                               pCreateInfo->range);
    view->range = align_down_npot_u32(view->range, format_bs);
 
+   view->address = anv_address_add(buffer->address, pCreateInfo->offset);
+
    if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
       view->surface_state = alloc_surface_state(device);
 
       anv_fill_buffer_surface_state(device, view->surface_state,
                                     view->format,
-                                    view->offset, view->range, format_bs);
+                                    view->address, view->range, format_bs);
    } else {
       view->surface_state = (struct anv_state){ 0 };
    }
@@ -1491,14 +1510,14 @@
 
       anv_fill_buffer_surface_state(device, view->storage_surface_state,
                                     storage_format,
-                                    view->offset, view->range,
+                                    view->address, view->range,
                                     (storage_format == ISL_FORMAT_RAW ? 1 :
                                      isl_format_get_layout(storage_format)->bpb / 8));
 
       /* Write-only accesses should use the original format. */
       anv_fill_buffer_surface_state(device, view->writeonly_storage_surface_state,
                                     view->format,
-                                    view->offset, view->range,
+                                    view->address, view->range,
                                     isl_format_get_layout(view->format)->bpb / 8);
 
       isl_buffer_fill_image_param(&device->isl_dev,
diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c
index 976c833..ed1bc09 100644
--- a/src/intel/vulkan/anv_intel.c
+++ b/src/intel/vulkan/anv_intel.c
@@ -73,8 +73,14 @@
 
    image = anv_image_from_handle(image_h);
 
+   uint64_t bo_flags = ANV_BO_EXTERNAL;
+   if (device->instance->physicalDevice.supports_48bit_addresses)
+      bo_flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+   if (device->instance->physicalDevice.use_softpin)
+      bo_flags |= EXEC_OBJECT_PINNED;
+
    result = anv_bo_cache_import(device, &device->bo_cache,
-                                pCreateInfo->fd, &mem->bo);
+                                pCreateInfo->fd, bo_flags, &mem->bo);
    if (result != VK_SUCCESS)
       goto fail_import;
 
@@ -90,11 +96,10 @@
       goto fail_import;
    }
 
-   if (device->instance->physicalDevice.supports_48bit_addresses)
-      mem->bo->flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
-
-   image->planes[0].bo = mem->bo;
-   image->planes[0].bo_offset = 0;
+   image->planes[0].address = (struct anv_address) {
+      .bo = mem->bo,
+      .offset = 0,
+   };
 
    assert(image->extent.width > 0);
    assert(image->extent.height > 0);
diff --git a/src/intel/vulkan/anv_magma.cc b/src/intel/vulkan/anv_magma.cc
index 0f610ab..982f58b 100644
--- a/src/intel/vulkan/anv_magma.cc
+++ b/src/intel/vulkan/anv_magma.cc
@@ -449,9 +449,28 @@
    magma_reset_semaphore(fence);
 }
 
-int anv_gem_syncobj_wait(anv_device* device, anv_syncobj_handle_t* fences, uint32_t fence_count,
-                         int64_t abs_timeout_ns, bool wait_all, uint64_t timeout_ns)
+static uint64_t gettime_ns(void)
 {
+   /* CLOCK_MONOTONIC "now" in ns; typed constant instead of a local macro. */
+   const uint64_t nsec_per_sec = 1000000000ull;
+   struct timespec current;
+   clock_gettime(CLOCK_MONOTONIC, &current);
+   return (uint64_t)current.tv_sec * nsec_per_sec + current.tv_nsec;
+}
+
+static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
+{
+   /* Nanoseconds left until abs_timeout: 0 if it has already passed, and
+    * clamped to INT64_MAX so the signed result can never come out negative. */
+   uint64_t now = gettime_ns();
+   uint64_t rel = abs_timeout < now ? 0 : abs_timeout - now;
+   return rel > (uint64_t)INT64_MAX ? INT64_MAX : (int64_t)rel;
+}
+
+int anv_gem_syncobj_wait(anv_device* device, anv_syncobj_handle_t* fences, uint32_t fence_count,
+                         int64_t abs_timeout_ns, bool wait_all)
+{
+   int64_t timeout_ns = anv_get_relative_timeout(abs_timeout_ns);
    magma_status_t status =
        magma_wait_semaphores(fences, fence_count, magma::ns_to_ms(timeout_ns), wait_all);
    switch (status) {
diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index d5a08f7..c287a00 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -32,6 +32,8 @@
    struct anv_pipeline_layout *layout;
    bool add_bounds_checks;
 
+   bool uses_constants;
+   uint8_t constants_offset;
    struct {
       BITSET_WORD *used;
       uint8_t *surface_offsets;
@@ -54,6 +56,24 @@
 }
 
 static void
+add_deref_src_binding(struct apply_pipeline_layout_state *state, nir_src src)
+{
+   /* Hand the variable behind the deref carried by src to add_var_binding. */
+   add_var_binding(state, nir_deref_instr_get_variable(nir_src_as_deref(src)));
+}
+
+static void
+add_tex_src_binding(struct apply_pipeline_layout_state *state,
+                    nir_tex_instr *tex, nir_tex_src_type deref_src_type)
+{
+   /* A negative result from nir_tex_instr_src_index() is treated as "tex has
+    * no source of this type"; in that case no binding is recorded. */
+   const int src_idx = nir_tex_instr_src_index(tex, deref_src_type);
+   if (src_idx >= 0)
+      add_deref_src_binding(state, tex->src[src_idx].src);
+}
+
+static void
 get_used_bindings_block(nir_block *block,
                         struct apply_pipeline_layout_state *state)
 {
@@ -67,19 +87,23 @@
                         nir_intrinsic_binding(intrin));
             break;
 
-         case nir_intrinsic_image_var_load:
-         case nir_intrinsic_image_var_store:
-         case nir_intrinsic_image_var_atomic_add:
-         case nir_intrinsic_image_var_atomic_min:
-         case nir_intrinsic_image_var_atomic_max:
-         case nir_intrinsic_image_var_atomic_and:
-         case nir_intrinsic_image_var_atomic_or:
-         case nir_intrinsic_image_var_atomic_xor:
-         case nir_intrinsic_image_var_atomic_exchange:
-         case nir_intrinsic_image_var_atomic_comp_swap:
-         case nir_intrinsic_image_var_size:
-         case nir_intrinsic_image_var_samples:
-            add_var_binding(state, intrin->variables[0]->var);
+         case nir_intrinsic_image_deref_load:
+         case nir_intrinsic_image_deref_store:
+         case nir_intrinsic_image_deref_atomic_add:
+         case nir_intrinsic_image_deref_atomic_min:
+         case nir_intrinsic_image_deref_atomic_max:
+         case nir_intrinsic_image_deref_atomic_and:
+         case nir_intrinsic_image_deref_atomic_or:
+         case nir_intrinsic_image_deref_atomic_xor:
+         case nir_intrinsic_image_deref_atomic_exchange:
+         case nir_intrinsic_image_deref_atomic_comp_swap:
+         case nir_intrinsic_image_deref_size:
+         case nir_intrinsic_image_deref_samples:
+            add_deref_src_binding(state, intrin->src[0]);
+            break;
+
+         case nir_intrinsic_load_constant:
+            state->uses_constants = true;
             break;
 
          default:
@@ -89,10 +113,8 @@
       }
       case nir_instr_type_tex: {
          nir_tex_instr *tex = nir_instr_as_tex(instr);
-         assert(tex->texture);
-         add_var_binding(state, tex->texture->var);
-         if (tex->sampler)
-            add_var_binding(state, tex->sampler->var);
+         add_tex_src_binding(state, tex, nir_tex_src_texture_deref);
+         add_tex_src_binding(state, tex, nir_tex_src_sampler_deref);
          break;
       }
       default:
@@ -157,18 +179,69 @@
 }
 
 static void
-lower_tex_deref(nir_tex_instr *tex, nir_deref_var *deref,
-                unsigned *const_index, unsigned array_size,
-                nir_tex_src_type src_type, bool allow_indirect,
-                struct apply_pipeline_layout_state *state)
+lower_load_constant(nir_intrinsic_instr *intrin,
+                    struct apply_pipeline_layout_state *state)
 {
    nir_builder *b = &state->builder;
 
-   if (deref->deref.child) {
-      assert(deref->deref.child->deref_type == nir_deref_type_array);
-      nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child);
+   b->cursor = nir_before_instr(&intrin->instr);
 
-      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+   nir_ssa_def *index = nir_imm_int(b, state->constants_offset);
+   nir_ssa_def *offset = nir_iadd(b, nir_ssa_for_src(b, intrin->src[0], 1),
+                                  nir_imm_int(b, nir_intrinsic_base(intrin)));
+
+   nir_intrinsic_instr *load_ubo =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo);
+   load_ubo->num_components = intrin->num_components;
+   load_ubo->src[0] = nir_src_for_ssa(index);
+   load_ubo->src[1] = nir_src_for_ssa(offset);
+   nir_ssa_dest_init(&load_ubo->instr, &load_ubo->dest,
+                     intrin->dest.ssa.num_components,
+                     intrin->dest.ssa.bit_size, NULL);
+   nir_builder_instr_insert(b, &load_ubo->instr);
+
+   nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                            nir_src_for_ssa(&load_ubo->dest.ssa));
+   nir_instr_remove(&intrin->instr);
+}
+
+static void
+lower_tex_deref(nir_tex_instr *tex, nir_tex_src_type deref_src_type,
+                unsigned *base_index,
+                struct apply_pipeline_layout_state *state)
+{
+   int deref_src_idx = nir_tex_instr_src_index(tex, deref_src_type);
+   if (deref_src_idx < 0)
+      return;
+
+   nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
+   nir_variable *var = nir_deref_instr_get_variable(deref);
+
+   unsigned set = var->data.descriptor_set;
+   unsigned binding = var->data.binding;
+   unsigned array_size =
+      state->layout->set[set].layout->binding[binding].array_size;
+
+   nir_tex_src_type offset_src_type;
+   if (deref_src_type == nir_tex_src_texture_deref) {
+      offset_src_type = nir_tex_src_texture_offset;
+      *base_index = state->set[set].surface_offsets[binding];
+   } else {
+      assert(deref_src_type == nir_tex_src_sampler_deref);
+      offset_src_type = nir_tex_src_sampler_offset;
+      *base_index = state->set[set].sampler_offsets[binding];
+   }
+
+   nir_ssa_def *index = NULL;
+   if (deref->deref_type != nir_deref_type_var) {
+      assert(deref->deref_type == nir_deref_type_array);
+
+      nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+      if (const_index) {
+         *base_index += MIN2(const_index->u32[0], array_size - 1);
+      } else {
+         nir_builder *b = &state->builder;
+
          /* From VK_KHR_sampler_ycbcr_conversion:
           *
           * If sampler Y’CBCR conversion is enabled, the combined image
@@ -176,67 +249,34 @@
           * aggregated into arrays in shader code, irrespective of the
           * shaderSampledImageArrayDynamicIndexing feature.
           */
-         assert(allow_indirect);
+         assert(nir_tex_instr_src_index(tex, nir_tex_src_plane) == -1);
 
-         nir_ssa_def *index =
-            nir_iadd(b, nir_imm_int(b, deref_array->base_offset),
-                        nir_ssa_for_src(b, deref_array->indirect, 1));
+         index = nir_ssa_for_src(b, deref->arr.index, 1);
 
          if (state->add_bounds_checks)
             index = nir_umin(b, index, nir_imm_int(b, array_size - 1));
-
-         nir_tex_instr_add_src(tex, src_type, nir_src_for_ssa(index));
-      } else {
-         *const_index += MIN2(deref_array->base_offset, array_size - 1);
       }
    }
-}
 
-static void
-cleanup_tex_deref(nir_tex_instr *tex, nir_deref_var *deref)
-{
-   if (deref->deref.child == NULL)
-      return;
-
-   nir_deref_array *deref_array = nir_deref_as_array(deref->deref.child);
-
-   if (deref_array->deref_array_type != nir_deref_array_type_indirect)
-      return;
-
-   nir_instr_rewrite_src(&tex->instr, &deref_array->indirect, NIR_SRC_INIT);
-}
-
-static bool
-has_tex_src_plane(nir_tex_instr *tex)
-{
-   for (unsigned i = 0; i < tex->num_srcs; i++) {
-      if (tex->src[i].src_type == nir_tex_src_plane)
-         return true;
+   if (index) {
+      nir_instr_rewrite_src(&tex->instr, &tex->src[deref_src_idx].src,
+                            nir_src_for_ssa(index));
+      tex->src[deref_src_idx].src_type = offset_src_type;
+   } else {
+      nir_tex_instr_remove_src(tex, deref_src_idx);
    }
-
-   return false;
 }
 
 static uint32_t
-extract_tex_src_plane(nir_tex_instr *tex)
+tex_instr_get_and_remove_plane_src(nir_tex_instr *tex)
 {
-   unsigned plane = 0;
+   int plane_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_plane);
+   if (plane_src_idx < 0)
+      return 0;
 
-   int plane_src_idx = -1;
-   for (unsigned i = 0; i < tex->num_srcs; i++) {
-      if (tex->src[i].src_type == nir_tex_src_plane) {
-         nir_const_value *const_plane =
-            nir_src_as_const_value(tex->src[i].src);
+   unsigned plane =
+      nir_src_as_const_value(tex->src[plane_src_idx].src)->u32[0];
 
-         /* Our color conversion lowering pass should only ever insert
-          * constants. */
-         assert(const_plane);
-         plane = const_plane->u32[0];
-         plane_src_idx = i;
-      }
-   }
-
-   assert(plane_src_idx >= 0);
    nir_tex_instr_remove_src(tex, plane_src_idx);
 
    return plane;
@@ -245,44 +285,22 @@
 static void
 lower_tex(nir_tex_instr *tex, struct apply_pipeline_layout_state *state)
 {
-   /* No one should have come by and lowered it already */
-   assert(tex->texture);
-
    state->builder.cursor = nir_before_instr(&tex->instr);
 
-   unsigned set = tex->texture->var->data.descriptor_set;
-   unsigned binding = tex->texture->var->data.binding;
-   unsigned array_size =
-      state->layout->set[set].layout->binding[binding].array_size;
-   bool has_plane = has_tex_src_plane(tex);
-   unsigned plane = has_plane ? extract_tex_src_plane(tex) : 0;
+   unsigned plane = tex_instr_get_and_remove_plane_src(tex);
 
-   tex->texture_index = state->set[set].surface_offsets[binding];
-   lower_tex_deref(tex, tex->texture, &tex->texture_index, array_size,
-                   nir_tex_src_texture_offset, !has_plane, state);
+   lower_tex_deref(tex, nir_tex_src_texture_deref,
+                   &tex->texture_index, state);
    tex->texture_index += plane;
 
-   if (tex->sampler) {
-      unsigned set = tex->sampler->var->data.descriptor_set;
-      unsigned binding = tex->sampler->var->data.binding;
-      unsigned array_size =
-         state->layout->set[set].layout->binding[binding].array_size;
-      tex->sampler_index = state->set[set].sampler_offsets[binding];
-      lower_tex_deref(tex, tex->sampler, &tex->sampler_index, array_size,
-                      nir_tex_src_sampler_offset, !has_plane, state);
-      tex->sampler_index += plane;
-   }
+   lower_tex_deref(tex, nir_tex_src_sampler_deref,
+                   &tex->sampler_index, state);
+   tex->sampler_index += plane;
 
    /* The backend only ever uses this to mark used surfaces.  We don't care
     * about that little optimization so it just needs to be non-zero.
     */
    tex->texture_array_size = 1;
-
-   cleanup_tex_deref(tex, tex->texture);
-   if (tex->sampler)
-      cleanup_tex_deref(tex, tex->sampler);
-   tex->texture = NULL;
-   tex->sampler = NULL;
 }
 
 static void
@@ -300,6 +318,9 @@
          case nir_intrinsic_vulkan_resource_reindex:
             lower_res_reindex_intrinsic(intrin, state);
             break;
+         case nir_intrinsic_load_constant:
+            lower_load_constant(intrin, state);
+            break;
          default:
             break;
          }
@@ -358,6 +379,9 @@
          get_used_bindings_block(block, &state);
    }
 
+   if (state.uses_constants)
+      map->surface_count++;
+
    for (uint32_t set = 0; set < layout->num_sets; set++) {
       struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
 
@@ -380,6 +404,14 @@
    unsigned surface = 0;
    unsigned sampler = 0;
    unsigned image = 0;
+
+   if (state.uses_constants) {
+      state.constants_offset = surface;
+      map->surface_to_descriptor[surface].set =
+         ANV_DESCRIPTOR_SET_SHADER_CONSTANTS;
+      surface++;
+   }
+
    for (uint32_t set = 0; set < layout->num_sets; set++) {
       struct anv_descriptor_set_layout *set_layout = layout->set[set].layout;
 
@@ -427,10 +459,12 @@
    }
 
    nir_foreach_variable(var, &shader->uniforms) {
-      if (!glsl_type_is_image(var->interface_type))
+      const struct glsl_type *glsl_type = glsl_without_array(var->type);
+
+      if (!glsl_type_is_image(glsl_type))
          continue;
 
-      enum glsl_sampler_dim dim = glsl_get_sampler_dim(var->interface_type);
+      enum glsl_sampler_dim dim = glsl_get_sampler_dim(glsl_type);
 
       const uint32_t set = var->data.descriptor_set;
       const uint32_t binding = var->data.binding;
diff --git a/src/intel/vulkan/anv_nir_lower_input_attachments.c b/src/intel/vulkan/anv_nir_lower_input_attachments.c
index 6dc4f90..81e5ad5 100644
--- a/src/intel/vulkan/anv_nir_lower_input_attachments.c
+++ b/src/intel/vulkan/anv_nir_lower_input_attachments.c
@@ -43,10 +43,10 @@
 static void
 try_lower_input_load(nir_function_impl *impl, nir_intrinsic_instr *load)
 {
+   nir_deref_instr *deref = nir_src_as_deref(load->src[0]);
+   assert(glsl_type_is_image(deref->type));
 
-   const struct glsl_type *image_type =
-      glsl_without_array(load->variables[0]->var->type);
-   enum glsl_sampler_dim image_dim = glsl_get_sampler_dim(image_type);
+   enum glsl_sampler_dim image_dim = glsl_get_sampler_dim(deref->type);
    if (image_dim != GLSL_SAMPLER_DIM_SUBPASS &&
        image_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
       return;
@@ -58,7 +58,7 @@
    b.cursor = nir_before_instr(&load->instr);
 
    nir_ssa_def *frag_coord = nir_f2i32(&b, load_frag_coord(&b));
-   nir_ssa_def *offset = nir_ssa_for_src(&b, load->src[0], 2);
+   nir_ssa_def *offset = nir_ssa_for_src(&b, load->src[1], 2);
    nir_ssa_def *pos = nir_iadd(&b, frag_coord, offset);
 
    nir_ssa_def *layer =
@@ -66,11 +66,11 @@
    nir_ssa_def *coord =
       nir_vec3(&b, nir_channel(&b, pos, 0), nir_channel(&b, pos, 1), layer);
 
-   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 2 + multisampled);
+   nir_tex_instr *tex = nir_tex_instr_create(b.shader, 3 + multisampled);
 
    tex->op = nir_texop_txf;
 
-   switch (glsl_get_sampler_result_type(image_type)) {
+   switch (glsl_get_sampler_result_type(deref->type)) {
    case GLSL_TYPE_FLOAT:
       tex->dest_type = nir_type_float;
       break;
@@ -86,22 +86,23 @@
    tex->is_array = true;
    tex->is_shadow = false;
 
-   tex->texture = nir_deref_var_clone(load->variables[0], tex);
-   tex->sampler = NULL;
    tex->texture_index = 0;
    tex->sampler_index = 0;
 
-   tex->src[0].src_type = nir_tex_src_coord;
-   tex->src[0].src = nir_src_for_ssa(coord);
+   tex->src[0].src_type = nir_tex_src_texture_deref;
+   tex->src[0].src = nir_src_for_ssa(&deref->dest.ssa);
+
+   tex->src[1].src_type = nir_tex_src_coord;
+   tex->src[1].src = nir_src_for_ssa(coord);
    tex->coord_components = 3;
 
-   tex->src[1].src_type = nir_tex_src_lod;
-   tex->src[1].src = nir_src_for_ssa(nir_imm_int(&b, 0));
+   tex->src[2].src_type = nir_tex_src_lod;
+   tex->src[2].src = nir_src_for_ssa(nir_imm_int(&b, 0));
 
    if (image_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) {
       tex->op = nir_texop_txf_ms;
-      tex->src[2].src_type = nir_tex_src_ms_index;
-      tex->src[2].src = load->src[1];
+      tex->src[3].src_type = nir_tex_src_ms_index;
+      tex->src[3].src = load->src[2];
    }
 
    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
@@ -127,7 +128,7 @@
 
             nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
 
-            if (load->intrinsic != nir_intrinsic_image_var_load)
+            if (load->intrinsic != nir_intrinsic_image_deref_load)
                continue;
 
             try_lower_input_load(function->impl, load);
diff --git a/src/intel/vulkan/anv_nir_lower_multiview.c b/src/intel/vulkan/anv_nir_lower_multiview.c
index 6822595..bde7aad 100644
--- a/src/intel/vulkan/anv_nir_lower_multiview.c
+++ b/src/intel/vulkan/anv_nir_lower_multiview.c
@@ -134,18 +134,11 @@
          if (b->shader->info.stage == MESA_SHADER_FRAGMENT)
             idx_var->data.interpolation = INTERP_MODE_FLAT;
 
-         if (glsl_type_is_array(type)) {
-            nir_deref_var *deref = nir_deref_var_create(b->shader, idx_var);
-            nir_deref_array *arr = nir_deref_array_create(b->shader);
-            arr->deref.type = glsl_int_type();
-            arr->deref_array_type = nir_deref_array_type_direct;
-            arr->base_offset = 0;
-            deref->deref.child = &arr->deref;
+         nir_deref_instr *deref = nir_build_deref_var(b, idx_var);
+         if (glsl_type_is_array(type))
+            deref = nir_build_deref_array(b, deref, nir_imm_int(b, 0));
 
-            state->view_index = nir_load_deref_var(b, deref);
-         } else {
-            state->view_index = nir_load_var(b, idx_var);
-         }
+         state->view_index = nir_load_deref(b, deref);
       }
    }
 
diff --git a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
index e2b5603..71e511f 100644
--- a/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
+++ b/src/intel/vulkan/anv_nir_lower_ycbcr_textures.c
@@ -30,6 +30,7 @@
    nir_builder *builder;
    nir_ssa_def *image_size;
    nir_tex_instr *origin_tex;
+   nir_deref_instr *tex_deref;
    struct anv_ycbcr_conversion *conversion;
 };
 
@@ -152,22 +153,24 @@
 
 /* TODO: we should probably replace this with a push constant/uniform. */
 static nir_ssa_def *
-get_texture_size(struct ycbcr_state *state, nir_deref_var *texture)
+get_texture_size(struct ycbcr_state *state, nir_deref_instr *texture)
 {
    if (state->image_size)
       return state->image_size;
 
    nir_builder *b = state->builder;
-   const struct glsl_type *type = nir_deref_tail(&texture->deref)->type;
-   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 0);
+   const struct glsl_type *type = texture->type;
+   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
 
    tex->op = nir_texop_txs;
    tex->sampler_dim = glsl_get_sampler_dim(type);
    tex->is_array = glsl_sampler_type_is_array(type);
    tex->is_shadow = glsl_sampler_type_is_shadow(type);
-   tex->texture = nir_deref_var_clone(texture, tex);
    tex->dest_type = nir_type_int;
 
+   tex->src[0].src_type = nir_tex_src_texture_deref;
+   tex->src[0].src = nir_src_for_ssa(&texture->dest.ssa);
+
    nir_ssa_dest_init(&tex->instr, &tex->dest,
                      nir_tex_instr_dest_size(tex), 32, NULL);
    nir_builder_instr_insert(b, &tex->instr);
@@ -199,8 +202,7 @@
 {
    nir_builder *b = state->builder;
    struct anv_ycbcr_conversion *conversion = state->conversion;
-   nir_ssa_def *image_size = get_texture_size(state,
-                                              state->origin_tex->texture);
+   nir_ssa_def *image_size = get_texture_size(state, state->tex_deref);
    nir_ssa_def *comp[4] = { NULL, };
    int c;
 
@@ -266,10 +268,7 @@
 
    tex->texture_index = old_tex->texture_index;
    tex->texture_array_size = old_tex->texture_array_size;
-   tex->texture = nir_deref_var_clone(old_tex->texture, tex);
-
    tex->sampler_index = old_tex->sampler_index;
-   tex->sampler = nir_deref_var_clone(old_tex->sampler, tex);
 
    nir_ssa_dest_init(&tex->instr, &tex->dest,
                      old_tex->dest.ssa.num_components,
@@ -320,7 +319,11 @@
                     nir_builder *builder,
                     nir_tex_instr *tex)
 {
-   nir_variable *var = tex->texture->var;
+   int deref_src_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_deref);
+   assert(deref_src_idx >= 0);
+   nir_deref_instr *deref = nir_src_as_deref(tex->src[deref_src_idx].src);
+
+   nir_variable *var = nir_deref_instr_get_variable(deref);
    const struct anv_descriptor_set_layout *set_layout =
       layout->set[var->data.descriptor_set].layout;
    const struct anv_descriptor_set_binding_layout *binding =
@@ -339,12 +342,12 @@
 
    assert(tex->texture_index == 0);
    unsigned array_index = 0;
-   if (tex->texture->deref.child) {
-      assert(tex->texture->deref.child->deref_type == nir_deref_type_array);
-      nir_deref_array *deref_array = nir_deref_as_array(tex->texture->deref.child);
-      if (deref_array->deref_array_type != nir_deref_array_type_direct)
+   if (deref->deref_type != nir_deref_type_var) {
+      assert(deref->deref_type == nir_deref_type_array);
+      nir_const_value *const_index = nir_src_as_const_value(deref->arr.index);
+      if (!const_index)
          return false;
-      array_index = MIN2(deref_array->base_offset, binding->array_size - 1);
+      array_index = MIN2(const_index->u32[0], binding->array_size - 1);
    }
    const struct anv_sampler *sampler = binding->immutable_samplers[array_index];
 
@@ -354,6 +357,7 @@
    struct ycbcr_state state = {
       .builder = builder,
       .origin_tex = tex,
+      .tex_deref = deref,
       .conversion = sampler->conversion,
    };
 
diff --git a/src/intel/vulkan/anv_pass.c b/src/intel/vulkan/anv_pass.c
index 994a7e2..72bd992 100644
--- a/src/intel/vulkan/anv_pass.c
+++ b/src/intel/vulkan/anv_pass.c
@@ -25,210 +25,97 @@
 
 #include "vk_util.h"
 
-static unsigned
-num_subpass_attachments(const VkSubpassDescription *desc)
-{
-   return desc->inputAttachmentCount +
-          desc->colorAttachmentCount +
-          (desc->pResolveAttachments ? desc->colorAttachmentCount : 0) +
-          (desc->pDepthStencilAttachment != NULL);
-}
-
 static void
-init_first_subpass_layout(struct anv_render_pass_attachment * const att,
-                          const VkAttachmentReference att_ref)
+anv_render_pass_add_subpass_dep(struct anv_render_pass *pass,
+                                const VkSubpassDependency2KHR *dep)
 {
-   if (att->first_subpass_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
-      att->first_subpass_layout = att_ref.layout;
-      assert(att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED);
+   if (dep->dstSubpass == VK_SUBPASS_EXTERNAL) { /* dst side: invalidates */
+      pass->subpass_flushes[pass->subpass_count] |= /* final slot */
+         anv_pipe_invalidate_bits_for_access_flags(dep->dstAccessMask);
+   } else {
+      assert(dep->dstSubpass < pass->subpass_count);
+      pass->subpass_flushes[dep->dstSubpass] |=
+         anv_pipe_invalidate_bits_for_access_flags(dep->dstAccessMask);
+   }
+
+   if (dep->srcSubpass == VK_SUBPASS_EXTERNAL) { /* src side: flushes */
+      pass->subpass_flushes[0] |= /* external source -> slot 0 */
+         anv_pipe_flush_bits_for_access_flags(dep->srcAccessMask);
+   } else {
+      assert(dep->srcSubpass < pass->subpass_count);
+      pass->subpass_flushes[dep->srcSubpass + 1] |= /* slot after src */
+         anv_pipe_flush_bits_for_access_flags(dep->srcAccessMask);
    }
 }
 
-VkResult anv_CreateRenderPass(
-    VkDevice                                    _device,
-    const VkRenderPassCreateInfo*               pCreateInfo,
-    const VkAllocationCallbacks*                pAllocator,
-    VkRenderPass*                               pRenderPass)
+/* Do a second "compile" step on a render pass */
+static void
+anv_render_pass_compile(struct anv_render_pass *pass)
 {
-   ANV_FROM_HANDLE(anv_device, device, _device);
-
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO);
-
-   struct anv_render_pass *pass;
-   struct anv_subpass *subpasses;
-   struct anv_render_pass_attachment *attachments;
-   enum anv_pipe_bits *subpass_flushes;
-
-   ANV_MULTIALLOC(ma);
-   anv_multialloc_add(&ma, &pass, 1);
-   anv_multialloc_add(&ma, &subpasses, pCreateInfo->subpassCount);
-   anv_multialloc_add(&ma, &attachments, pCreateInfo->attachmentCount);
-   anv_multialloc_add(&ma, &subpass_flushes, pCreateInfo->subpassCount + 1);
-
-   struct anv_subpass_attachment *subpass_attachments;
-   uint32_t subpass_attachment_count = 0;
-   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
-      subpass_attachment_count +=
-         num_subpass_attachments(&pCreateInfo->pSubpasses[i]);
-   }
-   anv_multialloc_add(&ma, &subpass_attachments, subpass_attachment_count);
-
-   if (!anv_multialloc_alloc2(&ma, &device->alloc, pAllocator,
-                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
-      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
-
-   /* Clear the subpasses along with the parent pass. This required because
-    * each array member of anv_subpass must be a valid pointer if not NULL.
+   /* The CreateRenderPass code zeros the entire render pass and also uses a
+    * designated initializer for filling these out.  There's no need for us to
+    * do it again.
+    *
+    * for (uint32_t i = 0; i < pass->attachment_count; i++) {
+    *    pass->attachments[i].usage = 0;
+    *    pass->attachments[i].first_subpass_layout = VK_IMAGE_LAYOUT_UNDEFINED;
+    * }
     */
-   memset(pass, 0, ma.size);
-   pass->attachment_count = pCreateInfo->attachmentCount;
-   pass->subpass_count = pCreateInfo->subpassCount;
-   pass->attachments = attachments;
-   pass->subpass_flushes = subpass_flushes;
 
-   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
-      struct anv_render_pass_attachment *att = &pass->attachments[i];
-
-      att->format = pCreateInfo->pAttachments[i].format;
-      att->samples = pCreateInfo->pAttachments[i].samples;
-      att->usage = 0;
-      att->load_op = pCreateInfo->pAttachments[i].loadOp;
-      att->store_op = pCreateInfo->pAttachments[i].storeOp;
-      att->stencil_load_op = pCreateInfo->pAttachments[i].stencilLoadOp;
-      att->initial_layout = pCreateInfo->pAttachments[i].initialLayout;
-      att->final_layout = pCreateInfo->pAttachments[i].finalLayout;
-      att->first_subpass_layout = VK_IMAGE_LAYOUT_UNDEFINED;
-   }
-
-   bool has_color = false, has_depth = false, has_input = false;
-   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
-      const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+   VkImageUsageFlags all_usage = 0;
+   for (uint32_t i = 0; i < pass->subpass_count; i++) {
       struct anv_subpass *subpass = &pass->subpasses[i];
 
-      subpass->input_count = desc->inputAttachmentCount;
-      subpass->color_count = desc->colorAttachmentCount;
-      subpass->attachment_count = num_subpass_attachments(desc);
-      subpass->attachments = subpass_attachments;
-      subpass->view_mask = 0;
+      /* We don't allow depth_stencil_attachment to be non-NULL and be
+       * VK_ATTACHMENT_UNUSED.  This way something can just check for NULL
+       * and be guaranteed that they have a valid attachment.
+       */
+      if (subpass->depth_stencil_attachment &&
+          subpass->depth_stencil_attachment->attachment == VK_ATTACHMENT_UNUSED)
+         subpass->depth_stencil_attachment = NULL;
 
-      if (desc->inputAttachmentCount > 0) {
-         subpass->input_attachments = subpass_attachments;
-         subpass_attachments += desc->inputAttachmentCount;
+      for (uint32_t j = 0; j < subpass->attachment_count; j++) {
+         struct anv_subpass_attachment *subpass_att = &subpass->attachments[j];
+         if (subpass_att->attachment == VK_ATTACHMENT_UNUSED)
+            continue;
 
-         for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
-            uint32_t a = desc->pInputAttachments[j].attachment;
-            subpass->input_attachments[j] = (struct anv_subpass_attachment) {
-               .usage =       VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
-               .attachment =  desc->pInputAttachments[j].attachment,
-               .layout =      desc->pInputAttachments[j].layout,
-            };
-            if (a != VK_ATTACHMENT_UNUSED) {
-               has_input = true;
-               pass->attachments[a].usage |= VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT;
-               pass->attachments[a].last_subpass_idx = i;
+         struct anv_render_pass_attachment *pass_att =
+            &pass->attachments[subpass_att->attachment];
 
-               init_first_subpass_layout(&pass->attachments[a],
-                                         desc->pInputAttachments[j]);
-               if (desc->pDepthStencilAttachment &&
-                   a == desc->pDepthStencilAttachment->attachment)
-                  subpass->has_ds_self_dep = true;
-            }
+         assert(__builtin_popcount(subpass_att->usage) == 1);
+         pass_att->usage |= subpass_att->usage;
+         pass_att->last_subpass_idx = i;
+
+         all_usage |= subpass_att->usage;
+
+         if (pass_att->first_subpass_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
+            pass_att->first_subpass_layout = subpass_att->layout;
+            assert(pass_att->first_subpass_layout != VK_IMAGE_LAYOUT_UNDEFINED);
          }
+
+         if (subpass_att->usage == VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT &&
+             subpass->depth_stencil_attachment &&
+             subpass_att->attachment == subpass->depth_stencil_attachment->attachment)
+            subpass->has_ds_self_dep = true;
       }
 
-      if (desc->colorAttachmentCount > 0) {
-         subpass->color_attachments = subpass_attachments;
-         subpass_attachments += desc->colorAttachmentCount;
-
-         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
-            uint32_t a = desc->pColorAttachments[j].attachment;
-            subpass->color_attachments[j] = (struct anv_subpass_attachment) {
-               .usage =       VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
-               .attachment =  desc->pColorAttachments[j].attachment,
-               .layout =      desc->pColorAttachments[j].layout,
-            };
-            if (a != VK_ATTACHMENT_UNUSED) {
-               has_color = true;
-               pass->attachments[a].usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
-               pass->attachments[a].last_subpass_idx = i;
-
-               init_first_subpass_layout(&pass->attachments[a],
-                                         desc->pColorAttachments[j]);
-            }
-         }
-      }
-
+      /* We have to handle resolve attachments specially */
       subpass->has_resolve = false;
-      if (desc->pResolveAttachments) {
-         subpass->resolve_attachments = subpass_attachments;
-         subpass_attachments += desc->colorAttachmentCount;
+      if (subpass->resolve_attachments) {
+         for (uint32_t j = 0; j < subpass->color_count; j++) {
+            struct anv_subpass_attachment *color_att =
+               &subpass->color_attachments[j];
+            struct anv_subpass_attachment *resolve_att =
+               &subpass->resolve_attachments[j];
+            if (resolve_att->attachment == VK_ATTACHMENT_UNUSED)
+               continue;
 
-         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
-            uint32_t a = desc->pResolveAttachments[j].attachment;
-            subpass->resolve_attachments[j] = (struct anv_subpass_attachment) {
-               .usage =       VK_IMAGE_USAGE_TRANSFER_DST_BIT,
-               .attachment =  desc->pResolveAttachments[j].attachment,
-               .layout =      desc->pResolveAttachments[j].layout,
-            };
-            if (a != VK_ATTACHMENT_UNUSED) {
-               subpass->has_resolve = true;
-               uint32_t color_att = desc->pColorAttachments[j].attachment;
-               pass->attachments[color_att].usage |=
-                  VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
-               pass->attachments[a].usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
-               pass->attachments[a].last_subpass_idx = i;
+            subpass->has_resolve = true;
 
-               init_first_subpass_layout(&pass->attachments[a],
-                                         desc->pResolveAttachments[j]);
-            }
+            assert(resolve_att->usage == VK_IMAGE_USAGE_TRANSFER_DST_BIT);
+            color_att->usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT;
          }
       }
-
-      if (desc->pDepthStencilAttachment) {
-         uint32_t a = desc->pDepthStencilAttachment->attachment;
-         subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
-            .usage =       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
-            .attachment =  desc->pDepthStencilAttachment->attachment,
-            .layout =      desc->pDepthStencilAttachment->layout,
-         };
-         *subpass_attachments++ = subpass->depth_stencil_attachment;
-         if (a != VK_ATTACHMENT_UNUSED) {
-            has_depth = true;
-            pass->attachments[a].usage |=
-               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT;
-            pass->attachments[a].last_subpass_idx = i;
-
-            init_first_subpass_layout(&pass->attachments[a],
-                                      *desc->pDepthStencilAttachment);
-         }
-      } else {
-         subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
-            .usage =       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
-            .attachment =  VK_ATTACHMENT_UNUSED,
-            .layout =   VK_IMAGE_LAYOUT_UNDEFINED,
-         };
-      }
-   }
-
-   for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
-      const VkSubpassDependency *dep = &pCreateInfo->pDependencies[i];
-      if (dep->dstSubpass == VK_SUBPASS_EXTERNAL) {
-         pass->subpass_flushes[pass->subpass_count] |=
-            anv_pipe_invalidate_bits_for_access_flags(dep->dstAccessMask);
-      } else {
-         assert(dep->dstSubpass < pass->subpass_count);
-         pass->subpass_flushes[dep->dstSubpass] |=
-            anv_pipe_invalidate_bits_for_access_flags(dep->dstAccessMask);
-      }
-
-      if (dep->srcSubpass == VK_SUBPASS_EXTERNAL) {
-         pass->subpass_flushes[0] |=
-            anv_pipe_flush_bits_for_access_flags(dep->srcAccessMask);
-      } else {
-         assert(dep->srcSubpass < pass->subpass_count);
-         pass->subpass_flushes[dep->srcSubpass + 1] |=
-            anv_pipe_flush_bits_for_access_flags(dep->srcAccessMask);
-      }
    }
 
    /* From the Vulkan 1.0.39 spec:
@@ -278,18 +165,156 @@
     * dependency.  Or, we could just be lazy and add a couple extra flushes.
     * We choose to be lazy.
     */
-   if (has_input) {
+   if (all_usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT) {
       pass->subpass_flushes[0] |=
          ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
    }
-   if (has_color) {
+   if (all_usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
       pass->subpass_flushes[pass->subpass_count] |=
          ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
    }
-   if (has_depth) {
+   if (all_usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
       pass->subpass_flushes[pass->subpass_count] |=
          ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
    }
+}
+
+static unsigned
+num_subpass_attachments(const VkSubpassDescription *desc)
+{
+   /* Inputs + colors, plus per-color resolves and a DS slot when present. */
+   unsigned count = desc->inputAttachmentCount + desc->colorAttachmentCount;
+   count += desc->pResolveAttachments ? desc->colorAttachmentCount : 0;
+   return count + (desc->pDepthStencilAttachment ? 1 : 0);
+}
+
+VkResult anv_CreateRenderPass(
+    VkDevice                                    _device,
+    const VkRenderPassCreateInfo*               pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkRenderPass*                               pRenderPass)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO);
+
+   struct anv_render_pass *pass;
+   struct anv_subpass *subpasses;
+   struct anv_render_pass_attachment *attachments;
+   enum anv_pipe_bits *subpass_flushes;
+
+   ANV_MULTIALLOC(ma);
+   anv_multialloc_add(&ma, &pass, 1);
+   anv_multialloc_add(&ma, &subpasses, pCreateInfo->subpassCount);
+   anv_multialloc_add(&ma, &attachments, pCreateInfo->attachmentCount);
+   anv_multialloc_add(&ma, &subpass_flushes, pCreateInfo->subpassCount + 1);
+
+   struct anv_subpass_attachment *subpass_attachments;
+   uint32_t subpass_attachment_count = 0;
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      subpass_attachment_count +=
+         num_subpass_attachments(&pCreateInfo->pSubpasses[i]);
+   }
+   anv_multialloc_add(&ma, &subpass_attachments, subpass_attachment_count);
+
+   if (!anv_multialloc_alloc2(&ma, &device->alloc, pAllocator,
+                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Clear the subpasses along with the parent pass. This is required because
+    * each array member of anv_subpass must be a valid pointer if not NULL.
+    */
+   memset(pass, 0, ma.size);
+   pass->attachment_count = pCreateInfo->attachmentCount;
+   pass->subpass_count = pCreateInfo->subpassCount;
+   pass->attachments = attachments;
+   pass->subpass_flushes = subpass_flushes;
+
+   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+      pass->attachments[i] = (struct anv_render_pass_attachment) {
+         .format                 = pCreateInfo->pAttachments[i].format,
+         .samples                = pCreateInfo->pAttachments[i].samples,
+         .load_op                = pCreateInfo->pAttachments[i].loadOp,
+         .store_op               = pCreateInfo->pAttachments[i].storeOp,
+         .stencil_load_op        = pCreateInfo->pAttachments[i].stencilLoadOp,
+         .initial_layout         = pCreateInfo->pAttachments[i].initialLayout,
+         .final_layout           = pCreateInfo->pAttachments[i].finalLayout,
+      };
+   }
+
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      const VkSubpassDescription *desc = &pCreateInfo->pSubpasses[i];
+      struct anv_subpass *subpass = &pass->subpasses[i];
+
+      subpass->input_count = desc->inputAttachmentCount;
+      subpass->color_count = desc->colorAttachmentCount;
+      subpass->attachment_count = num_subpass_attachments(desc);
+      subpass->attachments = subpass_attachments;
+      subpass->view_mask = 0;
+
+      if (desc->inputAttachmentCount > 0) {
+         subpass->input_attachments = subpass_attachments;
+         subpass_attachments += desc->inputAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+            subpass->input_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
+               .attachment =  desc->pInputAttachments[j].attachment,
+               .layout =      desc->pInputAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->colorAttachmentCount > 0) {
+         subpass->color_attachments = subpass_attachments;
+         subpass_attachments += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->color_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+               .attachment =  desc->pColorAttachments[j].attachment,
+               .layout =      desc->pColorAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->pResolveAttachments) {
+         subpass->resolve_attachments = subpass_attachments;
+         subpass_attachments += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->resolve_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+               .attachment =  desc->pResolveAttachments[j].attachment,
+               .layout =      desc->pResolveAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->pDepthStencilAttachment) {
+         subpass->depth_stencil_attachment = subpass_attachments++;
+
+         *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
+            .usage =       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+            .attachment =  desc->pDepthStencilAttachment->attachment,
+            .layout =      desc->pDepthStencilAttachment->layout,
+         };
+      }
+   }
+
+   for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++) {
+      /* Convert to a Dependency2KHR */
+      struct VkSubpassDependency2KHR dep2 = {
+         .srcSubpass       = pCreateInfo->pDependencies[i].srcSubpass,
+         .dstSubpass       = pCreateInfo->pDependencies[i].dstSubpass,
+         .srcStageMask     = pCreateInfo->pDependencies[i].srcStageMask,
+         .dstStageMask     = pCreateInfo->pDependencies[i].dstStageMask,
+         .srcAccessMask    = pCreateInfo->pDependencies[i].srcAccessMask,
+         .dstAccessMask    = pCreateInfo->pDependencies[i].dstAccessMask,
+         .dependencyFlags  = pCreateInfo->pDependencies[i].dependencyFlags,
+      };
+      anv_render_pass_add_subpass_dep(pass, &dep2);
+   }
 
    vk_foreach_struct(ext, pCreateInfo->pNext) {
       switch (ext->sType) {
@@ -307,6 +332,148 @@
       }
    }
 
+   anv_render_pass_compile(pass);
+
+   *pRenderPass = anv_render_pass_to_handle(pass);
+
+   return VK_SUCCESS;
+}
+
+static unsigned
+num_subpass_attachments2(const VkSubpassDescription2KHR *desc)
+{
+   /* Slots: inputs + colors, then resolves and DS only when provided. */
+   unsigned total = desc->inputAttachmentCount + desc->colorAttachmentCount;
+   total += desc->pResolveAttachments ? desc->colorAttachmentCount : 0;
+   return total + (desc->pDepthStencilAttachment ? 1 : 0);
+}
+
+VkResult anv_CreateRenderPass2KHR(
+    VkDevice                                    _device,
+    const VkRenderPassCreateInfo2KHR*           pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkRenderPass*                               pRenderPass)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO_2_KHR);
+
+   struct anv_render_pass *pass;
+   struct anv_subpass *subpasses;
+   struct anv_render_pass_attachment *attachments;
+   enum anv_pipe_bits *subpass_flushes;
+
+   ANV_MULTIALLOC(ma);
+   anv_multialloc_add(&ma, &pass, 1);
+   anv_multialloc_add(&ma, &subpasses, pCreateInfo->subpassCount);
+   anv_multialloc_add(&ma, &attachments, pCreateInfo->attachmentCount);
+   anv_multialloc_add(&ma, &subpass_flushes, pCreateInfo->subpassCount + 1);
+
+   struct anv_subpass_attachment *subpass_attachments;
+   uint32_t subpass_attachment_count = 0;
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      subpass_attachment_count +=
+         num_subpass_attachments2(&pCreateInfo->pSubpasses[i]);
+   }
+   anv_multialloc_add(&ma, &subpass_attachments, subpass_attachment_count);
+
+   if (!anv_multialloc_alloc2(&ma, &device->alloc, pAllocator,
+                              VK_SYSTEM_ALLOCATION_SCOPE_OBJECT))
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   /* Clear the subpasses along with the parent pass. This is required because
+    * each array member of anv_subpass must be a valid pointer if not NULL.
+    */
+   memset(pass, 0, ma.size);
+   pass->attachment_count = pCreateInfo->attachmentCount;
+   pass->subpass_count = pCreateInfo->subpassCount;
+   pass->attachments = attachments;
+   pass->subpass_flushes = subpass_flushes;
+
+   for (uint32_t i = 0; i < pCreateInfo->attachmentCount; i++) {
+      pass->attachments[i] = (struct anv_render_pass_attachment) {
+         .format                 = pCreateInfo->pAttachments[i].format,
+         .samples                = pCreateInfo->pAttachments[i].samples,
+         .load_op                = pCreateInfo->pAttachments[i].loadOp,
+         .store_op               = pCreateInfo->pAttachments[i].storeOp,
+         .stencil_load_op        = pCreateInfo->pAttachments[i].stencilLoadOp,
+         .initial_layout         = pCreateInfo->pAttachments[i].initialLayout,
+         .final_layout           = pCreateInfo->pAttachments[i].finalLayout,
+      };
+   }
+
+   for (uint32_t i = 0; i < pCreateInfo->subpassCount; i++) {
+      const VkSubpassDescription2KHR *desc = &pCreateInfo->pSubpasses[i];
+      struct anv_subpass *subpass = &pass->subpasses[i];
+
+      subpass->input_count = desc->inputAttachmentCount;
+      subpass->color_count = desc->colorAttachmentCount;
+      subpass->attachment_count = num_subpass_attachments2(desc);
+      subpass->attachments = subpass_attachments;
+      subpass->view_mask = desc->viewMask;
+
+      if (desc->inputAttachmentCount > 0) {
+         subpass->input_attachments = subpass_attachments;
+         subpass_attachments += desc->inputAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->inputAttachmentCount; j++) {
+            subpass->input_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT,
+               .attachment =  desc->pInputAttachments[j].attachment,
+               .layout =      desc->pInputAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->colorAttachmentCount > 0) {
+         subpass->color_attachments = subpass_attachments;
+         subpass_attachments += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->color_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+               .attachment =  desc->pColorAttachments[j].attachment,
+               .layout =      desc->pColorAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->pResolveAttachments) {
+         subpass->resolve_attachments = subpass_attachments;
+         subpass_attachments += desc->colorAttachmentCount;
+
+         for (uint32_t j = 0; j < desc->colorAttachmentCount; j++) {
+            subpass->resolve_attachments[j] = (struct anv_subpass_attachment) {
+               .usage =       VK_IMAGE_USAGE_TRANSFER_DST_BIT,
+               .attachment =  desc->pResolveAttachments[j].attachment,
+               .layout =      desc->pResolveAttachments[j].layout,
+            };
+         }
+      }
+
+      if (desc->pDepthStencilAttachment) {
+         subpass->depth_stencil_attachment = subpass_attachments++;
+
+         *subpass->depth_stencil_attachment = (struct anv_subpass_attachment) {
+            .usage =       VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
+            .attachment =  desc->pDepthStencilAttachment->attachment,
+            .layout =      desc->pDepthStencilAttachment->layout,
+         };
+      }
+   }
+
+   for (uint32_t i = 0; i < pCreateInfo->dependencyCount; i++)
+      anv_render_pass_add_subpass_dep(pass, &pCreateInfo->pDependencies[i]);
+
+   vk_foreach_struct(ext, pCreateInfo->pNext) {
+      switch (ext->sType) {
+      default:
+         anv_debug_ignored_stype(ext->sType);
+      }
+   }
+
+   anv_render_pass_compile(pass);
+
    *pRenderPass = anv_render_pass_to_handle(pass);
 
    return VK_SUCCESS;
@@ -334,8 +501,7 @@
     * for all sample counts.
     */
    for (unsigned i = 0; i < pass->subpass_count; ++i) {
-      if (pass->subpasses[i].depth_stencil_attachment.attachment !=
-          VK_ATTACHMENT_UNUSED) {
+      if (pass->subpasses[i].depth_stencil_attachment) {
          *pGranularity = (VkExtent2D) { .width = 8, .height = 4 };
          return;
       }
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 64aab64..184a238 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -33,6 +33,7 @@
 #include "compiler/brw_nir.h"
 #include "anv_nir.h"
 #include "spirv/nir_spirv.h"
+#include "vk_util.h"
 
 /* Needed for SWIZZLE macros */
 #include "program/prog_instruction.h"
@@ -144,12 +145,17 @@
          .multiview = true,
          .variable_pointers = true,
          .storage_16bit = device->instance->physicalDevice.info.gen >= 8,
+         .int16 = device->instance->physicalDevice.info.gen >= 8,
+         .shader_viewport_index_layer = true,
          .subgroup_arithmetic = true,
          .subgroup_basic = true,
          .subgroup_ballot = true,
          .subgroup_quad = true,
          .subgroup_shuffle = true,
          .subgroup_vote = true,
+         .stencil_export = device->instance->physicalDevice.info.gen >= 9,
+         .storage_8bit = device->instance->physicalDevice.info.gen >= 8,
+         .post_depth_coverage = device->instance->physicalDevice.info.gen >= 9,
       },
    };
 
@@ -177,6 +183,7 @@
    NIR_PASS_V(nir, nir_lower_constant_initializers, nir_var_local);
    NIR_PASS_V(nir, nir_lower_returns);
    NIR_PASS_V(nir, nir_inline_functions);
+   NIR_PASS_V(nir, nir_copy_prop);
 
    /* Pick off the single entrypoint that we want */
    foreach_list_typed_safe(nir_function, func, node, &nir->functions) {
@@ -193,6 +200,12 @@
     */
    NIR_PASS_V(nir, nir_lower_constant_initializers, ~0);
 
+   /* Split member structs.  We do this before lower_io_to_temporaries so that
+    * it doesn't lower system values to temporaries by accident.
+    */
+   NIR_PASS_V(nir, nir_split_var_copies);
+   NIR_PASS_V(nir, nir_split_per_member_structs);
+
    NIR_PASS_V(nir, nir_remove_dead_variables,
               nir_var_shader_in | nir_var_shader_out | nir_var_system_value);
 
@@ -295,6 +308,27 @@
 }
 
 static void
+populate_tcs_prog_key(const struct gen_device_info *devinfo,
+                      unsigned input_vertices,
+                      struct brw_tcs_prog_key *key) /* out: fully rewritten */
+{
+   memset(key, 0, sizeof(*key)); /* clear all bytes, not just named fields */
+
+   populate_sampler_prog_key(devinfo, &key->tex);
+
+   key->input_vertices = input_vertices; /* patch control point count */
+}
+
+static void
+populate_tes_prog_key(const struct gen_device_info *devinfo,
+                      struct brw_tes_prog_key *key) /* out: fully rewritten */
+{
+   memset(key, 0, sizeof(*key)); /* clear all bytes, not just named fields */
+
+   populate_sampler_prog_key(devinfo, &key->tex); /* only fields set here */
+}
+
+static void
 populate_gs_prog_key(const struct gen_device_info *devinfo,
                      struct brw_gs_prog_key *key)
 {
@@ -304,21 +338,19 @@
 }
 
 static void
-populate_wm_prog_key(const struct anv_pipeline *pipeline,
-                     const VkGraphicsPipelineCreateInfo *info,
+populate_wm_prog_key(const struct gen_device_info *devinfo,
+                     const struct anv_subpass *subpass,
+                     const VkPipelineMultisampleStateCreateInfo *ms_info,
                      struct brw_wm_prog_key *key)
 {
-   const struct gen_device_info *devinfo = &pipeline->device->info;
-
    memset(key, 0, sizeof(*key));
 
    populate_sampler_prog_key(devinfo, &key->tex);
 
-   /* TODO: we could set this to 0 based on the information in nir_shader, but
-    * this function is called before spirv_to_nir. */
-   const struct brw_vue_map *vue_map =
-      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
-   key->input_slots_valid = vue_map->slots_valid;
+   /* We set this to 0 here and set to the actual value before we call
+    * brw_compile_fs.
+    */
+   key->input_slots_valid = 0;
 
    /* Vulkan doesn't specify a default */
    key->high_quality_derivatives = false;
@@ -326,25 +358,28 @@
    /* XXX Vulkan doesn't appear to specify */
    key->clamp_fragment_color = false;
 
-   key->nr_color_regions = pipeline->subpass->color_count;
+   assert(subpass->color_count <= MAX_RTS);
+   for (uint32_t i = 0; i < subpass->color_count; i++) {
+      if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
+         key->color_outputs_valid |= (1 << i);
+   }
+
+   key->nr_color_regions = _mesa_bitcount(key->color_outputs_valid);
 
    key->replicate_alpha = key->nr_color_regions > 1 &&
-                          info->pMultisampleState &&
-                          info->pMultisampleState->alphaToCoverageEnable;
+                          ms_info && ms_info->alphaToCoverageEnable;
 
-   if (info->pMultisampleState) {
+   if (ms_info) {
       /* We should probably pull this out of the shader, but it's fairly
        * harmless to compute it and then let dead-code take care of it.
        */
-      if (info->pMultisampleState->rasterizationSamples > 1) {
+      if (ms_info->rasterizationSamples > 1) {
          key->persample_interp =
-            (info->pMultisampleState->minSampleShading *
-             info->pMultisampleState->rasterizationSamples) > 1;
+            (ms_info->minSampleShading * ms_info->rasterizationSamples) > 1;
          key->multisample_fbo = true;
       }
 
-      key->frag_coord_adds_sample_pos =
-         info->pMultisampleState->sampleShadingEnable;
+      key->frag_coord_adds_sample_pos = ms_info->sampleShadingEnable;
    }
 }
 
@@ -376,6 +411,8 @@
    }
    if (layout)
       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
+   const bool rba = pipeline->device->robust_buffer_access;
+   _mesa_sha1_update(&ctx, &rba, sizeof(rba));
    _mesa_sha1_update(&ctx, module->sha1, sizeof(module->sha1));
    _mesa_sha1_update(&ctx, entrypoint, strlen(entrypoint));
    _mesa_sha1_update(&ctx, &stage, sizeof(stage));
@@ -470,29 +507,6 @@
    prog_data->binding_table.image_start = bias;
 }
 
-static struct anv_shader_bin *
-anv_pipeline_upload_kernel(struct anv_pipeline *pipeline,
-                           struct anv_pipeline_cache *cache,
-                           const void *key_data, uint32_t key_size,
-                           const void *kernel_data, uint32_t kernel_size,
-                           const struct brw_stage_prog_data *prog_data,
-                           uint32_t prog_data_size,
-                           const struct anv_pipeline_bind_map *bind_map)
-{
-   if (cache) {
-      return anv_pipeline_cache_upload_kernel(cache, key_data, key_size,
-                                              kernel_data, kernel_size,
-                                              prog_data, prog_data_size,
-                                              bind_map);
-   } else {
-      return anv_shader_bin_create(pipeline->device, key_data, key_size,
-                                   kernel_data, kernel_size,
-                                   prog_data, prog_data_size,
-                                   prog_data->param, bind_map);
-   }
-}
-
-
 static void
 anv_pipeline_add_compiled_stage(struct anv_pipeline *pipeline,
                                 gl_shader_stage stage,
@@ -513,18 +527,16 @@
       pipeline->device->instance->physicalDevice.compiler;
    struct brw_vs_prog_key key;
    struct anv_shader_bin *bin = NULL;
-   unsigned char sha1[20];
 
    populate_vs_prog_key(&pipeline->device->info, &key);
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
 
-   if (cache) {
-      anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
-                               MESA_SHADER_VERTEX, spec_info,
-                               &key, sizeof(key), sha1);
-      bin = anv_pipeline_cache_search(cache, sha1, 20);
-   }
+   unsigned char sha1[20];
+   anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
+                            MESA_SHADER_VERTEX, spec_info,
+                            &key, sizeof(key), sha1);
+   bin = anv_device_search_for_kernel(pipeline->device, cache, sha1, 20);
 
    if (bin == NULL) {
       struct brw_vs_prog_data prog_data = {};
@@ -563,10 +575,12 @@
       }
 
       unsigned code_size = prog_data.base.base.program_size;
-      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
-                                       shader_code, code_size,
-                                       &prog_data.base.base, sizeof(prog_data),
-                                       &map);
+      bin = anv_device_upload_kernel(pipeline->device, cache, sha1, 20,
+                                     shader_code, code_size,
+                                     nir->constant_data,
+                                     nir->constant_data_size,
+                                     &prog_data.base.base, sizeof(prog_data),
+                                     &map);
       if (!bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -637,27 +651,29 @@
    struct brw_tes_prog_key tes_key = {};
    struct anv_shader_bin *tcs_bin = NULL;
    struct anv_shader_bin *tes_bin = NULL;
-   unsigned char tcs_sha1[40];
-   unsigned char tes_sha1[40];
 
-   populate_sampler_prog_key(&pipeline->device->info, &tcs_key.tex);
-   populate_sampler_prog_key(&pipeline->device->info, &tes_key.tex);
-   tcs_key.input_vertices = info->pTessellationState->patchControlPoints;
+   populate_tcs_prog_key(&pipeline->device->info,
+                         info->pTessellationState->patchControlPoints,
+                         &tcs_key);
+   populate_tes_prog_key(&pipeline->device->info, &tes_key);
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
 
-   if (cache) {
-      anv_pipeline_hash_shader(pipeline, layout, tcs_module, tcs_entrypoint,
-                               MESA_SHADER_TESS_CTRL, tcs_spec_info,
-                               &tcs_key, sizeof(tcs_key), tcs_sha1);
-      anv_pipeline_hash_shader(pipeline, layout, tes_module, tes_entrypoint,
-                               MESA_SHADER_TESS_EVAL, tes_spec_info,
-                               &tes_key, sizeof(tes_key), tes_sha1);
-      memcpy(&tcs_sha1[20], tes_sha1, 20);
-      memcpy(&tes_sha1[20], tcs_sha1, 20);
-      tcs_bin = anv_pipeline_cache_search(cache, tcs_sha1, sizeof(tcs_sha1));
-      tes_bin = anv_pipeline_cache_search(cache, tes_sha1, sizeof(tes_sha1));
-   }
+   unsigned char tcs_sha1[40];
+   unsigned char tes_sha1[40];
+   anv_pipeline_hash_shader(pipeline, layout, tcs_module, tcs_entrypoint,
+                            MESA_SHADER_TESS_CTRL, tcs_spec_info,
+                            &tcs_key, sizeof(tcs_key), tcs_sha1);
+   anv_pipeline_hash_shader(pipeline, layout, tes_module, tes_entrypoint,
+                            MESA_SHADER_TESS_EVAL, tes_spec_info,
+                            &tes_key, sizeof(tes_key), tes_sha1);
+   memcpy(&tcs_sha1[20], tes_sha1, 20);
+   memcpy(&tes_sha1[20], tcs_sha1, 20);
+
+   tcs_bin = anv_device_search_for_kernel(pipeline->device, cache,
+                                          tcs_sha1, sizeof(tcs_sha1));
+   tes_bin = anv_device_search_for_kernel(pipeline->device, cache,
+                                          tes_sha1, sizeof(tes_sha1));
 
    if (tcs_bin == NULL || tes_bin == NULL) {
       struct brw_tcs_prog_data tcs_prog_data = {};
@@ -693,8 +709,8 @@
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       }
 
-      nir_lower_tes_patch_vertices(tes_nir,
-                                   tcs_nir->info.tess.tcs_vertices_out);
+      nir_lower_patch_vertices(tes_nir, tcs_nir->info.tess.tcs_vertices_out,
+                               NULL);
 
       /* Copy TCS info into the TES info */
       merge_tess_info(&tes_nir->info, &tcs_nir->info);
@@ -729,12 +745,14 @@
       }
 
       unsigned code_size = tcs_prog_data.base.base.program_size;
-      tcs_bin = anv_pipeline_upload_kernel(pipeline, cache,
-                                           tcs_sha1, sizeof(tcs_sha1),
-                                           shader_code, code_size,
-                                           &tcs_prog_data.base.base,
-                                           sizeof(tcs_prog_data),
-                                           &tcs_map);
+      tcs_bin = anv_device_upload_kernel(pipeline->device, cache,
+                                         tcs_sha1, sizeof(tcs_sha1),
+                                         shader_code, code_size,
+                                         tcs_nir->constant_data,
+                                         tcs_nir->constant_data_size,
+                                         &tcs_prog_data.base.base,
+                                         sizeof(tcs_prog_data),
+                                         &tcs_map);
       if (!tcs_bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -750,12 +768,14 @@
       }
 
       code_size = tes_prog_data.base.base.program_size;
-      tes_bin = anv_pipeline_upload_kernel(pipeline, cache,
-                                           tes_sha1, sizeof(tes_sha1),
-                                           shader_code, code_size,
-                                           &tes_prog_data.base.base,
-                                           sizeof(tes_prog_data),
-                                           &tes_map);
+      tes_bin = anv_device_upload_kernel(pipeline->device, cache,
+                                         tes_sha1, sizeof(tes_sha1),
+                                         shader_code, code_size,
+                                         tes_nir->constant_data,
+                                         tes_nir->constant_data_size,
+                                         &tes_prog_data.base.base,
+                                         sizeof(tes_prog_data),
+                                         &tes_map);
       if (!tes_bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -782,18 +802,16 @@
       pipeline->device->instance->physicalDevice.compiler;
    struct brw_gs_prog_key key;
    struct anv_shader_bin *bin = NULL;
-   unsigned char sha1[20];
 
    populate_gs_prog_key(&pipeline->device->info, &key);
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
 
-   if (cache) {
-      anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
-                               MESA_SHADER_GEOMETRY, spec_info,
-                               &key, sizeof(key), sha1);
-      bin = anv_pipeline_cache_search(cache, sha1, 20);
-   }
+   unsigned char sha1[20];
+   anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
+                            MESA_SHADER_GEOMETRY, spec_info,
+                            &key, sizeof(key), sha1);
+   bin = anv_device_search_for_kernel(pipeline->device, cache, sha1, 20);
 
    if (bin == NULL) {
       struct brw_gs_prog_data prog_data = {};
@@ -833,10 +851,12 @@
 
       /* TODO: SIMD8 GS */
       const unsigned code_size = prog_data.base.base.program_size;
-      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
-                                       shader_code, code_size,
-                                       &prog_data.base.base, sizeof(prog_data),
-                                       &map);
+      bin = anv_device_upload_kernel(pipeline->device, cache, sha1, 20,
+                                     shader_code, code_size,
+                                     nir->constant_data,
+                                     nir->constant_data_size,
+                                     &prog_data.base.base, sizeof(prog_data),
+                                     &map);
       if (!bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -862,18 +882,24 @@
       pipeline->device->instance->physicalDevice.compiler;
    struct brw_wm_prog_key key;
    struct anv_shader_bin *bin = NULL;
-   unsigned char sha1[20];
 
-   populate_wm_prog_key(pipeline, info, &key);
+   populate_wm_prog_key(&pipeline->device->info, pipeline->subpass,
+                        info->pMultisampleState, &key);
+
+   /* TODO: we could set this to 0 based on the information in nir_shader, but
+    * we need this before we call spirv_to_nir.
+    */
+   const struct brw_vue_map *vue_map =
+      &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
+   key.input_slots_valid = vue_map->slots_valid;
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
 
-   if (cache) {
-      anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
-                               MESA_SHADER_FRAGMENT, spec_info,
-                               &key, sizeof(key), sha1);
-      bin = anv_pipeline_cache_search(cache, sha1, 20);
-   }
+   unsigned char sha1[20];
+   anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
+                            MESA_SHADER_FRAGMENT, spec_info,
+                            &key, sizeof(key), sha1);
+   bin = anv_device_search_for_kernel(pipeline->device, cache, sha1, 20);
 
    if (bin == NULL) {
       struct brw_wm_prog_data prog_data = {};
@@ -911,8 +937,8 @@
             continue;
 
          const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-         /* Out-of-bounds */
-         if (rt >= key.nr_color_regions)
+         /* Unused or out-of-bounds */
+         if (rt >= MAX_RTS || !(key.color_outputs_valid & (1 << rt)))
             continue;
 
          const unsigned array_len =
@@ -937,13 +963,15 @@
          num_rts++;
       }
 
+      bool deleted_output = false;
       nir_foreach_variable_safe(var, &nir->outputs) {
          if (var->data.location < FRAG_RESULT_DATA0)
             continue;
 
          const unsigned rt = var->data.location - FRAG_RESULT_DATA0;
-         if (rt >= key.nr_color_regions) {
-            /* Out-of-bounds, throw it away */
+         if (rt >= MAX_RTS || !(key.color_outputs_valid & (1 << rt))) {
+            /* Unused or out-of-bounds, throw it away */
+            deleted_output = true;
             var->data.mode = nir_var_local;
             exec_node_remove(&var->node);
             exec_list_push_tail(&impl->locals, &var->node);
@@ -955,6 +983,9 @@
          var->data.location = rt_to_bindings[rt] + FRAG_RESULT_DATA0;
       }
 
+      if (deleted_output)
+         nir_fixup_deref_modes(nir);
+
       if (num_rts == 0) {
          /* If we have no render targets, we need a null render target */
          rt_bindings[0] = (struct anv_pipeline_binding) {
@@ -965,6 +996,12 @@
          num_rts = 1;
       }
 
+      /* Now that we've determined the actual number of render targets, adjust
+       * the key accordingly.
+       */
+      key.nr_color_regions = num_rts;
+      key.color_outputs_valid = (1 << num_rts) - 1;
+
       assert(num_rts <= max_rt);
       map.surface_to_descriptor -= num_rts;
       map.surface_count += num_rts;
@@ -976,17 +1013,19 @@
 
       const unsigned *shader_code =
          brw_compile_fs(compiler, NULL, mem_ctx, &key, &prog_data, nir,
-                        NULL, -1, -1, true, false, NULL, NULL);
+                        NULL, -1, -1, -1, true, false, NULL, NULL);
       if (shader_code == NULL) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       }
 
       unsigned code_size = prog_data.base.program_size;
-      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
-                                       shader_code, code_size,
-                                       &prog_data.base, sizeof(prog_data),
-                                       &map);
+      bin = anv_device_upload_kernel(pipeline->device, cache, sha1, 20,
+                                     shader_code, code_size,
+                                     nir->constant_data,
+                                     nir->constant_data_size,
+                                     &prog_data.base, sizeof(prog_data),
+                                     &map);
       if (!bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -1012,18 +1051,16 @@
       pipeline->device->instance->physicalDevice.compiler;
    struct brw_cs_prog_key key;
    struct anv_shader_bin *bin = NULL;
-   unsigned char sha1[20];
 
    populate_cs_prog_key(&pipeline->device->info, &key);
 
    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
 
-   if (cache) {
-      anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
-                               MESA_SHADER_COMPUTE, spec_info,
-                               &key, sizeof(key), sha1);
-      bin = anv_pipeline_cache_search(cache, sha1, 20);
-   }
+   unsigned char sha1[20];
+   anv_pipeline_hash_shader(pipeline, layout, module, entrypoint,
+                            MESA_SHADER_COMPUTE, spec_info,
+                            &key, sizeof(key), sha1);
+   bin = anv_device_search_for_kernel(pipeline->device, cache, sha1, 20);
 
    if (bin == NULL) {
       struct brw_cs_prog_data prog_data = {};
@@ -1059,10 +1096,12 @@
       }
 
       const unsigned code_size = prog_data.base.program_size;
-      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
-                                       shader_code, code_size,
-                                       &prog_data.base, sizeof(prog_data),
-                                       &map);
+      bin = anv_device_upload_kernel(pipeline->device, cache, sha1, 20,
+                                     shader_code, code_size,
+                                     nir->constant_data,
+                                     nir->constant_data_size,
+                                     &prog_data.base, sizeof(prog_data),
+                                     &map);
       if (!bin) {
          ralloc_free(mem_ctx);
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
@@ -1181,7 +1220,7 @@
     *    against does not use a depth/stencil attachment.
     */
    if (!pCreateInfo->pRasterizationState->rasterizerDiscardEnable &&
-       subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+       subpass->depth_stencil_attachment) {
       assert(pCreateInfo->pDepthStencilState);
 
       if (states & (1 << VK_DYNAMIC_STATE_DEPTH_BOUNDS)) {
@@ -1242,11 +1281,21 @@
       assert(info->pViewportState);
       assert(info->pMultisampleState);
 
-      if (subpass && subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED)
+      if (subpass && subpass->depth_stencil_attachment)
          assert(info->pDepthStencilState);
 
-      if (subpass && subpass->color_count > 0)
-         assert(info->pColorBlendState);
+      if (subpass && subpass->color_count > 0) {
+         bool all_color_unused = true;
+         for (int i = 0; i < subpass->color_count; i++) {
+            if (subpass->color_attachments[i].attachment != VK_ATTACHMENT_UNUSED)
+               all_color_unused = false;
+         }
+         /* pColorBlendState is ignored if the pipeline has rasterization
+          * disabled or if the subpass of the render pass the pipeline is
+          * created against does not use any color attachments.
+          */
+         assert(info->pColorBlendState || all_color_unused);
+      }
    }
 
    for (uint32_t i = 0; i < info->stageCount; ++i) {
@@ -1404,7 +1453,7 @@
       const VkVertexInputBindingDescription *desc =
          &vi_info->pVertexBindingDescriptions[i];
 
-      pipeline->binding_stride[desc->binding] = desc->stride;
+      pipeline->vb[desc->binding].stride = desc->stride;
 
       /* Step rate is programmed per vertex element (attribute), not
        * binding. Set up a map of which bindings step per instance, for
@@ -1412,12 +1461,39 @@
       switch (desc->inputRate) {
       default:
       case VK_VERTEX_INPUT_RATE_VERTEX:
-         pipeline->instancing_enable[desc->binding] = false;
+         pipeline->vb[desc->binding].instanced = false;
          break;
       case VK_VERTEX_INPUT_RATE_INSTANCE:
-         pipeline->instancing_enable[desc->binding] = true;
+         pipeline->vb[desc->binding].instanced = true;
          break;
       }
+
+      pipeline->vb[desc->binding].instance_divisor = 1;
+   }
+
+   const VkPipelineVertexInputDivisorStateCreateInfoEXT *vi_div_state =
+      vk_find_struct_const(vi_info->pNext,
+                           PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
+   if (vi_div_state) {
+      for (uint32_t i = 0; i < vi_div_state->vertexBindingDivisorCount; i++) {
+         const VkVertexInputBindingDivisorDescriptionEXT *desc =
+            &vi_div_state->pVertexBindingDivisors[i];
+
+         pipeline->vb[desc->binding].instance_divisor = desc->divisor;
+      }
+   }
+
+   /* Our implementation of VK_KHR_multiview uses instancing to draw the
+    * different views.  If the client asks for instancing, we need to multiply
+    * the instance divisor by the number of views to ensure that we repeat
+    * client's per-instance data once for each view.
+    */
+   if (pipeline->subpass->view_mask) {
+      const uint32_t view_count = anv_subpass_view_count(pipeline->subpass);
+      for (uint32_t vb = 0; vb < MAX_VBS; vb++) {
+         if (pipeline->vb[vb].instanced)
+            pipeline->vb[vb].instance_divisor *= view_count;
+      }
    }
 
    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index 82551e9..3efa427 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -24,12 +24,15 @@
 #include "compiler/blob.h"
 #include "util/hash_table.h"
 #include "util/debug.h"
+#include "util/disk_cache.h"
+#include "util/mesa-sha1.h"
 #include "anv_private.h"
 
 struct anv_shader_bin *
 anv_shader_bin_create(struct anv_device *device,
                       const void *key_data, uint32_t key_size,
                       const void *kernel_data, uint32_t kernel_size,
+                      const void *constant_data, uint32_t constant_data_size,
                       const struct brw_stage_prog_data *prog_data_in,
                       uint32_t prog_data_size, const void *prog_data_param_in,
                       const struct anv_pipeline_bind_map *bind_map)
@@ -65,6 +68,16 @@
    memcpy(shader->kernel.map, kernel_data, kernel_size);
    shader->kernel_size = kernel_size;
 
+   if (constant_data_size) {
+      shader->constant_data =
+         anv_state_pool_alloc(&device->dynamic_state_pool,
+                              constant_data_size, 32);
+      memcpy(shader->constant_data.map, constant_data, constant_data_size);
+   } else {
+      shader->constant_data = ANV_STATE_NULL;
+   }
+   shader->constant_data_size = constant_data_size;
+
    memcpy(prog_data, prog_data_in, prog_data_size);
    memcpy(prog_data_param, prog_data_param_in,
           prog_data->nr_params * sizeof(*prog_data_param));
@@ -89,6 +102,7 @@
 {
    assert(shader->ref_cnt == 0);
    anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+   anv_state_pool_free(&device->dynamic_state_pool, shader->constant_data);
    vk_free(&device->alloc, shader);
 }
 
@@ -104,6 +118,10 @@
    ok = blob_write_uint32(blob, shader->kernel_size);
    ok = blob_write_bytes(blob, shader->kernel.map, shader->kernel_size);
 
+   ok = blob_write_uint32(blob, shader->constant_data_size);
+   ok = blob_write_bytes(blob, shader->constant_data.map,
+                         shader->constant_data_size);
+
    ok = blob_write_uint32(blob, shader->prog_data_size);
    ok = blob_write_bytes(blob, shader->prog_data, shader->prog_data_size);
    ok = blob_write_bytes(blob, shader->prog_data->param,
@@ -133,6 +151,9 @@
    uint32_t kernel_size = blob_read_uint32(blob);
    const void *kernel_data = blob_read_bytes(blob, kernel_size);
 
+   uint32_t constant_data_size = blob_read_uint32(blob);
+   const void *constant_data = blob_read_bytes(blob, constant_data_size);
+
    uint32_t prog_data_size = blob_read_uint32(blob);
    const struct brw_stage_prog_data *prog_data =
       blob_read_bytes(blob, prog_data_size);
@@ -158,6 +179,7 @@
    return anv_shader_bin_create(device,
                                 key_data, key_size,
                                 kernel_data, kernel_size,
+                                constant_data, constant_data_size,
                                 prog_data, prog_data_size, prog_data_param,
                                 &bind_map);
 }
@@ -260,14 +282,36 @@
    return shader;
 }
 
+static void
+anv_pipeline_cache_add_shader_bin(struct anv_pipeline_cache *cache,
+                                  struct anv_shader_bin *bin)
+{
+   if (!cache->cache) /* no hash table to insert into: nothing to do */
+      return;
+
+   pthread_mutex_lock(&cache->mutex); /* guards the lookup + insert below */
+
+   struct hash_entry *entry = _mesa_hash_table_search(cache->cache, bin->key);
+   if (entry == NULL) {
+      /* Key not present yet: the table keeps its own reference to bin */
+      anv_shader_bin_ref(bin);
+      _mesa_hash_table_insert(cache->cache, bin->key, bin);
+   } /* an existing entry for this key is left untouched */
+
+   pthread_mutex_unlock(&cache->mutex);
+}
+
 static struct anv_shader_bin *
-anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache,
-                              const void *key_data, uint32_t key_size,
-                              const void *kernel_data, uint32_t kernel_size,
-                              const struct brw_stage_prog_data *prog_data,
-                              uint32_t prog_data_size,
-                              const void *prog_data_param,
-                              const struct anv_pipeline_bind_map *bind_map)
+anv_pipeline_cache_add_shader_locked(struct anv_pipeline_cache *cache,
+                                     const void *key_data, uint32_t key_size,
+                                     const void *kernel_data,
+                                     uint32_t kernel_size,
+                                     const void *constant_data,
+                                     uint32_t constant_data_size,
+                                     const struct brw_stage_prog_data *prog_data,
+                                     uint32_t prog_data_size,
+                                     const void *prog_data_param,
+                                     const struct anv_pipeline_bind_map *bind_map)
 {
    struct anv_shader_bin *shader =
       anv_pipeline_cache_search_locked(cache, key_data, key_size);
@@ -277,6 +321,7 @@
    struct anv_shader_bin *bin =
       anv_shader_bin_create(cache->device, key_data, key_size,
                             kernel_data, kernel_size,
+                            constant_data, constant_data_size,
                             prog_data, prog_data_size, prog_data_param,
                             bind_map);
    if (!bin)
@@ -291,6 +336,8 @@
 anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                  const void *key_data, uint32_t key_size,
                                  const void *kernel_data, uint32_t kernel_size,
+                                 const void *constant_data,
+                                 uint32_t constant_data_size,
                                  const struct brw_stage_prog_data *prog_data,
                                  uint32_t prog_data_size,
                                  const struct anv_pipeline_bind_map *bind_map)
@@ -299,10 +346,11 @@
       pthread_mutex_lock(&cache->mutex);
 
       struct anv_shader_bin *bin =
-         anv_pipeline_cache_add_shader(cache, key_data, key_size,
-                                       kernel_data, kernel_size,
-                                       prog_data, prog_data_size,
-                                       prog_data->param, bind_map);
+         anv_pipeline_cache_add_shader_locked(cache, key_data, key_size,
+                                              kernel_data, kernel_size,
+                                              constant_data, constant_data_size,
+                                              prog_data, prog_data_size,
+                                              prog_data->param, bind_map);
 
       pthread_mutex_unlock(&cache->mutex);
 
@@ -315,6 +363,7 @@
       /* In this case, we're not caching it so the caller owns it entirely */
       return anv_shader_bin_create(cache->device, key_data, key_size,
                                    kernel_data, kernel_size,
+                                   constant_data, constant_data_size,
                                    prog_data, prog_data_size,
                                    prog_data->param, bind_map);
    }
@@ -367,15 +416,6 @@
    }
 }
 
-static bool
-pipeline_cache_enabled()
-{
-   static int enabled = -1;
-   if (enabled < 0)
-      enabled = env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
-   return enabled;
-}
-
 VkResult anv_CreatePipelineCache(
     VkDevice                                    _device,
     const VkPipelineCacheCreateInfo*            pCreateInfo,
@@ -394,7 +434,8 @@
    if (cache == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   anv_pipeline_cache_init(cache, device, pipeline_cache_enabled());
+   anv_pipeline_cache_init(cache, device,
+                           device->instance->pipeline_cache_enabled);
 
    if (pCreateInfo->initialDataSize > 0)
       anv_pipeline_cache_load(cache,
@@ -514,3 +555,92 @@
 
    return VK_SUCCESS;
 }
+
+struct anv_shader_bin *
+anv_device_search_for_kernel(struct anv_device *device,
+                             struct anv_pipeline_cache *cache,
+                             const void *key_data, uint32_t key_size)
+{
+   struct anv_shader_bin *bin;
+   /* First try the pipeline cache object, if the caller passed one. */
+   if (cache) {
+      bin = anv_pipeline_cache_search(cache, key_data, key_size);
+      if (bin)
+         return bin;
+   }
+   /* Otherwise try the physical device's disk cache (if compiled in). */
+#ifdef ENABLE_SHADER_CACHE
+   struct disk_cache *disk_cache = device->instance->physicalDevice.disk_cache;
+   if (disk_cache && device->instance->pipeline_cache_enabled) {
+      cache_key cache_key;
+      disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
+
+      size_t buffer_size;
+      uint8_t *buffer = disk_cache_get(disk_cache, cache_key, &buffer_size);
+      if (buffer) {
+         struct blob_reader blob;
+         blob_reader_init(&blob, buffer, buffer_size);
+         bin = anv_shader_bin_create_from_blob(device, &blob);
+         free(buffer);
+         /* Disk hit: also add the rebuilt bin to the pipeline cache. */
+         if (bin) {
+            if (cache)
+               anv_pipeline_cache_add_shader_bin(cache, bin);
+            return bin;
+         }
+      }
+   }
+#endif
+   /* Not found in either cache. */
+   return NULL;
+}
+
+struct anv_shader_bin *
+anv_device_upload_kernel(struct anv_device *device,
+                         struct anv_pipeline_cache *cache,
+                         const void *key_data, uint32_t key_size,
+                         const void *kernel_data, uint32_t kernel_size,
+                         const void *constant_data,
+                         uint32_t constant_data_size,
+                         const struct brw_stage_prog_data *prog_data,
+                         uint32_t prog_data_size,
+                         const struct anv_pipeline_bind_map *bind_map)
+{
+   struct anv_shader_bin *bin;
+   if (cache) {
+      bin = anv_pipeline_cache_upload_kernel(cache, key_data, key_size,
+                                             kernel_data, kernel_size,
+                                             constant_data, constant_data_size,
+                                             prog_data, prog_data_size,
+                                             bind_map);
+   } else { /* no pipeline cache given: create the shader bin directly */
+      bin = anv_shader_bin_create(device, key_data, key_size,
+                                  kernel_data, kernel_size,
+                                  constant_data, constant_data_size,
+                                  prog_data, prog_data_size,
+                                  prog_data->param, bind_map);
+   }
+
+   if (bin == NULL)
+      return NULL;
+   /* Also store the serialized bin in the disk cache (if compiled in). */
+#ifdef ENABLE_SHADER_CACHE
+   struct disk_cache *disk_cache = device->instance->physicalDevice.disk_cache;
+   if (disk_cache) { /* NOTE(review): lookup also checks pipeline_cache_enabled; confirm */
+      struct blob binary;
+      blob_init(&binary);
+      anv_shader_bin_write_to_blob(bin, &binary);
+      /* Only store the blob if serializing it did not run out of memory. */
+      if (!binary.out_of_memory) {
+         cache_key cache_key;
+         disk_cache_compute_key(disk_cache, key_data, key_size, cache_key);
+
+         disk_cache_put(disk_cache, cache_key, binary.data, binary.size, NULL);
+      }
+
+      blob_finish(&binary);
+   }
+#endif
+
+   return bin;
+}
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index b886c4b..f5ec764 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -45,13 +45,17 @@
 #endif
 
 #include "common/gen_clflush.h"
+#include "common/gen_gem.h"
 #include "dev/gen_device_info.h"
 #include "blorp/blorp.h"
 #include "compiler/brw_compiler.h"
 #include "util/macros.h"
+#include "util/hash_table.h"
 #include "util/list.h"
+#include "util/set.h"
 #include "util/u_atomic.h"
 #include "util/u_vector.h"
+#include "util/vma.h"
 #include "vk_alloc.h"
 #include "vk_debug_report.h"
 
@@ -86,6 +90,55 @@
 #include "common/intel_log.h"
 #include "wsi_common.h"
 
+/* anv Virtual Memory Layout
+ * =========================
+ *
+ * When the anv driver is determining the virtual graphics addresses of memory
+ * objects itself using the softpin mechanism, the following memory ranges
+ * will be used.
+ *
+ * Three special considerations to notice:
+ *
+ * (1) the dynamic state pool is located within the same 4 GiB as the low
+ * heap. This is to work around a VF cache issue described in a comment in
+ * anv_physical_device_init_heaps.
+ *
+ * (2) the binding table pool is located at lower addresses than the surface
+ * state pool, within a 4 GiB range. This allows surface state base addresses
+ * to cover both binding tables (16 bit offsets) and surface states (32 bit
+ * offsets).
+ *
+ * (3) the last 4 GiB of the address space is withheld from the high
+ * heap. Various hardware units will read past the end of an object for
+ * various reasons. This healthy margin prevents reads from wrapping around
+ * 48-bit addresses.
+ */
+#define LOW_HEAP_MIN_ADDRESS               0x000000001000ULL /* 4 KiB */
+#define LOW_HEAP_MAX_ADDRESS               0x0000bfffffffULL
+#define DYNAMIC_STATE_POOL_MIN_ADDRESS     0x0000c0000000ULL /* 3 GiB */
+#define DYNAMIC_STATE_POOL_MAX_ADDRESS     0x0000ffffffffULL
+#define BINDING_TABLE_POOL_MIN_ADDRESS     0x000100000000ULL /* 4 GiB */
+#define BINDING_TABLE_POOL_MAX_ADDRESS     0x00013fffffffULL
+#define SURFACE_STATE_POOL_MIN_ADDRESS     0x000140000000ULL /* 5 GiB */
+#define SURFACE_STATE_POOL_MAX_ADDRESS     0x00017fffffffULL
+#define INSTRUCTION_STATE_POOL_MIN_ADDRESS 0x000180000000ULL /* 6 GiB */
+#define INSTRUCTION_STATE_POOL_MAX_ADDRESS 0x0001bfffffffULL
+#define HIGH_HEAP_MIN_ADDRESS              0x0001c0000000ULL /* 7 GiB */
+#define HIGH_HEAP_MAX_ADDRESS              0xfffeffffffffULL
+
+#define LOW_HEAP_SIZE               \
+   (LOW_HEAP_MAX_ADDRESS - LOW_HEAP_MIN_ADDRESS + 1)
+#define HIGH_HEAP_SIZE              \
+   (HIGH_HEAP_MAX_ADDRESS - HIGH_HEAP_MIN_ADDRESS + 1)
+#define DYNAMIC_STATE_POOL_SIZE     \
+   (DYNAMIC_STATE_POOL_MAX_ADDRESS - DYNAMIC_STATE_POOL_MIN_ADDRESS + 1)
+#define BINDING_TABLE_POOL_SIZE     \
+   (BINDING_TABLE_POOL_MAX_ADDRESS - BINDING_TABLE_POOL_MIN_ADDRESS + 1)
+#define SURFACE_STATE_POOL_SIZE     \
+   (SURFACE_STATE_POOL_MAX_ADDRESS - SURFACE_STATE_POOL_MIN_ADDRESS + 1)
+#define INSTRUCTION_STATE_POOL_SIZE \
+   (INSTRUCTION_STATE_POOL_MAX_ADDRESS - INSTRUCTION_STATE_POOL_MIN_ADDRESS + 1)
+
 /* Allowing different clear colors requires us to perform a depth resolve at
  * the end of certain render passes. This is because while slow clears store
  * the clear color in the HiZ buffer, fast clears (without a resolve) don't.
@@ -510,6 +563,10 @@
 typedef uintptr_t anv_buffer_handle_t;
 typedef uintptr_t anv_syncobj_handle_t;
 
+/* Extra ANV-defined BO flags which won't be passed to the kernel */
+#define ANV_BO_EXTERNAL    (1ull << 31)
+#define ANV_BO_FLAG_MASK   (1ull << 31)
+
 struct anv_bo {
    anv_buffer_handle_t gem_handle;
 
@@ -582,6 +639,12 @@
 
    struct anv_bo bo;
 
+   /* The address where the start of the pool is pinned. The various bos that
+    * are created as the pool grows will have addresses in the range
+    * [start_address, start_address + BLOCK_POOL_MEMFD_SIZE).
+    */
+   uint64_t start_address;
+
    /* The offset from the start of the bo to the "center" of the block
     * pool.  Pointers to allocated blocks are given by
     * bo.map + center_bo_offset + offsets.
@@ -677,6 +740,7 @@
  */
 VkResult anv_block_pool_init(struct anv_block_pool *pool,
                              struct anv_device *device,
+                             uint64_t start_address,
                              uint32_t initial_size,
                              uint64_t bo_flags);
 void anv_block_pool_finish(struct anv_block_pool *pool);
@@ -687,6 +751,7 @@
 
 VkResult anv_state_pool_init(struct anv_state_pool *pool,
                              struct anv_device *device,
+                             uint64_t start_address,
                              uint32_t block_size,
                              uint64_t bo_flags);
 void anv_state_pool_finish(struct anv_state_pool *pool);
@@ -749,12 +814,14 @@
 void anv_bo_cache_finish(struct anv_bo_cache *cache);
 VkResult anv_bo_cache_alloc(struct anv_device *device,
                             struct anv_bo_cache *cache,
-                            uint64_t size, struct anv_bo **bo);
+                            uint64_t size, uint64_t bo_flags,
+                            struct anv_bo **bo);
 VkResult anv_bo_cache_import(struct anv_device *device,
                              struct anv_bo_cache *cache,
-                             int fd, struct anv_bo **bo);
+                             int fd, uint64_t bo_flags,
+                             struct anv_bo **bo);
 VkResult anv_bo_cache_import_buffer_handle(struct anv_device* device, struct anv_bo_cache* cache,
-                                           anv_buffer_handle_t gem_handle,
+                                           anv_buffer_handle_t gem_handle, uint64_t bo_flags,
                                            uint64_t import_size, struct anv_bo** bo_out);
 VkResult anv_bo_cache_export(struct anv_device *device,
                              struct anv_bo_cache *cache,
@@ -808,6 +875,8 @@
     bool                                        has_syncobj;
     bool                                        has_syncobj_wait;
     bool                                        has_context_priority;
+    bool                                        use_softpin;
+    bool                                        has_context_isolation;
 
     struct anv_device_extension_table           supported_extensions;
 
@@ -821,12 +890,24 @@
       struct anv_memory_heap                    heaps[VK_MAX_MEMORY_HEAPS];
     } memory;
 
+    uint8_t                                     driver_build_sha1[20];
     uint8_t                                     pipeline_cache_uuid[VK_UUID_SIZE];
     uint8_t                                     driver_uuid[VK_UUID_SIZE];
     uint8_t                                     device_uuid[VK_UUID_SIZE];
 
+    struct disk_cache *                         disk_cache;
+
     struct wsi_device                       wsi_device;
     int                                         local_fd;
+    int                                         master_fd;
+};
+
+struct anv_app_info {
+   const char*        app_name;
+   uint32_t           app_version;
+   const char*        engine_name;
+   uint32_t           engine_version;
+   uint32_t           api_version;
 };
 
 struct anv_instance {
@@ -834,13 +915,16 @@
 
     VkAllocationCallbacks                       alloc;
 
-    uint32_t                                    apiVersion;
+    struct anv_app_info                         app_info;
+
     struct anv_instance_extension_table         enabled_extensions;
     struct anv_dispatch_table                   dispatch;
 
     int                                         physicalDeviceCount;
     struct anv_physical_device                  physicalDevice;
 
+    bool                                        pipeline_cache_enabled;
+
     struct vk_debug_report_instance             debug_report_callbacks;
 };
 
@@ -856,8 +940,6 @@
 
     struct anv_device *                         device;
 
-    struct anv_state_pool *                     pool;
-
     VkDeviceQueueCreateFlags                    flags;
 };
 
@@ -882,10 +964,28 @@
 anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
                                  const void *key_data, uint32_t key_size,
                                  const void *kernel_data, uint32_t kernel_size,
+                                 const void *constant_data,
+                                 uint32_t constant_data_size,
                                  const struct brw_stage_prog_data *prog_data,
                                  uint32_t prog_data_size,
                                  const struct anv_pipeline_bind_map *bind_map);
 
+struct anv_shader_bin *
+anv_device_search_for_kernel(struct anv_device *device,
+                             struct anv_pipeline_cache *cache,
+                             const void *key_data, uint32_t key_size);
+
+struct anv_shader_bin *
+anv_device_upload_kernel(struct anv_device *device,
+                         struct anv_pipeline_cache *cache,
+                         const void *key_data, uint32_t key_size,
+                         const void *kernel_data, uint32_t kernel_size,
+                         const void *constant_data,
+                         uint32_t constant_data_size,
+                         const struct brw_stage_prog_data *prog_data,
+                         uint32_t prog_data_size,
+                         const struct anv_pipeline_bind_map *bind_map);
+
 /* May be extended by the anv_gem implementation */
 struct anv_connection {
    uint32_t unused;
@@ -908,19 +1008,26 @@
     struct anv_device_extension_table           enabled_extensions;
     struct anv_dispatch_table                   dispatch;
 
+    pthread_mutex_t                             vma_mutex;
+    struct util_vma_heap                        vma_lo;
+    struct util_vma_heap                        vma_hi;
+    uint64_t                                    vma_lo_available;
+    uint64_t                                    vma_hi_available;
+
     struct anv_bo_pool                          batch_bo_pool;
 
     struct anv_bo_cache                         bo_cache;
 
     struct anv_state_pool                       dynamic_state_pool;
     struct anv_state_pool                       instruction_state_pool;
+    struct anv_state_pool                       binding_table_pool;
     struct anv_state_pool                       surface_state_pool;
 
     struct anv_bo                               workaround_bo;
     struct anv_bo                               trivial_batch_bo;
     struct anv_bo                               hiz_clear_bo;
 
-    struct anv_pipeline_cache                   blorp_shader_cache;
+    struct anv_pipeline_cache                   default_pipeline_cache;
     struct blorp_context                        blorp;
 
     struct anv_state                            border_colors;
@@ -930,6 +1037,7 @@
     struct anv_scratch_pool                     scratch_pool;
 
     uint32_t                                    default_mocs;
+    uint32_t                                    external_mocs;
     uint32_t                                    uncached_mocs;
 
     pthread_mutex_t                             mutex;
@@ -939,6 +1047,38 @@
     struct anv_connection* connection;
 };
 
+/* Pool that backs binding tables: the dedicated pool when softpin is in use,
+ * otherwise the surface state pool. */
+static inline struct anv_state_pool *
+anv_binding_table_pool(struct anv_device *device)
+{
+   const bool softpin = device->instance->physicalDevice.use_softpin;
+   return softpin ? &device->binding_table_pool : &device->surface_state_pool;
+}
+
+/* Allocate binding table state from the pool matching the softpin mode. */
+static inline struct anv_state
+anv_binding_table_pool_alloc(struct anv_device *device) {
+   struct anv_state_pool *bt_pool = &device->binding_table_pool;
+   if (!device->instance->physicalDevice.use_softpin)
+      return anv_state_pool_alloc_back(&device->surface_state_pool);
+   return anv_state_pool_alloc(bt_pool, bt_pool->block_size, 0);
+}
+
+static inline void /* frees into the pool anv_binding_table_pool() selects */
+anv_binding_table_pool_free(struct anv_device *device, struct anv_state state) {
+   anv_state_pool_free(anv_binding_table_pool(device), state);
+}
+
+/* MOCS value for a BO: external_mocs if it carries ANV_BO_EXTERNAL, else
+ * default_mocs. */
+static inline uint32_t
+anv_mocs_for_bo(const struct anv_device *device, const struct anv_bo *bo)
+{
+   const bool external = (bo->flags & ANV_BO_EXTERNAL) != 0;
+   return external ? device->external_mocs : device->default_mocs;
+}
+
 static void inline
 anv_state_flush(struct anv_device *device, struct anv_state state)
 {
@@ -987,7 +1127,6 @@
 int anv_gem_get_tiling(struct anv_device *device, uint32_t gem_handle);
 bool anv_gem_get_bit6_swizzle(int fd, uint32_t tiling);
 int anv_gem_get_aperture(int fd, uint64_t *size);
-bool anv_gem_supports_48b_addresses(int fd);
 int anv_gem_gpu_get_reset_stats(struct anv_device *device,
                                 uint32_t *active, uint32_t *pending);
 int anv_gem_handle_to_fd(struct anv_device *device, anv_buffer_handle_t gem_handle);
@@ -1008,12 +1147,15 @@
 bool anv_gem_supports_syncobj_wait(int fd);
 int anv_gem_syncobj_wait(struct anv_device *device,
                          anv_syncobj_handle_t *handles, uint32_t num_handles,
-                         int64_t abs_timeout_ns, bool wait_all, uint64_t timeout_ns);
+                         int64_t abs_timeout_ns, bool wait_all);
 
 int anv_platform_futex_wake(uint32_t *addr, int count);
 int anv_platform_futex_wait(uint32_t *addr, int32_t value);
 int anv_gem_import_fuchsia_buffer(struct anv_device *device, uint32_t handle, anv_buffer_handle_t* buffer_out, uint64_t* size_out);
 
+bool anv_vma_alloc(struct anv_device *device, struct anv_bo *bo);
+void anv_vma_free(struct anv_device *device, struct anv_bo *bo);
+
 VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size);
 
 struct anv_reloc_list {
@@ -1021,6 +1163,7 @@
    uint32_t                                     array_length;
    struct drm_i915_gem_relocation_entry *       relocs;
    struct anv_bo **                             reloc_bos;
+   struct set *                                 deps;
 };
 
 VkResult anv_reloc_list_init(struct anv_reloc_list *list,
@@ -1097,6 +1240,46 @@
    uint32_t offset;
 };
 
+#define ANV_NULL_ADDRESS ((struct anv_address) { NULL, 0 })
+
+/* True when addr has neither a BO nor an offset (cf. ANV_NULL_ADDRESS). */
+static inline bool anv_address_is_null(struct anv_address addr)
+{
+   return !addr.bo && !addr.offset;
+}
+
+/* Canonical address of addr; adds the BO offset only for pinned BOs. */
+static inline uint64_t anv_address_physical(struct anv_address addr)
+{
+   uint64_t raw = addr.offset;
+   if (addr.bo != NULL && (addr.bo->flags & EXEC_OBJECT_PINNED) != 0)
+      raw += addr.bo->offset;
+   return gen_canonical_address(raw);
+}
+
+static inline struct anv_address
+anv_address_add(struct anv_address addr, uint64_t offset)
+{
+   addr.offset += offset; /* addr is passed by value, so this edits a copy */
+   return addr;
+}
+
+/* Store address v at p: canonical 64-bit on gen8+, 32-bit before. When flush
+ * is set and the device has no LLC, call gen_flush_range on what was written. */
+static inline void
+write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush)
+{
+   const bool wide = device->info.gen >= 8;
+   const unsigned written = wide ? sizeof(uint64_t) : sizeof(uint32_t);
+   if (wide)
+      *(uint64_t *)p = gen_canonical_address(v);
+   else
+      *(uint32_t *)p = v;
+
+   if (flush && !device->info.has_llc)
+      gen_flush_range(p, written);
+}
+
 static inline uint64_t
 _anv_combine_address(struct anv_batch *batch, void *location,
                      const struct anv_address address, uint32_t delta)
@@ -1186,6 +1369,12 @@
       .AgeforQUADLRU = 0                                       \
    }
 
+#define GEN8_EXTERNAL_MOCS (struct GEN8_MEMORY_OBJECT_CONTROL_STATE) {     \
+      .MemoryTypeLLCeLLCCacheabilityControl = UCwithFenceifcoherentcycle,  \
+      .TargetCache = L3DefertoPATforLLCeLLCselection,                      \
+      .AgeforQUADLRU = 0                                                   \
+   }
+
 /* Skylake: MOCS is now an index into an array of 62 different caching
  * configurations programmed by the kernel.
  */
@@ -1195,9 +1384,9 @@
       .IndextoMOCSTables                           = 2         \
    }
 
-#define GEN9_MOCS_PTE {                                 \
-      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
-      .IndextoMOCSTables                           = 1  \
+#define GEN9_EXTERNAL_MOCS (struct GEN9_MEMORY_OBJECT_CONTROL_STATE) {  \
+      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */                       \
+      .IndextoMOCSTables                           = 1                  \
    }
 
 /* Cannonlake MOCS defines are duplicates of Skylake MOCS defines. */
@@ -1206,9 +1395,9 @@
       .IndextoMOCSTables                           = 2         \
    }
 
-#define GEN10_MOCS_PTE {                                 \
-      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
-      .IndextoMOCSTables                           = 1  \
+#define GEN10_EXTERNAL_MOCS (struct GEN10_MEMORY_OBJECT_CONTROL_STATE) {   \
+      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */                          \
+      .IndextoMOCSTables                           = 1                     \
    }
 
 /* Ice Lake MOCS defines are duplicates of Skylake MOCS defines. */
@@ -1217,9 +1406,9 @@
       .IndextoMOCSTables                           = 2         \
    }
 
-#define GEN11_MOCS_PTE {                                 \
-      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */       \
-      .IndextoMOCSTables                           = 1  \
+#define GEN11_EXTERNAL_MOCS (struct GEN11_MEMORY_OBJECT_CONTROL_STATE) {   \
+      /* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */                          \
+      .IndextoMOCSTables                           = 1                     \
    }
 
 struct anv_device_memory {
@@ -1341,10 +1530,10 @@
 
 struct anv_buffer_view {
    enum isl_format format; /**< VkBufferViewCreateInfo::format */
-   struct anv_bo *bo;
-   uint32_t offset; /**< Offset into bo. */
    uint64_t range; /**< VkBufferViewCreateInfo::range */
 
+   struct anv_address address;
+
    struct anv_state surface_state;
    struct anv_state storage_surface_state;
    struct anv_state writeonly_storage_surface_state;
@@ -1464,6 +1653,7 @@
                            struct anv_descriptor_pool *pool,
                            struct anv_descriptor_set *set);
 
+#define ANV_DESCRIPTOR_SET_SHADER_CONSTANTS (UINT8_MAX - 1)
 #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS UINT8_MAX
 
 struct anv_pipeline_binding {
@@ -1511,8 +1701,7 @@
    VkBufferUsageFlags                           usage;
 
    /* Set when bound */
-   struct anv_bo *                              bo;
-   VkDeviceSize                                 offset;
+   struct anv_address                           address;
 };
 
 static inline uint64_t
@@ -1740,20 +1929,20 @@
     *
     * This address is relative to the start of the BO.
     */
-   uint64_t address;
+   struct anv_address address;
    /* Address of the aux surface, if any
     *
-    * This field is 0 if and only if no aux surface exists.
+    * This field is ANV_NULL_ADDRESS if and only if no aux surface exists.
     *
-    * This address is relative to the start of the BO.  With the exception of
-    * gen8, the bottom 12 bits of this address include extra aux information.
+    * With the exception of gen8, the bottom 12 bits of this address' offset
+    * include extra aux information.
     */
-   uint64_t aux_address;
+   struct anv_address aux_address;
    /* Address of the clear color, if any
     *
     * This address is relative to the start of the BO.
     */
-   uint64_t clear_address;
+   struct anv_address clear_address;
 };
 
 /**
@@ -2040,6 +2229,7 @@
    ANV_FENCE_TYPE_NONE = 0,
    ANV_FENCE_TYPE_BO,
    ANV_FENCE_TYPE_SYNCOBJ,
+   ANV_FENCE_TYPE_WSI,
 };
 
 enum anv_bo_fence_state {
@@ -2074,6 +2264,9 @@
 
       /** DRM syncobj handle for syncobj-based fences */
       anv_syncobj_handle_t syncobj;
+
+      /** WSI fence */
+      struct wsi_fence *fence_wsi;
    };
 };
 
@@ -2210,6 +2403,9 @@
    struct anv_state kernel;
    uint32_t kernel_size;
 
+   struct anv_state constant_data;
+   uint32_t constant_data_size;
+
    const struct brw_stage_prog_data *prog_data;
    uint32_t prog_data_size;
 
@@ -2220,6 +2416,7 @@
 anv_shader_bin_create(struct anv_device *device,
                       const void *key, uint32_t key_size,
                       const void *kernel, uint32_t kernel_size,
+                      const void *constant_data, uint32_t constant_data_size,
                       const struct brw_stage_prog_data *prog_data,
                       uint32_t prog_data_size, const void *prog_data_param,
                       const struct anv_pipeline_bind_map *bind_map);
@@ -2265,8 +2462,12 @@
    struct anv_state                             blend_state;
 
    uint32_t                                     vb_used;
-   uint32_t                                     binding_stride[MAX_VBS];
-   bool                                         instancing_enable[MAX_VBS];
+   struct anv_pipeline_vertex_binding {
+      uint32_t                                  stride;
+      bool                                      instanced;
+      uint32_t                                  instance_divisor;
+   } vb[MAX_VBS];
+
    bool                                         primitive_restart;
    uint32_t                                     topology;
 
@@ -2601,8 +2802,7 @@
       /**
        * BO associated with this plane, set when bound.
        */
-      struct anv_bo *bo;
-      VkDeviceSize bo_offset;
+      struct anv_address address;
 
       /**
        * When destroying the image, also free the bo.
@@ -2662,11 +2862,8 @@
    assert(image->aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
 
    uint32_t plane = anv_image_aspect_to_plane(image->aspects, aspect);
-   return (struct anv_address) {
-      .bo = image->planes[plane].bo,
-      .offset = image->planes[plane].bo_offset +
-                image->planes[plane].fast_clear_state_offset,
-   };
+   return anv_address_add(image->planes[plane].address,
+                          image->planes[plane].fast_clear_state_offset);
 }
 
 static inline struct anv_address
@@ -2963,8 +3160,8 @@
 void anv_fill_buffer_surface_state(struct anv_device *device,
                                    struct anv_state state,
                                    enum isl_format format,
-                                   uint32_t offset, uint32_t range,
-                                   uint32_t stride);
+                                   struct anv_address address,
+                                   uint32_t range, uint32_t stride);
 
 static inline void
 anv_clear_color_from_att_state(union isl_color_value *clear_color,
@@ -3032,7 +3229,7 @@
    struct anv_subpass_attachment *              color_attachments;
    struct anv_subpass_attachment *              resolve_attachments;
 
-   struct anv_subpass_attachment                depth_stencil_attachment;
+   struct anv_subpass_attachment *              depth_stencil_attachment;
 
    uint32_t                                     view_mask;
 
diff --git a/src/intel/vulkan/anv_queue.c b/src/intel/vulkan/anv_queue.c
index fdbe1cad..7284c18 100644
--- a/src/intel/vulkan/anv_queue.c
+++ b/src/intel/vulkan/anv_queue.c
@@ -49,8 +49,11 @@
 
    struct drm_i915_gem_exec_object2 *objects =
       (void *)(uintptr_t)execbuf->buffers_ptr;
-   for (uint32_t k = 0; k < execbuf->buffer_count; k++)
+   for (uint32_t k = 0; k < execbuf->buffer_count; k++) {
+      if (execbuf_bos[k]->flags & EXEC_OBJECT_PINNED)
+         assert(execbuf_bos[k]->offset == objects[k].offset);
       execbuf_bos[k]->offset = objects[k].offset;
+   }
 
    return VK_SUCCESS;
 }
@@ -81,7 +84,7 @@
    exec2_objects[0].relocs_ptr = 0;
    exec2_objects[0].alignment = 0;
    exec2_objects[0].offset = bo.offset;
-   exec2_objects[0].flags = 0;
+   exec2_objects[0].flags = bo.flags;
    exec2_objects[0].rsvd1 = 0;
    exec2_objects[0].rsvd2 = bo.size;
 
@@ -321,6 +324,10 @@
       anv_gem_syncobj_destroy(device, impl->syncobj);
       break;
 
+   case ANV_FENCE_TYPE_WSI:
+      impl->fence_wsi->destroy(impl->fence_wsi);
+      break;
+
    default:
       unreachable("Invalid fence type");
    }
@@ -425,7 +432,7 @@
       }
 
    case ANV_FENCE_TYPE_SYNCOBJ: {
-      int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, true, 0);
+      int ret = anv_gem_syncobj_wait(device, &impl->syncobj, 1, 0, true);
       if (ret == -1) {
          if (errno == ETIME) {
             return VK_NOT_READY;
@@ -456,12 +463,33 @@
    return (uint64_t)current.tv_sec * NSEC_PER_SEC + current.tv_nsec;
 }
 
+/* Turn a relative timeout (ns) into an absolute gettime_ns() deadline that
+ * never exceeds INT64_MAX; a timeout of 0 is returned as 0. */
+static uint64_t anv_get_absolute_timeout(uint64_t timeout)
+{
+   if (timeout == 0)
+      return 0;
+
+   const uint64_t now = gettime_ns();
+   const uint64_t headroom = (uint64_t) INT64_MAX - now;
+   return now + MIN2(headroom, timeout);
+}
+
+/* ns left until abs_timeout: 0 once it has passed, clamped to INT64_MAX so
+ * the uint64_t difference can never become a negative int64_t. */
+static int64_t anv_get_relative_timeout(uint64_t abs_timeout)
+{
+   const uint64_t now = gettime_ns();
+   const uint64_t left = abs_timeout > now ? abs_timeout - now : 0;
+   return (int64_t) MIN2(left, (uint64_t) INT64_MAX);
+}
+
 static VkResult
 anv_wait_for_syncobj_fences(struct anv_device *device,
                             uint32_t fenceCount,
                             const VkFence *pFences,
                             bool waitAll,
-                            uint64_t _timeout)
+                            uint64_t abs_timeout_ns)
 {
    anv_syncobj_handle_t *syncobjs = vk_zalloc(&device->alloc,
                                   sizeof(*syncobjs) * fenceCount, 8,
@@ -481,19 +509,6 @@
       syncobjs[i] = impl->syncobj;
    }
 
-   int64_t abs_timeout_ns = 0;
-   if (_timeout > 0) {
-      uint64_t current_ns = gettime_ns();
-
-      /* Add but saturate to INT32_MAX */
-      if (current_ns + _timeout < current_ns)
-         abs_timeout_ns = INT64_MAX;
-      else if (current_ns + _timeout > INT64_MAX)
-         abs_timeout_ns = INT64_MAX;
-      else
-         abs_timeout_ns = current_ns + _timeout;
-   }
-
    /* The gem_syncobj_wait ioctl may return early due to an inherent
     * limitation in the way it computes timeouts.  Loop until we've actually
     * passed the timeout.
@@ -501,7 +516,7 @@
    int ret;
    do {
       ret = anv_gem_syncobj_wait(device, syncobjs, fenceCount,
-                                 abs_timeout_ns, waitAll, _timeout);
+                                 abs_timeout_ns, waitAll);
    } while (ret == -1 && errno == ETIME && gettime_ns() < abs_timeout_ns);
 
    vk_free(&device->alloc, syncobjs);
@@ -536,7 +551,7 @@
     * best we can do is to clamp the timeout to INT64_MAX.  This limits the
     * maximum timeout from 584 years to 292 years - likely not a big deal.
     */
-   int64_t timeout = MIN2(_timeout, INT64_MAX);
+   int64_t timeout = MIN2(_timeout, (uint64_t) INT64_MAX);
 
    VkResult result = VK_SUCCESS;
    uint32_t pending_fences = fenceCount;
@@ -662,6 +677,81 @@
    return result;
 }
 
+static VkResult
+anv_wait_for_wsi_fence(struct anv_device *device,
+                       const VkFence _fence,
+                       uint64_t abs_timeout)
+{
+   ANV_FROM_HANDLE(anv_fence, fence, _fence);
+   struct anv_fence_impl *impl = &fence->permanent;
+
+   return impl->fence_wsi->wait(impl->fence_wsi, abs_timeout);
+}
+
+static VkResult
+anv_wait_for_fences(struct anv_device *device,
+                    uint32_t fenceCount,
+                    const VkFence *pFences,
+                    bool waitAll,
+                    uint64_t abs_timeout)
+{
+   VkResult result = VK_SUCCESS;
+   /* waitAll: wait on each fence in turn; wait-any: poll until abs_timeout. */
+   if (fenceCount <= 1 || waitAll) {
+      for (uint32_t i = 0; i < fenceCount; i++) {
+         ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+         switch (fence->permanent.type) {
+         case ANV_FENCE_TYPE_BO:
+            result = anv_wait_for_bo_fences(
+               device, 1, &pFences[i], true,
+               anv_get_relative_timeout(abs_timeout));
+            break;
+         case ANV_FENCE_TYPE_SYNCOBJ:
+            result = anv_wait_for_syncobj_fences(device, 1, &pFences[i],
+                                                 true, abs_timeout);
+            break;
+         case ANV_FENCE_TYPE_WSI:
+            result = anv_wait_for_wsi_fence(device, pFences[i], abs_timeout);
+            break;
+         case ANV_FENCE_TYPE_NONE:
+            result = VK_SUCCESS;
+            break;
+         }
+         if (result != VK_SUCCESS)
+            return result;
+      }
+   } else {
+      do {
+         for (uint32_t i = 0; i < fenceCount; i++) {
+            if (anv_wait_for_fences(device, 1, &pFences[i], true, 0) == VK_SUCCESS)
+               return VK_SUCCESS;
+         }
+      } while (gettime_ns() < abs_timeout);
+      result = VK_TIMEOUT;
+   }
+   return result;
+}
+
+static bool anv_all_fences_syncobj(uint32_t fenceCount, const VkFence *pFences)
+{
+   for (uint32_t i = 0; i < fenceCount; ++i) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+      if (fence->permanent.type != ANV_FENCE_TYPE_SYNCOBJ)
+         return false;
+   }
+   return true;
+}
+
+static bool anv_all_fences_bo(uint32_t fenceCount, const VkFence *pFences)
+{
+   for (uint32_t i = 0; i < fenceCount; ++i) {
+      ANV_FROM_HANDLE(anv_fence, fence, pFences[i]);
+      if (fence->permanent.type != ANV_FENCE_TYPE_BO)
+         return false;
+   }
+   return true;
+}
+
 VkResult anv_WaitForFences(
     VkDevice                                    _device,
     uint32_t                                    fenceCount,
@@ -674,12 +764,15 @@
    if (unlikely(device->lost))
       return VK_ERROR_DEVICE_LOST;
 
-   if (device->instance->physicalDevice.has_syncobj_wait) {
+   if (anv_all_fences_syncobj(fenceCount, pFences)) {
       return anv_wait_for_syncobj_fences(device, fenceCount, pFences,
-                                         waitAll, timeout);
-   } else {
+                                         waitAll, anv_get_absolute_timeout(timeout));
+   } else if (anv_all_fences_bo(fenceCount, pFences)) {
       return anv_wait_for_bo_fences(device, fenceCount, pFences,
                                     waitAll, timeout);
+   } else {
+      return anv_wait_for_fences(device, fenceCount, pFences,
+                                 waitAll, anv_get_absolute_timeout(timeout));
    }
 }
 
@@ -881,7 +974,8 @@
       } else {
          semaphore->permanent.type = ANV_SEMAPHORE_TYPE_BO;
          VkResult result = anv_bo_cache_alloc(device, &device->bo_cache,
-                                              4096, &semaphore->permanent.bo);
+                                              4096, ANV_BO_EXTERNAL,
+                                              &semaphore->permanent.bo);
          if (result != VK_SUCCESS) {
             vk_free2(&device->alloc, pAllocator, semaphore);
             return result;
@@ -1039,7 +1133,8 @@
          new_impl.type = ANV_SEMAPHORE_TYPE_BO;
 
          VkResult result = anv_bo_cache_import(device, &device->bo_cache,
-                                               fd, &new_impl.bo);
+                                               fd, ANV_BO_EXTERNAL,
+                                               &new_impl.bo);
          if (result != VK_SUCCESS)
             return result;
 
diff --git a/src/intel/vulkan/anv_util.c b/src/intel/vulkan/anv_util.c
index 3c1803a..9082707 100644
--- a/src/intel/vulkan/anv_util.c
+++ b/src/intel/vulkan/anv_util.c
@@ -57,7 +57,7 @@
 {
    va_list ap;
    char buffer[256];
-   char report[256];
+   char report[512];
 
    va_start(ap, format);
    vsnprintf(buffer, sizeof(buffer), format, ap);
@@ -84,7 +84,7 @@
 {
    va_list ap;
    char buffer[256];
-   char report[256];
+   char report[512];
 
    const char *error_str = vk_Result_to_str(error);
 
diff --git a/src/intel/vulkan/anv_wsi.c b/src/intel/vulkan/anv_wsi.c
index 20094f9..5ed1d71 100644
--- a/src/intel/vulkan/anv_wsi.c
+++ b/src/intel/vulkan/anv_wsi.c
@@ -48,7 +48,8 @@
    result = wsi_device_init(&physical_device->wsi_device,
                             anv_physical_device_to_handle(physical_device),
                             anv_wsi_proc_addr,
-                            &physical_device->instance->alloc);
+                            &physical_device->instance->alloc,
+                            physical_device->master_fd);
    if (result != VK_SUCCESS)
       return result;
 
@@ -119,6 +120,18 @@
                                                pSurfaceCapabilities);
 }
 
+VkResult anv_GetPhysicalDeviceSurfaceCapabilities2EXT(
+    VkPhysicalDevice                            physicalDevice,
+    VkSurfaceKHR                                surface,
+    VkSurfaceCapabilities2EXT*                  pSurfaceCapabilities)
+{
+   ANV_FROM_HANDLE(anv_physical_device, device, physicalDevice);
+   /* Thin wrapper: forwards the query to the common WSI layer. */
+   return wsi_common_get_surface_capabilities2ext(&device->wsi_device,
+                                                  surface,
+                                                  pSurfaceCapabilities);
+}
+
 VkResult anv_GetPhysicalDeviceSurfaceFormatsKHR(
     VkPhysicalDevice                            physicalDevice,
     VkSurfaceKHR                                surface,
@@ -203,28 +216,45 @@
 }
 
 VkResult anv_AcquireNextImageKHR(
-    VkDevice                                     _device,
+    VkDevice                                     device,
     VkSwapchainKHR                               swapchain,
     uint64_t                                     timeout,
     VkSemaphore                                  semaphore,
     VkFence                                      fence,
     uint32_t*                                    pImageIndex)
 {
+   VkAcquireNextImageInfoKHR acquire_info = {
+      .sType = VK_STRUCTURE_TYPE_ACQUIRE_NEXT_IMAGE_INFO_KHR,
+      .swapchain = swapchain,
+      .timeout = timeout,
+      .semaphore = semaphore,
+      .fence = fence,
+      .deviceMask = 0,
+   };
+
+   return anv_AcquireNextImage2KHR(device, &acquire_info, pImageIndex);
+}
+
+VkResult anv_AcquireNextImage2KHR(
+    VkDevice                                     _device,
+    const VkAcquireNextImageInfoKHR*             pAcquireInfo,
+    uint32_t*                                    pImageIndex)
+{
    ANV_FROM_HANDLE(anv_device, device, _device);
    struct anv_physical_device *pdevice = &device->instance->physicalDevice;
 
-   VkResult result = wsi_common_acquire_next_image(&pdevice->wsi_device,
-                                                   _device,
-                                                   swapchain,
-                                                   timeout,
-                                                   semaphore,
-                                                   pImageIndex);
+   VkResult result = wsi_common_acquire_next_image2(&pdevice->wsi_device,
+                                                    _device,
+                                                    pAcquireInfo,
+                                                    pImageIndex);
 
    /* Thanks to implicit sync, the image is ready immediately.  However, we
     * should wait for the current GPU state to finish.
     */
-   if (fence != VK_NULL_HANDLE)
-      anv_QueueSubmit(anv_queue_to_handle(&device->queue), 0, NULL, fence);
+   if (pAcquireInfo->fence != VK_NULL_HANDLE) {
+      anv_QueueSubmit(anv_queue_to_handle(&device->queue), 0, NULL,
+                      pAcquireInfo->fence);
+   }
 
    return result;
 }
diff --git a/src/intel/vulkan/anv_wsi_display.c b/src/intel/vulkan/anv_wsi_display.c
new file mode 100644
index 0000000..3212c23
--- /dev/null
+++ b/src/intel/vulkan/anv_wsi_display.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright © 2017 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include "anv_private.h"
+#include "wsi_common.h"
+#include "vk_format_info.h"
+#include "vk_util.h"
+#include "wsi_common_display.h"
+
+VkResult
+anv_GetPhysicalDeviceDisplayPropertiesKHR(VkPhysicalDevice physical_device,
+                                          uint32_t *property_count,
+                                          VkDisplayPropertiesKHR *properties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_display_get_physical_device_display_properties(
+      physical_device,
+      &pdevice->wsi_device,
+      property_count,
+      properties);
+}
+
+VkResult
+anv_GetPhysicalDeviceDisplayProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayProperties2KHR*                    pProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+   return wsi_display_get_physical_device_display_properties2(
+      physicalDevice, &pdevice->wsi_device,
+      pPropertyCount, pProperties);
+}
+
+VkResult
+anv_GetPhysicalDeviceDisplayPlanePropertiesKHR(
+   VkPhysicalDevice physical_device,
+   uint32_t *property_count,
+   VkDisplayPlanePropertiesKHR *properties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_display_get_physical_device_display_plane_properties(
+      physical_device, &pdevice->wsi_device,
+      property_count, properties);
+}
+
+VkResult
+anv_GetPhysicalDeviceDisplayPlaneProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayPlaneProperties2KHR*               pProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+   return wsi_display_get_physical_device_display_plane_properties2(
+      physicalDevice, &pdevice->wsi_device,
+      pPropertyCount, pProperties);
+}
+
+VkResult
+anv_GetDisplayPlaneSupportedDisplaysKHR(VkPhysicalDevice physical_device,
+                                        uint32_t plane_index,
+                                        uint32_t *display_count,
+                                        VkDisplayKHR *displays)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_display_get_display_plane_supported_displays(physical_device,
+                                                           &pdevice->wsi_device,
+                                                           plane_index,
+                                                           display_count,
+                                                           displays);
+}
+
+
+VkResult
+anv_GetDisplayModePropertiesKHR(VkPhysicalDevice physical_device,
+                                VkDisplayKHR display,
+                                uint32_t *property_count,
+                                VkDisplayModePropertiesKHR *properties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_display_get_display_mode_properties(physical_device,
+                                                  &pdevice->wsi_device,
+                                                  display,
+                                                  property_count,
+                                                  properties);
+}
+
+VkResult
+anv_GetDisplayModeProperties2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    VkDisplayKHR                                display,
+    uint32_t*                                   pPropertyCount,
+    VkDisplayModeProperties2KHR*                pProperties)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+   return wsi_display_get_display_mode_properties2(physicalDevice,
+                                                   &pdevice->wsi_device,
+                                                   display,
+                                                   pPropertyCount,
+                                                   pProperties);
+}
+
+VkResult
+anv_CreateDisplayModeKHR(VkPhysicalDevice physical_device,
+                         VkDisplayKHR display,
+                         const VkDisplayModeCreateInfoKHR *create_info,
+                         const VkAllocationCallbacks *allocator,
+                         VkDisplayModeKHR *mode)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_display_create_display_mode(physical_device,
+                                          &pdevice->wsi_device,
+                                          display,
+                                          create_info,
+                                          allocator,
+                                          mode);
+}
+
+VkResult
+anv_GetDisplayPlaneCapabilitiesKHR(VkPhysicalDevice physical_device,
+                                   VkDisplayModeKHR mode_khr,
+                                   uint32_t plane_index,
+                                   VkDisplayPlaneCapabilitiesKHR *capabilities)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_get_display_plane_capabilities(physical_device,
+                                             &pdevice->wsi_device,
+                                             mode_khr,
+                                             plane_index,
+                                             capabilities);
+}
+
+VkResult
+anv_GetDisplayPlaneCapabilities2KHR(
+    VkPhysicalDevice                            physicalDevice,
+    const VkDisplayPlaneInfo2KHR*               pDisplayPlaneInfo,
+    VkDisplayPlaneCapabilities2KHR*             pCapabilities)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+   return wsi_get_display_plane_capabilities2(physicalDevice,
+                                              &pdevice->wsi_device,
+                                              pDisplayPlaneInfo,
+                                              pCapabilities);
+}
+
+VkResult
+anv_CreateDisplayPlaneSurfaceKHR(
+   VkInstance _instance,
+   const VkDisplaySurfaceCreateInfoKHR *create_info,
+   const VkAllocationCallbacks *allocator,
+   VkSurfaceKHR *surface)
+{
+   ANV_FROM_HANDLE(anv_instance, instance, _instance);
+   const VkAllocationCallbacks *alloc;
+
+   if (allocator)
+     alloc = allocator;
+   else
+     alloc = &instance->alloc;
+
+   return wsi_create_display_surface(_instance, alloc, create_info, surface);
+}
+
+VkResult
+anv_ReleaseDisplayEXT(VkPhysicalDevice physical_device,
+                       VkDisplayKHR     display)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_release_display(physical_device,
+                              &pdevice->wsi_device,
+                              display);
+}
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+VkResult
+anv_AcquireXlibDisplayEXT(VkPhysicalDevice     physical_device,
+                           Display              *dpy,
+                           VkDisplayKHR         display)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_acquire_xlib_display(physical_device,
+                                   &pdevice->wsi_device,
+                                   dpy,
+                                   display);
+}
+
+VkResult
+anv_GetRandROutputDisplayEXT(VkPhysicalDevice  physical_device,
+                              Display           *dpy,
+                              RROutput          output,
+                              VkDisplayKHR      *display)
+{
+   ANV_FROM_HANDLE(anv_physical_device, pdevice, physical_device);
+
+   return wsi_get_randr_output_display(physical_device,
+                                       &pdevice->wsi_device,
+                                       dpy,
+                                       output,
+                                       display);
+}
+#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
+
+/* VK_EXT_display_control */
+
+VkResult
+anv_DisplayPowerControlEXT(VkDevice                    _device,
+                            VkDisplayKHR                display,
+                            const VkDisplayPowerInfoEXT *display_power_info)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   return wsi_display_power_control(
+      _device, &device->instance->physicalDevice.wsi_device,
+      display, display_power_info);
+}
+
+VkResult
+anv_RegisterDeviceEventEXT(VkDevice _device,
+                            const VkDeviceEventInfoEXT *device_event_info,
+                            const VkAllocationCallbacks *allocator,
+                            VkFence *_fence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_fence *fence;
+   VkResult ret;
+   /* Use the device allocator, as anv_RegisterDisplayEventEXT does. */
+   fence = vk_zalloc2(&device->alloc, allocator, sizeof (*fence), 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!fence)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   fence->permanent.type = ANV_FENCE_TYPE_WSI;
+
+   ret = wsi_register_device_event(_device,
+                                   &device->instance->physicalDevice.wsi_device,
+                                   device_event_info,
+                                   allocator,
+                                   &fence->permanent.fence_wsi);
+   if (ret == VK_SUCCESS)
+      *_fence = anv_fence_to_handle(fence);
+   else
+      vk_free2(&device->alloc, allocator, fence);
+   return ret;
+}
+
+VkResult
+anv_RegisterDisplayEventEXT(VkDevice _device,
+                             VkDisplayKHR display,
+                             const VkDisplayEventInfoEXT *display_event_info,
+                             const VkAllocationCallbacks *allocator,
+                             VkFence *_fence)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+   struct anv_fence *fence;
+   VkResult ret;
+   /* On success *_fence receives a new ANV_FENCE_TYPE_WSI fence. */
+   fence = vk_zalloc2(&device->alloc, allocator, sizeof (*fence), 8,
+                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!fence)
+      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+
+   fence->permanent.type = ANV_FENCE_TYPE_WSI;
+
+   ret = wsi_register_display_event(
+      _device, &device->instance->physicalDevice.wsi_device,
+      display, display_event_info, allocator, &(fence->permanent.fence_wsi));
+
+   if (ret == VK_SUCCESS)
+      *_fence = anv_fence_to_handle(fence);
+   else
+      vk_free2(&device->alloc, allocator, fence);
+   return ret;
+}
+
+VkResult
+anv_GetSwapchainCounterEXT(VkDevice _device,
+                            VkSwapchainKHR swapchain,
+                            VkSurfaceCounterFlagBitsEXT flag_bits,
+                            uint64_t *value)
+{
+   ANV_FROM_HANDLE(anv_device, device, _device);
+
+   return wsi_get_swapchain_counter(
+      _device, &device->instance->physicalDevice.wsi_device,
+      swapchain, flag_bits, value);
+}
diff --git a/src/intel/vulkan/gen7_cmd_buffer.c b/src/intel/vulkan/gen7_cmd_buffer.c
index 71c3677..da51cb9 100644
--- a/src/intel/vulkan/gen7_cmd_buffer.c
+++ b/src/intel/vulkan/gen7_cmd_buffer.c
@@ -128,11 +128,11 @@
    const struct anv_render_pass *pass = cmd_buffer->state.pass;
    const struct anv_subpass *subpass = cmd_buffer->state.subpass;
 
-   if (subpass->depth_stencil_attachment.attachment >= pass->attachment_count)
+   if (!subpass->depth_stencil_attachment)
       return D16_UNORM;
 
    struct anv_render_pass_attachment *att =
-      &pass->attachments[subpass->depth_stencil_attachment.attachment];
+      &pass->attachments[subpass->depth_stencil_attachment->attachment];
 
    switch (att->format) {
    case VK_FORMAT_D16_UNORM:
@@ -246,12 +246,13 @@
          ib.CutIndexEnable             = pipeline->primitive_restart;
 #endif
          ib.IndexFormat                = cmd_buffer->state.gfx.gen7.index_type;
-         ib.MemoryObjectControlState   = GENX(MOCS);
+         ib.IndexBufferMOCS            = anv_mocs_for_bo(cmd_buffer->device,
+                                                         buffer->address.bo);
 
-         ib.BufferStartingAddress =
-            (struct anv_address) { buffer->bo, buffer->offset + offset };
-         ib.BufferEndingAddress =
-            (struct anv_address) { buffer->bo, buffer->offset + buffer->size };
+         ib.BufferStartingAddress      = anv_address_add(buffer->address,
+                                                         offset);
+         ib.BufferEndingAddress        = anv_address_add(buffer->address,
+                                                         buffer->size);
       }
    }
 
diff --git a/src/intel/vulkan/gen8_cmd_buffer.c b/src/intel/vulkan/gen8_cmd_buffer.c
index 2999c99..752d04f3 100644
--- a/src/intel/vulkan/gen8_cmd_buffer.c
+++ b/src/intel/vulkan/gen8_cmd_buffer.c
@@ -565,9 +565,9 @@
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
       ib.IndexFormat                = vk_to_gen_index_type[indexType];
-      ib.MemoryObjectControlState   = GENX(MOCS);
-      ib.BufferStartingAddress      =
-         (struct anv_address) { buffer->bo, buffer->offset + offset };
+      ib.IndexBufferMOCS            = anv_mocs_for_bo(cmd_buffer->device,
+                                                      buffer->address.bo);
+      ib.BufferStartingAddress      = anv_address_add(buffer->address, offset);
       ib.BufferSize                 = buffer->size - offset;
    }
 
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c
index 9023269..2035017 100644
--- a/src/intel/vulkan/genX_blorp_exec.c
+++ b/src/intel/vulkan/genX_blorp_exec.c
@@ -62,6 +62,12 @@
                          ss_offset, address.buffer, address.offset + delta);
    if (result != VK_SUCCESS)
       anv_batch_set_error(&cmd_buffer->batch, result);
+
+   void *dest = cmd_buffer->device->surface_state_pool.block_pool.map +
+      ss_offset;
+   uint64_t val = ((struct anv_bo*)address.buffer)->offset + address.offset +
+      delta;
+   write_reloc(cmd_buffer->device, dest, val, false);
 }
 
 #if GEN_GEN >= 7 && GEN_GEN < 10
@@ -152,6 +158,16 @@
    return vb_state.map;
 }
 
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs)
+{
+   /* anv forces all vertex buffers into the low 4GB so there are never any
+    * transitions that require a VF invalidation.
+    */
+}
+
 #if GEN_GEN >= 8
 static struct blorp_address
 blorp_get_workaround_page(struct blorp_batch *batch)
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 97d83cd..7cef4c5 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -170,49 +170,45 @@
 }
 
 static void
-add_surface_state_reloc(struct anv_cmd_buffer *cmd_buffer,
-                        struct anv_state state,
-                        struct anv_bo *bo, uint32_t offset)
+add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
+                  struct anv_state state, struct anv_address addr)
 {
    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
 
    VkResult result =
       anv_reloc_list_add(&cmd_buffer->surface_relocs, &cmd_buffer->pool->alloc,
-                         state.offset + isl_dev->ss.addr_offset, bo, offset);
+                         state.offset + isl_dev->ss.addr_offset,
+                         addr.bo, addr.offset);
    if (result != VK_SUCCESS)
       anv_batch_set_error(&cmd_buffer->batch, result);
 }
 
 static void
-add_image_view_relocs(struct anv_cmd_buffer *cmd_buffer,
-                      const struct anv_image_view *image_view,
-                      const uint32_t plane,
-                      struct anv_surface_state state)
+add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
+                         struct anv_surface_state state)
 {
    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
-   const struct anv_image *image = image_view->image;
-   uint32_t image_plane = image_view->planes[plane].image_plane;
 
-   add_surface_state_reloc(cmd_buffer, state.state,
-                           image->planes[image_plane].bo, state.address);
+   assert(!anv_address_is_null(state.address));
+   add_surface_reloc(cmd_buffer, state.state, state.address);
 
-   if (state.aux_address) {
+   if (!anv_address_is_null(state.aux_address)) {
       VkResult result =
          anv_reloc_list_add(&cmd_buffer->surface_relocs,
                             &cmd_buffer->pool->alloc,
                             state.state.offset + isl_dev->ss.aux_addr_offset,
-                            image->planes[image_plane].bo, state.aux_address);
+                            state.aux_address.bo, state.aux_address.offset);
       if (result != VK_SUCCESS)
          anv_batch_set_error(&cmd_buffer->batch, result);
    }
 
-   if (state.clear_address) {
+   if (!anv_address_is_null(state.clear_address)) {
       VkResult result =
          anv_reloc_list_add(&cmd_buffer->surface_relocs,
                             &cmd_buffer->pool->alloc,
                             state.state.offset +
                             isl_dev->ss.clear_color_state_offset,
-                            image->planes[image_plane].bo, state.clear_address);
+                            state.clear_address.bo, state.clear_address.offset);
       if (result != VK_SUCCESS)
          anv_batch_set_error(&cmd_buffer->batch, result);
    }
@@ -1276,8 +1272,7 @@
                                          &state->attachments[i].color,
                                          NULL);
 
-            add_image_view_relocs(cmd_buffer, iview, 0,
-                                  state->attachments[i].color);
+            add_surface_state_relocs(cmd_buffer, state->attachments[i].color);
          } else {
             depth_stencil_attachment_compute_aux_usage(cmd_buffer->device,
                                                        state, i,
@@ -1296,8 +1291,7 @@
                                          &state->attachments[i].input,
                                          NULL);
 
-            add_image_view_relocs(cmd_buffer, iview, 0,
-                                  state->attachments[i].input);
+            add_surface_state_relocs(cmd_buffer, state->attachments[i].input);
          }
       }
    }
@@ -1378,7 +1372,7 @@
 
          if (iview) {
             VkImageLayout layout =
-                cmd_buffer->state.subpass->depth_stencil_attachment.layout;
+                cmd_buffer->state.subpass->depth_stencil_attachment->layout;
 
             enum isl_aux_usage aux_usage =
                anv_layout_to_aux_usage(&cmd_buffer->device->info, iview->image,
@@ -1978,9 +1972,6 @@
 
    if (stage == MESA_SHADER_COMPUTE &&
        get_cs_prog_data(pipeline)->uses_num_work_groups) {
-      struct anv_bo *bo = cmd_buffer->state.compute.num_workgroups.bo;
-      uint32_t bo_offset = cmd_buffer->state.compute.num_workgroups.offset;
-
       struct anv_state surface_state;
       surface_state =
          anv_cmd_buffer_alloc_surface_state(cmd_buffer);
@@ -1988,10 +1979,13 @@
       const enum isl_format format =
          anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
       anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
-                                    format, bo_offset, 12, 1);
+                                    format,
+                                    cmd_buffer->state.compute.num_workgroups,
+                                    12, 1);
 
       bt_map[0] = surface_state.offset + state_offset;
-      add_surface_state_reloc(cmd_buffer, surface_state, bo, bo_offset);
+      add_surface_reloc(cmd_buffer, surface_state,
+                        cmd_buffer->state.compute.num_workgroups);
    }
 
    if (map->surface_count == 0)
@@ -2037,6 +2031,26 @@
 
          bt_map[bias + s] = surface_state.offset + state_offset;
          continue;
+      } else if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) {
+         struct anv_state surface_state =
+            anv_cmd_buffer_alloc_surface_state(cmd_buffer);
+
+         struct anv_address constant_data = {
+            .bo = &pipeline->device->dynamic_state_pool.block_pool.bo,
+            .offset = pipeline->shaders[stage]->constant_data.offset,
+         };
+         unsigned constant_data_size =
+            pipeline->shaders[stage]->constant_data_size;
+
+         const enum isl_format format =
+            anv_isl_format_for_descriptor_type(VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
+         anv_fill_buffer_surface_state(cmd_buffer->device,
+                                       surface_state, format,
+                                       constant_data, constant_data_size, 1);
+
+         bt_map[bias + s] = surface_state.offset + state_offset;
+         add_surface_reloc(cmd_buffer, surface_state, constant_data);
+         continue;
       }
 
       const struct anv_descriptor *desc =
@@ -2055,8 +2069,7 @@
             desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
          surface_state = sstate.state;
          assert(surface_state.alloc_size);
-         add_image_view_relocs(cmd_buffer, desc->image_view,
-                               binding->plane, sstate);
+         add_surface_state_relocs(cmd_buffer, sstate);
          break;
       }
       case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
@@ -2071,8 +2084,7 @@
                desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
             surface_state = sstate.state;
             assert(surface_state.alloc_size);
-            add_image_view_relocs(cmd_buffer, desc->image_view,
-                                  binding->plane, sstate);
+            add_surface_state_relocs(cmd_buffer, sstate);
          } else {
             /* For color input attachments, we create the surface state at
              * vkBeginRenderPass time so that we can include aux and clear
@@ -2091,8 +2103,7 @@
             : desc->image_view->planes[binding->plane].storage_surface_state;
          surface_state = sstate.state;
          assert(surface_state.alloc_size);
-         add_image_view_relocs(cmd_buffer, desc->image_view,
-                               binding->plane, sstate);
+         add_surface_state_relocs(cmd_buffer, sstate);
 
          struct brw_image_param *image_param =
             &cmd_buffer->state.push_constants[stage]->images[image++];
@@ -2107,9 +2118,8 @@
       case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
          surface_state = desc->buffer_view->surface_state;
          assert(surface_state.alloc_size);
-         add_surface_state_reloc(cmd_buffer, surface_state,
-                                 desc->buffer_view->bo,
-                                 desc->buffer_view->offset);
+         add_surface_reloc(cmd_buffer, surface_state,
+                           desc->buffer_view->address);
          break;
 
       case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
@@ -2123,16 +2133,17 @@
          /* Clamp the range to the buffer size */
          uint32_t range = MIN2(desc->range, desc->buffer->size - offset);
 
+         struct anv_address address =
+            anv_address_add(desc->buffer->address, offset);
+
          surface_state =
             anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
          enum isl_format format =
             anv_isl_format_for_descriptor_type(desc->type);
 
          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
-                                       format, offset, range, 1);
-         add_surface_state_reloc(cmd_buffer, surface_state,
-                                 desc->buffer->bo,
-                                 desc->buffer->offset + offset);
+                                       format, address, range, 1);
+         add_surface_reloc(cmd_buffer, surface_state, address);
          break;
       }
 
@@ -2141,9 +2152,8 @@
             ? desc->buffer_view->writeonly_storage_surface_state
             : desc->buffer_view->storage_surface_state;
          assert(surface_state.alloc_size);
-         add_surface_state_reloc(cmd_buffer, surface_state,
-                                 desc->buffer_view->bo,
-                                 desc->buffer_view->offset);
+         add_surface_reloc(cmd_buffer, surface_state,
+                           desc->buffer_view->address);
 
          struct brw_image_param *image_param =
             &cmd_buffer->state.push_constants[stage]->images[image++];
@@ -2394,36 +2404,44 @@
                const struct anv_pipeline_binding *binding =
                   &bind_map->surface_to_descriptor[surface];
 
-               const struct anv_descriptor *desc =
-                  anv_descriptor_for_binding(&gfx_state->base, binding);
-
                struct anv_address read_addr;
                uint32_t read_len;
-               if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
-                  read_len = MIN2(range->length,
-                     DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start);
-                  read_addr = (struct anv_address) {
-                     .bo = desc->buffer_view->bo,
-                     .offset = desc->buffer_view->offset +
-                               range->start * 32,
+               if (binding->set == ANV_DESCRIPTOR_SET_SHADER_CONSTANTS) {
+                  struct anv_address constant_data = {
+                     .bo = &pipeline->device->dynamic_state_pool.block_pool.bo,
+                     .offset = pipeline->shaders[stage]->constant_data.offset,
                   };
+                  unsigned constant_data_size =
+                     pipeline->shaders[stage]->constant_data_size;
+
+                  read_len = MIN2(range->length,
+                     DIV_ROUND_UP(constant_data_size, 32) - range->start);
+                  read_addr = anv_address_add(constant_data,
+                                              range->start * 32);
                } else {
-                  assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
+                  const struct anv_descriptor *desc =
+                     anv_descriptor_for_binding(&gfx_state->base, binding);
 
-                  uint32_t dynamic_offset =
-                     dynamic_offset_for_binding(&gfx_state->base, binding);
-                  uint32_t buf_offset =
-                     MIN2(desc->offset + dynamic_offset, desc->buffer->size);
-                  uint32_t buf_range =
-                     MIN2(desc->range, desc->buffer->size - buf_offset);
+                  if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
+                     read_len = MIN2(range->length,
+                        DIV_ROUND_UP(desc->buffer_view->range, 32) - range->start);
+                     read_addr = anv_address_add(desc->buffer_view->address,
+                                                 range->start * 32);
+                  } else {
+                     assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
 
-                  read_len = MIN2(range->length,
-                     DIV_ROUND_UP(buf_range, 32) - range->start);
-                  read_addr = (struct anv_address) {
-                     .bo = desc->buffer->bo,
-                     .offset = desc->buffer->offset + buf_offset +
-                               range->start * 32,
-                  };
+                     uint32_t dynamic_offset =
+                        dynamic_offset_for_binding(&gfx_state->base, binding);
+                     uint32_t buf_offset =
+                        MIN2(desc->offset + dynamic_offset, desc->buffer->size);
+                     uint32_t buf_range =
+                        MIN2(desc->range, desc->buffer->size - buf_offset);
+
+                     read_len = MIN2(range->length,
+                        DIV_ROUND_UP(buf_range, 32) - range->start);
+                     read_addr = anv_address_add(desc->buffer->address,
+                                                 buf_offset + range->start * 32);
+                  }
                }
 
                if (read_len > 0) {
@@ -2498,27 +2516,21 @@
          struct GENX(VERTEX_BUFFER_STATE) state = {
             .VertexBufferIndex = vb,
 
-#if GEN_GEN >= 8
-            .MemoryObjectControlState = GENX(MOCS),
-#else
-            .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
-            /* Our implementation of VK_KHR_multiview uses instancing to draw
-             * the different views.  If the client asks for instancing, we
-             * need to use the Instance Data Step Rate to ensure that we
-             * repeat the client's per-instance data once for each view.
-             */
-            .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass),
-            .VertexBufferMemoryObjectControlState = GENX(MOCS),
+            .VertexBufferMOCS = anv_mocs_for_bo(cmd_buffer->device,
+                                                buffer->address.bo),
+#if GEN_GEN <= 7
+            .BufferAccessType = pipeline->vb[vb].instanced ? INSTANCEDATA : VERTEXDATA,
+            .InstanceDataStepRate = pipeline->vb[vb].instance_divisor,
 #endif
 
             .AddressModifyEnable = true,
-            .BufferPitch = pipeline->binding_stride[vb],
-            .BufferStartingAddress = { buffer->bo, buffer->offset + offset },
+            .BufferPitch = pipeline->vb[vb].stride,
+            .BufferStartingAddress = anv_address_add(buffer->address, offset),
 
 #if GEN_GEN >= 8
             .BufferSize = buffer->size - offset
 #else
-            .EndAddress = { buffer->bo, buffer->offset + buffer->size - 1},
+            .EndAddress = anv_address_add(buffer->address, buffer->size - 1),
 #endif
          };
 
@@ -2612,7 +2624,7 @@
 
 static void
 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
-               struct anv_bo *bo, uint32_t offset,
+               struct anv_address addr,
                uint32_t size, uint32_t index)
 {
    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
@@ -2623,23 +2635,22 @@
          .VertexBufferIndex = index,
          .AddressModifyEnable = true,
          .BufferPitch = 0,
+         .VertexBufferMOCS = anv_mocs_for_bo(cmd_buffer->device, addr.bo),
 #if (GEN_GEN >= 8)
-         .MemoryObjectControlState = GENX(MOCS),
-         .BufferStartingAddress = { bo, offset },
+         .BufferStartingAddress = addr,
          .BufferSize = size
 #else
-         .VertexBufferMemoryObjectControlState = GENX(MOCS),
-         .BufferStartingAddress = { bo, offset },
-         .EndAddress = { bo, offset + size },
+         .BufferStartingAddress = addr,
+         .EndAddress = anv_address_add(addr, size),
 #endif
       });
 }
 
 static void
 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
-                             struct anv_bo *bo, uint32_t offset)
+                             struct anv_address addr)
 {
-   emit_vertex_bo(cmd_buffer, bo, offset, 8, ANV_SVGS_VB_INDEX);
+   emit_vertex_bo(cmd_buffer, addr, 8, ANV_SVGS_VB_INDEX);
 }
 
 static void
@@ -2654,8 +2665,12 @@
 
    anv_state_flush(cmd_buffer->device, id_state);
 
-   emit_base_vertex_instance_bo(cmd_buffer,
-      &cmd_buffer->device->dynamic_state_pool.block_pool.bo, id_state.offset);
+   struct anv_address addr = {
+      .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+      .offset = id_state.offset,
+   };
+
+   emit_base_vertex_instance_bo(cmd_buffer, addr);
 }
 
 static void
@@ -2668,9 +2683,12 @@
 
    anv_state_flush(cmd_buffer->device, state);
 
-   emit_vertex_bo(cmd_buffer,
-                  &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
-                  state.offset, 4, ANV_DRAWID_VB_INDEX);
+   struct anv_address addr = {
+      .bo = &cmd_buffer->device->dynamic_state_pool.block_pool.bo,
+      .offset = state.offset,
+   };
+
+   emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
 }
 
 void genX(CmdDraw)(
@@ -2812,37 +2830,35 @@
 
 static void
 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
-                         struct anv_buffer *buffer, uint64_t offset,
+                         struct anv_address addr,
                          bool indexed)
 {
    struct anv_batch *batch = &cmd_buffer->batch;
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
 
-   emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
+   emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, addr.bo, addr.offset);
 
    unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
    if (view_count > 1) {
 #if GEN_IS_HASWELL || GEN_GEN >= 8
-      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
+      emit_lrm(batch, CS_GPR(0), addr.bo, addr.offset + 4);
       emit_mul_gpr0(batch, view_count);
       emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
 #else
       anv_finishme("Multiview + indirect draw requires MI_MATH; "
                    "MI_MATH is not supported on Ivy Bridge");
-      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, addr.bo, addr.offset + 4);
 #endif
    } else {
-      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
+      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, addr.bo, addr.offset + 4);
    }
 
-   emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
+   emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, addr.bo, addr.offset + 8);
 
    if (indexed) {
-      emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, bo, bo_offset + 12);
-      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 16);
+      emit_lrm(batch, GEN7_3DPRIM_BASE_VERTEX, addr.bo, addr.offset + 12);
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, addr.bo, addr.offset + 16);
    } else {
-      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, bo, bo_offset + 12);
+      emit_lrm(batch, GEN7_3DPRIM_START_INSTANCE, addr.bo, addr.offset + 12);
       emit_lri(batch, GEN7_3DPRIM_BASE_VERTEX, 0);
    }
 }
@@ -2865,16 +2881,15 @@
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
    for (uint32_t i = 0; i < drawCount; i++) {
-      struct anv_bo *bo = buffer->bo;
-      uint32_t bo_offset = buffer->offset + offset;
+      struct anv_address draw = anv_address_add(buffer->address, offset);
 
       if (vs_prog_data->uses_firstvertex ||
           vs_prog_data->uses_baseinstance)
-         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 8);
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
       if (vs_prog_data->uses_drawid)
          emit_draw_index(cmd_buffer, i);
 
-      load_indirect_parameters(cmd_buffer, buffer, offset, false);
+      load_indirect_parameters(cmd_buffer, draw, false);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
          prim.IndirectParameterEnable  = true;
@@ -2904,17 +2919,16 @@
    genX(cmd_buffer_flush_state)(cmd_buffer);
 
    for (uint32_t i = 0; i < drawCount; i++) {
-      struct anv_bo *bo = buffer->bo;
-      uint32_t bo_offset = buffer->offset + offset;
+      struct anv_address draw = anv_address_add(buffer->address, offset);
 
       /* TODO: We need to stomp base vertex to 0 somehow */
       if (vs_prog_data->uses_firstvertex ||
           vs_prog_data->uses_baseinstance)
-         emit_base_vertex_instance_bo(cmd_buffer, bo, bo_offset + 12);
+         emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
       if (vs_prog_data->uses_drawid)
          emit_draw_index(cmd_buffer, i);
 
-      load_indirect_parameters(cmd_buffer, buffer, offset, true);
+      load_indirect_parameters(cmd_buffer, draw, true);
 
       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
          prim.IndirectParameterEnable  = true;
@@ -3159,8 +3173,7 @@
    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_pipeline *pipeline = cmd_buffer->state.compute.base.pipeline;
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_bo *bo = buffer->bo;
-   uint32_t bo_offset = buffer->offset + offset;
+   struct anv_address addr = anv_address_add(buffer->address, offset);
    struct anv_batch *batch = &cmd_buffer->batch;
 
    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
@@ -3174,18 +3187,14 @@
       return;
 #endif
 
-   if (prog_data->uses_num_work_groups) {
-      cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
-         .bo = bo,
-         .offset = bo_offset,
-      };
-   }
+   if (prog_data->uses_num_work_groups)
+      cmd_buffer->state.compute.num_workgroups = addr;
 
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
-   emit_lrm(batch, GPGPU_DISPATCHDIMX, bo, bo_offset);
-   emit_lrm(batch, GPGPU_DISPATCHDIMY, bo, bo_offset + 4);
-   emit_lrm(batch, GPGPU_DISPATCHDIMZ, bo, bo_offset + 8);
+   emit_lrm(batch, GPGPU_DISPATCHDIMX, addr.bo, addr.offset);
+   emit_lrm(batch, GPGPU_DISPATCHDIMY, addr.bo, addr.offset + 4);
+   emit_lrm(batch, GPGPU_DISPATCHDIMZ, addr.bo, addr.offset + 8);
 
 #if GEN_GEN <= 7
    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
@@ -3194,7 +3203,7 @@
    emit_lri(batch, MI_PREDICATE_SRC1 + 4, 0);
 
    /* Load compute_dispatch_indirect_x_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 0);
+   emit_lrm(batch, MI_PREDICATE_SRC0, addr.bo, addr.offset + 0);
 
    /* predicate = (compute_dispatch_indirect_x_size == 0); */
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
@@ -3204,7 +3213,7 @@
    }
 
    /* Load compute_dispatch_indirect_y_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 4);
+   emit_lrm(batch, MI_PREDICATE_SRC0, addr.bo, addr.offset + 4);
 
    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
@@ -3214,7 +3223,7 @@
    }
 
    /* Load compute_dispatch_indirect_z_size into SRC0 */
-   emit_lrm(batch, MI_PREDICATE_SRC0, bo, bo_offset + 8);
+   emit_lrm(batch, MI_PREDICATE_SRC0, addr.bo, addr.offset + 8);
 
    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
@@ -3382,9 +3391,7 @@
    if (dw == NULL)
       return;
 
-   struct isl_depth_stencil_hiz_emit_info info = {
-      .mocs = device->default_mocs,
-   };
+   struct isl_depth_stencil_hiz_emit_info info = { };
 
    if (iview)
       info.view = &iview->planes[0].isl;
@@ -3399,12 +3406,14 @@
       info.depth_address =
          anv_batch_emit_reloc(&cmd_buffer->batch,
                               dw + device->isl_dev.ds.depth_offset / 4,
-                              image->planes[depth_plane].bo,
-                              image->planes[depth_plane].bo_offset +
+                              image->planes[depth_plane].address.bo,
+                              image->planes[depth_plane].address.offset +
                               surface->offset);
+      info.mocs =
+         anv_mocs_for_bo(device, image->planes[depth_plane].address.bo);
 
       const uint32_t ds =
-         cmd_buffer->state.subpass->depth_stencil_attachment.attachment;
+         cmd_buffer->state.subpass->depth_stencil_attachment->attachment;
       info.hiz_usage = cmd_buffer->state.attachments[ds].aux_usage;
       if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
          info.hiz_surf = &image->planes[depth_plane].aux_surface.isl;
@@ -3412,8 +3421,8 @@
          info.hiz_address =
             anv_batch_emit_reloc(&cmd_buffer->batch,
                                  dw + device->isl_dev.ds.hiz_offset / 4,
-                                 image->planes[depth_plane].bo,
-                                 image->planes[depth_plane].bo_offset +
+                                 image->planes[depth_plane].address.bo,
+                                 image->planes[depth_plane].address.offset +
                                  image->planes[depth_plane].aux_surface.offset);
 
          info.depth_clear_value = ANV_HZ_FC_VAL;
@@ -3430,8 +3439,11 @@
       info.stencil_address =
          anv_batch_emit_reloc(&cmd_buffer->batch,
                               dw + device->isl_dev.ds.stencil_offset / 4,
-                              image->planes[stencil_plane].bo,
-                              image->planes[stencil_plane].bo_offset + surface->offset);
+                              image->planes[stencil_plane].address.bo,
+                              image->planes[stencil_plane].address.offset +
+                              surface->offset);
+      info.mocs =
+         anv_mocs_for_bo(device, image->planes[stencil_plane].address.bo);
    }
 
    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
@@ -3878,6 +3890,15 @@
    cmd_buffer_begin_subpass(cmd_buffer, 0);
 }
 
+void genX(CmdBeginRenderPass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkRenderPassBeginInfo*                pRenderPassBeginInfo,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo)
+{ /* Forwards to CmdBeginRenderPass using pSubpassBeginInfo->contents. */
+   genX(CmdBeginRenderPass)(commandBuffer, pRenderPassBeginInfo,
+                            pSubpassBeginInfo->contents);
+}
+
 void genX(CmdNextSubpass)(
     VkCommandBuffer                             commandBuffer,
     VkSubpassContents                           contents)
@@ -3894,6 +3915,14 @@
    cmd_buffer_begin_subpass(cmd_buffer, prev_subpass + 1);
 }
 
+void genX(CmdNextSubpass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassBeginInfoKHR*                pSubpassBeginInfo,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
+{ /* Forwards to CmdNextSubpass; pSubpassEndInfo is not read here. */
+   genX(CmdNextSubpass)(commandBuffer, pSubpassBeginInfo->contents);
+}
+
 void genX(CmdEndRenderPass)(
     VkCommandBuffer                             commandBuffer)
 {
@@ -3917,3 +3946,10 @@
    cmd_buffer->state.pass = NULL;
    cmd_buffer->state.subpass = NULL;
 }
+
+void genX(CmdEndRenderPass2KHR)(
+    VkCommandBuffer                             commandBuffer,
+    const VkSubpassEndInfoKHR*                  pSubpassEndInfo)
+{ /* Forwards to CmdEndRenderPass; pSubpassEndInfo is not read here. */
+   genX(CmdEndRenderPass)(commandBuffer);
+}
diff --git a/src/intel/vulkan/genX_gpu_memcpy.c b/src/intel/vulkan/genX_gpu_memcpy.c
index eaafcfa..b51c180 100644
--- a/src/intel/vulkan/genX_gpu_memcpy.c
+++ b/src/intel/vulkan/genX_gpu_memcpy.c
@@ -158,11 +158,10 @@
          .AddressModifyEnable = true,
          .BufferStartingAddress = { src, src_offset },
          .BufferPitch = bs,
+         .VertexBufferMOCS = anv_mocs_for_bo(cmd_buffer->device, src),
 #if (GEN_GEN >= 8)
-         .MemoryObjectControlState = GENX(MOCS),
          .BufferSize = size,
 #else
-         .VertexBufferMemoryObjectControlState = GENX(MOCS),
          .EndAddress = { src, src_offset + size - 1 },
 #endif
       });
@@ -219,7 +218,7 @@
 
    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
       sob.SOBufferIndex = 0;
-      sob.SOBufferObjectControlState = GENX(MOCS);
+      sob.SOBufferMOCS = anv_mocs_for_bo(cmd_buffer->device, dst);
       sob.SurfaceBaseAddress = (struct anv_address) { dst, dst_offset };
 
 #if GEN_GEN >= 8
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index e43a408..cb560dc 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -182,14 +182,10 @@
        * VERTEX_BUFFER_STATE which we emit later.
        */
       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
-         vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
+         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
          vfi.VertexElementIndex = slot;
-         /* Our implementation of VK_KHR_multiview uses instancing to draw
-          * the different views.  If the client asks for instancing, we
-          * need to use the Instance Data Step Rate to ensure that we
-          * repeat the client's per-instance data once for each view.
-          */
-         vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
+         vfi.InstanceDataStepRate =
+            pipeline->vb[desc->binding].instance_divisor;
       }
 #endif
    }
@@ -527,9 +523,9 @@
    /* Gen7 requires that we provide the depth format in 3DSTATE_SF so that it
     * can get the depth offsets correct.
     */
-   if (subpass->depth_stencil_attachment.attachment < pass->attachment_count) {
+   if (subpass->depth_stencil_attachment) {
       VkFormat vk_format =
-         pass->attachments[subpass->depth_stencil_attachment.attachment].format;
+         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
       assert(vk_format_is_depth_or_stencil(vk_format));
       if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
          enum isl_format isl_format =
@@ -759,14 +755,14 @@
 {
    *stencilWriteEnable = state->stencilTestEnable;
 
-   /* If the depth test is disabled, we won't be writing anything. */
-   if (!state->depthTestEnable)
-      state->depthWriteEnable = false;
-
-   /* The Vulkan spec requires that if either depth or stencil is not present,
-    * the pipeline is to act as if the test silently passes.
+   /* If the depth test is disabled, we won't be writing anything. Make sure we
+    * treat the test as always passing later on as well.
+    *
+    * Also, the Vulkan spec requires that if either depth or stencil is not
+    * present, the pipeline is to act as if the test silently passes. In that
+    * case we won't write either.
     */
-   if (!(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
+   if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
       state->depthWriteEnable = false;
       state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
    }
@@ -844,9 +840,9 @@
    }
 
    VkImageAspectFlags ds_aspects = 0;
-   if (subpass->depth_stencil_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+   if (subpass->depth_stencil_attachment) {
       VkFormat depth_stencil_format =
-         pass->attachments[subpass->depth_stencil_attachment.attachment].format;
+         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
       ds_aspects = vk_format_aspects(depth_stencil_format);
    }
 
@@ -1410,7 +1406,7 @@
       if (binding->index == UINT32_MAX)
          continue;
 
-      if (blend->pAttachments[binding->index].colorWriteMask != 0)
+      if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
          return true;
    }
 
@@ -1516,7 +1512,8 @@
 
 static void
 emit_3dstate_ps(struct anv_pipeline *pipeline,
-                const VkPipelineColorBlendStateCreateInfo *blend)
+                const VkPipelineColorBlendStateCreateInfo *blend,
+                const VkPipelineMultisampleStateCreateInfo *multisample)
 {
    MAYBE_UNUSED const struct gen_device_info *devinfo = &pipeline->device->info;
    const struct anv_shader_bin *fs_bin =
@@ -1559,13 +1556,30 @@
 #endif
 
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
-      ps.KernelStartPointer0        = fs_bin->kernel.offset;
-      ps.KernelStartPointer1        = 0;
-      ps.KernelStartPointer2        = fs_bin->kernel.offset +
-                                      wm_prog_data->prog_offset_2;
       ps._8PixelDispatchEnable      = wm_prog_data->dispatch_8;
       ps._16PixelDispatchEnable     = wm_prog_data->dispatch_16;
-      ps._32PixelDispatchEnable     = false;
+      ps._32PixelDispatchEnable     = wm_prog_data->dispatch_32;
+
+      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+       *
+       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+       *
+       * Since 16x MSAA is first introduced on SKL, we don't need to apply
+       * the workaround on any older hardware.
+       */
+      if (GEN_GEN >= 9 && !wm_prog_data->persample_dispatch &&
+          multisample && multisample->rasterizationSamples == 16) {
+         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+         ps._32PixelDispatchEnable = false;
+      }
+
+      ps.KernelStartPointer0 = fs_bin->kernel.offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
+      ps.KernelStartPointer1 = fs_bin->kernel.offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
+      ps.KernelStartPointer2 = fs_bin->kernel.offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
 
       ps.SingleProgramFlow          = false;
       ps.VectorMaskEnable           = true;
@@ -1597,10 +1611,11 @@
 #endif
 
       ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         wm_prog_data->base.dispatch_grf_start_reg;
-      ps.DispatchGRFStartRegisterForConstantSetupData1 = 0;
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
       ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         wm_prog_data->dispatch_grf_start_reg_2;
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
 
       ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
       ps.ScratchSpaceBasePointer =
@@ -1640,9 +1655,16 @@
                                          wm_prog_data->uses_kill;
 
 #if GEN_GEN >= 9
+      ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
       ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
-      ps.InputCoverageMaskState  = wm_prog_data->uses_sample_mask ?
-                                   ICMS_INNER_CONSERVATIVE : ICMS_NONE;
+
+      ps.InputCoverageMaskState  = ICMS_NONE;
+      if (wm_prog_data->uses_sample_mask) {
+         if (wm_prog_data->post_depth_coverage)
+            ps.InputCoverageMaskState  = ICMS_DEPTH_COVERAGE;
+         else
+            ps.InputCoverageMaskState  = ICMS_INNER_CONSERVATIVE;
+      }
 #else
       ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
 #endif
@@ -1714,6 +1736,10 @@
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
 
+   /* Use the default pipeline cache if none is specified */
+   if (cache == NULL && device->instance->pipeline_cache_enabled)
+      cache = &device->default_pipeline_cache;
+
    pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (pipeline == NULL)
@@ -1768,7 +1794,8 @@
    emit_3dstate_sbe(pipeline);
    emit_3dstate_wm(pipeline, subpass, pCreateInfo->pColorBlendState,
                    pCreateInfo->pMultisampleState);
-   emit_3dstate_ps(pipeline, pCreateInfo->pColorBlendState);
+   emit_3dstate_ps(pipeline, pCreateInfo->pColorBlendState,
+                   pCreateInfo->pMultisampleState);
 #if GEN_GEN >= 8
    emit_3dstate_ps_extra(pipeline, subpass, pCreateInfo->pColorBlendState);
    emit_3dstate_vf_topology(pipeline);
@@ -1797,6 +1824,10 @@
 
    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
 
+   /* Use the default pipeline cache if none is specified */
+   if (cache == NULL && device->instance->pipeline_cache_enabled)
+      cache = &device->default_pipeline_cache;
+
    pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
    if (pipeline == NULL)
diff --git a/src/intel/vulkan/genX_query.c b/src/intel/vulkan/genX_query.c
index 4e280f1..9da4d9d 100644
--- a/src/intel/vulkan/genX_query.c
+++ b/src/intel/vulkan/genX_query.c
@@ -94,9 +94,14 @@
    if (pdevice->supports_48bit_addresses)
       pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
 
+   if (pdevice->use_softpin)
+      pool->bo.flags |= EXEC_OBJECT_PINNED;
+
    if (pdevice->has_exec_async)
       pool->bo.flags |= EXEC_OBJECT_ASYNC;
 
+   anv_vma_alloc(device, &pool->bo);
+
    /* For query pools, we set the caching mode to I915_CACHING_CACHED.  On LLC
     * platforms, this does nothing.  On non-LLC platforms, this means snooping
     * which comes at a slight cost.  However, the buffers aren't big, won't be
@@ -129,6 +134,7 @@
       return;
 
    anv_gem_munmap(device, pool->bo.gem_handle, pool->bo.map, pool->bo.size);
+   anv_vma_free(device, &pool->bo);
    anv_gem_close(device, pool->bo.gem_handle);
    vk_free2(&device->alloc, pAllocator, pool);
 }
@@ -691,19 +697,14 @@
 
    anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
       srm.RegisterAddress  = reg;
-      srm.MemoryAddress    = (struct anv_address) {
-         .bo = dst_buffer->bo,
-         .offset = dst_buffer->offset + dst_offset,
-      };
+      srm.MemoryAddress    = anv_address_add(dst_buffer->address, dst_offset);
    }
 
    if (flags & VK_QUERY_RESULT_64_BIT) {
       anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
          srm.RegisterAddress  = reg + 4;
-         srm.MemoryAddress    = (struct anv_address) {
-            .bo = dst_buffer->bo,
-            .offset = dst_buffer->offset + dst_offset + 4,
-         };
+         srm.MemoryAddress    = anv_address_add(dst_buffer->address,
+                                                dst_offset + 4);
       }
    }
 }
diff --git a/src/intel/vulkan/genX_state.c b/src/intel/vulkan/genX_state.c
index 1919a28..78f15f0 100644
--- a/src/intel/vulkan/genX_state.c
+++ b/src/intel/vulkan/genX_state.c
@@ -93,6 +93,12 @@
 {
    GENX(MEMORY_OBJECT_CONTROL_STATE_pack)(NULL, &device->default_mocs,
                                           &GENX(MOCS));
+#if GEN_GEN >= 8
+   GENX(MEMORY_OBJECT_CONTROL_STATE_pack)(NULL, &device->external_mocs,
+                                          &GENX(EXTERNAL_MOCS));
+#else
+   device->external_mocs = device->default_mocs;
+#endif
    device->uncached_mocs = 0;
 
    struct anv_batch batch;
@@ -122,18 +128,6 @@
    }
 #endif
 
-#if GEN_GEN == 10 || GEN_GEN == 11
-   uint32_t cache_mode_ss;
-   anv_pack_struct(&cache_mode_ss, GENX(CACHE_MODE_SS),
-                   .FloatBlendOptimizationEnable = true,
-                   .FloatBlendOptimizationEnableMask = true);
-
-   anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
-      lri.RegisterOffset = GENX(CACHE_MODE_SS_num);
-      lri.DataDWord      = cache_mode_ss;
-   }
-#endif
-
    anv_batch_emit(&batch, GENX(3DSTATE_AA_LINE_PARAMETERS), aa);
 
    anv_batch_emit(&batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
@@ -170,6 +164,33 @@
    gen10_emit_wa_lri_to_cache_mode_zero(&batch);
 #endif
 
+   /* Set the "CONSTANT_BUFFER Address Offset Disable" bit, so
+    * 3DSTATE_CONSTANT_XS buffer 0 is an absolute address.
+    *
+    * This is only safe on kernels with context isolation support.
+    */
+   if (GEN_GEN >= 8 &&
+       device->instance->physicalDevice.has_context_isolation) {
+      UNUSED uint32_t tmp_reg;
+#if GEN_GEN >= 9
+      anv_pack_struct(&tmp_reg, GENX(CS_DEBUG_MODE2),
+                      .CONSTANT_BUFFERAddressOffsetDisable = true,
+                      .CONSTANT_BUFFERAddressOffsetDisableMask = true);
+      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+         lri.RegisterOffset = GENX(CS_DEBUG_MODE2_num);
+         lri.DataDWord      = tmp_reg;
+      }
+#elif GEN_GEN == 8
+      anv_pack_struct(&tmp_reg, GENX(INSTPM),
+                      .CONSTANT_BUFFERAddressOffsetDisable = true,
+                      .CONSTANT_BUFFERAddressOffsetDisableMask = true);
+      anv_batch_emit(&batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
+         lri.RegisterOffset = GENX(INSTPM_num);
+         lri.DataDWord      = tmp_reg;
+      }
+#endif
+   }
+
    anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
 
    assert(batch.next <= batch.end);
diff --git a/src/intel/vulkan/meson.build b/src/intel/vulkan/meson.build
index 218c27f..49ff641 100644
--- a/src/intel/vulkan/meson.build
+++ b/src/intel/vulkan/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2017 Intel Corporation
+# Copyright © 2017-2018 Intel Corporation
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -81,15 +81,6 @@
   install : false,
 )
 
-# TODO: workaround for anv_entrypoints combining the .h and .c files in it's
-# output. See issue #2346
-block_entrypoints = custom_target(
-  'block_entrypoints',
-  command : [prog_touch, '@OUTPUT@'],
-  output : 'null',
-  depends : anv_entrypoints,
-)
-
 libanv_gen_libs = []
 anv_gen_files = files(
   'genX_blorp_exec.c',
@@ -105,7 +96,7 @@
   _gen = g[0]
   libanv_gen_libs += static_library(
     'anv_gen@0@'.format(_gen),
-    [anv_gen_files, g[1], block_entrypoints],
+    [anv_gen_files, g[1], anv_entrypoints[0], anv_extensions_h],
     include_directories : [
       inc_common, inc_compiler, inc_drm_uapi, inc_intel, inc_vulkan_util,
       inc_vulkan_wsi,
@@ -173,6 +164,16 @@
   libanv_files += files('anv_wsi_wayland.c')
 endif
 
+if with_platform_drm
+  anv_flags += '-DVK_USE_PLATFORM_DISPLAY_KHR'
+  libanv_files += files('anv_wsi_display.c')
+endif
+
+if with_xlib_lease
+  anv_deps += [dep_xcb_xrandr, dep_xlib_xrandr]
+  anv_flags += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
+endif
+
 libanv_common = static_library(
   'anv_common',
   [libanv_files, anv_entrypoints, anv_extensions_c, anv_extensions_h],
@@ -186,7 +187,7 @@
 
 libvulkan_intel = shared_library(
   'vulkan_intel',
-  [files('anv_gem.c'), block_entrypoints, anv_extensions_h],
+  [files('anv_gem.c'), anv_entrypoints[0], anv_extensions_h],
   include_directories : [
     inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
     inc_vulkan_wsi,
@@ -207,7 +208,7 @@
 if with_tests
   libvulkan_intel_test = static_library(
     'vulkan_intel_test',
-    [files('anv_gem_stubs.c'), block_entrypoints, anv_extensions_h],
+    [files('anv_gem_stubs.c'), anv_entrypoints[0], anv_extensions_h],
     include_directories : [
       inc_common, inc_intel, inc_compiler, inc_drm_uapi, inc_vulkan_util,
       inc_vulkan_wsi,
@@ -229,7 +230,7 @@
       'anv_@0@'.format(t),
       executable(
         t,
-        ['tests/@0@.c'.format(t), dummy_cpp, block_entrypoints],
+        ['tests/@0@.c'.format(t), anv_entrypoints[0], anv_extensions_h],
         link_with : libvulkan_intel_test,
         dependencies : [dep_libdrm, dep_thread, dep_m, dep_valgrind],
         include_directories : [
diff --git a/src/intel/vulkan/tests/block_pool_no_free.c b/src/intel/vulkan/tests/block_pool_no_free.c
index 7ffe858..0589d78 100644
--- a/src/intel/vulkan/tests/block_pool_no_free.c
+++ b/src/intel/vulkan/tests/block_pool_no_free.c
@@ -118,7 +118,7 @@
    anv_gem_connect(&device);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_block_pool_init(&pool, &device, 4096, 0);
+   anv_block_pool_init(&pool, &device, 4096, 4096, 0);
 
    for (unsigned i = 0; i < NUM_THREADS; i++) {
       jobs[i].pool = &pool;
diff --git a/src/intel/vulkan/tests/state_pool.c b/src/intel/vulkan/tests/state_pool.c
index 4986011..3f06519 100644
--- a/src/intel/vulkan/tests/state_pool.c
+++ b/src/intel/vulkan/tests/state_pool.c
@@ -44,7 +44,7 @@
    pthread_mutex_init(&device.mutex, NULL);
 
    for (unsigned i = 0; i < NUM_RUNS; i++) {
-      anv_state_pool_init(&state_pool, &device, 256, 0);
+      anv_state_pool_init(&state_pool, &device, 4096, 256, 0);
 
       /* Grab one so a zero offset is impossible */
       anv_state_pool_alloc(&state_pool, 16, 16);
diff --git a/src/intel/vulkan/tests/state_pool_free_list_only.c b/src/intel/vulkan/tests/state_pool_free_list_only.c
index 138d68b..7dd39a5 100644
--- a/src/intel/vulkan/tests/state_pool_free_list_only.c
+++ b/src/intel/vulkan/tests/state_pool_free_list_only.c
@@ -42,7 +42,7 @@
    anv_gem_connect(&device);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_state_pool_init(&state_pool, &device, 4096, 0);
+   anv_state_pool_init(&state_pool, &device, 4096, 4096, 0);
 
    /* Grab one so a zero offset is impossible */
    anv_state_pool_alloc(&state_pool, 16, 16);
diff --git a/src/intel/vulkan/tests/state_pool_no_free.c b/src/intel/vulkan/tests/state_pool_no_free.c
index 5076feb..1ff1960 100644
--- a/src/intel/vulkan/tests/state_pool_no_free.c
+++ b/src/intel/vulkan/tests/state_pool_no_free.c
@@ -63,7 +63,7 @@
    anv_gem_connect(&device);
 
    pthread_mutex_init(&device.mutex, NULL);
-   anv_state_pool_init(&state_pool, &device, 64, 0);
+   anv_state_pool_init(&state_pool, &device, 4096, 64, 0);
 
    pthread_barrier_init(&barrier, NULL, NUM_THREADS);
 
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 076045a..fc42b8e 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -64,6 +64,55 @@
 static struct loader_dri3_buffer *
 dri3_find_back_alloc(struct loader_dri3_drawable *draw);
 
+/* Return the screen of conn whose root window is root, or NULL if none. */
+static xcb_screen_t *
+get_screen_for_root(xcb_connection_t *conn, xcb_window_t root)
+{
+   xcb_screen_iterator_t it = xcb_setup_roots_iterator(xcb_get_setup(conn));
+
+   for (; it.rem; xcb_screen_next(&it)) {
+      if (it.data->root == root)
+         return it.data;
+   }
+
+   return NULL;
+}
+
+/* First visual listed for depth on draw's screen, or NULL if there is none. */
+static xcb_visualtype_t *
+get_xcb_visualtype_for_depth(struct loader_dri3_drawable *draw, int depth)
+{
+   xcb_depth_iterator_t d_it;
+
+   if (draw->screen == NULL)
+      return NULL;
+
+   d_it = xcb_screen_allowed_depths_iterator(draw->screen);
+   for (; d_it.rem; xcb_depth_next(&d_it)) {
+      if (d_it.data->depth == depth) {
+         xcb_visualtype_iterator_t v_it =
+            xcb_depth_visuals_iterator(d_it.data);
+
+         if (v_it.rem)
+            return v_it.data;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Red channel mask of the visual found for the drawable at the given depth,
+ * or 0 when there is no such visual.
+ */
+static unsigned int
+dri3_get_red_mask_for_depth(struct loader_dri3_drawable *draw, int depth)
+{
+   xcb_visualtype_t *vis = get_xcb_visualtype_for_depth(draw, depth);
+
+   return vis ? vis->red_mask : 0;
+}
+
 /**
  * Do we have blit functionality in the image blit extension?
  *
@@ -323,6 +372,7 @@
       return 1;
    }
 
+   draw->screen = get_screen_for_root(draw->conn, reply->root);
    draw->width = reply->width;
    draw->height = reply->height;
    draw->depth = reply->depth;
@@ -1022,6 +1072,7 @@
    case  __DRI_IMAGE_FORMAT_XBGR2101010:
    case  __DRI_IMAGE_FORMAT_ABGR2101010:
    case  __DRI_IMAGE_FORMAT_SARGB8:
+   case  __DRI_IMAGE_FORMAT_SABGR8:
       return 4;
    case  __DRI_IMAGE_FORMAT_NONE:
    default:
@@ -1029,6 +1080,36 @@
    }
 }
 
+/* Pick the format of the linear_buffer shared with the display gpu in a
+ * Prime setup (== is_different_gpu), given the render buffer's format.
+ * The two are normally identical.  The 2:10:10:10 formats are the exception:
+ * gpu vendors differ in the color channel ordering they prefer, so the
+ * ordering is derived from the red mask of the screen's depth 30 visual.
+ */
+static uint32_t
+dri3_linear_format_for_format(struct loader_dri3_drawable *draw, uint32_t format)
+{
+   switch (format) {
+   case __DRI_IMAGE_FORMAT_XRGB2101010:
+   case __DRI_IMAGE_FORMAT_XBGR2101010:
+      /* A red mask covering the low 10 bits selects the BGR-ordered
+       * format; any other mask selects the RGB-ordered one. */
+      if (dri3_get_red_mask_for_depth(draw, 30) == 0x3ff)
+         return __DRI_IMAGE_FORMAT_XBGR2101010;
+      return __DRI_IMAGE_FORMAT_XRGB2101010;
+
+   case __DRI_IMAGE_FORMAT_ARGB2101010:
+   case __DRI_IMAGE_FORMAT_ABGR2101010:
+      /* Same selection for the variants that carry alpha. */
+      if (dri3_get_red_mask_for_depth(draw, 30) == 0x3ff)
+         return __DRI_IMAGE_FORMAT_ABGR2101010;
+      return __DRI_IMAGE_FORMAT_ARGB2101010;
+
+   default:
+      return format;
+   }
+}
+
 /* the DRIimage createImage function takes __DRI_IMAGE_FORMAT codes, while
  * the createImageFromFds call takes __DRI_IMAGE_FOURCC codes. To avoid
  * complete confusion, just deal in __DRI_IMAGE_FORMAT codes for now and
@@ -1041,6 +1122,7 @@
    /* Convert from __DRI_IMAGE_FORMAT to __DRI_IMAGE_FOURCC (sigh) */
    switch (format) {
    case __DRI_IMAGE_FORMAT_SARGB8: return __DRI_IMAGE_FOURCC_SARGB8888;
+   case __DRI_IMAGE_FORMAT_SABGR8: return __DRI_IMAGE_FOURCC_SABGR8888;
    case __DRI_IMAGE_FORMAT_RGB565: return __DRI_IMAGE_FOURCC_RGB565;
    case __DRI_IMAGE_FORMAT_XRGB8888: return __DRI_IMAGE_FOURCC_XRGB8888;
    case __DRI_IMAGE_FORMAT_ARGB8888: return __DRI_IMAGE_FOURCC_ARGB8888;
@@ -1225,7 +1307,8 @@
 
       buffer->linear_buffer =
         draw->ext->image->createImage(draw->dri_screen,
-                                      width, height, format,
+                                      width, height,
+                                      dri3_linear_format_for_format(draw, format),
                                       __DRI_IMAGE_USE_SHARE |
                                       __DRI_IMAGE_USE_LINEAR |
                                       __DRI_IMAGE_USE_BACKBUFFER,
@@ -1654,6 +1737,7 @@
                 struct loader_dri3_drawable *draw)
 {
    struct loader_dri3_buffer *buffer;
+   bool fence_await = buffer_type == loader_dri3_buffer_back;
    int buf_id;
 
    if (buffer_type == loader_dri3_buffer_back) {
@@ -1710,6 +1794,7 @@
                            0, 0, 0, 0,
                            draw->width, draw->height);
             dri3_fence_trigger(draw->conn, new_buffer);
+            fence_await = true;
          }
          dri3_free_render_buffer(draw, buffer);
       } else if (buffer_type == loader_dri3_buffer_front) {
@@ -1731,13 +1816,14 @@
                                           new_buffer->linear_buffer,
                                           0, 0, draw->width, draw->height,
                                           0, 0, 0);
-         }
+         } else
+            fence_await = true;
       }
       buffer = new_buffer;
       draw->buffers[buf_id] = buffer;
    }
 
-   if (buffer_type == loader_dri3_buffer_back)
+   if (fence_await)
       dri3_fence_await(draw->conn, draw, buffer);
 
    /*
diff --git a/src/loader/loader_dri3_helper.h b/src/loader/loader_dri3_helper.h
index 51d0003..0d18181 100644
--- a/src/loader/loader_dri3_helper.h
+++ b/src/loader/loader_dri3_helper.h
@@ -112,6 +112,7 @@
 
 struct loader_dri3_drawable {
    xcb_connection_t *conn;
+   xcb_screen_t *screen;
    __DRIdrawable *dri_drawable;
    xcb_drawable_t drawable;
    xcb_window_t window;
diff --git a/src/mapi/glapi/gen/AMD_performance_monitor.xml b/src/mapi/glapi/gen/AMD_performance_monitor.xml
index b29dc5d..785ea07 100644
--- a/src/mapi/glapi/gen/AMD_performance_monitor.xml
+++ b/src/mapi/glapi/gen/AMD_performance_monitor.xml
@@ -5,13 +5,13 @@
 
 <category name="GL_AMD_performance_monitor" number="360">
 
-    <function name="GetPerfMonitorGroupsAMD">
+    <function name="GetPerfMonitorGroupsAMD" es2="2.0">
         <param name="numGroups" type="GLint *"/>
         <param name="groupsSize" type="GLsizei"/>
         <param name="groups" type="GLuint *"/>
     </function>
 
-    <function name="GetPerfMonitorCountersAMD">
+    <function name="GetPerfMonitorCountersAMD" es2="2.0">
         <param name="group" type="GLuint"/>
         <param name="numCounters" type="GLint *"/>
         <param name="maxActiveCounters" type="GLint *"/>
@@ -19,14 +19,14 @@
         <param name="counters" type="GLuint *"/>
     </function>
 
-    <function name="GetPerfMonitorGroupStringAMD">
+    <function name="GetPerfMonitorGroupStringAMD" es2="2.0">
         <param name="group" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
         <param name="groupString" type="GLchar *"/>
     </function>
 
-    <function name="GetPerfMonitorCounterStringAMD">
+    <function name="GetPerfMonitorCounterStringAMD" es2="2.0">
         <param name="group" type="GLuint"/>
         <param name="counter" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
@@ -34,24 +34,24 @@
         <param name="counterString" type="GLchar *"/>
     </function>
 
-    <function name="GetPerfMonitorCounterInfoAMD">
+    <function name="GetPerfMonitorCounterInfoAMD" es2="2.0">
         <param name="group" type="GLuint"/>
         <param name="counter" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="data" type="GLvoid *"/>
     </function>
 
-    <function name="GenPerfMonitorsAMD">
+    <function name="GenPerfMonitorsAMD" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="monitors" type="GLuint *"/>
     </function>
 
-    <function name="DeletePerfMonitorsAMD">
+    <function name="DeletePerfMonitorsAMD" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="monitors" type="GLuint *"/>
     </function>
 
-    <function name="SelectPerfMonitorCountersAMD">
+    <function name="SelectPerfMonitorCountersAMD" es2="2.0">
         <param name="monitor" type="GLuint"/>
         <param name="enable" type="GLboolean"/>
         <param name="group" type="GLuint"/>
@@ -59,15 +59,15 @@
         <param name="counterList" type="GLuint *"/>
     </function>
 
-    <function name="BeginPerfMonitorAMD">
+    <function name="BeginPerfMonitorAMD" es2="2.0">
         <param name="monitor" type="GLuint"/>
     </function>
 
-    <function name="EndPerfMonitorAMD">
+    <function name="EndPerfMonitorAMD" es2="2.0">
         <param name="monitor" type="GLuint"/>
     </function>
 
-    <function name="GetPerfMonitorCounterDataAMD">
+    <function name="GetPerfMonitorCounterDataAMD" es2="2.0">
         <param name="monitor" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="dataSize" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index b5e0ad4..b163d88 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -46,7 +46,6 @@
         if compatibility is not None:
             assert isinstance(compatibility, int)
             assert compatibility >= 10
-            assert compatibility <= 30
 
         if core is not None:
             assert isinstance(core, int)
@@ -70,40 +69,40 @@
     "TexBuffer": exec_info(compatibility=20, core=31, es2=31),
 
     # OpenGL 3.2 / GL_OES_geometry_shader.
-    "FramebufferTexture": exec_info(core=32, es2=31),
+    "FramebufferTexture": exec_info(compatibility=32, core=32, es2=31),
 
     # OpenGL 4.0 / GL_ARB_shader_subroutines. Mesa only exposes this
     # extension with core profile.
-    "GetSubroutineUniformLocation": exec_info(core=31),
-    "GetSubroutineIndex": exec_info(core=31),
-    "GetActiveSubroutineUniformiv": exec_info(core=31),
-    "GetActiveSubroutineUniformName": exec_info(core=31),
-    "GetActiveSubroutineName": exec_info(core=31),
-    "UniformSubroutinesuiv": exec_info(core=31),
-    "GetUniformSubroutineuiv": exec_info(core=31),
-    "GetProgramStageiv": exec_info(core=31),
+    "GetSubroutineUniformLocation": exec_info(compatibility=31, core=31),
+    "GetSubroutineIndex": exec_info(compatibility=31, core=31),
+    "GetActiveSubroutineUniformiv": exec_info(compatibility=31, core=31),
+    "GetActiveSubroutineUniformName": exec_info(compatibility=31, core=31),
+    "GetActiveSubroutineName": exec_info(compatibility=31, core=31),
+    "UniformSubroutinesuiv": exec_info(compatibility=31, core=31),
+    "GetUniformSubroutineuiv": exec_info(compatibility=31, core=31),
+    "GetProgramStageiv": exec_info(compatibility=31, core=31),
 
     # OpenGL 4.0 / GL_ARB_gpu_shader_fp64.  The extension spec says:
     #
     #     "OpenGL 3.2 and GLSL 1.50 are required."
-    "Uniform1d": exec_info(core=32),
-    "Uniform2d": exec_info(core=32),
-    "Uniform3d": exec_info(core=32),
-    "Uniform4d": exec_info(core=32),
-    "Uniform1dv": exec_info(core=32),
-    "Uniform2dv": exec_info(core=32),
-    "Uniform3dv": exec_info(core=32),
-    "Uniform4dv": exec_info(core=32),
-    "UniformMatrix2dv": exec_info(core=32),
-    "UniformMatrix3dv": exec_info(core=32),
-    "UniformMatrix4dv": exec_info(core=32),
-    "UniformMatrix2x3dv": exec_info(core=32),
-    "UniformMatrix2x4dv": exec_info(core=32),
-    "UniformMatrix3x2dv": exec_info(core=32),
-    "UniformMatrix3x4dv": exec_info(core=32),
-    "UniformMatrix4x2dv": exec_info(core=32),
-    "UniformMatrix4x3dv": exec_info(core=32),
-    "GetUniformdv": exec_info(core=32),
+    "Uniform1d": exec_info(compatibility=32, core=32),
+    "Uniform2d": exec_info(compatibility=32, core=32),
+    "Uniform3d": exec_info(compatibility=32, core=32),
+    "Uniform4d": exec_info(compatibility=32, core=32),
+    "Uniform1dv": exec_info(compatibility=32, core=32),
+    "Uniform2dv": exec_info(compatibility=32, core=32),
+    "Uniform3dv": exec_info(compatibility=32, core=32),
+    "Uniform4dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix2dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix3dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix4dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix2x3dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix2x4dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix3x2dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix3x4dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix4x2dv": exec_info(compatibility=32, core=32),
+    "UniformMatrix4x3dv": exec_info(compatibility=32, core=32),
+    "GetUniformdv": exec_info(compatibility=32, core=32),
 
     # OpenGL 4.1 / GL_ARB_vertex_attrib_64bit.  The extension spec says:
     #
@@ -114,16 +113,16 @@
     # For Mesa this effectively means OpenGL 3.2 is required.  It seems
     # unlikely that Mesa will ever get support for any of the NV extensions
     # that add "equivalent functionality."
-    "VertexAttribL1d": exec_info(core=32),
-    "VertexAttribL2d": exec_info(core=32),
-    "VertexAttribL3d": exec_info(core=32),
-    "VertexAttribL4d": exec_info(core=32),
-    "VertexAttribL1dv": exec_info(core=32),
-    "VertexAttribL2dv": exec_info(core=32),
-    "VertexAttribL3dv": exec_info(core=32),
-    "VertexAttribL4dv": exec_info(core=32),
-    "VertexAttribLPointer": exec_info(core=32),
-    "GetVertexAttribLdv": exec_info(core=32),
+    "VertexAttribL1d": exec_info(compatibility=32, core=32),
+    "VertexAttribL2d": exec_info(compatibility=32, core=32),
+    "VertexAttribL3d": exec_info(compatibility=32, core=32),
+    "VertexAttribL4d": exec_info(compatibility=32, core=32),
+    "VertexAttribL1dv": exec_info(compatibility=32, core=32),
+    "VertexAttribL2dv": exec_info(compatibility=32, core=32),
+    "VertexAttribL3dv": exec_info(compatibility=32, core=32),
+    "VertexAttribL4dv": exec_info(compatibility=32, core=32),
+    "VertexAttribLPointer": exec_info(compatibility=32, core=32),
+    "GetVertexAttribLdv": exec_info(compatibility=32, core=32),
 
     # OpenGL 4.1 / GL_ARB_viewport_array.  The extension spec says:
     #
@@ -132,14 +131,14 @@
     #
     # Mesa does not support either of the geometry shader extensions, so
     # OpenGL 3.2 is required.
-    "ViewportArrayv": exec_info(core=32, es2=31),
-    "ViewportIndexedf": exec_info(core=32, es2=31),
-    "ViewportIndexedfv": exec_info(core=32, es2=31),
-    "ScissorArrayv": exec_info(core=32, es2=31),
-    "ScissorIndexed": exec_info(core=32, es2=31),
-    "ScissorIndexedv": exec_info(core=32, es2=31),
-    "DepthRangeArrayv": exec_info(core=32),
-    "DepthRangeIndexed": exec_info(core=32),
+    "ViewportArrayv": exec_info(compatibility=32, core=32, es2=31),
+    "ViewportIndexedf": exec_info(compatibility=32, core=32, es2=31),
+    "ViewportIndexedfv": exec_info(compatibility=32, core=32, es2=31),
+    "ScissorArrayv": exec_info(compatibility=32, core=32, es2=31),
+    "ScissorIndexed": exec_info(compatibility=32, core=32, es2=31),
+    "ScissorIndexedv": exec_info(compatibility=32, core=32, es2=31),
+    "DepthRangeArrayv": exec_info(compatibility=32, core=32),
+    "DepthRangeIndexed": exec_info(compatibility=32, core=32),
     # GetFloati_v also GL_ARB_shader_atomic_counters
     # GetDoublei_v also GL_ARB_shader_atomic_counters
 
@@ -153,103 +152,103 @@
 
     # OpenGL 4.5 / GL_ARB_direct_state_access.   Mesa can expose the extension
     # with core profile.
-    "CreateTransformFeedbacks": exec_info(core=31),
-    "TransformFeedbackBufferBase": exec_info(core=31),
-    "TransformFeedbackBufferRange": exec_info(core=31),
-    "GetTransformFeedbackiv": exec_info(core=31),
-    "GetTransformFeedbacki_v": exec_info(core=31),
-    "GetTransformFeedbacki64_v": exec_info(core=31),
-    "CreateBuffers": exec_info(core=31),
-    "NamedBufferStorage": exec_info(core=31),
-    "NamedBufferData": exec_info(core=31),
-    "NamedBufferSubData": exec_info(core=31),
-    "CopyNamedBufferSubData": exec_info(core=31),
-    "ClearNamedBufferData": exec_info(core=31),
-    "ClearNamedBufferSubData": exec_info(core=31),
-    "MapNamedBuffer": exec_info(core=31),
-    "MapNamedBufferRange": exec_info(core=31),
-    "UnmapNamedBuffer": exec_info(core=31),
-    "FlushMappedNamedBufferRange": exec_info(core=31),
-    "GetNamedBufferParameteriv": exec_info(core=31),
-    "GetNamedBufferParameteri64v": exec_info(core=31),
-    "GetNamedBufferPointerv": exec_info(core=31),
-    "GetNamedBufferSubData": exec_info(core=31),
-    "CreateFramebuffers": exec_info(core=31),
-    "NamedFramebufferRenderbuffer": exec_info(core=31),
-    "NamedFramebufferParameteri": exec_info(core=31),
-    "NamedFramebufferTexture": exec_info(core=31),
-    "NamedFramebufferTextureLayer": exec_info(core=31),
-    "NamedFramebufferDrawBuffer": exec_info(core=31),
-    "NamedFramebufferDrawBuffers": exec_info(core=31),
-    "NamedFramebufferReadBuffer": exec_info(core=31),
-    "InvalidateNamedFramebufferData": exec_info(core=31),
-    "InvalidateNamedFramebufferSubData": exec_info(core=31),
-    "ClearNamedFramebufferiv": exec_info(core=31),
-    "ClearNamedFramebufferuiv": exec_info(core=31),
-    "ClearNamedFramebufferfv": exec_info(core=31),
-    "ClearNamedFramebufferfi": exec_info(core=31),
-    "BlitNamedFramebuffer": exec_info(core=31),
-    "CheckNamedFramebufferStatus": exec_info(core=31),
-    "GetNamedFramebufferParameteriv": exec_info(core=31),
-    "GetNamedFramebufferAttachmentParameteriv": exec_info(core=31),
-    "CreateRenderbuffers": exec_info(core=31),
-    "NamedRenderbufferStorage": exec_info(core=31),
-    "NamedRenderbufferStorageMultisample": exec_info(core=31),
-    "GetNamedRenderbufferParameteriv": exec_info(core=31),
-    "CreateTextures": exec_info(core=31),
-    "TextureBuffer": exec_info(core=31),
-    "TextureBufferRange": exec_info(core=31),
-    "TextureStorage1D": exec_info(core=31),
-    "TextureStorage2D": exec_info(core=31),
-    "TextureStorage3D": exec_info(core=31),
-    "TextureStorage2DMultisample": exec_info(core=31),
-    "TextureStorage3DMultisample": exec_info(core=31),
-    "TextureSubImage1D": exec_info(core=31),
-    "TextureSubImage2D": exec_info(core=31),
-    "TextureSubImage3D": exec_info(core=31),
-    "CompressedTextureSubImage1D": exec_info(core=31),
-    "CompressedTextureSubImage2D": exec_info(core=31),
-    "CompressedTextureSubImage3D": exec_info(core=31),
-    "CopyTextureSubImage1D": exec_info(core=31),
-    "CopyTextureSubImage2D": exec_info(core=31),
-    "CopyTextureSubImage3D": exec_info(core=31),
-    "TextureParameterf": exec_info(core=31),
-    "TextureParameterfv": exec_info(core=31),
-    "TextureParameteri": exec_info(core=31),
-    "TextureParameterIiv": exec_info(core=31),
-    "TextureParameterIuiv": exec_info(core=31),
-    "TextureParameteriv": exec_info(core=31),
-    "GenerateTextureMipmap": exec_info(core=31),
-    "BindTextureUnit": exec_info(core=31),
-    "GetTextureImage": exec_info(core=31),
-    "GetCompressedTextureImage": exec_info(core=31),
-    "GetTextureLevelParameterfv": exec_info(core=31),
-    "GetTextureLevelParameteriv": exec_info(core=31),
-    "GetTextureParameterfv": exec_info(core=31),
-    "GetTextureParameterIiv": exec_info(core=31),
-    "GetTextureParameterIuiv": exec_info(core=31),
-    "GetTextureParameteriv": exec_info(core=31),
-    "CreateVertexArrays": exec_info(core=31),
-    "DisableVertexArrayAttrib": exec_info(core=31),
-    "EnableVertexArrayAttrib": exec_info(core=31),
-    "VertexArrayElementBuffer": exec_info(core=31),
-    "VertexArrayVertexBuffer": exec_info(core=31),
-    "VertexArrayVertexBuffers": exec_info(core=31),
-    "VertexArrayAttribFormat": exec_info(core=31),
-    "VertexArrayAttribIFormat": exec_info(core=31),
-    "VertexArrayAttribLFormat": exec_info(core=31),
-    "VertexArrayAttribBinding": exec_info(core=31),
-    "VertexArrayBindingDivisor": exec_info(core=31),
-    "GetVertexArrayiv": exec_info(core=31),
-    "GetVertexArrayIndexediv": exec_info(core=31),
-    "GetVertexArrayIndexed64iv": exec_info(core=31),
-    "CreateSamplers": exec_info(core=31),
-    "CreateProgramPipelines": exec_info(core=31),
-    "CreateQueries": exec_info(core=31),
-    "GetQueryBufferObjectiv": exec_info(core=31),
-    "GetQueryBufferObjectuiv": exec_info(core=31),
-    "GetQueryBufferObjecti64v": exec_info(core=31),
-    "GetQueryBufferObjectui64v": exec_info(core=31),
+    "CreateTransformFeedbacks": exec_info(compatibility=45, core=31),
+    "TransformFeedbackBufferBase": exec_info(compatibility=45, core=31),
+    "TransformFeedbackBufferRange": exec_info(compatibility=45, core=31),
+    "GetTransformFeedbackiv": exec_info(compatibility=45, core=31),
+    "GetTransformFeedbacki_v": exec_info(compatibility=45, core=31),
+    "GetTransformFeedbacki64_v": exec_info(compatibility=45, core=31),
+    "CreateBuffers": exec_info(compatibility=45, core=31),
+    "NamedBufferStorage": exec_info(compatibility=45, core=31),
+    "NamedBufferData": exec_info(compatibility=45, core=31),
+    "NamedBufferSubData": exec_info(compatibility=45, core=31),
+    "CopyNamedBufferSubData": exec_info(compatibility=45, core=31),
+    "ClearNamedBufferData": exec_info(compatibility=45, core=31),
+    "ClearNamedBufferSubData": exec_info(compatibility=45, core=31),
+    "MapNamedBuffer": exec_info(compatibility=45, core=31),
+    "MapNamedBufferRange": exec_info(compatibility=45, core=31),
+    "UnmapNamedBuffer": exec_info(compatibility=45, core=31),
+    "FlushMappedNamedBufferRange": exec_info(compatibility=45, core=31),
+    "GetNamedBufferParameteriv": exec_info(compatibility=45, core=31),
+    "GetNamedBufferParameteri64v": exec_info(compatibility=45, core=31),
+    "GetNamedBufferPointerv": exec_info(compatibility=45, core=31),
+    "GetNamedBufferSubData": exec_info(compatibility=45, core=31),
+    "CreateFramebuffers": exec_info(compatibility=45, core=31),
+    "NamedFramebufferRenderbuffer": exec_info(compatibility=45, core=31),
+    "NamedFramebufferParameteri": exec_info(compatibility=45, core=31),
+    "NamedFramebufferTexture": exec_info(compatibility=45, core=31),
+    "NamedFramebufferTextureLayer": exec_info(compatibility=45, core=31),
+    "NamedFramebufferDrawBuffer": exec_info(compatibility=45, core=31),
+    "NamedFramebufferDrawBuffers": exec_info(compatibility=45, core=31),
+    "NamedFramebufferReadBuffer": exec_info(compatibility=45, core=31),
+    "InvalidateNamedFramebufferData": exec_info(compatibility=45, core=31),
+    "InvalidateNamedFramebufferSubData": exec_info(compatibility=45, core=31),
+    "ClearNamedFramebufferiv": exec_info(compatibility=45, core=31),
+    "ClearNamedFramebufferuiv": exec_info(compatibility=45, core=31),
+    "ClearNamedFramebufferfv": exec_info(compatibility=45, core=31),
+    "ClearNamedFramebufferfi": exec_info(compatibility=45, core=31),
+    "BlitNamedFramebuffer": exec_info(compatibility=45, core=31),
+    "CheckNamedFramebufferStatus": exec_info(compatibility=45, core=31),
+    "GetNamedFramebufferParameteriv": exec_info(compatibility=45, core=31),
+    "GetNamedFramebufferAttachmentParameteriv": exec_info(compatibility=45, core=31),
+    "CreateRenderbuffers": exec_info(compatibility=45, core=31),
+    "NamedRenderbufferStorage": exec_info(compatibility=45, core=31),
+    "NamedRenderbufferStorageMultisample": exec_info(compatibility=45, core=31),
+    "GetNamedRenderbufferParameteriv": exec_info(compatibility=45, core=31),
+    "CreateTextures": exec_info(compatibility=45, core=31),
+    "TextureBuffer": exec_info(compatibility=45, core=31),
+    "TextureBufferRange": exec_info(compatibility=45, core=31),
+    "TextureStorage1D": exec_info(compatibility=45, core=31),
+    "TextureStorage2D": exec_info(compatibility=45, core=31),
+    "TextureStorage3D": exec_info(compatibility=45, core=31),
+    "TextureStorage2DMultisample": exec_info(compatibility=45, core=31),
+    "TextureStorage3DMultisample": exec_info(compatibility=45, core=31),
+    "TextureSubImage1D": exec_info(compatibility=45, core=31),
+    "TextureSubImage2D": exec_info(compatibility=45, core=31),
+    "TextureSubImage3D": exec_info(compatibility=45, core=31),
+    "CompressedTextureSubImage1D": exec_info(compatibility=45, core=31),
+    "CompressedTextureSubImage2D": exec_info(compatibility=45, core=31),
+    "CompressedTextureSubImage3D": exec_info(compatibility=45, core=31),
+    "CopyTextureSubImage1D": exec_info(compatibility=45, core=31),
+    "CopyTextureSubImage2D": exec_info(compatibility=45, core=31),
+    "CopyTextureSubImage3D": exec_info(compatibility=45, core=31),
+    "TextureParameterf": exec_info(compatibility=45, core=31),
+    "TextureParameterfv": exec_info(compatibility=45, core=31),
+    "TextureParameteri": exec_info(compatibility=45, core=31),
+    "TextureParameterIiv": exec_info(compatibility=45, core=31),
+    "TextureParameterIuiv": exec_info(compatibility=45, core=31),
+    "TextureParameteriv": exec_info(compatibility=45, core=31),
+    "GenerateTextureMipmap": exec_info(compatibility=45, core=31),
+    "BindTextureUnit": exec_info(compatibility=45, core=31),
+    "GetTextureImage": exec_info(compatibility=45, core=31),
+    "GetCompressedTextureImage": exec_info(compatibility=45, core=31),
+    "GetTextureLevelParameterfv": exec_info(compatibility=45, core=31),
+    "GetTextureLevelParameteriv": exec_info(compatibility=45, core=31),
+    "GetTextureParameterfv": exec_info(compatibility=45, core=31),
+    "GetTextureParameterIiv": exec_info(compatibility=45, core=31),
+    "GetTextureParameterIuiv": exec_info(compatibility=45, core=31),
+    "GetTextureParameteriv": exec_info(compatibility=45, core=31),
+    "CreateVertexArrays": exec_info(compatibility=45, core=31),
+    "DisableVertexArrayAttrib": exec_info(compatibility=45, core=31),
+    "EnableVertexArrayAttrib": exec_info(compatibility=45, core=31),
+    "VertexArrayElementBuffer": exec_info(compatibility=45, core=31),
+    "VertexArrayVertexBuffer": exec_info(compatibility=45, core=31),
+    "VertexArrayVertexBuffers": exec_info(compatibility=45, core=31),
+    "VertexArrayAttribFormat": exec_info(compatibility=45, core=31),
+    "VertexArrayAttribIFormat": exec_info(compatibility=45, core=31),
+    "VertexArrayAttribLFormat": exec_info(compatibility=45, core=31),
+    "VertexArrayAttribBinding": exec_info(compatibility=45, core=31),
+    "VertexArrayBindingDivisor": exec_info(compatibility=45, core=31),
+    "GetVertexArrayiv": exec_info(compatibility=45, core=31),
+    "GetVertexArrayIndexediv": exec_info(compatibility=45, core=31),
+    "GetVertexArrayIndexed64iv": exec_info(compatibility=45, core=31),
+    "CreateSamplers": exec_info(compatibility=45, core=31),
+    "CreateProgramPipelines": exec_info(compatibility=45, core=31),
+    "CreateQueries": exec_info(compatibility=45, core=31),
+    "GetQueryBufferObjectiv": exec_info(compatibility=45, core=31),
+    "GetQueryBufferObjectuiv": exec_info(compatibility=45, core=31),
+    "GetQueryBufferObjecti64v": exec_info(compatibility=45, core=31),
+    "GetQueryBufferObjectui64v": exec_info(compatibility=45, core=31),
 
     # GL_ARB_gpu_shader_int64 - nominally requires OpenGL 4.0, and Mesa
     # only supports 4.0 in core profile.
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index a53fcd1..459f642 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -1445,4 +1445,19 @@
     </function>
 </category>
 
+<!-- 218. GL_OES_texture_view -->
+
+<category name="GL_OES_texture_view" number="218">
+    <function name="TextureViewOES" es2="3.1" alias="TextureView">
+        <param name="texture" type="GLuint"/>
+        <param name="target" type="GLenum"/>
+        <param name="origtexture" type="GLuint"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="minlevel" type="GLuint"/>
+        <param name="numlevels" type="GLuint"/>
+        <param name="minlayer" type="GLuint"/>
+        <param name="numlayers" type="GLuint"/>
+    </function>
+</category>
+
 </OpenGLAPI>
diff --git a/src/mapi/glapi/gen/glX_XML.py b/src/mapi/glapi/gen/glX_XML.py
index b6d305c..ff0011b 100644
--- a/src/mapi/glapi/gen/glX_XML.py
+++ b/src/mapi/glapi/gen/glX_XML.py
@@ -64,7 +64,7 @@
                 else:
                     mode = 1
 
-                if not self.functions.has_key(n):
+                if n not in self.functions:
                     self.functions[ n ] = [c, mode]
 
         return
@@ -296,7 +296,7 @@
         parameters.extend( temp[1] )
         if include_variable_parameters:
             parameters.extend( temp[2] )
-        return parameters.__iter__()
+        return iter(parameters)
 
 
     def parameterIterateCounters(self):
@@ -304,7 +304,7 @@
         for name in self.counter_list:
             temp.append( self.parameters_by_name[ name ] )
 
-        return temp.__iter__()
+        return iter(temp)
 
 
     def parameterIterateOutputs(self):
@@ -470,7 +470,7 @@
     def needs_reply(self):
         try:
             x = self._needs_reply
-        except Exception, e:
+        except Exception:
             x = 0
             if self.return_type != 'void':
                 x = 1
@@ -547,13 +547,14 @@
         return self
 
 
-    def next(self):
-        f = self.iterator.next()
+    def __next__(self):
+        while True:
+            f = next(self.iterator)
 
-        if f.client_supported_for_indirect():
-            return f
-        else:
-            return self.next()
+            if f.client_supported_for_indirect():
+                return f
+
+    next = __next__
 
 
 class glx_api(gl_XML.gl_api):
diff --git a/src/mapi/glapi/gen/glX_proto_common.py b/src/mapi/glapi/gen/glX_proto_common.py
index bd1192c..0559dd1 100644
--- a/src/mapi/glapi/gen/glX_proto_common.py
+++ b/src/mapi/glapi/gen/glX_proto_common.py
@@ -24,8 +24,9 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import gl_XML, glX_XML
-import string
 
 
 class glx_proto_item_factory(glX_XML.glx_item_factory):
@@ -65,7 +66,7 @@
                     return compsize
 
                 elif len(param.count_parameter_list):
-                    parameters = string.join( param.count_parameter_list, "," )
+                    parameters = ",".join( param.count_parameter_list )
                     compsize = "__gl%s_size(%s)" % (func.name, parameters)
 
                     return compsize
@@ -80,12 +81,12 @@
 
         compsize = self.size_call(f)
         if compsize:
-            print '    const GLuint compsize = %s;' % (compsize)
+            print('    const GLuint compsize = %s;' % (compsize))
 
         if bias:
-            print '    const GLuint cmdlen = %s - %u;' % (f.command_length(), bias)
+            print('    const GLuint cmdlen = %s - %u;' % (f.command_length(), bias))
         else:
-            print '    const GLuint cmdlen = %s;' % (f.command_length())
+            print('    const GLuint cmdlen = %s;' % (f.command_length()))
 
         #print ''
         return compsize
diff --git a/src/mapi/glapi/gen/glX_proto_send.py b/src/mapi/glapi/gen/glX_proto_send.py
index d458214..a920ecc 100644
--- a/src/mapi/glapi/gen/glX_proto_send.py
+++ b/src/mapi/glapi/gen/glX_proto_send.py
@@ -26,10 +26,12 @@
 #    Ian Romanick <idr@us.ibm.com>
 #    Jeremy Kolb <jkolb@brandeis.edu>
 
+from __future__ import print_function
+
 import argparse
 
 import gl_XML, glX_XML, glX_proto_common, license
-import copy, string
+import copy
 
 def convertStringForXCB(str):
     tmp = ""
@@ -37,10 +39,10 @@
     i = 0
     while i < len(str):
         if str[i:i+3] in special:
-            tmp = '%s_%s' % (tmp, string.lower(str[i:i+3]))
+            tmp = '%s_%s' % (tmp, str[i:i+3].lower())
             i = i + 2;
         elif str[i].isupper():
-            tmp = '%s_%s' % (tmp, string.lower(str[i]))
+            tmp = '%s_%s' % (tmp, str[i].lower())
         else:
             tmp = '%s%s' % (tmp, str[i])
         i += 1
@@ -163,58 +165,58 @@
         return
 
     def printRealHeader(self):
-        print ''
-        print '#include <GL/gl.h>'
-        print '#include "indirect.h"'
-        print '#include "glxclient.h"'
-        print '#include "indirect_size.h"'
-        print '#include "glapi.h"'
-        print '#include <GL/glxproto.h>'
-        print '#include <X11/Xlib-xcb.h>'
-        print '#include <xcb/xcb.h>'
-        print '#include <xcb/glx.h>'
-        print '#include <limits.h>'
+        print('')
+        print('#include <GL/gl.h>')
+        print('#include "indirect.h"')
+        print('#include "glxclient.h"')
+        print('#include "indirect_size.h"')
+        print('#include "glapi.h"')
+        print('#include <GL/glxproto.h>')
+        print('#include <X11/Xlib-xcb.h>')
+        print('#include <xcb/xcb.h>')
+        print('#include <xcb/glx.h>')
+        print('#include <limits.h>')
 
-        print ''
+        print('')
         self.printFastcall()
         self.printNoinline()
-        print ''
+        print('')
 
-        print 'static _X_INLINE int safe_add(int a, int b)'
-        print '{'
-        print '    if (a < 0 || b < 0) return -1;'
-        print '    if (INT_MAX - a < b) return -1;'
-        print '    return a + b;'
-        print '}'
-        print 'static _X_INLINE int safe_mul(int a, int b)'
-        print '{'
-        print '    if (a < 0 || b < 0) return -1;'
-        print '    if (a == 0 || b == 0) return 0;'
-        print '    if (a > INT_MAX / b) return -1;'
-        print '    return a * b;'
-        print '}'
-        print 'static _X_INLINE int safe_pad(int a)'
-        print '{'
-        print '    int ret;'
-        print '    if (a < 0) return -1;'
-        print '    if ((ret = safe_add(a, 3)) < 0) return -1;'
-        print '    return ret & (GLuint)~3;'
-        print '}'
-        print ''
+        print('static _X_INLINE int safe_add(int a, int b)')
+        print('{')
+        print('    if (a < 0 || b < 0) return -1;')
+        print('    if (INT_MAX - a < b) return -1;')
+        print('    return a + b;')
+        print('}')
+        print('static _X_INLINE int safe_mul(int a, int b)')
+        print('{')
+        print('    if (a < 0 || b < 0) return -1;')
+        print('    if (a == 0 || b == 0) return 0;')
+        print('    if (a > INT_MAX / b) return -1;')
+        print('    return a * b;')
+        print('}')
+        print('static _X_INLINE int safe_pad(int a)')
+        print('{')
+        print('    int ret;')
+        print('    if (a < 0) return -1;')
+        print('    if ((ret = safe_add(a, 3)) < 0) return -1;')
+        print('    return ret & (GLuint)~3;')
+        print('}')
+        print('')
 
-        print '#ifndef __GNUC__'
-        print '#  define __builtin_expect(x, y) x'
-        print '#endif'
-        print ''
-        print '/* If the size and opcode values are known at compile-time, this will, on'
-        print ' * x86 at least, emit them with a single instruction.'
-        print ' */'
-        print '#define emit_header(dest, op, size)            \\'
-        print '    do { union { short s[2]; int i; } temp;    \\'
-        print '         temp.s[0] = (size); temp.s[1] = (op); \\'
-        print '         *((int *)(dest)) = temp.i; } while(0)'
-        print ''
-        print """NOINLINE CARD32
+        print('#ifndef __GNUC__')
+        print('#  define __builtin_expect(x, y) x')
+        print('#endif')
+        print('')
+        print('/* If the size and opcode values are known at compile-time, this will, on')
+        print(' * x86 at least, emit them with a single instruction.')
+        print(' */')
+        print('#define emit_header(dest, op, size)            \\')
+        print('    do { union { short s[2]; int i; } temp;    \\')
+        print('         temp.s[0] = (size); temp.s[1] = (op); \\')
+        print('         *((int *)(dest)) = temp.i; } while(0)')
+        print('')
+        print("""NOINLINE CARD32
 __glXReadReply( Display *dpy, size_t size, void * dest, GLboolean reply_is_always_array )
 {
     xGLXSingleReply reply;
@@ -326,7 +328,7 @@
 #define default_pixel_store_3D_size 36
 #define default_pixel_store_4D      (__glXDefaultPixelStore+0)
 #define default_pixel_store_4D_size 36
-"""
+""")
 
         for size in self.generic_sizes:
             self.print_generic_function(size)
@@ -381,20 +383,19 @@
                 if func.has_different_protocol(n):
                     procs[n] = func.static_glx_name(n)
 
-        print """
+        print("""
 #ifdef GLX_INDIRECT_RENDERING
 
 static const struct proc_pair
 {
    const char *name;
    _glapi_proc proc;
-} proc_pairs[%d] = {""" % len(procs)
-        names = procs.keys()
-        names.sort()
-        for i in xrange(len(names)):
+} proc_pairs[%d] = {""" % len(procs))
+        names = sorted(procs.keys())
+        for i in range(len(names)):
             comma = ',' if i < len(names) - 1 else ''
-            print '   { "%s", (_glapi_proc) gl%s }%s' % (names[i], procs[names[i]], comma)
-        print """};
+            print('   { "%s", (_glapi_proc) gl%s }%s' % (names[i], procs[names[i]], comma))
+        print("""};
 
 static int
 __indirect_get_proc_compare(const void *key, const void *memb)
@@ -419,16 +420,16 @@
 }
 
 #endif /* GLX_INDIRECT_RENDERING */
-"""
+""")
         return
 
 
     def printFunction(self, func, name):
         footer = '}\n'
         if func.glx_rop == ~0:
-            print 'static %s' % (func.return_type)
-            print '%s( unsigned opcode, unsigned dim, %s )' % (func.name, func.get_parameter_string())
-            print '{'
+            print('static %s' % (func.return_type))
+            print('%s( unsigned opcode, unsigned dim, %s )' % (func.name, func.get_parameter_string()))
+            print('{')
         else:
             if func.has_different_protocol(name):
                 if func.return_type == "void":
@@ -437,27 +438,27 @@
                     ret_string = "return "
 
                 func_name = func.static_glx_name(name)
-                print '#define %s %d' % (func.opcode_vendor_name(name), func.glx_vendorpriv)
-                print '%s gl%s(%s)' % (func.return_type, func_name, func.get_parameter_string())
-                print '{'
-                print '    struct glx_context * const gc = __glXGetCurrentContext();'
-                print ''
-                print '#if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)'
-                print '    if (gc->isDirect) {'
-                print '        const _glapi_proc *const disp_table = (_glapi_proc *)GET_DISPATCH();'
-                print '        PFNGL%sPROC p =' % (name.upper())
-                print '            (PFNGL%sPROC) disp_table[%d];' % (name.upper(), func.offset)
-                print '    %sp(%s);' % (ret_string, func.get_called_parameter_string())
-                print '    } else'
-                print '#endif'
-                print '    {'
+                print('#define %s %d' % (func.opcode_vendor_name(name), func.glx_vendorpriv))
+                print('%s gl%s(%s)' % (func.return_type, func_name, func.get_parameter_string()))
+                print('{')
+                print('    struct glx_context * const gc = __glXGetCurrentContext();')
+                print('')
+                print('#if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)')
+                print('    if (gc->isDirect) {')
+                print('        const _glapi_proc *const disp_table = (_glapi_proc *)GET_DISPATCH();')
+                print('        PFNGL%sPROC p =' % (name.upper()))
+                print('            (PFNGL%sPROC) disp_table[%d];' % (name.upper(), func.offset))
+                print('    %sp(%s);' % (ret_string, func.get_called_parameter_string()))
+                print('    } else')
+                print('#endif')
+                print('    {')
 
                 footer = '}\n}\n'
             else:
-                print '#define %s %d' % (func.opcode_name(), func.opcode_value())
+                print('#define %s %d' % (func.opcode_name(), func.opcode_value()))
 
-                print '%s __indirect_gl%s(%s)' % (func.return_type, name, func.get_parameter_string())
-                print '{'
+                print('%s __indirect_gl%s(%s)' % (func.return_type, name, func.get_parameter_string()))
+                print('{')
 
 
         if func.glx_rop != 0 or func.vectorequiv != None:
@@ -469,15 +470,15 @@
             self.printSingleFunction(func, name)
             pass
         else:
-            print "/* Missing GLX protocol for %s. */" % (name)
+            print("/* Missing GLX protocol for %s. */" % (name))
 
-        print footer
+        print(footer)
         return
 
 
     def print_generic_function(self, n):
         size = (n + 3) & ~3
-        print """static FASTCALL NOINLINE void
+        print("""static FASTCALL NOINLINE void
 generic_%u_byte( GLint rop, const void * ptr )
 {
     struct glx_context * const gc = __glXGetCurrentContext();
@@ -488,7 +489,7 @@
     gc->pc += cmdlen;
     if (__builtin_expect(gc->pc > gc->limit, 0)) { (void) __glXFlushRenderBuffer(gc, gc->pc); }
 }
-""" % (n, size + 4, size)
+""" % (n, size + 4, size))
         return
 
 
@@ -499,14 +500,14 @@
             src_ptr = "&" + p.name
 
         if p.is_padding:
-            print '(void) memset((void *)(%s + %u), 0, %s);' \
-                % (pc, p.offset + adjust, p.size_string() )
+            print('(void) memset((void *)(%s + %u), 0, %s);' \
+                % (pc, p.offset + adjust, p.size_string() ))
         elif not extra_offset:
-            print '(void) memcpy((void *)(%s + %u), (void *)(%s), %s);' \
-                % (pc, p.offset + adjust, src_ptr, p.size_string() )
+            print('(void) memcpy((void *)(%s + %u), (void *)(%s), %s);' \
+                % (pc, p.offset + adjust, src_ptr, p.size_string() ))
         else:
-            print '(void) memcpy((void *)(%s + %u + %s), (void *)(%s), %s);' \
-                % (pc, p.offset + adjust, extra_offset, src_ptr, p.size_string() )
+            print('(void) memcpy((void *)(%s + %u + %s), (void *)(%s), %s);' \
+                % (pc, p.offset + adjust, extra_offset, src_ptr, p.size_string() ))
 
     def common_emit_args(self, f, pc, adjust, skip_vla):
         extra_offset = None
@@ -542,7 +543,7 @@
                 self.common_emit_one_arg(param, pc, adjust, None)
 
                 if f.pad_after(param):
-                    print '(void) memcpy((void *)(%s + %u), zero, 4);' % (pc, (param.offset + param.size()) + adjust)
+                    print('(void) memcpy((void *)(%s + %u), zero, 4);' % (pc, (param.offset + param.size()) + adjust))
 
             else:
                 [dim, width, height, depth, extent] = param.get_dimensions()
@@ -552,14 +553,14 @@
                     dim_str = str(dim)
 
                 if param.is_padding:
-                    print '(void) memset((void *)(%s + %u), 0, %s);' \
-                    % (pc, (param.offset - 4) + adjust, param.size_string() )
+                    print('(void) memset((void *)(%s + %u), 0, %s);' \
+                    % (pc, (param.offset - 4) + adjust, param.size_string() ))
 
                 if param.img_null_flag:
                     if large:
-                        print '(void) memcpy((void *)(%s + %u), zero, 4);' % (pc, (param.offset - 4) + adjust)
+                        print('(void) memcpy((void *)(%s + %u), zero, 4);' % (pc, (param.offset - 4) + adjust))
                     else:
-                        print '(void) memcpy((void *)(%s + %u), (void *)((%s == NULL) ? one : zero), 4);' % (pc, (param.offset - 4) + adjust, param.name)
+                        print('(void) memcpy((void *)(%s + %u), (void *)((%s == NULL) ? one : zero), 4);' % (pc, (param.offset - 4) + adjust, param.name))
 
 
                 pixHeaderPtr = "%s + %u" % (pc, adjust)
@@ -571,13 +572,13 @@
                     else:
                         condition = 'compsize > 0'
 
-                    print 'if (%s) {' % (condition)
-                    print '    gc->fillImage(gc, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr)
-                    print '} else {'
-                    print '    (void) memcpy( %s, default_pixel_store_%uD, default_pixel_store_%uD_size );' % (pixHeaderPtr, dim, dim)
-                    print '}'
+                    print('if (%s) {' % (condition))
+                    print('    gc->fillImage(gc, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr))
+                    print('} else {')
+                    print('    (void) memcpy( %s, default_pixel_store_%uD, default_pixel_store_%uD_size );' % (pixHeaderPtr, dim, dim))
+                    print('}')
                 else:
-                    print '__glXSendLargeImage(gc, compsize, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr)
+                    print('__glXSendLargeImage(gc, compsize, %s, %s, %s, %s, %s, %s, %s, %s, %s);' % (dim_str, width, height, depth, param.img_format, param.img_type, param.name, pcPtr, pixHeaderPtr))
 
         return
 
@@ -586,16 +587,16 @@
         if not op_name:
             op_name = f.opcode_real_name()
 
-        print 'const GLint op = %s;' % (op_name)
-        print 'const GLuint cmdlenLarge = cmdlen + 4;'
-        print 'GLubyte * const pc = __glXFlushRenderBuffer(gc, gc->pc);'
-        print '(void) memcpy((void *)(pc + 0), (void *)(&cmdlenLarge), 4);'
-        print '(void) memcpy((void *)(pc + 4), (void *)(&op), 4);'
+        print('const GLint op = %s;' % (op_name))
+        print('const GLuint cmdlenLarge = cmdlen + 4;')
+        print('GLubyte * const pc = __glXFlushRenderBuffer(gc, gc->pc);')
+        print('(void) memcpy((void *)(pc + 0), (void *)(&cmdlenLarge), 4);')
+        print('(void) memcpy((void *)(pc + 4), (void *)(&op), 4);')
         return
 
 
     def common_func_print_just_start(self, f, name):
-        print '    struct glx_context * const gc = __glXGetCurrentContext();'
+        print('    struct glx_context * const gc = __glXGetCurrentContext();')
 
         # The only reason that single and vendor private commands need
         # a variable called 'dpy' is because they use the SyncHandle
@@ -613,10 +614,10 @@
         if not f.glx_rop:
             for p in f.parameterIterateOutputs():
                 if p.is_image() and (p.img_format != "GL_COLOR_INDEX" or p.img_type != "GL_BITMAP"):
-                    print '    const __GLXattribute * const state = gc->client_state_private;'
+                    print('    const __GLXattribute * const state = gc->client_state_private;')
                     break
 
-            print '    Display * const dpy = gc->currentDpy;'
+            print('    Display * const dpy = gc->currentDpy;')
             skip_condition = "dpy != NULL"
         elif f.can_be_large:
             skip_condition = "gc->currentDpy != NULL"
@@ -625,46 +626,46 @@
 
 
         if f.return_type != 'void':
-            print '    %s retval = (%s) 0;' % (f.return_type, f.return_type)
+            print('    %s retval = (%s) 0;' % (f.return_type, f.return_type))
 
 
         if name != None and name not in f.glx_vendorpriv_names:
-            print '#ifndef USE_XCB'
+            print('#ifndef USE_XCB')
         self.emit_packet_size_calculation(f, 0)
         if name != None and name not in f.glx_vendorpriv_names:
-            print '#endif'
+            print('#endif')
 
         if f.command_variable_length() != "":
-            print "    if (0%s < 0) {" % f.command_variable_length()
-            print "        __glXSetError(gc, GL_INVALID_VALUE);"
+            print("    if (0%s < 0) {" % f.command_variable_length())
+            print("        __glXSetError(gc, GL_INVALID_VALUE);")
             if f.return_type != 'void':
-                print "        return 0;"
+                print("        return 0;")
             else:
-                print "        return;"
-            print "    }"
+                print("        return;")
+            print("    }")
 
         condition_list = []
         for p in f.parameterIterateCounters():
             condition_list.append( "%s >= 0" % (p.name) )
             # 'counter' parameters cannot be negative
-            print "    if (%s < 0) {" % p.name
-            print "        __glXSetError(gc, GL_INVALID_VALUE);"
+            print("    if (%s < 0) {" % p.name)
+            print("        __glXSetError(gc, GL_INVALID_VALUE);")
             if f.return_type != 'void':
-                print "        return 0;"
+                print("        return 0;")
             else:
-                print "        return;"
-            print "    }"
+                print("        return;")
+            print("    }")
 
         if skip_condition:
             condition_list.append( skip_condition )
 
         if len( condition_list ) > 0:
             if len( condition_list ) > 1:
-                skip_condition = "(%s)" % (string.join( condition_list, ") && (" ))
+                skip_condition = "(%s)" % ") && (".join( condition_list )
             else:
                 skip_condition = "%s" % (condition_list.pop(0))
 
-            print '    if (__builtin_expect(%s, 1)) {' % (skip_condition)
+            print('    if (__builtin_expect(%s, 1)) {' % (skip_condition))
             return 1
         else:
             return 0
@@ -674,16 +675,16 @@
         self.common_func_print_just_start(f, name)
 
         if self.debug:
-            print '        printf( "Enter %%s...\\n", "gl%s" );' % (f.name)
+            print('        printf( "Enter %%s...\\n", "gl%s" );' % (f.name))
 
         if name not in f.glx_vendorpriv_names:
 
             # XCB specific:
-            print '#ifdef USE_XCB'
+            print('#ifdef USE_XCB')
             if self.debug:
-                print '        printf("\\tUsing XCB.\\n");'
-            print '        xcb_connection_t *c = XGetXCBConnection(dpy);'
-            print '        (void) __glXFlushRenderBuffer(gc, gc->pc);'
+                print('        printf("\\tUsing XCB.\\n");')
+            print('        xcb_connection_t *c = XGetXCBConnection(dpy);')
+            print('        (void) __glXFlushRenderBuffer(gc, gc->pc);')
             xcb_name = 'xcb_glx%s' % convertStringForXCB(name)
 
             iparams=[]
@@ -710,7 +711,7 @@
             xcb_request = '%s(%s)' % (xcb_name, ", ".join(["c", "gc->currentContextTag"] + iparams + extra_iparams))
 
             if f.needs_reply():
-                print '        %s_reply_t *reply = %s_reply(c, %s, NULL);' % (xcb_name, xcb_name, xcb_request)
+                print('        %s_reply_t *reply = %s_reply(c, %s, NULL);' % (xcb_name, xcb_name, xcb_request))
                 if output:
                     if output.is_image():
                         [dim, w, h, d, junk] = output.get_dimensions()
@@ -721,30 +722,30 @@
                             if dim < 2:
                                 h = "1"
                             else:
-                                print '        if (%s == 0) { %s = 1; }' % (h, h)
+                                print('        if (%s == 0) { %s = 1; }' % (h, h))
                             if dim < 3:
                                 d = "1"
                             else:
-                                print '        if (%s == 0) { %s = 1; }' % (d, d)
+                                print('        if (%s == 0) { %s = 1; }' % (d, d))
 
-                        print '        __glEmptyImage(gc, 3, %s, %s, %s, %s, %s, %s_data(reply), %s);' % (w, h, d, output.img_format, output.img_type, xcb_name, output.name)
+                        print('        __glEmptyImage(gc, 3, %s, %s, %s, %s, %s, %s_data(reply), %s);' % (w, h, d, output.img_format, output.img_type, xcb_name, output.name))
                     else:
                         if f.reply_always_array:
-                            print '        (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string())
+                            print('        (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string()))
                         else:
-                            print '        /* the XXX_data_length() xcb function name is misleading, it returns the number */'
-                            print '        /* of elements, not the length of the data part. A single element is embedded. */'
-                            print '        if (%s_data_length(reply) == 1)' % (xcb_name)
-                            print '            (void)memcpy(%s, &reply->datum, sizeof(reply->datum));' % (output.name)
-                            print '        else'
-                            print '            (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string())
+                            print('        /* the XXX_data_length() xcb function name is misleading, it returns the number */')
+                            print('        /* of elements, not the length of the data part. A single element is embedded. */')
+                            print('        if (%s_data_length(reply) == 1)' % (xcb_name))
+                            print('            (void)memcpy(%s, &reply->datum, sizeof(reply->datum));' % (output.name))
+                            print('        else')
+                            print('            (void)memcpy(%s, %s_data(reply), %s_data_length(reply) * sizeof(%s));' % (output.name, xcb_name, xcb_name, output.get_base_type_string()))
 
                 if f.return_type != 'void':
-                    print '        retval = reply->ret_val;'
-                print '        free(reply);'
+                    print('        retval = reply->ret_val;')
+                print('        free(reply);')
             else:
-                print '        ' + xcb_request + ';'
-            print '#else'
+                print('        ' + xcb_request + ';')
+            print('#else')
             # End of XCB specific.
 
 
@@ -754,9 +755,9 @@
             pc_decl = "(void)"
 
         if name in f.glx_vendorpriv_names:
-            print '        %s __glXSetupVendorRequest(gc, %s, %s, cmdlen);' % (pc_decl, f.opcode_real_name(), f.opcode_vendor_name(name))
+            print('        %s __glXSetupVendorRequest(gc, %s, %s, cmdlen);' % (pc_decl, f.opcode_real_name(), f.opcode_vendor_name(name)))
         else:
-            print '        %s __glXSetupSingleRequest(gc, %s, cmdlen);' % (pc_decl, f.opcode_name())
+            print('        %s __glXSetupSingleRequest(gc, %s, cmdlen);' % (pc_decl, f.opcode_name()))
 
         self.common_emit_args(f, "pc", 0, 0)
 
@@ -765,12 +766,12 @@
         for img in images:
             if img.is_output:
                 o = f.command_fixed_length() - 4
-                print '        *(int32_t *)(pc + %u) = 0;' % (o)
+                print('        *(int32_t *)(pc + %u) = 0;' % (o))
                 if img.img_format != "GL_COLOR_INDEX" or img.img_type != "GL_BITMAP":
-                    print '        * (int8_t *)(pc + %u) = state->storePack.swapEndian;' % (o)
+                    print('        * (int8_t *)(pc + %u) = state->storePack.swapEndian;' % (o))
 
                 if f.img_reset:
-                    print '        * (int8_t *)(pc + %u) = %s;' % (o + 1, f.img_reset)
+                    print('        * (int8_t *)(pc + %u) = %s;' % (o + 1, f.img_reset))
 
 
         return_name = ''
@@ -787,9 +788,9 @@
                 if p.is_image():
                     [dim, w, h, d, junk] = p.get_dimensions()
                     if f.dimensions_in_reply:
-                        print "        __glXReadPixelReply(dpy, gc, %u, 0, 0, 0, %s, %s, %s, GL_TRUE);" % (dim, p.img_format, p.img_type, p.name)
+                        print("        __glXReadPixelReply(dpy, gc, %u, 0, 0, 0, %s, %s, %s, GL_TRUE);" % (dim, p.img_format, p.img_type, p.name))
                     else:
-                        print "        __glXReadPixelReply(dpy, gc, %u, %s, %s, %s, %s, %s, %s, GL_FALSE);" % (dim, w, h, d, p.img_format, p.img_type, p.name)
+                        print("        __glXReadPixelReply(dpy, gc, %u, %s, %s, %s, %s, %s, %s, GL_FALSE);" % (dim, w, h, d, p.img_format, p.img_type, p.name))
 
                     got_reply = 1
                 else:
@@ -809,7 +810,7 @@
                     # non-arrays) gives us this.
 
                     s = p.size() / p.get_element_count()
-                    print "       %s __glXReadReply(dpy, %s, %s, %s);" % (return_str, s, p.name, aa)
+                    print("       %s __glXReadReply(dpy, %s, %s, %s);" % (return_str, s, p.name, aa))
                     got_reply = 1
 
 
@@ -817,30 +818,30 @@
             # read a NULL reply to get the return value.
 
             if not got_reply:
-                print "       %s __glXReadReply(dpy, 0, NULL, GL_FALSE);" % (return_str)
+                print("       %s __glXReadReply(dpy, 0, NULL, GL_FALSE);" % (return_str))
 
 
         elif self.debug:
             # Only emit the extra glFinish call for functions
             # that don't already require a reply from the server.
-            print '        __indirect_glFinish();'
+            print('        __indirect_glFinish();')
 
         if self.debug:
-            print '        printf( "Exit %%s.\\n", "gl%s" );' % (name)
+            print('        printf( "Exit %%s.\\n", "gl%s" );' % (name))
 
 
-        print '        UnlockDisplay(dpy); SyncHandle();'
+        print('        UnlockDisplay(dpy); SyncHandle();')
 
         if name not in f.glx_vendorpriv_names:
-            print '#endif /* USE_XCB */'
+            print('#endif /* USE_XCB */')
 
-        print '    }'
-        print '    return%s;' % (return_name)
+        print('    }')
+        print('    return%s;' % (return_name))
         return
 
 
     def printPixelFunction(self, f):
-        if self.pixel_stubs.has_key( f.name ):
+        if f.name in self.pixel_stubs:
             # Normally gl_function::get_parameter_string could be
             # used.  However, this call needs to have the missing
             # dimensions (e.g., a fake height value for
@@ -859,7 +860,7 @@
                 if f.pad_after(param):
                     p_string += ", 1"
 
-            print '    %s(%s, %u%s );' % (self.pixel_stubs[f.name] , f.opcode_name(), dim, p_string)
+            print('    %s(%s, %u%s );' % (self.pixel_stubs[f.name] , f.opcode_name(), dim, p_string))
             return
 
 
@@ -870,32 +871,32 @@
 
 
         if f.can_be_large:
-            print 'if (cmdlen <= gc->maxSmallRenderCommandSize) {'
-            print '    if ( (gc->pc + cmdlen) > gc->bufEnd ) {'
-            print '        (void) __glXFlushRenderBuffer(gc, gc->pc);'
-            print '    }'
+            print('if (cmdlen <= gc->maxSmallRenderCommandSize) {')
+            print('    if ( (gc->pc + cmdlen) > gc->bufEnd ) {')
+            print('        (void) __glXFlushRenderBuffer(gc, gc->pc);')
+            print('    }')
 
         if f.glx_rop == ~0:
             opcode = "opcode"
         else:
             opcode = f.opcode_real_name()
 
-        print 'emit_header(gc->pc, %s, cmdlen);' % (opcode)
+        print('emit_header(gc->pc, %s, cmdlen);' % (opcode))
 
         self.pixel_emit_args( f, "gc->pc", 0 )
-        print 'gc->pc += cmdlen;'
-        print 'if (gc->pc > gc->limit) { (void) __glXFlushRenderBuffer(gc, gc->pc); }'
+        print('gc->pc += cmdlen;')
+        print('if (gc->pc > gc->limit) { (void) __glXFlushRenderBuffer(gc, gc->pc); }')
 
         if f.can_be_large:
-            print '}'
-            print 'else {'
+            print('}')
+            print('else {')
 
             self.large_emit_begin(f, opcode)
             self.pixel_emit_args(f, "pc", 1)
 
-            print '}'
+            print('}')
 
-        if trailer: print trailer
+        if trailer: print(trailer)
         return
 
 
@@ -912,7 +913,7 @@
             if p.is_pointer():
                 cmdlen = f.command_fixed_length()
                 if cmdlen in self.generic_sizes:
-                    print '    generic_%u_byte( %s, %s );' % (cmdlen, f.opcode_real_name(), p.name)
+                    print('    generic_%u_byte( %s, %s );' % (cmdlen, f.opcode_real_name(), p.name))
                     return
 
         if self.common_func_print_just_start(f, None):
@@ -921,36 +922,36 @@
             trailer = None
 
         if self.debug:
-            print 'printf( "Enter %%s...\\n", "gl%s" );' % (f.name)
+            print('printf( "Enter %%s...\\n", "gl%s" );' % (f.name))
 
         if f.can_be_large:
-            print 'if (cmdlen <= gc->maxSmallRenderCommandSize) {'
-            print '    if ( (gc->pc + cmdlen) > gc->bufEnd ) {'
-            print '        (void) __glXFlushRenderBuffer(gc, gc->pc);'
-            print '    }'
+            print('if (cmdlen <= gc->maxSmallRenderCommandSize) {')
+            print('    if ( (gc->pc + cmdlen) > gc->bufEnd ) {')
+            print('        (void) __glXFlushRenderBuffer(gc, gc->pc);')
+            print('    }')
 
-        print 'emit_header(gc->pc, %s, cmdlen);' % (f.opcode_real_name())
+        print('emit_header(gc->pc, %s, cmdlen);' % (f.opcode_real_name()))
 
         self.common_emit_args(f, "gc->pc", 4, 0)
-        print 'gc->pc += cmdlen;'
-        print 'if (__builtin_expect(gc->pc > gc->limit, 0)) { (void) __glXFlushRenderBuffer(gc, gc->pc); }'
+        print('gc->pc += cmdlen;')
+        print('if (__builtin_expect(gc->pc > gc->limit, 0)) { (void) __glXFlushRenderBuffer(gc, gc->pc); }')
 
         if f.can_be_large:
-            print '}'
-            print 'else {'
+            print('}')
+            print('else {')
 
             self.large_emit_begin(f)
             self.common_emit_args(f, "pc", 8, 1)
 
             p = f.variable_length_parameter()
-            print '    __glXSendLargeCommand(gc, pc, %u, %s, %s);' % (p.offset + 8, p.name, p.size_string())
-            print '}'
+            print('    __glXSendLargeCommand(gc, pc, %u, %s, %s);' % (p.offset + 8, p.name, p.size_string()))
+            print('}')
 
         if self.debug:
-            print '__indirect_glFinish();'
-            print 'printf( "Exit %%s.\\n", "gl%s" );' % (f.name)
+            print('__indirect_glFinish();')
+            print('printf( "Exit %%s.\\n", "gl%s" );' % (f.name))
 
-        if trailer: print trailer
+        if trailer: print(trailer)
         return
 
 
@@ -966,7 +967,7 @@
 
 
     def printRealHeader(self):
-        print """/**
+        print("""/**
  * \\file indirect_init.c
  * Initialize indirect rendering dispatch table.
  *
@@ -1012,15 +1013,15 @@
        table[i] = (_glapi_proc) NoOp;
     }
 
-    /* now, initialize the entries we understand */"""
+    /* now, initialize the entries we understand */""")
 
     def printRealFooter(self):
-        print """
+        print("""
     return (struct _glapi_table *) table;
 }
 
 #endif
-"""
+""")
         return
 
 
@@ -1034,15 +1035,15 @@
             for func in api.functionIterateByCategory(name):
                 if func.client_supported_for_indirect():
                     if preamble:
-                        print preamble
+                        print(preamble)
                         preamble = None
 
                     if func.is_abi():
-                        print '    table[{offset}] = (_glapi_proc) __indirect_gl{name};'.format(name = func.name, offset = func.offset)
+                        print('    table[{offset}] = (_glapi_proc) __indirect_gl{name};'.format(name = func.name, offset = func.offset))
                     else:
-                        print '    o = _glapi_get_proc_offset("gl{0}");'.format(func.name)
-                        print '    assert(o > 0);'
-                        print '    table[o] = (_glapi_proc) __indirect_gl{0};'.format(func.name)
+                        print('    o = _glapi_get_proc_offset("gl{0}");'.format(func.name))
+                        print('    assert(o > 0);')
+                        print('    table[o] = (_glapi_proc) __indirect_gl{0};'.format(func.name))
 
         return
 
@@ -1062,18 +1063,18 @@
 
 
     def printRealHeader(self):
-        print """/**
+        print("""/**
  * \\file
  * Prototypes for indirect rendering functions.
  *
  * \\author Kevin E. Martin <kevin@precisioninsight.com>
  * \\author Ian Romanick <idr@us.ibm.com>
  */
-"""
+""")
         self.printFastcall()
         self.printNoinline()
 
-        print """
+        print("""
 #include <X11/Xfuncproto.h>
 #include "glxclient.h"
 
@@ -1090,32 +1091,32 @@
 
 extern _X_HIDDEN NOINLINE FASTCALL GLubyte * __glXSetupVendorRequest(
     struct glx_context * gc, GLint code, GLint vop, GLint cmdlen );
-"""
+""")
 
 
     def printBody(self, api):
         for func in api.functionIterateGlx():
             params = func.get_parameter_string()
 
-            print 'extern _X_HIDDEN %s __indirect_gl%s(%s);' % (func.return_type, func.name, params)
+            print('extern _X_HIDDEN %s __indirect_gl%s(%s);' % (func.return_type, func.name, params))
 
             for n in func.entry_points:
                 if func.has_different_protocol(n):
                     asdf = func.static_glx_name(n)
                     if asdf not in func.static_entry_points:
-                        print 'extern _X_HIDDEN %s gl%s(%s);' % (func.return_type, asdf, params)
+                        print('extern _X_HIDDEN %s gl%s(%s);' % (func.return_type, asdf, params))
                         # give it a easy-to-remember name
                         if func.client_handcode:
-                            print '#define gl_dispatch_stub_%s gl%s' % (n, asdf)
+                            print('#define gl_dispatch_stub_%s gl%s' % (n, asdf))
                     else:
-                        print 'GLAPI %s GLAPIENTRY gl%s(%s);' % (func.return_type, asdf, params)
+                        print('GLAPI %s GLAPIENTRY gl%s(%s);' % (func.return_type, asdf, params))
 
                     break
 
-        print ''
-        print '#ifdef GLX_INDIRECT_RENDERING'
-        print 'extern _X_HIDDEN void (*__indirect_get_proc_address(const char *name))(void);'
-        print '#endif'
+        print('')
+        print('#ifdef GLX_INDIRECT_RENDERING')
+        print('extern _X_HIDDEN void (*__indirect_get_proc_address(const char *name))(void);')
+        print('#endif')
 
 
 def _parser():
diff --git a/src/mapi/glapi/gen/glX_proto_size.py b/src/mapi/glapi/gen/glX_proto_size.py
index e16dbab..2a843c3 100644
--- a/src/mapi/glapi/gen/glX_proto_size.py
+++ b/src/mapi/glapi/gen/glX_proto_size.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 import sys, string
 
@@ -69,7 +71,7 @@
         for enum_name in enum_dict:
             e = enum_dict[ enum_name ]
 
-            if e.functions.has_key( match_name ):
+            if match_name in e.functions:
                 [count, mode] = e.functions[ match_name ]
 
                 if mode_set and mode != self.mode:
@@ -77,11 +79,11 @@
 
                 self.mode = mode
 
-                if self.enums.has_key( e.value ):
+                if e.value in self.enums:
                     if e.name not in self.enums[ e.value ]:
                         self.enums[ e.value ].append( e )
                 else:
-                    if not self.count.has_key( count ):
+                    if count not in self.count:
                         self.count[ count ] = []
 
                     self.enums[ e.value ] = [ e ]
@@ -129,7 +131,7 @@
         for a in self.enums:
             count += 1
 
-        if self.count.has_key(-1):
+        if -1 in self.count:
             return 0
 
         # Determine if there is some mask M, such that M = (2^N) - 1,
@@ -167,19 +169,19 @@
                     masked_count[i] = c
 
 
-            print '    static const GLushort a[%u] = {' % (mask + 1)
+            print('    static const GLushort a[%u] = {' % (mask + 1))
             for e in masked_enums:
-                print '        %s, ' % (masked_enums[e])
-            print '    };'
+                print('        %s, ' % (masked_enums[e]))
+            print('    };')
 
-            print '    static const GLubyte b[%u] = {' % (mask + 1)
+            print('    static const GLubyte b[%u] = {' % (mask + 1))
             for c in masked_count:
-                print '        %u, ' % (masked_count[c])
-            print '    };'
+                print('        %u, ' % (masked_count[c]))
+            print('    };')
 
-            print '    const unsigned idx = (e & 0x%02xU);' % (mask)
-            print ''
-            print '    return (e == a[idx]) ? (GLint) b[idx] : 0;'
+            print('    const unsigned idx = (e & 0x%02xU);' % (mask))
+            print('')
+            print('    return (e == a[idx]) ? (GLint) b[idx] : 0;')
             return 1;
         else:
             return 0;
@@ -189,9 +191,9 @@
         """Emit the body of the __gl*_size function using a 
         switch-statement."""
 
-        print '    switch( e ) {'
+        print('    switch( e ) {')
 
-        for c in self.count:
+        for c in sorted(self.count):
             for e in self.count[c]:
                 first = 1
 
@@ -206,35 +208,34 @@
                 for enum_obj in self.enums[e]:
                     list[ enum_obj.priority() ] = enum_obj.name
 
-                keys = list.keys()
-                keys.sort()
+                keys = sorted(list.keys())
                 for k in keys:
                     j = list[k]
                     if first:
-                        print '        case GL_%s:' % (j)
+                        print('        case GL_%s:' % (j))
                         first = 0
                     else:
-                        print '/*      case GL_%s:*/' % (j)
+                        print('/*      case GL_%s:*/' % (j))
 
             if c == -1:
-                print '            return __gl%s_variable_size( e );' % (name)
+                print('            return __gl%s_variable_size( e );' % (name))
             else:
-                print '            return %u;' % (c)
+                print('            return %u;' % (c))
 
-        print '        default: return 0;'
-        print '    }'
+        print('        default: return 0;')
+        print('    }')
 
 
     def Print(self, name):
-        print '_X_INTERNAL PURE FASTCALL GLint'
-        print '__gl%s_size( GLenum e )' % (name)
-        print '{'
+        print('_X_INTERNAL PURE FASTCALL GLint')
+        print('__gl%s_size( GLenum e )' % (name))
+        print('{')
 
         if not self.PrintUsingTable():
             self.PrintUsingSwitch(name)
 
-        print '}'
-        print ''
+        print('}')
+        print('')
 
 
 class glx_server_enum_function(glx_enum_function):
@@ -273,8 +274,7 @@
             o = f.offset_of( param_name )
             foo[o] = param_name
 
-        keys = foo.keys()
-        keys.sort()
+        keys = sorted(foo.keys())
         for o in keys:
             p = f.parameters_by_name[ foo[o] ]
 
@@ -282,18 +282,18 @@
             fixup.append( p.name )
 
 
-        print '    GLsizei compsize;'
-        print ''
+        print('    GLsizei compsize;')
+        print('')
 
         printer.common_emit_fixups(fixup)
 
-        print ''
-        print '    compsize = __gl%s_size(%s);' % (f.name, string.join(f.count_parameter_list, ","))
+        print('')
+        print('    compsize = __gl%s_size(%s);' % (f.name, ",".join(f.count_parameter_list)))  # str.join: string.join() was removed in Python 3
         p = f.variable_length_parameter()
-        print '    return safe_pad(%s);' % (p.size_string())
+        print('    return safe_pad(%s);' % (p.size_string()))
 
-        print '}'
-        print ''
+        print('}')
+        print('')
 
 
 class PrintGlxSizeStubs_common(gl_XML.gl_print_base):
@@ -313,34 +313,34 @@
 
 class PrintGlxSizeStubs_c(PrintGlxSizeStubs_common):
     def printRealHeader(self):
-        print ''
-        print '#include <X11/Xfuncproto.h>'
-        print '#include <GL/gl.h>'
+        print('')
+        print('#include <X11/Xfuncproto.h>')
+        print('#include <GL/gl.h>')
         if self.emit_get:
-            print '#include "indirect_size_get.h"'
-            print '#include "glxserver.h"'
-            print '#include "indirect_util.h"'
+            print('#include "indirect_size_get.h"')
+            print('#include "glxserver.h"')
+            print('#include "indirect_util.h"')
 
-        print '#include "indirect_size.h"'
+        print('#include "indirect_size.h"')
 
-        print ''
+        print('')
         self.printPure()
-        print ''
+        print('')
         self.printFastcall()
-        print ''
-        print ''
-        print '#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS'
-        print '#  define ALIAS2(from,to) \\'
-        print '    _X_INTERNAL PURE FASTCALL GLint __gl ## from ## _size( GLenum e ) \\'
-        print '        __attribute__ ((alias( # to )));'
-        print '#  define ALIAS(from,to) ALIAS2( from, __gl ## to ## _size )'
-        print '#else'
-        print '#  define ALIAS(from,to) \\'
-        print '    _X_INTERNAL PURE FASTCALL GLint __gl ## from ## _size( GLenum e ) \\'
-        print '    { return __gl ## to ## _size( e ); }'
-        print '#endif'
-        print ''
-        print ''
+        print('')
+        print('')
+        print('#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS')
+        print('#  define ALIAS2(from,to) \\')
+        print('    _X_INTERNAL PURE FASTCALL GLint __gl ## from ## _size( GLenum e ) \\')
+        print('        __attribute__ ((alias( # to )));')
+        print('#  define ALIAS(from,to) ALIAS2( from, __gl ## to ## _size )')
+        print('#else')
+        print('#  define ALIAS(from,to) \\')
+        print('    _X_INTERNAL PURE FASTCALL GLint __gl ## from ## _size( GLenum e ) \\')
+        print('    { return __gl ## to ## _size( e ); }')
+        print('#endif')
+        print('')
+        print('')
 
 
     def printBody(self, api):
@@ -354,7 +354,7 @@
 
             if (ef.is_set() and self.emit_set) or (not ef.is_set() and self.emit_get):
                 sig = ef.signature()
-                if enum_sigs.has_key( sig ):
+                if sig in enum_sigs:
                     aliases.append( [func.name, enum_sigs[ sig ]] )
                 else:
                     enum_sigs[ sig ] = func.name
@@ -362,26 +362,26 @@
 
 
         for [alias_name, real_name] in aliases:
-            print 'ALIAS( %s, %s )' % (alias_name, real_name)
+            print('ALIAS( %s, %s )' % (alias_name, real_name))
 
 
 
 class PrintGlxSizeStubs_h(PrintGlxSizeStubs_common):
     def printRealHeader(self):
-        print """/**
+        print("""/**
  * \\file
  * Prototypes for functions used to determine the number of data elements in
  * various GLX protocol messages.
  *
  * \\author Ian Romanick <idr@us.ibm.com>
  */
-"""
-        print '#include <X11/Xfuncproto.h>'
-        print ''
+""")
+        print('#include <X11/Xfuncproto.h>')
+        print('')
         self.printPure();
-        print ''
+        print('')
         self.printFastcall();
-        print ''
+        print('')
 
 
     def printBody(self, api):
@@ -391,7 +391,7 @@
                 continue
 
             if (ef.is_set() and self.emit_set) or (not ef.is_set() and self.emit_get):
-                print 'extern _X_INTERNAL PURE FASTCALL GLint __gl%s_size(GLenum);' % (func.name)
+                print('extern _X_INTERNAL PURE FASTCALL GLint __gl%s_size(GLenum);' % (func.name))
 
 
 class PrintGlxReqSize_common(gl_XML.gl_print_base):
@@ -415,16 +415,16 @@
 
 
     def printRealHeader(self):
-        print '#include <X11/Xfuncproto.h>'
-        print ''
+        print('#include <X11/Xfuncproto.h>')
+        print('')
         self.printPure()
-        print ''
+        print('')
 
 
     def printBody(self, api):
         for func in api.functionIterateGlx():
             if not func.ignore and func.has_variable_size_request():
-                print 'extern PURE _X_HIDDEN int __glX%sReqSize(const GLbyte *pc, Bool swap, int reqlen);' % (func.name)
+                print('extern PURE _X_HIDDEN int __glX%sReqSize(const GLbyte *pc, Bool swap, int reqlen);' % (func.name))
 
 
 class PrintGlxReqSize_c(PrintGlxReqSize_common):
@@ -441,25 +441,25 @@
 
 
     def printRealHeader(self):
-        print ''
-        print '#include <GL/gl.h>'
-        print '#include "glxserver.h"'
-        print '#include "glxbyteorder.h"'
-        print '#include "indirect_size.h"'
-        print '#include "indirect_reqsize.h"'
-        print ''
-        print '#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS'
-        print '#  define ALIAS2(from,to) \\'
-        print '    GLint __glX ## from ## ReqSize( const GLbyte * pc, Bool swap, int reqlen ) \\'
-        print '        __attribute__ ((alias( # to )));'
-        print '#  define ALIAS(from,to) ALIAS2( from, __glX ## to ## ReqSize )'
-        print '#else'
-        print '#  define ALIAS(from,to) \\'
-        print '    GLint __glX ## from ## ReqSize( const GLbyte * pc, Bool swap, int reqlen ) \\'
-        print '    { return __glX ## to ## ReqSize( pc, swap, reqlen ); }'
-        print '#endif'
-        print ''
-        print ''
+        print('')
+        print('#include <GL/gl.h>')
+        print('#include "glxserver.h"')
+        print('#include "glxbyteorder.h"')
+        print('#include "indirect_size.h"')
+        print('#include "indirect_reqsize.h"')
+        print('')
+        print('#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS')
+        print('#  define ALIAS2(from,to) \\')
+        print('    GLint __glX ## from ## ReqSize( const GLbyte * pc, Bool swap, int reqlen ) \\')
+        print('        __attribute__ ((alias( # to )));')
+        print('#  define ALIAS(from,to) ALIAS2( from, __glX ## to ## ReqSize )')
+        print('#else')
+        print('#  define ALIAS(from,to) \\')
+        print('    GLint __glX ## from ## ReqSize( const GLbyte * pc, Bool swap, int reqlen ) \\')
+        print('    { return __glX ## to ## ReqSize( pc, swap, reqlen ); }')
+        print('#endif')
+        print('')
+        print('')
 
 
     def printBody(self, api):
@@ -475,10 +475,10 @@
 
             sig = ef.signature()
 
-            if not enum_functions.has_key(func.name):
+            if func.name not in enum_functions:
                 enum_functions[ func.name ] = sig
 
-            if not enum_sigs.has_key( sig ):
+            if sig not in enum_sigs:
                 enum_sigs[ sig ] = ef
 
 
@@ -494,7 +494,7 @@
             if func.server_handcode: continue
             if not func.has_variable_size_request(): continue
 
-            if enum_functions.has_key(func.name):
+            if func.name in enum_functions:
                 sig = enum_functions[func.name]
                 ef = enum_sigs[ sig ]
 
@@ -511,7 +511,7 @@
 
 
         for [alias_name, real_name] in aliases:
-            print 'ALIAS( %s, %s )' % (alias_name, real_name)
+            print('ALIAS( %s, %s )' % (alias_name, real_name))
 
         return
 
@@ -520,10 +520,10 @@
         """Utility function to emit conditional byte-swaps."""
 
         if fixup:
-            print '    if (swap) {'
+            print('    if (swap) {')
             for name in fixup:
-                print '        %s = bswap_32(%s);' % (name, name)
-            print '    }'
+                print('        %s = bswap_32(%s);' % (name, name))
+            print('    }')
 
         return
 
@@ -532,14 +532,14 @@
         offset = p.offset
         dst = p.string()
         src = '(%s *)' % (p.type_string())
-        print '%-18s = *%11s(%s + %u);' % (dst, src, pc, offset + adjust);
+        print('%-18s = *%11s(%s + %u);' % (dst, src, pc, offset + adjust));
         return
 
 
     def common_func_print_just_header(self, f):
-        print 'int'
-        print '__glX%sReqSize( const GLbyte * pc, Bool swap, int reqlen )' % (f.name)
-        print '{'
+        print('int')
+        print('__glX%sReqSize( const GLbyte * pc, Bool swap, int reqlen )' % (f.name))
+        print('{')
 
 
     def printPixelFunction(self, f):
@@ -548,20 +548,20 @@
         f.offset_of( f.parameters[0].name )
         [dim, w, h, d, junk] = f.get_images()[0].get_dimensions()
 
-        print '    GLint row_length   = *  (GLint *)(pc +  4);'
+        print('    GLint row_length   = *  (GLint *)(pc +  4);')
 
         if dim < 3:
             fixup = ['row_length', 'skip_rows', 'alignment']
-            print '    GLint image_height = 0;'
-            print '    GLint skip_images  = 0;'
-            print '    GLint skip_rows    = *  (GLint *)(pc +  8);'
-            print '    GLint alignment    = *  (GLint *)(pc + 16);'
+            print('    GLint image_height = 0;')
+            print('    GLint skip_images  = 0;')
+            print('    GLint skip_rows    = *  (GLint *)(pc +  8);')
+            print('    GLint alignment    = *  (GLint *)(pc + 16);')
         else:
             fixup = ['row_length', 'image_height', 'skip_rows', 'skip_images', 'alignment']
-            print '    GLint image_height = *  (GLint *)(pc +  8);'
-            print '    GLint skip_rows    = *  (GLint *)(pc + 16);'
-            print '    GLint skip_images  = *  (GLint *)(pc + 20);'
-            print '    GLint alignment    = *  (GLint *)(pc + 32);'
+            print('    GLint image_height = *  (GLint *)(pc +  8);')
+            print('    GLint skip_rows    = *  (GLint *)(pc + 16);')
+            print('    GLint skip_images  = *  (GLint *)(pc + 20);')
+            print('    GLint alignment    = *  (GLint *)(pc + 32);')
 
         img = f.images[0]
         for p in f.parameterIterateGlxSend():
@@ -569,21 +569,21 @@
                 self.common_emit_one_arg(p, "pc", 0)
                 fixup.append( p.name )
 
-        print ''
+        print('')
 
         self.common_emit_fixups(fixup)
 
         if img.img_null_flag:
-            print ''
-            print '	   if (*(CARD32 *) (pc + %s))' % (img.offset - 4)
-            print '	       return 0;'
+            print('')
+            print('	   if (*(CARD32 *) (pc + %s))' % (img.offset - 4))
+            print('	       return 0;')
 
-        print ''
-        print '    return __glXImageSize(%s, %s, %s, %s, %s, %s,' % (img.img_format, img.img_type, img.img_target, w, h, d )
-        print '                          image_height, row_length, skip_images,'
-        print '                          skip_rows, alignment);'
-        print '}'
-        print ''
+        print('')
+        print('    return __glXImageSize(%s, %s, %s, %s, %s, %s,' % (img.img_format, img.img_type, img.img_target, w, h, d ))
+        print('                          image_height, row_length, skip_images,')
+        print('                          skip_rows, alignment);')
+        print('}')
+        print('')
         return
 
 
@@ -610,16 +610,16 @@
                 if s == 0: s = 1
 
                 sig += "(%u,%u)" % (f.offset_of(p.counter), s)
-		if size == '':
-		    size = p.size_string()
-		else:
-		    size = "safe_add(%s, %s)" % (size, p.size_string())
+                if size == '':
+                    size = p.size_string()
+                else:
+                    size = "safe_add(%s, %s)" % (size, p.size_string())
 
         # If the calculated signature matches a function that has
         # already be emitted, don't emit this function.  Instead, add
         # it to the list of function aliases.
 
-        if self.counter_sigs.has_key(sig):
+        if sig in self.counter_sigs:
             n = self.counter_sigs[sig];
             alias = [f.name, n]
         else:
@@ -632,13 +632,13 @@
                 self.common_emit_one_arg(p, "pc", 0)
 
 
-            print ''
+            print('')
             self.common_emit_fixups(fixup)
-            print ''
+            print('')
 
-            print '    return safe_pad(%s);' % (size)
-            print '}'
-            print ''
+            print('    return safe_pad(%s);' % (size))
+            print('}')
+            print('')
 
         return alias
 
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 38c1921..49807e1 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8423,16 +8423,6 @@
     </function>
 </category>
 
-<category name="GL_EXT_polygon_offset" number="3">
-    <enum name="POLYGON_OFFSET_BIAS_EXT"                  value="0x8039"/>
-
-    <function name="PolygonOffsetEXT" deprecated="3.1">
-        <param name="factor" type="GLfloat"/>
-        <param name="bias" type="GLfloat"/>
-        <glx rop="4098" ignore="true"/>
-    </function>
-</category>
-
 <category name="GL_EXT_texture" number="4">
     <enum name="ALPHA4_EXT"                               value="0x803B"/>
     <enum name="ALPHA8_EXT"                               value="0x803C"/>
@@ -10891,6 +10881,110 @@
 
 <!-- Extension number 180 is not listed in the extension registry. -->
 
+<category name="GL_ARB_sample_locations" number="181">
+    <enum name="SAMPLE_LOCATION_SUBPIXEL_BITS_ARB"             value="0x933D">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB"          value="0x933E">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB"         value="0x933F">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB"   value="0x9340">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_ARB"                           value="0x8E50">
+        <size name="GetMultisamplefv" mode="get"/>
+    </enum>
+
+    <enum name="PROGRAMMABLE_SAMPLE_LOCATION_ARB"              value="0x9341">
+        <size name="GetMultisamplefv" mode="get"/>
+    </enum>
+
+    <enum name="FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB" value="0x9342">
+        <size name="FramebufferParameteri"/>
+        <size name="GetFramebufferParameteri"/>
+    </enum>
+
+    <enum name="FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB"    value="0x9343">
+        <size name="FramebufferParameteri"/>
+        <size name="GetFramebufferParameteri"/>
+    </enum>
+
+    <function name="FramebufferSampleLocationsfvARB" no_error="true">
+        <param name="target" type="GLenum"/>
+        <param name="start"  type="GLuint"/>
+        <param name="count"  type="GLsizei"/>
+        <param name="v"      type="const GLfloat *"/>
+    </function>
+
+    <function name="NamedFramebufferSampleLocationsfvARB" no_error="true">
+        <param name="framebuffer" type="GLuint"/>
+        <param name="start"       type="GLuint"/>
+        <param name="count"       type="GLsizei"/>
+        <param name="v"           type="const GLfloat *"/>
+    </function>
+
+    <function name="EvaluateDepthValuesARB"/>
+</category>
+
+<category name="GL_NV_sample_locations" number="472">
+    <enum name="SAMPLE_LOCATION_SUBPIXEL_BITS_NV"             value="0x933D" alias="SAMPLE_LOCATION_SUBPIXEL_BITS_ARB">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_PIXEL_GRID_WIDTH_NV"          value="0x933E" alias="SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_NV"         value="0x933F" alias="SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_NV"   value="0x9340" alias="PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB">
+        <size name="Get" mode="get"/>
+    </enum>
+
+    <enum name="SAMPLE_LOCATION_NV"                           value="0x8E50" alias="SAMPLE_LOCATION_ARB">
+        <size name="GetMultisamplefv" mode="get"/>
+    </enum>
+
+    <enum name="PROGRAMMABLE_SAMPLE_LOCATION_NV"              value="0x9341" alias="PROGRAMMABLE_SAMPLE_LOCATION_ARB">
+        <size name="GetMultisamplefv" mode="get"/>
+    </enum>
+
+    <enum name="FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_NV" value="0x9342" alias="FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB">
+        <size name="FramebufferParameteri"/>
+        <size name="GetFramebufferParameteri"/>
+    </enum>
+
+    <enum name="FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_NV"    value="0x9343" alias="FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB">
+        <size name="FramebufferParameteri"/>
+        <size name="GetFramebufferParameteri"/>
+    </enum>
+
+    <function name="FramebufferSampleLocationsfvNV" no_error="true" es2="3.1" alias="FramebufferSampleLocationsfvARB">
+        <param name="target" type="GLenum"/>
+        <param name="start"  type="GLuint"/>
+        <param name="count"  type="GLsizei"/>
+        <param name="v"      type="const GLfloat *"/>
+    </function>
+
+    <function name="NamedFramebufferSampleLocationsfvNV" no_error="true" es2="3.1" alias="NamedFramebufferSampleLocationsfvARB">
+        <param name="framebuffer" type="GLuint"/>
+        <param name="start"       type="GLuint"/>
+        <param name="count"       type="GLsizei"/>
+        <param name="v"           type="const GLfloat *"/>
+    </function>
+
+    <function name="ResolveDepthValuesNV" es2="3.1" alias="EvaluateDepthValuesARB"/>
+</category>
+
 <category name="GL_SUN_convolution_border_modes" number="182">
     <enum name="WRAP_BORDER_SUN"                          value="0x81D4"/>
 </category>
@@ -12871,6 +12965,53 @@
   <enum name="CONSERVATIVE_RASTERIZATION_INTEL" value="0x83FE"/>
 </category>
 
+<category name="GL_NV_conservative_raster" number="465">
+    <enum name="CONSERVATIVE_RASTERIZATION_NV"       value="0x9346">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="SUBPIXEL_PRECISION_BIAS_X_BITS_NV"   value="0x9347">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="SUBPIXEL_PRECISION_BIAS_Y_BITS_NV"   value="0x9348">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="MAX_SUBPIXEL_PRECISION_BIAS_BITS_NV" value="0x9349">
+        <size name="Get" mode="get"/>
+    </enum>
+    <function name="SubpixelPrecisionBiasNV" es1="1.0" es2="2.0" no_error="true">
+        <param name="xbits" type="GLuint"/>
+        <param name="ybits" type="GLuint"/>
+    </function>
+</category>
+
+<category name="GL_NV_conservative_raster_dilate" number="480">
+    <enum name="CONSERVATIVE_RASTER_DILATE_NV"             value="0x9379">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="CONSERVATIVE_RASTER_DILATE_RANGE_NV"       value="0x937A">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="CONSERVATIVE_RASTER_DILATE_GRANULARITY_NV" value="0x937B">
+        <size name="Get" mode="get"/>
+    </enum>
+    <function name="ConservativeRasterParameterfNV" es1="1.0" es2="2.0" no_error="true">
+        <param name="pname" type="GLenum"/>
+        <param name="param" type="GLfloat"/>
+    </function>
+</category>
+
+<category name="GL_NV_conservative_pre_snap_triangles" number="487">
+    <enum name="CONSERVATIVE_RASTER_MODE_NV"       value="0x954D">
+        <size name="Get" mode="get"/>
+    </enum>
+    <enum name="CONSERVATIVE_RASTER_MODE_POST_SNAP_NV"   value="0x954E"/>
+    <enum name="CONSERVATIVE_RASTER_MODE_PRE_SNAP_TRIANGLES_NV"   value="0x954F"/>
+    <function name="ConservativeRasterParameteriNV" es1="1.0" es2="2.0" no_error="true">
+        <param name="pname" type="GLenum"/>
+        <param name="param" type="GLint"/>
+    </function>
+</category>
+
 <xi:include href="INTEL_performance_query.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <category name="GL_EXT_polygon_offset_clamp" number="460">
diff --git a/src/mapi/glapi/gen/gl_SPARC_asm.py b/src/mapi/glapi/gen/gl_SPARC_asm.py
index 7b5714e..0152958 100644
--- a/src/mapi/glapi/gen/gl_SPARC_asm.py
+++ b/src/mapi/glapi/gen/gl_SPARC_asm.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -39,192 +41,192 @@
 
 
     def printRealHeader(self):
-        print '#ifdef __arch64__'
-        print '#define GL_OFF(N)\t((N) * 8)'
-        print '#define GL_LL\t\tldx'
-        print '#define GL_TIE_LD(SYM)\t%tie_ldx(SYM)'
-        print '#define GL_STACK_SIZE\t128'
-        print '#else'
-        print '#define GL_OFF(N)\t((N) * 4)'
-        print '#define GL_LL\t\tld'
-        print '#define GL_TIE_LD(SYM)\t%tie_ld(SYM)'
-        print '#define GL_STACK_SIZE\t64'
-        print '#endif'
-        print ''
-        print '#define GLOBL_FN(x) .globl x ; .type x, @function'
-        print '#define HIDDEN(x) .hidden x'
-        print ''
-        print '\t.register %g2, #scratch'
-        print '\t.register %g3, #scratch'
-        print ''
-        print '\t.text'
-        print ''
-        print '\tGLOBL_FN(__glapi_sparc_icache_flush)'
-        print '\tHIDDEN(__glapi_sparc_icache_flush)'
-        print '\t.type\t__glapi_sparc_icache_flush, @function'
-        print '__glapi_sparc_icache_flush: /* %o0 = insn_addr */'
-        print '\tflush\t%o0'
-        print '\tretl'
-        print '\t nop'
-        print ''
-        print '\t.align\t32'
-        print ''
-        print '\t.type\t__glapi_sparc_get_pc, @function'
-        print '__glapi_sparc_get_pc:'
-        print '\tretl'
-        print '\t add\t%o7, %g2, %g2'
-        print '\t.size\t__glapi_sparc_get_pc, .-__glapi_sparc_get_pc'
-        print ''
-        print '#ifdef GLX_USE_TLS'
-        print ''
-        print '\tGLOBL_FN(__glapi_sparc_get_dispatch)'
-        print '\tHIDDEN(__glapi_sparc_get_dispatch)'
-        print '__glapi_sparc_get_dispatch:'
-        print '\tmov\t%o7, %g1'
-        print '\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2'
-        print '\tcall\t__glapi_sparc_get_pc'
-        print '\tadd\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2'
-        print '\tmov\t%g1, %o7'
-        print '\tsethi\t%tie_hi22(_glapi_tls_Dispatch), %g1'
-        print '\tadd\t%g1, %tie_lo10(_glapi_tls_Dispatch), %g1'
-        print '\tGL_LL\t[%g2 + %g1], %g2, GL_TIE_LD(_glapi_tls_Dispatch)'
-        print '\tretl'
-        print '\t mov\t%g2, %o0'
-        print ''
-        print '\t.data'
-        print '\t.align\t32'
-        print ''
-        print '\t/* --> sethi %hi(_glapi_tls_Dispatch), %g1 */'
-        print '\t/* --> or %g1, %lo(_glapi_tls_Dispatch), %g1 */'
-        print '\tGLOBL_FN(__glapi_sparc_tls_stub)'
-        print '\tHIDDEN(__glapi_sparc_tls_stub)'
-        print '__glapi_sparc_tls_stub: /* Call offset in %g3 */'
-        print '\tmov\t%o7, %g1'
-        print '\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2'
-        print '\tcall\t__glapi_sparc_get_pc'
-        print '\tadd\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2'
-        print '\tmov\t%g1, %o7'
-        print '\tsrl\t%g3, 10, %g3'
-        print '\tsethi\t%tie_hi22(_glapi_tls_Dispatch), %g1'
-        print '\tadd\t%g1, %tie_lo10(_glapi_tls_Dispatch), %g1'
-        print '\tGL_LL\t[%g2 + %g1], %g2, GL_TIE_LD(_glapi_tls_Dispatch)'
-        print '\tGL_LL\t[%g7+%g2], %g1'
-        print '\tGL_LL\t[%g1 + %g3], %g1'
-        print '\tjmp\t%g1'
-        print '\t nop'
-        print '\t.size\t__glapi_sparc_tls_stub, .-__glapi_sparc_tls_stub'
-        print ''
-        print '#define GL_STUB(fn, off)\t\t\t\t\\'
-        print '\tGLOBL_FN(fn);\t\t\t\t\t\\'
-        print 'fn:\tba\t__glapi_sparc_tls_stub;\t\t\t\\'
-        print '\t sethi\tGL_OFF(off), %g3;\t\t\t\\'
-        print '\t.size\tfn,.-fn;'
-        print ''
-        print '#elif defined(HAVE_PTHREAD)'
-        print ''
-        print '\t/* 64-bit 0x00 --> sethi %hh(_glapi_Dispatch), %g1 */'
-        print '\t/* 64-bit 0x04 --> sethi %lm(_glapi_Dispatch), %g2 */'
-        print '\t/* 64-bit 0x08 --> or %g1, %hm(_glapi_Dispatch), %g1 */'
-        print '\t/* 64-bit 0x0c --> sllx %g1, 32, %g1 */'
-        print '\t/* 64-bit 0x10 --> add %g1, %g2, %g1 */'
-        print '\t/* 64-bit 0x14 --> ldx [%g1 + %lo(_glapi_Dispatch)], %g1 */'
-        print ''
-        print '\t/* 32-bit 0x00 --> sethi %hi(_glapi_Dispatch), %g1 */'
-        print '\t/* 32-bit 0x04 --> ld [%g1 + %lo(_glapi_Dispatch)], %g1 */'
-        print ''
-        print '\t.data'
-        print '\t.align\t32'
-        print ''
-        print '\tGLOBL_FN(__glapi_sparc_pthread_stub)'
-        print '\tHIDDEN(__glapi_sparc_pthread_stub)'
-        print '__glapi_sparc_pthread_stub: /* Call offset in %g3 */'
-        print '\tmov\t%o7, %g1'
-        print '\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2'
-        print '\tcall\t__glapi_sparc_get_pc'
-        print '\t add\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2'
-        print '\tmov\t%g1, %o7'
-        print '\tsethi\t%hi(_glapi_Dispatch), %g1'
-        print '\tor\t%g1, %lo(_glapi_Dispatch), %g1'
-        print '\tsrl\t%g3, 10, %g3'
-        print '\tGL_LL\t[%g2+%g1], %g2'
-        print '\tGL_LL\t[%g2], %g1'
-        print '\tcmp\t%g1, 0'
-        print '\tbe\t2f'
-        print '\t nop'
-        print '1:\tGL_LL\t[%g1 + %g3], %g1'
-        print '\tjmp\t%g1'
-        print '\t nop'
-        print '2:\tsave\t%sp, GL_STACK_SIZE, %sp'
-        print '\tmov\t%g3, %l0'
-        print '\tcall\t_glapi_get_dispatch'
-        print '\t nop'
-        print '\tmov\t%o0, %g1'
-        print '\tmov\t%l0, %g3'
-        print '\tba\t1b'
-        print '\t restore %g0, %g0, %g0'
-        print '\t.size\t__glapi_sparc_pthread_stub, .-__glapi_sparc_pthread_stub'
-        print ''
-        print '#define GL_STUB(fn, off)\t\t\t\\'
-        print '\tGLOBL_FN(fn);\t\t\t\t\\'
-        print 'fn:\tba\t__glapi_sparc_pthread_stub;\t\\'
-        print '\t sethi\tGL_OFF(off), %g3;\t\t\\'
-        print '\t.size\tfn,.-fn;'
-        print ''
-        print '#else /* Non-threaded version. */'
-        print ''
-        print '\t.type	__glapi_sparc_nothread_stub, @function'
-        print '__glapi_sparc_nothread_stub: /* Call offset in %g3 */'
-        print '\tmov\t%o7, %g1'
-        print '\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2'
-        print '\tcall\t__glapi_sparc_get_pc'
-        print '\t add\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2'
-        print '\tmov\t%g1, %o7'
-        print '\tsrl\t%g3, 10, %g3'
-        print '\tsethi\t%hi(_glapi_Dispatch), %g1'
-        print '\tor\t%g1, %lo(_glapi_Dispatch), %g1'
-        print '\tGL_LL\t[%g2+%g1], %g2'
-        print '\tGL_LL\t[%g2], %g1'
-        print '\tGL_LL\t[%g1 + %g3], %g1'
-        print '\tjmp\t%g1'
-        print '\t nop'
-        print '\t.size\t__glapi_sparc_nothread_stub, .-__glapi_sparc_nothread_stub'
-        print ''
-        print '#define GL_STUB(fn, off)\t\t\t\\'
-        print '\tGLOBL_FN(fn);\t\t\t\t\\'
-        print 'fn:\tba\t__glapi_sparc_nothread_stub;\t\\'
-        print '\t sethi\tGL_OFF(off), %g3;\t\t\\'
-        print '\t.size\tfn,.-fn;'
-        print ''
-        print '#endif'
-        print ''
-        print '#define GL_STUB_ALIAS(fn, alias)		\\'
-        print '	.globl	fn;				\\'
-        print '	.set	fn, alias'
-        print ''
-        print '\t.text'
-        print '\t.align\t32'
-        print ''
-        print '\t.globl\tgl_dispatch_functions_start'
-        print '\tHIDDEN(gl_dispatch_functions_start)'
-        print 'gl_dispatch_functions_start:'
-        print ''
+        print('#ifdef __arch64__')
+        print('#define GL_OFF(N)\t((N) * 8)')
+        print('#define GL_LL\t\tldx')
+        print('#define GL_TIE_LD(SYM)\t%tie_ldx(SYM)')
+        print('#define GL_STACK_SIZE\t128')
+        print('#else')
+        print('#define GL_OFF(N)\t((N) * 4)')
+        print('#define GL_LL\t\tld')
+        print('#define GL_TIE_LD(SYM)\t%tie_ld(SYM)')
+        print('#define GL_STACK_SIZE\t64')
+        print('#endif')
+        print('')
+        print('#define GLOBL_FN(x) .globl x ; .type x, @function')
+        print('#define HIDDEN(x) .hidden x')
+        print('')
+        print('\t.register %g2, #scratch')
+        print('\t.register %g3, #scratch')
+        print('')
+        print('\t.text')
+        print('')
+        print('\tGLOBL_FN(__glapi_sparc_icache_flush)')
+        print('\tHIDDEN(__glapi_sparc_icache_flush)')
+        print('\t.type\t__glapi_sparc_icache_flush, @function')
+        print('__glapi_sparc_icache_flush: /* %o0 = insn_addr */')
+        print('\tflush\t%o0')
+        print('\tretl')
+        print('\t nop')
+        print('')
+        print('\t.align\t32')
+        print('')
+        print('\t.type\t__glapi_sparc_get_pc, @function')
+        print('__glapi_sparc_get_pc:')
+        print('\tretl')
+        print('\t add\t%o7, %g2, %g2')
+        print('\t.size\t__glapi_sparc_get_pc, .-__glapi_sparc_get_pc')
+        print('')
+        print('#ifdef GLX_USE_TLS')
+        print('')
+        print('\tGLOBL_FN(__glapi_sparc_get_dispatch)')
+        print('\tHIDDEN(__glapi_sparc_get_dispatch)')
+        print('__glapi_sparc_get_dispatch:')
+        print('\tmov\t%o7, %g1')
+        print('\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2')
+        print('\tcall\t__glapi_sparc_get_pc')
+        print('\tadd\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2')
+        print('\tmov\t%g1, %o7')
+        print('\tsethi\t%tie_hi22(_glapi_tls_Dispatch), %g1')
+        print('\tadd\t%g1, %tie_lo10(_glapi_tls_Dispatch), %g1')
+        print('\tGL_LL\t[%g2 + %g1], %g2, GL_TIE_LD(_glapi_tls_Dispatch)')
+        print('\tretl')
+        print('\t mov\t%g2, %o0')
+        print('')
+        print('\t.data')
+        print('\t.align\t32')
+        print('')
+        print('\t/* --> sethi %hi(_glapi_tls_Dispatch), %g1 */')
+        print('\t/* --> or %g1, %lo(_glapi_tls_Dispatch), %g1 */')
+        print('\tGLOBL_FN(__glapi_sparc_tls_stub)')
+        print('\tHIDDEN(__glapi_sparc_tls_stub)')
+        print('__glapi_sparc_tls_stub: /* Call offset in %g3 */')
+        print('\tmov\t%o7, %g1')
+        print('\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2')
+        print('\tcall\t__glapi_sparc_get_pc')
+        print('\tadd\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2')
+        print('\tmov\t%g1, %o7')
+        print('\tsrl\t%g3, 10, %g3')
+        print('\tsethi\t%tie_hi22(_glapi_tls_Dispatch), %g1')
+        print('\tadd\t%g1, %tie_lo10(_glapi_tls_Dispatch), %g1')
+        print('\tGL_LL\t[%g2 + %g1], %g2, GL_TIE_LD(_glapi_tls_Dispatch)')
+        print('\tGL_LL\t[%g7+%g2], %g1')
+        print('\tGL_LL\t[%g1 + %g3], %g1')
+        print('\tjmp\t%g1')
+        print('\t nop')
+        print('\t.size\t__glapi_sparc_tls_stub, .-__glapi_sparc_tls_stub')
+        print('')
+        print('#define GL_STUB(fn, off)\t\t\t\t\\')
+        print('\tGLOBL_FN(fn);\t\t\t\t\t\\')
+        print('fn:\tba\t__glapi_sparc_tls_stub;\t\t\t\\')
+        print('\t sethi\tGL_OFF(off), %g3;\t\t\t\\')
+        print('\t.size\tfn,.-fn;')
+        print('')
+        print('#elif defined(HAVE_PTHREAD)')
+        print('')
+        print('\t/* 64-bit 0x00 --> sethi %hh(_glapi_Dispatch), %g1 */')
+        print('\t/* 64-bit 0x04 --> sethi %lm(_glapi_Dispatch), %g2 */')
+        print('\t/* 64-bit 0x08 --> or %g1, %hm(_glapi_Dispatch), %g1 */')
+        print('\t/* 64-bit 0x0c --> sllx %g1, 32, %g1 */')
+        print('\t/* 64-bit 0x10 --> add %g1, %g2, %g1 */')
+        print('\t/* 64-bit 0x14 --> ldx [%g1 + %lo(_glapi_Dispatch)], %g1 */')
+        print('')
+        print('\t/* 32-bit 0x00 --> sethi %hi(_glapi_Dispatch), %g1 */')
+        print('\t/* 32-bit 0x04 --> ld [%g1 + %lo(_glapi_Dispatch)], %g1 */')
+        print('')
+        print('\t.data')
+        print('\t.align\t32')
+        print('')
+        print('\tGLOBL_FN(__glapi_sparc_pthread_stub)')
+        print('\tHIDDEN(__glapi_sparc_pthread_stub)')
+        print('__glapi_sparc_pthread_stub: /* Call offset in %g3 */')
+        print('\tmov\t%o7, %g1')
+        print('\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2')
+        print('\tcall\t__glapi_sparc_get_pc')
+        print('\t add\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2')
+        print('\tmov\t%g1, %o7')
+        print('\tsethi\t%hi(_glapi_Dispatch), %g1')
+        print('\tor\t%g1, %lo(_glapi_Dispatch), %g1')
+        print('\tsrl\t%g3, 10, %g3')
+        print('\tGL_LL\t[%g2+%g1], %g2')
+        print('\tGL_LL\t[%g2], %g1')
+        print('\tcmp\t%g1, 0')
+        print('\tbe\t2f')
+        print('\t nop')
+        print('1:\tGL_LL\t[%g1 + %g3], %g1')
+        print('\tjmp\t%g1')
+        print('\t nop')
+        print('2:\tsave\t%sp, GL_STACK_SIZE, %sp')
+        print('\tmov\t%g3, %l0')
+        print('\tcall\t_glapi_get_dispatch')
+        print('\t nop')
+        print('\tmov\t%o0, %g1')
+        print('\tmov\t%l0, %g3')
+        print('\tba\t1b')
+        print('\t restore %g0, %g0, %g0')
+        print('\t.size\t__glapi_sparc_pthread_stub, .-__glapi_sparc_pthread_stub')
+        print('')
+        print('#define GL_STUB(fn, off)\t\t\t\\')
+        print('\tGLOBL_FN(fn);\t\t\t\t\\')
+        print('fn:\tba\t__glapi_sparc_pthread_stub;\t\\')
+        print('\t sethi\tGL_OFF(off), %g3;\t\t\\')
+        print('\t.size\tfn,.-fn;')
+        print('')
+        print('#else /* Non-threaded version. */')
+        print('')
+        print('\t.type	__glapi_sparc_nothread_stub, @function')
+        print('__glapi_sparc_nothread_stub: /* Call offset in %g3 */')
+        print('\tmov\t%o7, %g1')
+        print('\tsethi\t%hi(_GLOBAL_OFFSET_TABLE_-4), %g2')
+        print('\tcall\t__glapi_sparc_get_pc')
+        print('\t add\t%g2, %lo(_GLOBAL_OFFSET_TABLE_+4), %g2')
+        print('\tmov\t%g1, %o7')
+        print('\tsrl\t%g3, 10, %g3')
+        print('\tsethi\t%hi(_glapi_Dispatch), %g1')
+        print('\tor\t%g1, %lo(_glapi_Dispatch), %g1')
+        print('\tGL_LL\t[%g2+%g1], %g2')
+        print('\tGL_LL\t[%g2], %g1')
+        print('\tGL_LL\t[%g1 + %g3], %g1')
+        print('\tjmp\t%g1')
+        print('\t nop')
+        print('\t.size\t__glapi_sparc_nothread_stub, .-__glapi_sparc_nothread_stub')
+        print('')
+        print('#define GL_STUB(fn, off)\t\t\t\\')
+        print('\tGLOBL_FN(fn);\t\t\t\t\\')
+        print('fn:\tba\t__glapi_sparc_nothread_stub;\t\\')
+        print('\t sethi\tGL_OFF(off), %g3;\t\t\\')
+        print('\t.size\tfn,.-fn;')
+        print('')
+        print('#endif')
+        print('')
+        print('#define GL_STUB_ALIAS(fn, alias)		\\')
+        print('	.globl	fn;				\\')
+        print('	.set	fn, alias')
+        print('')
+        print('\t.text')
+        print('\t.align\t32')
+        print('')
+        print('\t.globl\tgl_dispatch_functions_start')
+        print('\tHIDDEN(gl_dispatch_functions_start)')
+        print('gl_dispatch_functions_start:')
+        print('')
         return
 
     def printRealFooter(self):
-        print ''
-        print '\t.globl\tgl_dispatch_functions_end'
-        print '\tHIDDEN(gl_dispatch_functions_end)'
-        print 'gl_dispatch_functions_end:'
+        print('')
+        print('\t.globl\tgl_dispatch_functions_end')
+        print('\tHIDDEN(gl_dispatch_functions_end)')
+        print('gl_dispatch_functions_end:')
         return
 
     def printBody(self, api):
         for f in api.functionIterateByOffset():
             name = f.dispatch_name()
 
-            print '\tGL_STUB(gl%s, %d)' % (name, f.offset)
+            print('\tGL_STUB(gl%s, %d)' % (name, f.offset))
 
             if not f.is_static_entry_point(f.name):
-                print '\tHIDDEN(gl%s)' % (name)
+                print('\tHIDDEN(gl%s)' % (name))
 
         for f in api.functionIterateByOffset():
             name = f.dispatch_name()
@@ -235,11 +237,11 @@
                         text = '\tGL_STUB_ALIAS(gl%s, gl%s)' % (n, f.name)
 
                         if f.has_different_protocol(n):
-                            print '#ifndef GLX_INDIRECT_RENDERING'
-                            print text
-                            print '#endif'
+                            print('#ifndef GLX_INDIRECT_RENDERING')
+                            print(text)
+                            print('#endif')
                         else:
-                            print text
+                            print(text)
 
         return
 
diff --git a/src/mapi/glapi/gen/gl_XML.py b/src/mapi/glapi/gen/gl_XML.py
index a5320e9..b4aa6be 100644
--- a/src/mapi/glapi/gen/gl_XML.py
+++ b/src/mapi/glapi/gen/gl_XML.py
@@ -24,9 +24,12 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
+from collections import OrderedDict
 from decimal import Decimal
 import xml.etree.ElementTree as ET
-import re, sys, string
+import re, sys
 import os.path
 import typeexpr
 import static_data
@@ -125,17 +128,17 @@
     def printHeader(self):
         """Print the header associated with all files and call the printRealHeader method."""
 
-        print '/* DO NOT EDIT - This file generated automatically by %s script */' \
-                % (self.name)
-        print ''
-        print '/*'
-        print (' * ' + self.license.replace('\n', '\n * ')).replace(' \n', '\n')
-        print ' */'
-        print ''
+        print('/* DO NOT EDIT - This file generated automatically by %s script */' \
+                % (self.name))
+        print('')
+        print('/*')
+        print((' * ' + self.license.replace('\n', '\n * ')).replace(' \n', '\n'))
+        print(' */')
+        print('')
         if self.header_tag:
-            print '#if !defined( %s )' % (self.header_tag)
-            print '#  define %s' % (self.header_tag)
-            print ''
+            print('#if !defined( %s )' % (self.header_tag))
+            print('#  define %s' % (self.header_tag))
+            print('')
         self.printRealHeader();
         return
 
@@ -146,13 +149,13 @@
         self.printRealFooter()
 
         if self.undef_list:
-            print ''
+            print('')
             for u in self.undef_list:
-                print "#  undef %s" % (u)
+                print("#  undef %s" % (u))
 
         if self.header_tag:
-            print ''
-            print '#endif /* !defined( %s ) */' % (self.header_tag)
+            print('')
+            print('#endif /* !defined( %s ) */' % (self.header_tag))
 
 
     def printRealHeader(self):
@@ -182,11 +185,11 @@
         The name is also added to the file's undef_list.
         """
         self.undef_list.append("PURE")
-        print """#  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+        print("""#  if defined(__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 #    define PURE __attribute__((pure))
 #  else
 #    define PURE
-#  endif"""
+#  endif""")
         return
 
 
@@ -202,11 +205,11 @@
         """
 
         self.undef_list.append("FASTCALL")
-        print """#  if defined(__i386__) && defined(__GNUC__) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+        print("""#  if defined(__i386__) && defined(__GNUC__) && !defined(__CYGWIN__) && !defined(__MINGW32__)
 #    define FASTCALL __attribute__((fastcall))
 #  else
 #    define FASTCALL
-#  endif"""
+#  endif""")
         return
 
 
@@ -222,11 +225,11 @@
         """
 
         self.undef_list.append(S)
-        print """#  if defined(__GNUC__) && !defined(__CYGWIN__) && !defined(__MINGW32__)
+        print("""#  if defined(__GNUC__) && !defined(__CYGWIN__) && !defined(__MINGW32__)
 #    define %s  __attribute__((visibility("%s")))
 #  else
 #    define %s
-#  endif""" % (S, s, S)
+#  endif""" % (S, s, S))
         return
 
 
@@ -242,11 +245,11 @@
         """
 
         self.undef_list.append("NOINLINE")
-        print """#  if defined(__GNUC__)
+        print("""#  if defined(__GNUC__)
 #    define NOINLINE __attribute__((noinline))
 #  else
 #    define NOINLINE
-#  endif"""
+#  endif""")
         return
 
 
@@ -281,7 +284,7 @@
 
     try:
         core_version = float(name)
-    except Exception,e:
+    except Exception:
         core_version = 0.0
 
     if core_version > 0.0:
@@ -317,7 +320,7 @@
 
     if len(list) == 0: list = ["void"]
 
-    return string.join(list, ", ")
+    return ", ".join(list)
 
 
 class gl_item(object):
@@ -362,7 +365,7 @@
         else:
             try:
                 c = int(temp)
-            except Exception,e:
+            except Exception:
                 raise RuntimeError('Invalid count value "%s" for enum "%s" in function "%s" when an integer was expected.' % (temp, self.name, n))
 
             self.default_count = c
@@ -423,7 +426,7 @@
             count = int(c)
             self.count = count
             self.counter = None
-        except Exception,e:
+        except Exception:
             count = 1
             self.count = 0
             self.counter = c
@@ -575,9 +578,9 @@
                 list.append( str(s) )
 
             if len(list) > 1 and use_parens :
-                return "safe_mul(%s)" % (string.join(list, ", "))
+                return "safe_mul(%s)" % ", ".join(list)
             else:
-                return string.join(list, " * ")
+                return " * ".join(list)
 
         elif self.is_image():
             return "compsize"
@@ -779,9 +782,9 @@
 
     def parameterIterator(self, name = None):
         if name is not None:
-            return self.entry_point_parameters[name].__iter__();
+            return iter(self.entry_point_parameters[name]);
         else:
-            return self.parameters.__iter__();
+            return iter(self.parameters);
 
 
     def get_parameter_string(self, entrypoint = None):
@@ -831,7 +834,7 @@
         versions.
         """
         result = []
-        for entry_point, api_to_ver in self.entry_point_api_map.iteritems():
+        for entry_point, api_to_ver in self.entry_point_api_map.items():
             if api not in api_to_ver:
                 continue
             if version is not None and version < api_to_ver[api]:
@@ -861,7 +864,7 @@
 
 class gl_api(object):
     def __init__(self, factory):
-        self.functions_by_name = {}
+        self.functions_by_name = OrderedDict()
         self.enums_by_name = {}
         self.types_by_name = {}
 
@@ -878,7 +881,7 @@
     def filter_functions(self, entry_point_list):
         """Filter out entry points not in entry_point_list."""
         functions_by_name = {}
-        for func in self.functions_by_name.itervalues():
+        for func in self.functions_by_name.values():
             entry_points = [ent for ent in func.entry_points if ent in entry_point_list]
             if entry_points:
                 func.filter_entry_points(entry_points)
@@ -891,7 +894,7 @@
         optionally, not in the given version of the given API).
         """
         functions_by_name = {}
-        for func in self.functions_by_name.itervalues():
+        for func in self.functions_by_name.values():
             entry_points = func.entry_points_for_api_version(api, version)
             if entry_points:
                 func.filter_entry_points(entry_points)
@@ -940,7 +943,7 @@
                 temp_name = child.get( "name" )
                 self.category_dict[ temp_name ] = [cat_name, cat_number]
 
-                if self.functions_by_name.has_key( func_name ):
+                if func_name in self.functions_by_name:
                     func = self.functions_by_name[ func_name ]
                     func.process_element( child )
                 else:
@@ -977,7 +980,7 @@
             if (cat == None) or (cat == cat_name):
                 [func_cat_type, key] = classify_category(cat_name, cat_number)
 
-                if not lists[func_cat_type].has_key(key):
+                if key not in lists[func_cat_type]:
                     lists[func_cat_type][key] = {}
 
                 lists[func_cat_type][key][func.name] = func
@@ -985,28 +988,26 @@
 
         functions = []
         for func_cat_type in range(0,4):
-            keys = lists[func_cat_type].keys()
-            keys.sort()
+            keys = sorted(lists[func_cat_type].keys())
 
             for key in keys:
-                names = lists[func_cat_type][key].keys()
-                names.sort()
+                names = sorted(lists[func_cat_type][key].keys())
 
                 for name in names:
                     functions.append(lists[func_cat_type][key][name])
 
-        return functions.__iter__()
+        return iter(functions)
 
 
     def functionIterateByOffset(self):
         max_offset = -1
-        for func in self.functions_by_name.itervalues():
+        for func in self.functions_by_name.values():
             if func.offset > max_offset:
                 max_offset = func.offset
 
 
         temp = [None for i in range(0, max_offset + 1)]
-        for func in self.functions_by_name.itervalues():
+        for func in self.functions_by_name.values():
             if func.offset != -1:
                 temp[ func.offset ] = func
 
@@ -1016,22 +1017,21 @@
             if temp[i]:
                 list.append(temp[i])
 
-        return list.__iter__();
+        return iter(list);
 
 
     def functionIterateAll(self):
-        return self.functions_by_name.itervalues()
+        return self.functions_by_name.values()
 
 
     def enumIterateByName(self):
-        keys = self.enums_by_name.keys()
-        keys.sort()
+        keys = sorted(self.enums_by_name.keys())
 
         list = []
         for enum in keys:
             list.append( self.enums_by_name[ enum ] )
 
-        return list.__iter__()
+        return iter(list)
 
 
     def categoryIterate(self):
@@ -1044,29 +1044,28 @@
 
         list = []
         for cat_type in range(0,4):
-            keys = self.categories[cat_type].keys()
-            keys.sort()
+            keys = sorted(self.categories[cat_type].keys())
 
             for key in keys:
                 list.append(self.categories[cat_type][key])
 
-        return list.__iter__()
+        return iter(list)
 
 
     def get_category_for_name( self, name ):
-        if self.category_dict.has_key(name):
+        if name in self.category_dict:
             return self.category_dict[name]
         else:
             return ["<unknown category>", None]
 
 
     def typeIterate(self):
-        return self.types_by_name.itervalues()
+        return self.types_by_name.values()
 
 
     def find_type( self, type_name ):
         if type_name in self.types_by_name:
             return self.types_by_name[ type_name ].type_expr
         else:
-            print "Unable to find base type matching \"%s\"." % (type_name)
+            print("Unable to find base type matching \"%s\"." % (type_name))
             return None
diff --git a/src/mapi/glapi/gen/gl_apitemp.py b/src/mapi/glapi/gen/gl_apitemp.py
index a8e5d81..d0583f4 100644
--- a/src/mapi/glapi/gen/gl_apitemp.py
+++ b/src/mapi/glapi/gen/gl_apitemp.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import gl_XML, glX_XML
@@ -97,27 +99,27 @@
             if (cat.startswith("es") or cat.startswith("GL_OES")):
                 need_proto = True
         if need_proto:
-            print '%s %s KEYWORD2 NAME(%s)(%s);' % (keyword, f.return_type, n, f.get_parameter_string(name))
-            print ''
+            print('%s %s KEYWORD2 NAME(%s)(%s);' % (keyword, f.return_type, n, f.get_parameter_string(name)))
+            print('')
 
-        print '%s %s KEYWORD2 NAME(%s)(%s)' % (keyword, f.return_type, n, f.get_parameter_string(name))
-        print '{'
+        print('%s %s KEYWORD2 NAME(%s)(%s)' % (keyword, f.return_type, n, f.get_parameter_string(name)))
+        print('{')
         if silence:
-            print '    %s' % (silence)
+            print('    %s' % (silence))
         if p_string == "":
-            print '   %s(%s, (), (F, "gl%s();\\n"));' \
-                    % (dispatch, f.name, name)
+            print('   %s(%s, (), (F, "gl%s();\\n"));' \
+                    % (dispatch, f.name, name))
         else:
-            print '   %s(%s, (%s), (F, "gl%s(%s);\\n", %s));' \
-                    % (dispatch, f.name, p_string, name, t_string, o_string)
-        print '}'
-        print ''
+            print('   %s(%s, (%s), (F, "gl%s(%s);\\n", %s));' \
+                    % (dispatch, f.name, p_string, name, t_string, o_string))
+        print('}')
+        print('')
         return
 
     def printRealHeader(self):
-        print ''
+        print('')
         self.printVisibility( "HIDDEN", "hidden" )
-        print """
+        print("""
 /*
  * This file is a template which generates the OpenGL API entry point
  * functions.  It should be included by a .c file which first defines
@@ -164,13 +166,13 @@
 #error RETURN_DISPATCH must be defined
 #endif
 
-"""
+""")
         return
 
 
 
     def printInitDispatch(self, api):
-        print """
+        print("""
 #endif /* defined( NAME ) */
 
 /*
@@ -187,31 +189,31 @@
 #error _GLAPI_SKIP_NORMAL_ENTRY_POINTS must not be defined
 #endif
 
-_glapi_proc DISPATCH_TABLE_NAME[] = {"""
+_glapi_proc DISPATCH_TABLE_NAME[] = {""")
         for f in api.functionIterateByOffset():
-            print '   TABLE_ENTRY(%s),' % (f.dispatch_name())
+            print('   TABLE_ENTRY(%s),' % (f.dispatch_name()))
 
-        print '   /* A whole bunch of no-op functions.  These might be called'
-        print '    * when someone tries to call a dynamically-registered'
-        print '    * extension function without a current rendering context.'
-        print '    */'
+        print('   /* A whole bunch of no-op functions.  These might be called')
+        print('    * when someone tries to call a dynamically-registered')
+        print('    * extension function without a current rendering context.')
+        print('    */')
         for i in range(1, 100):
-            print '   TABLE_ENTRY(Unused),'
+            print('   TABLE_ENTRY(Unused),')
 
-        print '};'
-        print '#endif /* DISPATCH_TABLE_NAME */'
-        print ''
+        print('};')
+        print('#endif /* DISPATCH_TABLE_NAME */')
+        print('')
         return
 
 
     def printAliasedTable(self, api):
-        print """
+        print("""
 /*
  * This is just used to silence compiler warnings.
  * We list the functions which are not otherwise used.
  */
 #ifdef UNUSED_TABLE_NAME
-_glapi_proc UNUSED_TABLE_NAME[] = {"""
+_glapi_proc UNUSED_TABLE_NAME[] = {""")
 
         normal_entries = []
         proto_entries = []
@@ -230,18 +232,18 @@
             normal_entries.extend(normal_ents)
             proto_entries.extend(proto_ents)
 
-        print '#ifndef _GLAPI_SKIP_NORMAL_ENTRY_POINTS'
+        print('#ifndef _GLAPI_SKIP_NORMAL_ENTRY_POINTS')
         for ent in normal_entries:
-            print '   TABLE_ENTRY(%s),' % (ent)
-        print '#endif /* _GLAPI_SKIP_NORMAL_ENTRY_POINTS */'
-        print '#ifndef _GLAPI_SKIP_PROTO_ENTRY_POINTS'
+            print('   TABLE_ENTRY(%s),' % (ent))
+        print('#endif /* _GLAPI_SKIP_NORMAL_ENTRY_POINTS */')
+        print('#ifndef _GLAPI_SKIP_PROTO_ENTRY_POINTS')
         for ent in proto_entries:
-            print '   TABLE_ENTRY(%s),' % (ent)
-        print '#endif /* _GLAPI_SKIP_PROTO_ENTRY_POINTS */'
+            print('   TABLE_ENTRY(%s),' % (ent))
+        print('#endif /* _GLAPI_SKIP_PROTO_ENTRY_POINTS */')
 
-        print '};'
-        print '#endif /*UNUSED_TABLE_NAME*/'
-        print ''
+        print('};')
+        print('#endif /*UNUSED_TABLE_NAME*/')
+        print('')
         return
 
 
@@ -278,23 +280,23 @@
             normal_entry_points.append((func, normal_ents))
             proto_entry_points.append((func, proto_ents))
 
-        print '#ifndef _GLAPI_SKIP_NORMAL_ENTRY_POINTS'
-        print ''
+        print('#ifndef _GLAPI_SKIP_NORMAL_ENTRY_POINTS')
+        print('')
         for func, ents in normal_entry_points:
             for ent in ents:
                 self.printFunction(func, ent)
-        print ''
-        print '#endif /* _GLAPI_SKIP_NORMAL_ENTRY_POINTS */'
-        print ''
-        print '/* these entry points might require different protocols */'
-        print '#ifndef _GLAPI_SKIP_PROTO_ENTRY_POINTS'
-        print ''
+        print('')
+        print('#endif /* _GLAPI_SKIP_NORMAL_ENTRY_POINTS */')
+        print('')
+        print('/* these entry points might require different protocols */')
+        print('#ifndef _GLAPI_SKIP_PROTO_ENTRY_POINTS')
+        print('')
         for func, ents in proto_entry_points:
             for ent in ents:
                 self.printFunction(func, ent)
-        print ''
-        print '#endif /* _GLAPI_SKIP_PROTO_ENTRY_POINTS */'
-        print ''
+        print('')
+        print('#endif /* _GLAPI_SKIP_PROTO_ENTRY_POINTS */')
+        print('')
 
         self.printInitDispatch(api)
         self.printAliasedTable(api)
diff --git a/src/mapi/glapi/gen/gl_enums.py b/src/mapi/glapi/gen/gl_enums.py
index 768a54a..00f8134 100644
--- a/src/mapi/glapi/gen/gl_enums.py
+++ b/src/mapi/glapi/gen/gl_enums.py
@@ -25,6 +25,8 @@
 # Authors:
 #    Zack Rusin <zack@kde.org>
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -48,20 +50,20 @@
 
 
     def printRealHeader(self):
-        print '#include "main/glheader.h"'
-        print '#include "main/enums.h"'
-        print '#include "main/imports.h"'
-        print '#include "main/mtypes.h"'
-        print ''
-        print 'typedef struct PACKED {'
-        print '   uint32_t offset;'
-        print '   int n;'
-        print '} enum_elt;'
-        print ''
+        print('#include "main/glheader.h"')
+        print('#include "main/enums.h"')
+        print('#include "main/imports.h"')
+        print('#include "main/mtypes.h"')
+        print('')
+        print('typedef struct PACKED {')
+        print('   uint32_t offset;')
+        print('   int n;')
+        print('} enum_elt;')
+        print('')
         return
 
     def print_code(self):
-        print """
+        print("""
 typedef int (*cfunc)(const void *, const void *);
 
 /**
@@ -144,7 +146,7 @@
 }
 
 
-"""
+""")
         return
 
 
@@ -154,37 +156,37 @@
         sorted_enum_values = sorted(self.enum_table.keys())
         string_offsets = {}
         i = 0;
-        print '#if defined(__GNUC__)'
-        print '# define LONGSTRING __extension__'
-        print '#else'
-        print '# define LONGSTRING'
-        print '#endif'
-        print ''
-        print 'LONGSTRING static const char enum_string_table[] = {'
+        print('#if defined(__GNUC__)')
+        print('# define LONGSTRING __extension__')
+        print('#else')
+        print('# define LONGSTRING')
+        print('#endif')
+        print('')
+        print('LONGSTRING static const char enum_string_table[] = {')
         # We express the very long concatenation of enum strings as an array
         # of characters rather than as a string literal to work-around MSVC's
         # 65535 character limit.
         for enum in sorted_enum_values:
             (name, pri) = self.enum_table[enum]
-            print "  ",
+            print("  ", end=' ')
             for ch in name:
-                print "'%c'," % ch,
-            print "'\\0',"
+                print("'%c'," % ch, end=' ')
+            print("'\\0',")
 
             string_offsets[ enum ] = i
             i += len(name) + 1
 
-        print '};'
-        print ''
+        print('};')
+        print('')
 
 
-        print 'static const enum_elt enum_string_table_offsets[%u] =' % (len(self.enum_table))
-        print '{'
+        print('static const enum_elt enum_string_table_offsets[%u] =' % (len(self.enum_table)))
+        print('{')
         for enum in sorted_enum_values:
             (name, pri) = self.enum_table[enum]
-            print '   { %5u, 0x%08X }, /* %s */' % (string_offsets[enum], enum, name)
-        print '};'
-        print ''
+            print('   { %5u, 0x%08X }, /* %s */' % (string_offsets[enum], enum, name))
+        print('};')
+        print('')
 
         self.print_code()
         return
@@ -240,7 +242,7 @@
             # confuse us.  GL_ACTIVE_PROGRAM_EXT is OK to lose because
             # we choose GL_ACTIVE PROGRAM instead.
             if name in self.string_to_int and name != "GL_ACTIVE_PROGRAM_EXT":
-                print "#error Renumbering {0} from {1} to {2}".format(name, self.string_to_int[name], value)
+                print("#error Renumbering {0} from {1} to {2}".format(name, self.string_to_int[name], value))
 
             self.string_to_int[name] = value
 
diff --git a/src/mapi/glapi/gen/gl_genexec.py b/src/mapi/glapi/gen/gl_genexec.py
index aaff9f2..fc5b10a 100644
--- a/src/mapi/glapi/gen/gl_genexec.py
+++ b/src/mapi/glapi/gen/gl_genexec.py
@@ -24,6 +24,8 @@
 # _mesa_initialize_exec_table().  It is responsible for populating all
 # entries in the "exec" dispatch table that aren't dynamic.
 
+from __future__ import print_function
+
 import argparse
 import collections
 import license
@@ -62,6 +64,7 @@
 #include "main/colortab.h"
 #include "main/compute.h"
 #include "main/condrender.h"
+#include "main/conservativeraster.h"
 #include "main/context.h"
 #include "main/convolve.h"
 #include "main/copyimage.h"
@@ -169,10 +172,10 @@
             'Intel Corporation')
 
     def printRealHeader(self):
-        print header
+        print(header)
 
     def printRealFooter(self):
-        print footer
+        print(footer)
 
     def printBody(self, api):
         # Collect SET_* calls by the condition under which they should
@@ -248,10 +251,10 @@
         # Print out an if statement for each unique condition, with
         # the SET_* calls nested inside it.
         for condition in sorted(settings_by_condition.keys()):
-            print '   if ({0}) {{'.format(condition)
+            print('   if ({0}) {{'.format(condition))
             for setting in sorted(settings_by_condition[condition]):
-                print '      {0}'.format(setting)
-            print '   }'
+                print('      {0}'.format(setting))
+            print('   }')
 
 
 def _parser():
diff --git a/src/mapi/glapi/gen/gl_gentable.py b/src/mapi/glapi/gen/gl_gentable.py
index 50153bb..9d8923c 100644
--- a/src/mapi/glapi/gen/gl_gentable.py
+++ b/src/mapi/glapi/gen/gl_gentable.py
@@ -29,6 +29,8 @@
 # Based on code ogiginally by:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -187,12 +189,12 @@
 
 
     def printRealHeader(self):
-        print header
+        print(header)
         return
 
 
     def printRealFooter(self):
-        print footer
+        print(footer)
         return
 
 
@@ -200,13 +202,13 @@
 
         # Determine how many functions have a defined offset.
         func_count = 0
-        for f in api.functions_by_name.itervalues():
+        for f in api.functions_by_name.values():
             if f.offset != -1:
                 func_count += 1
 
         # Build the mapping from offset to function name.
         funcnames = [None] * func_count
-        for f in api.functions_by_name.itervalues():
+        for f in api.functions_by_name.values():
             if f.offset != -1:
                 if not (funcnames[f.offset] is None):
                     raise Exception("Function table has more than one function with same offset (offset %d, func %s)" % (f.offset, f.name))
@@ -214,15 +216,15 @@
 
         # Check that the table has no gaps.  We expect a function at every offset,
         # and the code which generates the table relies on this.
-        for i in xrange(0, func_count):
+        for i in range(0, func_count):
             if funcnames[i] is None:
                 raise Exception("Function table has no function at offset %d" % (i))
 
-        print "#define GLAPI_TABLE_COUNT %d" % func_count
-        print "static const char * const _glapi_table_func_names[GLAPI_TABLE_COUNT] = {"
-        for i in xrange(0, func_count):
-            print "    /* %5d */ \"%s\"," % (i, funcnames[i])
-        print "};"
+        print("#define GLAPI_TABLE_COUNT %d" % func_count)
+        print("static const char * const _glapi_table_func_names[GLAPI_TABLE_COUNT] = {")
+        for i in range(0, func_count):
+            print("    /* %5d */ \"%s\"," % (i, funcnames[i]))
+        print("};")
 
         return
 
diff --git a/src/mapi/glapi/gen/gl_marshal.py b/src/mapi/glapi/gen/gl_marshal.py
index 6a2c0d7..5b35357 100644
--- a/src/mapi/glapi/gen/gl_marshal.py
+++ b/src/mapi/glapi/gen/gl_marshal.py
@@ -20,6 +20,8 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
+from __future__ import print_function
+
 import contextlib
 import getopt
 import gl_XML
@@ -42,9 +44,9 @@
 
 def out(str):
     if str:
-        print ' '*current_indent + str
+        print(' '*current_indent + str)
     else:
-        print ''
+        print('')
 
 
 @contextlib.contextmanager
@@ -64,15 +66,15 @@
             'Copyright (C) 2012 Intel Corporation', 'INTEL CORPORATION')
 
     def printRealHeader(self):
-        print header
-        print 'static inline int safe_mul(int a, int b)'
-        print '{'
-        print '    if (a < 0 || b < 0) return -1;'
-        print '    if (a == 0 || b == 0) return 0;'
-        print '    if (a > INT_MAX / b) return -1;'
-        print '    return a * b;'
-        print '}'
-        print
+        print(header)
+        print('static inline int safe_mul(int a, int b)')
+        print('{')
+        print('    if (a < 0 || b < 0) return -1;')
+        print('    if (a == 0 || b == 0) return 0;')
+        print('    if (a > INT_MAX / b) return -1;')
+        print('    return a * b;')
+        print('}')
+        print()
 
     def printRealFooter(self):
         pass
@@ -342,7 +344,7 @@
 
 
 def show_usage():
-    print 'Usage: %s [-f input_file_name]' % sys.argv[0]
+    print('Usage: %s [-f input_file_name]' % sys.argv[0])
     sys.exit(1)
 
 
@@ -351,7 +353,7 @@
 
     try:
         (args, trail) = getopt.getopt(sys.argv[1:], 'm:f:')
-    except Exception,e:
+    except Exception:
         show_usage()
 
     for (arg,val) in args:
diff --git a/src/mapi/glapi/gen/gl_marshal_h.py b/src/mapi/glapi/gen/gl_marshal_h.py
index 998ca59..a7a9eda 100644
--- a/src/mapi/glapi/gen/gl_marshal_h.py
+++ b/src/mapi/glapi/gen/gl_marshal_h.py
@@ -20,6 +20,8 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
+from __future__ import print_function
+
 import getopt
 import gl_XML
 import license
@@ -46,24 +48,24 @@
             'Copyright (C) 2012 Intel Corporation', 'INTEL CORPORATION')
 
     def printRealHeader(self):
-        print header
+        print(header)
 
     def printRealFooter(self):
-        print footer
+        print(footer)
 
     def printBody(self, api):
-        print 'enum marshal_dispatch_cmd_id'
-        print '{'
+        print('enum marshal_dispatch_cmd_id')
+        print('{')
         for func in api.functionIterateAll():
             flavor = func.marshal_flavor()
             if flavor in ('skip', 'sync'):
                 continue
-            print '   DISPATCH_CMD_{0},'.format(func.name)
-        print '};'
+            print('   DISPATCH_CMD_{0},'.format(func.name))
+        print('};')
 
 
 def show_usage():
-    print 'Usage: %s [-f input_file_name]' % sys.argv[0]
+    print('Usage: %s [-f input_file_name]' % sys.argv[0])
     sys.exit(1)
 
 
@@ -72,7 +74,7 @@
 
     try:
         (args, trail) = getopt.getopt(sys.argv[1:], 'm:f:')
-    except Exception,e:
+    except Exception:
         show_usage()
 
     for (arg,val) in args:
diff --git a/src/mapi/glapi/gen/gl_procs.py b/src/mapi/glapi/gen/gl_procs.py
index d9ea1ab..4bd3321 100644
--- a/src/mapi/glapi/gen/gl_procs.py
+++ b/src/mapi/glapi/gen/gl_procs.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -42,7 +44,7 @@
 (C) Copyright IBM Corporation 2004, 2006""", "BRIAN PAUL, IBM")
 
     def printRealHeader(self):
-        print """
+        print("""
 /* This file is only included by glapi.c and is used for
  * the GetProcAddress() function
  */
@@ -65,20 +67,20 @@
 #  define NAME_FUNC_OFFSET(n,f1,f2,f3,o) { n , (_glapi_proc) f3 , o }
 #endif
 
-"""
+""")
         return
 
     def printRealFooter(self):
-        print ''
-        print '#undef NAME_FUNC_OFFSET'
+        print('')
+        print('#undef NAME_FUNC_OFFSET')
         return
 
     def printFunctionString(self, name):
-        print '    "gl%s\\0"' % (name)
+        print('    "gl%s\\0"' % (name))
 
     def printBody(self, api):
-        print ''
-        print 'static const char gl_string_table[] ='
+        print('')
+        print('static const char gl_string_table[] =')
 
         base_offset = 0
         table = []
@@ -108,23 +110,23 @@
                     base_offset += len(n) + 3
 
 
-        print '    ;'
-        print ''
-        print ''
-        print "#ifdef USE_MGL_NAMESPACE"
+        print('    ;')
+        print('')
+        print('')
+        print("#ifdef USE_MGL_NAMESPACE")
         for func in api.functionIterateByOffset():
             for n in func.entry_points:
                 if (not func.is_static_entry_point(func.name)) or (func.has_different_protocol(n) and not func.is_static_entry_point(n)):
-                    print '#define gl_dispatch_stub_%u mgl_dispatch_stub_%u' % (func.offset, func.offset)
+                    print('#define gl_dispatch_stub_%u mgl_dispatch_stub_%u' % (func.offset, func.offset))
                     break
-        print "#endif /* USE_MGL_NAMESPACE */"
-        print ''
-        print ''
-        print '#if defined(NEED_FUNCTION_POINTER) || defined(GLX_INDIRECT_RENDERING)'
+        print("#endif /* USE_MGL_NAMESPACE */")
+        print('')
+        print('')
+        print('#if defined(NEED_FUNCTION_POINTER) || defined(GLX_INDIRECT_RENDERING)')
         for func in api.functionIterateByOffset():
             for n in func.entry_points:
                 if (not func.is_static_entry_point(func.name)) or (func.has_different_protocol(n) and not func.is_static_entry_point(n)):
-                    print '%s GLAPIENTRY gl_dispatch_stub_%u(%s);' % (func.return_type, func.offset, func.get_parameter_string())
+                    print('%s GLAPIENTRY gl_dispatch_stub_%u(%s);' % (func.return_type, func.offset, func.get_parameter_string()))
                     break
 
         if self.es:
@@ -133,32 +135,31 @@
                 for n in func.entry_points:
                     cat, num = api.get_category_for_name(n)
                     if (cat.startswith("es") or cat.startswith("GL_OES")):
-                        if not categories.has_key(cat):
+                        if cat not in categories:
                             categories[cat] = []
                         proto = 'GLAPI %s GLAPIENTRY %s(%s);' \
                                         % (func.return_type, "gl" + n, func.get_parameter_string(n))
                         categories[cat].append(proto)
             if categories:
-                print ''
-                print '/* OpenGL ES specific prototypes */'
-                print ''
-                keys = categories.keys()
-                keys.sort()
+                print('')
+                print('/* OpenGL ES specific prototypes */')
+                print('')
+                keys = sorted(categories.keys())
                 for key in keys:
-                    print '/* category %s */' % key
-                    print "\n".join(categories[key])
-                print ''
+                    print('/* category %s */' % key)
+                    print("\n".join(categories[key]))
+                print('')
 
-        print '#endif /* defined(NEED_FUNCTION_POINTER) || defined(GLX_INDIRECT_RENDERING) */'
+        print('#endif /* defined(NEED_FUNCTION_POINTER) || defined(GLX_INDIRECT_RENDERING) */')
 
-        print ''
-        print 'static const glprocs_table_t static_functions[] = {'
+        print('')
+        print('static const glprocs_table_t static_functions[] = {')
 
         for info in table:
-            print '    NAME_FUNC_OFFSET(%5u, %s, %s, %s, %d),' % info
+            print('    NAME_FUNC_OFFSET(%5u, %s, %s, %s, %d),' % info)
 
-        print '    NAME_FUNC_OFFSET(-1, NULL, NULL, NULL, 0)'
-        print '};'
+        print('    NAME_FUNC_OFFSET(-1, NULL, NULL, NULL, 0)')
+        print('};')
         return
 
 
diff --git a/src/mapi/glapi/gen/gl_table.py b/src/mapi/glapi/gen/gl_table.py
index 579efa8..4b89ef8 100644
--- a/src/mapi/glapi/gen/gl_table.py
+++ b/src/mapi/glapi/gen/gl_table.py
@@ -25,6 +25,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import gl_XML
@@ -45,33 +47,33 @@
     def printBody(self, api):
         for f in api.functionIterateByOffset():
             arg_string = f.get_parameter_string()
-            print '   %s (GLAPIENTRYP %s)(%s); /* %d */' % (
-                f.return_type, f.name, arg_string, f.offset)
+            print('   %s (GLAPIENTRYP %s)(%s); /* %d */' % (
+                f.return_type, f.name, arg_string, f.offset))
 
     def printRealHeader(self):
-        print '#ifndef GLAPIENTRYP'
-        print '# ifndef GLAPIENTRY'
-        print '#  define GLAPIENTRY'
-        print '# endif'
-        print ''
-        print '# define GLAPIENTRYP GLAPIENTRY *'
-        print '#endif'
-        print ''
-        print ''
-        print '#ifdef __cplusplus'
-        print 'extern "C" {'
-        print '#endif'
-        print ''
-        print 'struct _glapi_table'
-        print '{'
+        print('#ifndef GLAPIENTRYP')
+        print('# ifndef GLAPIENTRY')
+        print('#  define GLAPIENTRY')
+        print('# endif')
+        print('')
+        print('# define GLAPIENTRYP GLAPIENTRY *')
+        print('#endif')
+        print('')
+        print('')
+        print('#ifdef __cplusplus')
+        print('extern "C" {')
+        print('#endif')
+        print('')
+        print('struct _glapi_table')
+        print('{')
         return
 
     def printRealFooter(self):
-        print '};'
-        print ''
-        print '#ifdef __cplusplus'
-        print '}'
-        print '#endif'
+        print('};')
+        print('')
+        print('#ifdef __cplusplus')
+        print('}')
+        print('#endif')
         return
 
 
@@ -87,7 +89,7 @@
 
 
     def printRealHeader(self):
-        print """
+        print("""
 /**
  * \\file main/dispatch.h
  * Macros for handling GL dispatch tables.
@@ -98,27 +100,27 @@
  * can SET_FuncName, are used to get and set the dispatch pointer for the
  * named function in the specified dispatch table.
  */
-"""
+""")
         return
 
 
     def printBody(self, api):
-        print '#define CALL_by_offset(disp, cast, offset, parameters) \\'
-        print '    (*(cast (GET_by_offset(disp, offset)))) parameters'
-        print '#define GET_by_offset(disp, offset) \\'
-        print '    (offset >= 0) ? (((_glapi_proc *)(disp))[offset]) : NULL'
-        print '#define SET_by_offset(disp, offset, fn) \\'
-        print '    do { \\'
-        print '        if ( (offset) < 0 ) { \\'
-        print '            /* fprintf( stderr, "[%s:%u] SET_by_offset(%p, %d, %s)!\\n", */ \\'
-        print '            /*         __func__, __LINE__, disp, offset, # fn); */ \\'
-        print '            /* abort(); */ \\'
-        print '        } \\'
-        print '        else { \\'
-        print '            ( (_glapi_proc *) (disp) )[offset] = (_glapi_proc) fn; \\'
-        print '        } \\'
-        print '    } while(0)'
-        print ''
+        print('#define CALL_by_offset(disp, cast, offset, parameters) \\')
+        print('    (*(cast (GET_by_offset(disp, offset)))) parameters')
+        print('#define GET_by_offset(disp, offset) \\')
+        print('    (offset >= 0) ? (((_glapi_proc *)(disp))[offset]) : NULL')
+        print('#define SET_by_offset(disp, offset, fn) \\')
+        print('    do { \\')
+        print('        if ( (offset) < 0 ) { \\')
+        print('            /* fprintf( stderr, "[%s:%u] SET_by_offset(%p, %d, %s)!\\n", */ \\')
+        print('            /*         __func__, __LINE__, disp, offset, # fn); */ \\')
+        print('            /* abort(); */ \\')
+        print('        } \\')
+        print('        else { \\')
+        print('            ( (_glapi_proc *) (disp) )[offset] = (_glapi_proc) fn; \\')
+        print('        } \\')
+        print('    } while(0)')
+        print('')
 
         functions = []
         abi_functions = []
@@ -130,43 +132,43 @@
             else:
                 abi_functions.append([f, -1])
 
-        print '/* total number of offsets below */'
-        print '#define _gloffset_COUNT %d' % (len(abi_functions + functions))
-        print ''
+        print('/* total number of offsets below */')
+        print('#define _gloffset_COUNT %d' % (len(abi_functions + functions)))
+        print('')
 
         for f, index in abi_functions:
-            print '#define _gloffset_%s %d' % (f.name, f.offset)
+            print('#define _gloffset_%s %d' % (f.name, f.offset))
 
         remap_table = "driDispatchRemapTable"
 
-        print '#define %s_size %u' % (remap_table, count)
-        print 'extern int %s[ %s_size ];' % (remap_table, remap_table)
-        print ''
+        print('#define %s_size %u' % (remap_table, count))
+        print('extern int %s[ %s_size ];' % (remap_table, remap_table))
+        print('')
 
         for f, index in functions:
-            print '#define %s_remap_index %u' % (f.name, index)
+            print('#define %s_remap_index %u' % (f.name, index))
 
-        print ''
+        print('')
 
         for f, index in functions:
-            print '#define _gloffset_%s %s[%s_remap_index]' % (f.name, remap_table, f.name)
+            print('#define _gloffset_%s %s[%s_remap_index]' % (f.name, remap_table, f.name))
 
-        print ''
+        print('')
 
         for f, index in abi_functions + functions:
             arg_string = gl_XML.create_parameter_string(f.parameters, 0)
 
-            print 'typedef %s (GLAPIENTRYP _glptr_%s)(%s);' % (f.return_type, f.name, arg_string)
-            print '#define CALL_%s(disp, parameters) \\' % (f.name)
-            print '    (* GET_%s(disp)) parameters' % (f.name)
-            print 'static inline _glptr_%s GET_%s(struct _glapi_table *disp) {' % (f.name, f.name)
-            print '   return (_glptr_%s) (GET_by_offset(disp, _gloffset_%s));' % (f.name, f.name)
-            print '}'
-            print
-            print 'static inline void SET_%s(struct _glapi_table *disp, %s (GLAPIENTRYP fn)(%s)) {' % (f.name, f.return_type, arg_string)
-            print '   SET_by_offset(disp, _gloffset_%s, fn);' % (f.name)
-            print '}'
-            print
+            print('typedef %s (GLAPIENTRYP _glptr_%s)(%s);' % (f.return_type, f.name, arg_string))
+            print('#define CALL_%s(disp, parameters) \\' % (f.name))
+            print('    (* GET_%s(disp)) parameters' % (f.name))
+            print('static inline _glptr_%s GET_%s(struct _glapi_table *disp) {' % (f.name, f.name))
+            print('   return (_glptr_%s) (GET_by_offset(disp, _gloffset_%s));' % (f.name, f.name))
+            print('}')
+            print()
+            print('static inline void SET_%s(struct _glapi_table *disp, %s (GLAPIENTRYP fn)(%s)) {' % (f.name, f.return_type, arg_string))
+            print('   SET_by_offset(disp, _gloffset_%s, fn);' % (f.name))
+            print('}')
+            print()
 
         return
 
diff --git a/src/mapi/glapi/gen/gl_x86-64_asm.py b/src/mapi/glapi/gen/gl_x86-64_asm.py
index cde80ec..36d3ecd 100644
--- a/src/mapi/glapi/gen/gl_x86-64_asm.py
+++ b/src/mapi/glapi/gen/gl_x86-64_asm.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 import copy
 
@@ -54,7 +56,7 @@
     adjust_stack = 0
     if not should_use_push(registers):
         adjust_stack = local_size(registers)
-        print '\tsubq\t$%u, %%rsp' % (adjust_stack)
+        print('\tsubq\t$%u, %%rsp' % (adjust_stack))
 
     for [reg, stack_offset] in registers:
         save_reg( reg, stack_offset, adjust_stack )
@@ -72,18 +74,18 @@
         restore_reg(reg, stack_offset, adjust_stack)
 
     if adjust_stack:
-        print '\taddq\t$%u, %%rsp' % (adjust_stack)
+        print('\taddq\t$%u, %%rsp' % (adjust_stack))
     return
 
 
 def save_reg(reg, offset, use_move):
     if use_move:
         if offset == 0:
-            print '\tmovq\t%s, (%%rsp)' % (reg)
+            print('\tmovq\t%s, (%%rsp)' % (reg))
         else:
-            print '\tmovq\t%s, %u(%%rsp)' % (reg, offset)
+            print('\tmovq\t%s, %u(%%rsp)' % (reg, offset))
     else:
-        print '\tpushq\t%s' % (reg)
+        print('\tpushq\t%s' % (reg))
 
     return
 
@@ -91,11 +93,11 @@
 def restore_reg(reg, offset, use_move):
     if use_move:
         if offset == 0:
-            print '\tmovq\t(%%rsp), %s' % (reg)
+            print('\tmovq\t(%%rsp), %s' % (reg))
         else:
-            print '\tmovq\t%u(%%rsp), %s' % (offset, reg)
+            print('\tmovq\t%u(%%rsp), %s' % (offset, reg))
     else:
-        print '\tpopq\t%s' % (reg)
+        print('\tpopq\t%s' % (reg))
 
     return
 
@@ -119,62 +121,62 @@
 
 
     def printRealHeader(self):
-        print "/* If we build with gcc's -fvisibility=hidden flag, we'll need to change"
-        print " * the symbol visibility mode to 'default'."
-        print ' */'
-        print ''
-        print '#include "x86/assyntax.h"'
-        print ''
-        print '#ifdef __GNUC__'
-        print '#  pragma GCC visibility push(default)'
-        print '#  define HIDDEN(x) .hidden x'
-        print '#else'
-        print '#  define HIDDEN(x)'
-        print '#endif'
-        print ''
-        print '# if defined(USE_MGL_NAMESPACE)'
-        print '#  define GL_PREFIX(n) GLNAME(CONCAT(mgl,n))'
-        print '#  define _glapi_Dispatch _mglapi_Dispatch'
-        print '# else'
-        print '#  define GL_PREFIX(n) GLNAME(CONCAT(gl,n))'
-        print '# endif'
-        print ''
-        print '\t.text'
-        print ''
-        print '#ifdef GLX_USE_TLS'
-        print ''
-        print '_x86_64_get_dispatch:'
-        print '\tmovq\t_glapi_tls_Dispatch@GOTTPOFF(%rip), %rax'
-        print '\tmovq\t%fs:(%rax), %rax'
-        print '\tret'
-        print '\t.size\t_x86_64_get_dispatch, .-_x86_64_get_dispatch'
-        print ''
-        print '#elif defined(HAVE_PTHREAD)'
-        print ''
-        print '\t.extern\t_glapi_Dispatch'
-        print '\t.extern\t_gl_DispatchTSD'
-        print '\t.extern\tpthread_getspecific'
-        print ''
-        print '\t.p2align\t4,,15'
-        print '_x86_64_get_dispatch:'
-        print '\tmovq\t_gl_DispatchTSD@GOTPCREL(%rip), %rax'
-        print '\tmovl\t(%rax), %edi'
-        print '\tjmp\tpthread_getspecific@PLT'
-        print ''
-        print '#else'
-        print ''
-        print '\t.extern\t_glapi_get_dispatch'
-        print ''
-        print '#endif'
-        print ''
+        print("/* If we build with gcc's -fvisibility=hidden flag, we'll need to change")
+        print(" * the symbol visibility mode to 'default'.")
+        print(' */')
+        print('')
+        print('#include "x86/assyntax.h"')
+        print('')
+        print('#ifdef __GNUC__')
+        print('#  pragma GCC visibility push(default)')
+        print('#  define HIDDEN(x) .hidden x')
+        print('#else')
+        print('#  define HIDDEN(x)')
+        print('#endif')
+        print('')
+        print('# if defined(USE_MGL_NAMESPACE)')
+        print('#  define GL_PREFIX(n) GLNAME(CONCAT(mgl,n))')
+        print('#  define _glapi_Dispatch _mglapi_Dispatch')
+        print('# else')
+        print('#  define GL_PREFIX(n) GLNAME(CONCAT(gl,n))')
+        print('# endif')
+        print('')
+        print('\t.text')
+        print('')
+        print('#ifdef GLX_USE_TLS')
+        print('')
+        print('_x86_64_get_dispatch:')
+        print('\tmovq\t_glapi_tls_Dispatch@GOTTPOFF(%rip), %rax')
+        print('\tmovq\t%fs:(%rax), %rax')
+        print('\tret')
+        print('\t.size\t_x86_64_get_dispatch, .-_x86_64_get_dispatch')
+        print('')
+        print('#elif defined(HAVE_PTHREAD)')
+        print('')
+        print('\t.extern\t_glapi_Dispatch')
+        print('\t.extern\t_gl_DispatchTSD')
+        print('\t.extern\tpthread_getspecific')
+        print('')
+        print('\t.p2align\t4,,15')
+        print('_x86_64_get_dispatch:')
+        print('\tmovq\t_gl_DispatchTSD@GOTPCREL(%rip), %rax')
+        print('\tmovl\t(%rax), %edi')
+        print('\tjmp\tpthread_getspecific@PLT')
+        print('')
+        print('#else')
+        print('')
+        print('\t.extern\t_glapi_get_dispatch')
+        print('')
+        print('#endif')
+        print('')
         return
 
 
     def printRealFooter(self):
-        print ''
-        print '#if defined (__ELF__) && defined (__linux__)'
-        print '	.section .note.GNU-stack,"",%progbits'
-        print '#endif'
+        print('')
+        print('#if defined (__ELF__) && defined (__linux__)')
+        print('	.section .note.GNU-stack,"",%progbits')
+        print('#endif')
         return
 
 
@@ -219,47 +221,47 @@
 
         name = f.dispatch_name()
 
-        print '\t.p2align\t4,,15'
-        print '\t.globl\tGL_PREFIX(%s)' % (name)
-        print '\t.type\tGL_PREFIX(%s), @function' % (name)
+        print('\t.p2align\t4,,15')
+        print('\t.globl\tGL_PREFIX(%s)' % (name))
+        print('\t.type\tGL_PREFIX(%s), @function' % (name))
         if not f.is_static_entry_point(f.name):
-            print '\tHIDDEN(GL_PREFIX(%s))' % (name)
-        print 'GL_PREFIX(%s):' % (name)
-        print '#if defined(GLX_USE_TLS)'
-        print '\tcall\t_x86_64_get_dispatch@PLT'
-        print '\tmovq\t%u(%%rax), %%r11' % (f.offset * 8)
-        print '\tjmp\t*%r11'
-        print '#elif defined(HAVE_PTHREAD)'
+            print('\tHIDDEN(GL_PREFIX(%s))' % (name))
+        print('GL_PREFIX(%s):' % (name))
+        print('#if defined(GLX_USE_TLS)')
+        print('\tcall\t_x86_64_get_dispatch@PLT')
+        print('\tmovq\t%u(%%rax), %%r11' % (f.offset * 8))
+        print('\tjmp\t*%r11')
+        print('#elif defined(HAVE_PTHREAD)')
 
         save_all_regs(registers)
-        print '\tcall\t_x86_64_get_dispatch@PLT'
+        print('\tcall\t_x86_64_get_dispatch@PLT')
         restore_all_regs(registers)
 
         if f.offset == 0:
-            print '\tmovq\t(%rax), %r11'
+            print('\tmovq\t(%rax), %r11')
         else:
-            print '\tmovq\t%u(%%rax), %%r11' % (f.offset * 8)
+            print('\tmovq\t%u(%%rax), %%r11' % (f.offset * 8))
 
-        print '\tjmp\t*%r11'
+        print('\tjmp\t*%r11')
 
-        print '#else'
-        print '\tmovq\t_glapi_Dispatch(%rip), %rax'
-        print '\ttestq\t%rax, %rax'
-        print '\tje\t1f'
-        print '\tmovq\t%u(%%rax), %%r11' % (f.offset * 8)
-        print '\tjmp\t*%r11'
-        print '1:'
+        print('#else')
+        print('\tmovq\t_glapi_Dispatch(%rip), %rax')
+        print('\ttestq\t%rax, %rax')
+        print('\tje\t1f')
+        print('\tmovq\t%u(%%rax), %%r11' % (f.offset * 8))
+        print('\tjmp\t*%r11')
+        print('1:')
 
         save_all_regs(registers)
-        print '\tcall\t_glapi_get_dispatch'
+        print('\tcall\t_glapi_get_dispatch')
         restore_all_regs(registers)
 
-        print '\tmovq\t%u(%%rax), %%r11' % (f.offset * 8)
-        print '\tjmp\t*%r11'
-        print '#endif /* defined(GLX_USE_TLS) */'
+        print('\tmovq\t%u(%%rax), %%r11' % (f.offset * 8))
+        print('\tjmp\t*%r11')
+        print('#endif /* defined(GLX_USE_TLS) */')
 
-        print '\t.size\tGL_PREFIX(%s), .-GL_PREFIX(%s)' % (name, name)
-        print ''
+        print('\t.size\tGL_PREFIX(%s), .-GL_PREFIX(%s)' % (name, name))
+        print('')
         return
 
 
@@ -276,11 +278,11 @@
                         text = '\t.globl GL_PREFIX(%s) ; .set GL_PREFIX(%s), GL_PREFIX(%s)' % (n, n, dispatch)
 
                         if f.has_different_protocol(n):
-                            print '#ifndef GLX_INDIRECT_RENDERING'
-                            print text
-                            print '#endif'
+                            print('#ifndef GLX_INDIRECT_RENDERING')
+                            print(text)
+                            print('#endif')
                         else:
-                            print text
+                            print(text)
 
         return
 
diff --git a/src/mapi/glapi/gen/gl_x86_asm.py b/src/mapi/glapi/gen/gl_x86_asm.py
index 24c15a7..ada849f 100644
--- a/src/mapi/glapi/gen/gl_x86_asm.py
+++ b/src/mapi/glapi/gen/gl_x86_asm.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -53,135 +55,135 @@
 
 
     def printRealHeader(self):
-        print '#include "x86/assyntax.h"'
-        print ''
-        print '#if defined(STDCALL_API)'
-        print '# if defined(USE_MGL_NAMESPACE)'
-        print '#  define GL_PREFIX(n,n2) GLNAME(CONCAT(mgl,n2))'
-        print '# else'
-        print '#  define GL_PREFIX(n,n2) GLNAME(CONCAT(gl,n2))'
-        print '# endif'
-        print '#else'
-        print '# if defined(USE_MGL_NAMESPACE)'
-        print '#  define GL_PREFIX(n,n2) GLNAME(CONCAT(mgl,n))'
-        print '#  define _glapi_Dispatch _mglapi_Dispatch'
-        print '# else'
-        print '#  define GL_PREFIX(n,n2) GLNAME(CONCAT(gl,n))'
-        print '# endif'
-        print '#endif'
-        print ''
-        print '#define GL_OFFSET(x) CODEPTR(REGOFF(4 * x, EAX))'
-        print ''
-        print '#if defined(GNU_ASSEMBLER) && !defined(__MINGW32__) && !defined(__APPLE__)'
-        print '#define GLOBL_FN(x) GLOBL x ; .type x, @function'
-        print '#else'
-        print '#define GLOBL_FN(x) GLOBL x'
-        print '#endif'
-        print ''
-        print ''
-        print '#ifdef GLX_USE_TLS'
-        print ''
-        print '#ifdef GLX_X86_READONLY_TEXT'
-        print '# define CTX_INSNS MOV_L(GS:(EAX), EAX)'
-        print '#else'
-        print '# define CTX_INSNS NOP /* Pad for init_glapi_relocs() */'
-        print '#endif'
-        print ''
-        print '#  define GL_STUB(fn,off,fn_alt)\t\t\t\\'
-        print 'ALIGNTEXT16;\t\t\t\t\t\t\\'
-        print 'GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\'
-        print 'GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\'
-        print '\tCALL(_x86_get_dispatch) ;\t\t\t\\'
-        print '\tCTX_INSNS ;					\\'
-        print '\tJMP(GL_OFFSET(off))'
-        print ''
-        print '#elif defined(HAVE_PTHREAD)'
-        print '#  define GL_STUB(fn,off,fn_alt)\t\t\t\\'
-        print 'ALIGNTEXT16;\t\t\t\t\t\t\\'
-        print 'GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\'
-        print 'GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\'
-        print '\tMOV_L(CONTENT(GLNAME(_glapi_Dispatch)), EAX) ;\t\\'
-        print '\tTEST_L(EAX, EAX) ;\t\t\t\t\\'
-        print '\tJE(1f) ;\t\t\t\t\t\\'
-        print '\tJMP(GL_OFFSET(off)) ;\t\t\t\t\\'
-        print '1:\tCALL(_x86_get_dispatch) ;\t\t\t\\'
-        print '\tJMP(GL_OFFSET(off))'
-        print '#else'
-        print '#  define GL_STUB(fn,off,fn_alt)\t\t\t\\'
-        print 'ALIGNTEXT16;\t\t\t\t\t\t\\'
-        print 'GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\'
-        print 'GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\'
-        print '\tMOV_L(CONTENT(GLNAME(_glapi_Dispatch)), EAX) ;\t\\'
-        print '\tTEST_L(EAX, EAX) ;\t\t\t\t\\'
-        print '\tJE(1f) ;\t\t\t\t\t\\'
-        print '\tJMP(GL_OFFSET(off)) ;\t\t\t\t\\'
-        print '1:\tCALL(_glapi_get_dispatch) ;\t\t\t\\'
-        print '\tJMP(GL_OFFSET(off))'
-        print '#endif'
-        print ''
-        print '#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS'
-        print '#  define GL_STUB_ALIAS(fn,off,fn_alt,alias,alias_alt)\t\\'
-        print '\t.globl\tGL_PREFIX(fn, fn_alt) ;\t\t\t\\'
-        print '\t.set\tGL_PREFIX(fn, fn_alt), GL_PREFIX(alias, alias_alt)'
-        print '#else'
-        print '#  define GL_STUB_ALIAS(fn,off,fn_alt,alias,alias_alt)\t\\'
-        print '    GL_STUB(fn, off, fn_alt)'
-        print '#endif'
-        print ''
-        print 'SEG_TEXT'
-        print ''
-        print '#ifdef GLX_USE_TLS'
-        print ''
-        print '\tGLOBL\tGLNAME(_x86_get_dispatch)'
-        print '\tHIDDEN(GLNAME(_x86_get_dispatch))'
-        print 'ALIGNTEXT16'
-        print 'GLNAME(_x86_get_dispatch):'
-        print '\tcall	1f'
-        print '1:\tpopl	%eax'
-        print '\taddl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %eax'
-        print '\tmovl	_glapi_tls_Dispatch@GOTNTPOFF(%eax), %eax'
-        print '\tret'
-        print ''
-        print '#elif defined(HAVE_PTHREAD)'
-        print 'EXTERN GLNAME(_glapi_Dispatch)'
-        print 'EXTERN GLNAME(_gl_DispatchTSD)'
-        print 'EXTERN GLNAME(pthread_getspecific)'
-        print ''
-        print 'ALIGNTEXT16'
-        print 'GLNAME(_x86_get_dispatch):'
-        print '\tSUB_L(CONST(24), ESP)'
-        print '\tPUSH_L(GLNAME(_gl_DispatchTSD))'
-        print '\tCALL(GLNAME(pthread_getspecific))'
-        print '\tADD_L(CONST(28), ESP)'
-        print '\tRET'
-        print '#else'
-        print 'EXTERN GLNAME(_glapi_get_dispatch)'
-        print '#endif'
-        print ''
+        print('#include "x86/assyntax.h"')
+        print('')
+        print('#if defined(STDCALL_API)')
+        print('# if defined(USE_MGL_NAMESPACE)')
+        print('#  define GL_PREFIX(n,n2) GLNAME(CONCAT(mgl,n2))')
+        print('# else')
+        print('#  define GL_PREFIX(n,n2) GLNAME(CONCAT(gl,n2))')
+        print('# endif')
+        print('#else')
+        print('# if defined(USE_MGL_NAMESPACE)')
+        print('#  define GL_PREFIX(n,n2) GLNAME(CONCAT(mgl,n))')
+        print('#  define _glapi_Dispatch _mglapi_Dispatch')
+        print('# else')
+        print('#  define GL_PREFIX(n,n2) GLNAME(CONCAT(gl,n))')
+        print('# endif')
+        print('#endif')
+        print('')
+        print('#define GL_OFFSET(x) CODEPTR(REGOFF(4 * x, EAX))')
+        print('')
+        print('#if defined(GNU_ASSEMBLER) && !defined(__MINGW32__) && !defined(__APPLE__)')
+        print('#define GLOBL_FN(x) GLOBL x ; .type x, @function')
+        print('#else')
+        print('#define GLOBL_FN(x) GLOBL x')
+        print('#endif')
+        print('')
+        print('')
+        print('#ifdef GLX_USE_TLS')
+        print('')
+        print('#ifdef GLX_X86_READONLY_TEXT')
+        print('# define CTX_INSNS MOV_L(GS:(EAX), EAX)')
+        print('#else')
+        print('# define CTX_INSNS NOP /* Pad for init_glapi_relocs() */')
+        print('#endif')
+        print('')
+        print('#  define GL_STUB(fn,off,fn_alt)\t\t\t\\')
+        print('ALIGNTEXT16;\t\t\t\t\t\t\\')
+        print('GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\')
+        print('GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\')
+        print('\tCALL(_x86_get_dispatch) ;\t\t\t\\')
+        print('\tCTX_INSNS ;					\\')
+        print('\tJMP(GL_OFFSET(off))')
+        print('')
+        print('#elif defined(HAVE_PTHREAD)')
+        print('#  define GL_STUB(fn,off,fn_alt)\t\t\t\\')
+        print('ALIGNTEXT16;\t\t\t\t\t\t\\')
+        print('GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\')
+        print('GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\')
+        print('\tMOV_L(CONTENT(GLNAME(_glapi_Dispatch)), EAX) ;\t\\')
+        print('\tTEST_L(EAX, EAX) ;\t\t\t\t\\')
+        print('\tJE(1f) ;\t\t\t\t\t\\')
+        print('\tJMP(GL_OFFSET(off)) ;\t\t\t\t\\')
+        print('1:\tCALL(_x86_get_dispatch) ;\t\t\t\\')
+        print('\tJMP(GL_OFFSET(off))')
+        print('#else')
+        print('#  define GL_STUB(fn,off,fn_alt)\t\t\t\\')
+        print('ALIGNTEXT16;\t\t\t\t\t\t\\')
+        print('GLOBL_FN(GL_PREFIX(fn, fn_alt));\t\t\t\\')
+        print('GL_PREFIX(fn, fn_alt):\t\t\t\t\t\\')
+        print('\tMOV_L(CONTENT(GLNAME(_glapi_Dispatch)), EAX) ;\t\\')
+        print('\tTEST_L(EAX, EAX) ;\t\t\t\t\\')
+        print('\tJE(1f) ;\t\t\t\t\t\\')
+        print('\tJMP(GL_OFFSET(off)) ;\t\t\t\t\\')
+        print('1:\tCALL(_glapi_get_dispatch) ;\t\t\t\\')
+        print('\tJMP(GL_OFFSET(off))')
+        print('#endif')
+        print('')
+        print('#ifdef HAVE_FUNC_ATTRIBUTE_ALIAS')
+        print('#  define GL_STUB_ALIAS(fn,off,fn_alt,alias,alias_alt)\t\\')
+        print('\t.globl\tGL_PREFIX(fn, fn_alt) ;\t\t\t\\')
+        print('\t.set\tGL_PREFIX(fn, fn_alt), GL_PREFIX(alias, alias_alt)')
+        print('#else')
+        print('#  define GL_STUB_ALIAS(fn,off,fn_alt,alias,alias_alt)\t\\')
+        print('    GL_STUB(fn, off, fn_alt)')
+        print('#endif')
+        print('')
+        print('SEG_TEXT')
+        print('')
+        print('#ifdef GLX_USE_TLS')
+        print('')
+        print('\tGLOBL\tGLNAME(_x86_get_dispatch)')
+        print('\tHIDDEN(GLNAME(_x86_get_dispatch))')
+        print('ALIGNTEXT16')
+        print('GLNAME(_x86_get_dispatch):')
+        print('\tcall	1f')
+        print('1:\tpopl	%eax')
+        print('\taddl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %eax')
+        print('\tmovl	_glapi_tls_Dispatch@GOTNTPOFF(%eax), %eax')
+        print('\tret')
+        print('')
+        print('#elif defined(HAVE_PTHREAD)')
+        print('EXTERN GLNAME(_glapi_Dispatch)')
+        print('EXTERN GLNAME(_gl_DispatchTSD)')
+        print('EXTERN GLNAME(pthread_getspecific)')
+        print('')
+        print('ALIGNTEXT16')
+        print('GLNAME(_x86_get_dispatch):')
+        print('\tSUB_L(CONST(24), ESP)')
+        print('\tPUSH_L(GLNAME(_gl_DispatchTSD))')
+        print('\tCALL(GLNAME(pthread_getspecific))')
+        print('\tADD_L(CONST(28), ESP)')
+        print('\tRET')
+        print('#else')
+        print('EXTERN GLNAME(_glapi_get_dispatch)')
+        print('#endif')
+        print('')
 
-        print '#if defined( GLX_USE_TLS ) && !defined( GLX_X86_READONLY_TEXT )'
-        print '\t\t.section\twtext, "awx", @progbits'
-        print '#endif /* defined( GLX_USE_TLS ) */'
+        print('#if defined( GLX_USE_TLS ) && !defined( GLX_X86_READONLY_TEXT )')
+        print('\t\t.section\twtext, "awx", @progbits')
+        print('#endif /* defined( GLX_USE_TLS ) */')
 
-        print ''
-        print '\t\tALIGNTEXT16'
-        print '\t\tGLOBL GLNAME(gl_dispatch_functions_start)'
-        print '\t\tHIDDEN(GLNAME(gl_dispatch_functions_start))'
-        print 'GLNAME(gl_dispatch_functions_start):'
-        print ''
+        print('')
+        print('\t\tALIGNTEXT16')
+        print('\t\tGLOBL GLNAME(gl_dispatch_functions_start)')
+        print('\t\tHIDDEN(GLNAME(gl_dispatch_functions_start))')
+        print('GLNAME(gl_dispatch_functions_start):')
+        print('')
         return
 
 
     def printRealFooter(self):
-        print ''
-        print '\t\tGLOBL\tGLNAME(gl_dispatch_functions_end)'
-        print '\t\tHIDDEN(GLNAME(gl_dispatch_functions_end))'
-        print '\t\tALIGNTEXT16'
-        print 'GLNAME(gl_dispatch_functions_end):'
-        print ''
-        print '#if defined (__ELF__) && defined (__linux__)'
-        print '	.section .note.GNU-stack,"",%progbits'
-        print '#endif'
+        print('')
+        print('\t\tGLOBL\tGLNAME(gl_dispatch_functions_end)')
+        print('\t\tHIDDEN(GLNAME(gl_dispatch_functions_end))')
+        print('\t\tALIGNTEXT16')
+        print('GLNAME(gl_dispatch_functions_end):')
+        print('')
+        print('#if defined (__ELF__) && defined (__linux__)')
+        print('	.section .note.GNU-stack,"",%progbits')
+        print('#endif')
         return
 
 
@@ -191,10 +193,10 @@
             stack = self.get_stack_size(f)
             alt = "%s@%u" % (name, stack)
 
-            print '\tGL_STUB(%s, %d, %s)' % (name, f.offset, alt)
+            print('\tGL_STUB(%s, %d, %s)' % (name, f.offset, alt))
 
             if not f.is_static_entry_point(f.name):
-                print '\tHIDDEN(GL_PREFIX(%s, %s))' % (name, alt)
+                print('\tHIDDEN(GL_PREFIX(%s, %s))' % (name, alt))
 
 
         for f in api.functionIterateByOffset():
@@ -209,11 +211,11 @@
                         text = '\tGL_STUB_ALIAS(%s, %d, %s, %s, %s)' % (n, f.offset, alt2, name, alt)
 
                         if f.has_different_protocol(n):
-                            print '#ifndef GLX_INDIRECT_RENDERING'
-                            print text
-                            print '#endif'
+                            print('#ifndef GLX_INDIRECT_RENDERING')
+                            print(text)
+                            print('#endif')
                         else:
-                            print text
+                            print(text)
 
         return
 
diff --git a/src/mapi/glapi/gen/remap_helper.py b/src/mapi/glapi/gen/remap_helper.py
index de759d6..0740b18 100644
--- a/src/mapi/glapi/gen/remap_helper.py
+++ b/src/mapi/glapi/gen/remap_helper.py
@@ -23,6 +23,8 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
+from __future__ import print_function
+
 import argparse
 
 import license
@@ -66,23 +68,23 @@
 
 
     def printRealHeader(self):
-        print '#include "main/dispatch.h"'
-        print '#include "main/remap.h"'
-        print ''
+        print('#include "main/dispatch.h"')
+        print('#include "main/remap.h"')
+        print('')
         return
 
 
     def printBody(self, api):
         pool_indices = {}
 
-        print '/* this is internal to remap.c */'
-        print '#ifndef need_MESA_remap_table'
-        print '#error Only remap.c should include this file!'
-        print '#endif /* need_MESA_remap_table */'
-        print ''
+        print('/* this is internal to remap.c */')
+        print('#ifndef need_MESA_remap_table')
+        print('#error Only remap.c should include this file!')
+        print('#endif /* need_MESA_remap_table */')
+        print('')
 
-        print ''
-        print 'static const char _mesa_function_pool[] ='
+        print('')
+        print('static const char _mesa_function_pool[] =')
 
         # output string pool
         index = 0;
@@ -100,26 +102,26 @@
             else:
                 comments = "dynamic"
 
-            print '   /* _mesa_function_pool[%d]: %s (%s) */' \
-                            % (index, f.name, comments)
+            print('   /* _mesa_function_pool[%d]: %s (%s) */' \
+                            % (index, f.name, comments))
             for line in spec:
-                print '   "%s\\0"' % line
+                print('   "%s\\0"' % line)
                 index += len(line) + 1
-        print '   ;'
-        print ''
+        print('   ;')
+        print('')
 
-        print '/* these functions need to be remapped */'
-        print 'static const struct gl_function_pool_remap MESA_remap_table_functions[] = {'
+        print('/* these functions need to be remapped */')
+        print('static const struct gl_function_pool_remap MESA_remap_table_functions[] = {')
         # output all functions that need to be remapped
         # iterate by offsets so that they are sorted by remap indices
         for f in api.functionIterateByOffset():
             if not f.assign_offset:
                 continue
-            print '   { %5d, %s_remap_index },' \
-                            % (pool_indices[f], f.name)
-        print '   {    -1, -1 }'
-        print '};'
-        print ''
+            print('   { %5d, %s_remap_index },' \
+                            % (pool_indices[f], f.name))
+        print('   {    -1, -1 }')
+        print('};')
+        print('')
         return
 
 
diff --git a/src/mapi/glapi/gen/typeexpr.py b/src/mapi/glapi/gen/typeexpr.py
index 6da85c2..1f710ea 100644
--- a/src/mapi/glapi/gen/typeexpr.py
+++ b/src/mapi/glapi/gen/typeexpr.py
@@ -24,7 +24,9 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
-import string, copy
+from __future__ import print_function
+
+import copy
 
 class type_node(object):
     def __init__(self):
@@ -124,7 +126,7 @@
 
         # Replace '*' with ' * ' in type_string.  Then, split the string
         # into tokens, separated by spaces.
-        tokens = string.split( string.replace( type_string, "*", " * " ) )
+        tokens = type_string.replace("*", " * ").split()
 
         const = 0
         t = None
@@ -286,6 +288,6 @@
     create_initial_types()
 
     for t in types_to_try:
-        print 'Trying "%s"...' % (t)
+        print('Trying "%s"...' % (t))
         te = type_expression( t )
-        print 'Got "%s" (%u, %u).' % (te.string(), te.get_stack_size(), te.get_element_size())
+        print('Got "%s" (%u, %u).' % (te.string(), te.get_stack_size(), te.get_element_size()))
diff --git a/src/mapi/glapi/meson.build b/src/mapi/glapi/meson.build
index e241d9e..2509e19 100644
--- a/src/mapi/glapi/meson.build
+++ b/src/mapi/glapi/meson.build
@@ -64,8 +64,9 @@
     static_glapi_files += glapi_x86_s
   elif with_asm_arch == 'x86_64'
     static_glapi_files += glapi_x86_64_s
+  elif with_asm_arch == 'sparc'
+    static_glapi_files += glapi_sparc_s
   endif
-  # TODO: SPARC asm
 endif
 
 libglapi_static = static_library(
diff --git a/src/mapi/glapi/registry/gl.xml b/src/mapi/glapi/registry/gl.xml
index 833478a..13882ef 100644
--- a/src/mapi/glapi/registry/gl.xml
+++ b/src/mapi/glapi/registry/gl.xml
@@ -6568,6 +6568,7 @@
         <enum value="0x8BB5" name="GL_VERTEX_PROGRAM_CALLBACK_MESA"/>
         <enum value="0x8BB6" name="GL_VERTEX_PROGRAM_CALLBACK_FUNC_MESA"/>
         <enum value="0x8BB7" name="GL_VERTEX_PROGRAM_CALLBACK_DATA_MESA"/>
+        <enum value="0x8BBB" name="GL_FRAMEBUFFER_FLIP_Y_MESA"/>
     </enums>
 
     <enums namespace="GL" start="0x8BC0" end="0x8BFF" vendor="QCOM" comment="Reassigned from AMD to QCOM">
@@ -44356,6 +44357,11 @@
                 <enum name="GL_TEXTURE_2D_STACK_BINDING_MESAX"/>
             </require>
         </extension>
+        <extension name="GL_MESA_framebuffer_flip_y" supported="gles2">
+            <require>
+                <enum name="GL_FRAMEBUFFER_FLIP_Y_MESA"/>
+            </require>
+        </extension>
         <extension name="GL_MESA_pack_invert" supported="gl">
             <require>
                 <enum name="GL_PACK_INVERT_MESA"/>
diff --git a/src/mapi/glapi/tests/check_table.cpp b/src/mapi/glapi/tests/check_table.cpp
index 6230f12..761f2a2 100644
--- a/src/mapi/glapi/tests/check_table.cpp
+++ b/src/mapi/glapi/tests/check_table.cpp
@@ -1260,7 +1260,6 @@
    { "glTextureStorage1DEXT", _O(TextureStorage1DEXT) },
    { "glTextureStorage2DEXT", _O(TextureStorage2DEXT) },
    { "glTextureStorage3DEXT", _O(TextureStorage3DEXT) },
-   { "glPolygonOffsetEXT", _O(PolygonOffsetEXT) },
    { "glSampleMaskSGIS", _O(SampleMaskSGIS) },
    { "glSamplePatternSGIS", _O(SamplePatternSGIS) },
    { "glColorPointerEXT", _O(ColorPointerEXT) },
diff --git a/src/mapi/mapi_abi.py b/src/mapi/mapi_abi.py
index 82a2511..be1d15d 100644
--- a/src/mapi/mapi_abi.py
+++ b/src/mapi/mapi_abi.py
@@ -24,6 +24,8 @@
 # Authors:
 #    Chia-I Wu <olv@lunarg.com>
 
+from __future__ import print_function
+
 import sys
 # make it possible to import glapi
 import os
@@ -166,7 +168,7 @@
             else:
                 attrs['handcode'] = None
 
-            if entry_dict.has_key(name):
+            if name in entry_dict:
                 raise Exception('%s is duplicated' % (name))
 
             cols = []
@@ -178,8 +180,7 @@
             ent = ABIEntry(cols, attrs, func)
             entry_dict[ent.name] = ent
 
-    entries = entry_dict.values()
-    entries.sort()
+    entries = sorted(entry_dict.values())
 
     return entries
 
@@ -244,12 +245,11 @@
             raise Exception('invalid slot in %s' % (line))
 
         ent = ABIEntry(cols, attrs)
-        if entry_dict.has_key(ent.name):
+        if ent.name in entry_dict:
             raise Exception('%s is duplicated' % (ent.name))
         entry_dict[ent.name] = ent
 
-    entries = entry_dict.values()
-    entries.sort()
+    entries = sorted(entry_dict.values())
 
     return entries
 
@@ -260,7 +260,7 @@
     all_names = []
     last_slot = entries[-1].slot
     i = 0
-    for slot in xrange(last_slot + 1):
+    for slot in range(last_slot + 1):
         if entries[i].slot != slot:
             raise Exception('entries are not ordered by slots')
         if entries[i].alias:
@@ -541,79 +541,79 @@
         return "\n".join(asm)
 
     def output_for_lib(self):
-        print self.c_notice()
+        print(self.c_notice())
 
         if self.c_header:
-            print
-            print self.c_header
+            print()
+            print(self.c_header)
 
-        print
-        print '#ifdef MAPI_TMP_DEFINES'
-        print self.c_public_includes()
-        print
-        print self.c_public_declarations(self.prefix_lib)
-        print '#undef MAPI_TMP_DEFINES'
-        print '#endif /* MAPI_TMP_DEFINES */'
+        print()
+        print('#ifdef MAPI_TMP_DEFINES')
+        print(self.c_public_includes())
+        print()
+        print(self.c_public_declarations(self.prefix_lib))
+        print('#undef MAPI_TMP_DEFINES')
+        print('#endif /* MAPI_TMP_DEFINES */')
 
         if self.lib_need_table_size:
-            print
-            print '#ifdef MAPI_TMP_TABLE'
-            print self.c_mapi_table()
-            print '#undef MAPI_TMP_TABLE'
-            print '#endif /* MAPI_TMP_TABLE */'
+            print()
+            print('#ifdef MAPI_TMP_TABLE')
+            print(self.c_mapi_table())
+            print('#undef MAPI_TMP_TABLE')
+            print('#endif /* MAPI_TMP_TABLE */')
 
         if self.lib_need_noop_array:
-            print
-            print '#ifdef MAPI_TMP_NOOP_ARRAY'
-            print '#ifdef DEBUG'
-            print
-            print self.c_noop_functions(self.prefix_noop, self.prefix_warn)
-            print
-            print 'const mapi_func table_%s_array[] = {' % (self.prefix_noop)
-            print self.c_noop_initializer(self.prefix_noop, False)
-            print '};'
-            print
-            print '#else /* DEBUG */'
-            print
-            print 'const mapi_func table_%s_array[] = {' % (self.prefix_noop)
-            print self.c_noop_initializer(self.prefix_noop, True)
-            print '};'
-            print
-            print '#endif /* DEBUG */'
-            print '#undef MAPI_TMP_NOOP_ARRAY'
-            print '#endif /* MAPI_TMP_NOOP_ARRAY */'
+            print()
+            print('#ifdef MAPI_TMP_NOOP_ARRAY')
+            print('#ifdef DEBUG')
+            print()
+            print(self.c_noop_functions(self.prefix_noop, self.prefix_warn))
+            print()
+            print('const mapi_func table_%s_array[] = {' % (self.prefix_noop))
+            print(self.c_noop_initializer(self.prefix_noop, False))
+            print('};')
+            print()
+            print('#else /* DEBUG */')
+            print()
+            print('const mapi_func table_%s_array[] = {' % (self.prefix_noop))
+            print(self.c_noop_initializer(self.prefix_noop, True))
+            print('};')
+            print()
+            print('#endif /* DEBUG */')
+            print('#undef MAPI_TMP_NOOP_ARRAY')
+            print('#endif /* MAPI_TMP_NOOP_ARRAY */')
 
         if self.lib_need_stubs:
             pool, pool_offsets = self.c_stub_string_pool()
-            print
-            print '#ifdef MAPI_TMP_PUBLIC_STUBS'
-            print 'static const char public_string_pool[] ='
-            print pool
-            print
-            print 'static const struct mapi_stub public_stubs[] = {'
-            print self.c_stub_initializer(self.prefix_lib, pool_offsets)
-            print '};'
-            print '#undef MAPI_TMP_PUBLIC_STUBS'
-            print '#endif /* MAPI_TMP_PUBLIC_STUBS */'
+            print()
+            print('#ifdef MAPI_TMP_PUBLIC_STUBS')
+            print('static const char public_string_pool[] =')
+            print(pool)
+            print()
+            print('static const struct mapi_stub public_stubs[] = {')
+            print(self.c_stub_initializer(self.prefix_lib, pool_offsets))
+            print('};')
+            print('#undef MAPI_TMP_PUBLIC_STUBS')
+            print('#endif /* MAPI_TMP_PUBLIC_STUBS */')
 
         if self.lib_need_all_entries:
-            print
-            print '#ifdef MAPI_TMP_PUBLIC_ENTRIES'
-            print self.c_public_dispatches(self.prefix_lib, False)
-            print
-            print 'static const mapi_func public_entries[] = {'
-            print self.c_public_initializer(self.prefix_lib)
-            print '};'
-            print '#undef MAPI_TMP_PUBLIC_ENTRIES'
-            print '#endif /* MAPI_TMP_PUBLIC_ENTRIES */'
+            print()
+            print('#ifdef MAPI_TMP_PUBLIC_ENTRIES')
+            print(self.c_public_dispatches(self.prefix_lib, False))
+            print()
+            print('static const mapi_func public_entries[] = {')
+            print(self.c_public_initializer(self.prefix_lib))
+            print('};')
+            print('#undef MAPI_TMP_PUBLIC_ENTRIES')
+            print('#endif /* MAPI_TMP_PUBLIC_ENTRIES */')
 
-            print
-            print '#ifdef MAPI_TMP_STUB_ASM_GCC'
-            print '__asm__('
-            print self.c_asm_gcc(self.prefix_lib, False)
-            print ');'
-            print '#undef MAPI_TMP_STUB_ASM_GCC'
-            print '#endif /* MAPI_TMP_STUB_ASM_GCC */'
+            print()
+            print('#ifdef MAPI_TMP_STUB_ASM_GCC')
+            print('__asm__(')
+            print(self.c_asm_gcc(self.prefix_lib, False))
+            print(');')
+            print('#undef MAPI_TMP_STUB_ASM_GCC')
+            print('#endif /* MAPI_TMP_STUB_ASM_GCC */')
 
         if self.lib_need_non_hidden_entries:
             all_hidden = True
@@ -622,21 +622,21 @@
                     all_hidden = False
                     break
             if not all_hidden:
-                print
-                print '#ifdef MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN'
-                print self.c_public_dispatches(self.prefix_lib, True)
-                print
-                print '/* does not need public_entries */'
-                print '#undef MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN'
-                print '#endif /* MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN */'
+                print()
+                print('#ifdef MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN')
+                print(self.c_public_dispatches(self.prefix_lib, True))
+                print()
+                print('/* does not need public_entries */')
+                print('#undef MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN')
+                print('#endif /* MAPI_TMP_PUBLIC_ENTRIES_NO_HIDDEN */')
 
-                print
-                print '#ifdef MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN'
-                print '__asm__('
-                print self.c_asm_gcc(self.prefix_lib, True)
-                print ');'
-                print '#undef MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN'
-                print '#endif /* MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN */'
+                print()
+                print('#ifdef MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN')
+                print('__asm__(')
+                print(self.c_asm_gcc(self.prefix_lib, True))
+                print(');')
+                print('#undef MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN')
+                print('#endif /* MAPI_TMP_STUB_ASM_GCC_NO_HIDDEN */')
 
 class GLAPIPrinter(ABIPrinter):
     """OpenGL API Printer"""
diff --git a/src/mapi/shared-glapi/meson.build b/src/mapi/shared-glapi/meson.build
index c7d136c..24e7729 100644
--- a/src/mapi/shared-glapi/meson.build
+++ b/src/mapi/shared-glapi/meson.build
@@ -40,7 +40,7 @@
   'glapi',
   [files_mapi_glapi, files_mapi_util, shared_glapi_mapi_tmp_h],
   c_args : [
-    c_msvc_compat_args, '-DMAPI_MODE_GLAPI',
+    c_msvc_compat_args, c_vis_args, '-DMAPI_MODE_GLAPI',
     '-DMAPI_ABI_HEADER="@0@"'.format(shared_glapi_mapi_tmp_h.full_path()),
   ],
   link_args : [ld_args_gc_sections],
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index d78493e..ae8934e 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -14,8 +14,6 @@
 	main/api_exec.h \
 	main/api_loopback.c \
 	main/api_loopback.h \
-	main/api_validate.c \
-	main/api_validate.h \
 	main/arbprogram.c \
 	main/arbprogram.h \
 	main/arrayobj.c \
@@ -49,6 +47,8 @@
 	main/condrender.c \
 	main/condrender.h \
 	main/config.h \
+	main/conservativeraster.c \
+	main/conservativeraster.h \
 	main/context.c \
 	main/context.h \
 	main/convolve.c \
@@ -70,6 +70,8 @@
 	main/drawpix.h \
 	main/drawtex.c \
 	main/drawtex.h \
+	main/draw_validate.c \
+	main/draw_validate.h \
 	main/enable.c \
 	main/enable.h \
 	main/enums.c \
@@ -212,8 +214,11 @@
 	main/syncobj.c \
 	main/syncobj.h \
 	main/texcompress.c \
+	main/texcompress_astc.cpp \
+	main/texcompress_astc.h \
 	main/texcompress_bptc.c \
 	main/texcompress_bptc.h \
+	main/texcompress_bptc_tmp.h \
 	main/texcompress_cpal.c \
 	main/texcompress_cpal.h \
 	main/texcompress_etc.c \
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index ba98ad4..5a21b64 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -42,6 +42,7 @@
 mesa_sources = (
     source_lists['MESA_FILES'] +
     source_lists['PROGRAM_FILES'] +
+    source_lists['PROGRAM_NIR_FILES'] +
     source_lists['STATETRACKER_FILES']
 )
 
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index b7ac2b5..e783262 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -306,5 +306,5 @@
                                  ctx->Stencil.ZPassFunc[1]);
 
 
-   ctx->Driver.DrawBuffer(ctx, ctx->Color.DrawBuffer[0]);
+   ctx->Driver.DrawBuffer(ctx);
 }
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index 830d82a..6b1713e 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -348,18 +348,18 @@
                                    GL_FALSE, GL_FALSE,
                                    offsetof(struct vertex, x));
          _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_GENERIC(0),
-                                  *buf_obj, 0, sizeof(struct vertex), true);
+                                  *buf_obj, 0, sizeof(struct vertex));
          _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                          VERT_ATTRIB_GENERIC(0), true);
+                                          VERT_ATTRIB_GENERIC(0));
          if (texcoord_size > 0) {
             _mesa_update_array_format(ctx, array_obj, VERT_ATTRIB_GENERIC(1),
                                       texcoord_size, GL_FLOAT, GL_RGBA,
                                       GL_FALSE, GL_FALSE, GL_FALSE,
                                       offsetof(struct vertex, tex));
             _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_GENERIC(1),
-                                     *buf_obj, 0, sizeof(struct vertex), true);
+                                     *buf_obj, 0, sizeof(struct vertex));
             _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                             VERT_ATTRIB_GENERIC(1), true);
+                                             VERT_ATTRIB_GENERIC(1));
          }
       } else {
          _mesa_update_array_format(ctx, array_obj, VERT_ATTRIB_POS,
@@ -367,9 +367,8 @@
                                    GL_FALSE, GL_FALSE,
                                    offsetof(struct vertex, x));
          _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_POS,
-                                  *buf_obj, 0, sizeof(struct vertex), true);
-         _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                          VERT_ATTRIB_POS, true);
+                                  *buf_obj, 0, sizeof(struct vertex));
+         _mesa_enable_vertex_array_attrib(ctx, array_obj, VERT_ATTRIB_POS);
 
          if (texcoord_size > 0) {
             _mesa_update_array_format(ctx, array_obj, VERT_ATTRIB_TEX(0),
@@ -377,9 +376,9 @@
                                       GL_FALSE, GL_FALSE,
                                       offsetof(struct vertex, tex));
             _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_TEX(0),
-                                     *buf_obj, 0, sizeof(struct vertex), true);
+                                     *buf_obj, 0, sizeof(struct vertex));
             _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                             VERT_ATTRIB_TEX(0), true);
+                                             VERT_ATTRIB_TEX(0));
          }
 
          if (color_size > 0) {
@@ -388,9 +387,9 @@
                                       GL_FALSE, GL_FALSE,
                                       offsetof(struct vertex, r));
             _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_COLOR0,
-                                     *buf_obj, 0, sizeof(struct vertex), true);
+                                     *buf_obj, 0, sizeof(struct vertex));
             _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                             VERT_ATTRIB_COLOR0, true);
+                                             VERT_ATTRIB_COLOR0);
          }
       }
    } else {
@@ -3347,9 +3346,8 @@
                                 GL_FALSE, GL_FALSE,
                                 offsetof(struct vertex, x));
       _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_POS,
-                               drawtex->buf_obj, 0,
-                               sizeof(struct vertex), true);
-      _mesa_enable_vertex_array_attrib(ctx, array_obj, VERT_ATTRIB_POS, true);
+                               drawtex->buf_obj, 0, sizeof(struct vertex));
+      _mesa_enable_vertex_array_attrib(ctx, array_obj, VERT_ATTRIB_POS);
 
 
       for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
@@ -3359,10 +3357,8 @@
                                    GL_FALSE, GL_FALSE,
                                    offsetof(struct vertex, st[i]));
          _mesa_bind_vertex_buffer(ctx, array_obj, VERT_ATTRIB_TEX(i),
-                                  drawtex->buf_obj, 0,
-                                  sizeof(struct vertex), true);
-         _mesa_enable_vertex_array_attrib(ctx, array_obj,
-                                          VERT_ATTRIB_TEX(i), true);
+                                  drawtex->buf_obj, 0, sizeof(struct vertex));
+         _mesa_enable_vertex_array_attrib(ctx, array_obj, VERT_ATTRIB_TEX(i));
       }
    }
    else {
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 0b94d19..d257cb6 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -389,14 +389,6 @@
         screen->max_gl_compat_version < 31)
        mesa_api = API_OPENGL_CORE;
 
-    if (mesa_api == API_OPENGL_COMPAT
-        && ((ctx_config.major_version > 3)
-            || (ctx_config.major_version == 3 &&
-                ctx_config.minor_version >= 2))) {
-       *error = __DRI_CTX_ERROR_BAD_API;
-       return NULL;
-    }
-
     /* The latest version of EGL_KHR_create_context spec says:
      *
      *     "If the EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR flag bit is set in
@@ -872,76 +864,114 @@
    }
 }
 
+/*
+ * Note: lookups in either direction return the first match, so entry order
+ * matters, e.g. __DRI_IMAGE_FORMAT_R8 maps to both MESA_FORMAT_{R,L}_UNORM8.
+ */
+static const struct {
+   uint32_t    image_format;
+   mesa_format mesa_format;
+} format_mapping[] = {
+   {
+      .image_format = __DRI_IMAGE_FORMAT_RGB565,
+      .mesa_format  =        MESA_FORMAT_B5G6R5_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_ARGB1555,
+      .mesa_format  =        MESA_FORMAT_B5G5R5A1_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_XRGB8888,
+      .mesa_format  =        MESA_FORMAT_B8G8R8X8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_ARGB2101010,
+      .mesa_format  =        MESA_FORMAT_B10G10R10A2_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_XRGB2101010,
+      .mesa_format  =        MESA_FORMAT_B10G10R10X2_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_ABGR2101010,
+      .mesa_format  =        MESA_FORMAT_R10G10B10A2_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_XBGR2101010,
+      .mesa_format  =        MESA_FORMAT_R10G10B10X2_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_ARGB8888,
+      .mesa_format  =        MESA_FORMAT_B8G8R8A8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_ABGR8888,
+      .mesa_format  =        MESA_FORMAT_R8G8B8A8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_XBGR8888,
+      .mesa_format  =        MESA_FORMAT_R8G8B8X8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_R8,
+      .mesa_format  =        MESA_FORMAT_R_UNORM8,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_R8,
+      .mesa_format  =        MESA_FORMAT_L_UNORM8,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_GR88,
+      .mesa_format  =        MESA_FORMAT_R8G8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_GR88,
+      .mesa_format  =        MESA_FORMAT_L8A8_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_SABGR8,
+      .mesa_format  =        MESA_FORMAT_R8G8B8A8_SRGB,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_SARGB8,
+      .mesa_format  =        MESA_FORMAT_B8G8R8A8_SRGB,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_R16,
+      .mesa_format  =        MESA_FORMAT_R_UNORM16,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_R16,
+      .mesa_format  =        MESA_FORMAT_L_UNORM16,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_GR1616,
+      .mesa_format  =        MESA_FORMAT_R16G16_UNORM,
+   },
+   {
+      .image_format = __DRI_IMAGE_FORMAT_GR1616,
+      .mesa_format  =        MESA_FORMAT_L16A16_UNORM,
+   },
+};
+
 uint32_t
 driGLFormatToImageFormat(mesa_format format)
 {
-   switch (format) {
-   case MESA_FORMAT_B5G6R5_UNORM:
-      return __DRI_IMAGE_FORMAT_RGB565;
-   case MESA_FORMAT_B5G5R5A1_UNORM:
-      return __DRI_IMAGE_FORMAT_ARGB1555;
-   case MESA_FORMAT_B8G8R8X8_UNORM:
-      return __DRI_IMAGE_FORMAT_XRGB8888;
-   case MESA_FORMAT_B10G10R10A2_UNORM:
-      return __DRI_IMAGE_FORMAT_ARGB2101010;
-   case MESA_FORMAT_B10G10R10X2_UNORM:
-      return __DRI_IMAGE_FORMAT_XRGB2101010;
-   case MESA_FORMAT_B8G8R8A8_UNORM:
-      return __DRI_IMAGE_FORMAT_ARGB8888;
-   case MESA_FORMAT_R8G8B8A8_UNORM:
-      return __DRI_IMAGE_FORMAT_ABGR8888;
-   case MESA_FORMAT_R8G8B8X8_UNORM:
-      return __DRI_IMAGE_FORMAT_XBGR8888;
-   case MESA_FORMAT_L_UNORM8:
-   case MESA_FORMAT_R_UNORM8:
-      return __DRI_IMAGE_FORMAT_R8;
-   case MESA_FORMAT_L8A8_UNORM:
-   case MESA_FORMAT_R8G8_UNORM:
-      return __DRI_IMAGE_FORMAT_GR88;
-   case MESA_FORMAT_NONE:
-      return __DRI_IMAGE_FORMAT_NONE;
-   case MESA_FORMAT_B8G8R8A8_SRGB:
-      return __DRI_IMAGE_FORMAT_SARGB8;
-   default:
-      return 0;
-   }
+   for (size_t i = 0; i < ARRAY_SIZE(format_mapping); i++)
+      if (format_mapping[i].mesa_format == format)
+         return format_mapping[i].image_format;
+
+   return __DRI_IMAGE_FORMAT_NONE;
 }
 
 mesa_format
 driImageFormatToGLFormat(uint32_t image_format)
 {
-   switch (image_format) {
-   case __DRI_IMAGE_FORMAT_RGB565:
-      return MESA_FORMAT_B5G6R5_UNORM;
-   case __DRI_IMAGE_FORMAT_ARGB1555:
-      return MESA_FORMAT_B5G5R5A1_UNORM;
-   case __DRI_IMAGE_FORMAT_XRGB8888:
-      return MESA_FORMAT_B8G8R8X8_UNORM;
-   case __DRI_IMAGE_FORMAT_ARGB2101010:
-      return MESA_FORMAT_B10G10R10A2_UNORM;
-   case __DRI_IMAGE_FORMAT_XRGB2101010:
-      return MESA_FORMAT_B10G10R10X2_UNORM;
-   case __DRI_IMAGE_FORMAT_ARGB8888:
-      return MESA_FORMAT_B8G8R8A8_UNORM;
-   case __DRI_IMAGE_FORMAT_ABGR8888:
-      return MESA_FORMAT_R8G8B8A8_UNORM;
-   case __DRI_IMAGE_FORMAT_XBGR8888:
-      return MESA_FORMAT_R8G8B8X8_UNORM;
-   case __DRI_IMAGE_FORMAT_R8:
-      return MESA_FORMAT_R_UNORM8;
-   case __DRI_IMAGE_FORMAT_R16:
-      return MESA_FORMAT_R_UNORM16;
-   case __DRI_IMAGE_FORMAT_GR88:
-      return MESA_FORMAT_R8G8_UNORM;
-   case __DRI_IMAGE_FORMAT_GR1616:
-      return MESA_FORMAT_R16G16_UNORM;
-   case __DRI_IMAGE_FORMAT_SARGB8:
-      return MESA_FORMAT_B8G8R8A8_SRGB;
-   case __DRI_IMAGE_FORMAT_NONE:
-      return MESA_FORMAT_NONE;
-   default:
-      return MESA_FORMAT_NONE;
-   }
+   for (size_t i = 0; i < ARRAY_SIZE(format_mapping); i++)
+      if (format_mapping[i].image_format == image_format)
+         return format_mapping[i].mesa_format;
+
+   return MESA_FORMAT_NONE;
 }
 
 /** Image driver interface */
diff --git a/src/mesa/drivers/dri/common/utils.c b/src/mesa/drivers/dri/common/utils.c
index 1cffd3a..fc5e2d1 100644
--- a/src/mesa/drivers/dri/common/utils.c
+++ b/src/mesa/drivers/dri/common/utils.c
@@ -208,6 +208,7 @@
       masks = masks_table[2];
       break;
    case MESA_FORMAT_R8G8B8A8_UNORM:
+   case MESA_FORMAT_R8G8B8A8_SRGB:
       masks = masks_table[5];
       break;
    case MESA_FORMAT_R8G8B8X8_UNORM:
diff --git a/src/mesa/drivers/dri/i915/intel_buffers.c b/src/mesa/drivers/dri/i915/intel_buffers.c
index 386e032..83d59ed 100644
--- a/src/mesa/drivers/dri/i915/intel_buffers.c
+++ b/src/mesa/drivers/dri/i915/intel_buffers.c
@@ -53,7 +53,7 @@
 }
 
 static void
-intelDrawBuffer(struct gl_context * ctx, GLenum mode)
+intelDrawBuffer(struct gl_context * ctx)
 {
    if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer)) {
       struct intel_context *const intel = intel_context(ctx);
diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h
index a219c7f..a833989 100644
--- a/src/mesa/drivers/dri/i915/intel_context.h
+++ b/src/mesa/drivers/dri/i915/intel_context.h
@@ -284,7 +284,11 @@
 
 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"
+#if ANDROID_API_LEVEL >= 26
+#include <log/log.h>
+#else
 #include <cutils/log.h>
+#endif /* use log/log.h starting with the Android 8 major version */
 #ifndef ALOGW
 #define ALOGW LOGW
 #endif
diff --git a/src/mesa/drivers/dri/i915/intel_extensions.c b/src/mesa/drivers/dri/i915/intel_extensions.c
index c85bd78..05ac487 100644
--- a/src/mesa/drivers/dri/i915/intel_extensions.c
+++ b/src/mesa/drivers/dri/i915/intel_extensions.c
@@ -78,6 +78,7 @@
    ctx->Extensions.OES_draw_texture = true;
 
    ctx->Const.GLSLVersion = 120;
+   ctx->Const.GLSLVersionCompat = 120;
    _mesa_override_glsl_version(&ctx->Const);
 
    if (intel->gen >= 3) {
@@ -89,7 +90,6 @@
       ctx->Extensions.EXT_texture_sRGB = true;
       ctx->Extensions.EXT_texture_sRGB_decode = true;
       ctx->Extensions.EXT_stencil_two_side = true;
-      ctx->Extensions.ATI_separate_stencil = true;
       ctx->Extensions.ATI_texture_env_combine3 = true;
       ctx->Extensions.NV_texture_env_combine4 = true;
 
diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c
index 827a77f..78e2c1e 100644
--- a/src/mesa/drivers/dri/i915/intel_fbo.c
+++ b/src/mesa/drivers/dri/i915/intel_fbo.c
@@ -86,7 +86,8 @@
 		       GLuint x, GLuint y, GLuint w, GLuint h,
 		       GLbitfield mode,
 		       GLubyte **out_map,
-		       GLint *out_stride)
+		       GLint *out_stride,
+		       bool flip_y)
 {
    struct intel_context *intel = intel_context(ctx);
    struct swrast_renderbuffer *srb = (struct swrast_renderbuffer *)rb;
@@ -94,6 +95,9 @@
    void *map;
    int stride;
 
+   /* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */
+   assert((rb->Name == 0) == flip_y);
+
    if (srb->Buffer) {
       /* this is a malloc'd renderbuffer (accum buffer), not an irb */
       GLint bpp = _mesa_get_format_bytes(rb->Format);
diff --git a/src/mesa/drivers/dri/i915/intel_screen.c b/src/mesa/drivers/dri/i915/intel_screen.c
index 5024a69..882c498 100644
--- a/src/mesa/drivers/dri/i915/intel_screen.c
+++ b/src/mesa/drivers/dri/i915/intel_screen.c
@@ -415,7 +415,7 @@
    image->data = loaderPrivate;
    intel_setup_image_from_mipmap_tree(intel, image, iobj->mt, level, zoffset);
    image->dri_format = driGLFormatToImageFormat(image->format);
-   if (image->dri_format == MESA_FORMAT_NONE) {
+   if (image->dri_format == __DRI_IMAGE_FORMAT_NONE) {
       *error = __DRI_IMAGE_ERROR_BAD_PARAMETER;
       free(image);
       return NULL;
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 3479ceb..db6591a 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -37,6 +37,8 @@
 	brw_pipe_control.h \
 	brw_performance_query.h \
 	brw_performance_query.c \
+	brw_performance_query_mdapi.c \
+	brw_performance_query_metrics.h \
 	brw_program.c \
 	brw_program.h \
 	brw_program_binary.c \
@@ -66,14 +68,12 @@
 	gen4_blorp_exec.h \
 	gen6_clip_state.c \
 	gen6_constant_state.c \
-	gen6_depth_state.c \
 	gen6_multisample_state.c \
 	gen6_queryobj.c \
 	gen6_sampler_state.c \
 	gen6_sol.c \
 	gen6_urb.c \
 	gen7_l3_state.c \
-	gen7_misc_state.c \
 	gen7_sol_state.c \
 	gen7_urb.c \
 	gen8_depth_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c
index 4f6bd97..ad3a47e 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -48,8 +48,8 @@
                         uint32_t *kernel_out, void *prog_data_out)
 {
    struct brw_context *brw = blorp->driver_ctx;
-   return brw_search_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
-                           key, key_size, kernel_out, prog_data_out);
+   return brw_search_cache(&brw->cache, BRW_CACHE_BLORP_PROG, key, key_size,
+                           kernel_out, prog_data_out, true);
 }
 
 static bool
@@ -121,7 +121,7 @@
 static void
 blorp_surf_for_miptree(struct brw_context *brw,
                        struct blorp_surf *surf,
-                       struct intel_mipmap_tree *mt,
+                       const struct intel_mipmap_tree *mt,
                        enum isl_aux_usage aux_usage,
                        bool is_render_target,
                        unsigned *level,
@@ -156,10 +156,6 @@
       .tile_y_sa = mt->level[*level].level_y,
    };
 
-   if (mt->format == MESA_FORMAT_S_UINT8 && is_render_target &&
-       devinfo->gen <= 7)
-      mt->r8stencil_needs_update = true;
-
    if (surf->aux_usage == ISL_AUX_USAGE_HIZ &&
        !intel_miptree_level_has_hiz(mt, *level))
       surf->aux_usage = ISL_AUX_USAGE_NONE;
@@ -168,7 +164,11 @@
       /* We only really need a clear color if we also have an auxiliary
        * surface.  Without one, it does nothing.
        */
-      surf->clear_color = mt->fast_clear_color;
+      surf->clear_color =
+         intel_miptree_get_clear_color(devinfo, mt, mt->surf.format,
+                                       !is_render_target, (struct brw_bo **)
+                                       &surf->clear_color_addr.buffer,
+                                       &surf->clear_color_addr.offset);
 
       surf->aux_surf = &mt->aux_buf->surf;
       surf->aux_addr = (struct blorp_address) {
@@ -178,13 +178,6 @@
 
       surf->aux_addr.buffer = mt->aux_buf->bo;
       surf->aux_addr.offset = mt->aux_buf->offset;
-
-      if (devinfo->gen >= 10) {
-         surf->clear_color_addr = (struct blorp_address) {
-            .buffer = mt->aux_buf->clear_color_bo,
-            .offset = mt->aux_buf->clear_color_offset,
-         };
-      }
    } else {
       surf->aux_addr = (struct blorp_address) {
          .buffer = NULL,
@@ -201,6 +194,26 @@
    *level -= mt->first_level;
 }
 
+static bool
+brw_blorp_supports_dst_format(struct brw_context *brw, mesa_format format)
+{
+   /* If it's renderable, it's definitely supported. */
+   if (brw->mesa_format_supports_render[format])
+      return true;
+
+   /* BLORP can't compress anything */
+   if (_mesa_is_format_compressed(format))
+      return false;
+
+   /* No exotic formats such as GL_LUMINANCE_ALPHA */
+   if (_mesa_get_format_bits(format, GL_RED_BITS) == 0 &&
+       _mesa_get_format_bits(format, GL_DEPTH_BITS) == 0 &&
+       _mesa_get_format_bits(format, GL_STENCIL_BITS) == 0)
+      return false;
+
+   return true;
+}
+
 static enum isl_format
 brw_blorp_to_isl_format(struct brw_context *brw, mesa_format format,
                         bool is_render_target)
@@ -218,15 +231,20 @@
       return ISL_FORMAT_R32_FLOAT;
    case MESA_FORMAT_Z_UNORM16:
       return ISL_FORMAT_R16_UNORM;
-   default: {
+   default:
       if (is_render_target) {
-         assert(brw->mesa_format_supports_render[format]);
-         return brw->mesa_to_isl_render_format[format];
+         assert(brw_blorp_supports_dst_format(brw, format));
+         if (brw->mesa_format_supports_render[format]) {
+            return brw->mesa_to_isl_render_format[format];
+         } else {
+            return brw_isl_format_for_mesa_format(format);
+         }
       } else {
+         /* Sources use the generic mapping.  Destinations that blorp supports
+          * even though we technically can't render to them are handled above.
+          */
          return brw_isl_format_for_mesa_format(format);
       }
-      break;
-   }
    }
 }
 
@@ -268,12 +286,12 @@
                         float src_x1, float src_y1,
                         float dst_x0, float dst_y0,
                         float dst_x1, float dst_y1,
-                        GLenum filter, bool mirror_x, bool mirror_y,
+                        GLenum gl_filter, bool mirror_x, bool mirror_y,
                         bool decode_srgb, bool encode_srgb)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
+   DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f) "
        "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
        __func__,
        src_mt->surf.samples, _mesa_get_format_name(src_mt->format), src_mt,
@@ -305,6 +323,65 @@
       src_format = dst_format = MESA_FORMAT_R_FLOAT32;
    }
 
+   enum blorp_filter blorp_filter;
+   if (fabsf(dst_x1 - dst_x0) == fabsf(src_x1 - src_x0) &&
+       fabsf(dst_y1 - dst_y0) == fabsf(src_y1 - src_y0)) {
+      if (src_mt->surf.samples > 1 && dst_mt->surf.samples <= 1) {
+         /* From the OpenGL ES 3.2 specification, section 16.2.1:
+          *
+          *    "If the read framebuffer is multisampled (its effective value
+          *    of SAMPLE_BUFFERS is one) and the draw framebuffer is not (its
+          *    value of SAMPLE_BUFFERS is zero), the samples corresponding to
+          *    each pixel location in the source are converted to a single
+          *    sample before being written to the destination.  The filter
+          *    parameter is ignored. If the source formats are integer types
+          *    or stencil values, a single sample’s value is selected for each
+          *    pixel.  If the source formats are floating-point or normalized
+          *    types, the sample values for each pixel are resolved in an
+          *    implementation-dependent manner.  If the source formats are
+          *    depth values, sample values are resolved in an implementation-
+          *    dependent manner where the result will be between the minimum
+          *    and maximum depth values in the pixel."
+          *
+          * For depth and stencil resolves, we choose to always use the value
+          * at sample 0.
+          */
+         GLenum base_format = _mesa_get_format_base_format(src_mt->format);
+         if (base_format == GL_DEPTH_COMPONENT ||
+             base_format == GL_STENCIL_INDEX ||
+             base_format == GL_DEPTH_STENCIL ||
+             _mesa_is_format_integer(src_mt->format)) {
+            /* The OpenGL ES 3.2 spec says:
+             *
+             *    "If the source formats are integer types or stencil values,
+             *    a single sample's value is selected for each pixel."
+             *
+             * Just take sample 0 in this case.
+             */
+            blorp_filter = BLORP_FILTER_SAMPLE_0;
+         } else {
+            blorp_filter = BLORP_FILTER_AVERAGE;
+         }
+      } else {
+         /* From the OpenGL 4.6 specification, section 18.3.1:
+          *
+          *    "If the source and destination dimensions are identical, no
+          *    filtering is applied."
+          *
+          * Using BLORP_FILTER_NONE will also handle the upsample case by
+          * replicating the one value in the source to all values in the
+          * destination.
+          */
+         blorp_filter = BLORP_FILTER_NONE;
+      }
+   } else if (gl_filter == GL_LINEAR ||
+              gl_filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
+              gl_filter == GL_SCALED_RESOLVE_NICEST_EXT) {
+      blorp_filter = BLORP_FILTER_BILINEAR;
+   } else {
+      blorp_filter = BLORP_FILTER_NEAREST;
+   }
+
    enum isl_format src_isl_format =
       brw_blorp_to_isl_format(brw, src_format, false);
    enum isl_aux_usage src_aux_usage =
@@ -351,7 +428,7 @@
               dst_isl_format, ISL_SWIZZLE_IDENTITY,
               src_x0, src_y0, src_x1, src_y1,
               dst_x0, dst_y0, dst_x1, dst_y1,
-              filter, mirror_x, mirror_y);
+              blorp_filter, mirror_x, mirror_y);
    blorp_batch_finish(&batch);
 
    intel_miptree_finish_write(brw, dst_mt, dst_level, dst_layer, 1,
@@ -571,14 +648,6 @@
       src_mt = find_miptree(buffer_bit, src_irb);
       dst_mt = find_miptree(buffer_bit, dst_irb);
 
-      /* We can't handle format conversions between Z24 and other formats
-       * since we have to lie about the surface format. See the comments in
-       * brw_blorp_surface_info::set().
-       */
-      if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
-          (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
-         return false;
-
       /* We also can't handle any combined depth-stencil formats because we
        * have to reinterpret as a color format.
        */
@@ -647,32 +716,14 @@
    struct intel_mipmap_tree *src_mt = src_irb->mt;
    struct intel_mipmap_tree *dst_mt = intel_image->mt;
 
-   /* There is support for only up to eight samples. */
-   if (src_mt->surf.samples > 8 || dst_mt->surf.samples > 8)
-      return false;
-
-   if (_mesa_get_format_base_format(src_rb->Format) !=
-       _mesa_get_format_base_format(dst_image->TexFormat)) {
-      return false;
-   }
-
-   /* We can't handle format conversions between Z24 and other formats since
-    * we have to lie about the surface format.  See the comments in
-    * brw_blorp_surface_info::set().
-    */
-   if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
-       (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT)) {
-      return false;
-   }
-
-   /* We also can't handle any combined depth-stencil formats because we
-    * have to reinterpret as a color format.
+   /* We can't handle any combined depth-stencil formats because we have to
+    * reinterpret as a color format.
     */
    if (_mesa_get_format_base_format(src_mt->format) == GL_DEPTH_STENCIL ||
        _mesa_get_format_base_format(dst_mt->format) == GL_DEPTH_STENCIL)
       return false;
 
-   if (!brw->mesa_format_supports_render[dst_image->TexFormat])
+   if (!brw_blorp_supports_dst_format(brw, dst_image->TexFormat))
       return false;
 
    /* Source clipping shouldn't be necessary, since copytexsubimage (in
@@ -693,7 +744,7 @@
    /* Account for the fact that in the system framebuffer, the origin is at
     * the lower left.
     */
-   bool mirror_y = _mesa_is_winsys_fbo(ctx->ReadBuffer);
+   bool mirror_y = ctx->ReadBuffer->FlipY;
    if (mirror_y)
       apply_y_flip(&srcY0, &srcY1, src_rb->Height);
 
@@ -826,7 +877,8 @@
        * data which we need to copy into a BO.
        */
       struct brw_bo *bo =
-         brw_bo_alloc(brw->bufmgr, "tmp_tex_subimage_src", size);
+         brw_bo_alloc(brw->bufmgr, "tmp_tex_subimage_src", size,
+                      BRW_MEMZONE_OTHER);
       if (bo == NULL) {
          perf_debug("intel_texsubimage: temp bo creation failed: size = %u\n",
                     size);
@@ -1172,12 +1224,12 @@
 
    x0 = fb->_Xmin;
    x1 = fb->_Xmax;
-   if (rb->Name != 0) {
-      y0 = fb->_Ymin;
-      y1 = fb->_Ymax;
-   } else {
+   if (fb->FlipY) {
       y0 = rb->Height - fb->_Ymax;
       y1 = rb->Height - fb->_Ymin;
+   } else {
+      y0 = fb->_Ymin;
+      y1 = fb->_Ymax;
    }
 
    /* If the clear region is empty, just return. */
@@ -1218,11 +1270,8 @@
     */
    if (can_fast_clear && !irb->mt->aux_buf) {
       assert(irb->mt->aux_usage == ISL_AUX_USAGE_CCS_D);
-      if (!intel_miptree_alloc_ccs(brw, irb->mt)) {
-         /* There are a few reasons in addition to out-of-memory, that can
-          * cause intel_miptree_alloc_non_msrt_mcs to fail.  Try to recover by
-          * falling back to non-fast clear.
-          */
+      if (!intel_miptree_alloc_aux(brw, irb->mt)) {
+         /* We're out of memory. Fall back to a non-fast clear. */
          can_fast_clear = false;
       }
    }
@@ -1235,14 +1284,16 @@
    if (can_fast_clear) {
       const enum isl_aux_state aux_state =
          intel_miptree_get_aux_state(irb->mt, irb->mt_level, irb->mt_layer);
+      union isl_color_value clear_color =
+         brw_meta_convert_fast_clear_color(brw, irb->mt,
+                                           &ctx->Color.ClearColor);
 
-      bool same_clear_color =
-         !intel_miptree_set_clear_color(brw, irb->mt, &ctx->Color.ClearColor);
+      intel_miptree_set_clear_color(brw, irb->mt, clear_color);
 
-      /* If the buffer is already in INTEL_FAST_CLEAR_STATE_CLEAR, the clear
+      /* If the buffer is already in ISL_AUX_STATE_CLEAR, the clear
        * is redundant and can be skipped.
        */
-      if (aux_state == ISL_AUX_STATE_CLEAR && same_clear_color)
+      if (aux_state == ISL_AUX_STATE_CLEAR)
          return;
 
       DBG("%s (fast) to mt %p level %d layers %d+%d\n", __FUNCTION__,
@@ -1269,7 +1320,8 @@
       brw_emit_end_of_pipe_sync(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH);
 
       struct blorp_batch batch;
-      blorp_batch_init(&brw->blorp, &batch, brw, 0);
+      blorp_batch_init(&brw->blorp, &batch, brw,
+                       BLORP_BATCH_NO_UPDATE_CLEAR_COLOR);
       blorp_fast_clear(&batch, &surf, isl_format,
                        level, irb->mt_layer, num_layers,
                        x0, y0, x1, y1);
@@ -1363,9 +1415,8 @@
    if (!(mask & (BUFFER_BITS_DEPTH_STENCIL)))
       return;
 
-   uint32_t x0, x1, y0, y1, rb_name, rb_height;
+   uint32_t x0, x1, y0, y1, rb_height;
    if (depth_rb) {
-      rb_name = depth_rb->Name;
       rb_height = depth_rb->Height;
       if (stencil_rb) {
          assert(depth_rb->Width == stencil_rb->Width);
@@ -1373,18 +1424,17 @@
       }
    } else {
       assert(stencil_rb);
-      rb_name = stencil_rb->Name;
       rb_height = stencil_rb->Height;
    }
 
    x0 = fb->_Xmin;
    x1 = fb->_Xmax;
-   if (rb_name != 0) {
-      y0 = fb->_Ymin;
-      y1 = fb->_Ymax;
-   } else {
+   if (fb->FlipY) {
       y0 = rb_height - fb->_Ymax;
       y1 = rb_height - fb->_Ymin;
+   } else {
+      y0 = fb->_Ymin;
+      y1 = fb->_Ymax;
    }
 
    /* If the clear region is empty, just return. */
@@ -1424,10 +1474,10 @@
          assert(level == irb->mt_level);
          assert(start_layer == irb->mt_layer);
          assert(num_layers == fb->MaxNumLayers ? irb->layer_count : 1);
-      } else {
-         level = irb->mt_level;
-         start_layer = irb->mt_layer;
       }
+
+      level = irb->mt_level;
+      start_layer = irb->mt_layer;
       num_layers = fb->MaxNumLayers ? irb->layer_count : 1;
 
       stencil_mask = ctx->Stencil.WriteMask[0] & 0xff;
@@ -1623,7 +1673,8 @@
                           &level, start_layer, num_layers, isl_tmp);
 
    struct blorp_batch batch;
-   blorp_batch_init(&brw->blorp, &batch, brw, 0);
+   blorp_batch_init(&brw->blorp, &batch, brw,
+                    BLORP_BATCH_NO_UPDATE_CLEAR_COLOR);
    blorp_hiz_op(&batch, &surf, level, start_layer, num_layers, op);
    blorp_batch_finish(&batch);
 
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 8ba915b..19e2d14 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -54,12 +54,15 @@
 #endif
 #include "common/gen_clflush.h"
 #include "common/gen_debug.h"
+#include "common/gen_gem.h"
 #include "dev/gen_device_info.h"
 #include "libdrm_macros.h"
 #include "main/macros.h"
 #include "util/macros.h"
 #include "util/hash_table.h"
 #include "util/list.h"
+#include "util/u_dynarray.h"
+#include "util/vma.h"
 #include "brw_bufmgr.h"
 #include "brw_context.h"
 #include "string.h"
@@ -98,9 +101,41 @@
    return c == unless;
 }
 
+/**
+ * i965 fixed-size bucketing VMA allocator.
+ *
+ * The BO cache maintains "cache buckets" for buffers of various sizes.
+ * All buffers in a given bucket are identically sized - when allocating,
+ * we always round up to the bucket size.  This means that virtually all
+ * allocations are fixed-size; only buffers which are too large to fit in
+ * a bucket can be variably-sized.
+ *
+ * We create an allocator for each bucket and memory zone.  Each contains a
+ * free-list of nodes, each node being a <starting address, 64-bit bitmap>
+ * pair.  Each bit represents a bucket-sized block of memory.  (At the first
+ * level, each bit corresponds to a page.  For the second bucket, bits
+ * correspond to two pages, and so on.)  1 means a block is free, and 0 means
+ * it's in-use.  The lowest bit in the bitmap is for the first block.
+ *
+ * This makes allocations cheap - any bit of any node will do.  We take the
+ * node at the top of the list and use ffsll() to find a free block.  If the
+ * list is empty, we allocate 64 blocks from a larger allocator - either a
+ * bigger bucketing allocator, or a fallback top-level allocator (util_vma).
+ */
+struct vma_bucket_node {
+   uint64_t start_address;
+   uint64_t bitmap;
+};
+
 struct bo_cache_bucket {
+   /** List of cached BOs. */
    struct list_head head;
+
+   /** Size of this bucket, in bytes. */
    uint64_t size;
+
+   /** List of vma_bucket_nodes. */
+   struct util_dynarray vma_list[BRW_MEMZONE_COUNT];
 };
 
 struct brw_bufmgr {
@@ -116,10 +151,13 @@
    struct hash_table *name_table;
    struct hash_table *handle_table;
 
+   struct util_vma_heap vma_allocator[BRW_MEMZONE_COUNT];
+
    bool has_llc:1;
    bool has_mmap_wc:1;
    bool bo_reuse:1;
-   bool supports_48b_addresses:1;
+
+   uint64_t initial_kflags;
 };
 
 static int bo_set_tiling_internal(struct brw_bo *bo, uint32_t tiling_mode,
@@ -127,6 +165,10 @@
 
 static void bo_free(struct brw_bo *bo);
 
+static uint64_t vma_alloc(struct brw_bufmgr *bufmgr,
+                          enum brw_memory_zone memzone,
+                          uint64_t size, uint64_t alignment);
+
 static uint32_t
 key_hash_uint(const void *key)
 {
@@ -221,6 +263,187 @@
           &bufmgr->cache_bucket[index] : NULL;
 }
 
+static enum brw_memory_zone
+memzone_for_address(uint64_t address)
+{
+   /* Zone lookup by address: everything under the 4GB mark is the
+    * low zone, any address at or above it is BRW_MEMZONE_OTHER.
+    */
+   const uint64_t low_zone_limit = 1ull << 32;
+   return address < low_zone_limit ? BRW_MEMZONE_LOW_4G
+                                   : BRW_MEMZONE_OTHER;
+}
+
+/**
+ * Hand out one bucket-sized block of address space from \p bucket for the
+ * given memory zone.  Returns the block's address, or 0 on failure.
+ */
+static uint64_t
+bucket_vma_alloc(struct brw_bufmgr *bufmgr,
+                 struct bo_cache_bucket *bucket,
+                 enum brw_memory_zone memzone)
+{
+   struct util_dynarray *vma_list = &bucket->vma_list[memzone];
+   struct vma_bucket_node *node;
+
+   if (vma_list->size == 0) {
+      /* Out of space: carve a new 64-block node from a larger allocator
+       * (bigger bucket or util_vma), aligned to the node size so that
+       * bucket_vma_free can round down to its start.  Bit 0 starts used.
+       */
+      uint64_t node_size = 64ull * bucket->size;
+      node = util_dynarray_grow(vma_list, sizeof(struct vma_bucket_node));
+      if (unlikely(!node))
+         return 0ull;
+
+      uint64_t addr = vma_alloc(bufmgr, memzone, node_size, node_size);
+      if (unlikely(addr == 0ull)) {
+         /* Don't leave a bogus node at address 0 on the free list. */
+         (void) util_dynarray_pop(vma_list, struct vma_bucket_node);
+         return 0ull;
+      }
+      node->start_address = gen_48b_address(addr);
+      node->bitmap = ~1ull;
+      return node->start_address;
+   }
+
+   /* Pick any bit from any node - they're all the right size and free. */
+   node = util_dynarray_top_ptr(vma_list, struct vma_bucket_node);
+   int bit = ffsll(node->bitmap) - 1;
+   assert(bit >= 0 && bit <= 63);
+
+   /* Reserve the memory by clearing the bit. */
+   assert((node->bitmap & (1ull << bit)) != 0ull);
+   node->bitmap &= ~(1ull << bit);
+   uint64_t addr = node->start_address + bit * bucket->size;
+
+   /* If this node is now completely full, remove it from the free list. */
+   if (node->bitmap == 0ull)
+      (void) util_dynarray_pop(vma_list, struct vma_bucket_node);
+
+   return addr;
+}
+
+/* Return one bucket-sized block at \p address to the bucket's free list. */
+static void
+bucket_vma_free(struct bo_cache_bucket *bucket, uint64_t address)
+{
+   struct util_dynarray *free_list =
+      &bucket->vma_list[memzone_for_address(address)];
+   const uint64_t block_bytes = bucket->size;
+   const uint64_t group_bytes = 64ull * block_bytes;
+
+   /* Nodes are created by bucket_vma_alloc as 64 blocks aligned to the
+    * 64-block size, so rounding down yields the node's start address and
+    * the remainder, in blocks, is the bit index within its bitmap.
+    */
+   const uint64_t start = address - (address % group_bytes);
+   int bit = (address - start) / block_bytes;
+
+   assert(start + bit * block_bytes == address);
+
+   /* Look for a node that already tracks this group of blocks. */
+   struct vma_bucket_node *node = NULL;
+   util_dynarray_foreach(free_list, struct vma_bucket_node, cur) {
+      if (cur->start_address != start)
+         continue;
+      node = cur;
+      break;
+   }
+
+   if (node == NULL) {
+      /* Not on the list, so all 64 blocks were in use: add a fresh node
+       * with every bit clear.  If growing the list fails we can only
+       * leak this piece of GPU address space.
+       */
+      node = util_dynarray_grow(free_list, sizeof(struct vma_bucket_node));
+      if (unlikely(node == NULL))
+         return;
+
+      node->start_address = start;
+      node->bitmap = 0ull;
+   }
+
+   /* Mark the block free.  A fully-free node is kept for later reuse. */
+   const uint64_t mask = 1ull << bit;
+   assert((node->bitmap & mask) == 0ull);
+   node->bitmap |= mask;
+}
+
+/* Look up the bucket allocator for \p size: the bucket whose size matches
+ * exactly, or NULL when the size is too large or has no exact bucket.
+ */
+static struct bo_cache_bucket *
+get_bucket_allocator(struct brw_bufmgr *bufmgr, uint64_t size)
+{
+   /* Bucket nodes reserve 64 blocks at a time, which balloons rather
+    * quickly, so very large sizes never use the bucket allocator.
+    */
+   if (size > 1024 * PAGE_SIZE)
+      return NULL;
+
+   struct bo_cache_bucket *match = bucket_for_size(bufmgr, size);
+
+   return (match != NULL && match->size == size) ? match : NULL;
+}
+
+/**
+ * Assign a range of virtual address space to a buffer.
+ *
+ * Sizes that have a bucket allocator are served from it; anything else
+ * comes from the memory zone's util_vma heap.  \p alignment is only
+ * passed to the util_vma heap - bucket blocks sit on bucket-size multiples.
+ *
+ * The result is run through gen_canonical_address() before returning.
+ */
+static uint64_t
+vma_alloc(struct brw_bufmgr *bufmgr,
+          enum brw_memory_zone memzone,
+          uint64_t size,
+          uint64_t alignment)
+{
+   /* Without softpin support, we let the kernel assign addresses. */
+   assert(brw_using_softpin(bufmgr));
+
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+   struct util_vma_heap *heap = &bufmgr->vma_allocator[memzone];
+   const uint64_t addr = bucket != NULL ?
+      bucket_vma_alloc(bufmgr, bucket, memzone) :
+      util_vma_heap_alloc(heap, size, alignment);
+
+   /* The raw address must fit in 48 bits and honor the alignment. */
+   assert((addr >> 48ull) == 0);
+   assert((addr % alignment) == 0);
+
+   return gen_canonical_address(addr);
+}
+
+/**
+ * Return an address range to its allocator so it can be handed out again.
+ */
+static void
+vma_free(struct brw_bufmgr *bufmgr,
+         uint64_t address,
+         uint64_t size)
+{
+   assert(brw_using_softpin(bufmgr));
+
+   /* Work on the un-canonicalized form; 0 means there is nothing to free. */
+   const uint64_t addr48 = gen_48b_address(address);
+   if (addr48 == 0ull)
+      return;
+
+   struct bo_cache_bucket *bucket = get_bucket_allocator(bufmgr, size);
+   if (bucket != NULL) {
+      bucket_vma_free(bucket, addr48);
+      return;
+   }
+
+   /* No bucket for this size, so it belongs to the zone's util_vma heap. */
+   enum brw_memory_zone zone = memzone_for_address(addr48);
+   util_vma_heap_free(&bufmgr->vma_allocator[zone], addr48, size);
+}
+
 int
 brw_bo_busy(struct brw_bo *bo)
 {
@@ -267,12 +490,12 @@
 bo_alloc_internal(struct brw_bufmgr *bufmgr,
                   const char *name,
                   uint64_t size,
+                  enum brw_memory_zone memzone,
                   unsigned flags,
                   uint32_t tiling_mode,
                   uint32_t stride)
 {
    struct brw_bo *bo;
-   unsigned int page_size = getpagesize();
    int ret;
    struct bo_cache_bucket *bucket;
    bool alloc_from_cache;
@@ -298,12 +521,12 @@
     * allocation up.
     */
    if (bucket == NULL) {
-      bo_size = size;
-      if (bo_size < page_size)
-         bo_size = page_size;
+      unsigned int page_size = getpagesize();
+      bo_size = size == 0 ? page_size : ALIGN(size, page_size);
    } else {
       bo_size = bucket->size;
    }
+   assert(bo_size);
 
    mtx_lock(&bufmgr->lock);
    /* Get a buffer out of the cache if available */
@@ -358,7 +581,16 @@
       }
    }
 
-   if (!alloc_from_cache) {
+   if (alloc_from_cache) {
+      /* If the cache BO isn't in the right memory zone, free the old
+       * memory and assign it a new address.
+       */
+      if ((bo->kflags & EXEC_OBJECT_PINNED) &&
+          memzone != memzone_for_address(bo->gtt_offset)) {
+         vma_free(bufmgr, bo->gtt_offset, bo->size);
+         bo->gtt_offset = 0ull;
+      }
+   } else {
       bo = calloc(1, sizeof(*bo));
       if (!bo)
          goto err;
@@ -407,8 +639,14 @@
    bo->reusable = true;
    bo->cache_coherent = bufmgr->has_llc;
    bo->index = -1;
-   if (bufmgr->supports_48b_addresses)
-      bo->kflags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+   bo->kflags = bufmgr->initial_kflags;
+
+   if ((bo->kflags & EXEC_OBJECT_PINNED) && bo->gtt_offset == 0ull) {
+      bo->gtt_offset = vma_alloc(bufmgr, memzone, bo->size, 1);
+
+      if (bo->gtt_offset == 0ull)
+         goto err_free;
+   }
 
    mtx_unlock(&bufmgr->lock);
 
@@ -426,23 +664,27 @@
 
 struct brw_bo *
 brw_bo_alloc(struct brw_bufmgr *bufmgr,
-             const char *name, uint64_t size)
+             const char *name, uint64_t size,
+             enum brw_memory_zone memzone)
 {
-   return bo_alloc_internal(bufmgr, name, size, 0, I915_TILING_NONE, 0);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            0, I915_TILING_NONE, 0);
 }
 
 struct brw_bo *
 brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr, const char *name,
-                   uint64_t size, uint32_t tiling_mode, uint32_t pitch,
+                   uint64_t size, enum brw_memory_zone memzone,
+                   uint32_t tiling_mode, uint32_t pitch,
                    unsigned flags)
 {
-   return bo_alloc_internal(bufmgr, name, size, flags, tiling_mode, pitch);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            flags, tiling_mode, pitch);
 }
 
 struct brw_bo *
 brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr, const char *name,
-                      int x, int y, int cpp, uint32_t tiling,
-                      uint32_t *pitch, unsigned flags)
+                      int x, int y, int cpp, enum brw_memory_zone memzone,
+                      uint32_t tiling, uint32_t *pitch, unsigned flags)
 {
    uint64_t size;
    uint32_t stride;
@@ -477,7 +719,8 @@
    if (tiling == I915_TILING_NONE)
       stride = 0;
 
-   return bo_alloc_internal(bufmgr, name, size, flags, tiling, stride);
+   return bo_alloc_internal(bufmgr, name, size, memzone,
+                            flags, tiling, stride);
 }
 
 /**
@@ -537,6 +780,10 @@
    bo->global_name = handle;
    bo->reusable = false;
    bo->external = true;
+   bo->kflags = bufmgr->initial_kflags;
+
+   if (bo->kflags & EXEC_OBJECT_PINNED)
+      bo->gtt_offset = vma_alloc(bufmgr, BRW_MEMZONE_OTHER, bo->size, 1);
 
    _mesa_hash_table_insert(bufmgr->handle_table, &bo->gem_handle, bo);
    _mesa_hash_table_insert(bufmgr->name_table, &bo->global_name, bo);
@@ -598,6 +845,10 @@
       DBG("DRM_IOCTL_GEM_CLOSE %d failed (%s): %s\n",
           bo->gem_handle, bo->name, strerror(errno));
    }
+
+   if (bo->kflags & EXEC_OBJECT_PINNED)
+      vma_free(bo->bufmgr, bo->gtt_offset, bo->size);
+
    free(bo);
 }
 
@@ -641,7 +892,6 @@
       bo->free_time = time;
 
       bo->name = NULL;
-      bo->kflags = 0;
 
       list_addtail(&bo->head, &bucket->head);
    } else {
@@ -1057,11 +1307,23 @@
 
          bo_free(bo);
       }
+
+      if (brw_using_softpin(bufmgr)) {
+         for (int z = 0; z < BRW_MEMZONE_COUNT; z++) {
+            util_dynarray_fini(&bucket->vma_list[z]);
+         }
+      }
    }
 
    _mesa_hash_table_destroy(bufmgr->name_table, NULL);
    _mesa_hash_table_destroy(bufmgr->handle_table, NULL);
 
+   if (brw_using_softpin(bufmgr)) {
+      for (int z = 0; z < BRW_MEMZONE_COUNT; z++) {
+         util_vma_heap_finish(&bufmgr->vma_allocator[z]);
+      }
+   }
+
    free(bufmgr);
 }
 
@@ -1157,6 +1419,12 @@
    bo->name = "prime";
    bo->reusable = false;
    bo->external = true;
+   bo->kflags = bufmgr->initial_kflags;
+
+   if (bo->kflags & EXEC_OBJECT_PINNED) {
+      assert(bo->size > 0);
+      bo->gtt_offset = vma_alloc(bufmgr, BRW_MEMZONE_OTHER, bo->size, 1);
+   }
 
    if (tiling_mode < 0) {
       struct drm_i915_gem_get_tiling get_tiling = { .handle = bo->gem_handle };
@@ -1284,6 +1552,10 @@
    assert(i < ARRAY_SIZE(bufmgr->cache_bucket));
 
    list_inithead(&bufmgr->cache_bucket[i].head);
+   if (brw_using_softpin(bufmgr)) {
+      for (int z = 0; z < BRW_MEMZONE_COUNT; z++)
+         util_dynarray_init(&bufmgr->cache_bucket[i].vma_list[z], NULL);
+   }
    bufmgr->cache_bucket[i].size = size;
    bufmgr->num_buckets++;
 
@@ -1385,22 +1657,26 @@
    return v;
 }
 
-static bool
-gem_supports_48b_addresses(int fd)
+static int
+gem_context_getparam(int fd, uint32_t context, uint64_t param, uint64_t *value)
 {
-   struct drm_i915_gem_exec_object2 obj = {
-      .flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
+   struct drm_i915_gem_context_param gp = {
+      .ctx_id = context,
+      .param = param,
    };
 
-   struct drm_i915_gem_execbuffer2 execbuf = {
-      .buffers_ptr = (uintptr_t)&obj,
-      .buffer_count = 1,
-      .rsvd1 = 0xffffffu,
-   };
+   if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &gp))
+      return -1;
 
-   int ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
+   *value = gp.value;
 
-   return ret == -1 && errno == ENOENT;
+   return 0;
+}
+
+bool
+brw_using_softpin(struct brw_bufmgr *bufmgr)
+{
+   return bufmgr->initial_kflags & EXEC_OBJECT_PINNED;
 }
 
 /**
@@ -1434,10 +1710,39 @@
       return NULL;
    }
 
+   uint64_t gtt_size;
+   if (gem_context_getparam(fd, 0, I915_CONTEXT_PARAM_GTT_SIZE, &gtt_size))
+      gtt_size = 0;
+
    bufmgr->has_llc = devinfo->has_llc;
    bufmgr->has_mmap_wc = gem_param(fd, I915_PARAM_MMAP_VERSION) > 0;
-   bufmgr->supports_48b_addresses =
-      devinfo->gen >= 8 && gem_supports_48b_addresses(fd);
+
+   const uint64_t _4GB = 4ull << 30;
+
+   if (devinfo->gen >= 8 && gtt_size > _4GB) {
+      bufmgr->initial_kflags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
+
+      /* Allocate VMA in userspace if we have softpin and full PPGTT. */
+      if (gem_param(fd, I915_PARAM_HAS_EXEC_SOFTPIN) > 0 &&
+          gem_param(fd, I915_PARAM_HAS_ALIASING_PPGTT) > 1) {
+         bufmgr->initial_kflags |= EXEC_OBJECT_PINNED;
+
+         util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_LOW_4G],
+                            4096, _4GB);
+         util_vma_heap_init(&bufmgr->vma_allocator[BRW_MEMZONE_OTHER],
+                            1 * _4GB, gtt_size - 1 * _4GB);
+      } else if (devinfo->gen >= 10) {
+         /* Softpin landed in 4.5, but GVT used an aliasing PPGTT until
+          * kernel commit 6b3816d69628becb7ff35978aa0751798b4a940a in
+          * 4.14.  Gen10+ GVT hasn't landed yet, so it's not actually a
+          * problem - but extending this requirement back to earlier gens
+          * might actually mean requiring 4.14.
+          */
+         fprintf(stderr, "i965 requires softpin (Kernel 4.5) on Gen10+.");
+         free(bufmgr);
+         return NULL;
+      }
+   }
 
    init_cache_buckets(bufmgr);
 
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 68f5e0c..32fc7a5 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -37,6 +37,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <time.h>
+
 #include "util/u_atomic.h"
 #include "util/list.h"
 
@@ -47,6 +49,42 @@
 struct gen_device_info;
 struct brw_context;
 
+/**
+ * Memory zones.  When allocating a buffer, you can request that it is
+ * placed into a specific region of the virtual address space (PPGTT).
+ *
+ * Most buffers can go anywhere (BRW_MEMZONE_OTHER).  Some buffers are
+ * accessed via an offset from a base address.  STATE_BASE_ADDRESS has
+ * a maximum 4GB size for each region, so we need to restrict those
+ * buffers to be within 4GB of the base.  Each memory zone corresponds
+ * to a particular base address.
+ *
+ * Currently, i965 partitions the address space into two regions:
+ * - BRW_MEMZONE_LOW_4G: the low 4GB
+ * - BRW_MEMZONE_OTHER: the rest of the address space, from 4GB up
+ *
+ * The per-base-address zones below all alias the low 4GB zone for now.
+ * Eventually, we hope to carve out 4GB of VMA for each base address.
+ */
+enum brw_memory_zone {
+   BRW_MEMZONE_LOW_4G,
+   BRW_MEMZONE_OTHER,
+
+   /* Shaders - Instruction State Base Address */
+   BRW_MEMZONE_SHADER  = BRW_MEMZONE_LOW_4G,
+
+   /* Scratch - General State Base Address */
+   BRW_MEMZONE_SCRATCH = BRW_MEMZONE_LOW_4G,
+
+   /* Surface State Base Address */
+   BRW_MEMZONE_SURFACE = BRW_MEMZONE_LOW_4G,
+
+   /* Dynamic State Base Address */
+   BRW_MEMZONE_DYNAMIC = BRW_MEMZONE_LOW_4G,
+};
+
+#define BRW_MEMZONE_COUNT (BRW_MEMZONE_OTHER + 1)
+
 struct brw_bo {
    /**
     * Size in bytes of the buffer object.
@@ -168,7 +206,7 @@
  * using brw_bo_map() to be used by the CPU.
  */
 struct brw_bo *brw_bo_alloc(struct brw_bufmgr *bufmgr, const char *name,
-                            uint64_t size);
+                            uint64_t size, enum brw_memory_zone memzone);
 
 /**
  * Allocate a tiled buffer object.
@@ -184,6 +222,7 @@
 struct brw_bo *brw_bo_alloc_tiled(struct brw_bufmgr *bufmgr,
                                   const char *name,
                                   uint64_t size,
+                                  enum brw_memory_zone memzone,
                                   uint32_t tiling_mode,
                                   uint32_t pitch,
                                   unsigned flags);
@@ -206,6 +245,7 @@
 struct brw_bo *brw_bo_alloc_tiled_2d(struct brw_bufmgr *bufmgr,
                                      const char *name,
                                      int x, int y, int cpp,
+                                     enum brw_memory_zone memzone,
                                      uint32_t tiling_mode,
                                      uint32_t *pitch,
                                      unsigned flags);
@@ -332,6 +372,8 @@
 int brw_reg_read(struct brw_bufmgr *bufmgr, uint32_t offset,
                  uint64_t *result);
 
+bool brw_using_softpin(struct brw_bufmgr *bufmgr);
+
 /** @{ */
 
 #if defined(__cplusplus)
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 3d540d6..30e0986 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -30,7 +30,6 @@
 #include "drivers/common/meta.h"
 
 #include "intel_batchbuffer.h"
-#include "intel_blit.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
 
@@ -108,7 +107,6 @@
    struct intel_mipmap_tree *mt = depth_irb->mt;
    struct gl_renderbuffer_attachment *depth_att = &fb->Attachment[BUFFER_DEPTH];
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   bool same_clear_value = true;
 
    if (devinfo->gen < 6)
       return false;
@@ -213,48 +211,8 @@
          }
       }
 
-      intel_miptree_set_depth_clear_value(brw, mt, clear_value);
-      same_clear_value = false;
-   }
-
-   bool need_clear = false;
-   for (unsigned a = 0; a < num_layers; a++) {
-      enum isl_aux_state aux_state =
-         intel_miptree_get_aux_state(mt, depth_irb->mt_level,
-                                     depth_irb->mt_layer + a);
-
-      if (aux_state != ISL_AUX_STATE_CLEAR) {
-         need_clear = true;
-         break;
-      }
-   }
-
-   if (!need_clear) {
-      /* If all of the layers we intend to clear are already in the clear
-       * state then simply updating the miptree fast clear value is sufficient
-       * to change their clear value.
-       */
-      if (devinfo->gen >= 10 && !same_clear_value) {
-         /* Before gen10, it was enough to just update the clear value in the
-          * miptree. But on gen10+, we let blorp update the clear value state
-          * buffer when doing a fast clear. Since we are skipping the fast
-          * clear here, we need to update the clear color ourselves.
-          */
-         uint32_t clear_offset = mt->aux_buf->clear_color_offset;
-         union isl_color_value clear_color = { .f32 = { clear_value, } };
-
-         /* We can't update the clear color while the hardware is still using
-          * the previous one for a resolve or sampling from it. So make sure
-          * that there's no pending commands at this point.
-          */
-         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
-         for (int i = 0; i < 4; i++) {
-            brw_store_data_imm32(brw, mt->aux_buf->clear_color_bo,
-                                 clear_offset + i * 4, clear_color.u32[i]);
-         }
-         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
-      }
-      return true;
+      const union isl_color_value clear_color = { .f32 = {clear_value, } };
+      intel_miptree_set_clear_color(brw, mt, clear_color);
    }
 
    for (unsigned a = 0; a < num_layers; a++) {
@@ -269,13 +227,9 @@
       }
    }
 
-   /* Now, the HiZ buffer contains data that needs to be resolved to the depth
-    * buffer.
-    */
    intel_miptree_set_aux_state(brw, mt, depth_irb->mt_level,
                                depth_irb->mt_layer, num_layers,
                                ISL_AUX_STATE_CLEAR);
-
    return true;
 }
 
@@ -307,14 +261,6 @@
       }
    }
 
-   if (mask & BUFFER_BIT_STENCIL) {
-      struct intel_renderbuffer *stencil_irb =
-         intel_get_renderbuffer(fb, BUFFER_STENCIL);
-      struct intel_mipmap_tree *mt = stencil_irb->mt;
-      if (mt && mt->stencil_mt)
-         mt->stencil_mt->r8stencil_needs_update = true;
-   }
-
    if (mask & BUFFER_BITS_COLOR) {
       brw_blorp_clear_color(brw, fb, mask, partial_clear,
                             ctx->Color.sRGBEnabled);
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 3a7c482..49c41d8 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -203,9 +203,8 @@
       }
    }
 
-   if (!brw_search_cache(&brw->cache, BRW_CACHE_CLIP_PROG,
-			 &key, sizeof(key),
-			 &brw->clip.prog_offset, &brw->clip.prog_data)) {
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_CLIP_PROG, &key, sizeof(key),
+                         &brw->clip.prog_offset, &brw->clip.prog_data, true)) {
       compile_clip_prog( brw, &key );
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index 5ce899b..5c8e3a5 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -167,7 +167,7 @@
 brw_dispatch_compute_common(struct gl_context *ctx)
 {
    struct brw_context *brw = brw_context(ctx);
-   bool fail_next = false;
+   bool fail_next;
 
    if (!_mesa_check_conditional_render(ctx))
       return;
@@ -182,9 +182,10 @@
    /* Flush the batch if the batch/state buffers are nearly full.  We can
     * grow them if needed, but this is not free, so we'd like to avoid it.
     */
-   intel_batchbuffer_require_space(brw, 600, RENDER_RING);
+   intel_batchbuffer_require_space(brw, 600);
    brw_require_statebuffer_space(brw, 2500);
    intel_batchbuffer_save_state(brw);
+   fail_next = intel_batchbuffer_saved_state_is_empty(brw);
 
  retry:
    brw->batch.no_wrap = true;
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 68259fe..ed9e9d7 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -340,9 +340,13 @@
    /* GL_ARB_get_program_binary */
    brw_program_binary_init(brw->screen->deviceID);
    functions->GetProgramBinaryDriverSHA1 = brw_get_program_binary_driver_sha1;
-   functions->ProgramBinarySerializeDriverBlob = brw_program_serialize_nir;
+   functions->ProgramBinarySerializeDriverBlob = brw_serialize_program_binary;
    functions->ProgramBinaryDeserializeDriverBlob =
       brw_deserialize_program_binary;
+
+   if (brw->screen->disk_cache) {
+      functions->ShaderCacheSerializeDriverBlob = brw_program_serialize_nir;
+   }
 }
 
 static void
@@ -363,6 +367,9 @@
    ctx->Const.SpirVCapabilities.draw_parameters = true;
    ctx->Const.SpirVCapabilities.image_write_without_format = true;
    ctx->Const.SpirVCapabilities.variable_pointers = true;
+   ctx->Const.SpirVCapabilities.atomic_storage = devinfo->gen >= 7;
+   ctx->Const.SpirVCapabilities.transform_feedback = devinfo->gen >= 7;
+   ctx->Const.SpirVCapabilities.geometry_streams = devinfo->gen >= 7;
 }
 
 static void
@@ -922,15 +929,6 @@
    brw->gs.base.stage = MESA_SHADER_GEOMETRY;
    brw->wm.base.stage = MESA_SHADER_FRAGMENT;
    brw->cs.base.stage = MESA_SHADER_COMPUTE;
-   if (devinfo->gen >= 8) {
-      brw->vtbl.emit_depth_stencil_hiz = gen8_emit_depth_stencil_hiz;
-   } else if (devinfo->gen >= 7) {
-      brw->vtbl.emit_depth_stencil_hiz = gen7_emit_depth_stencil_hiz;
-   } else if (devinfo->gen >= 6) {
-      brw->vtbl.emit_depth_stencil_hiz = gen6_emit_depth_stencil_hiz;
-   } else {
-      brw->vtbl.emit_depth_stencil_hiz = brw_emit_depth_stencil_hiz;
-   }
 
    brw_init_driver_functions(brw, &functions);
 
@@ -989,22 +987,21 @@
 
    intel_batchbuffer_init(brw);
 
-   if (devinfo->gen >= 6) {
-      /* Create a new hardware context.  Using a hardware context means that
-       * our GPU state will be saved/restored on context switch, allowing us
-       * to assume that the GPU is in the same state we left it in.
-       *
-       * This is required for transform feedback buffer offsets, query objects,
-       * and also allows us to reduce how much state we have to emit.
-       */
-      brw->hw_ctx = brw_create_hw_context(brw->bufmgr);
+   /* Create a new hardware context.  Using a hardware context means that
+    * our GPU state will be saved/restored on context switch, allowing us
+    * to assume that the GPU is in the same state we left it in.
+    *
+    * This is required for transform feedback buffer offsets, query objects,
+    * and also allows us to reduce how much state we have to emit.
+    */
+   brw->hw_ctx = brw_create_hw_context(brw->bufmgr);
+   if (!brw->hw_ctx && devinfo->gen >= 6) {
+      fprintf(stderr, "Failed to create hardware context.\n");
+      intelDestroyContext(driContextPriv);
+      return false;
+   }
 
-      if (!brw->hw_ctx) {
-         fprintf(stderr, "Failed to create hardware context.\n");
-         intelDestroyContext(driContextPriv);
-         return false;
-      }
-
+   if (brw->hw_ctx) {
       int hw_priority = GEN_CONTEXT_MEDIUM_PRIORITY;
       if (ctx_config->attribute_mask & __DRIVER_CONTEXT_ATTRIB_PRIORITY) {
          switch (ctx_config->priority) {
@@ -1698,9 +1695,18 @@
    if (last_mt && last_mt->bo == buffer->bo)
       return;
 
+   /* Only allow internal compression if samples == 0.  For multisampled
+    * window system buffers, the only thing the single-sampled buffer is used
+    * for is as a resolve target.  If we do any compression beyond what is
+    * supported by the window system, we will just have to resolve so it's
+    * probably better to just not bother.
+    */
+   const bool allow_internal_aux = (num_samples == 0);
+
    struct intel_mipmap_tree *mt =
       intel_miptree_create_for_dri_image(intel, buffer, GL_TEXTURE_2D,
-                                         intel_rb_format(rb), true);
+                                         intel_rb_format(rb),
+                                         allow_internal_aux);
    if (!mt)
       return;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 1ac88cb..4b62650 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -37,7 +37,6 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/errors.h"
-#include "vbo/vbo.h"
 #include "brw_structs.h"
 #include "brw_pipe_control.h"
 #include "compiler/brw_compiler.h"
@@ -48,6 +47,7 @@
 #include <brw_bufmgr.h>
 
 #include "common/gen_debug.h"
+#include "common/gen_decoder.h"
 #include "intel_screen.h"
 #include "intel_tex_obj.h"
 
@@ -431,6 +431,7 @@
    ST_GS,
    ST_FS8,
    ST_FS16,
+   ST_FS32,
    ST_CS,
 };
 
@@ -444,7 +445,8 @@
    GLuint step_rate;
 };
 struct brw_vertex_element {
-   const struct gl_vertex_array *glarray;
+   const struct gl_array_attributes *glattrib;
+   const struct gl_vertex_buffer_binding *glbinding;
 
    int buffer;
    bool is_dual_slot;
@@ -465,12 +467,6 @@
    bool flushed;
 };
 
-enum brw_gpu_ring {
-   UNKNOWN_RING,
-   RENDER_RING,
-   BLT_RING,
-};
-
 struct brw_reloc_list {
    struct drm_i915_gem_relocation_entry *relocs;
    int reloc_count;
@@ -483,6 +479,7 @@
    struct brw_bo *partial_bo;
    uint32_t *partial_bo_map;
    unsigned partial_bytes;
+   enum brw_memory_zone memzone;
 };
 
 struct intel_batchbuffer {
@@ -500,7 +497,6 @@
    uint32_t *map_next;
    uint32_t state_used;
 
-   enum brw_gpu_ring ring;
    bool use_shadow_copy;
    bool use_batch_first;
    bool needs_sol_reset;
@@ -529,6 +525,8 @@
 
    /** Map from batch offset to brw_state_batch data (with DEBUG_BATCH) */
    struct hash_table *state_batch_sizes;
+
+   struct gen_batch_decode_ctx decoder;
 };
 
 #define BRW_MAX_XFB_STREAMS 4
@@ -686,7 +684,8 @@
 
 enum brw_query_kind {
    OA_COUNTERS,
-   PIPELINE_STATS
+   OA_COUNTERS_RAW,
+   PIPELINE_STATS,
 };
 
 struct brw_perf_query_register_prog {
@@ -743,20 +742,6 @@
    struct
    {
       /**
-       * Send the appropriate state packets to configure depth, stencil, and
-       * HiZ buffers (i965+ only)
-       */
-      void (*emit_depth_stencil_hiz)(struct brw_context *brw,
-                                     struct intel_mipmap_tree *depth_mt,
-                                     uint32_t depth_offset,
-                                     uint32_t depthbuffer_format,
-                                     uint32_t depth_surface_type,
-                                     struct intel_mipmap_tree *stencil_mt,
-                                     bool hiz, bool separate_stencil,
-                                     uint32_t width, uint32_t height,
-                                     uint32_t tile_x, uint32_t tile_y);
-
-      /**
        * Emit an MI_REPORT_PERF_COUNT command packet.
        *
        * This asks the GPU to write a report of the current OA counter values
@@ -904,20 +889,35 @@
       } params;
 
       /**
-       * Buffer and offset used for GL_ARB_shader_draw_parameters
-       * (for now, only gl_BaseVertex).
+       * Buffer and offset used for GL_ARB_shader_draw_parameters which will
+       * point to the indirect buffer for indirect draw calls.
        */
       struct brw_bo *draw_params_bo;
       uint32_t draw_params_offset;
 
+      struct {
+         /**
+          * The value of gl_DrawID for the current _mesa_prim. This always comes
+          * in from its own vertex buffer since it's not part of the indirect
+          * draw parameters.
+          */
+         int gl_drawid;
+
+         /**
+          * Stores if the current _mesa_prim is an indexed or non-indexed draw
+          * (~0/0). Useful to calculate gl_BaseVertex as an AND of firstvertex
+          * and is_indexed_draw.
+          */
+         int is_indexed_draw;
+      } derived_params;
+
       /**
-       * The value of gl_DrawID for the current _mesa_prim. This always comes
-       * in from it's own vertex buffer since it's not part of the indirect
-       * draw parameters.
+       * Buffer and offset used for GL_ARB_shader_draw_parameters which contains
+       * parameters that are not present in the indirect buffer. They will go in
+       * their own vertex element.
        */
-      int gl_drawid;
-      struct brw_bo *draw_id_bo;
-      uint32_t draw_id_offset;
+      struct brw_bo *derived_draw_params_bo;
+      uint32_t derived_draw_params_offset;
 
       /**
        * Pointer to the the buffer storing the indirect draw parameters. It
@@ -966,8 +966,8 @@
        */
       uint8_t attrib_wa_flags[VERT_ATTRIB_MAX];
 
-      /* For the initial pushdown, keep the list of vbo inputs. */
-      struct vbo_inputs draw_arrays;
+      /* High bits of the last seen vertex buffer address (for workarounds). */
+      uint16_t last_bo_high_bits[33];
    } vb;
 
    struct {
@@ -988,6 +988,9 @@
        * referencing the same index buffer.
        */
       unsigned int start_vertex_offset;
+
+      /* High bits of the last seen index buffer address (for workarounds). */
+      uint16_t last_bo_high_bits;
    } ib;
 
    /* Active vertex program:
@@ -1645,6 +1648,9 @@
 brw_program_binary_init(unsigned device_id);
 extern void
 brw_get_program_binary_driver_sha1(struct gl_context *ctx, uint8_t *sha1);
+void brw_serialize_program_binary(struct gl_context *ctx,
+                                  struct gl_shader_program *sh_prog,
+                                  struct gl_program *prog);
 extern void
 brw_deserialize_program_binary(struct gl_context *ctx,
                                struct gl_shader_program *shProg,
@@ -1652,8 +1658,9 @@
 void
 brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog);
 void
-brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
-                            gl_shader_stage stage);
+brw_program_deserialize_driver_blob(struct gl_context *ctx,
+                                    struct gl_program *prog,
+                                    gl_shader_stage stage);
 
 /*======================================================================
  * Inline conversion functions.  These are better-typed than the
@@ -1700,45 +1707,6 @@
 void
 brw_emit_depthbuffer(struct brw_context *brw);
 
-void
-brw_emit_depth_stencil_hiz(struct brw_context *brw,
-                           struct intel_mipmap_tree *depth_mt,
-                           uint32_t depth_offset, uint32_t depthbuffer_format,
-                           uint32_t depth_surface_type,
-                           struct intel_mipmap_tree *stencil_mt,
-                           bool hiz, bool separate_stencil,
-                           uint32_t width, uint32_t height,
-                           uint32_t tile_x, uint32_t tile_y);
-
-void
-gen6_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset, uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y);
-
-void
-gen7_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset, uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y);
-void
-gen8_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset, uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y);
-
 uint32_t get_hw_prim_for_gl_prim(int mode);
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index e3f8fc6..498c80d 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -168,10 +168,9 @@
 
    brw_cs_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG,
-                        &key, sizeof(key),
-                        &brw->cs.base.prog_offset,
-                        &brw->cs.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG, &key, sizeof(key),
+                        &brw->cs.base.prog_offset, &brw->cs.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_COMPUTE))
@@ -184,6 +183,16 @@
    assert(success);
 }
 
+void
+brw_cs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_cs_prog_key *key,
+                            struct gl_program *prog)
+{
+   memset(key, 0, sizeof(*key));
+   key->program_string_id = brw_program(prog)->id;
+
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+}
 
 bool
 brw_cs_precompile(struct gl_context *ctx, struct gl_program *prog)
@@ -193,10 +202,7 @@
 
    struct brw_program *bcp = brw_program(prog);
 
-   memset(&key, 0, sizeof(key));
-   key.program_string_id = bcp->id;
-
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+   brw_cs_populate_default_key(&brw->screen->devinfo, &key, prog);
 
    uint32_t old_prog_offset = brw->cs.base.prog_offset;
    struct brw_stage_prog_data *old_prog_data = brw->cs.base.prog_data;
diff --git a/src/mesa/drivers/dri/i965/brw_cs.h b/src/mesa/drivers/dri/i965/brw_cs.h
index 60eb19c..669d4b5 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.h
+++ b/src/mesa/drivers/dri/i965/brw_cs.h
@@ -34,6 +34,10 @@
 
 void
 brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key);
+void
+brw_cs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_cs_prog_key *key,
+                            struct gl_program *prog);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 8bf6f68..855f1c7 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1656,6 +1656,18 @@
 #define CS_DEBUG_MODE2                     0x20d8 /* Gen9+ */
 # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
 
+#define GEN7_RPSTAT1                       0xA01C
+#define  GEN7_RPSTAT1_CURR_GT_FREQ_SHIFT   7
+#define  GEN7_RPSTAT1_CURR_GT_FREQ_MASK    INTEL_MASK(13, 7)
+#define  GEN7_RPSTAT1_PREV_GT_FREQ_SHIFT   0
+#define  GEN7_RPSTAT1_PREV_GT_FREQ_MASK    INTEL_MASK(6, 0)
+
+#define GEN9_RPSTAT0                       0xA01C
+#define  GEN9_RPSTAT0_CURR_GT_FREQ_SHIFT   23
+#define  GEN9_RPSTAT0_CURR_GT_FREQ_MASK    INTEL_MASK(31, 23)
+#define  GEN9_RPSTAT0_PREV_GT_FREQ_SHIFT   0
+#define  GEN9_RPSTAT0_PREV_GT_FREQ_MASK    INTEL_MASK(8, 0)
+
 #define SLICE_COMMON_ECO_CHICKEN1          0x731c /* Gen9+ */
 # define GLK_SCEC_BARRIER_MODE_GPGPU       (0 << 7)
 # define GLK_SCEC_BARRIER_MODE_3D_HULL     (1 << 7)
diff --git a/src/mesa/drivers/dri/i965/brw_disk_cache.c b/src/mesa/drivers/dri/i965/brw_disk_cache.c
index 36d7967..9a6f2ff 100644
--- a/src/mesa/drivers/dri/i965/brw_disk_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_disk_cache.c
@@ -73,51 +73,14 @@
    _mesa_sha1_compute(manifest, strlen(manifest), out_sha1);
 }
 
-static void
-write_blob_program_data(struct blob *binary, gl_shader_stage stage,
-                        const void *program,
-                        struct brw_stage_prog_data *prog_data)
-{
-   /* Write prog_data to blob. */
-   blob_write_bytes(binary, prog_data, brw_prog_data_size(stage));
-
-   /* Write program to blob. */
-   blob_write_bytes(binary, program, prog_data->program_size);
-
-   /* Write push params */
-   blob_write_bytes(binary, prog_data->param,
-                    sizeof(uint32_t) * prog_data->nr_params);
-
-   /* Write pull params */
-   blob_write_bytes(binary, prog_data->pull_param,
-                    sizeof(uint32_t) * prog_data->nr_pull_params);
-}
-
 static bool
 read_blob_program_data(struct blob_reader *binary, struct gl_program *prog,
                        gl_shader_stage stage, const uint8_t **program,
                        struct brw_stage_prog_data *prog_data)
 {
-   /* Read shader prog_data from blob. */
-   blob_copy_bytes(binary, prog_data, brw_prog_data_size(stage));
-   if (binary->overrun)
-      return false;
-
-   /* Read shader program from blob. */
-   *program = blob_read_bytes(binary, prog_data->program_size);
-
-   /* Read push params */
-   prog_data->param = rzalloc_array(NULL, uint32_t, prog_data->nr_params);
-   blob_copy_bytes(binary, prog_data->param,
-                   sizeof(uint32_t) * prog_data->nr_params);
-
-   /* Read pull params */
-   prog_data->pull_param = rzalloc_array(NULL, uint32_t,
-                                         prog_data->nr_pull_params);
-   blob_copy_bytes(binary, prog_data->pull_param,
-                   sizeof(uint32_t) * prog_data->nr_pull_params);
-
-   return (binary->current == binary->end && !binary->overrun);
+   return
+      brw_read_blob_program_data(binary, prog, stage, program, prog_data) &&
+      (binary->current == binary->end);
 }
 
 static bool
@@ -131,36 +94,32 @@
    switch (stage) {
    case MESA_SHADER_VERTEX:
       brw_vs_populate_key(brw, &prog_key.vs);
-      /* We don't care what instance of the program it is for the disk cache
-       * hash lookup, so set the id to 0 for the sha1 hashing.
-       * program_string_id will be set below.
-       */
-      prog_key.vs.program_string_id = 0;
       break;
    case MESA_SHADER_TESS_CTRL:
       brw_tcs_populate_key(brw, &prog_key.tcs);
-      prog_key.tcs.program_string_id = 0;
       break;
    case MESA_SHADER_TESS_EVAL:
       brw_tes_populate_key(brw, &prog_key.tes);
-      prog_key.tes.program_string_id = 0;
       break;
    case MESA_SHADER_GEOMETRY:
       brw_gs_populate_key(brw, &prog_key.gs);
-      prog_key.gs.program_string_id = 0;
       break;
    case MESA_SHADER_FRAGMENT:
       brw_wm_populate_key(brw, &prog_key.wm);
-      prog_key.wm.program_string_id = 0;
       break;
    case MESA_SHADER_COMPUTE:
       brw_cs_populate_key(brw, &prog_key.cs);
-      prog_key.cs.program_string_id = 0;
       break;
    default:
       unreachable("Unsupported stage!");
    }
 
+   /* We don't care what instance of the program it is for the disk cache hash
+    * lookup, so set the id to 0 for the sha1 hashing. program_string_id will
+    * be set below.
+    */
+   brw_prog_key_set_id(&prog_key, stage, 0);
+
    gen_shader_sha1(brw, prog, stage, &prog_key, binary_sha1);
 
    size_t buffer_size;
@@ -208,32 +167,26 @@
 
    switch (stage) {
    case MESA_SHADER_VERTEX:
-      prog_key.vs.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_VS_PROG;
       stage_state = &brw->vs.base;
       break;
    case MESA_SHADER_TESS_CTRL:
-      prog_key.tcs.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_TCS_PROG;
       stage_state = &brw->tcs.base;
       break;
    case MESA_SHADER_TESS_EVAL:
-      prog_key.tes.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_TES_PROG;
       stage_state = &brw->tes.base;
       break;
    case MESA_SHADER_GEOMETRY:
-      prog_key.gs.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_GS_PROG;
       stage_state = &brw->gs.base;
       break;
    case MESA_SHADER_FRAGMENT:
-      prog_key.wm.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_FS_PROG;
       stage_state = &brw->wm.base;
       break;
    case MESA_SHADER_COMPUTE:
-      prog_key.cs.program_string_id = brw_program(prog)->id;
       cache_id = BRW_CACHE_CS_PROG;
       stage_state = &brw->cs.base;
       break;
@@ -241,12 +194,14 @@
       unreachable("Unsupported stage!");
    }
 
+   brw_prog_key_set_id(&prog_key, stage, brw_program(prog)->id);
+
    brw_alloc_stage_scratch(brw, stage_state, prog_data->total_scratch);
 
    if (unlikely(debug_enabled_for_stage(stage))) {
       fprintf(stderr, "NIR for %s program %d loaded from disk shader cache:\n",
               _mesa_shader_stage_to_abbrev(stage), brw_program(prog)->id);
-      brw_program_deserialize_nir(&brw->ctx, prog, stage);
+      brw_program_deserialize_driver_blob(&brw->ctx, prog, stage);
       nir_shader *nir = prog->nir;
       nir_print_shader(nir, stderr);
       fprintf(stderr, "Native code for %s %s shader %s from disk cache:\n",
@@ -299,7 +254,7 @@
               _mesa_shader_stage_to_abbrev(prog->info.stage));
    }
 
-   brw_program_deserialize_nir(&brw->ctx, prog, stage);
+   brw_program_deserialize_driver_blob(&brw->ctx, prog, stage);
 
    return false;
 }
@@ -318,7 +273,7 @@
     * generation time when the program is in normal memory accessible with
     * cache to the CPU. Another easier change would be to use
     * _mesa_streaming_load_memcpy to read from the program mapped memory. */
-   write_blob_program_data(&binary, stage, program_map, prog_data);
+   brw_write_blob_program_data(&binary, stage, program_map, prog_data);
 
    unsigned char sha1[20];
    char buf[41];
@@ -425,10 +380,11 @@
    if (INTEL_DEBUG & DEBUG_DISK_CACHE_DISABLE_MASK)
       return;
 
-   char renderer[10];
+   /* array length: print length + null char + 1 extra to verify it is unused */
+   char renderer[11];
    MAYBE_UNUSED int len = snprintf(renderer, sizeof(renderer), "i965_%04x",
                                    screen->deviceID);
-   assert(len == sizeof(renderer) - 1);
+   assert(len == sizeof(renderer) - 2);
 
    const struct build_id_note *note =
       build_id_find_nhdr_for_addr(brw_disk_cache_init);
@@ -440,7 +396,8 @@
    char timestamp[41];
    _mesa_sha1_format(timestamp, id_sha1);
 
-   const uint64_t driver_flags = INTEL_DEBUG & DEBUG_DISK_CACHE_MASK;
+   const uint64_t driver_flags =
+      brw_get_compiler_config_value(screen->compiler);
    screen->disk_cache = disk_cache_create(renderer, timestamp, driver_flags);
 #endif
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 438d1db..19ee396 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -25,6 +25,7 @@
 
 #include <sys/errno.h>
 
+#include "main/arrayobj.h"
 #include "main/blend.h"
 #include "main/context.h"
 #include "main/condrender.h"
@@ -277,8 +278,7 @@
 
 
 static void
-brw_merge_inputs(struct brw_context *brw,
-                 const struct gl_vertex_array *arrays)
+brw_merge_inputs(struct brw_context *brw)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    const struct gl_context *ctx = &brw->ctx;
@@ -291,8 +291,10 @@
    brw->vb.nr_buffers = 0;
 
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      brw->vb.inputs[i].buffer = -1;
-      brw->vb.inputs[i].glarray = &arrays[i];
+      struct brw_vertex_element *input = &brw->vb.inputs[i];
+      input->buffer = -1;
+      _mesa_draw_attrib_and_binding(ctx, i,
+                                    &input->glattrib, &input->glbinding);
    }
 
    if (devinfo->gen < 8 && !devinfo->is_haswell) {
@@ -305,7 +307,7 @@
          uint8_t wa_flags = 0;
 
          i = u_bit_scan64(&mask);
-         glattrib = brw->vb.inputs[i].glarray->VertexAttrib;
+         glattrib = brw->vb.inputs[i].glattrib;
 
          switch (glattrib->Type) {
 
@@ -661,6 +663,9 @@
  * If the depth buffer was written to and if it has an accompanying HiZ
  * buffer, then mark that it needs a depth resolve.
  *
+ * If the stencil buffer was written to then mark that it may need to be
+ * copied to an R8 texture.
+ *
  * If the color buffer is a multisample window system buffer, then
  * mark that it needs a downsample.
  *
@@ -704,8 +709,15 @@
          brw_depth_cache_add_bo(brw, depth_irb->mt->bo);
    }
 
-   if (stencil_irb && brw->stencil_write_enabled)
-      brw_depth_cache_add_bo(brw, stencil_irb->mt->bo);
+   if (stencil_irb && brw->stencil_write_enabled) {
+      struct intel_mipmap_tree *stencil_mt =
+         stencil_irb->mt->stencil_mt != NULL ?
+         stencil_irb->mt->stencil_mt : stencil_irb->mt;
+      brw_depth_cache_add_bo(brw, stencil_mt->bo);
+      intel_miptree_finish_write(brw, stencil_mt, stencil_irb->mt_level,
+                                 stencil_irb->mt_layer,
+                                 stencil_irb->layer_count, ISL_AUX_USAGE_NONE);
+   }
 
    for (unsigned i = 0; i < fb->_NumColorDrawBuffers; i++) {
       struct intel_renderbuffer *irb =
@@ -779,7 +791,6 @@
 
 static void
 brw_prepare_drawing(struct gl_context *ctx,
-                    const struct gl_vertex_array *arrays,
                     const struct _mesa_index_buffer *ib,
                     bool index_bounds_valid,
                     GLuint min_index,
@@ -832,7 +843,7 @@
 
    /* Bind all inputs, derive varying and size information:
     */
-   brw_merge_inputs(brw, arrays);
+   brw_merge_inputs(brw);
 
    brw->ib.ib = ib;
    brw->ctx.NewDriverState |= BRW_NEW_INDICES;
@@ -866,7 +877,6 @@
  */
 static void
 brw_draw_single_prim(struct gl_context *ctx,
-                     const struct gl_vertex_array *arrays,
                      const struct _mesa_prim *prim,
                      unsigned prim_id,
                      struct brw_transform_feedback_object *xfb_obj,
@@ -875,7 +885,7 @@
 {
    struct brw_context *brw = brw_context(ctx);
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   bool fail_next = false;
+   bool fail_next;
 
    /* Flag BRW_NEW_DRAW_CALL on every draw.  This allows us to have
     * atoms that happen on every draw call.
@@ -885,9 +895,10 @@
    /* Flush the batch if the batch/state buffers are nearly full.  We can
     * grow them if needed, but this is not free, so we'd like to avoid it.
     */
-   intel_batchbuffer_require_space(brw, 1500, RENDER_RING);
+   intel_batchbuffer_require_space(brw, 1500);
    brw_require_statebuffer_space(brw, 2400);
    intel_batchbuffer_save_state(brw);
+   fail_next = intel_batchbuffer_saved_state_is_empty(brw);
 
    if (brw->num_instances != prim->num_instances ||
        brw->basevertex != prim->basevertex ||
@@ -897,7 +908,7 @@
       brw->baseinstance = prim->base_instance;
       if (prim_id > 0) { /* For i == 0 we just did this before the loop */
          brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
-         brw_merge_inputs(brw, arrays);
+         brw_merge_inputs(brw);
       }
    }
 
@@ -912,16 +923,12 @@
    const struct brw_vs_prog_data *vs_prog_data =
       brw_vs_prog_data(brw->vs.base.prog_data);
    if (prim_id > 0) {
-      const bool uses_firstvertex =
-         vs_prog_data->uses_basevertex ||
-         vs_prog_data->uses_firstvertex;
-
       const bool uses_draw_parameters =
-         uses_firstvertex ||
+         vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance;
 
       if ((uses_draw_parameters && prim->is_indirect) ||
-          (uses_firstvertex &&
+          (vs_prog_data->uses_firstvertex &&
            brw->draw.params.firstvertex != new_firstvertex) ||
           (vs_prog_data->uses_baseinstance &&
            brw->draw.params.gl_baseinstance != new_baseinstance))
@@ -948,17 +955,21 @@
    }
 
    /* gl_DrawID always needs its own vertex buffer since it's not part of
-    * the indirect parameter buffer. If the program uses gl_DrawID we need
-    * to flag BRW_NEW_VERTICES. For the first iteration, we don't have
-    * valid vs_prog_data, but we always flag BRW_NEW_VERTICES before
-    * the loop.
+    * the indirect parameter buffer. Same for is_indexed_draw, which shares
+    * the buffer with gl_DrawID. If the program uses gl_DrawID, we need to
+    * flag BRW_NEW_VERTICES. For the first iteration, we don't have valid
+    * vs_prog_data, but we always flag BRW_NEW_VERTICES before the loop.
     */
-   brw->draw.gl_drawid = prim->draw_id;
-   brw_bo_unreference(brw->draw.draw_id_bo);
-   brw->draw.draw_id_bo = NULL;
    if (prim_id > 0 && vs_prog_data->uses_drawid)
       brw->ctx.NewDriverState |= BRW_NEW_VERTICES;
 
+   brw->draw.derived_params.gl_drawid = prim->draw_id;
+   brw->draw.derived_params.is_indexed_draw = prim->indexed ? ~0 : 0;
+
+   brw_bo_unreference(brw->draw.derived_draw_params_bo);
+   brw->draw.derived_draw_params_bo = NULL;
+   brw->draw.derived_draw_params_offset = 0;
+
    if (devinfo->gen < 6)
       brw_set_prim(brw, prim);
    else
@@ -1004,20 +1015,6 @@
 }
 
 
-static bool
-all_varyings_in_vbos(const struct gl_vertex_array *arrays)
-{
-   GLuint i;
-
-   for (i = 0; i < VERT_ATTRIB_MAX; i++)
-      if (arrays[i].BufferBinding->Stride &&
-          arrays[i].BufferBinding->BufferObj->Name == 0)
-         return false;
-
-   return true;
-}
-
-
 
 void
 brw_draw_prims(struct gl_context *ctx,
@@ -1033,16 +1030,10 @@
 {
    unsigned i;
    struct brw_context *brw = brw_context(ctx);
-   const struct gl_vertex_array *arrays;
    int predicate_state = brw->predicate.state;
    struct brw_transform_feedback_object *xfb_obj =
       (struct brw_transform_feedback_object *) gl_xfb_obj;
 
-   /* The initial pushdown of the inputs array into the drivers */
-   _mesa_set_drawing_arrays(ctx, brw->vb.draw_arrays.inputs);
-   arrays = ctx->Array._DrawArrays;
-   _vbo_update_inputs(ctx, &brw->vb.draw_arrays);
-
    if (!brw_check_conditional_render(brw))
       return;
 
@@ -1069,15 +1060,14 @@
     * get the minimum and maximum of their index buffer so we know what range
     * to upload.
     */
-   if (!index_bounds_valid && !all_varyings_in_vbos(arrays)) {
+   if (!index_bounds_valid && _mesa_draw_user_array_bits(ctx) != 0) {
       perf_debug("Scanning index buffer to compute index buffer bounds.  "
                  "Use glDrawRangeElements() to avoid this.\n");
       vbo_get_minmax_indices(ctx, prims, ib, &min_index, &max_index, nr_prims);
       index_bounds_valid = true;
    }
 
-   brw_prepare_drawing(ctx, arrays, ib, index_bounds_valid, min_index,
-                       max_index);
+   brw_prepare_drawing(ctx, ib, index_bounds_valid, min_index, max_index);
    /* Try drawing with the hardware, but don't do anything else if we can't
     * manage it.  swrast doesn't support our featureset, so we can't fall back
     * to it.
@@ -1114,8 +1104,7 @@
          brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
       }
 
-      brw_draw_single_prim(ctx, arrays, &prims[i], i, xfb_obj, stream,
-                           indirect);
+      brw_draw_single_prim(ctx, &prims[i], i, xfb_obj, stream, indirect);
    }
 
    brw_finish_drawing(ctx);
@@ -1183,9 +1172,6 @@
 void
 brw_draw_init(struct brw_context *brw)
 {
-   /* Keep our list of gl_vertex_array inputs */
-   _vbo_init_inputs(&brw->vb.draw_arrays);
-
    for (int i = 0; i < VERT_ATTRIB_MAX; i++)
       brw->vb.inputs[i].buffer = -1;
    brw->vb.nr_buffers = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 7573f78..bc9b256 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -23,6 +23,7 @@
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
+#include "main/arrayobj.h"
 #include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/enums.h"
@@ -403,9 +404,8 @@
 			struct brw_vertex_buffer *buffer,
 			GLuint dst_stride)
 {
-   const struct gl_vertex_array *glarray = element->glarray;
-   const struct gl_vertex_buffer_binding *glbinding = glarray->BufferBinding;
-   const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+   const struct gl_vertex_buffer_binding *glbinding = element->glbinding;
+   const struct gl_array_attributes *glattrib = element->glattrib;
    const int src_stride = glbinding->Stride;
 
    /* If the source stride is zero, we just want to upload the current
@@ -512,15 +512,15 @@
 
    for (i = j = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
-      const struct gl_vertex_array *glarray = input->glarray;
-      const struct gl_vertex_buffer_binding *glbinding = glarray->BufferBinding;
-      const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+      const struct gl_vertex_buffer_binding *glbinding = input->glbinding;
+      const struct gl_array_attributes *glattrib = input->glattrib;
 
       if (_mesa_is_bufferobj(glbinding->BufferObj)) {
 	 struct intel_buffer_object *intel_buffer =
 	    intel_buffer_object(glbinding->BufferObj);
 
-         const uint32_t offset = glbinding->Offset + glattrib->RelativeOffset;
+         const uint32_t offset = _mesa_draw_binding_offset(glbinding) +
+            _mesa_draw_attributes_relative_offset(glattrib);
 
          /* Start with the worst case */
          uint32_t start = 0;
@@ -546,10 +546,11 @@
 	  */
 	 unsigned k;
 	 for (k = 0; k < i; k++) {
-	    const struct gl_vertex_array *other = brw->vb.enabled[k]->glarray;
-            const struct gl_vertex_buffer_binding *obind = other->BufferBinding;
-            const struct gl_array_attributes *oattrib = other->VertexAttrib;
-            const uint32_t ooffset = obind->Offset + oattrib->RelativeOffset;
+            struct brw_vertex_element *other = brw->vb.enabled[k];
+            const struct gl_vertex_buffer_binding *obind = other->glbinding;
+            const struct gl_array_attributes *oattrib = other->glattrib;
+            const uint32_t ooffset = _mesa_draw_binding_offset(obind) +
+               _mesa_draw_attributes_relative_offset(oattrib);
 	    if (glbinding->BufferObj == obind->BufferObj &&
 		glbinding->Stride == obind->Stride &&
 		glbinding->InstanceDivisor == obind->InstanceDivisor &&
@@ -658,8 +659,7 @@
          buffer->step_rate = 0;
 
 	 for (i = 0; i < nr_uploads; i++) {
-            const struct gl_vertex_array *glarray = upload[i]->glarray;
-            const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+            const struct gl_array_attributes *glattrib = upload[i]->glattrib;
 	    /* Then, just point upload[i] at upload[0]'s buffer. */
             upload[i]->offset = ((const unsigned char *)glattrib->Ptr - ptr);
 	    upload[i]->buffer = j;
@@ -672,9 +672,8 @@
    /* Upload non-interleaved arrays */
    for (i = 0; i < nr_uploads; i++) {
       struct brw_vertex_buffer *buffer = &brw->vb.buffers[j];
-      const struct gl_vertex_array *glarray = upload[i]->glarray;
-      const struct gl_vertex_buffer_binding *glbinding = glarray->BufferBinding;
-      const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+      const struct gl_vertex_buffer_binding *glbinding = upload[i]->glbinding;
+      const struct gl_array_attributes *glattrib = upload[i]->glattrib;
       if (glbinding->InstanceDivisor == 0) {
          copy_array_to_vbo_array(brw, upload[i], min_index, max_index,
                                  buffer, glattrib->_ElementSize);
@@ -704,11 +703,8 @@
    const struct brw_vs_prog_data *vs_prog_data =
       brw_vs_prog_data(brw->vs.base.prog_data);
 
-   const bool uses_firstvertex =
-      vs_prog_data->uses_basevertex || vs_prog_data->uses_firstvertex;
-
    /* For non-indirect draws, upload the shader draw parameters */
-   if ((uses_firstvertex || vs_prog_data->uses_baseinstance) &&
+   if ((vs_prog_data->uses_firstvertex || vs_prog_data->uses_baseinstance) &&
        brw->draw.draw_params_bo == NULL) {
       brw_upload_data(&brw->upload,
                       &brw->draw.params, sizeof(brw->draw.params), 4,
@@ -716,11 +712,11 @@
                       &brw->draw.draw_params_offset);
    }
 
-   if (vs_prog_data->uses_drawid) {
+   if (vs_prog_data->uses_drawid || vs_prog_data->uses_is_indexed_draw) {
       brw_upload_data(&brw->upload,
-                      &brw->draw.gl_drawid, sizeof(brw->draw.gl_drawid), 4,
-                      &brw->draw.draw_id_bo,
-                      &brw->draw.draw_id_offset);
+                      &brw->draw.derived_params, sizeof(brw->draw.derived_params), 4,
+                      &brw->draw.derived_draw_params_bo,
+                      &brw->draw.derived_draw_params_offset);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs.c b/src/mesa/drivers/dri/i965/brw_ff_gs.c
index 174418a..b2c4ee1 100644
--- a/src/mesa/drivers/dri/i965/brw_ff_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_ff_gs.c
@@ -251,9 +251,9 @@
    }
 
    if (brw->ff_gs.prog_active) {
-      if (!brw_search_cache(&brw->cache, BRW_CACHE_FF_GS_PROG,
-			    &key, sizeof(key),
-			    &brw->ff_gs.prog_offset, &brw->ff_gs.prog_data)) {
+      if (!brw_search_cache(&brw->cache, BRW_CACHE_FF_GS_PROG, &key,
+                            sizeof(key), &brw->ff_gs.prog_offset,
+                            &brw->ff_gs.prog_data, true)) {
          brw_codegen_ff_gs_prog(brw, &key);
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 91c7168..7263f63 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -192,10 +192,9 @@
 
    brw_gs_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_GS_PROG,
-                        &key, sizeof(key),
-                        &stage_state->prog_offset,
-                        &brw->gs.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_GS_PROG, &key, sizeof(key),
+                        &stage_state->prog_offset, &brw->gs.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_GEOMETRY))
@@ -208,6 +207,17 @@
    assert(success);
 }
 
+void
+brw_gs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_gs_prog_key *key,
+                            struct gl_program *prog)
+{
+   memset(key, 0, sizeof(*key));
+
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+   key->program_string_id = brw_program(prog)->id;
+}
+
 bool
 brw_gs_precompile(struct gl_context *ctx, struct gl_program *prog)
 {
@@ -219,10 +229,7 @@
 
    struct brw_program *bgp = brw_program(prog);
 
-   memset(&key, 0, sizeof(key));
-
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
-   key.program_string_id = bgp->id;
+   brw_gs_populate_default_key(&brw->screen->devinfo, &key, prog);
 
    success = brw_codegen_gs_prog(brw, bgp, &key);
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index 537a416..cff994a 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -40,6 +40,10 @@
 void
 brw_gs_populate_key(struct brw_context *brw,
                     struct brw_gs_prog_key *key);
+void
+brw_gs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_gs_prog_key *key,
+                            struct gl_program *prog);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index ff087f8..0723255 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -24,11 +24,14 @@
 #include "brw_context.h"
 #include "compiler/brw_nir.h"
 #include "brw_program.h"
+#include "compiler/glsl/gl_nir.h"
+#include "compiler/glsl/gl_nir_linker.h"
 #include "compiler/glsl/ir.h"
 #include "compiler/glsl/ir_optimization.h"
 #include "compiler/glsl/program.h"
 #include "compiler/nir/nir_serialize.h"
 #include "program/program.h"
+#include "main/glspirv.h"
 #include "main/mtypes.h"
 #include "main/shaderapi.h"
 #include "main/shaderobj.h"
@@ -243,7 +246,6 @@
       _mesa_copy_linked_program_data(shProg, shader);
 
       prog->ShadowSamplers = shader->shadow_samplers;
-      _mesa_update_shader_textures_used(shProg, prog);
 
       bool debug_enabled =
          (INTEL_DEBUG & intel_debug_flag_for_shader_stage(shader->Stage));
@@ -259,6 +261,15 @@
                                  compiler->scalar_stage[stage]);
    }
 
+   /* SPIR-V programs use a NIR linker */
+   if (shProg->data->spirv) {
+      if (!gl_nir_link_uniforms(ctx, shProg))
+         return false;
+
+      gl_nir_link_assign_atomic_counter_resources(ctx, shProg);
+      gl_nir_link_assign_xfb_resources(ctx, shProg);
+   }
+
    /* Determine first and last stage. */
    unsigned first = MESA_SHADER_STAGES;
    unsigned last = 0;
@@ -297,10 +308,13 @@
          continue;
 
       struct gl_program *prog = shader->Program;
+
+      _mesa_update_shader_textures_used(shProg, prog);
+
       brw_shader_gather_info(prog->nir, prog);
 
-      NIR_PASS_V(prog->nir, nir_lower_samplers, shProg);
-      NIR_PASS_V(prog->nir, nir_lower_atomics, shProg, false);
+      NIR_PASS_V(prog->nir, gl_nir_lower_samplers, shProg);
+      NIR_PASS_V(prog->nir, gl_nir_lower_atomics, shProg, false);
       NIR_PASS_V(prog->nir, nir_lower_atomics_to_ssbo,
                  prog->nir->info.num_abos);
 
@@ -318,13 +332,10 @@
        * get sent to the shader.
        */
       nir_foreach_variable(var, &prog->nir->uniforms) {
-         if (strncmp(var->name, "gl_", 3) == 0) {
-            const nir_state_slot *const slots = var->state_slots;
-            assert(var->state_slots != NULL);
-
-            for (unsigned int i = 0; i < var->num_state_slots; i++) {
-               _mesa_add_state_reference(prog->Parameters, slots[i].tokens);
-            }
+         const nir_state_slot *const slots = var->state_slots;
+         for (unsigned int i = 0; i < var->num_state_slots; i++) {
+            assert(slots != NULL);
+            _mesa_add_state_reference(prog->Parameters, slots[i].tokens);
          }
       }
    }
@@ -351,21 +362,14 @@
       }
    }
 
-   if (brw->ctx.Cache) {
-      for (stage = 0; stage < ARRAY_SIZE(shProg->_LinkedShaders); stage++) {
-         struct gl_linked_shader *shader = shProg->_LinkedShaders[stage];
-         if (!shader)
-            continue;
-
-         struct gl_program *prog = shader->Program;
-         brw_program_serialize_nir(ctx, prog);
-      }
-   }
-
    if (brw->precompile && !brw_shader_precompile(ctx, shProg))
       return false;
 
-   build_program_resource_list(ctx, shProg);
+   /* SPIR-V programs build their resource list from linked NIR shaders. */
+   if (!shProg->data->spirv)
+      build_program_resource_list(ctx, shProg);
+   else
+      nir_build_program_resource_list(ctx, shProg);
 
    for (stage = 0; stage < ARRAY_SIZE(shProg->_LinkedShaders); stage++) {
       struct gl_linked_shader *shader = shProg->_LinkedShaders[stage];
diff --git a/src/mesa/drivers/dri/i965/brw_meta_util.c b/src/mesa/drivers/dri/i965/brw_meta_util.c
index b311815..908b098 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_util.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_util.c
@@ -250,13 +250,13 @@
    /* Account for the fact that in the system framebuffer, the origin is at
     * the lower left.
     */
-   if (_mesa_is_winsys_fbo(read_fb)) {
+   if (read_fb->FlipY) {
       GLint tmp = read_fb->Height - *srcY0;
       *srcY0 = read_fb->Height - *srcY1;
       *srcY1 = tmp;
       *mirror_y = !*mirror_y;
    }
-   if (_mesa_is_winsys_fbo(draw_fb)) {
+   if (draw_fb->FlipY) {
       GLint tmp = draw_fb->Height - *dstY0;
       *dstY0 = draw_fb->Height - *dstY1;
       *dstY1 = tmp;
@@ -293,18 +293,7 @@
        brw->mesa_to_isl_render_format[mt->format])
       return false;
 
-   const bool srgb_rb = _mesa_get_srgb_format_linear(mt->format) != mt->format;
-  /* Gen10 doesn't automatically decode the clear color of sRGB buffers. Since
-   * we currently don't perform this decode in software, avoid a fast-clear
-   * altogether. TODO: Do this in software.
-   */
    const mesa_format format = _mesa_get_render_format(ctx, mt->format);
-   if (devinfo->gen >= 10 && srgb_rb) {
-      perf_debug("sRGB fast clear not enabled for (%s)",
-                 _mesa_get_format_name(format));
-      return false;
-   }
-
    if (_mesa_is_format_integer_color(format)) {
       if (devinfo->gen >= 8) {
          perf_debug("Integer fast clear not enabled for (%s)",
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 05517eb..5cf704f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -254,127 +254,38 @@
        rebase_depth_stencil(brw, stencil_irb, invalidate_stencil);
 }
 
-void
-brw_emit_depthbuffer(struct brw_context *brw)
+static void
+brw_emit_depth_stencil_hiz(struct brw_context *brw,
+                           struct intel_renderbuffer *depth_irb,
+                           struct intel_mipmap_tree *depth_mt,
+                           struct intel_renderbuffer *stencil_irb,
+                           struct intel_mipmap_tree *stencil_mt)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gl_context *ctx = &brw->ctx;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   /* _NEW_BUFFERS */
-   struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-   struct intel_mipmap_tree *depth_mt = intel_renderbuffer_get_mt(depth_irb);
-   struct intel_mipmap_tree *stencil_mt = get_stencil_miptree(stencil_irb);
    uint32_t tile_x = brw->depthstencil.tile_x;
    uint32_t tile_y = brw->depthstencil.tile_y;
-   bool hiz = depth_irb && intel_renderbuffer_has_hiz(depth_irb);
-   bool separate_stencil = false;
    uint32_t depth_surface_type = BRW_SURFACE_NULL;
    uint32_t depthbuffer_format = BRW_DEPTHFORMAT_D32_FLOAT;
    uint32_t depth_offset = 0;
    uint32_t width = 1, height = 1;
-
-   if (stencil_mt) {
-      separate_stencil = stencil_mt->format == MESA_FORMAT_S_UINT8;
-
-      /* Gen7 supports only separate stencil */
-      assert(separate_stencil || devinfo->gen < 7);
-   }
+   bool tiled_surface = true;
 
    /* If there's a packed depth/stencil bound to stencil only, we need to
     * emit the packed depth/stencil buffer packet.
     */
-   if (!depth_irb && stencil_irb && !separate_stencil) {
+   if (!depth_irb && stencil_irb) {
       depth_irb = stencil_irb;
       depth_mt = stencil_mt;
    }
 
    if (depth_irb && depth_mt) {
-      /* When 3DSTATE_DEPTH_BUFFER.Separate_Stencil_Enable is set, then
-       * 3DSTATE_DEPTH_BUFFER.Surface_Format is not permitted to be a packed
-       * depthstencil format.
-       *
-       * Gens prior to 7 require that HiZ_Enable and Separate_Stencil_Enable be
-       * set to the same value. Gens after 7 implicitly always set
-       * Separate_Stencil_Enable; software cannot disable it.
-       */
-      if ((devinfo->gen < 7 && hiz) || devinfo->gen >= 7) {
-         assert(!_mesa_is_format_packed_depth_stencil(depth_mt->format));
-      }
-
-      /* Prior to Gen7, if using separate stencil, hiz must be enabled. */
-      assert(devinfo->gen >= 7 || !separate_stencil || hiz);
-
-      assert(devinfo->gen < 6 || depth_mt->surf.tiling == ISL_TILING_Y0);
-      assert(!hiz || depth_mt->surf.tiling == ISL_TILING_Y0);
-
       depthbuffer_format = brw_depthbuffer_format(brw);
       depth_surface_type = BRW_SURFACE_2D;
       depth_offset = brw->depthstencil.depth_offset;
       width = depth_irb->Base.Base.Width;
       height = depth_irb->Base.Base.Height;
-   } else if (separate_stencil) {
-      /*
-       * There exists a separate stencil buffer but no depth buffer.
-       *
-       * The stencil buffer inherits most of its fields from
-       * 3DSTATE_DEPTH_BUFFER: namely the tile walk, surface type, width, and
-       * height.
-       *
-       * The tiled bit must be set. From the Sandybridge PRM, Volume 2, Part 1,
-       * Section 7.5.5.1.1 3DSTATE_DEPTH_BUFFER, Bit 1.27 Tiled Surface:
-       *     [DevGT+]: This field must be set to TRUE.
-       */
-      assert(brw->has_separate_stencil);
-
-      depth_surface_type = BRW_SURFACE_2D;
-      width = stencil_irb->Base.Base.Width;
-      height = stencil_irb->Base.Base.Height;
+      tiled_surface = depth_mt->surf.tiling != ISL_TILING_LINEAR;
    }
 
-   if (depth_mt)
-      brw_cache_flush_for_depth(brw, depth_mt->bo);
-   if (stencil_mt)
-      brw_cache_flush_for_depth(brw, stencil_mt->bo);
-
-   brw->vtbl.emit_depth_stencil_hiz(brw, depth_mt, depth_offset,
-                                    depthbuffer_format, depth_surface_type,
-                                    stencil_mt, hiz, separate_stencil,
-                                    width, height, tile_x, tile_y);
-}
-
-uint32_t
-brw_convert_depth_value(mesa_format format, float value)
-{
-   switch (format) {
-   case MESA_FORMAT_Z_FLOAT32:
-      return float_as_int(value);
-   case MESA_FORMAT_Z_UNORM16:
-      return value * ((1u << 16) - 1);
-   case MESA_FORMAT_Z24_UNORM_X8_UINT:
-      return value * ((1u << 24) - 1);
-   default:
-      unreachable("Invalid depth format");
-   }
-}
-
-void
-brw_emit_depth_stencil_hiz(struct brw_context *brw,
-                           struct intel_mipmap_tree *depth_mt,
-                           uint32_t depth_offset, uint32_t depthbuffer_format,
-                           uint32_t depth_surface_type,
-                           struct intel_mipmap_tree *stencil_mt,
-                           bool hiz, bool separate_stencil,
-                           uint32_t width, uint32_t height,
-                           uint32_t tile_x, uint32_t tile_y)
-{
-   (void)hiz;
-   (void)separate_stencil;
-   (void)stencil_mt;
-
-   assert(!hiz);
-   assert(!separate_stencil);
-
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
    const unsigned len = (devinfo->is_g4x || devinfo->gen == 5) ? 6 : 5;
 
@@ -383,7 +294,7 @@
    OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
              (depthbuffer_format << 18) |
              (BRW_TILEWALK_YMAJOR << 26) |
-             (1 << 27) |
+             (tiled_surface << 27) |
              (depth_surface_type << 29));
 
    if (depth_mt) {
@@ -407,10 +318,148 @@
    ADVANCE_BATCH();
 }
 
+void
+brw_emit_depthbuffer(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct gl_context *ctx = &brw->ctx;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   /* _NEW_BUFFERS */
+   struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
+   struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
+   struct intel_mipmap_tree *depth_mt = intel_renderbuffer_get_mt(depth_irb);
+   struct intel_mipmap_tree *stencil_mt = get_stencil_miptree(stencil_irb);
+
+   if (depth_mt)
+      brw_cache_flush_for_depth(brw, depth_mt->bo);
+   if (stencil_mt)
+      brw_cache_flush_for_depth(brw, stencil_mt->bo);
+
+   if (devinfo->gen < 6) {
+      brw_emit_depth_stencil_hiz(brw, depth_irb, depth_mt,
+                                 stencil_irb, stencil_mt);
+      return;
+   }
+
+   /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
+   if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) {
+      assert(brw->hw_ctx);
+      return;
+   }
+
+   brw_emit_depth_stall_flushes(brw);
+
+   const unsigned ds_dwords = brw->isl_dev.ds.size / 4;
+   intel_batchbuffer_begin(brw, ds_dwords);
+   uint32_t *ds_map = brw->batch.map_next;
+   const uint32_t ds_offset = (char *)ds_map - (char *)brw->batch.batch.map;
+
+   struct isl_view view = {
+      /* Some nice defaults */
+      .base_level = 0,
+      .levels = 1,
+      .base_array_layer = 0,
+      .array_len = 1,
+      .swizzle = ISL_SWIZZLE_IDENTITY,
+   };
+
+   struct isl_depth_stencil_hiz_emit_info info = {
+      .view = &view,
+   };
+
+   if (depth_mt) {
+      view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
+      info.depth_surf = &depth_mt->surf;
+
+      info.depth_address =
+         brw_batch_reloc(&brw->batch,
+                         ds_offset + brw->isl_dev.ds.depth_offset,
+                         depth_mt->bo, depth_mt->offset, RELOC_WRITE);
+
+      info.mocs = brw_get_bo_mocs(devinfo, depth_mt->bo);
+      view.base_level = depth_irb->mt_level - depth_irb->mt->first_level;
+      view.base_array_layer = depth_irb->mt_layer;
+      view.array_len = MAX2(depth_irb->layer_count, 1);
+      view.format = depth_mt->surf.format;
+
+      info.hiz_usage = depth_mt->aux_usage;
+      if (!intel_renderbuffer_has_hiz(depth_irb)) {
+         /* Just because a miptree has ISL_AUX_USAGE_HIZ does not mean that
+          * all miplevels of that miptree are guaranteed to support HiZ.  See
+          * intel_miptree_level_enable_hiz for details.
+          */
+         info.hiz_usage = ISL_AUX_USAGE_NONE;
+      }
+
+      if (info.hiz_usage == ISL_AUX_USAGE_HIZ) {
+         info.hiz_surf = &depth_mt->aux_buf->surf;
+
+         uint32_t hiz_offset = 0;
+         if (devinfo->gen == 6) {
+            /* HiZ surfaces on Sandy Bridge technically don't support
+             * mip-mapping.  However, we can fake it by offsetting to the
+             * first slice of LOD0 in the HiZ surface.
+             */
+            isl_surf_get_image_offset_B_tile_sa(&depth_mt->aux_buf->surf,
+                                                view.base_level, 0, 0,
+                                                &hiz_offset, NULL, NULL);
+         }
+
+         info.hiz_address =
+            brw_batch_reloc(&brw->batch,
+                            ds_offset + brw->isl_dev.ds.hiz_offset,
+                            depth_mt->aux_buf->bo,
+                            depth_mt->aux_buf->offset + hiz_offset,
+                            RELOC_WRITE);
+      }
+
+      info.depth_clear_value = depth_mt->fast_clear_color.f32[0];
+   }
+
+   if (stencil_mt) {
+      view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
+      info.stencil_surf = &stencil_mt->surf;
+
+      if (!depth_mt) {
+         info.mocs = brw_get_bo_mocs(devinfo, stencil_mt->bo);
+         view.base_level = stencil_irb->mt_level - stencil_irb->mt->first_level;
+         view.base_array_layer = stencil_irb->mt_layer;
+         view.array_len = MAX2(stencil_irb->layer_count, 1);
+         view.format = stencil_mt->surf.format;
+      }
+
+      uint32_t stencil_offset = 0;
+      if (devinfo->gen == 6) {
+         /* Stencil surfaces on Sandy Bridge technically don't support
+          * mip-mapping.  However, we can fake it by offsetting to the
+          * first slice of LOD0 in the stencil surface.
+          */
+         isl_surf_get_image_offset_B_tile_sa(&stencil_mt->surf,
+                                             view.base_level, 0, 0,
+                                             &stencil_offset, NULL, NULL);
+      }
+
+      info.stencil_address =
+         brw_batch_reloc(&brw->batch,
+                         ds_offset + brw->isl_dev.ds.stencil_offset,
+                         stencil_mt->bo,
+                         stencil_mt->offset + stencil_offset,
+                         RELOC_WRITE);
+   }
+
+   isl_emit_depth_stencil_hiz_s(&brw->isl_dev, ds_map, &info);
+
+   brw->batch.map_next += ds_dwords;
+   intel_batchbuffer_advance(brw);
+
+   brw->no_depth_or_stencil = !depth_mt && !stencil_mt;
+}
+
 const struct brw_tracked_state brw_depthbuffer = {
    .dirty = {
       .mesa = _NEW_BUFFERS,
-      .brw = BRW_NEW_BATCH |
+      .brw = BRW_NEW_AUX_STATE |
+             BRW_NEW_BATCH |
              BRW_NEW_BLORP,
    },
    .emit = brw_emit_depthbuffer,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index 69da83a..54f9f9b 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -118,35 +118,59 @@
    }
 }
 
+static unsigned
+count_uniform_storage_slots(const struct glsl_type *type)
+{
+   /* gl_uniform_storage can cope with one level of array, so if the
+    * type is a composite type or an array where each element occupies
+    * more than one slot then we need to recursively process it.
+    */
+   if (glsl_type_is_struct(type)) {
+      unsigned location_count = 0;
+
+      for (unsigned i = 0; i < glsl_get_length(type); i++) {
+         const struct glsl_type *field_type = glsl_get_struct_field(type, i);
+
+         location_count += count_uniform_storage_slots(field_type);
+      }
+
+      return location_count;
+   }
+
+   if (glsl_type_is_array(type)) {
+      const struct glsl_type *element_type = glsl_get_array_element(type);
+
+      if (glsl_type_is_array(element_type) ||
+          glsl_type_is_struct(element_type)) {
+         unsigned element_count = count_uniform_storage_slots(element_type);
+         return element_count * glsl_get_length(type);
+      }
+   }
+
+   return 1;
+}
+
 static void
 brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
                            const struct gl_program *prog,
                            struct brw_stage_prog_data *stage_prog_data,
                            bool is_scalar)
 {
-   int namelen = strlen(var->name);
-
    /* The data for our (non-builtin) uniforms is stored in a series of
     * gl_uniform_storage structs for each subcomponent that
     * glGetUniformLocation() could name.  We know it's been set up in the same
-    * order we'd walk the type, so walk the list of storage and find anything
-    * with our name, or the prefix of a component that starts with our name.
+    * order we'd walk the type, so walk the list of storage that matches the
+    * range of slots covered by this variable.
     */
    unsigned uniform_index = var->data.driver_location / 4;
-   for (unsigned u = 0; u < prog->sh.data->NumUniformStorage; u++) {
+   unsigned num_slots = count_uniform_storage_slots(var->type);
+   for (unsigned u = 0; u < num_slots; u++) {
       struct gl_uniform_storage *storage =
-         &prog->sh.data->UniformStorage[u];
+         &prog->sh.data->UniformStorage[var->data.location + u];
 
       if (storage->builtin || storage->type->is_sampler())
          continue;
 
-      if (strncmp(var->name, storage->name, namelen) != 0 ||
-          (storage->name[namelen] != 0 &&
-           storage->name[namelen] != '.' &&
-           storage->name[namelen] != '[')) {
-         continue;
-      }
-
       if (storage->type->is_image()) {
          brw_setup_image_uniform_values(stage, stage_prog_data,
                                         uniform_index, storage);
@@ -202,7 +226,7 @@
       if (var->interface_type != NULL || var->type->contains_atomic())
          continue;
 
-      if (strncmp(var->name, "gl_", 3) == 0) {
+      if (var->num_state_slots > 0) {
          brw_nir_setup_glsl_builtin_uniform(var, prog, stage_prog_data,
                                             is_scalar);
       } else {
@@ -243,29 +267,3 @@
          stage_prog_data->param[4 * p + i] = BRW_PARAM_BUILTIN_ZERO;
    }
 }
-
-void
-brw_nir_lower_patch_vertices_in_to_uniform(nir_shader *nir)
-{
-   nir_foreach_variable_safe(var, &nir->system_values) {
-      if (var->data.location != SYSTEM_VALUE_VERTICES_IN)
-         continue;
-
-      gl_state_index16 tokens[STATE_LENGTH] = {
-         STATE_INTERNAL,
-         nir->info.stage == MESA_SHADER_TESS_CTRL ?
-            (gl_state_index16)STATE_TCS_PATCH_VERTICES_IN :
-            (gl_state_index16)STATE_TES_PATCH_VERTICES_IN,
-      };
-      var->num_state_slots = 1;
-      var->state_slots =
-         ralloc_array(var, nir_state_slot, var->num_state_slots);
-      memcpy(var->state_slots[0].tokens, tokens, sizeof(tokens));
-      var->state_slots[0].swizzle = SWIZZLE_XXXX;
-
-      var->data.mode = nir_var_uniform;
-      var->data.location = -1;
-      exec_node_remove(&var->node);
-      exec_list_push_tail(&nir->uniforms, &var->node);
-   }
-}
diff --git a/src/mesa/drivers/dri/i965/brw_oa.py b/src/mesa/drivers/dri/i965/brw_oa.py
index 7bf7987..7538255 100644
--- a/src/mesa/drivers/dri/i965/brw_oa.py
+++ b/src/mesa/drivers/dri/i965/brw_oa.py
@@ -629,7 +629,7 @@
 
     c(textwrap.dedent("""\
         #include "brw_context.h"
-        #include "brw_performance_query.h"
+        #include "brw_performance_query_metrics.h"
 
 
         #define MIN(a, b) ((a < b) ? (a) : (b))
@@ -653,7 +653,7 @@
 
             c("\n")
             register_lengths = compute_register_lengths(set);
-            for reg_type, reg_length in register_lengths.iteritems():
+            for reg_type, reg_length in register_lengths.items():
                 c("static struct brw_perf_query_register_prog {0}_{1}_{2}[{3}];".format(gen.chipset,
                                                                                         set.underscore_name,
                                                                                         reg_type, reg_length))
@@ -692,7 +692,7 @@
                     .c_offset = 46,
                 """))
 
-            for reg_type, reg_length in register_lengths.iteritems():
+            for reg_type, reg_length in register_lengths.items():
                 c(".{0} = {1}_{2}_{3},".format(reg_type, gen.chipset, set.underscore_name, reg_type))
                 c(".n_{0} = 0, /* Determined at runtime */".format(reg_type))
 
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.c b/src/mesa/drivers/dri/i965/brw_performance_query.c
index ef5401a..d45529f 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.c
@@ -76,15 +76,6 @@
 
 #define FILE_DEBUG_FLAG DEBUG_PERFMON
 
-/*
- * The largest OA formats we can use include:
- * For Haswell:
- *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
- * For Gen8+
- *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
- */
-#define MAX_OA_REPORT_COUNTERS 62
-
 #define OAREPORT_REASON_MASK           0x3f
 #define OAREPORT_REASON_SHIFT          19
 #define OAREPORT_REASON_TIMER          (1<<0)
@@ -216,85 +207,6 @@
    uint32_t last_timestamp;
 };
 
-/**
- * i965 representation of a performance query object.
- *
- * NB: We want to keep this structure relatively lean considering that
- * applications may expect to allocate enough objects to be able to
- * query around all draw calls in a frame.
- */
-struct brw_perf_query_object
-{
-   struct gl_perf_query_object base;
-
-   const struct brw_perf_query_info *query;
-
-   /* See query->kind to know which state below is in use... */
-   union {
-      struct {
-
-         /**
-          * BO containing OA counter snapshots at query Begin/End time.
-          */
-         struct brw_bo *bo;
-
-         /**
-          * Address of mapped of @bo
-          */
-         void *map;
-
-         /**
-          * The MI_REPORT_PERF_COUNT command lets us specify a unique
-          * ID that will be reflected in the resulting OA report
-          * that's written by the GPU. This is the ID we're expecting
-          * in the begin report and the the end report should be
-          * @begin_report_id + 1.
-          */
-         int begin_report_id;
-
-         /**
-          * Reference the head of the brw->perfquery.sample_buffers
-          * list at the time that the query started (so we only need
-          * to look at nodes after this point when looking for samples
-          * related to this query)
-          *
-          * (See struct brw_oa_sample_buf description for more details)
-          */
-         struct exec_node *samples_head;
-
-         /**
-          * Storage for the final accumulated OA counters.
-          */
-         uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
-
-         /**
-          * Hw ID used by the context on which the query was running.
-          */
-         uint32_t hw_id;
-
-         /**
-          * false while in the unaccumulated_elements list, and set to
-          * true when the final, end MI_RPC snapshot has been
-          * accumulated.
-          */
-         bool results_accumulated;
-
-         /**
-          * Number of reports accumulated to produce the results.
-          */
-         uint32_t reports_accumulated;
-      } oa;
-
-      struct {
-         /**
-          * BO containing starting and ending snapshots for the
-          * statistics counters.
-          */
-         struct brw_bo *bo;
-      } pipeline_stats;
-   };
-};
-
 /** Downcasting convenience macro. */
 static inline struct brw_perf_query_object *
 brw_perf_query(struct gl_perf_query_object *o)
@@ -302,12 +214,10 @@
    return (struct brw_perf_query_object *) o;
 }
 
-#define STATS_BO_SIZE               4096
-#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
-#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)
-
 #define MI_RPC_BO_SIZE              4096
 #define MI_RPC_BO_END_OFFSET_BYTES  (MI_RPC_BO_SIZE / 2)
+#define MI_FREQ_START_OFFSET_BYTES  (3072)
+#define MI_FREQ_END_OFFSET_BYTES    (3076)
 
 /******************************************************************************/
 
@@ -356,6 +266,44 @@
 brw_is_perf_query_ready(struct gl_context *ctx,
                         struct gl_perf_query_object *o);
 
+static uint64_t
+brw_perf_query_get_metric_id(struct brw_context *brw,
+                             const struct brw_perf_query_info *query)
+{
+   /* These queries are known never to change; their config ID has been
+    * loaded upon the first query creation. No need to look them up again.
+    */
+   if (query->kind == OA_COUNTERS)
+      return query->oa_metrics_set_id;
+
+   assert(query->kind == OA_COUNTERS_RAW);
+
+   /* Raw queries can be reprogrammed by an external application/library.
+    * When a raw query is used for the first time its id is set to a value !=
+    * 0. When it stops being used the id returns to 0. No need to reload the
+    * ID when it's already loaded.
+    */
+   if (query->oa_metrics_set_id != 0) {
+      DBG("Raw query '%s' guid=%s using cached ID: %"PRIu64"\n",
+          query->name, query->guid, query->oa_metrics_set_id);
+      return query->oa_metrics_set_id;
+   }
+
+   char metric_id_file[280];
+   snprintf(metric_id_file, sizeof(metric_id_file),
+            "%s/metrics/%s/id", brw->perfquery.sysfs_dev_dir, query->guid);
+
+   struct brw_perf_query_info *raw_query = (struct brw_perf_query_info *)query;
+   if (!read_file_uint64(metric_id_file, &raw_query->oa_metrics_set_id)) {
+      DBG("Unable to read query guid=%s ID, falling back to test config\n", query->guid);
+      raw_query->oa_metrics_set_id = 1ULL;
+   } else {
+      DBG("Raw query '%s'guid=%s loaded ID: %"PRIu64"\n",
+          query->name, query->guid, query->oa_metrics_set_id);
+   }
+   return query->oa_metrics_set_id;
+}
+
 static void
 dump_perf_query_callback(GLuint id, void *query_void, void *brw_void)
 {
@@ -365,6 +313,7 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
       DBG("%4d: %-6s %-8s BO: %-4s OA data: %-10s %-15s\n",
           id,
           o->Used ? "Dirty," : "New,",
@@ -473,6 +422,7 @@
 
    switch (query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
       *n_active = brw->perfquery.n_active_oa_queries;
       break;
 
@@ -606,36 +556,6 @@
    reap_old_sample_buffers(brw);
 }
 
-static void
-accumulate_uint32(const uint32_t *report0,
-                  const uint32_t *report1,
-                  uint64_t *accumulator)
-{
-   *accumulator += (uint32_t)(*report1 - *report0);
-}
-
-static void
-accumulate_uint40(int a_index,
-                  const uint32_t *report0,
-                  const uint32_t *report1,
-                  uint64_t *accumulator)
-{
-   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
-   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
-   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
-   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
-   uint64_t value0 = report0[a_index + 4] | high0;
-   uint64_t value1 = report1[a_index + 4] | high1;
-   uint64_t delta;
-
-   if (value0 > value1)
-      delta = (1ULL << 40) + value1 - value0;
-   else
-      delta = value1 - value0;
-
-   *accumulator += delta;
-}
-
 /**
  * Given pointers to starting and ending OA snapshots, add the deltas for each
  * counter to the results.
@@ -655,27 +575,29 @@
 
    switch (query->oa_format) {
    case I915_OA_FORMAT_A32u40_A4u32_B8_C8:
-      accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
-      accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */
+      brw_perf_query_accumulate_uint32(start + 1, end + 1, accumulator + idx++); /* timestamp */
+      brw_perf_query_accumulate_uint32(start + 3, end + 3, accumulator + idx++); /* clock */
 
       /* 32x 40bit A counters... */
       for (i = 0; i < 32; i++)
-         accumulate_uint40(i, start, end, accumulator + idx++);
+         brw_perf_query_accumulate_uint40(i, start, end, accumulator + idx++);
 
       /* 4x 32bit A counters... */
       for (i = 0; i < 4; i++)
-         accumulate_uint32(start + 36 + i, end + 36 + i, accumulator + idx++);
+         brw_perf_query_accumulate_uint32(start + 36 + i, end + 36 + i,
+                                          accumulator + idx++);
 
       /* 8x 32bit B counters + 8x 32bit C counters... */
       for (i = 0; i < 16; i++)
-         accumulate_uint32(start + 48 + i, end + 48 + i, accumulator + idx++);
+         brw_perf_query_accumulate_uint32(start + 48 + i, end + 48 + i,
+                                          accumulator + idx++);
 
       break;
    case I915_OA_FORMAT_A45_B8_C8:
-      accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */
+      brw_perf_query_accumulate_uint32(start + 1, end + 1, accumulator); /* timestamp */
 
       for (i = 0; i < 61; i++)
-         accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);
+         brw_perf_query_accumulate_uint32(start + 3 + i, end + 3 + i, accumulator + 1 + i);
 
       break;
    default:
@@ -1058,12 +980,33 @@
 }
 
 static void
-close_perf(struct brw_context *brw)
+close_perf(struct brw_context *brw,
+           const struct brw_perf_query_info *query)
 {
    if (brw->perfquery.oa_stream_fd != -1) {
       close(brw->perfquery.oa_stream_fd);
       brw->perfquery.oa_stream_fd = -1;
    }
+   if (query->kind == OA_COUNTERS_RAW) {
+      struct brw_perf_query_info *raw_query =
+         (struct brw_perf_query_info *) query;
+      raw_query->oa_metrics_set_id = 0;
+   }
+}
+
+static void
+capture_frequency_stat_register(struct brw_context *brw,
+                                struct brw_bo *bo,
+                                uint32_t bo_offset)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+
+   if (devinfo->gen >= 7 && devinfo->gen <= 8 &&
+       !devinfo->is_baytrail && !devinfo->is_cherryview) {
+      brw_store_register_mem32(brw, bo, GEN7_RPSTAT1, bo_offset);
+   } else if (devinfo->gen >= 9) {
+      brw_store_register_mem32(brw, bo, GEN9_RPSTAT0, bo_offset);
+   }
 }
 
 /**
@@ -1136,6 +1079,7 @@
 
    switch (query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW: {
 
       /* Opening an i915 perf stream implies exclusive access to the OA unit
        * which will generate counter reports for a specific counter set with a
@@ -1143,14 +1087,17 @@
        * require a different counter set or format unless we get an opportunity
        * to close the stream and open a new one...
        */
-      if (brw->perfquery.oa_stream_fd != -1 &&
-          brw->perfquery.current_oa_metrics_set_id !=
-          query->oa_metrics_set_id) {
+      uint64_t metric_id = brw_perf_query_get_metric_id(brw, query);
 
-         if (brw->perfquery.n_oa_users != 0)
+      if (brw->perfquery.oa_stream_fd != -1 &&
+          brw->perfquery.current_oa_metrics_set_id != metric_id) {
+
+         if (brw->perfquery.n_oa_users != 0) {
+            DBG("WARNING: Begin(%d) failed already using perf config=%i/%"PRIu64"\n",
+                o->Id, brw->perfquery.current_oa_metrics_set_id, metric_id);
             return false;
-         else
-            close_perf(brw);
+         } else
+            close_perf(brw, query);
       }
 
       /* If the OA counters aren't already on, enable them. */
@@ -1212,17 +1159,15 @@
              prev_sample_period / 1000000ul);
 
          if (!open_i915_perf_oa_stream(brw,
-                                       query->oa_metrics_set_id,
+                                       metric_id,
                                        query->oa_format,
                                        period_exponent,
                                        screen->fd, /* drm fd */
                                        brw->hw_ctx))
             return false;
       } else {
-         assert(brw->perfquery.current_oa_metrics_set_id ==
-                query->oa_metrics_set_id &&
-                brw->perfquery.current_oa_format ==
-                query->oa_format);
+         assert(brw->perfquery.current_oa_metrics_set_id == metric_id &&
+                brw->perfquery.current_oa_format == query->oa_format);
       }
 
       if (!inc_n_oa_users(brw)) {
@@ -1236,7 +1181,8 @@
       }
 
       obj->oa.bo =
-         brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo", MI_RPC_BO_SIZE);
+         brw_bo_alloc(brw->bufmgr, "perf. query OA MI_RPC bo", MI_RPC_BO_SIZE,
+                      BRW_MEMZONE_OTHER);
 #ifdef DEBUG
       /* Pre-filling the BO helps debug whether writes landed. */
       void *map = brw_bo_map(brw, obj->oa.bo, MAP_WRITE);
@@ -1258,6 +1204,8 @@
       /* Take a starting OA counter snapshot. */
       brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo, 0,
                                           obj->oa.begin_report_id);
+      capture_frequency_stat_register(brw, obj->oa.bo, MI_FREQ_START_OFFSET_BYTES);
+
       ++brw->perfquery.n_active_oa_queries;
 
       /* No already-buffered samples can possibly be associated with this query
@@ -1283,6 +1231,7 @@
 
       add_to_unaccumulated_query_list(brw, obj);
       break;
+   }
 
    case PIPELINE_STATS:
       if (obj->pipeline_stats.bo) {
@@ -1292,7 +1241,7 @@
 
       obj->pipeline_stats.bo =
          brw_bo_alloc(brw->bufmgr, "perf. query pipeline stats bo",
-                      STATS_BO_SIZE);
+                      STATS_BO_SIZE, BRW_MEMZONE_OTHER);
 
       /* Take starting snapshots. */
       snapshot_statistics_registers(brw, obj, 0);
@@ -1333,6 +1282,7 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
 
       /* NB: It's possible that the query will have already been marked
        * as 'accumulated' if an error was seen while reading samples
@@ -1341,6 +1291,7 @@
        */
       if (!obj->oa.results_accumulated) {
          /* Take an ending OA counter snapshot. */
+         capture_frequency_stat_register(brw, obj->oa.bo, MI_FREQ_END_OFFSET_BYTES);
          brw->vtbl.emit_mi_report_perf_count(brw, obj->oa.bo,
                                              MI_RPC_BO_END_OFFSET_BYTES,
                                              obj->oa.begin_report_id + 1);
@@ -1377,6 +1328,7 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
       bo = obj->oa.bo;
       break;
 
@@ -1405,7 +1357,8 @@
     * we need to wait for all the reports to come in before we can
     * read them.
     */
-   if (obj->query->kind == OA_COUNTERS) {
+   if (obj->query->kind == OA_COUNTERS ||
+       obj->query->kind == OA_COUNTERS_RAW) {
       while (!read_oa_samples_for_query(brw, obj))
          ;
    }
@@ -1423,6 +1376,7 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
       return (obj->oa.results_accumulated ||
               (obj->oa.bo &&
                !brw_batch_references(&brw->batch, obj->oa.bo) &&
@@ -1441,6 +1395,93 @@
    return false;
 }
 
+static void
+gen8_read_report_clock_ratios(const uint32_t *report,
+                              uint64_t *slice_freq_hz,
+                              uint64_t *unslice_freq_hz)
+{
+   /* The lower 16 bits of the RPT_ID field of the OA reports contain a
+    * snapshot of the bits coming from the RP_FREQ_NORMAL register and are
+    * divided this way:
+    *
+    * RPT_ID[31:25]: RP_FREQ_NORMAL[20:14] (low squashed_slice_clock_frequency)
+    * RPT_ID[10:9]:  RP_FREQ_NORMAL[22:21] (high squashed_slice_clock_frequency)
+    * RPT_ID[8:0]:   RP_FREQ_NORMAL[31:23] (squashed_unslice_clock_frequency)
+    *
+    * RP_FREQ_NORMAL[31:23]: Software Unslice Ratio Request
+    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
+    *
+    * RP_FREQ_NORMAL[22:14]: Software Slice Ratio Request
+    *                        Multiple of 33.33MHz 2xclk (16 MHz 1xclk)
+    */
+
+   uint32_t unslice_freq = report[0] & 0x1ff;
+   uint32_t slice_freq_low = (report[0] >> 25) & 0x7f;
+   uint32_t slice_freq_high = (report[0] >> 9) & 0x3;
+   uint32_t slice_freq = slice_freq_low | (slice_freq_high << 7);
+
+   *slice_freq_hz = slice_freq * 16666667ULL;
+   *unslice_freq_hz = unslice_freq * 16666667ULL;
+}
+
+static void
+read_slice_unslice_frequencies(struct brw_context *brw,
+                               struct brw_perf_query_object *obj)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   uint32_t *begin_report, *end_report;
+
+   /* Slice/Unslice frequency is only available in the OA reports when the
+    * "Disable OA reports due to clock ratio change" field in
+    * OA_DEBUG_REGISTER is set to 1. This is how the kernel programs this
+    * global register (see drivers/gpu/drm/i915/i915_perf.c)
+    *
+    * Documentation says this should be available on Gen9+ but experimentation
+    * shows that Gen8 reports similar values, so we enable it there too.
+    */
+   if (devinfo->gen < 8)
+      return;
+
+   begin_report = obj->oa.map;
+   end_report = obj->oa.map + MI_RPC_BO_END_OFFSET_BYTES;
+
+   gen8_read_report_clock_ratios(begin_report,
+                                 &obj->oa.slice_frequency[0],
+                                 &obj->oa.unslice_frequency[0]);
+   gen8_read_report_clock_ratios(end_report,
+                                 &obj->oa.slice_frequency[1],
+                                 &obj->oa.unslice_frequency[1]);
+}
+
+static void
+read_gt_frequency(struct brw_context *brw,
+                  struct brw_perf_query_object *obj)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   uint32_t start = *((uint32_t *)(obj->oa.map + MI_FREQ_START_OFFSET_BYTES)),
+      end = *((uint32_t *)(obj->oa.map + MI_FREQ_END_OFFSET_BYTES));
+
+   switch (devinfo->gen) {
+   case 7:
+   case 8:
+      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
+      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN7_RPSTAT1_CURR_GT_FREQ) * 50ULL;
+      break;
+   case 9:
+   case 10:
+   case 11:
+      obj->oa.gt_frequency[0] = GET_FIELD(start, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
+      obj->oa.gt_frequency[1] = GET_FIELD(end, GEN9_RPSTAT0_CURR_GT_FREQ) * 50ULL / 3ULL;
+      break;
+   default:
+      unreachable("unexpected gen");
+   }
+
+   /* Put the numbers into Hz. */
+   obj->oa.gt_frequency[0] *= 1000000ULL;
+   obj->oa.gt_frequency[1] *= 1000000ULL;
+}
+
 static int
 get_oa_counter_data(struct brw_context *brw,
                     struct brw_perf_query_object *obj,
@@ -1451,14 +1492,6 @@
    int n_counters = query->n_counters;
    int written = 0;
 
-   if (!obj->oa.results_accumulated) {
-      accumulate_oa_reports(brw, obj);
-      assert(obj->oa.results_accumulated);
-
-      brw_bo_unmap(obj->oa.bo);
-      obj->oa.map = NULL;
-   }
-
    for (int i = 0; i < n_counters; i++) {
       const struct brw_perf_query_counter *counter = &query->counters[i];
       uint64_t *out_uint64;
@@ -1548,7 +1581,20 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
-      written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
+   case OA_COUNTERS_RAW:
+      if (!obj->oa.results_accumulated) {
+         read_gt_frequency(brw, obj);
+         read_slice_unslice_frequencies(brw, obj);
+         accumulate_oa_reports(brw, obj);
+         assert(obj->oa.results_accumulated);
+
+         brw_bo_unmap(obj->oa.bo);
+         obj->oa.map = NULL;
+      }
+      if (obj->query->kind == OA_COUNTERS)
+         written = get_oa_counter_data(brw, obj, data_size, (uint8_t *)data);
+      else
+         written = brw_perf_query_get_mdapi_oa_data(brw, obj, data_size, (uint8_t *)data);
       break;
 
    case PIPELINE_STATS:
@@ -1604,6 +1650,7 @@
 
    switch (obj->query->kind) {
    case OA_COUNTERS:
+   case OA_COUNTERS_RAW:
       if (obj->oa.bo) {
          if (!obj->oa.results_accumulated) {
             drop_from_unaccumulated_query_list(brw, obj);
@@ -1629,68 +1676,25 @@
       break;
    }
 
-   free(obj);
-
    /* As an indication that the INTEL_performance_query extension is no
     * longer in use, it's a good time to free our cache of sample
     * buffers and close any current i915-perf stream.
     */
    if (--brw->perfquery.n_query_instances == 0) {
       free_sample_bufs(brw);
-      close_perf(brw);
+      close_perf(brw, obj->query);
    }
+
+   free(obj);
 }
 
 /******************************************************************************/
 
-static struct brw_perf_query_info *
-append_query_info(struct brw_context *brw)
-{
-   brw->perfquery.queries =
-      reralloc(brw, brw->perfquery.queries,
-               struct brw_perf_query_info, ++brw->perfquery.n_queries);
-
-   return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
-}
-
-static void
-add_stat_reg(struct brw_perf_query_info *query,
-             uint32_t reg,
-             uint32_t numerator,
-             uint32_t denominator,
-             const char *name,
-             const char *description)
-{
-   struct brw_perf_query_counter *counter;
-
-   assert(query->n_counters < MAX_STAT_COUNTERS);
-
-   counter = &query->counters[query->n_counters];
-   counter->name = name;
-   counter->desc = description;
-   counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
-   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
-   counter->size = sizeof(uint64_t);
-   counter->offset = sizeof(uint64_t) * query->n_counters;
-   counter->pipeline_stat.reg = reg;
-   counter->pipeline_stat.numerator = numerator;
-   counter->pipeline_stat.denominator = denominator;
-
-   query->n_counters++;
-}
-
-static void
-add_basic_stat_reg(struct brw_perf_query_info *query,
-                   uint32_t reg, const char *name)
-{
-   add_stat_reg(query, reg, 1, 1, name, name);
-}
-
 static void
 init_pipeline_statistic_query_registers(struct brw_context *brw)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct brw_perf_query_info *query = append_query_info(brw);
+   struct brw_perf_query_info *query = brw_perf_query_append_query_info(brw);
 
    query->kind = PIPELINE_STATS;
    query->name = "Pipeline Statistics Registers";
@@ -1698,75 +1702,75 @@
    query->counters =
       rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS);
 
-   add_basic_stat_reg(query, IA_VERTICES_COUNT,
-                      "N vertices submitted");
-   add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
-                      "N primitives submitted");
-   add_basic_stat_reg(query, VS_INVOCATION_COUNT,
-                      "N vertex shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT,
+                                          "N vertices submitted");
+   brw_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
+                                          "N primitives submitted");
+   brw_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
+                                          "N vertex shader invocations");
 
    if (devinfo->gen == 6) {
-      add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
-                   "SO_PRIM_STORAGE_NEEDED",
-                   "N geometry shader stream-out primitives (total)");
-      add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
-                   "SO_NUM_PRIMS_WRITTEN",
-                   "N geometry shader stream-out primitives (written)");
+      brw_perf_query_info_add_stat_reg(query, GEN6_SO_PRIM_STORAGE_NEEDED, 1, 1,
+                                       "SO_PRIM_STORAGE_NEEDED",
+                                       "N geometry shader stream-out primitives (total)");
+      brw_perf_query_info_add_stat_reg(query, GEN6_SO_NUM_PRIMS_WRITTEN, 1, 1,
+                                       "SO_NUM_PRIMS_WRITTEN",
+                                       "N geometry shader stream-out primitives (written)");
    } else {
-      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
-                   "SO_PRIM_STORAGE_NEEDED (Stream 0)",
-                   "N stream-out (stream 0) primitives (total)");
-      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
-                   "SO_PRIM_STORAGE_NEEDED (Stream 1)",
-                   "N stream-out (stream 1) primitives (total)");
-      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
-                   "SO_PRIM_STORAGE_NEEDED (Stream 2)",
-                   "N stream-out (stream 2) primitives (total)");
-      add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
-                   "SO_PRIM_STORAGE_NEEDED (Stream 3)",
-                   "N stream-out (stream 3) primitives (total)");
-      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
-                   "SO_NUM_PRIMS_WRITTEN (Stream 0)",
-                   "N stream-out (stream 0) primitives (written)");
-      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
-                   "SO_NUM_PRIMS_WRITTEN (Stream 1)",
-                   "N stream-out (stream 1) primitives (written)");
-      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
-                   "SO_NUM_PRIMS_WRITTEN (Stream 2)",
-                   "N stream-out (stream 2) primitives (written)");
-      add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
-                   "SO_NUM_PRIMS_WRITTEN (Stream 3)",
-                   "N stream-out (stream 3) primitives (written)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(0), 1, 1,
+                                       "SO_PRIM_STORAGE_NEEDED (Stream 0)",
+                                       "N stream-out (stream 0) primitives (total)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(1), 1, 1,
+                                       "SO_PRIM_STORAGE_NEEDED (Stream 1)",
+                                       "N stream-out (stream 1) primitives (total)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(2), 1, 1,
+                                       "SO_PRIM_STORAGE_NEEDED (Stream 2)",
+                                       "N stream-out (stream 2) primitives (total)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_PRIM_STORAGE_NEEDED(3), 1, 1,
+                                       "SO_PRIM_STORAGE_NEEDED (Stream 3)",
+                                       "N stream-out (stream 3) primitives (total)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(0), 1, 1,
+                                       "SO_NUM_PRIMS_WRITTEN (Stream 0)",
+                                       "N stream-out (stream 0) primitives (written)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(1), 1, 1,
+                                       "SO_NUM_PRIMS_WRITTEN (Stream 1)",
+                                       "N stream-out (stream 1) primitives (written)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(2), 1, 1,
+                                       "SO_NUM_PRIMS_WRITTEN (Stream 2)",
+                                       "N stream-out (stream 2) primitives (written)");
+      brw_perf_query_info_add_stat_reg(query, GEN7_SO_NUM_PRIMS_WRITTEN(3), 1, 1,
+                                       "SO_NUM_PRIMS_WRITTEN (Stream 3)",
+                                       "N stream-out (stream 3) primitives (written)");
    }
 
-   add_basic_stat_reg(query, HS_INVOCATION_COUNT,
-                      "N TCS shader invocations");
-   add_basic_stat_reg(query, DS_INVOCATION_COUNT,
-                      "N TES shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
+                                          "N TCS shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
+                                          "N TES shader invocations");
 
-   add_basic_stat_reg(query, GS_INVOCATION_COUNT,
-                      "N geometry shader invocations");
-   add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
-                      "N geometry shader primitives emitted");
+   brw_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
+                                          "N geometry shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
+                                          "N geometry shader primitives emitted");
 
-   add_basic_stat_reg(query, CL_INVOCATION_COUNT,
-                      "N primitives entering clipping");
-   add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
-                      "N primitives leaving clipping");
+   brw_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
+                                          "N primitives entering clipping");
+   brw_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
+                                          "N primitives leaving clipping");
 
    if (devinfo->is_haswell || devinfo->gen == 8)
-      add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
-                   "N fragment shader invocations",
-                   "N fragment shader invocations");
+      brw_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4,
+                                       "N fragment shader invocations",
+                                       "N fragment shader invocations");
    else
-      add_basic_stat_reg(query, PS_INVOCATION_COUNT,
-                         "N fragment shader invocations");
+      brw_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
+                                             "N fragment shader invocations");
 
-   add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
+   brw_perf_query_info_add_basic_stat_reg(query, PS_DEPTH_COUNT, "N z-pass fragments");
 
    if (devinfo->gen >= 7)
-      add_basic_stat_reg(query, CS_INVOCATION_COUNT,
-                         "N compute shader invocations");
+      brw_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
+                                             "N compute shader invocations");
 
    query->data_size = sizeof(uint64_t) * query->n_counters;
 }
@@ -1776,7 +1780,9 @@
                    const struct brw_perf_query_info *query,
                    uint64_t config_id)
 {
-   struct brw_perf_query_info *registred_query = append_query_info(brw);
+   struct brw_perf_query_info *registred_query =
+      brw_perf_query_append_query_info(brw);
+
    *registred_query = *query;
    registred_query->oa_metrics_set_id = config_id;
    DBG("metric set registred: id = %" PRIu64", guid = %s\n",
@@ -2160,6 +2166,7 @@
       return brw->perfquery.n_queries;
 
    init_pipeline_statistic_query_registers(brw);
+   brw_perf_query_register_mdapi_statistic_query(brw);
 
    oa_register = get_register_queries_function(devinfo);
 
@@ -2202,6 +2209,8 @@
          init_oa_configs(brw);
       else
          enumerate_sysfs_metrics(brw);
+
+      brw_perf_query_register_mdapi_oa_query(brw);
    }
 
    brw->perfquery.unaccumulated =
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query.h b/src/mesa/drivers/dri/i965/brw_performance_query.h
index 11938b7..66b32c0 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_query.h
+++ b/src/mesa/drivers/dri/i965/brw_performance_query.h
@@ -27,33 +27,201 @@
 #include <stdint.h>
 
 #include "brw_context.h"
+#include "brw_performance_query_metrics.h"
 
-struct brw_pipeline_stat
+/*
+ * We currently allocate only one page for pipeline statistics queries. Here
+ * we derive the maximum number of counters for that amount.
+ */
+#define STATS_BO_SIZE               4096
+#define STATS_BO_END_OFFSET_BYTES   (STATS_BO_SIZE / 2)
+#define MAX_STAT_COUNTERS           (STATS_BO_END_OFFSET_BYTES / 8)
+
+/*
+ * The largest OA formats we can use include:
+ * For Haswell:
+ *   1 timestamp, 45 A counters, 8 B counters and 8 C counters.
+ * For Gen8+
+ *   1 timestamp, 1 clock, 36 A counters, 8 B counters and 8 C counters
+ */
+#define MAX_OA_REPORT_COUNTERS 62
+
+/**
+ * i965 representation of a performance query object.
+ *
+ * NB: We want to keep this structure relatively lean considering that
+ * applications may expect to allocate enough objects to be able to
+ * query around all draw calls in a frame.
+ */
+struct brw_perf_query_object
 {
-   uint32_t reg;
-   uint32_t numerator;
-   uint32_t denominator;
-};
+   struct gl_perf_query_object base;
 
-struct brw_perf_query_counter
-{
-   const char *name;
-   const char *desc;
-   GLenum type;
-   GLenum data_type;
-   uint64_t raw_max;
-   size_t offset;
-   size_t size;
+   const struct brw_perf_query_info *query;
 
+   /* See query->kind to know which state below is in use... */
    union {
-      uint64_t (*oa_counter_read_uint64)(struct brw_context *brw,
-                                         const struct brw_perf_query_info *query,
-                                         uint64_t *accumulator);
-      float (*oa_counter_read_float)(struct brw_context *brw,
-                                     const struct brw_perf_query_info *query,
-                                     uint64_t *accumulator);
-      struct brw_pipeline_stat pipeline_stat;
+      struct {
+
+         /**
+          * BO containing OA counter snapshots at query Begin/End time.
+          */
+         struct brw_bo *bo;
+
+         /**
+          * Address at which @bo is mapped
+          */
+         void *map;
+
+         /**
+          * The MI_REPORT_PERF_COUNT command lets us specify a unique
+          * ID that will be reflected in the resulting OA report
+          * that's written by the GPU. This is the ID we're expecting
+          * in the begin report and the end report should be
+          * @begin_report_id + 1.
+          */
+         int begin_report_id;
+
+         /**
+          * Reference the head of the brw->perfquery.sample_buffers
+          * list at the time that the query started (so we only need
+          * to look at nodes after this point when looking for samples
+          * related to this query)
+          *
+          * (See struct brw_oa_sample_buf description for more details)
+          */
+         struct exec_node *samples_head;
+
+         /**
+          * Storage for the final accumulated OA counters.
+          */
+         uint64_t accumulator[MAX_OA_REPORT_COUNTERS];
+
+         /**
+          * Hw ID used by the context on which the query was running.
+          */
+         uint32_t hw_id;
+
+         /**
+          * false while in the unaccumulated_elements list, and set to
+          * true when the final, end MI_RPC snapshot has been
+          * accumulated.
+          */
+         bool results_accumulated;
+
+         /**
+          * Number of reports accumulated to produce the results.
+          */
+         uint32_t reports_accumulated;
+
+         /**
+          * Frequency of the GT at begin and end of the query.
+          */
+         uint64_t gt_frequency[2];
+
+         /**
+          * Frequency in the slices of the GT at the begin and end of the
+          * query.
+          */
+         uint64_t slice_frequency[2];
+
+         /**
+          * Frequency in the unslice of the GT at the begin and end of the
+          * query.
+          */
+         uint64_t unslice_frequency[2];
+      } oa;
+
+      struct {
+         /**
+          * BO containing starting and ending snapshots for the
+          * statistics counters.
+          */
+         struct brw_bo *bo;
+      } pipeline_stats;
    };
 };
 
+static inline struct brw_perf_query_info *
+brw_perf_query_append_query_info(struct brw_context *brw)
+{
+   brw->perfquery.queries =
+      reralloc(brw, brw->perfquery.queries,
+               struct brw_perf_query_info, ++brw->perfquery.n_queries);
+
+   return &brw->perfquery.queries[brw->perfquery.n_queries - 1];
+}
+
+static inline void
+brw_perf_query_info_add_stat_reg(struct brw_perf_query_info *query,
+                                 uint32_t reg,
+                                 uint32_t numerator,
+                                 uint32_t denominator,
+                                 const char *name,
+                                 const char *description)
+{
+   struct brw_perf_query_counter *counter;
+
+   assert(query->n_counters < MAX_STAT_COUNTERS);
+
+   counter = &query->counters[query->n_counters];
+   counter->name = name;
+   counter->desc = description;
+   counter->type = GL_PERFQUERY_COUNTER_RAW_INTEL;
+   counter->data_type = GL_PERFQUERY_COUNTER_DATA_UINT64_INTEL;
+   counter->size = sizeof(uint64_t);
+   counter->offset = sizeof(uint64_t) * query->n_counters;
+   counter->pipeline_stat.reg = reg;
+   counter->pipeline_stat.numerator = numerator;
+   counter->pipeline_stat.denominator = denominator;
+
+   query->n_counters++;
+}
+
+static inline void
+brw_perf_query_info_add_basic_stat_reg(struct brw_perf_query_info *query,
+                                       uint32_t reg, const char *name)
+{
+   brw_perf_query_info_add_stat_reg(query, reg, 1, 1, name, name);
+}
+
+/* Accumulate 32-bit OA counters */
+static inline void
+brw_perf_query_accumulate_uint32(const uint32_t *report0,
+                                 const uint32_t *report1,
+                                 uint64_t *accumulator)
+{
+   *accumulator += (uint32_t)(*report1 - *report0);
+}
+
+/* Accumulate 40-bit OA counters */
+static inline void
+brw_perf_query_accumulate_uint40(int a_index,
+                                 const uint32_t *report0,
+                                 const uint32_t *report1,
+                                 uint64_t *accumulator)
+{
+   const uint8_t *high_bytes0 = (uint8_t *)(report0 + 40);
+   const uint8_t *high_bytes1 = (uint8_t *)(report1 + 40);
+   uint64_t high0 = (uint64_t)(high_bytes0[a_index]) << 32;
+   uint64_t high1 = (uint64_t)(high_bytes1[a_index]) << 32;
+   uint64_t value0 = report0[a_index + 4] | high0;
+   uint64_t value1 = report1[a_index + 4] | high1;
+   uint64_t delta;
+
+   if (value0 > value1)
+      delta = (1ULL << 40) + value1 - value0;
+   else
+      delta = value1 - value0;
+
+   *accumulator += delta;
+}
+
+int brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
+                                     struct brw_perf_query_object *obj,
+                                     size_t data_size,
+                                     uint8_t *data);
+void brw_perf_query_register_mdapi_oa_query(struct brw_context *brw);
+void brw_perf_query_register_mdapi_statistic_query(struct brw_context *brw);
+
 #endif /* BRW_PERFORMANCE_QUERY_H */
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query_mdapi.c b/src/mesa/drivers/dri/i965/brw_performance_query_mdapi.c
new file mode 100644
index 0000000..70f69de
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_performance_query_mdapi.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_defines.h"
+#include "brw_performance_query.h"
+
+/**
+ * Data formats expected by MDAPI: one result layout per hardware generation group (gen7, gen8, gen9 to gen11) plus the pipeline statistics layout.
+ */
+/* Gen7 layout; brw_perf_query_get_mdapi_oa_data() asserts is_haswell before filling it. Field names are stringified into counter names by MDAPI_QUERY_ADD_COUNTER, so their spelling must not change. */
+struct mdapi_gen7_metrics {
+   uint64_t TotalTime; /* brw_timebase_scale() of oa.accumulator[0] */
+
+   uint64_t ACounters[45]; /* oa.accumulator[1..45] */
+   uint64_t NOACounters[16]; /* oa.accumulator[46..61] */
+
+   uint64_t PerfCounter1; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint64_t PerfCounter2; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t SplitOccured; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t CoreFrequencyChanged; /* oa.gt_frequency[0] != oa.gt_frequency[1] */
+   uint64_t CoreFrequency; /* oa.gt_frequency[1] */
+   uint32_t ReportId; /* not written for gen7 */
+   uint32_t ReportsCount; /* oa.reports_accumulated */
+};
+
+#define GTDI_QUERY_BDW_METRICS_OA_COUNT         36 /* length of OaCntr[] in the gen8 and gen9 layouts */
+#define GTDI_QUERY_BDW_METRICS_OA_40b_COUNT     32 /* not referenced elsewhere in this file */
+#define GTDI_QUERY_BDW_METRICS_NOA_COUNT        16 /* length of NoaCntr[] in the gen8 and gen9 layouts */
+struct mdapi_gen8_metrics { /* Gen8 layout; field names are stringified into counter names, so spelling must not change */
+   uint64_t TotalTime; /* brw_timebase_scale() of oa.accumulator[0] */
+   uint64_t GPUTicks; /* oa.accumulator[1] */
+   uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; /* oa.accumulator[2..37] */
+   uint64_t NoaCntr[GTDI_QUERY_BDW_METRICS_NOA_COUNT]; /* oa.accumulator[38..53] */
+   uint64_t BeginTimestamp; /* this and the fields down to MarkerDriver are not written by brw_perf_query_get_mdapi_oa_data() */
+   uint64_t Reserved1;
+   uint64_t Reserved2;
+   uint32_t Reserved3;
+   uint32_t OverrunOccured;
+   uint64_t MarkerUser;
+   uint64_t MarkerDriver;
+
+   uint64_t SliceFrequency; /* (oa.slice_frequency[0] + oa.slice_frequency[1]) / 2 */
+   uint64_t UnsliceFrequency; /* (oa.unslice_frequency[0] + oa.unslice_frequency[1]) / 2 */
+   uint64_t PerfCounter1; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint64_t PerfCounter2; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t SplitOccured; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t CoreFrequencyChanged; /* oa.gt_frequency[0] != oa.gt_frequency[1] */
+   uint64_t CoreFrequency; /* oa.gt_frequency[1] */
+   uint32_t ReportId; /* oa.hw_id */
+   uint32_t ReportsCount; /* oa.reports_accumulated */
+};
+
+#define GTDI_MAX_READ_REGS 16 /* length of UserCntr[] below */
+/* Layout used for gen 9, 10 and 11: same members as mdapi_gen8_metrics, filled the same way, followed by UserCntr, UserCntrCfgId and Reserved4. */
+struct mdapi_gen9_metrics {
+   uint64_t TotalTime; /* brw_timebase_scale() of oa.accumulator[0] */
+   uint64_t GPUTicks; /* oa.accumulator[1] */
+   uint64_t OaCntr[GTDI_QUERY_BDW_METRICS_OA_COUNT]; /* oa.accumulator[2..37] */
+   uint64_t NoaCntr[GTDI_QUERY_BDW_METRICS_NOA_COUNT]; /* oa.accumulator[38..53] */
+   uint64_t BeginTimestamp; /* this and the fields down to MarkerDriver are not written by brw_perf_query_get_mdapi_oa_data() */
+   uint64_t Reserved1;
+   uint64_t Reserved2;
+   uint32_t Reserved3;
+   uint32_t OverrunOccured;
+   uint64_t MarkerUser;
+   uint64_t MarkerDriver;
+
+   uint64_t SliceFrequency; /* (oa.slice_frequency[0] + oa.slice_frequency[1]) / 2 */
+   uint64_t UnsliceFrequency; /* (oa.unslice_frequency[0] + oa.unslice_frequency[1]) / 2 */
+   uint64_t PerfCounter1; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint64_t PerfCounter2; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t SplitOccured; /* not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t CoreFrequencyChanged; /* oa.gt_frequency[0] != oa.gt_frequency[1] */
+   uint64_t CoreFrequency; /* oa.gt_frequency[1] */
+   uint32_t ReportId; /* oa.hw_id */
+   uint32_t ReportsCount; /* oa.reports_accumulated */
+
+   uint64_t UserCntr[GTDI_MAX_READ_REGS]; /* this and the two fields below are not written by brw_perf_query_get_mdapi_oa_data() */
+   uint32_t UserCntrCfgId;
+   uint32_t Reserved4;
+};
+
+struct mdapi_pipeline_metrics { /* Only named in a comment in this file: brw_perf_query_register_mdapi_statistic_query() must add its registers in this field order. */
+   uint64_t IAVertices; /* IA_VERTICES_COUNT */
+   uint64_t IAPrimitives; /* IA_PRIMITIVES_COUNT */
+   uint64_t VSInvocations; /* VS_INVOCATION_COUNT */
+   uint64_t GSInvocations; /* GS_INVOCATION_COUNT */
+   uint64_t GSPrimitives; /* GS_PRIMITIVES_COUNT */
+   uint64_t CInvocations; /* CL_INVOCATION_COUNT */
+   uint64_t CPrimitives; /* CL_PRIMITIVES_COUNT */
+   uint64_t PSInvocations; /* PS_INVOCATION_COUNT */
+   uint64_t HSInvocations; /* HS_INVOCATION_COUNT */
+   uint64_t DSInvocations; /* DS_INVOCATION_COUNT */
+   uint64_t CSInvocations; /* CS_INVOCATION_COUNT */
+};
+
+int
+brw_perf_query_get_mdapi_oa_data(struct brw_context *brw,
+                                 struct brw_perf_query_object *obj,
+                                 size_t data_size,
+                                 uint8_t *data)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   /* Copy the accumulated OA results of obj into data using the MDAPI layout for this gen. Returns the size of that layout, or 0 if data_size is too small. Fields not assigned below are left as the caller supplied them. */
+   switch (devinfo->gen) {
+   case 7: {
+      struct mdapi_gen7_metrics *mdapi_data = (struct mdapi_gen7_metrics *) data;
+
+      if (data_size < sizeof(*mdapi_data))
+         return 0; /* buffer too small: nothing is written */
+
+      assert(devinfo->is_haswell); /* Haswell is the only gen7 part expected here */
+
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->ACounters); i++)
+         mdapi_data->ACounters[i] = obj->oa.accumulator[1 + i]; /* accumulator[0] feeds TotalTime below */
+
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->NOACounters); i++) {
+         mdapi_data->NOACounters[i] =
+            obj->oa.accumulator[1 + ARRAY_SIZE(mdapi_data->ACounters) + i];
+      }
+
+      mdapi_data->ReportsCount = obj->oa.reports_accumulated;
+      mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
+      mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
+      mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
+      return sizeof(*mdapi_data);
+   }
+   case 8: {
+      struct mdapi_gen8_metrics *mdapi_data = (struct mdapi_gen8_metrics *) data;
+
+      if (data_size < sizeof(*mdapi_data))
+         return 0; /* buffer too small: nothing is written */
+
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->OaCntr); i++)
+         mdapi_data->OaCntr[i] = obj->oa.accumulator[2 + i]; /* accumulator[0] and [1] feed TotalTime and GPUTicks below */
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->NoaCntr); i++) {
+         mdapi_data->NoaCntr[i] =
+            obj->oa.accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
+      }
+
+      mdapi_data->ReportId = obj->oa.hw_id;
+      mdapi_data->ReportsCount = obj->oa.reports_accumulated;
+      mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
+      mdapi_data->GPUTicks = obj->oa.accumulator[1];
+      mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
+      mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
+      mdapi_data->SliceFrequency = (obj->oa.slice_frequency[0] + obj->oa.slice_frequency[1]) / 2ULL;
+      mdapi_data->UnsliceFrequency = (obj->oa.unslice_frequency[0] + obj->oa.unslice_frequency[1]) / 2ULL;
+
+      return sizeof(*mdapi_data);
+   }
+   case 9:
+   case 10:
+   case 11: {
+      struct mdapi_gen9_metrics *mdapi_data = (struct mdapi_gen9_metrics *) data;
+
+      if (data_size < sizeof(*mdapi_data))
+         return 0; /* buffer too small: nothing is written */
+
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->OaCntr); i++)
+         mdapi_data->OaCntr[i] = obj->oa.accumulator[2 + i]; /* accumulator[0] and [1] feed TotalTime and GPUTicks below */
+      for (int i = 0; i < ARRAY_SIZE(mdapi_data->NoaCntr); i++) {
+         mdapi_data->NoaCntr[i] =
+            obj->oa.accumulator[2 + ARRAY_SIZE(mdapi_data->OaCntr) + i];
+      }
+
+      mdapi_data->ReportId = obj->oa.hw_id;
+      mdapi_data->ReportsCount = obj->oa.reports_accumulated;
+      mdapi_data->TotalTime = brw_timebase_scale(brw, obj->oa.accumulator[0]);
+      mdapi_data->GPUTicks = obj->oa.accumulator[1];
+      mdapi_data->CoreFrequency = obj->oa.gt_frequency[1];
+      mdapi_data->CoreFrequencyChanged = obj->oa.gt_frequency[0] != obj->oa.gt_frequency[1];
+      mdapi_data->SliceFrequency = (obj->oa.slice_frequency[0] + obj->oa.slice_frequency[1]) / 2ULL;
+      mdapi_data->UnsliceFrequency = (obj->oa.unslice_frequency[0] + obj->oa.unslice_frequency[1]) / 2ULL;
+
+      return sizeof(*mdapi_data);
+   }
+   default:
+      unreachable("unexpected gen");
+   }
+
+   return 0; /* every case above returns on its own */
+}
+
+static void
+fill_mdapi_perf_query_counter(struct brw_perf_query_info *query,
+                              const char *name,
+                              uint32_t data_offset,
+                              uint32_t data_size,
+                              GLenum data_type)
+{
+   struct brw_perf_query_counter *counter = &query->counters[query->n_counters];
+   /* Describe the next free slot of query->counters as a raw counter and bump n_counters; the array capacity is not checked here. */
+   counter->name = name;
+   counter->desc = "Raw counter value";
+   counter->data_type = data_type;
+   counter->offset = data_offset;
+   counter->size = data_size;
+   assert(counter->offset + counter->size <= query->data_size); /* the counter's byte range must fit inside query->data_size */
+
+   query->n_counters++;
+}
+/* Register a member (or, for the ARRAY variant, element idx of an array member) of the struct variable struct_name via fill_mdapi_perf_query_counter(): the name is the stringified field name (with idx appended, allocated from ctx by ralloc_asprintf), the offset is the member's distance from the start of the variable, and the data type is GL_PERFQUERY_COUNTER_DATA_<type_name>_INTEL. */
+#define MDAPI_QUERY_ADD_COUNTER(query, struct_name, field_name, type_name) \
+   fill_mdapi_perf_query_counter(query, #field_name,                    \
+                                 (uint8_t *) &struct_name.field_name -  \
+                                 (uint8_t *) &struct_name,              \
+                                 sizeof(struct_name.field_name),        \
+                                 GL_PERFQUERY_COUNTER_DATA_##type_name##_INTEL)
+#define MDAPI_QUERY_ADD_ARRAY_COUNTER(ctx, query, struct_name, field_name, idx, type_name) \
+   fill_mdapi_perf_query_counter(query,                                 \
+                                 ralloc_asprintf(ctx, "%s%i", #field_name, idx), \
+                                 (uint8_t *) &struct_name.field_name[idx] - \
+                                 (uint8_t *) &struct_name,              \
+                                 sizeof(struct_name.field_name[0]),     \
+                                 GL_PERFQUERY_COUNTER_DATA_##type_name##_INTEL)
+
+void
+brw_perf_query_register_mdapi_oa_query(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   /* Append the raw OA query "Intel_Raw_Hardware_Counters_Set_0_Query": one counter per member of the MDAPI result struct for this gen. */
+   /* MDAPI requires different structures for pretty much every generation
+    * (right now we have definitions for gen 7 to 11).
+    */
+   if (!(devinfo->gen >= 7 && devinfo->gen <= 11))
+      return;
+
+   struct brw_perf_query_info *query = brw_perf_query_append_query_info(brw);
+
+   query->kind = OA_COUNTERS_RAW;
+   query->name = "Intel_Raw_Hardware_Counters_Set_0_Query";
+   /* The GUID has to match MDAPI's. */
+   query->guid = "2f01b241-7014-42a7-9eb6-a925cad3daba";
+   query->n_counters = 0;
+   query->oa_metrics_set_id = 0; /* Set by MDAPI */
+
+   int n_counters;
+   switch (devinfo->gen) {
+   case 7: {
+      query->oa_format = I915_OA_FORMAT_A45_B8_C8;
+
+      struct mdapi_gen7_metrics metric_data; /* never read or written: only its member offsets and sizes are used */
+      query->data_size = sizeof(metric_data);
+
+      n_counters = 1 + 45 + 16 + 7; /* TotalTime + ACounters + NOACounters + the 7 trailing scalar members */
+      query->counters =
+         rzalloc_array_size(brw->perfquery.queries,
+                            sizeof(*query->counters), n_counters);
+
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
+      for (int i = 0; i < ARRAY_SIZE(metric_data.ACounters); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, ACounters, i, UINT64);
+      }
+      for (int i = 0; i < ARRAY_SIZE(metric_data.NOACounters); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, NOACounters, i, UINT64);
+      }
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
+      break;
+   }
+   case 8: {
+      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
+
+      struct mdapi_gen8_metrics metric_data; /* never read or written: only its member offsets and sizes are used */
+      query->data_size = sizeof(metric_data);
+
+      n_counters = 2 + 36 + 16 + 16; /* TotalTime and GPUTicks + OaCntr + NoaCntr + the 16 remaining scalar members */
+      query->counters =
+         rzalloc_array_size(brw->perfquery.queries,
+                            sizeof(*query->counters), n_counters);
+
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
+      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, OaCntr, i, UINT64);
+      }
+      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, NoaCntr, i, UINT64);
+      }
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
+      break;
+   }
+   case 9:
+   case 10:
+   case 11: {
+      query->oa_format = I915_OA_FORMAT_A32u40_A4u32_B8_C8;
+
+      struct mdapi_gen9_metrics metric_data; /* never read or written: only its member offsets and sizes are used */
+      query->data_size = sizeof(metric_data);
+
+      n_counters = 2 + 36 + 16 + 16 + 16 + 2; /* as for gen8, plus UserCntr and the 2 trailing members */
+      query->counters =
+         rzalloc_array_size(brw->perfquery.queries,
+                            sizeof(*query->counters), n_counters);
+
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, TotalTime, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, GPUTicks, UINT64);
+      for (int i = 0; i < ARRAY_SIZE(metric_data.OaCntr); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, OaCntr, i, UINT64);
+      }
+      for (int i = 0; i < ARRAY_SIZE(metric_data.NoaCntr); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, NoaCntr, i, UINT64);
+      }
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, BeginTimestamp, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved1, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved2, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved3, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, OverrunOccured, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerUser, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, MarkerDriver, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SliceFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UnsliceFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter1, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, PerfCounter2, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, SplitOccured, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequencyChanged, BOOL32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, CoreFrequency, UINT64);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportId, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, ReportsCount, UINT32);
+      for (int i = 0; i < ARRAY_SIZE(metric_data.UserCntr); i++) {
+         MDAPI_QUERY_ADD_ARRAY_COUNTER(brw->perfquery.queries,
+                                       query, metric_data, UserCntr, i, UINT64);
+      }
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, UserCntrCfgId, UINT32);
+      MDAPI_QUERY_ADD_COUNTER(query, metric_data, Reserved4, UINT32);
+      break;
+   }
+   default:
+      unreachable("Unsupported gen");
+      break;
+   }
+
+   assert(query->n_counters <= n_counters); /* the counters added above must fit the array allocated for this gen */
+
+   {
+      /* Accumulation buffer offsets are copied from the first registered query (queries[0]). */
+      const struct brw_perf_query_info *copy_query =
+         &brw->perfquery.queries[0]; /* NOTE(review): presumably a regular OA query was registered before this one -- confirm against the caller */
+
+      query->gpu_time_offset = copy_query->gpu_time_offset;
+      query->gpu_clock_offset = copy_query->gpu_clock_offset;
+      query->a_offset = copy_query->a_offset;
+      query->b_offset = copy_query->b_offset;
+      query->c_offset = copy_query->c_offset;
+   }
+}
+
+void
+brw_perf_query_register_mdapi_statistic_query(struct brw_context *brw)
+{
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   /* Append the pipeline statistics query "Intel_Raw_Pipeline_Statistics_Query"; only done for gen 7 to 9. No guid is assigned here, unlike for the raw OA query. */
+   if (!(devinfo->gen >= 7 && devinfo->gen <= 9))
+      return;
+
+   struct brw_perf_query_info *query = brw_perf_query_append_query_info(brw);
+
+   query->kind = PIPELINE_STATS;
+   query->name = "Intel_Raw_Pipeline_Statistics_Query";
+   query->n_counters = 0;
+   query->counters =
+      rzalloc_array(brw, struct brw_perf_query_counter, MAX_STAT_COUNTERS); /* NOTE(review): ralloc parent is brw here, while the raw OA query allocates from brw->perfquery.queries */
+
+   /* The order has to match mdapi_pipeline_metrics. */
+   brw_perf_query_info_add_basic_stat_reg(query, IA_VERTICES_COUNT,
+                                          "N vertices submitted");
+   brw_perf_query_info_add_basic_stat_reg(query, IA_PRIMITIVES_COUNT,
+                                          "N primitives submitted");
+   brw_perf_query_info_add_basic_stat_reg(query, VS_INVOCATION_COUNT,
+                                          "N vertex shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, GS_INVOCATION_COUNT,
+                                          "N geometry shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, GS_PRIMITIVES_COUNT,
+                                          "N geometry shader primitives emitted");
+   brw_perf_query_info_add_basic_stat_reg(query, CL_INVOCATION_COUNT,
+                                          "N primitives entering clipping");
+   brw_perf_query_info_add_basic_stat_reg(query, CL_PRIMITIVES_COUNT,
+                                          "N primitives leaving clipping");
+   if (devinfo->is_haswell || devinfo->gen == 8) {
+      brw_perf_query_info_add_stat_reg(query, PS_INVOCATION_COUNT, 1, 4, /* 1, 4 instead of the basic helper's 1, 1 -- presumably a numerator/denominator scale; confirm */
+                                       "N fragment shader invocations",
+                                       "N fragment shader invocations");
+   } else {
+      brw_perf_query_info_add_basic_stat_reg(query, PS_INVOCATION_COUNT,
+                                             "N fragment shader invocations");
+   }
+   brw_perf_query_info_add_basic_stat_reg(query, HS_INVOCATION_COUNT,
+                                          "N TCS shader invocations");
+   brw_perf_query_info_add_basic_stat_reg(query, DS_INVOCATION_COUNT,
+                                          "N TES shader invocations");
+   if (devinfo->gen >= 7) { /* always true here: anything below gen 7 returned at the top */
+      brw_perf_query_info_add_basic_stat_reg(query, CS_INVOCATION_COUNT,
+                                             "N compute shader invocations");
+   }
+
+   query->data_size = sizeof(uint64_t) * query->n_counters; /* result size is one uint64_t per registered counter */
+}
diff --git a/src/mesa/drivers/dri/i965/brw_performance_query_metrics.h b/src/mesa/drivers/dri/i965/brw_performance_query_metrics.h
new file mode 100644
index 0000000..80d7ddc
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_performance_query_metrics.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_PERFORMANCE_QUERY_METRICS_H
+#define BRW_PERFORMANCE_QUERY_METRICS_H
+
+#include <stdint.h>
+
+struct brw_pipeline_stat /* member of the union in struct brw_perf_query_counter */
+{
+   uint32_t reg; /* presumably the statistics register to read (e.g. IA_VERTICES_COUNT) -- confirm against the code that consumes it */
+   uint32_t numerator; /* presumably scales the raw register value together with denominator -- confirm */
+   uint32_t denominator;
+};
+
+struct brw_perf_query_counter /* NOTE(review): uses GLenum, size_t, struct brw_context and struct brw_perf_query_info, but this header only includes <stdint.h>; it relies on its includers for those */
+{
+   const char *name;
+   const char *desc;
+   GLenum type;
+   GLenum data_type; /* the MDAPI code stores GL_PERFQUERY_COUNTER_DATA_*_INTEL values here */
+   uint64_t raw_max;
+   size_t offset; /* the MDAPI code asserts offset + size <= the owning query's data_size */
+   size_t size;
+
+   union { /* anonymous union: either an OA read callback or a pipeline statistic description; which one applies presumably depends on the owning query's kind -- confirm */
+      uint64_t (*oa_counter_read_uint64)(struct brw_context *brw,
+                                         const struct brw_perf_query_info *query,
+                                         uint64_t *accumulator);
+      float (*oa_counter_read_float)(struct brw_context *brw,
+                                     const struct brw_perf_query_info *query,
+                                     uint64_t *accumulator);
+      struct brw_pipeline_stat pipeline_stat;
+   };
+};
+
+#endif /* BRW_PERFORMANCE_QUERY_METRICS_H */
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index e31d625..122ac26 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -544,29 +544,17 @@
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
-   if (brw->batch.ring == BLT_RING && devinfo->gen >= 6) {
-      const unsigned n_dwords = devinfo->gen >= 8 ? 5 : 4;
-      BEGIN_BATCH_BLT(n_dwords);
-      OUT_BATCH(MI_FLUSH_DW | (n_dwords - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      if (n_dwords == 5)
-         OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
-      if (devinfo->gen >= 6) {
-         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
-                  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
-                  PIPE_CONTROL_DATA_CACHE_FLUSH |
-                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
-                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
-                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
-                  PIPE_CONTROL_CS_STALL;
-      }
-      brw_emit_pipe_control_flush(brw, flags);
+   int flags = PIPE_CONTROL_RENDER_TARGET_FLUSH;
+   if (devinfo->gen >= 6) {
+      flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
+               PIPE_CONTROL_CONST_CACHE_INVALIDATE |
+               PIPE_CONTROL_DATA_CACHE_FLUSH |
+               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+               PIPE_CONTROL_VF_CACHE_INVALIDATE |
+               PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+               PIPE_CONTROL_CS_STALL;
    }
+   brw_emit_pipe_control_flush(brw, flags);
 }
 
 int
@@ -580,7 +568,8 @@
     * the gen6 workaround because it involves actually writing to
     * the buffer, and the kernel doesn't let us write to the batch.
     */
-   brw->workaround_bo = brw_bo_alloc(brw->bufmgr, "workaround", 4096);
+   brw->workaround_bo = brw_bo_alloc(brw->bufmgr, "workaround", 4096,
+                                     BRW_MEMZONE_OTHER);
    if (brw->workaround_bo == NULL)
       return -ENOMEM;
 
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index fc77926..7adb75d 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -41,7 +41,6 @@
 #include "util/ralloc.h"
 #include "compiler/glsl/ir.h"
 #include "compiler/glsl/glsl_to_nir.h"
-#include "compiler/nir/nir_serialize.h"
 
 #include "brw_program.h"
 #include "brw_context.h"
@@ -49,6 +48,11 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 
+#include "brw_cs.h"
+#include "brw_gs.h"
+#include "brw_vs.h"
+#include "brw_wm.h"
+
 static bool
 brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
 {
@@ -70,6 +74,7 @@
                gl_shader_stage stage,
                bool is_scalar)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    struct gl_context *ctx = &brw->ctx;
    const nir_shader_compiler_options *options =
       ctx->Const.ShaderCompilerOptions[stage].NirOptions;
@@ -77,10 +82,11 @@
 
    /* First, lower the GLSL/Mesa IR or SPIR-V to NIR */
    if (shader_prog) {
-      if (shader_prog->_LinkedShaders[stage]->spirv_data)
+      if (shader_prog->data->spirv) {
          nir = _mesa_spirv_to_nir(ctx, shader_prog, stage, options);
-      else
+      } else {
          nir = glsl_to_nir(shader_prog, stage, options);
+      }
       assert (nir);
 
       nir_remove_dead_variables(nir, nir_var_shader_in | nir_var_shader_out);
@@ -94,32 +100,26 @@
    }
    nir_validate_shader(nir);
 
-   /* Lower PatchVerticesIn from system value to uniform. This needs to
-    * happen before brw_preprocess_nir, since that will lower system values
-    * to intrinsics.
-    *
-    * We only do this for TES if no TCS is present, since otherwise we know
-    * the number of vertices in the patch at link time and we can lower it
-    * directly to a constant. We do this in nir_lower_patch_vertices, which
-    * needs to run after brw_nir_preprocess has turned the system values
-    * into intrinsics.
-    */
-   const bool lower_patch_vertices_in_to_uniform =
-      (stage == MESA_SHADER_TESS_CTRL && brw->screen->devinfo.gen >= 8) ||
-      (stage == MESA_SHADER_TESS_EVAL &&
-       !shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
-
-   if (lower_patch_vertices_in_to_uniform)
-      brw_nir_lower_patch_vertices_in_to_uniform(nir);
-
    nir = brw_preprocess_nir(brw->screen->compiler, nir);
 
-   if (stage == MESA_SHADER_TESS_EVAL && !lower_patch_vertices_in_to_uniform) {
-      assert(shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]);
-      struct gl_linked_shader *linked_tcs =
+   if (stage == MESA_SHADER_TESS_CTRL) {
+      /* Lower gl_PatchVerticesIn from a sys. value to a uniform on Gen8+. */
+      static const gl_state_index16 tokens[STATE_LENGTH] =
+         { STATE_INTERNAL, STATE_TCS_PATCH_VERTICES_IN };
+      nir_lower_patch_vertices(nir, 0, devinfo->gen >= 8 ? tokens : NULL);
+   }
+
+   if (stage == MESA_SHADER_TESS_EVAL) {
+      /* Lower gl_PatchVerticesIn to a constant if we have a TCS, or
+       * a uniform if we don't.
+       */
+      struct gl_linked_shader *tcs =
          shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
-      uint32_t patch_vertices = linked_tcs->Program->info.tess.tcs_vertices_out;
-      nir_lower_tes_patch_vertices(nir, patch_vertices);
+      uint32_t static_patch_vertices =
+         tcs ? tcs->Program->info.tess.tcs_vertices_out : 0;
+      static const gl_state_index16 tokens[STATE_LENGTH] =
+         { STATE_INTERNAL, STATE_TES_PATCH_VERTICES_IN };
+      nir_lower_patch_vertices(nir, static_patch_vertices, tokens);
    }
 
    if (stage == MESA_SHADER_FRAGMENT) {
@@ -348,7 +348,8 @@
    }
 
    if (!old_bo) {
-      *scratch_bo = brw_bo_alloc(brw->bufmgr, "scratch bo", size);
+      *scratch_bo =
+         brw_bo_alloc(brw->bufmgr, "scratch bo", size, BRW_MEMZONE_SCRATCH);
    }
 }
 
@@ -443,7 +444,7 @@
 
    stage_state->scratch_bo =
       brw_bo_alloc(brw->bufmgr, "shader scratch space",
-                   per_thread_size * thread_count);
+                   per_thread_size * thread_count, BRW_MEMZONE_SCRATCH);
 }
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
@@ -472,7 +473,8 @@
    const int max_entries = 2048;
    brw->shader_time.bo =
       brw_bo_alloc(brw->bufmgr, "shader time",
-                   max_entries * BRW_SHADER_TIME_STRIDE * 3);
+                   max_entries * BRW_SHADER_TIME_STRIDE * 3,
+                   BRW_MEMZONE_OTHER);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
@@ -538,6 +540,7 @@
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
+      case ST_FS32:
       case ST_CS:
          written = brw->shader_time.cumulative[i].written;
          reset = brw->shader_time.cumulative[i].reset;
@@ -566,6 +569,7 @@
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
+      case ST_FS32:
       case ST_CS:
          total_by_type[type] += scaled[i];
          break;
@@ -615,6 +619,9 @@
       case ST_FS16:
          stage = "fs16";
          break;
+      case ST_FS32:
+         stage = "fs32";
+         break;
       case ST_CS:
          stage = "cs";
          break;
@@ -634,6 +641,7 @@
    print_shader_time_line("total", "gs", 0, total_by_type[ST_GS], total);
    print_shader_time_line("total", "fs8", 0, total_by_type[ST_FS8], total);
    print_shader_time_line("total", "fs16", 0, total_by_type[ST_FS16], total);
+   print_shader_time_line("total", "fs32", 0, total_by_type[ST_FS32], total);
    print_shader_time_line("total", "cs", 0, total_by_type[ST_CS], total);
 }
 
@@ -731,11 +739,10 @@
 }
 
 void
-brw_setup_tex_for_precompile(struct brw_context *brw,
+brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                              struct brw_sampler_prog_key_data *tex,
                              struct gl_program *prog)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    const bool has_shader_channel_select = devinfo->is_haswell || devinfo->gen >= 8;
    unsigned sampler_count = util_last_bit(prog->SamplersUsed);
    for (unsigned i = 0; i < sampler_count; i++) {
@@ -832,34 +839,47 @@
 }
 
 void
-brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
+brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
+                    unsigned id)
 {
-   struct blob writer;
-   blob_init(&writer);
-   nir_serialize(&writer, prog->nir);
-   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
-   memcpy(prog->driver_cache_blob, writer.data, writer.size);
-   prog->driver_cache_blob_size = writer.size;
-   blob_finish(&writer);
+   static const unsigned stage_offsets[] = {
+      offsetof(struct brw_vs_prog_key, program_string_id),
+      offsetof(struct brw_tcs_prog_key, program_string_id),
+      offsetof(struct brw_tes_prog_key, program_string_id),
+      offsetof(struct brw_gs_prog_key, program_string_id),
+      offsetof(struct brw_wm_prog_key, program_string_id),
+      offsetof(struct brw_cs_prog_key, program_string_id),
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(stage_offsets));
+   *(unsigned*)((uint8_t*)key + stage_offsets[stage]) = id;
 }
 
 void
-brw_program_deserialize_nir(struct gl_context *ctx, struct gl_program *prog,
-                            gl_shader_stage stage)
+brw_populate_default_key(const struct gen_device_info *devinfo,
+                         union brw_any_prog_key *prog_key,
+                         struct gl_shader_program *sh_prog,
+                         struct gl_program *prog)
 {
-   if (!prog->nir) {
-      assert(prog->driver_cache_blob && prog->driver_cache_blob_size > 0);
-      const struct nir_shader_compiler_options *options =
-         ctx->Const.ShaderCompilerOptions[stage].NirOptions;
-      struct blob_reader reader;
-      blob_reader_init(&reader, prog->driver_cache_blob,
-                       prog->driver_cache_blob_size);
-      prog->nir = nir_deserialize(NULL, options, &reader);
-   }
-
-   if (prog->driver_cache_blob) {
-      ralloc_free(prog->driver_cache_blob);
-      prog->driver_cache_blob = NULL;
-      prog->driver_cache_blob_size = 0;
+   switch (prog->info.stage) {
+   case MESA_SHADER_VERTEX:
+      brw_vs_populate_default_key(devinfo, &prog_key->vs, prog);
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      brw_tcs_populate_default_key(devinfo, &prog_key->tcs, sh_prog, prog);
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      brw_tes_populate_default_key(devinfo, &prog_key->tes, sh_prog, prog);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      brw_gs_populate_default_key(devinfo, &prog_key->gs, prog);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      brw_wm_populate_default_key(devinfo, &prog_key->wm, prog);
+      break;
+   case MESA_SHADER_COMPUTE:
+      brw_cs_populate_default_key(devinfo, &prog_key->cs, prog);
+      break;
+   default:
+      unreachable("Unsupported stage!");
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h
index 701b8da..32c87fa 100644
--- a/src/mesa/drivers/dri/i965/brw_program.h
+++ b/src/mesa/drivers/dri/i965/brw_program.h
@@ -32,6 +32,8 @@
 #endif
 
 struct brw_context;
+struct blob;
+struct blob_reader;
 
 enum brw_param_domain {
    BRW_PARAM_DOMAIN_BUILTIN = 0,
@@ -64,7 +66,7 @@
 
 void brw_shader_gather_info(nir_shader *nir, struct gl_program *prog);
 
-void brw_setup_tex_for_precompile(struct brw_context *brw,
+void brw_setup_tex_for_precompile(const struct gen_device_info *devinfo,
                                   struct brw_sampler_prog_key_data *tex,
                                   struct gl_program *prog);
 
@@ -82,6 +84,16 @@
                                         uint32_t next_binding_table_offset);
 
 void
+brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage stage,
+                    unsigned id);
+
+void
+brw_populate_default_key(const struct gen_device_info *devinfo,
+                         union brw_any_prog_key *prog_key,
+                         struct gl_shader_program *sh_prog,
+                         struct gl_program *prog);
+
+void
 brw_stage_prog_data_free(const void *prog_data);
 
 void
@@ -103,9 +115,25 @@
 void brw_upload_tcs_prog(struct brw_context *brw);
 void brw_tcs_populate_key(struct brw_context *brw,
                           struct brw_tcs_prog_key *key);
+void brw_tcs_populate_default_key(const struct gen_device_info *devinfo,
+                                  struct brw_tcs_prog_key *key,
+                                  struct gl_shader_program *sh_prog,
+                                  struct gl_program *prog);
 void brw_upload_tes_prog(struct brw_context *brw);
 void brw_tes_populate_key(struct brw_context *brw,
                           struct brw_tes_prog_key *key);
+void brw_tes_populate_default_key(const struct gen_device_info *devinfo,
+                                  struct brw_tes_prog_key *key,
+                                  struct gl_shader_program *sh_prog,
+                                  struct gl_program *prog);
+
+void brw_write_blob_program_data(struct blob *binary, gl_shader_stage stage,
+                                 const void *program,
+                                 struct brw_stage_prog_data *prog_data);
+bool brw_read_blob_program_data(struct blob_reader *binary,
+                                struct gl_program *prog, gl_shader_stage stage,
+                                const uint8_t **program,
+                                struct brw_stage_prog_data *prog_data);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_program_binary.c b/src/mesa/drivers/dri/i965/brw_program_binary.c
index f1b327d..db03332 100644
--- a/src/mesa/drivers/dri/i965/brw_program_binary.c
+++ b/src/mesa/drivers/dri/i965/brw_program_binary.c
@@ -23,11 +23,13 @@
 
 #include <stdint.h>
 
+#include "compiler/nir/nir_serialize.h"
 #include "util/build_id.h"
 #include "util/mesa-sha1.h"
 
 #include "brw_context.h"
 #include "brw_program.h"
+#include "brw_state.h"
 
 static uint8_t driver_sha1[20];
 
@@ -60,6 +62,176 @@
    memcpy(sha1, driver_sha1, sizeof(uint8_t) * 20);
 }
 
+enum driver_cache_blob_part {
+   END_PART,   /* Terminator: must be the last thing in the blob. */
+   GEN_PART,   /* uint32_t size, then prog key + program data. */
+   NIR_PART,   /* uint32_t size, then the nir_serialize() output. */
+};
+
+static bool
+blob_parts_valid(void *blob, uint32_t size)
+{
+   /* Walks the part list: each part is a uint32_t type and, except for
+    * END_PART, a uint32_t byte size followed by that many payload bytes.
+    * Valid means every type is known and END_PART ends the blob exactly.
+    */
+   struct blob_reader walker;
+   blob_reader_init(&walker, blob, size);
+
+   for (;;) {
+      const uint32_t type = blob_read_uint32(&walker);
+      if (walker.overrun)
+         return false;
+      if (type == END_PART)
+         return walker.current == walker.end;
+      if (type != GEN_PART && type != NIR_PART)
+         return false;
+      /* Read the uint32_t part-size and skip over the payload. */
+      const uint32_t payload_size = blob_read_uint32(&walker);
+      blob_skip_bytes(&walker, payload_size);
+      if (walker.overrun)
+         return false;
+   }
+}
+
+static bool
+blob_has_part(void *blob, uint32_t size, enum driver_cache_blob_part part)
+{
+   /* Scans part headers up to END_PART; the blob must already be valid. */
+   assert(blob_parts_valid(blob, size));
+   struct blob_reader walker;
+   blob_reader_init(&walker, blob, size);
+   uint32_t type;
+   while ((type = blob_read_uint32(&walker)) != END_PART) {
+      if (type == part)
+         return true;
+      /* Not the wanted part: skip its size-prefixed payload. */
+      blob_skip_bytes(&walker, blob_read_uint32(&walker));
+   }
+   return false;
+}
+
+static bool
+driver_blob_is_ready(void *blob, uint32_t size, bool with_gen_program)
+{
+   /* Ready means: a blob exists, and whether it carries a GEN_PART matches
+    * what the caller asked for via with_gen_program.
+    */
+   if (!blob)
+      return false;
+   if (!blob_parts_valid(blob, size)) {
+      unreachable("Driver blob format is bad!");
+      return false;
+   }
+   return blob_has_part(blob, size, GEN_PART) == with_gen_program;
+}
+
+static void
+serialize_nir_part(struct blob *writer, struct gl_program *prog)
+{
+   blob_write_uint32(writer, NIR_PART);   /* tag, size, then payload */
+   const intptr_t size_slot = blob_reserve_uint32(writer);
+   const size_t payload_begin = writer->size;
+   nir_serialize(writer, prog->nir);
+   blob_overwrite_uint32(writer, size_slot, writer->size - payload_begin);
+}
+
+void
+brw_program_serialize_nir(struct gl_context *ctx, struct gl_program *prog)
+{
+   if (driver_blob_is_ready(prog->driver_cache_blob,
+                            prog->driver_cache_blob_size, false))
+      return;
+
+   ralloc_free(prog->driver_cache_blob);   /* no-op when already NULL */
+   struct blob writer;
+   blob_init(&writer);
+   serialize_nir_part(&writer, prog);
+   blob_write_uint32(&writer, END_PART);
+   /* On allocation failure leave no blob (size 0) instead of crashing. */
+   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
+   if (prog->driver_cache_blob)
+      memcpy(prog->driver_cache_blob, writer.data, writer.size);
+   prog->driver_cache_blob_size = prog->driver_cache_blob ? writer.size : 0;
+   blob_finish(&writer);
+}
+
+static bool
+deserialize_gen_program(struct blob_reader *reader, struct gl_context *ctx,
+                        struct gl_program *prog, gl_shader_stage stage)
+{
+   /* Payload layout: the program key, then everything that
+    * brw_read_blob_program_data() consumes.  The stored key gets this
+    * program's id before being handed to brw_upload_cache().
+    */
+   struct brw_context *brw = brw_context(ctx);
+   const enum brw_cache_id cache_id = brw_stage_cache_id(stage);
+
+   union brw_any_prog_key key;
+   blob_copy_bytes(reader, &key, brw_prog_key_size(stage));
+   brw_prog_key_set_id(&key, stage, brw_program(prog)->id);
+
+   const uint8_t *kernel;
+   struct brw_stage_prog_data *staged =
+      ralloc_size(NULL, sizeof(union brw_any_prog_data));
+   const bool loaded =
+      brw_read_blob_program_data(reader, prog, stage, &kernel, staged);
+   if (loaded) {
+      uint32_t cache_offset;
+      void *cached_prog_data;
+      brw_upload_cache(&brw->cache, cache_id, &key, brw_prog_key_size(stage),
+                       kernel, staged->program_size, staged,
+                       brw_prog_data_size(stage), &cache_offset,
+                       &cached_prog_data);
+   }
+   ralloc_free(staged);
+   return loaded;
+}
+
+void
+brw_program_deserialize_driver_blob(struct gl_context *ctx,
+                                    struct gl_program *prog,
+                                    gl_shader_stage stage)
+{
+   /* Consumes prog->driver_cache_blob part by part: a GEN_PART goes to
+    * deserialize_gen_program(), a NIR_PART becomes prog->nir.  The blob is
+    * freed and cleared once END_PART is reached.  No-op without a blob.
+    */
+   if (!prog->driver_cache_blob)
+      return;
+
+   struct blob_reader reader;
+   blob_reader_init(&reader, prog->driver_cache_blob,
+                    prog->driver_cache_blob_size);
+
+   for (;;) {
+      const uint32_t type = blob_read_uint32(&reader);
+      if (type == END_PART)
+         break;
+
+      if (type == GEN_PART) {
+         const uint32_t gen_bytes = blob_read_uint32(&reader);
+         assert(!reader.overrun &&
+                (uintptr_t)(reader.end - reader.current) > gen_bytes);
+         deserialize_gen_program(&reader, ctx, prog, stage);
+      } else if (type == NIR_PART) {
+         const uint32_t nir_bytes = blob_read_uint32(&reader);
+         assert(!reader.overrun &&
+                (uintptr_t)(reader.end - reader.current) > nir_bytes);
+         const struct nir_shader_compiler_options *nir_options =
+            ctx->Const.ShaderCompilerOptions[stage].NirOptions;
+         prog->nir = nir_deserialize(NULL, nir_options, &reader);
+      } else {
+         unreachable("Unsupported blob part type!");
+      }
+   }
+
+   /* Every part has been consumed; drop the blob. */
+   ralloc_free(prog->driver_cache_blob);
+   prog->driver_cache_blob = NULL;
+   prog->driver_cache_blob_size = 0;
+}
+
 /* This is just a wrapper around brw_program_deserialize_nir() as i965
  * doesn't need gl_shader_program like other drivers do.
  */
@@ -68,5 +240,114 @@
                                struct gl_shader_program *shProg,
                                struct gl_program *prog)
 {
-   brw_program_deserialize_nir(ctx, prog, prog->info.stage);
+   brw_program_deserialize_driver_blob(ctx, prog, prog->info.stage);
+}
+
+static void
+serialize_gen_part(struct blob *writer, struct gl_context *ctx,
+                   struct gl_shader_program *sh_prog,
+                   struct gl_program *prog)
+{
+   /* Emits a GEN_PART only on a brw_search_cache() hit for the default key. */
+   struct brw_context *brw = brw_context(ctx);
+   const gl_shader_stage stage = prog->info.stage;
+   union brw_any_prog_key key;
+   brw_populate_default_key(&brw->screen->devinfo, &key, sh_prog, prog);
+
+   uint32_t cache_offset = 0;
+   void *cached_prog_data = NULL;
+   if (!brw_search_cache(&brw->cache, brw_stage_cache_id(stage), &key,
+                         brw_prog_key_size(stage), &cache_offset,
+                         &cached_prog_data, false))
+      return;
+
+   /* TODO: Improve perf for non-LLC. It would be best to save it at program
+    * generation time when the program is in normal memory accessible with
+    * cache to the CPU. Another easier change would be to use
+    * _mesa_streaming_load_memcpy to read from the program mapped memory.
+    */
+   const void *kernel = brw->cache.map + cache_offset;
+   blob_write_uint32(writer, GEN_PART);
+   const intptr_t size_slot = blob_reserve_uint32(writer);
+   const size_t payload_begin = writer->size;
+   blob_write_bytes(writer, &key, brw_prog_key_size(stage));
+   brw_write_blob_program_data(writer, stage, kernel, cached_prog_data);
+   blob_overwrite_uint32(writer, size_slot, writer->size - payload_begin);
+}
+
+void
+brw_serialize_program_binary(struct gl_context *ctx,
+                             struct gl_shader_program *sh_prog,
+                             struct gl_program *prog)
+{
+   if (driver_blob_is_ready(prog->driver_cache_blob,
+                            prog->driver_cache_blob_size, true))
+      return;
+
+   /* If we loaded from the disk shader cache, then the nir might not have
+    * been deserialized yet; do that before the old blob is dropped.
+    */
+   if (prog->driver_cache_blob && !prog->nir)
+      brw_program_deserialize_driver_blob(ctx, prog, prog->info.stage);
+   ralloc_free(prog->driver_cache_blob);   /* no-op when already NULL */
+
+   struct blob writer;
+   blob_init(&writer);
+   serialize_nir_part(&writer, prog);
+   serialize_gen_part(&writer, ctx, sh_prog, prog);
+   blob_write_uint32(&writer, END_PART);
+
+   /* On allocation failure leave no blob (size 0) instead of crashing. */
+   prog->driver_cache_blob = ralloc_size(NULL, writer.size);
+   if (prog->driver_cache_blob)
+      memcpy(prog->driver_cache_blob, writer.data, writer.size);
+   prog->driver_cache_blob_size = prog->driver_cache_blob ? writer.size : 0;
+   blob_finish(&writer);
+}
+
+void
+brw_write_blob_program_data(struct blob *binary, gl_shader_stage stage,
+                            const void *program,
+                            struct brw_stage_prog_data *prog_data)
+{
+   /* Appends four records back to back: the stage-sized prog_data, the
+    * program bytes, the push params and the pull params.  prog_data is
+    * copied raw, so the param pointers stored inside it are meaningless;
+    * the arrays themselves follow as separate records.
+    * brw_read_blob_program_data() consumes them in this same order.
+    */
+   const size_t push_bytes = sizeof(uint32_t) * prog_data->nr_params;
+   const size_t pull_bytes = sizeof(uint32_t) * prog_data->nr_pull_params;
+
+   blob_write_bytes(binary, prog_data, brw_prog_data_size(stage));
+   blob_write_bytes(binary, program, prog_data->program_size);
+   blob_write_bytes(binary, prog_data->param, push_bytes);
+   blob_write_bytes(binary, prog_data->pull_param, pull_bytes);
+}
+
+bool
+brw_read_blob_program_data(struct blob_reader *binary, struct gl_program *prog,
+                           gl_shader_stage stage, const uint8_t **program,
+                           struct brw_stage_prog_data *prog_data)
+{
+   /* Read shader prog_data from blob. */
+   blob_copy_bytes(binary, prog_data, brw_prog_data_size(stage));
+   if (binary->overrun)
+      return false;
+   /* Read shader program, then push and pull params, from blob. */
+   *program = blob_read_bytes(binary, prog_data->program_size);
+   prog_data->param = rzalloc_array(NULL, uint32_t, prog_data->nr_params);
+   blob_copy_bytes(binary, prog_data->param,
+                   sizeof(uint32_t) * prog_data->nr_params);
+   prog_data->pull_param = rzalloc_array(NULL, uint32_t,
+                                         prog_data->nr_pull_params);
+   blob_copy_bytes(binary, prog_data->pull_param,
+                   sizeof(uint32_t) * prog_data->nr_pull_params);
+   if (binary->overrun) {
+      /* Parentless arrays: free them so a short read doesn't leak. */
+      ralloc_free(prog_data->param);
+      ralloc_free(prog_data->pull_param);
+      prog_data->param = prog_data->pull_param = NULL;
+   }
+   return !binary->overrun;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_program_cache.c b/src/mesa/drivers/dri/i965/brw_program_cache.c
index ce11f1d..600b061 100644
--- a/src/mesa/drivers/dri/i965/brw_program_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_program_cache.c
@@ -78,6 +78,21 @@
    struct brw_cache_item *next;
 };
 
+enum brw_cache_id
+brw_stage_cache_id(gl_shader_stage stage)
+{
+   static const enum brw_cache_id cache_id_for_stage[] = {
+      [MESA_SHADER_VERTEX]    = BRW_CACHE_VS_PROG,
+      [MESA_SHADER_TESS_CTRL] = BRW_CACHE_TCS_PROG,
+      [MESA_SHADER_TESS_EVAL] = BRW_CACHE_TES_PROG,
+      [MESA_SHADER_GEOMETRY]  = BRW_CACHE_GS_PROG,
+      [MESA_SHADER_FRAGMENT]  = BRW_CACHE_FS_PROG,
+      [MESA_SHADER_COMPUTE]   = BRW_CACHE_CS_PROG,
+   };
+   assert((int)stage >= 0 && stage < ARRAY_SIZE(cache_id_for_stage));
+   return cache_id_for_stage[stage];
+}
+
 static unsigned
 get_program_string_id(enum brw_cache_id cache_id, const void *key)
 {
@@ -179,12 +194,10 @@
  * Returns the buffer object matching cache_id and key, or NULL.
  */
 bool
-brw_search_cache(struct brw_cache *cache,
-                 enum brw_cache_id cache_id,
-                 const void *key, GLuint key_size,
-                 uint32_t *inout_offset, void *inout_prog_data)
+brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+                 const void *key, GLuint key_size, uint32_t *inout_offset,
+                 void *inout_prog_data, bool flag_state)
 {
-   struct brw_context *brw = cache->brw;
    struct brw_cache_item *item;
    struct brw_cache_item lookup;
    GLuint hash;
@@ -204,7 +217,8 @@
 
    if (item->offset != *inout_offset ||
        prog_data != *((void **) inout_prog_data)) {
-      brw->ctx.NewDriverState |= (1 << cache_id);
+      if (likely(flag_state))
+         cache->brw->ctx.NewDriverState |= (1 << cache_id);
       *inout_offset = item->offset;
       *((void **) inout_prog_data) = prog_data;
    }
@@ -221,9 +235,10 @@
    perf_debug("Copying to larger program cache: %u kB -> %u kB\n",
               (unsigned) cache->bo->size / 1024, new_size / 1024);
 
-   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size);
+   new_bo = brw_bo_alloc(brw->bufmgr, "program cache", new_size,
+                         BRW_MEMZONE_SHADER);
    if (can_do_exec_capture(brw->screen))
-      new_bo->kflags = EXEC_OBJECT_CAPTURE;
+      new_bo->kflags |= EXEC_OBJECT_CAPTURE;
 
    void *map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE |
                                        MAP_ASYNC | MAP_PERSISTENT);
@@ -388,9 +403,10 @@
    cache->items =
       calloc(cache->size, sizeof(struct brw_cache_item *));
 
-   cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384);
+   cache->bo = brw_bo_alloc(brw->bufmgr, "program cache", 16384,
+                            BRW_MEMZONE_SHADER);
    if (can_do_exec_capture(brw->screen))
-      cache->bo->kflags = EXEC_OBJECT_CAPTURE;
+      cache->bo->kflags |= EXEC_OBJECT_CAPTURE;
 
    cache->map = brw_bo_map(brw, cache->bo, MAP_READ | MAP_WRITE |
                                            MAP_ASYNC | MAP_PERSISTENT);
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index da3df6b..bc4b8c4 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -134,7 +134,7 @@
 			 struct brw_query_object *query)
 {
    struct brw_context *brw = brw_context(ctx);
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    int i;
    uint64_t *results;
@@ -261,7 +261,7 @@
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    assert(devinfo->gen < 6);
 
@@ -287,7 +287,8 @@
        * the system was doing other work, such as running other applications.
        */
       brw_bo_unreference(query->bo);
-      query->bo = brw_bo_alloc(brw->bufmgr, "timer query", 4096);
+      query->bo =
+         brw_bo_alloc(brw->bufmgr, "timer query", 4096, BRW_MEMZONE_OTHER);
       brw_write_timestamp(brw, query->bo, 0);
       break;
 
@@ -333,7 +334,7 @@
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    assert(devinfo->gen < 6);
 
@@ -387,7 +388,8 @@
 static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q)
 {
    struct brw_query_object *query = (struct brw_query_object *)q;
-   const struct gen_device_info *devinfo = &brw_context(ctx)->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo =
+      &brw_context(ctx)->screen->devinfo;
 
    assert(devinfo->gen < 6);
 
@@ -405,7 +407,7 @@
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_query_object *query = (struct brw_query_object *)q;
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    assert(devinfo->gen < 6);
 
@@ -435,7 +437,7 @@
 ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query)
 {
    struct brw_context *brw = brw_context(ctx);
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
 
    assert(devinfo->gen < 6);
 
@@ -449,7 +451,7 @@
          brw_queryobj_get_results(ctx, query);
       }
 
-      query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096);
+      query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096, BRW_MEMZONE_OTHER);
       query->last_index = 0;
    }
 }
@@ -529,7 +531,8 @@
    assert(q->Target == GL_TIMESTAMP);
 
    brw_bo_unreference(query->bo);
-   query->bo = brw_bo_alloc(brw->bufmgr, "timestamp query", 4096);
+   query->bo =
+      brw_bo_alloc(brw->bufmgr, "timestamp query", 4096, BRW_MEMZONE_OTHER);
    brw_write_timestamp(brw, query->bo, 0);
 
    query->flushed = false;
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index 37ce999..f4073fa 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -90,7 +90,7 @@
       return;
 
    /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   bool flip_y = ctx->DrawBuffer->FlipY;
 
    memset(&key, 0, sizeof(key));
 
@@ -137,7 +137,7 @@
     * Window coordinates in a FBO are inverted, which means point
     * sprite origin must be inverted, too.
     */
-   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+   if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
       key.sprite_origin_lower_left = true;
 
    /* BRW_NEW_FS_PROG_DATA */
@@ -161,12 +161,11 @@
        * face orientation, just as we invert the viewport in
        * sf_unit_create_from_key().
        */
-      key.frontface_ccw = brw->polygon_front_bit == render_to_fbo;
+      key.frontface_ccw = brw->polygon_front_bit != flip_y;
    }
 
-   if (!brw_search_cache(&brw->cache, BRW_CACHE_SF_PROG,
-			 &key, sizeof(key),
-			 &brw->sf.prog_offset, &brw->sf.prog_data)) {
+   if (!brw_search_cache(&brw->cache, BRW_CACHE_SF_PROG, &key, sizeof(key),
+                         &brw->sf.prog_offset, &brw->sf.prog_data, true)) {
       compile_sf_prog( brw, &key );
    }
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 9acb625..f6acf81 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -89,7 +89,6 @@
 extern const struct brw_tracked_state gen6_sol_surface;
 extern const struct brw_tracked_state gen6_sf_vp;
 extern const struct brw_tracked_state gen6_urb;
-extern const struct brw_tracked_state gen7_depthbuffer;
 extern const struct brw_tracked_state gen7_l3_state;
 extern const struct brw_tracked_state gen7_push_constant_space;
 extern const struct brw_tracked_state gen7_urb;
@@ -115,9 +114,6 @@
 uint32_t
 brw_depthbuffer_format(struct brw_context *brw);
 
-uint32_t
-brw_convert_depth_value(mesa_format format, float value);
-
 void brw_upload_state_base_address(struct brw_context *brw);
 
 /* gen8_depth_state.c */
@@ -167,11 +163,9 @@
                       GLuint aux_sz,
                       uint32_t *out_offset, void *out_aux);
 
-bool brw_search_cache(struct brw_cache *cache,
-                      enum brw_cache_id cache_id,
-                      const void *key,
-                      GLuint key_size,
-                      uint32_t *inout_offset, void *inout_aux);
+bool brw_search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
+                      const void *key, GLuint key_size, uint32_t *inout_offset,
+                      void *inout_aux, bool flag_state);
 
 const void *brw_find_previous_compile(struct brw_cache *cache,
                                       enum brw_cache_id cache_id,
@@ -184,11 +178,12 @@
 
 void brw_print_program_cache(struct brw_context *brw);
 
+enum brw_cache_id brw_stage_cache_id(gl_shader_stage stage);
+
 /* intel_batchbuffer.c */
 void brw_require_statebuffer_space(struct brw_context *brw, int size);
 void *brw_state_batch(struct brw_context *brw,
                       int size, int alignment, uint32_t *out_offset);
-uint32_t brw_state_batch_size(struct brw_context *brw, uint32_t offset);
 
 /* brw_wm_surface_state.c */
 uint32_t brw_get_surface_tiling_bits(uint32_t tiling);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index d8273aa..7574264 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -64,10 +64,6 @@
    brw_upload_invariant_state(brw);
 
    if (devinfo->gen == 10 || devinfo->gen == 11) {
-      brw_load_register_imm32(brw, GEN10_CACHE_MODE_SS,
-                              REG_MASK(GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE) |
-                              GEN10_FLOAT_BLEND_OPTIMIZATION_ENABLE);
-
       /* From gen10 workaround table in h/w specs:
        *
        *    "On 3DSTATE_3D_MODE, driver must always program bits 31:16 of DW1
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index fb592be..c2d99be 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -65,13 +65,4 @@
    } bits1;
 };
 
-struct gen5_sampler_default_color {
-   uint8_t ub[4];
-   float f[4];
-   uint16_t hf[4];
-   uint16_t us[4];
-   int16_t s[4];
-   uint8_t b[4];
-};
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 4ccb8ac..5361114 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -337,10 +337,9 @@
 
    brw_tcs_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG,
-                        &key, sizeof(key),
-                        &stage_state->prog_offset,
-                        &brw->tcs.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG, &key, sizeof(key),
+                        &stage_state->prog_offset, &brw->tcs.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_TESS_CTRL))
@@ -354,6 +353,37 @@
    assert(success);
 }
 
+void
+brw_tcs_populate_default_key(const struct gen_device_info *devinfo,
+                             struct brw_tcs_prog_key *key,
+                             struct gl_shader_program *sh_prog,
+                             struct gl_program *prog)
+{
+   /* Fills *key from devinfo, prog and, when one is linked, the TES. */
+   const struct gl_linked_shader *linked_tes =
+      sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
+
+   memset(key, 0, sizeof(*key));
+   key->program_string_id = brw_program(prog)->id;
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+
+   /* Guess that the input and output patches have the same dimensionality. */
+   if (devinfo->gen < 8)
+      key->input_vertices = prog->info.tess.tcs_vertices_out;
+
+   if (!linked_tes) {
+      key->tes_primitive_mode = GL_TRIANGLES;
+   } else {
+      const struct gl_program *tep = linked_tes->Program;
+      key->tes_primitive_mode = tep->info.tess.primitive_mode;
+      key->quads_workaround = devinfo->gen < 9 &&
+                              tep->info.tess.primitive_mode == GL_QUADS &&
+                              tep->info.tess.spacing == TESS_SPACING_EQUAL;
+   }
+
+   key->outputs_written = prog->nir->info.outputs_written;
+   key->patch_outputs_written = prog->nir->info.patch_outputs_written;
+}
 
 bool
 brw_tcs_precompile(struct gl_context *ctx,
@@ -369,31 +399,9 @@
    struct brw_program *btcp = brw_program(prog);
    const struct gl_linked_shader *tes =
       shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct brw_program *btep = tes ? brw_program(tes->Program) : NULL;
 
-   memset(&key, 0, sizeof(key));
-
-   key.program_string_id = btcp->id;
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
-
-   /* Guess that the input and output patches have the same dimensionality. */
-   if (devinfo->gen < 8)
-      key.input_vertices = prog->info.tess.tcs_vertices_out;
-
-   struct brw_program *btep;
-   if (tes) {
-      btep = brw_program(tes->Program);
-      key.tes_primitive_mode = tes->Program->info.tess.primitive_mode;
-      key.quads_workaround = devinfo->gen < 9 &&
-                             tes->Program->info.tess.primitive_mode == GL_QUADS &&
-                             tes->Program->info.tess.spacing == TESS_SPACING_EQUAL;
-   } else {
-      btep = NULL;
-      key.tes_primitive_mode = GL_TRIANGLES;
-   }
-
-   key.outputs_written = prog->nir->info.outputs_written;
-   key.patch_outputs_written = prog->nir->info.patch_outputs_written;
+   brw_tcs_populate_default_key(&brw->screen->devinfo, &key, shader_prog, prog);
 
    success = brw_codegen_tcs_prog(brw, btcp, btep, &key);
 
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
index b9573c8..b3220a9 100644
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -195,10 +195,9 @@
 
    brw_tes_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG,
-                        &key, sizeof(key),
-                        &stage_state->prog_offset,
-                        &brw->tes.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG, &key, sizeof(key),
+                        &stage_state->prog_offset, &brw->tes.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_TESS_EVAL))
@@ -211,6 +210,30 @@
    assert(success);
 }
 
+void
+brw_tes_populate_default_key(const struct gen_device_info *devinfo,
+                             struct brw_tes_prog_key *key,
+                             struct gl_shader_program *sh_prog,
+                             struct gl_program *prog)
+{
+   const struct gl_linked_shader *linked_tcs =
+      sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL];
+
+   memset(key, 0, sizeof(*key));
+   key->program_string_id = brw_program(prog)->id;
+   key->inputs_read = prog->nir->info.inputs_read;
+   key->patch_inputs_read = prog->nir->info.patch_inputs_read;
+
+   /* A linked TCS adds its outputs, minus the tess levels, to the key. */
+   if (linked_tcs) {
+      const struct gl_program *tcp = linked_tcs->Program;
+      key->inputs_read |= tcp->nir->info.outputs_written &
+         ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
+      key->patch_inputs_read |= tcp->nir->info.patch_outputs_written;
+   }
+
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+}
 
 bool
 brw_tes_precompile(struct gl_context *ctx,
@@ -225,21 +248,7 @@
 
    struct brw_program *btep = brw_program(prog);
 
-   memset(&key, 0, sizeof(key));
-
-   key.program_string_id = btep->id;
-   key.inputs_read = prog->nir->info.inputs_read;
-   key.patch_inputs_read = prog->nir->info.patch_inputs_read;
-
-   if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) {
-      struct gl_program *tcp =
-         shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program;
-      key.inputs_read |= tcp->nir->info.outputs_written &
-         ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
-      key.patch_inputs_read |= tcp->nir->info.patch_outputs_written;
-   }
-
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
+   brw_tes_populate_default_key(&brw->screen->devinfo, &key, shader_prog, prog);
 
    success = brw_codegen_tes_prog(brw, btep, &key);
 
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index a86fa78..d34240e 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -264,5 +264,5 @@
       while (--pad);
    }
 
-   intel_batchbuffer_data(brw, &uf, sizeof(uf), RENDER_RING);
+   intel_batchbuffer_data(brw, &uf, sizeof(uf));
 }
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 903c342..69c0046 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -69,7 +69,7 @@
    }
 }
 
-GLbitfield64
+static GLbitfield64
 brw_vs_outputs_written(struct brw_context *brw, struct brw_vs_prog_key *key,
                        GLbitfield64 user_varyings)
 {
@@ -341,9 +341,9 @@
 
    brw_vs_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG,
-                        &key, sizeof(key),
-                        &brw->vs.base.prog_offset, &brw->vs.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_VS_PROG, &key, sizeof(key),
+                        &brw->vs.base.prog_offset, &brw->vs.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_VERTEX))
@@ -356,6 +356,23 @@
    assert(success);
 }
 
+void
+brw_vs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_vs_prog_key *key,
+                            struct gl_program *prog)
+{
+   /* Clamp vertex colors iff any of COL0/COL1/BFC0/BFC1 is written. */
+   memset(key, 0, sizeof(*key));
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+   key->program_string_id = brw_program(prog)->id;
+
+   const bool writes_color =
+      (prog->info.outputs_written &
+       (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
+        VARYING_BIT_BFC0 | VARYING_BIT_BFC1)) != 0;
+   key->clamp_vertex_color = writes_color;
+}
+
 bool
 brw_vs_precompile(struct gl_context *ctx, struct gl_program *prog)
 {
@@ -367,14 +384,7 @@
 
    struct brw_program *bvp = brw_program(prog);
 
-   memset(&key, 0, sizeof(key));
-
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
-   key.program_string_id = bvp->id;
-   key.clamp_vertex_color =
-      (prog->info.outputs_written &
-       (VARYING_BIT_COL0 | VARYING_BIT_COL1 | VARYING_BIT_BFC0 |
-        VARYING_BIT_BFC1));
+   brw_vs_populate_default_key(&brw->screen->devinfo, &key, prog);
 
    success = brw_codegen_vs_prog(brw, bvp, &key);
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 6e052d7..94419f3 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -39,16 +39,16 @@
 extern "C" {
 #endif
 
-GLbitfield64
-brw_vs_outputs_written(struct brw_context *brw, struct brw_vs_prog_key *key,
-                       GLbitfield64 outputs_written);
-
 void
 brw_upload_vs_prog(struct brw_context *brw);
 
 void
 brw_vs_populate_key(struct brw_context *brw,
                     struct brw_vs_prog_key *key);
+void
+brw_vs_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_vs_prog_key *key,
+                            struct gl_program *prog);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index b280d5e..70fe384 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -164,18 +164,20 @@
       start_time = get_time();
    }
 
-   int st_index8 = -1, st_index16 = -1;
+   int st_index8 = -1, st_index16 = -1, st_index32 = -1;
    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
       st_index8 = brw_get_shader_time_index(brw, &fp->program, ST_FS8,
                                             !fp->program.is_arb_asm);
       st_index16 = brw_get_shader_time_index(brw, &fp->program, ST_FS16,
                                              !fp->program.is_arb_asm);
+      st_index32 = brw_get_shader_time_index(brw, &fp->program, ST_FS32,
+                                             !fp->program.is_arb_asm);
    }
 
    char *error_str = NULL;
    program = brw_compile_fs(brw->screen->compiler, brw, mem_ctx,
                             key, &prog_data, fp->program.nir,
-                            &fp->program, st_index8, st_index16,
+                            &fp->program, st_index8, st_index16, st_index32,
                             true, false, vue_map,
                             &error_str);
 
@@ -588,10 +590,9 @@
 
    brw_wm_populate_key(brw, &key);
 
-   if (brw_search_cache(&brw->cache, BRW_CACHE_FS_PROG,
-                        &key, sizeof(key),
-                        &brw->wm.base.prog_offset,
-                        &brw->wm.base.prog_data))
+   if (brw_search_cache(&brw->cache, BRW_CACHE_FS_PROG, &key, sizeof(key),
+                        &brw->wm.base.prog_offset, &brw->wm.base.prog_data,
+                        true))
       return;
 
    if (brw_disk_cache_upload_program(brw, MESA_SHADER_FRAGMENT))
@@ -605,6 +606,45 @@
    assert(success);
 }
 
+void
+brw_wm_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_wm_prog_key *key,
+                            struct gl_program *prog)
+{
+   memset(key, 0, sizeof(*key));
+   /* Default key: derived from devinfo and prog only, no context state. */
+   uint64_t outputs_written = prog->info.outputs_written;
+   /* Pre-gen6 only: seed iz_lookup from discard use and depth output. */
+   if (devinfo->gen < 6) {
+      if (prog->info.fs.uses_discard)
+         key->iz_lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
+
+      if (outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
+         key->iz_lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
+
+      /* Just assume depth testing and depth writes are enabled. */
+      key->iz_lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
+      key->iz_lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
+   }
+   /* Set input_slots_valid on pre-gen6 or with more than 16 varying inputs. */
+   if (devinfo->gen < 6 || _mesa_bitcount_64(prog->info.inputs_read &
+                                             BRW_FS_VARYING_INPUT_MASK) > 16) {
+      key->input_slots_valid = prog->info.inputs_read | VARYING_BIT_POS;
+   }
+
+   brw_setup_tex_for_precompile(devinfo, &key->tex, prog);
+   /* Count written outputs other than depth, stencil and sample mask. */
+   key->nr_color_regions = _mesa_bitcount_64(outputs_written &
+         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
+           BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
+           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
+
+   key->program_string_id = brw_program(prog)->id;
+   /* Whether reads from the framebuffer should behave coherently.  Note that
+    * brw_fs_precompile asserts this matches EXT_shader_framebuffer_fetch. */
+   key->coherent_fb_fetch = devinfo->gen >= 9;
+}
+
 bool
 brw_fs_precompile(struct gl_context *ctx, struct gl_program *prog)
 {
@@ -614,38 +654,11 @@
 
    struct brw_program *bfp = brw_program(prog);
 
-   memset(&key, 0, sizeof(key));
+   brw_wm_populate_default_key(&brw->screen->devinfo, &key, prog);
 
-   uint64_t outputs_written = prog->info.outputs_written;
-
-   if (devinfo->gen < 6) {
-      if (prog->info.fs.uses_discard)
-         key.iz_lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
-
-      if (outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
-         key.iz_lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
-
-      /* Just assume depth testing. */
-      key.iz_lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
-      key.iz_lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
-   }
-
-   if (devinfo->gen < 6 || _mesa_bitcount_64(prog->info.inputs_read &
-                                             BRW_FS_VARYING_INPUT_MASK) > 16) {
-      key.input_slots_valid = prog->info.inputs_read | VARYING_BIT_POS;
-   }
-
-   brw_setup_tex_for_precompile(brw, &key.tex, prog);
-
-   key.nr_color_regions = _mesa_bitcount_64(outputs_written &
-         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
-           BITFIELD64_BIT(FRAG_RESULT_STENCIL) |
-           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
-
-   key.program_string_id = bfp->id;
-
-   /* Whether reads from the framebuffer should behave coherently. */
-   key.coherent_fb_fetch = ctx->Extensions.EXT_shader_framebuffer_fetch;
+   /* The default key's coherent_fb_fetch must match the extension state. */
+   assert(key.coherent_fb_fetch ==
+          ctx->Extensions.EXT_shader_framebuffer_fetch);
 
    uint32_t old_prog_offset = brw->wm.base.prog_offset;
    struct brw_stage_prog_data *old_prog_data = brw->wm.base.prog_data;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 113cdf3..ea94497 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -47,6 +47,10 @@
 void
 brw_wm_populate_key(struct brw_context *brw,
                     struct brw_wm_prog_key *key);
+void
+brw_wm_populate_default_key(const struct gen_device_info *devinfo,
+                            struct brw_wm_prog_key *key,
+                            struct gl_program *prog);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index a0a4a4f0..644b940 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -155,6 +155,8 @@
    struct brw_bo *aux_bo = NULL;
    struct isl_surf *aux_surf = NULL;
    uint64_t aux_offset = 0;
+   struct brw_bo *clear_bo = NULL;
+   uint32_t clear_offset = 0;
 
    if (aux_usage != ISL_AUX_USAGE_NONE) {
       aux_surf = &mt->aux_buf->surf;
@@ -164,7 +166,10 @@
       /* We only really need a clear color if we also have an auxiliary
        * surface.  Without one, it does nothing.
        */
-      clear_color = mt->fast_clear_color;
+      clear_color =
+         intel_miptree_get_clear_color(devinfo, mt, view.format,
+                                       view.usage & ISL_SURF_USAGE_TEXTURE_BIT,
+                                       &clear_bo, &clear_offset);
    }
 
    void *state = brw_state_batch(brw,
@@ -172,15 +177,6 @@
                                  brw->isl_dev.ss.align,
                                  surf_offset);
 
-   bool use_clear_address = devinfo->gen >= 10 && aux_surf;
-
-   struct brw_bo *clear_bo = NULL;
-   uint32_t clear_offset = 0;
-   if (use_clear_address) {
-      clear_bo = mt->aux_buf->clear_color_bo;
-      clear_offset = mt->aux_buf->clear_color_offset;
-   }
-
    isl_surf_fill_state(&brw->isl_dev, state, .surf = &surf, .view = &view,
                        .address = brw_state_reloc(&brw->batch,
                                                   *surf_offset + brw->isl_dev.ss.addr_offset,
@@ -189,7 +185,7 @@
                        .aux_address = aux_offset,
                        .mocs = brw_get_bo_mocs(devinfo, mt->bo),
                        .clear_color = clear_color,
-                       .use_clear_address = use_clear_address,
+                       .use_clear_address = clear_bo != NULL,
                        .clear_address = clear_offset,
                        .x_offset_sa = tile_x, .y_offset_sa = tile_y);
    if (aux_surf) {
@@ -221,7 +217,7 @@
       }
    }
 
-   if (use_clear_address) {
+   if (clear_bo != NULL) {
       /* Make sure the offset is aligned with a cacheline. */
       assert((clear_offset & 0x3f) == 0);
       uint64_t *clear_address =
@@ -596,6 +592,12 @@
          .usage = ISL_SURF_USAGE_TEXTURE_BIT,
       };
 
+      /* On Ivy Bridge and earlier, we handle texture swizzle with shader
+       * code.  The actual surface swizzle should be identity.
+       */
+      if (devinfo->gen <= 7 && !devinfo->is_haswell)
+         view.swizzle = ISL_SWIZZLE_IDENTITY;
+
       if (obj->Target == GL_TEXTURE_CUBE_MAP ||
           obj->Target == GL_TEXTURE_CUBE_MAP_ARRAY)
          view.usage |= ISL_SURF_USAGE_CUBE_BIT;
@@ -1495,18 +1497,6 @@
    param->stride[0] = _mesa_get_format_bytes(u->_ActualFormat);
 }
 
-static unsigned
-get_image_num_layers(const struct intel_mipmap_tree *mt, GLenum target,
-                     unsigned level)
-{
-   if (target == GL_TEXTURE_CUBE_MAP)
-      return 6;
-
-   return target == GL_TEXTURE_3D ?
-      minify(mt->surf.logical_level0_px.depth, level) :
-      mt->surf.logical_level0_px.array_len;
-}
-
 static void
 update_image_surface(struct brw_context *brw,
                      struct gl_image_unit *u,
@@ -1538,14 +1528,29 @@
       } else {
          struct intel_texture_object *intel_obj = intel_texture_object(obj);
          struct intel_mipmap_tree *mt = intel_obj->mt;
-         const unsigned num_layers = u->Layered ?
-            get_image_num_layers(mt, obj->Target, u->Level) : 1;
+
+         unsigned base_layer, num_layers;
+         if (u->Layered) {
+            if (obj->Target == GL_TEXTURE_3D) {
+               base_layer = 0;
+               num_layers = minify(mt->surf.logical_level0_px.depth, u->Level);
+            } else {
+               assert(obj->Immutable || obj->MinLayer == 0);
+               base_layer = obj->MinLayer;
+               num_layers = obj->Immutable ?
+                                obj->NumLayers :
+                                mt->surf.logical_level0_px.array_len;
+            }
+         } else {
+            base_layer = obj->MinLayer + u->_Layer;
+            num_layers = 1;
+         }
 
          struct isl_view view = {
             .format = format,
             .base_level = obj->MinLevel + u->Level,
             .levels = 1,
-            .base_array_layer = obj->MinLayer + u->_Layer,
+            .base_array_layer = base_layer,
             .array_len = num_layers,
             .swizzle = ISL_SWIZZLE_IDENTITY,
             .usage = ISL_SURF_USAGE_STORAGE_BIT,
diff --git a/src/mesa/drivers/dri/i965/gen4_blorp_exec.h b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h
index e59bc9f..0edc518 100644
--- a/src/mesa/drivers/dri/i965/gen4_blorp_exec.h
+++ b/src/mesa/drivers/dri/i965/gen4_blorp_exec.h
@@ -132,17 +132,22 @@
 
          wm._8PixelDispatchEnable = prog_data->dispatch_8;
          wm._16PixelDispatchEnable = prog_data->dispatch_16;
+         wm._32PixelDispatchEnable = prog_data->dispatch_32;
 
 #if GEN_GEN == 4
          wm.KernelStartPointer0 =
             instruction_state_address(batch, params->wm_prog_kernel);
-         wm.GRFRegisterCount0 = prog_data->reg_blocks_0;
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
 #else
-         wm.KernelStartPointer0 = params->wm_prog_kernel;
-         wm.GRFRegisterCount0 = prog_data->reg_blocks_0;
-         wm.KernelStartPointer2 =
-            params->wm_prog_kernel + prog_data->prog_offset_2;
-         wm.GRFRegisterCount2 = prog_data->reg_blocks_2;
+         wm.KernelStartPointer0 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 0);
+         wm.KernelStartPointer1 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 1);
+         wm.KernelStartPointer2 = params->wm_prog_kernel +
+                                  brw_wm_prog_data_prog_offset(prog_data, wm, 2);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(prog_data, wm, 0);
+         wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(prog_data, wm, 1);
+         wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(prog_data, wm, 2);
 #endif
       }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_depth_state.c b/src/mesa/drivers/dri/i965/gen6_depth_state.c
deleted file mode 100644
index 8a1d580..0000000
--- a/src/mesa/drivers/dri/i965/gen6_depth_state.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-
-#include "intel_batchbuffer.h"
-#include "intel_fbo.h"
-#include "intel_mipmap_tree.h"
-
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-#include "main/mtypes.h"
-#include "main/fbobject.h"
-#include "main/glformats.h"
-
-void
-gen6_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset, uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y)
-{
-   struct gl_context *ctx = &brw->ctx;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   uint32_t surftype;
-   unsigned int depth = 1;
-   GLenum gl_target = GL_TEXTURE_2D;
-   unsigned int lod;
-   const struct intel_mipmap_tree *mt = depth_mt ? depth_mt : stencil_mt;
-   const struct intel_renderbuffer *irb = NULL;
-   const struct gl_renderbuffer *rb = NULL;
-
-   /* Enable the hiz bit if we're doing separate stencil, because it and the
-    * separate stencil bit must have the same value. From Section 2.11.5.6.1.1
-    * 3DSTATE_DEPTH_BUFFER, Bit 1.21 "Separate Stencil Enable":
-    *     [DevIL]: If this field is enabled, Hierarchical Depth Buffer
-    *     Enable must also be enabled.
-    *
-    *     [DevGT]: This field must be set to the same value (enabled or
-    *     disabled) as Hierarchical Depth Buffer Enable
-    */
-   bool enable_hiz_ss = hiz || separate_stencil;
-
-   brw_emit_depth_stall_flushes(brw);
-
-   irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   if (!irb)
-      irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-   rb = (struct gl_renderbuffer*) irb;
-
-   if (rb) {
-      depth = MAX2(irb->layer_count, 1);
-      if (rb->TexImage)
-         gl_target = rb->TexImage->TexObject->Target;
-   }
-
-   switch (gl_target) {
-   case GL_TEXTURE_CUBE_MAP_ARRAY:
-   case GL_TEXTURE_CUBE_MAP:
-      /* The PRM claims that we should use BRW_SURFACE_CUBE for this
-       * situation, but experiments show that gl_Layer doesn't work when we do
-       * this.  So we use BRW_SURFACE_2D, since for rendering purposes this is
-       * equivalent.
-       */
-      surftype = BRW_SURFACE_2D;
-      depth *= 6;
-      break;
-   case GL_TEXTURE_3D:
-      assert(mt);
-      depth = mt->surf.logical_level0_px.depth;
-      /* fallthrough */
-   default:
-      surftype = translate_tex_target(gl_target);
-      break;
-   }
-
-   const unsigned min_array_element = irb ? irb->mt_layer : 0;
-
-   lod = irb ? irb->mt_level - irb->mt->first_level : 0;
-
-   if (mt) {
-      width = mt->surf.logical_level0_px.width;
-      height = mt->surf.logical_level0_px.height;
-   }
-
-   BEGIN_BATCH(7);
-   /* 3DSTATE_DEPTH_BUFFER dw0 */
-   OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
-
-   /* 3DSTATE_DEPTH_BUFFER dw1 */
-   OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
-             (depthbuffer_format << 18) |
-             ((enable_hiz_ss ? 1 : 0) << 21) | /* separate stencil enable */
-             ((enable_hiz_ss ? 1 : 0) << 22) | /* hiz enable */
-             (BRW_TILEWALK_YMAJOR << 26) |
-             (1 << 27) |
-             (surftype << 29));
-
-   /* 3DSTATE_DEPTH_BUFFER dw2 */
-   if (depth_mt) {
-      OUT_RELOC(depth_mt->bo, RELOC_WRITE, 0);
-   } else {
-      OUT_BATCH(0);
-   }
-
-   /* 3DSTATE_DEPTH_BUFFER dw3 */
-   OUT_BATCH(((width - 1) << 6) |
-             ((height - 1) << 19) |
-             lod << 2);
-
-   /* 3DSTATE_DEPTH_BUFFER dw4 */
-   OUT_BATCH((depth - 1) << 21 |
-             min_array_element << 10 |
-             (depth - 1) << 1);
-
-   /* 3DSTATE_DEPTH_BUFFER dw5 */
-   OUT_BATCH(0);
-   assert(tile_x == 0 && tile_y == 0);
-
-   /* 3DSTATE_DEPTH_BUFFER dw6 */
-   OUT_BATCH(0);
-
-   ADVANCE_BATCH();
-
-   if (hiz || separate_stencil) {
-      /*
-       * In the 3DSTATE_DEPTH_BUFFER batch emitted above, the 'separate
-       * stencil enable' and 'hiz enable' bits were set. Therefore we must
-       * emit 3DSTATE_HIER_DEPTH_BUFFER and 3DSTATE_STENCIL_BUFFER. Even if
-       * there is no stencil buffer, 3DSTATE_STENCIL_BUFFER must be emitted;
-       * failure to do so causes hangs on gen5 and a stall on gen6.
-       */
-
-      /* Emit hiz buffer. */
-      if (hiz) {
-         assert(depth_mt);
-
-         uint32_t offset;
-         isl_surf_get_image_offset_B_tile_sa(&depth_mt->aux_buf->surf,
-                                             lod, 0, 0, &offset, NULL, NULL);
-
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(depth_mt->aux_buf->surf.row_pitch - 1);
-	 OUT_RELOC(depth_mt->aux_buf->bo, RELOC_WRITE, offset);
-	 ADVANCE_BATCH();
-      } else {
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-      }
-
-      /* Emit stencil buffer. */
-      if (separate_stencil) {
-         assert(stencil_mt->format == MESA_FORMAT_S_UINT8);
-         assert(stencil_mt->surf.size > 0);
-
-         uint32_t offset;
-         isl_surf_get_image_offset_B_tile_sa(&stencil_mt->surf,
-                                             lod, 0, 0, &offset, NULL, NULL);
-
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(stencil_mt->surf.row_pitch - 1);
-	 OUT_RELOC(stencil_mt->bo, RELOC_WRITE, offset);
-	 ADVANCE_BATCH();
-      } else {
-	 BEGIN_BATCH(3);
-	 OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
-	 OUT_BATCH(0);
-	 OUT_BATCH(0);
-	 ADVANCE_BATCH();
-      }
-   }
-
-   /*
-    * On Gen >= 6, emit clear params for safety. If using hiz, then clear
-    * params must be emitted.
-    *
-    * From Section 2.11.5.6.4.1 3DSTATE_CLEAR_PARAMS:
-    *     3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet
-    *     when HiZ is enabled and the DEPTH_BUFFER_STATE changes.
-    */
-   BEGIN_BATCH(2);
-   OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 |
-             GEN5_DEPTH_CLEAR_VALID |
-             (2 - 2));
-   if (depth_mt) {
-      OUT_BATCH(brw_convert_depth_value(depth_mt->format,
-                                        depth_mt->fast_clear_color.f32[0]));
-   } else {
-      OUT_BATCH(0);
-   }
-   ADVANCE_BATCH();
-}
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 75060d4..ce9bb47 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -329,7 +329,8 @@
 
    /* Since we're starting a new query, we need to throw away old results. */
    brw_bo_unreference(query->bo);
-   query->bo = brw_bo_alloc(brw->bufmgr, "query results", 4096);
+   query->bo =
+      brw_bo_alloc(brw->bufmgr, "query results", 4096, BRW_MEMZONE_OTHER);
 
    /* For ARB_query_buffer_object: The result is not available */
    set_query_availability(brw, query, false);
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 0c830c7..a2d2606 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -195,9 +195,11 @@
    _mesa_init_transform_feedback_object(&brw_obj->base, name);
 
    brw_obj->offset_bo =
-      brw_bo_alloc(brw->bufmgr, "transform feedback offsets", 16);
+      brw_bo_alloc(brw->bufmgr, "transform feedback offsets", 16,
+                   BRW_MEMZONE_OTHER);
    brw_obj->prim_count_bo =
-      brw_bo_alloc(brw->bufmgr, "xfb primitive counts", 16384);
+      brw_bo_alloc(brw->bufmgr, "xfb primitive counts", 16384,
+                   BRW_MEMZONE_OTHER);
 
    return &brw_obj->base;
 }
diff --git a/src/mesa/drivers/dri/i965/gen7_misc_state.c b/src/mesa/drivers/dri/i965/gen7_misc_state.c
deleted file mode 100644
index 1ce7658..0000000
--- a/src/mesa/drivers/dri/i965/gen7_misc_state.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * Copyright © 2011 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-#include "main/mtypes.h"
-#include "intel_batchbuffer.h"
-#include "intel_mipmap_tree.h"
-#include "intel_fbo.h"
-#include "brw_context.h"
-#include "brw_state.h"
-#include "brw_defines.h"
-
-void
-gen7_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset, uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gl_context *ctx = &brw->ctx;
-   const uint8_t mocs = GEN7_MOCS_L3;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   uint32_t surftype;
-   unsigned int depth = 1;
-   unsigned int min_array_element;
-   GLenum gl_target = GL_TEXTURE_2D;
-   unsigned int lod;
-   const struct intel_mipmap_tree *mt = depth_mt ? depth_mt : stencil_mt;
-   const struct intel_renderbuffer *irb = NULL;
-   const struct gl_renderbuffer *rb = NULL;
-
-   /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
-   if (!mt && brw->no_depth_or_stencil) {
-      assert(brw->hw_ctx);
-      return;
-   }
-
-   brw_emit_depth_stall_flushes(brw);
-
-   irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   if (!irb)
-      irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-   rb = (struct gl_renderbuffer*) irb;
-
-   if (rb) {
-      depth = MAX2(irb->layer_count, 1);
-      if (rb->TexImage)
-         gl_target = rb->TexImage->TexObject->Target;
-   }
-
-   switch (gl_target) {
-   case GL_TEXTURE_CUBE_MAP_ARRAY:
-   case GL_TEXTURE_CUBE_MAP:
-      /* The PRM claims that we should use BRW_SURFACE_CUBE for this
-       * situation, but experiments show that gl_Layer doesn't work when we do
-       * this.  So we use BRW_SURFACE_2D, since for rendering purposes this is
-       * equivalent.
-       */
-      surftype = BRW_SURFACE_2D;
-      depth *= 6;
-      break;
-   case GL_TEXTURE_3D:
-      assert(mt);
-      depth = mt->surf.logical_level0_px.depth;
-      /* fallthrough */
-   default:
-      surftype = translate_tex_target(gl_target);
-      break;
-   }
-
-   min_array_element = irb ? irb->mt_layer : 0;
-
-   lod = irb ? irb->mt_level - irb->mt->first_level : 0;
-
-   if (mt) {
-      width = mt->surf.logical_level0_px.width;
-      height = mt->surf.logical_level0_px.height;
-   }
-
-   /* _NEW_DEPTH, _NEW_STENCIL, _NEW_BUFFERS */
-   BEGIN_BATCH(7);
-   /* 3DSTATE_DEPTH_BUFFER dw0 */
-   OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
-
-   /* 3DSTATE_DEPTH_BUFFER dw1 */
-   OUT_BATCH((depth_mt ? depth_mt->surf.row_pitch - 1 : 0) |
-             (depthbuffer_format << 18) |
-             ((hiz ? 1 : 0) << 22) |
-             ((stencil_mt != NULL && brw->stencil_write_enabled) << 27) |
-             (brw_depth_writes_enabled(brw) << 28) |
-             (surftype << 29));
-
-   /* 3DSTATE_DEPTH_BUFFER dw2 */
-   if (depth_mt) {
-      OUT_RELOC(depth_mt->bo, RELOC_WRITE, 0);
-   } else {
-      OUT_BATCH(0);
-   }
-
-   /* 3DSTATE_DEPTH_BUFFER dw3 */
-   OUT_BATCH(((width - 1) << 4) |
-             ((height - 1) << 18) |
-             lod);
-
-   /* 3DSTATE_DEPTH_BUFFER dw4 */
-   OUT_BATCH(((depth - 1) << 21) |
-             (min_array_element << 10) |
-             mocs);
-
-   /* 3DSTATE_DEPTH_BUFFER dw5 */
-   OUT_BATCH(0);
-
-   /* 3DSTATE_DEPTH_BUFFER dw6 */
-   OUT_BATCH((depth - 1) << 21);
-   ADVANCE_BATCH();
-
-   if (!hiz) {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      assert(depth_mt);
-
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
-      OUT_BATCH((mocs << 25) |
-                (depth_mt->aux_buf->pitch - 1));
-      OUT_RELOC(depth_mt->aux_buf->bo, RELOC_WRITE, 0);
-      ADVANCE_BATCH();
-   }
-
-   if (stencil_mt == NULL) {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      stencil_mt->r8stencil_needs_update = true;
-      const int enabled = devinfo->is_haswell ? HSW_STENCIL_ENABLED : 0;
-
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
-      OUT_BATCH(enabled |
-                mocs << 25 |
-	        (stencil_mt->surf.row_pitch - 1));
-      OUT_RELOC(stencil_mt->bo, RELOC_WRITE, 0);
-      ADVANCE_BATCH();
-   }
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
-   if (depth_mt) {
-      OUT_BATCH(brw_convert_depth_value(depth_mt->format,
-                                        depth_mt->fast_clear_color.f32[0]));
-   } else {
-      OUT_BATCH(0);
-   }
-   OUT_BATCH(1);
-   ADVANCE_BATCH();
-
-   brw->no_depth_or_stencil = !mt;
-}
-
-/**
- * \see brw_context.state.depth_region
- */
-const struct brw_tracked_state gen7_depthbuffer = {
-   .dirty = {
-      .mesa = _NEW_BUFFERS |
-              _NEW_DEPTH |
-              _NEW_STENCIL,
-      .brw = BRW_NEW_BATCH |
-             BRW_NEW_BLORP,
-   },
-   .emit = brw_emit_depthbuffer,
-};
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 0fc0273..1ea5884 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -32,184 +32,6 @@
 #include "main/framebuffer.h"
 
 /**
- * Helper function to emit depth related command packets.
- */
-static void
-emit_depth_packets(struct brw_context *brw,
-                   struct intel_mipmap_tree *depth_mt,
-                   uint32_t depthbuffer_format,
-                   uint32_t depth_surface_type,
-                   bool depth_writable,
-                   struct intel_mipmap_tree *stencil_mt,
-                   bool stencil_writable,
-                   bool hiz,
-                   uint32_t width,
-                   uint32_t height,
-                   uint32_t depth,
-                   uint32_t lod,
-                   uint32_t min_array_element)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   uint32_t mocs_wb = devinfo->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-
-   /* Skip repeated NULL depth/stencil emits (think 2D rendering). */
-   if (!depth_mt && !stencil_mt && brw->no_depth_or_stencil) {
-      assert(brw->hw_ctx);
-      return;
-   }
-
-   brw_emit_depth_stall_flushes(brw);
-
-   /* _NEW_BUFFERS, _NEW_DEPTH, _NEW_STENCIL */
-   BEGIN_BATCH(8);
-   OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (8 - 2));
-   OUT_BATCH(depth_surface_type << 29 |
-             (depth_writable ? (1 << 28) : 0) |
-             (stencil_mt != NULL && stencil_writable) << 27 |
-             (hiz ? 1 : 0) << 22 |
-             depthbuffer_format << 18 |
-             (depth_mt ? depth_mt->surf.row_pitch - 1 : 0));
-   if (depth_mt) {
-      OUT_RELOC64(depth_mt->bo, RELOC_WRITE, 0);
-   } else {
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-   }
-   OUT_BATCH(((width - 1) << 4) | ((height - 1) << 18) | lod);
-   OUT_BATCH(((depth - 1) << 21) | (min_array_element << 10) | mocs_wb);
-   OUT_BATCH(0);
-   OUT_BATCH(((depth - 1) << 21) |
-              (depth_mt ? depth_mt->surf.array_pitch_el_rows >> 2 : 0));
-   ADVANCE_BATCH();
-
-   if (!hiz) {
-      BEGIN_BATCH(5);
-      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      assert(depth_mt);
-      BEGIN_BATCH(5);
-      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (5 - 2));
-      OUT_BATCH((depth_mt->aux_buf->pitch - 1) | mocs_wb << 25);
-      OUT_RELOC64(depth_mt->aux_buf->bo, RELOC_WRITE, 0);
-      OUT_BATCH(depth_mt->aux_buf->qpitch >> 2);
-      ADVANCE_BATCH();
-   }
-
-   if (stencil_mt == NULL) {
-      BEGIN_BATCH(5);
-      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      OUT_BATCH(0);
-      ADVANCE_BATCH();
-   } else {
-      BEGIN_BATCH(5);
-      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (5 - 2));
-      OUT_BATCH(HSW_STENCIL_ENABLED | mocs_wb << 22 |
-                (stencil_mt->surf.row_pitch - 1));
-      OUT_RELOC64(stencil_mt->bo, RELOC_WRITE, 0);
-      OUT_BATCH(stencil_mt->surf.array_pitch_el_rows >> 2);
-      ADVANCE_BATCH();
-   }
-
-   BEGIN_BATCH(3);
-   OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
-   OUT_BATCH(depth_mt ? depth_mt->fast_clear_color.u32[0] : 0);
-   OUT_BATCH(1);
-   ADVANCE_BATCH();
-
-   brw->no_depth_or_stencil = !depth_mt && !stencil_mt;
-}
-
-/* Awful vtable-compatible function; should be cleaned up in the future. */
-void
-gen8_emit_depth_stencil_hiz(struct brw_context *brw,
-                            struct intel_mipmap_tree *depth_mt,
-                            uint32_t depth_offset,
-                            uint32_t depthbuffer_format,
-                            uint32_t depth_surface_type,
-                            struct intel_mipmap_tree *stencil_mt,
-                            bool hiz, bool separate_stencil,
-                            uint32_t width, uint32_t height,
-                            uint32_t tile_x, uint32_t tile_y)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct gl_context *ctx = &brw->ctx;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   uint32_t surftype;
-   unsigned int depth = 1;
-   unsigned int min_array_element;
-   GLenum gl_target = GL_TEXTURE_2D;
-   unsigned int lod;
-   const struct intel_mipmap_tree *mt = depth_mt ? depth_mt : stencil_mt;
-   const struct intel_renderbuffer *irb = NULL;
-   const struct gl_renderbuffer *rb = NULL;
-
-   irb = intel_get_renderbuffer(fb, BUFFER_DEPTH);
-   if (!irb)
-      irb = intel_get_renderbuffer(fb, BUFFER_STENCIL);
-   rb = (struct gl_renderbuffer *) irb;
-
-   if (rb) {
-      depth = MAX2(irb->layer_count, 1);
-      if (rb->TexImage)
-         gl_target = rb->TexImage->TexObject->Target;
-   }
-
-   switch (gl_target) {
-   case GL_TEXTURE_CUBE_MAP_ARRAY:
-   case GL_TEXTURE_CUBE_MAP:
-      /* The PRM claims that we should use BRW_SURFACE_CUBE for this
-       * situation, but experiments show that gl_Layer doesn't work when we do
-       * this.  So we use BRW_SURFACE_2D, since for rendering purposes this is
-       * equivalent.
-       */
-      surftype = BRW_SURFACE_2D;
-      depth *= 6;
-      break;
-   case GL_TEXTURE_3D:
-      assert(mt);
-      depth = mt->surf.logical_level0_px.depth;
-      surftype = translate_tex_target(gl_target);
-      break;
-   case GL_TEXTURE_1D_ARRAY:
-   case GL_TEXTURE_1D:
-      if (devinfo->gen >= 9) {
-         /* WaDisable1DDepthStencil. Skylake+ doesn't support 1D depth
-          * textures but it does allow pretending it's a 2D texture
-          * instead.
-          */
-         surftype = BRW_SURFACE_2D;
-         break;
-      }
-      /* fallthrough */
-   default:
-      surftype = translate_tex_target(gl_target);
-      break;
-   }
-
-   min_array_element = irb ? irb->mt_layer : 0;
-
-   lod = irb ? irb->mt_level - irb->mt->first_level : 0;
-
-   if (mt) {
-      width = mt->surf.logical_level0_px.width;
-      height = mt->surf.logical_level0_px.height;
-   }
-
-   emit_depth_packets(brw, depth_mt, brw_depthbuffer_format(brw), surftype,
-                      brw_depth_writes_enabled(brw),
-                      stencil_mt, brw->stencil_write_enabled,
-                      hiz, width, height, depth, lod, min_array_element);
-}
-
-/**
  * Should we set the PMA FIX ENABLE bit?
  *
  * To avoid unnecessary depth related stalls, we need to set this bit.
diff --git a/src/mesa/drivers/dri/i965/genX_blorp_exec.c b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
index 5814389..a62b88e 100644
--- a/src/mesa/drivers/dri/i965/genX_blorp_exec.c
+++ b/src/mesa/drivers/dri/i965/genX_blorp_exec.c
@@ -44,7 +44,7 @@
    assert(batch->blorp->driver_ctx == batch->driver_batch);
    struct brw_context *brw = batch->driver_batch;
 
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
    uint32_t *map = brw->batch.map_next;
    brw->batch.map_next += n;
    intel_batchbuffer_advance(brw);
@@ -189,6 +189,35 @@
    return data;
 }
 
+/**
+ * See vf_invalidate_for_vb_48bit_transitions in genX_state_upload.c.
+ */
+static void
+blorp_vf_invalidate_for_vb_48b_transitions(struct blorp_batch *batch,
+                                           const struct blorp_address *addrs,
+                                           unsigned num_vbs)
+{
+#if GEN_GEN >= 8
+   struct brw_context *brw = batch->driver_batch;
+   bool need_invalidate = false;
+
+   for (unsigned i = 0; i < num_vbs; i++) {
+      struct brw_bo *bo = addrs[i].buffer;
+      uint16_t high_bits =
+         bo && (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32u : 0;
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif /* GEN_GEN >= 8 */
+}
+
 #if GEN_GEN >= 8
 static struct blorp_address
 blorp_get_workaround_page(struct blorp_batch *batch)
@@ -277,9 +306,10 @@
    brw_select_pipeline(brw, BRW_RENDER_PIPELINE);
 
 retry:
-   intel_batchbuffer_require_space(brw, 1400, RENDER_RING);
+   intel_batchbuffer_require_space(brw, 1400);
    brw_require_statebuffer_space(brw, 600);
    intel_batchbuffer_save_state(brw);
+   check_aperture_failed_once |= intel_batchbuffer_saved_state_is_empty(brw);
    brw->batch.no_wrap = true;
 
 #if GEN_GEN == 6
diff --git a/src/mesa/drivers/dri/i965/genX_state_upload.c b/src/mesa/drivers/dri/i965/genX_state_upload.c
index d47940b..0f82500 100644
--- a/src/mesa/drivers/dri/i965/genX_state_upload.c
+++ b/src/mesa/drivers/dri/i965/genX_state_upload.c
@@ -59,7 +59,7 @@
 UNUSED static void *
 emit_dwords(struct brw_context *brw, unsigned n)
 {
-   intel_batchbuffer_begin(brw, n, RENDER_RING);
+   intel_batchbuffer_begin(brw, n);
    uint32_t *map = brw->batch.map_next;
    brw->batch.map_next += n;
    intel_batchbuffer_advance(brw);
@@ -217,7 +217,7 @@
        * to a FBO (i.e. any named frame buffer object), we *don't*
        * need to invert - we already match the layout.
        */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
          for (unsigned i = 0; i < 32; i++)
             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
       } else {
@@ -257,7 +257,7 @@
        * to a user-created FBO then our native pixel coordinate system
        * works just fine, and there's no window system to worry about.
        */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+      if (ctx->DrawBuffer->FlipY) {
          poly.PolygonStippleYOffset =
             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
       }
@@ -480,6 +480,65 @@
    }
 }
 
+static UNUSED uint16_t /* UNUSED: callers below are GEN_GEN >= 8 only */
+pinned_bo_high_bits(struct brw_bo *bo)
+{
+   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
+}
+
+/* The VF cache designers apparently cut corners, and made the cache key's
+ * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
+ * of the address.  If you happen to have two vertex buffers which get placed
+ * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
+ * collisions.  (These collisions can happen within a single batch.)
+ *
+ * In the soft-pin world, we'd like to assign addresses up front, and never
+ * move buffers.  So, we need to do a VF cache invalidate if the buffer for
+ * a particular VB slot has different [47:32] address bits than the last one.
+ *
+ * In the relocation world, we have no idea what the addresses will be, so
+ * we can't apply this workaround.  Instead, we tell the kernel to move it
+ * to the low 4 GiB regardless.
+ */
+static void
+vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   bool need_invalidate = false;
+   unsigned i;
+
+   for (i = 0; i < brw->vb.nr_buffers; i++) {
+      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
+
+      if (high_bits != brw->vb.last_bo_high_bits[i]) {
+         need_invalidate = true;
+         brw->vb.last_bo_high_bits[i] = high_bits;
+      }
+   }
+
+   /* Don't bother with draw parameter buffers - those are generated by
+    * the driver so we can select a consistent memory zone.
+    */
+
+   if (need_invalidate) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+   }
+#endif /* GEN_GEN >= 8 */
+}
+
+static void
+vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
+{
+#if GEN_GEN >= 8
+   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
+   /* Invalidate the VF cache when the index BO's address bits [47:32] change */
+   if (high_bits != brw->ib.last_bo_high_bits) {
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
+      brw->ib.last_bo_high_bits = high_bits;
+   }
+#endif /* GEN_GEN >= 8 */
+}
+
 static void
 genX(emit_vertices)(struct brw_context *brw)
 {
@@ -539,16 +598,20 @@
    }
 #endif
 
-   const bool uses_firstvertex =
-      vs_prog_data->uses_basevertex || vs_prog_data->uses_firstvertex;
+   const bool uses_draw_params =
+      vs_prog_data->uses_firstvertex ||
+      vs_prog_data->uses_baseinstance;
 
-   const bool needs_sgvs_element = (uses_firstvertex ||
-                                    vs_prog_data->uses_baseinstance ||
+   const bool uses_derived_draw_params =
+      vs_prog_data->uses_drawid ||
+      vs_prog_data->uses_is_indexed_draw;
+
+   const bool needs_sgvs_element = (uses_draw_params ||
                                     vs_prog_data->uses_instanceid ||
                                     vs_prog_data->uses_vertexid);
 
    unsigned nr_elements =
-      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
+      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
 
 #if GEN_GEN < 8
    /* If any of the formats of vb.enabled needs more that one upload, we need
@@ -556,8 +619,7 @@
     */
    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
-      const struct gl_vertex_array *glarray = input->glarray;
-      const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+      const struct gl_array_attributes *glattrib = input->glattrib;
       uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
 
       if (uploads_needed(format, input->is_dual_slot) > 1)
@@ -588,11 +650,10 @@
    }
 
    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
-   const bool uses_draw_params =
-      uses_firstvertex ||
-      vs_prog_data->uses_baseinstance;
    const unsigned nr_buffers = brw->vb.nr_buffers +
-      uses_draw_params + vs_prog_data->uses_drawid;
+      uses_draw_params + uses_derived_draw_params;
+
+   vf_invalidate_for_vb_48bit_transitions(brw);
 
    if (nr_buffers) {
       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
@@ -626,11 +687,11 @@
                                              0 /* step rate */);
       }
 
-      if (vs_prog_data->uses_drawid) {
+      if (uses_derived_draw_params) {
          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
-                                             brw->draw.draw_id_bo,
-                                             brw->draw.draw_id_offset,
-                                             brw->draw.draw_id_bo->size,
+                                             brw->draw.derived_draw_params_bo,
+                                             brw->draw.derived_draw_params_offset,
+                                             brw->draw.derived_draw_params_bo->size,
                                              0 /* stride */,
                                              0 /* step rate */);
       }
@@ -651,8 +712,7 @@
    unsigned i;
    for (i = 0; i < brw->vb.nr_enabled; i++) {
       const struct brw_vertex_element *input = brw->vb.enabled[i];
-      const struct gl_vertex_array *glarray = input->glarray;
-      const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+      const struct gl_array_attributes *glattrib = input->glattrib;
       uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
       uint32_t comp0 = VFCOMP_STORE_SRC;
       uint32_t comp1 = VFCOMP_STORE_SRC;
@@ -694,8 +754,7 @@
           * entry. */
          const unsigned offset = input->offset + c * 16;
 
-         const struct gl_vertex_array *glarray = input->glarray;
-         const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+         const struct gl_array_attributes *glattrib = input->glattrib;
          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
             upload_format_size(upload_format) : glattrib->Size;
 
@@ -772,8 +831,7 @@
       };
 
 #if GEN_GEN >= 8
-      if (uses_firstvertex ||
-          vs_prog_data->uses_baseinstance) {
+      if (uses_draw_params) {
          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
          elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
          elem_state.Component0Control = VFCOMP_STORE_SRC;
@@ -782,11 +840,10 @@
 #else
       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
       elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
-      if (uses_firstvertex)
+      if (uses_draw_params) {
          elem_state.Component0Control = VFCOMP_STORE_SRC;
-
-      if (vs_prog_data->uses_baseinstance)
          elem_state.Component1Control = VFCOMP_STORE_SRC;
+      }
 
       if (vs_prog_data->uses_vertexid)
          elem_state.Component2Control = VFCOMP_STORE_VID;
@@ -799,13 +856,13 @@
       dw += GENX(VERTEX_ELEMENT_STATE_length);
    }
 
-   if (vs_prog_data->uses_drawid) {
+   if (uses_derived_draw_params) {
       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
          .Valid = true,
          .VertexBufferIndex = brw->vb.nr_buffers + 1,
-         .SourceElementFormat = ISL_FORMAT_R32_UINT,
+         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
          .Component0Control = VFCOMP_STORE_SRC,
-         .Component1Control = VFCOMP_STORE_0,
+         .Component1Control = VFCOMP_STORE_SRC,
          .Component2Control = VFCOMP_STORE_0,
          .Component3Control = VFCOMP_STORE_0,
 #if GEN_GEN < 5
@@ -819,8 +876,7 @@
 
 #if GEN_GEN >= 6
    if (gen6_edgeflag_input) {
-      const struct gl_vertex_array *glarray = gen6_edgeflag_input->glarray;
-      const struct gl_array_attributes *glattrib = glarray->VertexAttrib;
+      const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
       const uint32_t format = brw_get_vertex_surface_type(brw, glattrib);
 
       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
@@ -891,6 +947,8 @@
    if (index_buffer == NULL)
       return;
 
+   vf_invalidate_for_ib_48bit_transition(brw);
+
    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 #if GEN_GEN < 8 && !GEN_IS_HASWELL
       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
@@ -1410,7 +1468,7 @@
 #endif
 
 #if GEN_GEN == 7
-      clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
+      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
 
       if (ctx->Polygon.CullFlag) {
          switch (ctx->Polygon.CullFaceMode) {
@@ -1525,7 +1583,7 @@
 
 #if GEN_GEN <= 7
    /* _NEW_BUFFERS */
-   bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   bool flip_y = ctx->DrawBuffer->FlipY;
    UNUSED const bool multisampled_fbo =
       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 #endif
@@ -1577,7 +1635,7 @@
 
 #if GEN_GEN <= 7
       /* _NEW_POLYGON */
-      sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
+      sf.FrontWinding = brw->polygon_front_bit != flip_y;
 #if GEN_GEN >= 6
       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
@@ -1715,7 +1773,7 @@
        * Window coordinates in an FBO are inverted, which means point
        * sprite origin must be inverted, too.
        */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
       } else {
          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -1821,47 +1879,100 @@
          /* Pointer to the WM constant buffer.  Covered by the set of
           * state flags from gen6_upload_wm_push_constants.
           */
-         wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
-         wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
 
 #if GEN_GEN >= 6
    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
+#else
+   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
+   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
+#endif
+
+#if GEN_GEN <= 6
+      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
+      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
+      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
+#endif
+
+#if GEN_GEN == 4
+      /* On gen4, we only have one shader kernel */
+      if (brw_wm_state_has_ksp(wm, 0)) {
+         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
+         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
+         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+         wm.DispatchGRFStartRegisterForConstantSetupData0 =
+            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      }
+#elif GEN_GEN == 5
+      /* On gen5, we have multiple shader kernels but only one GRF start
+       * register for all kernels
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
+      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
+      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         wm_prog_data->base.dispatch_grf_start_reg;
+
+      /* Dispatch GRF Start should be the same for all shaders on gen5 */
+      if (brw_wm_state_has_ksp(wm, 1)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
+      }
+      if (brw_wm_state_has_ksp(wm, 2)) {
+         assert(wm_prog_data->base.dispatch_grf_start_reg ==
+                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
+      }
+#elif GEN_GEN == 6
+      /* On gen6, we have multiple shader kernels and we no longer specify a
+       * register count for each one.
+       */
+      wm.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
+      wm.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
+      wm.KernelStartPointer2 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
+
+      wm.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
+      wm.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
+      wm.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
+#endif
+
+#if GEN_GEN <= 5
+      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
+      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
+      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
+      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
+      wm.SetupURBEntryReadOffset = 0;
+      wm.EarlyDepthTestEnable = true;
+#endif
+
+#if GEN_GEN >= 6
       wm.LineAntialiasingRegionWidth = _10pixels;
       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
 
       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
       wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
 #else
-   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
-   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
-      if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
-         /* These two fields should be the same pre-gen6, which is why we
-          * only have one hardware field to program for both dispatch
-          * widths.
-          */
-         assert(wm_prog_data->base.dispatch_grf_start_reg ==
-                wm_prog_data->dispatch_grf_start_reg_2);
-      }
-
-      if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
-         wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
-
       if (stage_state->sampler_count)
          wm.SamplerStatePointer =
             ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
-#if GEN_GEN == 5
-      if (wm_prog_data->prog_offset_2)
-         wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
-#endif
 
-      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
-      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
-      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
-      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
-      wm.EarlyDepthTestEnable = true;
       wm.LineAntialiasingRegionWidth = _05pixels;
       wm.LineEndCapAntialiasingRegionWidth = _10pixels;
 
@@ -1896,21 +2007,6 @@
       wm.BindingTableEntryCount =
          wm_prog_data->base.binding_table.size_bytes / 4;
       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
-      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
-      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
-      wm.DispatchGRFStartRegisterForConstantSetupData0 =
-         wm_prog_data->base.dispatch_grf_start_reg;
-      if (GEN_GEN == 6 ||
-          wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
-         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
-      }
-
-#if GEN_GEN >= 5
-      if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
-         wm.KernelStartPointer2 =
-            KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
-      }
-#endif
 
 #if GEN_GEN == 6
       wm.DualSourceBlendEnable =
@@ -1935,9 +2031,6 @@
          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
       else
          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
-
-      wm.DispatchGRFStartRegisterForConstantSetupData2 =
-         wm_prog_data->dispatch_grf_start_reg_2;
 #endif
 
       if (wm_prog_data->base.total_scratch) {
@@ -2072,7 +2165,13 @@
    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
    pkt.SamplerCount       =                                               \
       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
+   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
+    * disable prefetching of binding tables in A0 and B0 steppings.       \
+    * TODO: Revisit this WA on C0 stepping.                               \
+    */                                                                    \
    pkt.BindingTableEntryCount =                                           \
+      GEN_GEN == 11 ?                                                     \
+      0 :                                                                 \
       stage_prog_data->binding_table.size_bytes / 4;                      \
    pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                           \
@@ -2122,8 +2221,8 @@
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
       if (stage_state->push_const_size != 0) {
          cvs.Buffer0Valid = true;
-         cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
-         cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
@@ -2280,7 +2379,7 @@
 
 static void
 set_scissor_bits(const struct gl_context *ctx, int i,
-                 bool render_to_fbo, unsigned fb_width, unsigned fb_height,
+                 bool flip_y, unsigned fb_width, unsigned fb_height,
                  struct GENX(SCISSOR_RECT) *sc)
 {
    int bbox[4];
@@ -2302,7 +2401,7 @@
       sc->ScissorRectangleXMax = 0;
       sc->ScissorRectangleYMin = 1;
       sc->ScissorRectangleYMax = 0;
-   } else if (render_to_fbo) {
+   } else if (!flip_y) {
       /* texmemory: Y=0=bottom */
       sc->ScissorRectangleXMin = bbox[0];
       sc->ScissorRectangleXMax = bbox[1] - 1;
@@ -2322,7 +2421,7 @@
 genX(upload_scissor_state)(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
    struct GENX(SCISSOR_RECT) scissor;
    uint32_t scissor_state_offset;
    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
@@ -2346,7 +2445,7 @@
     * inclusive but max is exclusive.
     */
    for (unsigned i = 0; i < viewport_count; i++) {
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
       GENX(SCISSOR_RECT_pack)(
          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
    }
@@ -2415,6 +2514,17 @@
     */
    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
 
+   /* Workaround: prevent gpu hangs on SandyBridge
+    * by disabling guardband clipping for odd dimensions.
+    */
+   if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
+      *xmin = -1.0f;
+      *xmax =  1.0f;
+      *ymin = -1.0f;
+      *ymax =  1.0f;
+      return;
+   }
+
    if (m00 != 0 && m11 != 0) {
       /* First, we compute the screen-space render area */
       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
@@ -2461,7 +2571,7 @@
    const unsigned viewport_count = brw->clip.viewport_count;
 
    /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
 
@@ -2485,12 +2595,12 @@
 #endif
 
    /* _NEW_BUFFERS */
-   if (render_to_fbo) {
-      y_scale = 1.0;
-      y_bias = 0;
-   } else {
+   if (flip_y) {
       y_scale = -1.0;
       y_bias = (float)fb_height;
+   } else {
+      y_scale = 1.0;
+      y_bias = 0;
    }
 
    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
@@ -2518,7 +2628,7 @@
       clv.YMaxClipGuardband = gb_ymax;
 
 #if GEN_GEN < 6
-      set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
+      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
                        &sfv.ScissorRectangle);
 #elif GEN_GEN >= 8
       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
@@ -2535,16 +2645,16 @@
       const float viewport_Ymax =
          MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
 
-      if (render_to_fbo) {
-         sfv.XMinViewPort = viewport_Xmin;
-         sfv.XMaxViewPort = viewport_Xmax - 1;
-         sfv.YMinViewPort = viewport_Ymin;
-         sfv.YMaxViewPort = viewport_Ymax - 1;
-      } else {
+      if (flip_y) {
          sfv.XMinViewPort = viewport_Xmin;
          sfv.XMaxViewPort = viewport_Xmax - 1;
          sfv.YMinViewPort = fb_height - viewport_Ymax;
          sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
+      } else {
+         sfv.XMinViewPort = viewport_Xmin;
+         sfv.XMaxViewPort = viewport_Xmax - 1;
+         sfv.YMinViewPort = viewport_Ymin;
+         sfv.YMaxViewPort = viewport_Ymax - 1;
       }
 #endif
 
@@ -2614,8 +2724,8 @@
    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
       if (active && stage_state->push_const_size != 0) {
          cgs.Buffer0Valid = true;
-         cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
-         cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
+         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
+         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
       }
    }
 #endif
@@ -3475,14 +3585,14 @@
       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
 
       /* _NEW_BUFFERS */
-      bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+      bool flip_y = ctx->DrawBuffer->FlipY;
 
       /* _NEW_POINT
        *
        * Window coordinates in an FBO are inverted, which means point
        * sprite origin must be inverted.
        */
-      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
+      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
       else
          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
@@ -3861,7 +3971,13 @@
          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
 
       /* BRW_NEW_FS_PROG_DATA */
-      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
+      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
+       * prefetching of binding tables in A0 and B0 steppings.
+       * TODO: Revisit this workaround on C0 stepping.
+       */
+      ps.BindingTableEntryCount = GEN_GEN == 11 ?
+                                  0 :
+                                  prog_data->base.binding_table.size_bytes / 4;
 
       if (prog_data->base.use_alt_mode)
          ps.FloatingPointMode = Alternate;
@@ -3937,14 +4053,37 @@
 
       ps._8PixelDispatchEnable = prog_data->dispatch_8;
       ps._16PixelDispatchEnable = prog_data->dispatch_16;
-      ps.DispatchGRFStartRegisterForConstantSetupData0 =
-         prog_data->base.dispatch_grf_start_reg;
-      ps.DispatchGRFStartRegisterForConstantSetupData2 =
-         prog_data->dispatch_grf_start_reg_2;
+      ps._32PixelDispatchEnable = prog_data->dispatch_32;
 
-      ps.KernelStartPointer0 = stage_state->prog_offset;
+      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
+       *
+       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
+       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
+       *
+       * Since 16x MSAA is first introduced on SKL, we don't need to apply
+       * the workaround on any older hardware.
+       *
+       * BRW_NEW_NUM_SAMPLES
+       */
+      if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
+          brw->num_samples == 16) {
+         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
+         ps._32PixelDispatchEnable = false;
+      }
+
+      ps.DispatchGRFStartRegisterForConstantSetupData0 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
+      ps.DispatchGRFStartRegisterForConstantSetupData1 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
+      ps.DispatchGRFStartRegisterForConstantSetupData2 =
+         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
+
+      ps.KernelStartPointer0 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
+      ps.KernelStartPointer1 = stage_state->prog_offset +
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
       ps.KernelStartPointer2 = stage_state->prog_offset +
-         prog_data->prog_offset_2;
+                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
 
       if (prog_data->base.total_scratch) {
          ps.ScratchSpaceBasePointer =
@@ -3962,7 +4101,8 @@
                             : 0),
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
-               BRW_NEW_FS_PROG_DATA,
+               BRW_NEW_FS_PROG_DATA |
+               (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
    },
    .emit = genX(upload_ps),
 };
@@ -4380,7 +4520,7 @@
    const struct gl_context *ctx = &brw->ctx;
 
    /* _NEW_BUFFERS */
-   const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
+   const bool flip_y = ctx->DrawBuffer->FlipY;
 
    /* _NEW_POLYGON */
    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
@@ -4389,7 +4529,7 @@
    const struct gl_point_attrib *point = &ctx->Point;
 
    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
-      if (brw->polygon_front_bit == render_to_fbo)
+      if (brw->polygon_front_bit != flip_y)
          raster.FrontWinding = CounterClockwise;
 
       if (polygon->CullFlag) {
@@ -5604,7 +5744,7 @@
 
       &genX(scissor_state),
 
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
 
       &genX(polygon_stipple),
       &genX(polygon_stipple_offset),
@@ -5695,7 +5835,7 @@
 
       &genX(scissor_state),
 
-      &gen7_depthbuffer,
+      &brw_depthbuffer,
 
       &genX(polygon_stipple),
       &genX(polygon_stipple_offset),
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index b60d7d6..73ccecf 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -32,6 +32,7 @@
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "common/gen_decoder.h"
+#include "common/gen_gem.h"
 
 #include "util/hash_table.h"
 
@@ -54,8 +55,10 @@
 
 static void
 intel_batchbuffer_reset(struct brw_context *brw);
+static void
+brw_new_batch(struct brw_context *brw);
 
-UNUSED static void
+static void
 dump_validation_list(struct intel_batchbuffer *batch)
 {
    fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
@@ -77,6 +80,40 @@
    }
 }
 
+static struct gen_batch_decode_bo
+decode_get_bo(void *v_brw, uint64_t address)
+{
+   struct brw_context *brw = v_brw;
+   struct intel_batchbuffer *batch = &brw->batch;
+   /* Find the exec-list BO that contains 'address'; empty struct if none. */
+   for (int i = 0; i < batch->exec_count; i++) {
+      struct brw_bo *bo = batch->exec_bos[i];
+      /* The decoder zeroes out the top 16 bits, so we need to as well */
+      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
+      /* addr and map start at 'address', so size is only what follows it */
+      if (address >= bo_address && address < bo_address + bo->size) {
+         return (struct gen_batch_decode_bo) {
+            .addr = address,
+            .size = bo->size - (address - bo_address),
+            .map = brw_bo_map(brw, bo, MAP_READ) + (address - bo_address),
+         };
+      }
+   }
+
+   return (struct gen_batch_decode_bo) { };
+}
+
+static unsigned
+decode_get_state_size(void *v_brw, uint32_t offset_from_dsba)
+{
+   struct brw_context *brw = v_brw;
+   struct intel_batchbuffer *batch = &brw->batch;
+   struct hash_entry *entry =
+      _mesa_hash_table_search(batch->state_batch_sizes,
+                              (void *) (uintptr_t) offset_from_dsba);
+   return entry ? (uintptr_t) entry->data : 0; /* 0 if offset not in table */
+}
+
 static bool
 uint_key_compare(const void *a, const void *b)
 {
@@ -107,15 +144,11 @@
 
    batch->use_shadow_copy = !devinfo->has_llc;
 
-   if (batch->use_shadow_copy) {
-      batch->batch.map = malloc(BATCH_SZ);
-      batch->map_next = batch->batch.map;
-      batch->state.map = malloc(STATE_SZ);
-   }
-
    init_reloc_list(&batch->batch_relocs, 250);
    init_reloc_list(&batch->state_relocs, 250);
 
+   batch->batch.map = NULL;
+   batch->state.map = NULL;
    batch->exec_count = 0;
    batch->exec_array_size = 100;
    batch->exec_bos =
@@ -126,6 +159,17 @@
    if (INTEL_DEBUG & DEBUG_BATCH) {
       batch->state_batch_sizes =
          _mesa_hash_table_create(NULL, uint_key_hash, uint_key_compare);
+
+      const unsigned decode_flags =
+         GEN_BATCH_DECODE_FULL |
+         ((INTEL_DEBUG & DEBUG_COLOR) ? GEN_BATCH_DECODE_IN_COLOR : 0) |
+         GEN_BATCH_DECODE_OFFSETS |
+         GEN_BATCH_DECODE_FLOATS;
+
+      gen_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
+                                decode_flags, NULL, decode_get_bo,
+                                decode_get_state_size, brw);
+      batch->decoder.max_vbo_decoded_lines = 100;
    }
 
    batch->use_batch_first =
@@ -184,19 +228,27 @@
 static void
 recreate_growing_buffer(struct brw_context *brw,
                         struct brw_growing_bo *grow,
-                        const char *name, unsigned size)
+                        const char *name, unsigned size,
+                        enum brw_memory_zone memzone)
 {
    struct intel_screen *screen = brw->screen;
    struct intel_batchbuffer *batch = &brw->batch;
    struct brw_bufmgr *bufmgr = screen->bufmgr;
 
-   grow->bo = brw_bo_alloc(bufmgr, name, size);
-   grow->bo->kflags = can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
+   /* We can't grow buffers when using softpin, so just overallocate them. */
+   if (brw_using_softpin(bufmgr))
+      size *= 2;
+
+   grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
+   grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
    grow->partial_bo = NULL;
    grow->partial_bo_map = NULL;
    grow->partial_bytes = 0;
+   grow->memzone = memzone;
 
-   if (!batch->use_shadow_copy)
+   if (batch->use_shadow_copy)
+      grow->map = realloc(grow->map, grow->bo->size);
+   else
       grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
 }
 
@@ -211,10 +263,12 @@
    }
    batch->last_bo = batch->batch.bo;
 
-   recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ);
+   recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
+                           BRW_MEMZONE_OTHER);
    batch->map_next = batch->batch.map;
 
-   recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ);
+   recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
+                           BRW_MEMZONE_DYNAMIC);
 
    /* Avoid making 0 a valid state offset - otherwise the decoder will try
     * and decode data when we use offset 0 as a null pointer.
@@ -227,11 +281,6 @@
    batch->needs_sol_reset = false;
    batch->state_base_address_emitted = false;
 
-   /* We don't know what ring the new batch will be sent to until we see the
-    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
-    */
-   batch->ring = UNKNOWN_RING;
-
    if (batch->state_batch_sizes)
       _mesa_hash_table_clear(batch->state_batch_sizes, NULL);
 }
@@ -252,6 +301,13 @@
    brw->batch.saved.exec_count = brw->batch.exec_count;
 }
 
+/* True when the saved map_next pointer equals the start of the batch map. */
+bool
+intel_batchbuffer_saved_state_is_empty(struct brw_context *brw)
+{
+   return brw->batch.saved.map_next == brw->batch.batch.map;
+}
+
 void
 intel_batchbuffer_reset_to_saved(struct brw_context *brw)
 {
@@ -265,7 +321,7 @@
 
    brw->batch.map_next = brw->batch.saved.map_next;
    if (USED_BATCH(brw->batch) == 0)
-      brw->batch.ring = UNKNOWN_RING;
+      brw_new_batch(brw);
 }
 
 void
@@ -287,8 +343,10 @@
    brw_bo_unreference(batch->last_bo);
    brw_bo_unreference(batch->batch.bo);
    brw_bo_unreference(batch->state.bo);
-   if (batch->state_batch_sizes)
+   if (batch->state_batch_sizes) {
       _mesa_hash_table_destroy(batch->state_batch_sizes, NULL);
+      gen_batch_decode_ctx_finish(&batch->decoder);
+   }
 }
 
 /**
@@ -340,6 +398,13 @@
    struct brw_bufmgr *bufmgr = brw->bufmgr;
    struct brw_bo *bo = grow->bo;
 
+   /* We can't grow buffers that are softpinned, as the growing mechanism
+    * involves putting a larger buffer at the same gtt_offset...and we've
+    * only allocated the smaller amount of VMA.  Without relocations, this
+    * simply won't work.  This should never happen, however.
+    */
+   assert(!(bo->kflags & EXEC_OBJECT_PINNED));
+
    perf_debug("Growing %s - ran out of space\n", bo->name);
 
    if (grow->partial_bo) {
@@ -351,7 +416,8 @@
       finish_growing_bos(grow);
    }
 
-   struct brw_bo *new_bo = brw_bo_alloc(bufmgr, bo->name, new_size);
+   struct brw_bo *new_bo =
+      brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);
 
    /* Copy existing data to the new larger buffer */
    grow->partial_bo_map = grow->map;
@@ -457,18 +523,10 @@
 }
 
 void
-intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
-                                enum brw_gpu_ring ring)
+intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    struct intel_batchbuffer *batch = &brw->batch;
 
-   /* If we're switching rings, implicitly flush the batch. */
-   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
-       devinfo->gen >= 6) {
-      intel_batchbuffer_flush(brw);
-   }
-
    const unsigned batch_used = USED_BATCH(*batch) * 4;
    if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
       intel_batchbuffer_flush(brw);
@@ -480,222 +538,8 @@
       batch->map_next = (void *) batch->batch.map + batch_used;
       assert(batch_used + sz < batch->batch.bo->size);
    }
-
-   /* The intel_batchbuffer_flush() calls above might have changed
-    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
-    */
-   brw->batch.ring = ring;
 }
 
-#ifdef DEBUG
-#define CSI "\e["
-#define BLUE_HEADER  CSI "0;44m"
-#define NORMAL       CSI "0m"
-
-
-static void
-decode_struct(struct brw_context *brw, struct gen_spec *spec,
-              const char *struct_name, uint32_t *data,
-              uint32_t gtt_offset, uint32_t offset, bool color)
-{
-   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
-   if (!group)
-      return;
-
-   fprintf(stderr, "%s\n", struct_name);
-   gen_print_group(stderr, group, gtt_offset + offset,
-                   &data[offset / 4], 0, color);
-}
-
-static void
-decode_structs(struct brw_context *brw, struct gen_spec *spec,
-               const char *struct_name,
-               uint32_t *data, uint32_t gtt_offset, uint32_t offset,
-               int struct_size, bool color)
-{
-   struct gen_group *group = gen_spec_find_struct(spec, struct_name);
-   if (!group)
-      return;
-
-   int entries = brw_state_batch_size(brw, offset) / struct_size;
-   for (int i = 0; i < entries; i++) {
-      fprintf(stderr, "%s %d\n", struct_name, i);
-      gen_print_group(stderr, group, gtt_offset + offset,
-                      &data[(offset + i * struct_size) / 4], 0, color);
-   }
-}
-
-static void
-do_batch_dump(struct brw_context *brw)
-{
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   struct intel_batchbuffer *batch = &brw->batch;
-   struct gen_spec *spec = gen_spec_load(&brw->screen->devinfo);
-
-   if (batch->ring != RENDER_RING)
-      return;
-
-   uint32_t *batch_data = brw_bo_map(brw, batch->batch.bo, MAP_READ);
-   uint32_t *state = brw_bo_map(brw, batch->state.bo, MAP_READ);
-   if (batch_data == NULL || state == NULL) {
-      fprintf(stderr, "WARNING: failed to map batchbuffer/statebuffer\n");
-      return;
-   }
-
-   uint32_t *end = batch_data + USED_BATCH(*batch);
-   uint32_t batch_gtt_offset = batch->batch.bo->gtt_offset;
-   uint32_t state_gtt_offset = batch->state.bo->gtt_offset;
-   int length;
-
-   bool color = INTEL_DEBUG & DEBUG_COLOR;
-   const char *header_color = color ? BLUE_HEADER : "";
-   const char *reset_color  = color ? NORMAL : "";
-
-   for (uint32_t *p = batch_data; p < end; p += length) {
-      struct gen_group *inst = gen_spec_find_instruction(spec, p);
-      length = gen_group_get_length(inst, p);
-      assert(inst == NULL || length > 0);
-      length = MAX2(1, length);
-      if (inst == NULL) {
-         fprintf(stderr, "unknown instruction %08x\n", p[0]);
-         continue;
-      }
-
-      uint64_t offset = batch_gtt_offset + 4 * (p - batch_data);
-
-      fprintf(stderr, "%s0x%08"PRIx64":  0x%08x:  %-80s%s\n", header_color,
-              offset, p[0], gen_group_get_name(inst), reset_color);
-
-      gen_print_group(stderr, inst, offset, p, 0, color);
-
-      switch (gen_group_get_opcode(inst) >> 16) {
-      case _3DSTATE_PIPELINED_POINTERS:
-         /* Note: these Gen4-5 pointers are full relocations rather than
-          * offsets from the start of the statebuffer.  So we need to subtract
-          * gtt_offset (the start of the statebuffer) to obtain an offset we
-          * can add to the map and get at the data.
-          */
-         decode_struct(brw, spec, "VS_STATE", state, state_gtt_offset,
-                       (p[1] & ~0x1fu) - state_gtt_offset, color);
-         if (p[2] & 1) {
-            decode_struct(brw, spec, "GS_STATE", state, state_gtt_offset,
-                          (p[2] & ~0x1fu) - state_gtt_offset, color);
-         }
-         if (p[3] & 1) {
-            decode_struct(brw, spec, "CLIP_STATE", state, state_gtt_offset,
-                          (p[3] & ~0x1fu) - state_gtt_offset, color);
-         }
-         decode_struct(brw, spec, "SF_STATE", state, state_gtt_offset,
-                       (p[4] & ~0x1fu) - state_gtt_offset, color);
-         decode_struct(brw, spec, "WM_STATE", state, state_gtt_offset,
-                       (p[5] & ~0x1fu) - state_gtt_offset, color);
-         decode_struct(brw, spec, "COLOR_CALC_STATE", state, state_gtt_offset,
-                       (p[6] & ~0x3fu) - state_gtt_offset, color);
-         break;
-      case _3DSTATE_BINDING_TABLE_POINTERS_VS:
-      case _3DSTATE_BINDING_TABLE_POINTERS_HS:
-      case _3DSTATE_BINDING_TABLE_POINTERS_DS:
-      case _3DSTATE_BINDING_TABLE_POINTERS_GS:
-      case _3DSTATE_BINDING_TABLE_POINTERS_PS: {
-         struct gen_group *group =
-            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
-         if (!group)
-            break;
-
-         uint32_t bt_offset = p[1] & ~0x1fu;
-         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
-         uint32_t *bt_pointers = &state[bt_offset / 4];
-         for (int i = 0; i < bt_entries; i++) {
-            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
-            gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
-                            &state[bt_pointers[i] / 4], 0, color);
-         }
-         break;
-      }
-      case _3DSTATE_SAMPLER_STATE_POINTERS_VS:
-      case _3DSTATE_SAMPLER_STATE_POINTERS_HS:
-      case _3DSTATE_SAMPLER_STATE_POINTERS_DS:
-      case _3DSTATE_SAMPLER_STATE_POINTERS_GS:
-      case _3DSTATE_SAMPLER_STATE_POINTERS_PS:
-         decode_structs(brw, spec, "SAMPLER_STATE", state,
-                        state_gtt_offset, p[1] & ~0x1fu, 4 * 4, color);
-         break;
-      case _3DSTATE_VIEWPORT_STATE_POINTERS:
-         decode_structs(brw, spec, "CLIP_VIEWPORT", state,
-                        state_gtt_offset, p[1] & ~0x3fu, 4 * 4, color);
-         decode_structs(brw, spec, "SF_VIEWPORT", state,
-                        state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
-         decode_structs(brw, spec, "CC_VIEWPORT", state,
-                        state_gtt_offset, p[3] & ~0x3fu, 2 * 4, color);
-         break;
-      case _3DSTATE_VIEWPORT_STATE_POINTERS_CC:
-         decode_structs(brw, spec, "CC_VIEWPORT", state,
-                        state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
-         break;
-      case _3DSTATE_VIEWPORT_STATE_POINTERS_SF_CL:
-         decode_structs(brw, spec, "SF_CLIP_VIEWPORT", state,
-                        state_gtt_offset, p[1] & ~0x3fu, 16 * 4, color);
-         break;
-      case _3DSTATE_SCISSOR_STATE_POINTERS:
-         decode_structs(brw, spec, "SCISSOR_RECT", state,
-                        state_gtt_offset, p[1] & ~0x1fu, 2 * 4, color);
-         break;
-      case _3DSTATE_BLEND_STATE_POINTERS:
-         /* TODO: handle Gen8+ extra dword at the beginning */
-         decode_structs(brw, spec, "BLEND_STATE", state,
-                        state_gtt_offset, p[1] & ~0x3fu, 8 * 4, color);
-         break;
-      case _3DSTATE_CC_STATE_POINTERS:
-         if (devinfo->gen >= 7) {
-            decode_struct(brw, spec, "COLOR_CALC_STATE", state,
-                          state_gtt_offset, p[1] & ~0x3fu, color);
-         } else if (devinfo->gen == 6) {
-            decode_structs(brw, spec, "BLEND_STATE", state,
-                           state_gtt_offset, p[1] & ~0x3fu, 2 * 4, color);
-            decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
-                          state_gtt_offset, p[2] & ~0x3fu, color);
-            decode_struct(brw, spec, "COLOR_CALC_STATE", state,
-                          state_gtt_offset, p[3] & ~0x3fu, color);
-         }
-         break;
-      case _3DSTATE_DEPTH_STENCIL_STATE_POINTERS:
-         decode_struct(brw, spec, "DEPTH_STENCIL_STATE", state,
-                       state_gtt_offset, p[1] & ~0x3fu, color);
-         break;
-      case MEDIA_INTERFACE_DESCRIPTOR_LOAD: {
-         struct gen_group *group =
-            gen_spec_find_struct(spec, "RENDER_SURFACE_STATE");
-         if (!group)
-            break;
-
-         uint32_t idd_offset = p[3] & ~0x1fu;
-         decode_struct(brw, spec, "INTERFACE_DESCRIPTOR_DATA", state,
-                       state_gtt_offset, idd_offset, color);
-
-         uint32_t ss_offset = state[idd_offset / 4 + 3] & ~0x1fu;
-         decode_structs(brw, spec, "SAMPLER_STATE", state,
-                        state_gtt_offset, ss_offset, 4 * 4, color);
-
-         uint32_t bt_offset = state[idd_offset / 4 + 4] & ~0x1fu;
-         int bt_entries = brw_state_batch_size(brw, bt_offset) / 4;
-         uint32_t *bt_pointers = &state[bt_offset / 4];
-         for (int i = 0; i < bt_entries; i++) {
-            fprintf(stderr, "SURFACE_STATE - BTI = %d\n", i);
-            gen_print_group(stderr, group, state_gtt_offset + bt_pointers[i],
-                            &state[bt_pointers[i] / 4], 0, color);
-         }
-         break;
-      }
-      }
-   }
-
-   brw_bo_unmap(batch->batch.bo);
-   brw_bo_unmap(batch->state.bo);
-}
-#else
-static void do_batch_dump(struct brw_context *brw) { }
-#endif
-
 /**
  * Called when starting a new batch buffer.
  */
@@ -760,46 +604,44 @@
     */
    brw_emit_query_end(brw);
 
-   if (brw->batch.ring == RENDER_RING) {
-      /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
-       * assume that the L3 cache is configured according to the hardware
-       * defaults.  On Kernel 4.16+, we no longer need to do this.
+   /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
+    * assume that the L3 cache is configured according to the hardware
+    * defaults.  On Kernel 4.16+, we no longer need to do this.
+    */
+   if (devinfo->gen >= 7 &&
+       !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
+      gen7_restore_default_l3_config(brw);
+
+   if (devinfo->is_haswell) {
+      /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
+       * 3DSTATE_CC_STATE_POINTERS > "Note":
+       *
+       * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
+       *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
+       *
+       * From the example in the docs, it seems to expect a regular pipe control
+       * flush here as well. We may have done it already, but meh.
+       *
+       * See also WaAvoidRCZCounterRollover.
        */
-      if (devinfo->gen >= 7 &&
-          !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
-         gen7_restore_default_l3_config(brw);
-
-      if (devinfo->is_haswell) {
-         /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
-          * 3DSTATE_CC_STATE_POINTERS > "Note":
-          *
-          * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
-          *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
-          *
-          * From the example in the docs, it seems to expect a regular pipe control
-          * flush here as well. We may have done it already, but meh.
-          *
-          * See also WaAvoidRCZCounterRollover.
-          */
-         brw_emit_mi_flush(brw);
-         BEGIN_BATCH(2);
-         OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
-         OUT_BATCH(brw->cc.state_offset | 1);
-         ADVANCE_BATCH();
-         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
-                                          PIPE_CONTROL_CS_STALL);
-      }
-
-      /* Do not restore push constant packets during context restore. */
-      if (devinfo->gen >= 7)
-         gen10_emit_isp_disable(brw);
+      brw_emit_mi_flush(brw);
+      BEGIN_BATCH(2);
+      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
+      OUT_BATCH(brw->cc.state_offset | 1);
+      ADVANCE_BATCH();
+      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                       PIPE_CONTROL_CS_STALL);
    }
 
+   /* Do not restore push constant packets during context restore. */
+   if (devinfo->gen >= 7)
+      gen10_emit_isp_disable(brw);
+
    /* Emit MI_BATCH_BUFFER_END to finish our batch.  Note that execbuf2
     * requires our batch size to be QWord aligned, so we pad it out if
     * necessary by emitting an extra MI_NOOP after the end.
     */
-   intel_batchbuffer_require_space(brw, 8, brw->batch.ring);
+   intel_batchbuffer_require_space(brw, 8);
    *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
    if (USED_BATCH(brw->batch) & 1) {
       *brw->batch.map_next++ = MI_NOOP;
@@ -890,6 +732,7 @@
 
       /* Update brw_bo::gtt_offset */
       if (batch->validation_list[i].offset != bo->gtt_offset) {
+         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
          DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
              bo->gem_handle, bo->gtt_offset,
              batch->validation_list[i].offset);
@@ -906,7 +749,6 @@
 static int
 submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
 {
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    __DRIscreen *dri_screen = brw->screen->driScrnPriv;
    struct intel_batchbuffer *batch = &brw->batch;
    int ret = 0;
@@ -935,18 +777,11 @@
        *   To avoid stalling, execobject.offset should match the current
        *   address of that object within the active context.
        */
-      int flags = I915_EXEC_NO_RELOC;
+      int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;
 
-      if (devinfo->gen >= 6 && batch->ring == BLT_RING) {
-         flags |= I915_EXEC_BLT;
-      } else {
-         flags |= I915_EXEC_RENDER;
-      }
       if (batch->needs_sol_reset)
          flags |= I915_EXEC_GEN7_SOL_RESET;
 
-      uint32_t hw_ctx = batch->ring == RENDER_RING ? brw->hw_ctx : 0;
-
       /* Set statebuffer relocations */
       const unsigned state_index = batch->state.bo->index;
       if (state_index < batch->exec_count &&
@@ -981,15 +816,18 @@
          batch->exec_bos[index] = tmp_bo;
       }
 
-      ret = execbuffer(dri_screen->fd, batch, hw_ctx,
+      ret = execbuffer(dri_screen->fd, batch, brw->hw_ctx,
                        4 * USED_BATCH(*batch),
                        in_fence_fd, out_fence_fd, flags);
 
       throttle(brw);
    }
 
-   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
-      do_batch_dump(brw);
+   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
+      gen_print_batch(&batch->decoder, batch->batch.map,
+                      4 * USED_BATCH(*batch),
+                      batch->batch.bo->gtt_offset);
+   }
 
    if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
       brw_check_for_reset(brw);
@@ -1046,6 +884,8 @@
               (float) brw->batch.aperture_space / (1024 * 1024),
               brw->batch.batch_relocs.reloc_count,
               brw->batch.state_relocs.reloc_count);
+
+      dump_validation_list(&brw->batch);
    }
 
    ret = submit_batch(brw, in_fence_fd, out_fence_fd);
@@ -1092,6 +932,14 @@
 {
    assert(target != NULL);
 
+   if (target->kflags & EXEC_OBJECT_PINNED) {
+      brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
+      return gen_canonical_address(target->gtt_offset + target_offset);
+   }
+
+   unsigned int index = add_exec_bo(batch, target);
+   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
+
    if (rlist->reloc_count == rlist->reloc_array_size) {
       rlist->reloc_array_size *= 2;
       rlist->relocs = realloc(rlist->relocs,
@@ -1099,9 +947,6 @@
                               sizeof(struct drm_i915_gem_relocation_entry));
    }
 
-   unsigned int index = add_exec_bo(batch, target);
-   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
-
    if (reloc_flags & RELOC_32BIT) {
       /* Restrict this buffer to the low 32 bits of the address space.
        *
@@ -1135,6 +980,21 @@
    return entry->offset + target_offset;
 }
 
+/* Register a pinned bo via add_exec_bo(); set EXEC_OBJECT_WRITE if asked. */
+void
+brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
+                  unsigned writable_flag)
+{
+   assert(bo->kflags & EXEC_OBJECT_PINNED);
+   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
+
+   const unsigned slot = add_exec_bo(batch, bo);
+   struct drm_i915_gem_exec_object2 *obj = &batch->validation_list[slot];
+   assert(obj->offset == bo->gtt_offset);
+
+   obj->flags |= writable_flag ? EXEC_OBJECT_WRITE : 0;
+}
+
 uint64_t
 brw_batch_reloc(struct intel_batchbuffer *batch, uint32_t batch_offset,
                 struct brw_bo *target, uint32_t target_offset,
@@ -1157,16 +1017,6 @@
                      target, target_offset, reloc_flags);
 }
 
-
-uint32_t
-brw_state_batch_size(struct brw_context *brw, uint32_t offset)
-{
-   struct hash_entry *entry =
-      _mesa_hash_table_search(brw->batch.state_batch_sizes,
-                              (void *) (uintptr_t) offset);
-   return entry ? (uintptr_t) entry->data : 0;
-}
-
 /**
  * Reserve some space in the statebuffer, or flush.
  *
@@ -1220,10 +1070,10 @@
 
 void
 intel_batchbuffer_data(struct brw_context *brw,
-                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
+                       const void *data, GLuint bytes)
 {
    assert((bytes & 3) == 0);
-   intel_batchbuffer_require_space(brw, bytes, ring);
+   intel_batchbuffer_require_space(brw, bytes);
    memcpy(brw->batch.map_next, data, bytes);
    brw->batch.map_next += bytes >> 2;
 }
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index 7be5b10..95e548c 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -24,9 +24,9 @@
 void intel_batchbuffer_init(struct brw_context *brw);
 void intel_batchbuffer_free(struct intel_batchbuffer *batch);
 void intel_batchbuffer_save_state(struct brw_context *brw);
+bool intel_batchbuffer_saved_state_is_empty(struct brw_context *brw);
 void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
-void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
-                                     enum brw_gpu_ring ring);
+void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz);
 int _intel_batchbuffer_flush_fence(struct brw_context *brw,
                                    int in_fence_fd, int *out_fence_fd,
                                    const char *file, int line);
@@ -43,8 +43,7 @@
  * intel_buffer_dword() calls.
  */
 void intel_batchbuffer_data(struct brw_context *brw,
-                            const void *data, GLuint bytes,
-                            enum brw_gpu_ring ring);
+                            const void *data, GLuint bytes);
 
 bool brw_batch_has_aperture_space(struct brw_context *brw,
                                   unsigned extra_space_in_bytes);
@@ -55,6 +54,10 @@
 #define RELOC_NEEDS_GGTT EXEC_OBJECT_NEEDS_GTT
 /* Inverted meaning, but using the same bit...emit_reloc will flip it. */
 #define RELOC_32BIT EXEC_OBJECT_SUPPORTS_48B_ADDRESS
+
+/* Adds a pinned bo to the batch; writable_flag is 0 or EXEC_OBJECT_WRITE. */
+void brw_use_pinned_bo(struct intel_batchbuffer *batch, struct brw_bo *bo,
+                       unsigned writable_flag);
 uint64_t brw_batch_reloc(struct intel_batchbuffer *batch,
                          uint32_t batch_offset,
                          struct brw_bo *target,
@@ -81,9 +84,9 @@
 }
 
 static inline void
-intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
+intel_batchbuffer_begin(struct brw_context *brw, int n)
 {
-   intel_batchbuffer_require_space(brw, n * 4, ring);
+   intel_batchbuffer_require_space(brw, n * 4);
 
 #ifdef DEBUG
    brw->batch.emit = USED_BATCH(brw->batch);
@@ -117,12 +120,13 @@
 }
 
 #define BEGIN_BATCH(n) do {                            \
-   intel_batchbuffer_begin(brw, (n), RENDER_RING);     \
+   intel_batchbuffer_begin(brw, (n));                  \
    uint32_t *__map = brw->batch.map_next;              \
    brw->batch.map_next += (n)
 
 #define BEGIN_BATCH_BLT(n) do {                        \
-   intel_batchbuffer_begin(brw, (n), BLT_RING);        \
+   assert(brw->screen->devinfo.gen < 6);               \
+   intel_batchbuffer_begin(brw, (n));                  \
    uint32_t *__map = brw->batch.map_next;              \
    brw->batch.map_next += (n)
 
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index ec875c8..dae0a91 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -279,7 +279,7 @@
 
    unsigned length = devinfo->gen >= 8 ? 10 : 8;
 
-   intel_batchbuffer_require_space(brw, length * 4, BLT_RING);
+   intel_batchbuffer_require_space(brw, length * 4);
    DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
        __func__,
        src_buffer, src_pitch, src_offset, src_x, src_y,
@@ -653,7 +653,7 @@
 
    unsigned xy_setup_blt_length = devinfo->gen >= 8 ? 10 : 8;
    intel_batchbuffer_require_space(brw, (xy_setup_blt_length * 4) +
-                                        (3 * 4) + dwords * 4, BLT_RING);
+                                        (3 * 4) + dwords * 4);
 
    opcode = XY_SETUP_BLT_CMD;
    if (cpp == 4)
@@ -691,69 +691,13 @@
    OUT_BATCH(SET_FIELD(y + h, BLT_Y) | SET_FIELD(x + w, BLT_X));
    ADVANCE_BATCH();
 
-   intel_batchbuffer_data(brw, src_bits, dwords * 4, BLT_RING);
+   intel_batchbuffer_data(brw, src_bits, dwords * 4);
 
    brw_emit_mi_flush(brw);
 
    return true;
 }
 
-/* We don't have a memmove-type blit like some other hardware, so we'll do a
- * rectangular blit covering a large space, then emit 1-scanline blit at the
- * end to cover the last if we need.
- */
-void
-intel_emit_linear_blit(struct brw_context *brw,
-		       struct brw_bo *dst_bo,
-		       unsigned int dst_offset,
-		       struct brw_bo *src_bo,
-		       unsigned int src_offset,
-		       unsigned int size)
-{
-   struct gl_context *ctx = &brw->ctx;
-   GLuint pitch, height;
-   int16_t src_x, dst_x;
-   bool ok;
-
-   do {
-      /* The pitch given to the GPU must be DWORD aligned, and
-       * we want width to match pitch. Max width is (1 << 15 - 1),
-       * rounding that down to the nearest DWORD is 1 << 15 - 4
-       */
-      pitch = ROUND_DOWN_TO(MIN2(size, (1 << 15) - 64), 4);
-      height = (size < pitch || pitch == 0) ? 1 : size / pitch;
-
-      src_x = src_offset % 64;
-      dst_x = dst_offset % 64;
-      pitch = ALIGN(MIN2(size, (1 << 15) - 64), 4);
-      assert(src_x + pitch < 1 << 15);
-      assert(dst_x + pitch < 1 << 15);
-
-      ok = emit_copy_blit(brw, 1,
-                          pitch, src_bo, src_offset - src_x,
-                          ISL_TILING_LINEAR,
-                          pitch, dst_bo, dst_offset - dst_x,
-                          ISL_TILING_LINEAR,
-                          src_x, 0, /* src x/y */
-                          dst_x, 0, /* dst x/y */
-                          MIN2(size, pitch), height, /* w, h */
-                          COLOR_LOGICOP_COPY);
-      if (!ok) {
-         _mesa_problem(ctx, "Failed to linear blit %dx%d\n",
-                       MIN2(size, pitch), height);
-         return;
-      }
-
-      pitch *= height;
-      if (size <= pitch)
-         return;
-
-      src_offset += pitch;
-      dst_offset += pitch;
-      size -= pitch;
-   } while (1);
-}
-
 /**
  * Used to initialize the alpha value of an ARGB8888 miptree after copying
  * into it from an XRGB8888 source.
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index f3ca7b0..babdfa4 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -61,11 +61,5 @@
 				  GLshort x, GLshort y,
 				  GLshort w, GLshort h,
 				  enum gl_logicop_mode logic_op);
-void intel_emit_linear_blit(struct brw_context *brw,
-			    struct brw_bo *dst_bo,
-			    unsigned int dst_offset,
-			    struct brw_bo *src_bo,
-			    unsigned int src_offset,
-			    unsigned int size);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/intel_buffer_objects.c b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
index 9deb5e7..452e6d3 100644
--- a/src/mesa/drivers/dri/i965/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/i965/intel_buffer_objects.c
@@ -96,7 +96,8 @@
        */
       size += 64 * 32; /* max read length of 64 256-bit units */
    }
-   intel_obj->buffer = brw_bo_alloc(brw->bufmgr, "bufferobj", size);
+   intel_obj->buffer =
+      brw_bo_alloc(brw->bufmgr, "bufferobj", size, BRW_MEMZONE_OTHER);
 
    /* the buffer might be bound as a uniform buffer, need to update it
     */
@@ -290,7 +291,7 @@
                     intel_obj->valid_data_start,
                     intel_obj->valid_data_end);
          struct brw_bo *temp_bo =
-            brw_bo_alloc(brw->bufmgr, "subdata temp", size);
+            brw_bo_alloc(brw->bufmgr, "subdata temp", size, BRW_MEMZONE_OTHER);
 
          brw_bo_subdata(temp_bo, 0, size, data);
 
@@ -462,7 +463,8 @@
       intel_obj->map_extra[index] = (uintptr_t) offset % alignment;
       intel_obj->range_map_bo[index] =
          brw_bo_alloc(brw->bufmgr, "BO blit temp",
-                      length + intel_obj->map_extra[index]);
+                      length + intel_obj->map_extra[index],
+                      BRW_MEMZONE_OTHER);
       void *map = brw_bo_map(brw, intel_obj->range_map_bo[index], access);
       obj->Mappings[index].Pointer = map + intel_obj->map_extra[index];
       return obj->Mappings[index].Pointer;
diff --git a/src/mesa/drivers/dri/i965/intel_buffers.c b/src/mesa/drivers/dri/i965/intel_buffers.c
index fd522cc..dae56e3 100644
--- a/src/mesa/drivers/dri/i965/intel_buffers.c
+++ b/src/mesa/drivers/dri/i965/intel_buffers.c
@@ -33,7 +33,7 @@
 #include "main/renderbuffer.h"
 
 static void
-intelDrawBuffer(struct gl_context * ctx, GLenum mode)
+intelDrawBuffer(struct gl_context *ctx)
 {
    if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer)) {
       struct brw_context *const brw = brw_context(ctx);
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 73a6c73..2e28445 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -118,10 +118,11 @@
    ctx->Extensions.KHR_robustness = true;
    ctx->Extensions.AMD_seamless_cubemap_per_texture = true;
    ctx->Extensions.APPLE_object_purgeable = true;
-   ctx->Extensions.ATI_separate_stencil = true;
    ctx->Extensions.ATI_texture_env_combine3 = true;
+   ctx->Extensions.MESA_framebuffer_flip_y = true;
    ctx->Extensions.MESA_pack_invert = true;
    ctx->Extensions.NV_conditional_render = true;
+   ctx->Extensions.NV_fog_distance = true;
    ctx->Extensions.NV_primitive_restart = true;
    ctx->Extensions.NV_texture_barrier = true;
    ctx->Extensions.NV_texture_env_combine4 = true;
@@ -147,6 +148,12 @@
       ctx->Const.GLSLVersion = 330;
    else
       ctx->Const.GLSLVersion = 120;
+
+   if (devinfo->gen >= 6)
+      ctx->Const.GLSLVersionCompat = 130;
+   else
+      ctx->Const.GLSLVersionCompat = 120;
+
    _mesa_override_glsl_version(&ctx->Const);
 
    ctx->Extensions.EXT_shader_integer_mix = ctx->Const.GLSLVersion >= 130;
@@ -189,7 +196,8 @@
       ctx->Extensions.ARB_texture_multisample = true;
       ctx->Extensions.ARB_uniform_buffer_object = true;
 
-      ctx->Extensions.AMD_vertex_shader_layer = true;
+      if (ctx->API != API_OPENGL_COMPAT)
+         ctx->Extensions.AMD_vertex_shader_layer = true;
       ctx->Extensions.EXT_framebuffer_multisample = true;
       ctx->Extensions.EXT_framebuffer_multisample_blit_scaled = true;
       ctx->Extensions.EXT_transform_feedback = true;
@@ -217,8 +225,10 @@
       ctx->Extensions.ARB_conservative_depth = true;
       ctx->Extensions.ARB_derivative_control = true;
       ctx->Extensions.ARB_framebuffer_no_attachments = true;
-      ctx->Extensions.ARB_gpu_shader5 = true;
-      ctx->Extensions.ARB_gpu_shader_fp64 = devinfo->has_64bit_types;
+      if (ctx->API != API_OPENGL_COMPAT) {
+         ctx->Extensions.ARB_gpu_shader5 = true;
+         ctx->Extensions.ARB_gpu_shader_fp64 = devinfo->has_64bit_types;
+      }
       ctx->Extensions.ARB_shader_atomic_counters = true;
       ctx->Extensions.ARB_shader_atomic_counter_ops = true;
       ctx->Extensions.ARB_shader_clock = true;
@@ -226,7 +236,8 @@
       ctx->Extensions.ARB_shader_image_size = true;
       ctx->Extensions.ARB_shader_precision = true;
       ctx->Extensions.ARB_shader_texture_image_samples = true;
-      ctx->Extensions.ARB_tessellation_shader = true;
+      if (ctx->API != API_OPENGL_COMPAT)
+         ctx->Extensions.ARB_tessellation_shader = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
       ctx->Extensions.ARB_texture_view = true;
       ctx->Extensions.ARB_shader_storage_buffer_object = true;
@@ -234,6 +245,7 @@
       ctx->Extensions.EXT_shader_samples_identical = true;
       ctx->Extensions.OES_primitive_bounding_box = true;
       ctx->Extensions.OES_texture_buffer = true;
+      ctx->Extensions.ARB_fragment_shader_interlock = true;
 
       if (can_do_pipelined_register_writes(brw->screen)) {
          ctx->Extensions.ARB_draw_indirect = true;
@@ -274,9 +286,10 @@
    if (devinfo->gen >= 8 || devinfo->is_baytrail) {
       /* For now, we only enable OES_copy_image on platforms that support
        * ETC2 natively in hardware.  We would need more hacks to support it
-       * elsewhere.
+       * elsewhere. Same with OES_texture_view.
        */
       ctx->Extensions.OES_copy_image = true;
+      ctx->Extensions.OES_texture_view = true;
    }
 
    if (devinfo->gen >= 8) {
@@ -303,7 +316,7 @@
    if (devinfo->gen >= 6)
       ctx->Extensions.INTEL_performance_query = true;
 
-   if (ctx->API == API_OPENGL_CORE)
+   if (ctx->API != API_OPENGL_COMPAT)
       ctx->Extensions.ARB_base_instance = true;
    if (ctx->API != API_OPENGL_CORE)
       ctx->Extensions.ARB_color_buffer_float = true;
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index ca4008f..e682595 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -105,7 +105,8 @@
 		       GLuint x, GLuint y, GLuint w, GLuint h,
 		       GLbitfield mode,
 		       GLubyte **out_map,
-		       GLint *out_stride)
+		       GLint *out_stride,
+		       bool flip_y)
 {
    struct brw_context *brw = brw_context(ctx);
    struct swrast_renderbuffer *srb = (struct swrast_renderbuffer *)rb;
@@ -162,14 +163,14 @@
     * upside-down.  So we need to ask for a rectangle on flipped vertically, and
     * we then return a pointer to the bottom of it with a negative stride.
     */
-   if (rb->Name == 0) {
+   if (flip_y) {
       y = rb->Height - y - h;
    }
 
    intel_miptree_map(brw, mt, irb->mt_level, irb->mt_layer,
 		     x, y, w, h, mode, &map, &stride);
 
-   if (rb->Name == 0) {
+   if (flip_y) {
       map += (h - 1) * stride;
       stride = -stride;
    }
@@ -845,10 +846,10 @@
          if (!intel_miptree_blit(brw,
                                  src_irb->mt,
                                  src_irb->mt_level, src_irb->mt_layer,
-                                 srcX0, srcY0, src_rb->Name == 0,
+                                 srcX0, srcY0, readFb->FlipY,
                                  dst_irb->mt,
                                  dst_irb->mt_level, dst_irb->mt_layer,
-                                 dstX0, dstY0, dst_rb->Name == 0,
+                                 dstX0, dstY0, drawFb->FlipY,
                                  dstX1 - dstX0, dstY1 - dstY0,
                                  COLOR_LOGICOP_COPY)) {
             perf_debug("glBlitFramebuffer(): unknown blit failure.  "
@@ -915,14 +916,6 @@
       assert(!"Invalid blit");
    }
 
-   /* Try using the BLT engine. */
-   mask = intel_blit_framebuffer_with_blitter(ctx, readFb, drawFb,
-                                              srcX0, srcY0, srcX1, srcY1,
-                                              dstX0, dstY0, dstX1, dstY1,
-                                              mask);
-   if (mask == 0x0)
-      return;
-
    _swrast_BlitFramebuffer(ctx, readFb, drawFb,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index c38abaa..3668135 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -36,7 +36,6 @@
 
 #include "brw_blorp.h"
 #include "brw_context.h"
-#include "brw_meta_util.h"
 #include "brw_state.h"
 
 #include "main/enums.h"
@@ -46,6 +45,9 @@
 #include "main/texcompress_etc.h"
 #include "main/teximage.h"
 #include "main/streaming-load-memcpy.h"
+
+#include "util/format_srgb.h"
+
 #include "x86/common_x86_asm.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
@@ -57,10 +59,6 @@
 static void intel_miptree_unmap_raw(struct intel_mipmap_tree *mt);
 
 static bool
-intel_miptree_alloc_aux(struct brw_context *brw,
-                        struct intel_mipmap_tree *mt);
-
-static bool
 intel_miptree_supports_mcs(struct brw_context *brw,
                            const struct intel_mipmap_tree *mt)
 {
@@ -161,12 +159,8 @@
       return false;
 
    /* MCS is only supported for color buffers */
-   switch (_mesa_get_format_base_format(mt->format)) {
-   case GL_DEPTH_COMPONENT:
-   case GL_DEPTH_STENCIL:
-   case GL_STENCIL_INDEX:
+   if (!_mesa_is_format_color_format(mt->format))
       return false;
-   }
 
    if (mt->cpp != 4 && mt->cpp != 8 && mt->cpp != 16)
       return false;
@@ -206,6 +200,13 @@
    if (devinfo->gen < 8 && (mip_mapped || arrayed))
       return false;
 
+   /* The PRM doesn't say this explicitly, but fast-clears don't appear to
+    * work for 3D textures until gen9 where the layout of 3D textures changes
+    * to match 2D array textures.
+    */
+   if (devinfo->gen <= 8 && mt->surf.dim != ISL_SURF_DIM_2D)
+      return false;
+
    /* There's no point in using an MCS buffer if the surface isn't in a
     * renderable format.
     */
@@ -622,6 +623,7 @@
    if (!bo) {
       mt->bo = brw_bo_alloc_tiled(brw->bufmgr, "isl-miptree",
                                   mt->surf.size,
+                                  BRW_MEMZONE_OTHER,
                                   isl_tiling_to_i915_tiling(
                                      mt->surf.tiling),
                                   mt->surf.row_pitch, alloc_flags);
@@ -647,28 +649,21 @@
    return NULL;
 }
 
-static bool
-make_separate_stencil_surface(struct brw_context *brw,
-                              struct intel_mipmap_tree *mt)
+/* Return the usual surface usage flags for the given format. */
+static isl_surf_usage_flags_t
+mt_surf_usage(mesa_format format)
 {
-   mt->stencil_mt = make_surface(brw, mt->target, MESA_FORMAT_S_UINT8,
-                                 0, mt->surf.levels - 1,
-                                 mt->surf.logical_level0_px.width,
-                                 mt->surf.logical_level0_px.height,
-                                 mt->surf.dim == ISL_SURF_DIM_3D ?
-                                    mt->surf.logical_level0_px.depth :
-                                    mt->surf.logical_level0_px.array_len,
-                                 mt->surf.samples, ISL_TILING_W_BIT,
-                                 ISL_SURF_USAGE_STENCIL_BIT |
-                                 ISL_SURF_USAGE_TEXTURE_BIT,
-                                 BO_ALLOC_BUSY, 0, NULL);
-
-   if (!mt->stencil_mt)
-      return false;
-
-   mt->stencil_mt->r8stencil_needs_update = true;
-
-   return true;
+   switch(_mesa_get_format_base_format(format)) {
+   case GL_DEPTH_COMPONENT:
+      return ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT;
+   case GL_DEPTH_STENCIL:
+      return ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_STENCIL_BIT |
+             ISL_SURF_USAGE_TEXTURE_BIT;
+   case GL_STENCIL_INDEX:
+      return ISL_SURF_USAGE_STENCIL_BIT | ISL_SURF_USAGE_TEXTURE_BIT;
+   default:
+      return ISL_SURF_USAGE_RENDER_TARGET_BIT | ISL_SURF_USAGE_TEXTURE_BIT;
+   }
 }
 
 static struct intel_mipmap_tree *
@@ -684,75 +679,48 @@
                enum intel_miptree_create_flags flags)
 {
    const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   const uint32_t alloc_flags =
+      (flags & MIPTREE_CREATE_BUSY || num_samples > 1) ? BO_ALLOC_BUSY : 0;
+   isl_tiling_flags_t tiling_flags = ISL_TILING_ANY_MASK;
 
-   if (format == MESA_FORMAT_S_UINT8)
-      return make_surface(brw, target, format, first_level, last_level,
-                          width0, height0, depth0, num_samples,
-                          ISL_TILING_W_BIT,
-                          ISL_SURF_USAGE_STENCIL_BIT |
-                          ISL_SURF_USAGE_TEXTURE_BIT,
-                          BO_ALLOC_BUSY,
-                          0,
-                          NULL);
+   /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. */
+   if (devinfo->gen < 6 && _mesa_is_format_color_format(format))
+      tiling_flags &= ~ISL_TILING_Y0_BIT;
 
-   const GLenum base_format = _mesa_get_format_base_format(format);
-   if ((base_format == GL_DEPTH_COMPONENT ||
-        base_format == GL_DEPTH_STENCIL) &&
-       !(flags & MIPTREE_CREATE_LINEAR)) {
+   mesa_format mt_fmt;
+   if (_mesa_is_format_color_format(format)) {
+      mt_fmt = intel_lower_compressed_format(brw, format);
+   } else {
       /* Fix up the Z miptree format for how we're splitting out separate
-       * stencil.  Gen7 expects there to be no stencil bits in its depth buffer.
+       * stencil. Gen7 expects there to be no stencil bits in its depth buffer.
        */
-      const mesa_format depth_only_format =
-         intel_depth_format_for_depthstencil_format(format);
-      struct intel_mipmap_tree *mt = make_surface(
-         brw, target, devinfo->gen >= 6 ? depth_only_format : format,
-         first_level, last_level,
-         width0, height0, depth0, num_samples, ISL_TILING_Y0_BIT,
-         ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT,
-         BO_ALLOC_BUSY, 0, NULL);
+      mt_fmt = (devinfo->gen < 6) ? format :
+               intel_depth_format_for_depthstencil_format(format);
+   }
 
-      if (needs_separate_stencil(brw, mt, format) &&
-          !make_separate_stencil_surface(brw, mt)) {
+   struct intel_mipmap_tree *mt =
+      make_surface(brw, target, mt_fmt, first_level, last_level,
+                   width0, height0, depth0, num_samples,
+                   tiling_flags, mt_surf_usage(mt_fmt),
+                   alloc_flags, 0, NULL);
+
+   if (mt == NULL)
+      return NULL;
+
+   if (needs_separate_stencil(brw, mt, format)) {
+      mt->stencil_mt =
+         make_surface(brw, target, MESA_FORMAT_S_UINT8, first_level, last_level,
+                      width0, height0, depth0, num_samples,
+                      ISL_TILING_W_BIT, mt_surf_usage(MESA_FORMAT_S_UINT8),
+                      alloc_flags, 0, NULL);
+      if (mt->stencil_mt == NULL) {
          intel_miptree_release(&mt);
          return NULL;
       }
-
-      if (!(flags & MIPTREE_CREATE_NO_AUX))
-         intel_miptree_choose_aux_usage(brw, mt);
-
-      return mt;
    }
 
-   mesa_format tex_format = format;
-   mesa_format etc_format = MESA_FORMAT_NONE;
-   uint32_t alloc_flags = 0;
-
-   format = intel_lower_compressed_format(brw, format);
-
-   etc_format = (format != tex_format) ? tex_format : MESA_FORMAT_NONE;
-
-   if (flags & MIPTREE_CREATE_BUSY)
-      alloc_flags |= BO_ALLOC_BUSY;
-
-   isl_tiling_flags_t tiling_flags = (flags & MIPTREE_CREATE_LINEAR) ?
-      ISL_TILING_LINEAR_BIT : ISL_TILING_ANY_MASK;
-
-   /* TODO: This used to be because there wasn't BLORP to handle Y-tiling. */
-   if (devinfo->gen < 6)
-      tiling_flags &= ~ISL_TILING_Y0_BIT;
-
-   struct intel_mipmap_tree *mt = make_surface(
-                                     brw, target, format,
-                                     first_level, last_level,
-                                     width0, height0, depth0,
-                                     num_samples, tiling_flags,
-                                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
-                                     ISL_SURF_USAGE_TEXTURE_BIT,
-                                     alloc_flags, 0, NULL);
-   if (!mt)
-      return NULL;
-
-   mt->etc_format = etc_format;
+   mt->etc_format = (_mesa_is_format_color_format(format) && mt_fmt != format) ?
+                    format : MESA_FORMAT_NONE;
 
    if (!(flags & MIPTREE_CREATE_NO_AUX))
       intel_miptree_choose_aux_usage(brw, mt);
@@ -784,7 +752,12 @@
 
    mt->offset = 0;
 
-   if (!intel_miptree_alloc_aux(brw, mt)) {
+   /* Create the auxiliary surface up-front unless the aux usage is CCS_D.
+    * CCS_D can only compress clear color, so we wait until an actual
+    * fast-clear to allocate it.
+    */
+   if (mt->aux_usage != ISL_AUX_USAGE_CCS_D &&
+       !intel_miptree_alloc_aux(brw, mt)) {
       intel_miptree_release(&mt);
       return NULL;
    }
@@ -811,12 +784,11 @@
 
    if ((base_format == GL_DEPTH_COMPONENT ||
         base_format == GL_DEPTH_STENCIL)) {
-      const mesa_format depth_only_format =
+      const mesa_format mt_fmt = (devinfo->gen < 6) ? format :
          intel_depth_format_for_depthstencil_format(format);
-      mt = make_surface(brw, target,
-                        devinfo->gen >= 6 ? depth_only_format : format,
+      mt = make_surface(brw, target, mt_fmt,
                         0, 0, width, height, depth, 1, ISL_TILING_Y0_BIT,
-                        ISL_SURF_USAGE_DEPTH_BIT | ISL_SURF_USAGE_TEXTURE_BIT,
+                        mt_surf_usage(mt_fmt),
                         0, pitch, bo);
       if (!mt)
          return NULL;
@@ -831,8 +803,7 @@
       mt = make_surface(brw, target, MESA_FORMAT_S_UINT8,
                         0, 0, width, height, depth, 1,
                         ISL_TILING_W_BIT,
-                        ISL_SURF_USAGE_STENCIL_BIT |
-                        ISL_SURF_USAGE_TEXTURE_BIT,
+                        mt_surf_usage(MESA_FORMAT_S_UINT8),
                         0, pitch, bo);
       if (!mt)
          return NULL;
@@ -854,16 +825,10 @@
     */
    assert(pitch >= 0);
 
-   /* The BO already has a tiling format and we shouldn't confuse the lower
-    * layers by making it try to find a tiling format again.
-    */
-   assert((flags & MIPTREE_CREATE_LINEAR) == 0);
-
    mt = make_surface(brw, target, format,
                      0, 0, width, height, depth, 1,
                      1lu << tiling,
-                     ISL_SURF_USAGE_RENDER_TARGET_BIT |
-                     ISL_SURF_USAGE_TEXTURE_BIT,
+                     mt_surf_usage(format),
                      0, pitch, bo);
    if (!mt)
       return NULL;
@@ -875,7 +840,12 @@
    if (!(flags & MIPTREE_CREATE_NO_AUX)) {
       intel_miptree_choose_aux_usage(brw, mt);
 
-      if (!intel_miptree_alloc_aux(brw, mt)) {
+      /* Create the auxiliary surface up-front unless the aux usage is CCS_D.
+       * CCS_D can only compress clear color, so we wait until an actual
+       * fast-clear to allocate it.
+       */
+      if (mt->aux_usage != ISL_AUX_USAGE_CCS_D &&
+          !intel_miptree_alloc_aux(brw, mt)) {
          intel_miptree_release(&mt);
          return NULL;
       }
@@ -973,11 +943,12 @@
     * system with CCS, we don't have the extra space at the end of the aux
     * buffer. So create a new bo here that will store that clear color.
     */
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   if (devinfo->gen >= 10) {
+   if (brw->isl_dev.ss.clear_color_state_size > 0) {
       mt->aux_buf->clear_color_bo =
-         brw_bo_alloc(brw->bufmgr, "clear_color_bo",
-                      brw->isl_dev.ss.clear_color_state_size);
+         brw_bo_alloc_tiled(brw->bufmgr, "clear_color_bo",
+                            brw->isl_dev.ss.clear_color_state_size,
+                            BRW_MEMZONE_OTHER, I915_TILING_NONE, 0,
+                            BO_ALLOC_ZEROED);
       if (!mt->aux_buf->clear_color_bo) {
          free(mt->aux_buf);
          mt->aux_buf = NULL;
@@ -989,9 +960,6 @@
    brw_bo_reference(image->bo);
 
    mt->aux_buf->offset = image->aux_offset;
-   mt->aux_buf->size = image->bo->size - image->aux_offset;
-   mt->aux_buf->pitch = image->aux_pitch;
-   mt->aux_buf->qpitch = 0;
    mt->aux_buf->surf = temp_ccs_surf;
 
    return true;
@@ -1001,7 +969,7 @@
 intel_miptree_create_for_dri_image(struct brw_context *brw,
                                    __DRIimage *image, GLenum target,
                                    mesa_format format,
-                                   bool is_winsys_image)
+                                   bool allow_internal_aux)
 {
    uint32_t bo_tiling, bo_swizzle;
    brw_bo_get_tiling(image->bo, &bo_tiling, &bo_swizzle);
@@ -1044,7 +1012,7 @@
     * other hand, have no resolve point so we can't have aux without a
     * modifier.
     */
-   if (!is_winsys_image)
+   if (!allow_internal_aux)
       mt_create_flags |= MIPTREE_CREATE_NO_AUX;
 
    /* If we have a modifier which specifies aux, don't create one yet */
@@ -1093,7 +1061,7 @@
        * as part of the flush operation.
        */
       mt->supports_fast_clear =
-         is_winsys_image || mod_info->supports_clear_color;
+         allow_internal_aux || mod_info->supports_clear_color;
 
       /* We don't know the actual state of the surface when we get it but we
        * can make a pretty good guess based on the modifier.  What we do know
@@ -1555,6 +1523,7 @@
                          unsigned dst_level, unsigned dst_layer)
 
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    mesa_format format = src_mt->format;
    unsigned width = minify(src_mt->surf.phys_level0_sa.width,
                            src_level - src_mt->first_level);
@@ -1567,6 +1536,32 @@
    assert(_mesa_get_srgb_format_linear(src_mt->format) ==
           _mesa_get_srgb_format_linear(dst_mt->format));
 
+   DBG("validate blit mt %s %p %d,%d -> mt %s %p %d,%d (%dx%d)\n",
+       _mesa_get_format_name(src_mt->format),
+       src_mt, src_level, src_layer,
+       _mesa_get_format_name(dst_mt->format),
+       dst_mt, dst_level, dst_layer,
+       width, height);
+
+   if (devinfo->gen >= 6) {
+      /* On gen6 and above, we just use blorp.  It's faster than the blitter
+       * and can handle everything without software fallbacks.
+       */
+      brw_blorp_copy_miptrees(brw,
+                              src_mt, src_level, src_layer,
+                              dst_mt, dst_level, dst_layer,
+                              0, 0, 0, 0, width, height);
+
+      if (src_mt->stencil_mt) {
+         assert(dst_mt->stencil_mt);
+         brw_blorp_copy_miptrees(brw,
+                                 src_mt->stencil_mt, src_level, src_layer,
+                                 dst_mt->stencil_mt, dst_level, dst_layer,
+                                 0, 0, 0, 0, width, height);
+      }
+      return;
+   }
+
    if (dst_mt->compressed) {
       unsigned int i, j;
       _mesa_get_format_block_size(dst_mt->format, &i, &j);
@@ -1574,17 +1569,8 @@
       width = ALIGN_NPOT(width, i) / i;
    }
 
-   /* If it's a packed depth/stencil buffer with separate stencil, the blit
-    * below won't apply since we can't do the depth's Y tiling or the
-    * stencil's W tiling in the blitter.
-    */
-   if (src_mt->stencil_mt) {
-      intel_miptree_copy_slice_sw(brw,
-                                  src_mt, src_level, src_layer,
-                                  dst_mt, dst_level, dst_layer,
-                                  width, height);
-      return;
-   }
+   /* Gen4-5 doesn't support separate stencil */
+   assert(!src_mt->stencil_mt);
 
    uint32_t dst_x, dst_y, src_x, src_y;
    intel_miptree_get_image_offset(dst_mt, dst_level, dst_layer,
@@ -1653,74 +1639,75 @@
    intel_obj->needs_validate = true;
 }
 
-static bool
-intel_miptree_init_mcs(struct brw_context *brw,
-                       struct intel_mipmap_tree *mt,
-                       int init_value)
-{
-   assert(mt->aux_buf != NULL);
-
-   /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
-    *
-    *     When MCS buffer is enabled and bound to MSRT, it is required that it
-    *     is cleared prior to any rendering.
-    *
-    * Since we don't use the MCS buffer for any purpose other than rendering,
-    * it makes sense to just clear it immediately upon allocation.
-    *
-    * Note: the clear value for MCS buffers is all 1's, so we memset to 0xff.
-    */
-   void *map = brw_bo_map(brw, mt->aux_buf->bo, MAP_WRITE | MAP_RAW);
-   if (unlikely(map == NULL)) {
-      fprintf(stderr, "Failed to map mcs buffer into GTT\n");
-      intel_miptree_aux_buffer_free(mt->aux_buf);
-      mt->aux_buf = NULL;
-      return false;
-   }
-   void *data = map;
-   memset(data, init_value, mt->aux_buf->size);
-   brw_bo_unmap(mt->aux_buf->bo);
-   return true;
-}
-
 static struct intel_miptree_aux_buffer *
 intel_alloc_aux_buffer(struct brw_context *brw,
-                       const char *name,
                        const struct isl_surf *aux_surf,
-                       uint32_t alloc_flags,
-                       struct intel_mipmap_tree *mt)
+                       bool wants_memset,
+                       uint8_t memset_value)
 {
    struct intel_miptree_aux_buffer *buf = calloc(sizeof(*buf), 1);
    if (!buf)
       return false;
 
-   buf->size = aux_surf->size;
+   uint64_t size = aux_surf->size;
 
-   const struct gen_device_info *devinfo = &brw->screen->devinfo;
-   if (devinfo->gen >= 10) {
-      /* On CNL, instead of setting the clear color in the SURFACE_STATE, we
+   const bool has_indirect_clear = brw->isl_dev.ss.clear_color_state_size > 0;
+   if (has_indirect_clear) {
+      /* On CNL+, instead of setting the clear color in the SURFACE_STATE, we
        * will set a pointer to a dword somewhere that contains the color. So,
        * allocate the space for the clear color value here on the aux buffer.
        */
-      buf->clear_color_offset = buf->size;
-      buf->size += brw->isl_dev.ss.clear_color_state_size;
+      buf->clear_color_offset = size;
+      size += brw->isl_dev.ss.clear_color_state_size;
    }
 
-   buf->pitch = aux_surf->row_pitch;
-   buf->qpitch = isl_surf_get_array_pitch_sa_rows(aux_surf);
+   /* If the buffer needs to be initialised (requiring the buffer to be
+    * immediately mapped to cpu space for writing), do not use BO_ALLOC_BUSY,
+    * which can cause an unnecessary delay if the backing pages happened to
+    * have just been used by the GPU.
+    */
+   const bool alloc_zeroed = wants_memset && memset_value == 0;
+   const bool needs_memset =
+      !alloc_zeroed && (wants_memset || has_indirect_clear);
+   const uint32_t alloc_flags =
+      alloc_zeroed ? BO_ALLOC_ZEROED : (needs_memset ? 0 : BO_ALLOC_BUSY);
 
    /* ISL has stricter set of alignment rules then the drm allocator.
     * Therefore one can pass the ISL dimensions in terms of bytes instead of
     * trying to recalculate based on different format block sizes.
     */
-   buf->bo = brw_bo_alloc_tiled(brw->bufmgr, name, buf->size,
-                                I915_TILING_Y, buf->pitch, alloc_flags);
+   buf->bo = brw_bo_alloc_tiled(brw->bufmgr, "aux-miptree", size,
+                                BRW_MEMZONE_OTHER, I915_TILING_Y,
+                                aux_surf->row_pitch, alloc_flags);
    if (!buf->bo) {
       free(buf);
       return NULL;
    }
 
-   if (devinfo->gen >= 10) {
+   /* Initialize the bo to the desired value */
+   if (needs_memset) {
+      assert(!(alloc_flags & BO_ALLOC_BUSY));
+
+      void *map = brw_bo_map(brw, buf->bo, MAP_WRITE | MAP_RAW);
+      if (map == NULL) {
+         intel_miptree_aux_buffer_free(buf);
+         return NULL;
+      }
+
+      /* Memset the aux_surf portion of the BO. */
+      if (wants_memset)
+         memset(map, memset_value, aux_surf->size);
+
+      /* Zero the indirect clear color to match ::fast_clear_color. */
+      if (has_indirect_clear) {
+         memset((char *)map + buf->clear_color_offset, 0,
+                brw->isl_dev.ss.clear_color_state_size);
+      }
+
+      brw_bo_unmap(buf->bo);
+   }
+
+   if (has_indirect_clear) {
       buf->clear_color_bo = buf->bo;
       brw_bo_reference(buf->clear_color_bo);
    }
@@ -1730,94 +1717,9 @@
    return buf;
 }
 
-static bool
-intel_miptree_alloc_mcs(struct brw_context *brw,
-                        struct intel_mipmap_tree *mt,
-                        GLuint num_samples)
-{
-   assert(brw->screen->devinfo.gen >= 7); /* MCS only used on Gen7+ */
-   assert(mt->aux_buf == NULL);
-   assert(mt->aux_usage == ISL_AUX_USAGE_MCS);
-
-   /* Multisampled miptrees are only supported for single level. */
-   assert(mt->first_level == 0);
-   enum isl_aux_state **aux_state =
-      create_aux_state_map(mt, ISL_AUX_STATE_CLEAR);
-   if (!aux_state)
-      return false;
-
-   struct isl_surf temp_mcs_surf;
-
-   MAYBE_UNUSED bool ok =
-      isl_surf_get_mcs_surf(&brw->isl_dev, &mt->surf, &temp_mcs_surf);
-   assert(ok);
-
-   /* Buffer needs to be initialised requiring the buffer to be immediately
-    * mapped to cpu space for writing. Therefore do not use the gpu access
-    * flag which can cause an unnecessary delay if the backing pages happened
-    * to be just used by the GPU.
-    */
-   const uint32_t alloc_flags = 0;
-   mt->aux_buf = intel_alloc_aux_buffer(brw, "mcs-miptree",
-                                        &temp_mcs_surf, alloc_flags, mt);
-   if (!mt->aux_buf ||
-       !intel_miptree_init_mcs(brw, mt, 0xFF)) {
-      free(aux_state);
-      return false;
-   }
-
-   mt->aux_state = aux_state;
-
-   return true;
-}
-
-bool
-intel_miptree_alloc_ccs(struct brw_context *brw,
-                        struct intel_mipmap_tree *mt)
-{
-   assert(mt->aux_buf == NULL);
-   assert(mt->aux_usage == ISL_AUX_USAGE_CCS_E ||
-          mt->aux_usage == ISL_AUX_USAGE_CCS_D);
-
-   struct isl_surf temp_ccs_surf;
-
-   if (!isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &temp_ccs_surf, 0))
-      return false;
-
-   assert(temp_ccs_surf.size &&
-          (temp_ccs_surf.size % temp_ccs_surf.row_pitch == 0));
-
-   enum isl_aux_state **aux_state =
-      create_aux_state_map(mt, ISL_AUX_STATE_PASS_THROUGH);
-   if (!aux_state)
-      return false;
-
-   /* When CCS_E is used, we need to ensure that the CCS starts off in a valid
-    * state.  From the Sky Lake PRM, "MCS Buffer for Render Target(s)":
-    *
-    *    "If Software wants to enable Color Compression without Fast clear,
-    *    Software needs to initialize MCS with zeros."
-    *
-    * A CCS value of 0 indicates that the corresponding block is in the
-    * pass-through state which is what we want.
-    *
-    * For CCS_D, do the same thing. On gen9+, this avoids having any undefined
-    * bits in the aux buffer.
-    */
-   mt->aux_buf = intel_alloc_aux_buffer(brw, "ccs-miptree", &temp_ccs_surf,
-                                        BO_ALLOC_ZEROED, mt);
-   if (!mt->aux_buf) {
-      free(aux_state);
-      return false;
-   }
-
-   mt->aux_state = aux_state;
-
-   return true;
-}
 
 /**
- * Helper for intel_miptree_alloc_hiz() that sets
+ * Helper for intel_miptree_alloc_aux() that sets
  * \c mt->level[level].has_hiz. Return true if and only if
  * \c has_hiz was set.
  */
@@ -1852,41 +1754,6 @@
    return true;
 }
 
-bool
-intel_miptree_alloc_hiz(struct brw_context *brw,
-			struct intel_mipmap_tree *mt)
-{
-   assert(mt->aux_buf == NULL);
-   assert(mt->aux_usage == ISL_AUX_USAGE_HIZ);
-
-   enum isl_aux_state **aux_state =
-      create_aux_state_map(mt, ISL_AUX_STATE_AUX_INVALID);
-   if (!aux_state)
-      return false;
-
-   struct isl_surf temp_hiz_surf;
-
-   MAYBE_UNUSED bool ok =
-      isl_surf_get_hiz_surf(&brw->isl_dev, &mt->surf, &temp_hiz_surf);
-   assert(ok);
-
-   const uint32_t alloc_flags = BO_ALLOC_BUSY;
-   mt->aux_buf = intel_alloc_aux_buffer(brw, "hiz-miptree",
-                                        &temp_hiz_surf, alloc_flags, mt);
-
-   if (!mt->aux_buf) {
-      free(aux_state);
-      return false;
-   }
-
-   for (unsigned level = mt->first_level; level <= mt->last_level; ++level)
-      intel_miptree_level_enable_hiz(brw, mt, level);
-
-   mt->aux_state = aux_state;
-
-   return true;
-}
-
 
 /**
  * Allocate the initial aux surface for a miptree based on mt->aux_usage
@@ -1895,42 +1762,96 @@
  * create the auxiliary surfaces up-front.  CCS_D, on the other hand, can only
  * compress clear color so we wait until an actual fast-clear to allocate it.
  */
-static bool
+bool
 intel_miptree_alloc_aux(struct brw_context *brw,
                         struct intel_mipmap_tree *mt)
 {
+   assert(mt->aux_buf == NULL);
+
+   /* Get the aux buf allocation parameters for this miptree. */
+   enum isl_aux_state initial_state;
+   uint8_t memset_value;
+   struct isl_surf aux_surf;
+   MAYBE_UNUSED bool aux_surf_ok = false;
+
    switch (mt->aux_usage) {
    case ISL_AUX_USAGE_NONE:
-      return true;
-
+      aux_surf.size = 0;
+      aux_surf_ok = true;
+      break;
    case ISL_AUX_USAGE_HIZ:
-      assert(!_mesa_is_format_color_format(mt->format));
-      if (!intel_miptree_alloc_hiz(brw, mt))
-         return false;
-      return true;
-
+      initial_state = ISL_AUX_STATE_AUX_INVALID;
+      memset_value = 0;
+      aux_surf_ok = isl_surf_get_hiz_surf(&brw->isl_dev, &mt->surf, &aux_surf);
+      break;
    case ISL_AUX_USAGE_MCS:
-      assert(_mesa_is_format_color_format(mt->format));
-      assert(mt->surf.samples > 1);
-      if (!intel_miptree_alloc_mcs(brw, mt, mt->surf.samples))
-         return false;
-      return true;
-
-   case ISL_AUX_USAGE_CCS_D:
-      /* Since CCS_D can only compress clear color so we wait until an actual
-       * fast-clear to allocate it.
+      /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
+       *
+       *     When MCS buffer is enabled and bound to MSRT, it is required that
+       *     it is cleared prior to any rendering.
+       *
+       * Since we don't use the MCS buffer for any purpose other than
+       * rendering, it makes sense to just clear it immediately upon
+       * allocation.
+       *
+       * Note: the clear value for MCS buffers is all 1's, so we memset to
+       * 0xff.
        */
-      return true;
-
+      initial_state = ISL_AUX_STATE_CLEAR;
+      memset_value = 0xFF;
+      aux_surf_ok = isl_surf_get_mcs_surf(&brw->isl_dev, &mt->surf, &aux_surf);
+      break;
+   case ISL_AUX_USAGE_CCS_D:
    case ISL_AUX_USAGE_CCS_E:
-      assert(_mesa_is_format_color_format(mt->format));
-      assert(mt->surf.samples == 1);
-      if (!intel_miptree_alloc_ccs(brw, mt))
-         return false;
-      return true;
+      /* When CCS_E is used, we need to ensure that the CCS starts off in a
+       * valid state.  From the Sky Lake PRM, "MCS Buffer for Render
+       * Target(s)":
+       *
+       *    "If Software wants to enable Color Compression without Fast
+       *    clear, Software needs to initialize MCS with zeros."
+       *
+       * A CCS value of 0 indicates that the corresponding block is in the
+       * pass-through state which is what we want.
+       *
+       * For CCS_D, do the same thing. On gen9+, this avoids having any
+       * undefined bits in the aux buffer.
+       */
+      initial_state = ISL_AUX_STATE_PASS_THROUGH;
+      memset_value = 0;
+      aux_surf_ok =
+         isl_surf_get_ccs_surf(&brw->isl_dev, &mt->surf, &aux_surf, 0);
+      break;
    }
 
-   unreachable("Invalid aux usage");
+   /* We should have a valid aux_surf. */
+   assert(aux_surf_ok);
+
+   /* No work is needed for a zero-sized auxiliary buffer. */
+   if (aux_surf.size == 0)
+      return true;
+
+   /* Create the aux_state for the auxiliary buffer. */
+   mt->aux_state = create_aux_state_map(mt, initial_state);
+   if (mt->aux_state == NULL)
+      return false;
+
+   /* Allocate the auxiliary buffer. */
+   const bool needs_memset = initial_state != ISL_AUX_STATE_AUX_INVALID;
+   mt->aux_buf = intel_alloc_aux_buffer(brw, &aux_surf, needs_memset,
+                                        memset_value);
+   if (mt->aux_buf == NULL) {
+      free_aux_state_map(mt->aux_state);
+      mt->aux_state = NULL;
+      return false;
+   }
+
+   /* Perform aux_usage-specific initialization. */
+   if (mt->aux_usage == ISL_AUX_USAGE_HIZ) {
+      for (unsigned level = mt->first_level; level <= mt->last_level; ++level)
+         intel_miptree_level_enable_hiz(brw, mt, level);
+   }
+
+   return true;
 }
 
 
@@ -2500,11 +2421,13 @@
                            uint32_t start_layer, uint32_t num_layers,
                            enum isl_aux_usage aux_usage)
 {
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
    num_layers = miptree_layer_range_length(mt, level, start_layer, num_layers);
 
    switch (mt->aux_usage) {
    case ISL_AUX_USAGE_NONE:
-      /* Nothing to do */
+      if (mt->format == MESA_FORMAT_S_UINT8 && devinfo->gen <= 7)
+         mt->r8stencil_needs_update = true;
       break;
 
    case ISL_AUX_USAGE_MCS:
@@ -2743,11 +2666,11 @@
          return ISL_AUX_USAGE_NONE;
       }
 
-      /* gen9 hardware technically supports non-0/1 clear colors with sRGB
+      /* gen9+ hardware technically supports non-0/1 clear colors with sRGB
        * formats.  However, there are issues with blending where it doesn't
        * properly apply the sRGB curve to the clear color when blending.
        */
-      if (devinfo->gen == 9 && blend_enabled &&
+      if (devinfo->gen >= 9 && blend_enabled &&
           isl_format_is_srgb(render_format) &&
           !isl_color_value_is_zero_one(mt->fast_clear_color, render_format))
          return ISL_AUX_USAGE_NONE;
@@ -3005,7 +2928,7 @@
    assert(devinfo->gen >= 7);
    struct intel_mipmap_tree *src =
       mt->format == MESA_FORMAT_S_UINT8 ? mt : mt->stencil_mt;
-   if (!src || devinfo->gen >= 8 || !src->r8stencil_needs_update)
+   if (!src || devinfo->gen >= 8)
       return;
 
    assert(src->surf.size > 0);
@@ -3029,6 +2952,9 @@
       assert(mt->r8stencil_mt);
    }
 
+   if (src->r8stencil_needs_update == false)
+      return;
+
    struct intel_mipmap_tree *dst = mt->r8stencil_mt;
 
    for (int level = src->first_level; level <= src->last_level; level++) {
@@ -3072,6 +2998,15 @@
 }
 
 static void
+intel_miptree_unmap_gtt(struct brw_context *brw,
+                        struct intel_mipmap_tree *mt,
+                        struct intel_miptree_map *map,
+                        unsigned int level, unsigned int slice)
+{
+   intel_miptree_unmap_raw(mt);
+}
+
+static void
 intel_miptree_map_gtt(struct brw_context *brw,
 		      struct intel_mipmap_tree *mt,
 		      struct intel_miptree_map *map,
@@ -3093,6 +3028,9 @@
    y /= bh;
    x /= bw;
 
+   intel_miptree_access_raw(brw, mt, level, slice,
+                            map->mode & GL_MAP_WRITE_BIT);
+
    base = intel_miptree_map_raw(brw, mt, map->mode);
 
    if (base == NULL)
@@ -3116,12 +3054,37 @@
        map->x, map->y, map->w, map->h,
        mt, _mesa_get_format_name(mt->format),
        x, y, map->ptr, map->stride);
+
+   map->unmap = intel_miptree_unmap_gtt;
 }
 
 static void
-intel_miptree_unmap_gtt(struct intel_mipmap_tree *mt)
+intel_miptree_unmap_blit(struct brw_context *brw,
+			 struct intel_mipmap_tree *mt,
+			 struct intel_miptree_map *map,
+			 unsigned int level,
+			 unsigned int slice)
 {
-   intel_miptree_unmap_raw(mt);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   struct gl_context *ctx = &brw->ctx;
+
+   intel_miptree_unmap_raw(map->linear_mt);
+
+   if (map->mode & GL_MAP_WRITE_BIT) {
+      if (devinfo->gen >= 6) {
+         brw_blorp_copy_miptrees(brw, map->linear_mt, 0, 0,
+                                 mt, level, slice,
+                                 0, 0, map->x, map->y, map->w, map->h);
+      } else {
+         bool ok = intel_miptree_copy(brw,
+                                      map->linear_mt, 0, 0, 0, 0,
+                                      mt, level, slice, map->x, map->y,
+                                      map->w, map->h);
+         WARN_ONCE(!ok, "Failed to blit from linear temporary mapping");
+      }
+   }
+
+   intel_miptree_release(&map->linear_mt);
 }
 
 static void
@@ -3130,12 +3093,13 @@
 		       struct intel_miptree_map *map,
 		       unsigned int level, unsigned int slice)
 {
-   map->linear_mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
-                                         /* first_level */ 0,
-                                         /* last_level */ 0,
-                                         map->w, map->h, 1,
-                                         /* samples */ 1,
-                                         MIPTREE_CREATE_LINEAR);
+   const struct gen_device_info *devinfo = &brw->screen->devinfo;
+   map->linear_mt = make_surface(brw, GL_TEXTURE_2D, mt->format,
+                                 0, 0, map->w, map->h, 1, 1,
+                                 ISL_TILING_LINEAR_BIT,
+                                 ISL_SURF_USAGE_RENDER_TARGET_BIT |
+                                 ISL_SURF_USAGE_TEXTURE_BIT,
+                                 0, 0, NULL);
 
    if (!map->linear_mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
@@ -3149,12 +3113,18 @@
     * temporary buffer back out.
     */
    if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
-      if (!intel_miptree_copy(brw,
-                              mt, level, slice, map->x, map->y,
-                              map->linear_mt, 0, 0, 0, 0,
-                              map->w, map->h)) {
-         fprintf(stderr, "Failed to blit\n");
-         goto fail;
+      if (devinfo->gen >= 6) {
+         brw_blorp_copy_miptrees(brw, mt, level, slice,
+                                 map->linear_mt, 0, 0,
+                                 map->x, map->y, 0, 0, map->w, map->h);
+      } else {
+         if (!intel_miptree_copy(brw,
+                                 mt, level, slice, map->x, map->y,
+                                 map->linear_mt, 0, 0, 0, 0,
+                                 map->w, map->h)) {
+            fprintf(stderr, "Failed to blit\n");
+            goto fail;
+         }
       }
    }
 
@@ -3165,6 +3135,7 @@
        mt, _mesa_get_format_name(mt->format),
        level, slice, map->ptr, map->stride);
 
+   map->unmap = intel_miptree_unmap_blit;
    return;
 
 fail:
@@ -3173,33 +3144,23 @@
    map->stride = 0;
 }
 
-static void
-intel_miptree_unmap_blit(struct brw_context *brw,
-			 struct intel_mipmap_tree *mt,
-			 struct intel_miptree_map *map,
-			 unsigned int level,
-			 unsigned int slice)
-{
-   struct gl_context *ctx = &brw->ctx;
-
-   intel_miptree_unmap_raw(map->linear_mt);
-
-   if (map->mode & GL_MAP_WRITE_BIT) {
-      bool ok = intel_miptree_copy(brw,
-                                   map->linear_mt, 0, 0, 0, 0,
-                                   mt, level, slice, map->x, map->y,
-                                   map->w, map->h);
-      WARN_ONCE(!ok, "Failed to blit from linear temporary mapping");
-   }
-
-   intel_miptree_release(&map->linear_mt);
-}
-
 /**
  * "Map" a buffer by copying it to an untiled temporary using MOVNTDQA.
  */
 #if defined(USE_SSE41)
 static void
+intel_miptree_unmap_movntdqa(struct brw_context *brw,
+                             struct intel_mipmap_tree *mt,
+                             struct intel_miptree_map *map,
+                             unsigned int level,
+                             unsigned int slice)
+{
+   _mesa_align_free(map->buffer);
+   map->buffer = NULL;
+   map->ptr = NULL;
+}
+
+static void
 intel_miptree_map_movntdqa(struct brw_context *brw,
                            struct intel_mipmap_tree *mt,
                            struct intel_miptree_map *map,
@@ -3208,6 +3169,8 @@
    assert(map->mode & GL_MAP_READ_BIT);
    assert(!(map->mode & GL_MAP_WRITE_BIT));
 
+   intel_miptree_access_raw(brw, mt, level, slice, false);
+
    DBG("%s: %d,%d %dx%d from mt %p (%s) %d,%d = %p/%d\n", __func__,
        map->x, map->y, map->w, map->h,
        mt, _mesa_get_format_name(mt->format),
@@ -3256,22 +3219,42 @@
    }
 
    intel_miptree_unmap_raw(mt);
-}
 
-static void
-intel_miptree_unmap_movntdqa(struct brw_context *brw,
-                             struct intel_mipmap_tree *mt,
-                             struct intel_miptree_map *map,
-                             unsigned int level,
-                             unsigned int slice)
-{
-   _mesa_align_free(map->buffer);
-   map->buffer = NULL;
-   map->ptr = NULL;
+   map->unmap = intel_miptree_unmap_movntdqa;
 }
 #endif
 
 static void
+intel_miptree_unmap_s8(struct brw_context *brw,
+		       struct intel_mipmap_tree *mt,
+		       struct intel_miptree_map *map,
+		       unsigned int level,
+		       unsigned int slice)
+{
+   if (map->mode & GL_MAP_WRITE_BIT) {
+      unsigned int image_x, image_y;
+      uint8_t *untiled_s8_map = map->ptr;
+      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt, GL_MAP_WRITE_BIT);
+
+      intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
+
+      for (uint32_t y = 0; y < map->h; y++) {
+	 for (uint32_t x = 0; x < map->w; x++) {
+	    ptrdiff_t offset = intel_offset_S8(mt->surf.row_pitch,
+	                                       image_x + x + map->x,
+	                                       image_y + y + map->y,
+					       brw->has_swizzling);
+	    tiled_s8_map[offset] = untiled_s8_map[y * map->w + x];
+	 }
+      }
+
+      intel_miptree_unmap_raw(mt);
+   }
+
+   free(map->buffer);
+}
+
+static void
 intel_miptree_map_s8(struct brw_context *brw,
 		     struct intel_mipmap_tree *mt,
 		     struct intel_miptree_map *map,
@@ -3282,6 +3265,9 @@
    if (!map->buffer)
       return;
 
+   intel_miptree_access_raw(brw, mt, level, slice,
+                            map->mode & GL_MAP_WRITE_BIT);
+
    /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
     * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
     * invalidate is set, since we'll be writing the whole rectangle from our
@@ -3314,57 +3300,8 @@
 	  map->x, map->y, map->w, map->h,
 	  mt, map->ptr, map->stride);
    }
-}
 
-static void
-intel_miptree_unmap_s8(struct brw_context *brw,
-		       struct intel_mipmap_tree *mt,
-		       struct intel_miptree_map *map,
-		       unsigned int level,
-		       unsigned int slice)
-{
-   if (map->mode & GL_MAP_WRITE_BIT) {
-      unsigned int image_x, image_y;
-      uint8_t *untiled_s8_map = map->ptr;
-      uint8_t *tiled_s8_map = intel_miptree_map_raw(brw, mt, GL_MAP_WRITE_BIT);
-
-      intel_miptree_get_image_offset(mt, level, slice, &image_x, &image_y);
-
-      for (uint32_t y = 0; y < map->h; y++) {
-	 for (uint32_t x = 0; x < map->w; x++) {
-	    ptrdiff_t offset = intel_offset_S8(mt->surf.row_pitch,
-	                                       image_x + x + map->x,
-	                                       image_y + y + map->y,
-					       brw->has_swizzling);
-	    tiled_s8_map[offset] = untiled_s8_map[y * map->w + x];
-	 }
-      }
-
-      intel_miptree_unmap_raw(mt);
-   }
-
-   free(map->buffer);
-}
-
-static void
-intel_miptree_map_etc(struct brw_context *brw,
-                      struct intel_mipmap_tree *mt,
-                      struct intel_miptree_map *map,
-                      unsigned int level,
-                      unsigned int slice)
-{
-   assert(mt->etc_format != MESA_FORMAT_NONE);
-   if (mt->etc_format == MESA_FORMAT_ETC1_RGB8) {
-      assert(mt->format == MESA_FORMAT_R8G8B8X8_UNORM);
-   }
-
-   assert(map->mode & GL_MAP_WRITE_BIT);
-   assert(map->mode & GL_MAP_INVALIDATE_RANGE_BIT);
-
-   map->stride = _mesa_format_row_stride(mt->etc_format, map->w);
-   map->buffer = malloc(_mesa_format_image_size(mt->etc_format,
-                                                map->w, map->h, 1));
-   map->ptr = map->buffer;
+   map->unmap = intel_miptree_unmap_s8;
 }
 
 static void
@@ -3392,14 +3329,38 @@
    else
       _mesa_unpack_etc2_format(dst, mt->surf.row_pitch,
                                map->ptr, map->stride,
-                               map->w, map->h, mt->etc_format);
+			       map->w, map->h, mt->etc_format, true);
 
    intel_miptree_unmap_raw(mt);
    free(map->buffer);
 }
 
+static void
+intel_miptree_map_etc(struct brw_context *brw,
+                      struct intel_mipmap_tree *mt,
+                      struct intel_miptree_map *map,
+                      unsigned int level,
+                      unsigned int slice)
+{
+   assert(mt->etc_format != MESA_FORMAT_NONE);
+   if (mt->etc_format == MESA_FORMAT_ETC1_RGB8) {
+      assert(mt->format == MESA_FORMAT_R8G8B8X8_UNORM);
+   }
+
+   assert(map->mode & GL_MAP_WRITE_BIT);
+   assert(map->mode & GL_MAP_INVALIDATE_RANGE_BIT);
+
+   intel_miptree_access_raw(brw, mt, level, slice, true);
+
+   map->stride = _mesa_format_row_stride(mt->etc_format, map->w);
+   map->buffer = malloc(_mesa_format_image_size(mt->etc_format,
+                                                map->w, map->h, 1));
+   map->ptr = map->buffer;
+   map->unmap = intel_miptree_unmap_etc;
+}
+
 /**
- * Mapping function for packed depth/stencil miptrees backed by real separate
+ * Mapping functions for packed depth/stencil miptrees backed by real separate
  * miptrees for depth and stencil.
  *
  * On gen7, and to support HiZ pre-gen7, we have to have the stencil buffer
@@ -3410,77 +3371,6 @@
  * copying the data between the actual backing store and the temporary.
  */
 static void
-intel_miptree_map_depthstencil(struct brw_context *brw,
-			       struct intel_mipmap_tree *mt,
-			       struct intel_miptree_map *map,
-			       unsigned int level, unsigned int slice)
-{
-   struct intel_mipmap_tree *z_mt = mt;
-   struct intel_mipmap_tree *s_mt = mt->stencil_mt;
-   bool map_z32f_x24s8 = mt->format == MESA_FORMAT_Z_FLOAT32;
-   int packed_bpp = map_z32f_x24s8 ? 8 : 4;
-
-   map->stride = map->w * packed_bpp;
-   map->buffer = map->ptr = malloc(map->stride * map->h);
-   if (!map->buffer)
-      return;
-
-   /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
-    * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
-    * invalidate is set, since we'll be writing the whole rectangle from our
-    * temporary buffer back out.
-    */
-   if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
-      uint32_t *packed_map = map->ptr;
-      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt, GL_MAP_READ_BIT);
-      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt, GL_MAP_READ_BIT);
-      unsigned int s_image_x, s_image_y;
-      unsigned int z_image_x, z_image_y;
-
-      intel_miptree_get_image_offset(s_mt, level, slice,
-				     &s_image_x, &s_image_y);
-      intel_miptree_get_image_offset(z_mt, level, slice,
-				     &z_image_x, &z_image_y);
-
-      for (uint32_t y = 0; y < map->h; y++) {
-	 for (uint32_t x = 0; x < map->w; x++) {
-	    int map_x = map->x + x, map_y = map->y + y;
-	    ptrdiff_t s_offset = intel_offset_S8(s_mt->surf.row_pitch,
-						 map_x + s_image_x,
-						 map_y + s_image_y,
-						 brw->has_swizzling);
-	    ptrdiff_t z_offset = ((map_y + z_image_y) *
-                                  (z_mt->surf.row_pitch / 4) +
-				  (map_x + z_image_x));
-	    uint8_t s = s_map[s_offset];
-	    uint32_t z = z_map[z_offset];
-
-	    if (map_z32f_x24s8) {
-	       packed_map[(y * map->w + x) * 2 + 0] = z;
-	       packed_map[(y * map->w + x) * 2 + 1] = s;
-	    } else {
-	       packed_map[y * map->w + x] = (s << 24) | (z & 0x00ffffff);
-	    }
-	 }
-      }
-
-      intel_miptree_unmap_raw(s_mt);
-      intel_miptree_unmap_raw(z_mt);
-
-      DBG("%s: %d,%d %dx%d from z mt %p %d,%d, s mt %p %d,%d = %p/%d\n",
-	  __func__,
-	  map->x, map->y, map->w, map->h,
-	  z_mt, map->x + z_image_x, map->y + z_image_y,
-	  s_mt, map->x + s_image_x, map->y + s_image_y,
-	  map->ptr, map->stride);
-   } else {
-      DBG("%s: %d,%d %dx%d from mt %p = %p/%d\n", __func__,
-	  map->x, map->y, map->w, map->h,
-	  mt, map->ptr, map->stride);
-   }
-}
-
-static void
 intel_miptree_unmap_depthstencil(struct brw_context *brw,
 				 struct intel_mipmap_tree *mt,
 				 struct intel_miptree_map *map,
@@ -3539,6 +3429,84 @@
    free(map->buffer);
 }
 
+static void
+intel_miptree_map_depthstencil(struct brw_context *brw,
+			       struct intel_mipmap_tree *mt,
+			       struct intel_miptree_map *map,
+			       unsigned int level, unsigned int slice)
+{
+   struct intel_mipmap_tree *z_mt = mt;
+   struct intel_mipmap_tree *s_mt = mt->stencil_mt;
+   bool map_z32f_x24s8 = mt->format == MESA_FORMAT_Z_FLOAT32;
+   int packed_bpp = map_z32f_x24s8 ? 8 : 4;
+
+   map->stride = map->w * packed_bpp;
+   map->buffer = map->ptr = malloc(map->stride * map->h);
+   if (!map->buffer)
+      return;
+
+   intel_miptree_access_raw(brw, z_mt, level, slice,
+                            map->mode & GL_MAP_WRITE_BIT);
+   intel_miptree_access_raw(brw, s_mt, level, slice,
+                            map->mode & GL_MAP_WRITE_BIT);
+
+   /* One of either READ_BIT or WRITE_BIT or both is set.  READ_BIT implies no
+    * INVALIDATE_RANGE_BIT.  WRITE_BIT needs the original values read in unless
+    * invalidate is set, since we'll be writing the whole rectangle from our
+    * temporary buffer back out.
+    */
+   if (!(map->mode & GL_MAP_INVALIDATE_RANGE_BIT)) {
+      uint32_t *packed_map = map->ptr;
+      uint8_t *s_map = intel_miptree_map_raw(brw, s_mt, GL_MAP_READ_BIT);
+      uint32_t *z_map = intel_miptree_map_raw(brw, z_mt, GL_MAP_READ_BIT);
+      unsigned int s_image_x, s_image_y;
+      unsigned int z_image_x, z_image_y;
+
+      intel_miptree_get_image_offset(s_mt, level, slice,
+				     &s_image_x, &s_image_y);
+      intel_miptree_get_image_offset(z_mt, level, slice,
+				     &z_image_x, &z_image_y);
+
+      for (uint32_t y = 0; y < map->h; y++) {
+	 for (uint32_t x = 0; x < map->w; x++) {
+	    int map_x = map->x + x, map_y = map->y + y;
+	    ptrdiff_t s_offset = intel_offset_S8(s_mt->surf.row_pitch,
+						 map_x + s_image_x,
+						 map_y + s_image_y,
+						 brw->has_swizzling);
+	    ptrdiff_t z_offset = ((map_y + z_image_y) *
+                                  (z_mt->surf.row_pitch / 4) +
+				  (map_x + z_image_x));
+	    uint8_t s = s_map[s_offset];
+	    uint32_t z = z_map[z_offset];
+
+	    if (map_z32f_x24s8) {
+	       packed_map[(y * map->w + x) * 2 + 0] = z;
+	       packed_map[(y * map->w + x) * 2 + 1] = s;
+	    } else {
+	       packed_map[y * map->w + x] = (s << 24) | (z & 0x00ffffff);
+	    }
+	 }
+      }
+
+      intel_miptree_unmap_raw(s_mt);
+      intel_miptree_unmap_raw(z_mt);
+
+      DBG("%s: %d,%d %dx%d from z mt %p %d,%d, s mt %p %d,%d = %p/%d\n",
+	  __func__,
+	  map->x, map->y, map->w, map->h,
+	  z_mt, map->x + z_image_x, map->y + z_image_y,
+	  s_mt, map->x + s_image_x, map->y + s_image_y,
+	  map->ptr, map->stride);
+   } else {
+      DBG("%s: %d,%d %dx%d from mt %p = %p/%d\n", __func__,
+	  map->x, map->y, map->w, map->h,
+	  mt, map->ptr, map->stride);
+   }
+
+   map->unmap = intel_miptree_unmap_depthstencil;
+}
+
 /**
  * Create and attach a map to the miptree at (level, slice). Return the
  * attached map.
@@ -3660,9 +3628,6 @@
       return;
    }
 
-   intel_miptree_access_raw(brw, mt, level, slice,
-                            map->mode & GL_MAP_WRITE_BIT);
-
    if (mt->format == MESA_FORMAT_S_UINT8) {
       intel_miptree_map_s8(brw, mt, map, level, slice);
    } else if (mt->etc_format != MESA_FORMAT_NONE &&
@@ -3705,22 +3670,8 @@
    DBG("%s: mt %p (%s) level %d slice %d\n", __func__,
        mt, _mesa_get_format_name(mt->format), level, slice);
 
-   if (mt->format == MESA_FORMAT_S_UINT8) {
-      intel_miptree_unmap_s8(brw, mt, map, level, slice);
-   } else if (mt->etc_format != MESA_FORMAT_NONE &&
-              !(map->mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_unmap_etc(brw, mt, map, level, slice);
-   } else if (mt->stencil_mt && !(map->mode & BRW_MAP_DIRECT_BIT)) {
-      intel_miptree_unmap_depthstencil(brw, mt, map, level, slice);
-   } else if (map->linear_mt) {
-      intel_miptree_unmap_blit(brw, mt, map, level, slice);
-#if defined(USE_SSE41)
-   } else if (map->buffer && cpu_has_sse4_1) {
-      intel_miptree_unmap_movntdqa(brw, mt, map, level, slice);
-#endif
-   } else {
-      intel_miptree_unmap_gtt(mt);
-   }
+   if (map->unmap)
+	   map->unmap(brw, mt, map, level, slice);
 
    intel_miptree_release_map(mt, level, slice);
 }
@@ -3784,28 +3735,56 @@
 bool
 intel_miptree_set_clear_color(struct brw_context *brw,
                               struct intel_mipmap_tree *mt,
-                              const union gl_color_union *color)
+                              union isl_color_value clear_color)
 {
-   const union isl_color_value clear_color =
-      brw_meta_convert_fast_clear_color(brw, mt, color);
-
    if (memcmp(&mt->fast_clear_color, &clear_color, sizeof(clear_color)) != 0) {
       mt->fast_clear_color = clear_color;
+      if (mt->aux_buf->clear_color_bo) {
+         /* We can't update the clear color while the hardware is still using
+          * the previous one for a resolve or for sampling. Make sure that
+          * there are no pending commands at this point.
+          */
+         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
+         for (int i = 0; i < 4; i++) {
+            brw_store_data_imm32(brw, mt->aux_buf->clear_color_bo,
+                                 mt->aux_buf->clear_color_offset + i * 4,
+                                 mt->fast_clear_color.u32[i]);
+         }
+         brw_emit_pipe_control_flush(brw, PIPE_CONTROL_STATE_CACHE_INVALIDATE);
+      }
       brw->ctx.NewDriverState |= BRW_NEW_AUX_STATE;
       return true;
    }
    return false;
 }
 
-bool
-intel_miptree_set_depth_clear_value(struct brw_context *brw,
-                                    struct intel_mipmap_tree *mt,
-                                    float clear_value)
+union isl_color_value
+intel_miptree_get_clear_color(const struct gen_device_info *devinfo,
+                              const struct intel_mipmap_tree *mt,
+                              enum isl_format view_format, bool sampling,
+                              struct brw_bo **clear_color_bo,
+                              uint32_t *clear_color_offset)
 {
-   if (mt->fast_clear_color.f32[0] != clear_value) {
-      mt->fast_clear_color.f32[0] = clear_value;
-      brw->ctx.NewDriverState |= BRW_NEW_AUX_STATE;
-      return true;
+   assert(mt->aux_buf);
+
+   if (devinfo->gen == 10 && isl_format_is_srgb(view_format) && sampling) {
+      /* The gen10 sampler doesn't gamma-correct the clear color. In this case,
+       * we switch to using the inline clear color and do the sRGB color
+       * conversion process defined in the OpenGL spec. The red, green, and
+       * blue channels take part in gamma correction, while the alpha channel
+       * is unchanged.
+       */
+      union isl_color_value srgb_decoded_value = mt->fast_clear_color;
+      for (unsigned i = 0; i < 3; i++) {
+         srgb_decoded_value.f32[i] =
+            util_format_srgb_to_linear_float(mt->fast_clear_color.f32[i]);
+      }
+      *clear_color_bo = 0;
+      *clear_color_offset = 0;
+      return srgb_decoded_value;
+   } else {
+      *clear_color_bo = mt->aux_buf->clear_color_bo;
+      *clear_color_offset = mt->aux_buf->clear_color_offset;
+      return mt->fast_clear_color;
    }
-   return false;
 }
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 6c71968..08c129a 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -88,6 +88,12 @@
    void *ptr;
    /** Stride of the mapping. */
    int stride;
+
+   void (*unmap)(struct brw_context *brw,
+                 struct intel_mipmap_tree *mt,
+                 struct intel_miptree_map *map,
+                 unsigned int level,
+                 unsigned int slice);
 };
 
 /**
@@ -156,31 +162,6 @@
     */
    uint32_t offset;
 
-   /*
-    * Size of the MCS surface.
-    *
-    * This is needed when doing any gtt mapped operations on the buffer (which
-    * will be Y-tiled). It is possible that it will not be the same as bo->size
-    * when the drm allocator rounds up the requested size.
-    */
-   size_t size;
-
-   /**
-    * Pitch in bytes.
-    *
-    * @see RENDER_SURFACE_STATE.AuxiliarySurfacePitch
-    * @see 3DSTATE_HIER_DEPTH_BUFFER.SurfacePitch
-    */
-   uint32_t pitch;
-
-   /**
-    * The distance in rows between array slices.
-    *
-    * @see RENDER_SURFACE_STATE.AuxiliarySurfaceQPitch
-    * @see 3DSTATE_HIER_DEPTH_BUFFER.SurfaceQPitch
-    */
-   uint32_t qpitch;
-
    /**
     * Buffer object containing the indirect clear color.
     *
@@ -373,7 +354,7 @@
 };
 
 bool
-intel_miptree_alloc_ccs(struct brw_context *brw,
+intel_miptree_alloc_aux(struct brw_context *brw,
                         struct intel_mipmap_tree *mt);
 
 enum intel_miptree_create_flags {
@@ -389,16 +370,13 @@
     */
    MIPTREE_CREATE_BUSY     = 1 << 0,
 
-   /** Create a linear (not tiled) miptree */
-   MIPTREE_CREATE_LINEAR   = 1 << 1,
-
    /** Create the miptree with auxiliary compression disabled
     *
     * This does not prevent the caller of intel_miptree_create from coming
     * along later and turning auxiliary compression back on but it does mean
     * that the miptree will be created with mt->aux_usage == NONE.
     */
-   MIPTREE_CREATE_NO_AUX   = 1 << 2,
+   MIPTREE_CREATE_NO_AUX   = 1 << 1,
 };
 
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
@@ -429,7 +407,7 @@
                                    __DRIimage *image,
                                    GLenum target,
                                    mesa_format format,
-                                   bool is_winsys_image);
+                                   bool allow_internal_aux);
 
 bool
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
@@ -530,15 +508,6 @@
  * functions on a miptree without HiZ. In that case, each function is a no-op.
  */
 
-/**
- * \brief Allocate the miptree's embedded HiZ miptree.
- * \see intel_mipmap_tree:hiz_mt
- * \return false if allocation failed
- */
-bool
-intel_miptree_alloc_hiz(struct brw_context *brw,
-			struct intel_mipmap_tree *mt);
-
 bool
 intel_miptree_level_has_hiz(const struct intel_mipmap_tree *mt, uint32_t level);
 
@@ -736,12 +705,15 @@
 bool
 intel_miptree_set_clear_color(struct brw_context *brw,
                               struct intel_mipmap_tree *mt,
-                              const union gl_color_union *color);
+                              union isl_color_value clear_color);
 
-bool
-intel_miptree_set_depth_clear_value(struct brw_context *brw,
-                                    struct intel_mipmap_tree *mt,
-                                    float clear_value);
+/* Get a clear color suitable for filling out an ISL surface state. */
+union isl_color_value
+intel_miptree_get_clear_color(const struct gen_device_info *devinfo,
+                              const struct intel_mipmap_tree *mt,
+                              enum isl_format view_format, bool sampling,
+                              struct brw_bo **clear_color_bo,
+                              uint32_t *clear_color_offset);
 
 
 static inline int
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
index 5bc341b..33ed3eb 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_bitmap.c
@@ -158,10 +158,10 @@
 static inline int
 y_flip(struct gl_framebuffer *fb, int y, int height)
 {
-   if (_mesa_is_user_fbo(fb))
-      return y;
-   else
+   if (fb->FlipY)
       return fb->Height - y - height;
+   else
+      return y;
 }
 
 /*
@@ -283,7 +283,7 @@
                                      w, h,
                                      (GLubyte *)stipple,
                                      8,
-                                     _mesa_is_winsys_fbo(fb));
+                                     fb->FlipY);
          if (count == 0)
 	    continue;
 
@@ -348,11 +348,13 @@
 	    const struct gl_pixelstore_attrib *unpack,
 	    const GLubyte * pixels)
 {
+   struct brw_context *brw = brw_context(ctx);
+
    if (!_mesa_check_conditional_render(ctx))
       return;
 
-   if (do_blit_bitmap(ctx, x, y, width, height,
-                          unpack, pixels))
+   if (brw->screen->devinfo.gen < 6 &&
+       do_blit_bitmap(ctx, x, y, width, height, unpack, pixels))
       return;
 
    _mesa_meta_Bitmap(ctx, x, y, width, height, unpack, pixels);
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_copy.c b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
index 8029ffb..b5c3f6a 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_copy.c
@@ -170,9 +170,9 @@
 
    if (!intel_miptree_blit(brw,
                            read_irb->mt, read_irb->mt_level, read_irb->mt_layer,
-                           srcx, srcy, _mesa_is_winsys_fbo(read_fb),
+                           srcx, srcy, read_fb->FlipY,
                            draw_irb->mt, draw_irb->mt_level, draw_irb->mt_layer,
-                           dstx, dsty, _mesa_is_winsys_fbo(fb),
+                           dstx, dsty, fb->FlipY,
                            width, height,
                            (ctx->Color.ColorLogicOpEnabled ?
                             ctx->Color._LogicOp : COLOR_LOGICOP_COPY))) {
@@ -196,12 +196,15 @@
                 GLsizei width, GLsizei height,
                 GLint destx, GLint desty, GLenum type)
 {
+   struct brw_context *brw = brw_context(ctx);
+
    DBG("%s\n", __func__);
 
    if (!_mesa_check_conditional_render(ctx))
       return;
 
-   if (do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
+   if (brw->screen->devinfo.gen < 6 &&
+       do_blit_copypixels(ctx, srcx, srcy, width, height, destx, desty, type))
       return;
 
    /* this will use swrast if needed */
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
index 82dca4a..9b7d436 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -127,7 +127,7 @@
                            pbo_mt, 0, 0,
                            0, 0, src_flip,
                            irb->mt, irb->mt_level, irb->mt_layer,
-                           x, y, _mesa_is_winsys_fbo(ctx->DrawBuffer),
+                           x, y, ctx->DrawBuffer->FlipY,
                            width, height, COLOR_LOGICOP_COPY)) {
       DBG("%s: blit failed\n", __func__);
       intel_miptree_release(&pbo_mt);
@@ -163,7 +163,8 @@
       return;
    }
 
-   if (_mesa_is_bufferobj(unpack->BufferObj)) {
+   if (brw->screen->devinfo.gen < 6 &&
+       _mesa_is_bufferobj(unpack->BufferObj)) {
       if (do_blit_drawpixels(ctx, x, y, width, height, format, type, unpack,
 			     pixels)) {
 	 return;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index cf95737..8a90b20 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -39,7 +39,6 @@
 #include "brw_blorp.h"
 #include "intel_screen.h"
 #include "intel_batchbuffer.h"
-#include "intel_blit.h"
 #include "intel_buffers.h"
 #include "intel_fbo.h"
 #include "intel_mipmap_tree.h"
@@ -182,7 +181,7 @@
     * tiled_to_linear a negative pitch so that it walks through the
     * client's data backwards as it walks through the renderbufer forwards.
     */
-   if (rb->Name == 0) {
+   if (ctx->ReadBuffer->FlipY) {
       yoffset = rb->Height - yoffset - height;
       pixels += (ptrdiff_t) (height - 1) * dst_pitch;
       dst_pitch = -dst_pitch;
@@ -250,7 +249,7 @@
    return brw_blorp_download_miptree(brw, irb->mt, rb->Format, swizzle,
                                      irb->mt_level, x, y, irb->mt_layer,
                                      w, h, 1, GL_TEXTURE_2D, format, type,
-                                     rb->Name == 0, pixels, packing);
+                                     ctx->ReadBuffer->FlipY, pixels, packing);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index bb5ed53..0dd75cb 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -190,6 +190,12 @@
    { __DRI_IMAGE_FOURCC_XRGB2101010, __DRI_IMAGE_COMPONENTS_RGB, 1,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_XRGB2101010, 4 } } },
 
+   { __DRI_IMAGE_FOURCC_ABGR2101010, __DRI_IMAGE_COMPONENTS_RGBA, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_ABGR2101010, 4 } } },
+
+   { __DRI_IMAGE_FOURCC_XBGR2101010, __DRI_IMAGE_COMPONENTS_RGB, 1,
+     { { 0, 0, 0, __DRI_IMAGE_FORMAT_XBGR2101010, 4 } } },
+
    { __DRI_IMAGE_FOURCC_ARGB8888, __DRI_IMAGE_COMPONENTS_RGBA, 1,
      { { 0, 0, 0, __DRI_IMAGE_FORMAT_ARGB8888, 4 } } },
 
@@ -589,7 +595,7 @@
    image->dri_format = driGLFormatToImageFormat(image->format);
    image->has_depthstencil = iobj->mt->stencil_mt? true : false;
    image->planar_format = iobj->planar_format;
-   if (image->dri_format == MESA_FORMAT_NONE) {
+   if (image->dri_format == __DRI_IMAGE_FORMAT_NONE) {
       *error = __DRI_IMAGE_ERROR_BAD_PARAMETER;
       free(image);
       return NULL;
@@ -749,6 +755,7 @@
     */
    image->bo = brw_bo_alloc_tiled(screen->bufmgr, "image",
                                   surf.size + aux_surf.size,
+                                  BRW_MEMZONE_OTHER,
                                   isl_tiling_to_i915_tiling(mod_info->tiling),
                                   surf.row_pitch, BO_ALLOC_ZEROED);
    if (image->bo == NULL) {
@@ -1265,22 +1272,47 @@
 }
 
 static bool
-intel_image_format_is_supported(const struct intel_image_format *fmt)
+intel_image_format_is_supported(const struct gen_device_info *devinfo,
+                                const struct intel_image_format *fmt)
 {
-   if (fmt->fourcc == __DRI_IMAGE_FOURCC_SARGB8888)
-      return false;
+   /* Currently, all formats with an intel_image_format are available on all
+    * platforms so there's really nothing to check there.
+    */
+
+#ifndef NDEBUG
+   if (fmt->nplanes == 1) {
+      mesa_format format = driImageFormatToGLFormat(fmt->planes[0].dri_format);
+      /* The images we will create are actually based on the RGBA non-sRGB
+       * version of the format.
+       */
+      format = _mesa_format_fallback_rgbx_to_rgba(format);
+      format = _mesa_get_srgb_format_linear(format);
+      enum isl_format isl_format = brw_isl_format_for_mesa_format(format);
+      assert(isl_format_supports_rendering(devinfo, isl_format));
+   }
+#endif
 
    return true;
 }
 
 static GLboolean
-intel_query_dma_buf_formats(__DRIscreen *screen, int max,
+intel_query_dma_buf_formats(__DRIscreen *_screen, int max,
                             int *formats, int *count)
 {
+   struct intel_screen *screen = _screen->driverPrivate;
    int num_formats = 0, i;
 
    for (i = 0; i < ARRAY_SIZE(intel_image_formats); i++) {
-      if (!intel_image_format_is_supported(&intel_image_formats[i]))
+      /* These two formats are valid DRI formats but do not exist in
+       * drm_fourcc.h in the Linux kernel.  We don't want to accidentally
+       * advertise them through the EGL layer.
+       */
+      if (intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SARGB8888 ||
+          intel_image_formats[i].fourcc == __DRI_IMAGE_FOURCC_SABGR8888)
+         continue;
+
+      if (!intel_image_format_is_supported(&screen->devinfo,
+                                           &intel_image_formats[i]))
          continue;
 
       num_formats++;
@@ -1310,7 +1342,7 @@
    if (f == NULL)
       return false;
 
-   if (!intel_image_format_is_supported(f))
+   if (!intel_image_format_is_supported(&screen->devinfo, f))
       return false;
 
    for (i = 0; i < ARRAY_SIZE(supported_modifiers); i++) {
@@ -1855,7 +1887,7 @@
    uint32_t swizzle_mode = 0;
    struct brw_bo *buffer =
       brw_bo_alloc_tiled(screen->bufmgr, "swizzle test", 32768,
-                         tiling, 512, 0);
+                         BRW_MEMZONE_OTHER, tiling, 512, 0);
    if (buffer == NULL)
       return false;
 
@@ -1930,11 +1962,11 @@
    bool success = false;
 
    /* Create a zero'ed temporary buffer for reading our results */
-   results = brw_bo_alloc(screen->bufmgr, "registers", 4096);
+   results = brw_bo_alloc(screen->bufmgr, "registers", 4096, BRW_MEMZONE_OTHER);
    if (results == NULL)
       goto err;
 
-   bo = brw_bo_alloc(screen->bufmgr, "batchbuffer", 4096);
+   bo = brw_bo_alloc(screen->bufmgr, "batchbuffer", 4096, BRW_MEMZONE_OTHER);
    if (bo == NULL)
       goto err_results;
 
@@ -2108,6 +2140,8 @@
 
       /* Required by Android, for HAL_PIXEL_FORMAT_RGBX_8888. */
       MESA_FORMAT_R8G8B8X8_UNORM,
+
+      MESA_FORMAT_R8G8B8A8_SRGB,
    };
 
    /* GLX_SWAP_COPY_OML is not supported due to page flipping. */
@@ -2127,7 +2161,7 @@
    if (intel_loader_get_cap(dri_screen, DRI_LOADER_CAP_RGBA_ORDERING))
       num_formats = ARRAY_SIZE(formats);
    else
-      num_formats = ARRAY_SIZE(formats) - 2; /* all - RGBA_ORDERING formats */
+      num_formats = ARRAY_SIZE(formats) - 3; /* all - RGBA_ORDERING formats */
 
    /* Shall we expose 10 bpc formats? */
    bool allow_rgb10_configs = driQueryOptionb(&screen->optionCache,
@@ -2740,6 +2774,7 @@
                                            width,
                                            height,
                                            cpp,
+                                           BRW_MEMZONE_OTHER,
                                            I915_TILING_X, &pitch,
                                            BO_ALLOC_BUSY);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_copy.c b/src/mesa/drivers/dri/i965/intel_tex_copy.c
index 5a0e09f..bc209c6 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_copy.c
@@ -37,61 +37,11 @@
 #include "intel_mipmap_tree.h"
 #include "intel_fbo.h"
 #include "intel_tex.h"
-#include "intel_blit.h"
 #include "brw_context.h"
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
 
-static bool
-intel_copy_texsubimage(struct brw_context *brw,
-                       struct intel_texture_image *intelImage,
-                       GLint dstx, GLint dsty, GLint slice,
-                       struct intel_renderbuffer *irb,
-                       GLint x, GLint y, GLsizei width, GLsizei height)
-{
-   const GLenum internalFormat = intelImage->base.Base.InternalFormat;
-
-   if (!intelImage->mt || !irb || !irb->mt) {
-      if (unlikely(INTEL_DEBUG & DEBUG_PERF))
-	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
-		 __func__, intelImage->mt, irb, internalFormat);
-      return false;
-   }
-
-   /* No pixel transfer operations (zoom, bias, mapping), just a blit */
-   if (brw->ctx._ImageTransferState)
-      return false;
-
-   intel_prepare_render(brw);
-
-   /* glCopyTexSubImage() can be called on a multisampled renderbuffer (if
-    * that renderbuffer is associated with the window system framebuffer),
-    * however the hardware blitter can't handle this case, so fall back to
-    * meta (which can, since it uses ReadPixels).
-    */
-   if (irb->Base.Base.NumSamples != 0)
-      return false;
-
-   /* glCopyTexSubImage() can't be called on a multisampled texture. */
-   assert(intelImage->base.Base.NumSamples == 0);
-
-   /* account for view parameters and face index */
-   int dst_level = intelImage->base.Base.Level +
-                   intelImage->base.Base.TexObject->MinLevel;
-   int dst_slice = slice + intelImage->base.Base.Face +
-                   intelImage->base.Base.TexObject->MinLayer;
-
-   /* blit from src buffer to texture */
-   return intel_miptree_blit(brw,
-                             irb->mt, irb->mt_level, irb->mt_layer,
-                             x, y, irb->Base.Base.Name == 0,
-                             intelImage->mt, dst_level, dst_slice,
-                             dstx, dsty, false,
-                             width, height, COLOR_LOGICOP_COPY);
-}
-
-
 static void
 intelCopyTexSubImage(struct gl_context *ctx, GLuint dims,
                      struct gl_texture_image *texImage,
@@ -107,14 +57,6 @@
                                  xoffset, yoffset, width, height))
       return;
 
-   /* Next, try the BLT engine. */
-   if (intel_copy_texsubimage(brw,
-                              intel_texture_image(texImage),
-                              xoffset, yoffset, slice,
-                              intel_renderbuffer(rb), x, y, width, height)) {
-      return;
-   }
-
    /* Finally, fall back to meta.  This will likely be slow. */
    perf_debug("%s - fallback to swrast\n", __func__);
    _mesa_meta_CopyTexSubImage(ctx, dims, texImage,
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 856216e..3d94838 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -21,7 +21,6 @@
 #include "intel_buffer_objects.h"
 #include "intel_batchbuffer.h"
 #include "intel_tex.h"
-#include "intel_blit.h"
 #include "intel_fbo.h"
 #include "intel_image.h"
 #include "intel_tiled_memcpy.h"
@@ -326,9 +325,6 @@
 
    bool tex_busy = mt && brw_bo_busy(mt->bo);
 
-   if (mt && mt->format == MESA_FORMAT_S_UINT8)
-      mt->r8stencil_needs_update = true;
-
    if (_mesa_is_bufferobj(packing->BufferObj) || tex_busy ||
        mt->aux_usage == ISL_AUX_USAGE_CCS_E) {
       ok = intel_texsubimage_blorp(brw, dims, texImage,
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index eaa60ba..72ce83c 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -29,7 +29,6 @@
 
 #include "brw_context.h"
 #include "intel_mipmap_tree.h"
-#include "intel_blit.h"
 #include "intel_tex.h"
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 6930682..7c6bde9 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -451,7 +451,7 @@
  */
 static inline void
 ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
-                 uint32_t y0, uint32_t y1,
+                 uint32_t y0, uint32_t y3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
@@ -470,6 +470,9 @@
    const uint32_t column_width = ytile_span;
    const uint32_t bytes_per_column = column_width * ytile_height;
 
+   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
+   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
+
    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
 
@@ -485,24 +488,81 @@
 
    dst += (ptrdiff_t)y0 * dst_pitch;
 
-   for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+   if (y0 != y1) {
+      for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
+         uint32_t xo = xo1;
+         uint32_t swizzle = swizzle1;
+
+         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
+
+         /* Step by spans/columns.  As it happens, the swizzle bit flips
+          * at each step so we don't need to calculate it explicitly.
+          */
+         for (x = x1; x < x2; x += ytile_span) {
+            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+            xo += bytes_per_column;
+            swizzle ^= swizzle_bit;
+         }
+
+         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+
+         dst += dst_pitch;
+      }
+   }
+
+   for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
       uint32_t xo = xo1;
       uint32_t swizzle = swizzle1;
 
-      mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
+      if (x0 != x1) {
+         mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
+         mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
+         mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
+         mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
+      }
 
       /* Step by spans/columns.  As it happens, the swizzle bit flips
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      if (x2 != x3) {
+         mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
+         mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
+         mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
+         mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
+      }
 
-      dst += dst_pitch;
+      dst += 4 * dst_pitch;
+   }
+
+   if (y2 != y3) {
+      for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
+         uint32_t xo = xo1;
+         uint32_t swizzle = swizzle1;
+
+         mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
+
+         /* Step by spans/columns.  As it happens, the swizzle bit flips
+          * at each step so we don't need to calculate it explicitly.
+          */
+         for (x = x1; x < x2; x += ytile_span) {
+            mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+            xo += bytes_per_column;
+            swizzle ^= swizzle_bit;
+         }
+
+         mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+
+         dst += dst_pitch;
+      }
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index f165a7b..d81ae43 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -86,7 +86,8 @@
    assert((upload->bo == NULL) == (upload->map == NULL));
    if (!upload->bo) {
       upload->bo = brw_bo_alloc(upload->bufmgr, "streamed data",
-                                MAX2(upload->default_size, size));
+                                MAX2(upload->default_size, size),
+                                BRW_MEMZONE_OTHER);
       upload->map = brw_bo_map(NULL, upload->bo,
                                MAP_READ | MAP_WRITE |
                                MAP_PERSISTENT | MAP_ASYNC);
diff --git a/src/mesa/drivers/dri/i965/meson.build b/src/mesa/drivers/dri/i965/meson.build
index 90ea150..fe6a5ad 100644
--- a/src/mesa/drivers/dri/i965/meson.build
+++ b/src/mesa/drivers/dri/i965/meson.build
@@ -56,6 +56,8 @@
   'brw_pipe_control.c',
   'brw_performance_query.h',
   'brw_performance_query.c',
+  'brw_performance_query_mdapi.c',
+  'brw_performance_query_metrics.h',
   'brw_program.c',
   'brw_program.h',
   'brw_program_binary.c',
@@ -85,14 +87,12 @@
   'gen4_blorp_exec.h',
   'gen6_clip_state.c',
   'gen6_constant_state.c',
-  'gen6_depth_state.c',
   'gen6_multisample_state.c',
   'gen6_queryobj.c',
   'gen6_sampler_state.c',
   'gen6_sol.c',
   'gen6_urb.c',
   'gen7_l3_state.c',
-  'gen7_misc_state.c',
   'gen7_sol_state.c',
   'gen7_urb.c',
   'gen8_depth_state.c',
diff --git a/src/mesa/drivers/dri/meson.build b/src/mesa/drivers/dri/meson.build
index 8e23fd4..943727b 100644
--- a/src/mesa/drivers/dri/meson.build
+++ b/src/mesa/drivers/dri/meson.build
@@ -44,7 +44,7 @@
 if dri_drivers != []
   libmesa_dri_drivers = shared_library(
     'mesa_dri_drivers',
-    dummy_cpp,  # see meson #2180
+    [],
     link_whole : dri_drivers,
     link_with : [
       libmegadriver_stub, libdricommon, libxmlconfig, libglapi, libmesa_util,
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
index c78d4ba..77e7be1 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
@@ -133,13 +133,17 @@
 			 GLuint x, GLuint y, GLuint w, GLuint h,
 			 GLbitfield mode,
 			 GLubyte **out_map,
-			 GLint *out_stride)
+			 GLint *out_stride,
+			 bool flip_y)
 {
 	struct nouveau_surface *s = &to_nouveau_renderbuffer(rb)->surface;
 	GLubyte *map;
 	int stride;
 	int flags = 0;
 
+	/* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */
+	assert((rb->Name == 0) == flip_y);
+
 	if (mode & GL_MAP_READ_BIT)
 		flags |= NOUVEAU_BO_RD;
 	if (mode & GL_MAP_WRITE_BIT)
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_state.c b/src/mesa/drivers/dri/nouveau/nouveau_state.c
index debbd38..a05c8be 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_state.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_state.c
@@ -115,7 +115,7 @@
 }
 
 static void
-nouveau_draw_buffer(struct gl_context *ctx, GLenum buffer)
+nouveau_draw_buffer(struct gl_context *ctx)
 {
 	nouveau_validate_framebuffer(ctx);
 	context_dirty(ctx, FRAMEBUFFER);
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
index 79b444c..3900c77 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_vbo_t.c
@@ -41,7 +41,7 @@
  * structures. */
 
 static int
-get_array_stride(struct gl_context *ctx, const struct gl_vertex_array *a)
+get_array_stride(struct gl_context *ctx, const struct tnl_vertex_array *a)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	const struct gl_vertex_buffer_binding *binding = a->BufferBinding;
@@ -57,7 +57,7 @@
 
 static void
 vbo_init_arrays(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
-		const struct gl_vertex_array *arrays)
+		const struct tnl_vertex_array *arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	GLboolean imm = (render->mode == IMM);
@@ -78,7 +78,7 @@
 	}
 
 	FOR_EACH_BOUND_ATTR(render, i, attr) {
-		const struct gl_vertex_array *array = &arrays[attr];
+		const struct tnl_vertex_array *array = &arrays[attr];
 		const struct gl_vertex_buffer_binding *binding =
 			array->BufferBinding;
 		const struct gl_array_attributes *attrib = array->VertexAttrib;
@@ -94,7 +94,7 @@
 
 static void
 vbo_deinit_arrays(struct gl_context *ctx, const struct _mesa_index_buffer *ib,
-		  const struct gl_vertex_array *arrays)
+		  const struct tnl_vertex_array *arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	int i, attr;
@@ -118,7 +118,7 @@
 /* Make some rendering decisions from the GL context. */
 
 static void
-vbo_choose_render_mode(struct gl_context *ctx, const struct gl_vertex_array *arrays)
+vbo_choose_render_mode(struct gl_context *ctx, const struct tnl_vertex_array *arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	int i;
@@ -136,12 +136,12 @@
 }
 
 static void
-vbo_emit_attr(struct gl_context *ctx, const struct gl_vertex_array *arrays,
+vbo_emit_attr(struct gl_context *ctx, const struct tnl_vertex_array *arrays,
 	      int attr)
 {
 	struct nouveau_pushbuf *push = context_push(ctx);
 	struct nouveau_render_state *render = to_render_state(ctx);
-	const struct gl_vertex_array *array = &arrays[attr];
+	const struct tnl_vertex_array *array = &arrays[attr];
 	const struct gl_vertex_buffer_binding *binding = array->BufferBinding;
 	const struct gl_array_attributes *attrib = array->VertexAttrib;
 	const GLubyte *p = _mesa_vertex_attrib_address(attrib, binding);
@@ -179,7 +179,7 @@
 #define MAT(a) VERT_ATTRIB_MAT(MAT_ATTRIB_##a)
 
 static void
-vbo_choose_attrs(struct gl_context *ctx, const struct gl_vertex_array *arrays)
+vbo_choose_attrs(struct gl_context *ctx, const struct tnl_vertex_array *arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	int i;
@@ -222,13 +222,13 @@
 }
 
 static int
-get_max_client_stride(struct gl_context *ctx, const struct gl_vertex_array *arrays)
+get_max_client_stride(struct gl_context *ctx, const struct tnl_vertex_array *arrays)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
 	int i, attr, s = 0;
 
 	FOR_EACH_BOUND_ATTR(render, i, attr) {
-		const struct gl_vertex_array *a = &arrays[attr];
+		const struct tnl_vertex_array *a = &arrays[attr];
 
 		if (!_mesa_is_bufferobj(a->BufferBinding->BufferObj))
 			s = MAX2(s, get_array_stride(ctx, a));
@@ -239,7 +239,7 @@
 
 static void
 TAG(vbo_render_prims)(struct gl_context *ctx,
-		      const struct gl_vertex_array *arrays,
+		      const struct tnl_vertex_array *arrays,
 		      const struct _mesa_prim *prims, GLuint nr_prims,
 		      const struct _mesa_index_buffer *ib,
 		      GLboolean index_bounds_valid,
@@ -249,7 +249,7 @@
 		      struct gl_buffer_object *indirect);
 
 static GLboolean
-vbo_maybe_split(struct gl_context *ctx, const struct gl_vertex_array *arrays,
+vbo_maybe_split(struct gl_context *ctx, const struct tnl_vertex_array *arrays,
 	    const struct _mesa_prim *prims, GLuint nr_prims,
 	    const struct _mesa_index_buffer *ib,
 	    GLuint min_index, GLuint max_index)
@@ -309,7 +309,7 @@
 }
 
 static void
-vbo_bind_vertices(struct gl_context *ctx, const struct gl_vertex_array *arrays,
+vbo_bind_vertices(struct gl_context *ctx, const struct tnl_vertex_array *arrays,
 		  int base, unsigned min_index, unsigned max_index, int *pdelta)
 {
 	struct nouveau_render_state *render = to_render_state(ctx);
@@ -323,7 +323,7 @@
 	*pdelta = -1;
 
 	FOR_EACH_BOUND_ATTR(render, i, attr) {
-		const struct gl_vertex_array *array = &arrays[attr];
+		const struct tnl_vertex_array *array = &arrays[attr];
 		const struct gl_vertex_buffer_binding *binding =
 			array->BufferBinding;
 		const struct gl_array_attributes *attrib = array->VertexAttrib;
@@ -381,7 +381,7 @@
 }
 
 static void
-vbo_draw_vbo(struct gl_context *ctx, const struct gl_vertex_array *arrays,
+vbo_draw_vbo(struct gl_context *ctx, const struct tnl_vertex_array *arrays,
 	     const struct _mesa_prim *prims, GLuint nr_prims,
 	     const struct _mesa_index_buffer *ib, GLuint min_index,
 	     GLuint max_index)
@@ -431,7 +431,7 @@
 }
 
 static void
-vbo_draw_imm(struct gl_context *ctx, const struct gl_vertex_array *arrays,
+vbo_draw_imm(struct gl_context *ctx, const struct tnl_vertex_array *arrays,
 	     const struct _mesa_prim *prims, GLuint nr_prims,
 	     const struct _mesa_index_buffer *ib, GLuint min_index,
 	     GLuint max_index)
@@ -477,7 +477,7 @@
 
 static void
 TAG(vbo_render_prims)(struct gl_context *ctx,
-		      const struct gl_vertex_array *arrays,
+		      const struct tnl_vertex_array *arrays,
 		      const struct _mesa_prim *prims, GLuint nr_prims,
 		      const struct _mesa_index_buffer *ib,
 		      GLboolean index_bounds_valid,
@@ -515,7 +515,7 @@
 
 static void
 TAG(vbo_check_render_prims)(struct gl_context *ctx,
-			    const struct gl_vertex_array *arrays,
+			    const struct tnl_vertex_array *arrays,
 			    const struct _mesa_prim *prims, GLuint nr_prims,
 			    const struct _mesa_index_buffer *ib,
 			    GLboolean index_bounds_valid,
@@ -550,9 +550,9 @@
 	      struct gl_buffer_object *indirect)
 {
 	/* Borrow and update the inputs list from the tnl context */
-	_tnl_bind_inputs(ctx);
+	const struct tnl_vertex_array* arrays = _tnl_bind_inputs(ctx);
 
-	TAG(vbo_check_render_prims)(ctx, ctx->Array._DrawArrays,
+	TAG(vbo_check_render_prims)(ctx, arrays,
 				    prims, nr_prims, ib,
 				    index_bounds_valid, min_index, max_index,
 				    tfb_vertcount, stream, indirect);
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 4524f06..9417dfc 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -353,6 +353,7 @@
    ctx->Extensions.EXT_texture_filter_anisotropic = true;
    ctx->Extensions.EXT_texture_mirror_clamp = true;
    ctx->Extensions.MESA_pack_invert = true;
+   ctx->Extensions.NV_fog_distance = true;
    ctx->Extensions.NV_texture_rectangle = true;
    ctx->Extensions.OES_EGL_image = true;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 6e4b4c4..a3bf00b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -175,27 +175,6 @@
 	return age;
 }
 
-/**
- * Check if we're about to draw into the front color buffer.
- * If so, set the intel->front_buffer_dirty field to true.
- */
-void
-radeon_check_front_buffer_rendering(struct gl_context *ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	const struct gl_framebuffer *fb = ctx->DrawBuffer;
-
-	if (fb->Name == 0) {
-		/* drawing to window system buffer */
-		if (fb->_NumColorDrawBuffers > 0) {
-			if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-				radeon->front_buffer_dirty = GL_TRUE;
-			}
-		}
-	}
-}
-
-
 void radeon_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 {
 	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
@@ -338,28 +317,22 @@
 /**
  * Called via glDrawBuffer.
  */
-void radeonDrawBuffer( struct gl_context *ctx, GLenum mode )
+void radeonDrawBuffer(struct gl_context *ctx)
 {
 	if (RADEON_DEBUG & RADEON_DRI)
-		fprintf(stderr, "%s %s\n", __func__,
-			_mesa_enum_to_string( mode ));
+		fprintf(stderr, "%s\n", __func__);
 
-	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+	if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer)) {
 		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
 
-		const GLboolean was_front_buffer_rendering =
-			radeon->is_front_buffer_rendering;
-
-		radeon->is_front_buffer_rendering = (mode == GL_FRONT_LEFT) ||
-                                            (mode == GL_FRONT);
-
-      /* If we weren't front-buffer rendering before but we are now, make sure
-       * that the front-buffer has actually been allocated.
-       */
-		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
-			radeon_update_renderbuffers(radeon->driContext,
-				radeon->driContext->driDrawablePriv, GL_FALSE);
-      }
+		/* If we might be front-buffer rendering on this buffer for
+		 * the first time, invalidate our DRI drawable so we'll ask
+		 * for new buffers (including the fake front) before we start
+		 * rendering again.
+		 */
+		radeon_update_renderbuffers(radeon->driContext,
+					    radeon->driContext->driDrawablePriv,
+					    GL_FALSE);
 	}
 
 	radeon_draw_buffer(ctx, ctx->DrawBuffer);
@@ -367,16 +340,10 @@
 
 void radeonReadBuffer( struct gl_context *ctx, GLenum mode )
 {
-	if (ctx->DrawBuffer && _mesa_is_winsys_fbo(ctx->DrawBuffer)) {
+	if (_mesa_is_front_buffer_reading(ctx->ReadBuffer)) {
 		struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
-		const GLboolean was_front_buffer_reading = rmesa->is_front_buffer_reading;
-		rmesa->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
-					|| (mode == GL_FRONT);
-
-		if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) {
-			radeon_update_renderbuffers(rmesa->driContext,
-						    rmesa->driContext->driReadablePriv, GL_FALSE);
-	 	}
+		radeon_update_renderbuffers(rmesa->driContext,
+					    rmesa->driContext->driReadablePriv, GL_FALSE);
 	}
 	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
 	if (ctx->ReadBuffer == ctx->DrawBuffer) {
@@ -402,7 +369,7 @@
 	void (*old_viewport)(struct gl_context *ctx);
 
 	if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
-		if (radeon->is_front_buffer_rendering) {
+		if (_mesa_is_front_buffer_drawing(ctx->DrawBuffer)) {
 			ctx->Driver.Flush(ctx);
 		}
 		radeon_update_renderbuffers(driContext, driContext->driDrawablePriv, GL_FALSE);
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
index fedaf50..a39b936 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -21,7 +21,7 @@
 
 void radeon_window_moved(radeonContextPtr radeon);
 void radeon_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb);
-void radeonDrawBuffer( struct gl_context *ctx, GLenum mode );
+void radeonDrawBuffer(struct gl_context *ctx);
 void radeonReadBuffer( struct gl_context *ctx, GLenum mode );
 void radeon_viewport(struct gl_context *ctx);
 void radeon_fbo_init(struct radeon_context *radeon);
@@ -37,7 +37,6 @@
 				GLenum format, GLenum type,
 				const struct gl_pixelstore_attrib *pack, GLvoid * pixels);
 
-void radeon_check_front_buffer_rendering(struct gl_context *ctx);
 static inline struct radeon_renderbuffer *radeon_renderbuffer(struct gl_renderbuffer *rb)
 {
 	struct radeon_renderbuffer *rrb = (struct radeon_renderbuffer *)rb;
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 7e24f6d..47719ba 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -352,7 +352,7 @@
      * that will happen next will probably dirty the front buffer.  So
      * mark it as dirty here.
      */
-    if (radeon->is_front_buffer_rendering)
+    if (_mesa_is_front_buffer_drawing(radeon->glCtx.DrawBuffer))
 	radeon->front_buffer_dirty = GL_TRUE;
 }
 
@@ -389,10 +389,10 @@
 		struct radeon_renderbuffer *stencil_rb;
 
 		i = 0;
-		if ((front_only || radeon->is_front_buffer_rendering ||
-		     radeon->is_front_buffer_reading ||
-		     !draw->color_rb[1])
-		    && draw->color_rb[0]) {
+                if ((front_only || _mesa_is_front_buffer_drawing(&draw->base) ||
+                     _mesa_is_front_buffer_reading(&draw->base) ||
+                     !draw->color_rb[1])
+                    && draw->color_rb[0]) {
 			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
 			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[0]);
 		}
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index 328b545..bd7343f 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -426,23 +426,6 @@
     */
    GLboolean front_buffer_dirty;
 
-   /**
-    * Track whether front-buffer rendering is currently enabled
-    *
-    * A separate flag is used to track this in order to support MRT more
-    * easily.
-    */
-   GLboolean is_front_buffer_rendering;
-
-   /**
-    * Track whether front-buffer is the current read target.
-    *
-    * This is closely associated with is_front_buffer_rendering, but may
-    * be set separately.  The DRI2 fake front buffer must be referenced
-    * either way.
-    */
-   GLboolean is_front_buffer_reading;
-
    struct {
 	struct radeon_query_object *current;
 	struct radeon_state_atom queryobj;
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 37c9c3f..439b95b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -226,7 +226,8 @@
 		       GLuint x, GLuint y, GLuint w, GLuint h,
 		       GLbitfield mode,
 		       GLubyte **out_map,
-		       GLint *out_stride)
+		       GLint *out_stride,
+		       bool flip_y)
 {
    struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
    struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
@@ -236,6 +237,9 @@
    int ret;
    int src_x, src_y;
 
+   /* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */
+   assert((rb->Name == 0) == flip_y);
+
    if (!rrb || !rrb->bo) {
 	   *out_map = NULL;
 	   *out_stride = 0;
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index e70e334..efb2e60 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -126,7 +126,6 @@
 #define radeon_prepare_render               r200_radeon_prepare_render
 #define radeonUnbindContext                 r200_radeonUnbindContext
 #define radeon_update_renderbuffers         r200_radeon_update_renderbuffers
-#define radeon_check_front_buffer_rendering r200_radeon_check_front_buffer_rendering
 #define radeonCountStateEmitSize            r200_radeonCountStateEmitSize
 #define radeon_draw_buffer                  r200_radeon_draw_buffer
 #define radeonDrawBuffer                    r200_radeonDrawBuffer
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 42f62a0..fa5b2d9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -44,6 +44,7 @@
 #include "main/texformat.h"
 #include "main/renderbuffer.h"
 #include "main/samplerobj.h"
+#include "main/framebuffer.h"
 #include "swrast/swrast.h"
 #include "swrast/s_renderbuffer.h"
 
@@ -52,7 +53,9 @@
 
 
 static void
-radeon_renderbuffer_map(struct gl_context *ctx, struct gl_renderbuffer *rb)
+radeon_renderbuffer_map(struct gl_context *ctx,
+			struct gl_renderbuffer *rb,
+			bool flip_y)
 {
 	struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
 	GLubyte *map;
@@ -63,7 +66,7 @@
 
 	ctx->Driver.MapRenderbuffer(ctx, rb, 0, 0, rb->Width, rb->Height,
 				    GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-				    &map, &stride);
+				    &map, &stride, flip_y);
 
 	rrb->base.Map = map;
 	rrb->base.RowStride = stride;
@@ -95,9 +98,11 @@
 
 	/* check for render to textures */
 	for (i = 0; i < BUFFER_COUNT; i++)
-		radeon_renderbuffer_map(ctx, fb->Attachment[i].Renderbuffer);
+		radeon_renderbuffer_map(ctx, fb->Attachment[i].Renderbuffer,
+			fb->FlipY);
 
-	radeon_check_front_buffer_rendering(ctx);
+        if (_mesa_is_front_buffer_drawing(fb))
+		RADEON_CONTEXT(ctx)->front_buffer_dirty = true;
 }
 
 static void
@@ -113,7 +118,8 @@
 	for (i = 0; i < BUFFER_COUNT; i++)
 		radeon_renderbuffer_unmap(ctx, fb->Attachment[i].Renderbuffer);
 
-	radeon_check_front_buffer_rendering(ctx);
+        if (_mesa_is_front_buffer_drawing(fb))
+		RADEON_CONTEXT(ctx)->front_buffer_dirty = true;
 }
 
 static void radeonSpanRenderStart(struct gl_context * ctx)
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index ae5874f..4be993a 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -470,13 +470,17 @@
 			GLuint x, GLuint y, GLuint w, GLuint h,
 			GLbitfield mode,
 			GLubyte **out_map,
-			GLint *out_stride)
+			GLint *out_stride,
+			bool flip_y)
 {
    struct dri_swrast_renderbuffer *xrb = dri_swrast_renderbuffer(rb);
    GLubyte *map = xrb->Base.Buffer;
    int cpp = _mesa_get_format_bytes(rb->Format);
    int stride = rb->Width * cpp;
 
+   /* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */
+   assert((rb->Name == 0) == flip_y);
+
    if (rb->AllocStorage == swrast_alloc_front_storage) {
       __DRIdrawable *dPriv = xrb->dPriv;
       __DRIscreen *sPriv = dPriv->driScreenPriv;
@@ -675,6 +679,9 @@
 {
     GLsizei width, height;
 
+    if (!fb)
+        return;
+
     get_window_size(fb, &width, &height);
     if (fb->Width != width || fb->Height != height) {
 	_mesa_resize_framebuffer(ctx, fb, width, height);
@@ -857,30 +864,26 @@
 		 __DRIdrawable * driReadPriv)
 {
     struct gl_context *mesaCtx;
-    struct gl_framebuffer *mesaDraw;
-    struct gl_framebuffer *mesaRead;
+    struct gl_framebuffer *mesaDraw = NULL;
+    struct gl_framebuffer *mesaRead = NULL;
     TRACE;
 
     if (cPriv) {
-	struct dri_context *ctx = dri_context(cPriv);
-	struct dri_drawable *draw;
-	struct dri_drawable *read;
+        mesaCtx = &dri_context(cPriv)->Base;
 
-	if (!driDrawPriv || !driReadPriv)
-	    return GL_FALSE;
+	if (driDrawPriv && driReadPriv) {
+           struct dri_drawable *draw = dri_drawable(driDrawPriv);
+           struct dri_drawable *read = dri_drawable(driReadPriv);
+           mesaDraw = &draw->Base;
+           mesaRead = &read->Base;
+        }
 
-	draw = dri_drawable(driDrawPriv);
-	read = dri_drawable(driReadPriv);
-	mesaCtx = &ctx->Base;
-	mesaDraw = &draw->Base;
-	mesaRead = &read->Base;
-
-	/* check for same context and buffer */
-	if (mesaCtx == _mesa_get_current_context()
-	    && mesaCtx->DrawBuffer == mesaDraw
-	    && mesaCtx->ReadBuffer == mesaRead) {
-	    return GL_TRUE;
-	}
+        /* check for same context and buffer */
+        if (mesaCtx == _mesa_get_current_context()
+            && mesaCtx->DrawBuffer == mesaDraw
+            && mesaCtx->ReadBuffer == mesaRead) {
+            return GL_TRUE;
+        }
 
 	_glapi_check_multithread();
 
diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c
index 3423eb6..be683d4 100644
--- a/src/mesa/drivers/osmesa/osmesa.c
+++ b/src/mesa/drivers/osmesa/osmesa.c
@@ -573,7 +573,8 @@
                        struct gl_renderbuffer *rb,
                        GLuint x, GLuint y, GLuint w, GLuint h,
                        GLbitfield mode,
-                       GLubyte **mapOut, GLint *rowStrideOut)
+                       GLubyte **mapOut, GLint *rowStrideOut,
+                       bool flip_y)
 {
    const OSMesaContext osmesa = OSMESA_CONTEXT(ctx);
 
@@ -601,7 +602,7 @@
    }
    else {
       _swrast_map_soft_renderbuffer(ctx, rb, x, y, w, h, mode,
-                                    mapOut, rowStrideOut);
+                                    mapOut, rowStrideOut, flip_y);
    }
 }
 
diff --git a/src/mesa/drivers/x11/xm_buffer.c b/src/mesa/drivers/x11/xm_buffer.c
index 97c7814..d945d8a 100644
--- a/src/mesa/drivers/x11/xm_buffer.c
+++ b/src/mesa/drivers/x11/xm_buffer.c
@@ -423,7 +423,8 @@
                       struct gl_renderbuffer *rb,
                       GLuint x, GLuint y, GLuint w, GLuint h,
                       GLbitfield mode,
-                      GLubyte **mapOut, GLint *rowStrideOut)
+                      GLubyte **mapOut, GLint *rowStrideOut,
+                      bool flip_y)
 {
    struct xmesa_renderbuffer *xrb = xmesa_renderbuffer(rb);
 
@@ -506,7 +507,7 @@
 
    /* otherwise, this is an ordinary malloc-based renderbuffer */
    _swrast_map_soft_renderbuffer(ctx, rb, x, y, w, h, mode,
-                                 mapOut, rowStrideOut);
+                                 mapOut, rowStrideOut, false);
 }
 
 
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index ff3ddc4..97f15ab 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -358,7 +358,8 @@
                       struct gl_renderbuffer *rb,
                       GLuint x, GLuint y, GLuint w, GLuint h,
                       GLbitfield mode,
-                      GLubyte **mapOut, GLint *rowStrideOut);
+                      GLubyte **mapOut, GLint *rowStrideOut,
+                      bool flip_y);
 
 extern void
 xmesa_UnmapRenderbuffer(struct gl_context *ctx, struct gl_renderbuffer *rb);
diff --git a/src/mesa/main/accum.c b/src/mesa/main/accum.c
index f5ac8a1..a0a206b 100644
--- a/src/mesa/main/accum.c
+++ b/src/mesa/main/accum.c
@@ -82,7 +82,8 @@
    height = ctx->DrawBuffer->_Ymax - ctx->DrawBuffer->_Ymin;
 
    ctx->Driver.MapRenderbuffer(ctx, accRb, x, y, width, height,
-                               GL_MAP_WRITE_BIT, &accMap, &accRowStride);
+                               GL_MAP_WRITE_BIT, &accMap, &accRowStride,
+                               ctx->DrawBuffer->FlipY);
 
    if (!accMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
@@ -137,7 +138,8 @@
 
    ctx->Driver.MapRenderbuffer(ctx, accRb, xpos, ypos, width, height,
                                GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                               &accMap, &accRowStride);
+                               &accMap, &accRowStride,
+                               ctx->DrawBuffer->FlipY);
 
    if (!accMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
@@ -206,7 +208,8 @@
 
    /* Map accum buffer */
    ctx->Driver.MapRenderbuffer(ctx, accRb, xpos, ypos, width, height,
-                               mappingFlags, &accMap, &accRowStride);
+                               mappingFlags, &accMap, &accRowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!accMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
       return;
@@ -215,7 +218,8 @@
    /* Map color buffer */
    ctx->Driver.MapRenderbuffer(ctx, colorRb, xpos, ypos, width, height,
                                GL_MAP_READ_BIT,
-                               &colorMap, &colorRowStride);
+                               &colorMap, &colorRowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!colorMap) {
       ctx->Driver.UnmapRenderbuffer(ctx, accRb);
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
@@ -288,7 +292,7 @@
    /* Map accum buffer */
    ctx->Driver.MapRenderbuffer(ctx, accRb, xpos, ypos, width, height,
                                GL_MAP_READ_BIT,
-                               &accMap, &accRowStride);
+                               &accMap, &accRowStride, fb->FlipY);
    if (!accMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
       return;
@@ -308,7 +312,8 @@
 
       /* Map color buffer */
       ctx->Driver.MapRenderbuffer(ctx, colorRb, xpos, ypos, width, height,
-                                  mappingFlags, &colorMap, &colorRowStride);
+                                  mappingFlags, &colorMap, &colorRowStride,
+                                  fb->FlipY);
       if (!colorMap) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glAccum");
          continue;
diff --git a/src/mesa/main/api_arrayelt.c b/src/mesa/main/api_arrayelt.c
index 2dfa74f..afa3012 100644
--- a/src/mesa/main/api_arrayelt.c
+++ b/src/mesa/main/api_arrayelt.c
@@ -1823,7 +1823,7 @@
     * Luckily, neither the drivers nor tnl muck with the state that
     * concerns us here:
     */
-   assert(ctx->NewState & (_NEW_ARRAY | _NEW_PROGRAM));
+   assert(ctx->NewState & _NEW_ARRAY);
 
    assert(!actx->mapped_vbos);
    actx->dirty_state = true;
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 4eab811..c03c33e 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -1790,22 +1790,5 @@
       SET_VertexAttribI4sv(dest, _mesa_VertexAttribI4sv);
       SET_VertexAttribI4ubv(dest, _mesa_VertexAttribI4ubv);
       SET_VertexAttribI4usv(dest, _mesa_VertexAttribI4usv);
-
-      /* GL_ARB_bindless_texture */
-      SET_VertexAttribL1ui64ARB(dest, _mesa_VertexAttribL1ui64ARB);
-      SET_VertexAttribL1ui64vARB(dest, _mesa_VertexAttribL1ui64vARB);
-   }
-
-   if (ctx->API == API_OPENGL_CORE) {
-      /* GL 4.1 / GL_ARB_vertex_attrib_64bit */
-      SET_VertexAttribL1d(dest, _mesa_VertexAttribL1d);
-      SET_VertexAttribL2d(dest, _mesa_VertexAttribL2d);
-      SET_VertexAttribL3d(dest, _mesa_VertexAttribL3d);
-      SET_VertexAttribL4d(dest, _mesa_VertexAttribL4d);
-
-      SET_VertexAttribL1dv(dest, _mesa_VertexAttribL1dv);
-      SET_VertexAttribL2dv(dest, _mesa_VertexAttribL2dv);
-      SET_VertexAttribL3dv(dest, _mesa_VertexAttribL3dv);
-      SET_VertexAttribL4dv(dest, _mesa_VertexAttribL4dv);
    }
 }
diff --git a/src/mesa/main/arrayobj.c b/src/mesa/main/arrayobj.c
index 899d4de..5ee68cf 100644
--- a/src/mesa/main/arrayobj.c
+++ b/src/mesa/main/arrayobj.c
@@ -451,8 +451,116 @@
 
 
 /**
- * Updates the derived gl_vertex_arrays when a gl_array_attributes
- * or a gl_vertex_buffer_binding has changed.
+ * Compute the offset range for the provided binding.
+ *
+ * This is a helper function for the below.
+ */
+static void
+compute_vbo_offset_range(const struct gl_vertex_array_object *vao,
+                         const struct gl_vertex_buffer_binding *binding,
+                         GLsizeiptr* min, GLsizeiptr* max)
+{
+   /* The function is meant to work on VBO bindings */
+   assert(_mesa_is_bufferobj(binding->BufferObj));
+
+   /* Start with an inverted range of relative offsets. */
+   GLuint min_offset = ~(GLuint)0;
+   GLuint max_offset = 0;
+
+   /* We work on the unmapped original VAO array entries. */
+   GLbitfield mask = vao->_Enabled & binding->_BoundArrays;
+   /* The binding should be active somehow, not to return inverted ranges */
+   assert(mask);
+   while (mask) {
+      const int i = u_bit_scan(&mask);
+      const GLuint off = vao->VertexAttrib[i].RelativeOffset;
+      min_offset = MIN2(off, min_offset);
+      max_offset = MAX2(off, max_offset);
+   }
+
+   *min = binding->Offset + (GLsizeiptr)min_offset;
+   *max = binding->Offset + (GLsizeiptr)max_offset;
+}
+
+
+/**
+ * Update the unique binding and pos/generic0 map tracking in the vao.
+ *
+ * The idea is to build up information in the vao so that a consuming
+ * backend can execute the following to set up buffer and vertex element
+ * information:
+ *
+ * const GLbitfield inputs_read = VERT_BIT_ALL; // backend vp inputs
+ *
+ * // Attribute data is in a VBO.
+ * GLbitfield vbomask = inputs_read & _mesa_draw_vbo_array_bits(ctx);
+ * while (vbomask) {
+ *    // The attribute index to start pulling a binding
+ *    const gl_vert_attrib i = ffs(vbomask) - 1;
+ *    const struct gl_vertex_buffer_binding *const binding
+ *       = _mesa_draw_buffer_binding(vao, i);
+ *
+ *    <insert code to handle the vertex buffer object at binding>
+ *
+ *    const GLbitfield boundmask = _mesa_draw_bound_attrib_bits(binding);
+ *    GLbitfield attrmask = vbomask & boundmask;
+ *    assert(attrmask);
+ *    // Walk attributes belonging to the binding
+ *    while (attrmask) {
+ *       const gl_vert_attrib attr = u_bit_scan(&attrmask);
+ *       const struct gl_array_attributes *const attrib
+ *          = _mesa_draw_array_attrib(vao, attr);
+ *
+ *       <insert code to handle the vertex element referring to the binding>
+ *    }
+ *    vbomask &= ~boundmask;
+ * }
+ *
+ * // Process user space buffers
+ * GLbitfield usermask = inputs_read & _mesa_draw_user_array_bits(ctx);
+ * while (usermask) {
+ *    // The attribute index to start pulling a binding
+ *    const gl_vert_attrib i = ffs(usermask) - 1;
+ *    const struct gl_vertex_buffer_binding *const binding
+ *       = _mesa_draw_buffer_binding(vao, i);
+ *
+ *    <insert code to handle a set of interleaved user space arrays at binding>
+ *
+ *    const GLbitfield boundmask = _mesa_draw_bound_attrib_bits(binding);
+ *    GLbitfield attrmask = usermask & boundmask;
+ *    assert(attrmask);
+ *    // Walk interleaved attributes with a common stride and instance divisor
+ *    while (attrmask) {
+ *       const gl_vert_attrib attr = u_bit_scan(&attrmask);
+ *       const struct gl_array_attributes *const attrib
+ *          = _mesa_draw_array_attrib(vao, attr);
+ *
+ *       <insert code to handle non vbo vertex arrays>
+ *    }
+ *    usermask &= ~boundmask;
+ * }
+ *
+ * // Process values that should have better been uniforms in the application
+ * GLbitfield curmask = inputs_read & _mesa_draw_current_bits(ctx);
+ * while (curmask) {
+ *    const gl_vert_attrib attr = u_bit_scan(&curmask);
+ *    const struct gl_array_attributes *const attrib
+ *       = _mesa_draw_current_attrib(ctx, attr);
+ *
+ *    <insert code to handle current values>
+ * }
+ *
+ *
+ * Note that the scan below must not incorporate any context state.
+ * The rationale is that once a VAO is finalized it should not
+ * be touched anymore. That means, do not incorporate the
+ * gl_context::Array._DrawVAOEnabledAttribs bitmask into this scan.
+ * A backend driver may further reduce the handled vertex processing
+ * inputs based on their vertex shader inputs. But scanning for
+ * collapsable binding points to reduce relocs is done based on the
+ * enabled arrays.
+ * Also VAOs may be shared between contexts due to their use in dlists
+ * thus no context state should bleed into the VAO.
  */
 void
 _mesa_update_vao_derived_arrays(struct gl_context *ctx,
@@ -461,11 +569,281 @@
    /* Make sure we do not run into problems with shared objects */
    assert(!vao->SharedAndImmutable || vao->NewArrays == 0);
 
-   /*
-    * Stay tuned, the next series scans for duplicate bindings in this
-    * function. So that drivers can easily know the minimum unique set
-    * of bindings.
+   /* Limit used for common binding scanning below. */
+   const GLsizeiptr MaxRelativeOffset =
+      ctx->Const.MaxVertexAttribRelativeOffset;
+
+   /* The gl_vertex_array_object::_AttributeMapMode denotes the way
+    * VERT_ATTRIB_{POS,GENERIC0} mapping is done.
+    *
+    * This mapping is used to map between the OpenGL api visible
+    * VERT_ATTRIB_* arrays to mesa driver arrayinputs or shader inputs.
+    * The mapping only depends on the enabled bits of the
+    * VERT_ATTRIB_{POS,GENERIC0} arrays and is tracked in the VAO.
+    *
+    * This map needs to be applied when finally translating to the bitmasks
+    * as consumed by the driver backends. The duplicate scanning here
+    * can as well be done in the OpenGL API numbering without this map.
     */
+   const gl_attribute_map_mode mode = vao->_AttributeMapMode;
+   /* Enabled array bits. */
+   const GLbitfield enabled = vao->_Enabled;
+   /* VBO array bits. */
+   const GLbitfield vbos = vao->VertexAttribBufferMask;
+
+   /* Compute and store effectively enabled and mapped vbo arrays */
+   vao->_EffEnabledVBO = _mesa_vao_enable_to_vp_inputs(mode, enabled & vbos);
+   /* Walk those enabled arrays that have a real vbo attached */
+   GLbitfield mask = enabled;
+   while (mask) {
+      /* Do not use u_bit_scan as we can walk multiple attrib arrays at once */
+      const int i = ffs(mask) - 1;
+      /* The binding from the first to be processed attribute. */
+      const GLuint bindex = vao->VertexAttrib[i].BufferBindingIndex;
+      struct gl_vertex_buffer_binding *binding = &vao->BufferBinding[bindex];
+
+      /* The scan works differently for user space arrays than for vbos */
+      if (_mesa_is_bufferobj(binding->BufferObj)) {
+         /* The bound arrays. */
+         const GLbitfield bound = enabled & binding->_BoundArrays;
+
+         /* Start this current effective binding with the actual bound arrays */
+         GLbitfield eff_bound_arrays = bound;
+
+         /*
+          * If there is nothing left to scan just update the effective binding
+          * information. If the VAO is already only using a single binding point
+          * we end up here. So the overhead of this scan for an application
+          * carefully preparing the VAO for draw is low.
+          */
+
+         GLbitfield scanmask = mask & vbos & ~bound;
+         /* Is there something left to scan? */
+         if (scanmask == 0) {
+            /* Just update the back reference from the attrib to the binding and
+             * the effective offset.
+             */
+            GLbitfield attrmask = eff_bound_arrays;
+            while (attrmask) {
+               const int j = u_bit_scan(&attrmask);
+               struct gl_array_attributes *attrib2 = &vao->VertexAttrib[j];
+
+               /* Update the index into the common binding point and offset */
+               attrib2->_EffBufferBindingIndex = bindex;
+               attrib2->_EffRelativeOffset = attrib2->RelativeOffset;
+               assert(attrib2->_EffRelativeOffset <= MaxRelativeOffset);
+
+               /* Only enabled arrays shall appear in the unique bindings */
+               assert(attrib2->Enabled);
+            }
+            /* Finally this is the set of effectively bound arrays with the
+             * original binding offset.
+             */
+            binding->_EffOffset = binding->Offset;
+            /* The bound arrays past the VERT_ATTRIB_{POS,GENERIC0} mapping. */
+            binding->_EffBoundArrays =
+               _mesa_vao_enable_to_vp_inputs(mode, eff_bound_arrays);
+
+         } else {
+            /* In the VBO case, scan for attribute/binding
+             * combinations with relative bindings in the range of
+             * [0, ctx->Const.MaxVertexAttribRelativeOffset].
+             * Note that this does also go beyond just interleaved arrays
+             * as long as they use the same VBO, binding parameters and the
+             * offsets stay within bounds that the backend still can handle.
+             */
+
+            GLsizeiptr min_offset, max_offset;
+            compute_vbo_offset_range(vao, binding, &min_offset, &max_offset);
+            assert(max_offset <= min_offset + MaxRelativeOffset);
+
+            /* Now scan. */
+            while (scanmask) {
+               /* Do not use u_bit_scan as we can walk multiple
+                * attrib arrays at once
+                */
+               const int j = ffs(scanmask) - 1;
+               const struct gl_array_attributes *attrib2 =
+                  &vao->VertexAttrib[j];
+               const struct gl_vertex_buffer_binding *binding2 =
+                  &vao->BufferBinding[attrib2->BufferBindingIndex];
+
+               /* Remove those attrib bits from the mask that are bound to the
+                * same effective binding point.
+                */
+               const GLbitfield bound2 = enabled & binding2->_BoundArrays;
+               scanmask &= ~bound2;
+
+               /* Check if we have an identical binding */
+               if (binding->Stride != binding2->Stride)
+                  continue;
+               if (binding->InstanceDivisor != binding2->InstanceDivisor)
+                  continue;
+               if (binding->BufferObj != binding2->BufferObj)
+                  continue;
+               /* Check if we can fold both bindings into a common binding */
+               GLsizeiptr min_offset2, max_offset2;
+               compute_vbo_offset_range(vao, binding2,
+                                        &min_offset2, &max_offset2);
+               /* If the relative offset is within the limits ... */
+               if (min_offset + MaxRelativeOffset < max_offset2)
+                  continue;
+               if (min_offset2 + MaxRelativeOffset < max_offset)
+                  continue;
+               /* ... add this array to the effective binding */
+               eff_bound_arrays |= bound2;
+               min_offset = MIN2(min_offset, min_offset2);
+               max_offset = MAX2(max_offset, max_offset2);
+               assert(max_offset <= min_offset + MaxRelativeOffset);
+            }
+
+            /* Update the back reference from the attrib to the binding */
+            GLbitfield attrmask = eff_bound_arrays;
+            while (attrmask) {
+               const int j = u_bit_scan(&attrmask);
+               struct gl_array_attributes *attrib2 = &vao->VertexAttrib[j];
+               const struct gl_vertex_buffer_binding *binding2 =
+                  &vao->BufferBinding[attrib2->BufferBindingIndex];
+
+               /* Update the index into the common binding point and offset */
+               attrib2->_EffBufferBindingIndex = bindex;
+               attrib2->_EffRelativeOffset =
+                  binding2->Offset + attrib2->RelativeOffset - min_offset;
+               assert(attrib2->_EffRelativeOffset <= MaxRelativeOffset);
+
+               /* Only enabled arrays shall appear in the unique bindings */
+               assert(attrib2->Enabled);
+            }
+            /* Finally this is the set of effectively bound arrays */
+            binding->_EffOffset = min_offset;
+            /* The bound arrays past the VERT_ATTRIB_{POS,GENERIC0} mapping. */
+            binding->_EffBoundArrays =
+               _mesa_vao_enable_to_vp_inputs(mode, eff_bound_arrays);
+         }
+
+         /* Mark all the effective bound arrays as processed. */
+         mask &= ~eff_bound_arrays;
+
+      } else {
+         /* Scanning of common bindings for user space arrays.
+          */
+
+         const struct gl_array_attributes *attrib = &vao->VertexAttrib[i];
+         const GLbitfield bound = VERT_BIT(i);
+
+         /* Note that user space array pointers can only happen using a one
+          * to one binding point to array mapping.
+          * The OpenGL 4.x/ARB_vertex_attrib_binding api does not support
+          * user space arrays collected at multiple binding points.
+          * The only provider of user space interleaved arrays with a single
+          * binding point is the mesa internal vbo module. But that one
+          * provides a perfect interleaved set of arrays.
+          *
+          * If this would not be true we would potentially get attribute arrays
+          * with user space pointers that may not lie within the
+          * MaxRelativeOffset range but still attached to a single binding.
+          * Then we would need to store the effective attribute and binding
+          * grouping information in a separate array beside
+          * gl_array_attributes/gl_vertex_buffer_binding.
+          */
+         assert(_mesa_bitcount(binding->_BoundArrays & vao->_Enabled) == 1
+                || (vao->_Enabled & ~binding->_BoundArrays) == 0);
+
+         /* Start this current effective binding with the array */
+         GLbitfield eff_bound_arrays = bound;
+
+         const GLubyte *ptr = attrib->Ptr;
+         unsigned vertex_end = attrib->_ElementSize;
+
+         /* Walk other user space arrays and see which are interleaved
+          * using the same binding parameters.
+          */
+         GLbitfield scanmask = mask & ~vbos & ~bound;
+         while (scanmask) {
+            const int j = u_bit_scan(&scanmask);
+            const struct gl_array_attributes *attrib2 = &vao->VertexAttrib[j];
+            const struct gl_vertex_buffer_binding *binding2 =
+               &vao->BufferBinding[attrib2->BufferBindingIndex];
+
+            /* See the comment at the same assert above. */
+            assert(_mesa_bitcount(binding2->_BoundArrays & vao->_Enabled) == 1
+                   || (vao->_Enabled & ~binding->_BoundArrays) == 0);
+
+            /* Check if we have an identical binding */
+            if (binding->Stride != binding2->Stride)
+               continue;
+            if (binding->InstanceDivisor != binding2->InstanceDivisor)
+               continue;
+            if (ptr <= attrib2->Ptr) {
+               if (ptr + binding->Stride < attrib2->Ptr + attrib2->_ElementSize)
+                  continue;
+               unsigned end = attrib2->Ptr + attrib2->_ElementSize - ptr;
+               vertex_end = MAX2(vertex_end, end);
+            } else {
+               if (attrib2->Ptr + binding->Stride < ptr + vertex_end)
+                  continue;
+               vertex_end += (GLsizei)(ptr - attrib2->Ptr);
+               ptr = attrib2->Ptr;
+            }
+
+            /* User space buffer object */
+            assert(!_mesa_is_bufferobj(binding2->BufferObj));
+
+            eff_bound_arrays |= VERT_BIT(j);
+         }
+
+         /* Update the back reference from the attrib to the binding */
+         GLbitfield attrmask = eff_bound_arrays;
+         while (attrmask) {
+            const int j = u_bit_scan(&attrmask);
+            struct gl_array_attributes *attrib2 = &vao->VertexAttrib[j];
+
+            /* Update the index into the common binding point and the offset */
+            attrib2->_EffBufferBindingIndex = bindex;
+            attrib2->_EffRelativeOffset = attrib2->Ptr - ptr;
+            assert(attrib2->_EffRelativeOffset <= binding->Stride);
+
+            /* Only enabled arrays shall appear in the unique bindings */
+            assert(attrib2->Enabled);
+         }
+         /* Finally this is the set of effectively bound arrays */
+         binding->_EffOffset = (GLintptr)ptr;
+         /* The bound arrays past the VERT_ATTRIB_{POS,GENERIC0} mapping. */
+         binding->_EffBoundArrays =
+            _mesa_vao_enable_to_vp_inputs(mode, eff_bound_arrays);
+
+         /* Mark all the effective bound arrays as processed. */
+         mask &= ~eff_bound_arrays;
+      }
+   }
+
+#ifndef NDEBUG
+   /* Make sure the above code works as expected. */
+   for (gl_vert_attrib attr = 0; attr < VERT_ATTRIB_MAX; ++attr) {
+      /* Query the original api defined attrib/binding information ... */
+      const unsigned char *const map =_mesa_vao_attribute_map[mode];
+      const struct gl_array_attributes *attrib = &vao->VertexAttrib[map[attr]];
+      if (attrib->Enabled) {
+         const struct gl_vertex_buffer_binding *binding =
+            &vao->BufferBinding[attrib->BufferBindingIndex];
+         /* ... and compare that with the computed attrib/binding */
+         const struct gl_vertex_buffer_binding *binding2 =
+            &vao->BufferBinding[attrib->_EffBufferBindingIndex];
+         assert(binding->Stride == binding2->Stride);
+         assert(binding->InstanceDivisor == binding2->InstanceDivisor);
+         assert(binding->BufferObj == binding2->BufferObj);
+         if (_mesa_is_bufferobj(binding->BufferObj)) {
+            assert(attrib->_EffRelativeOffset <= MaxRelativeOffset);
+            assert(binding->Offset + attrib->RelativeOffset ==
+                   binding2->_EffOffset + attrib->_EffRelativeOffset);
+         } else {
+            assert(attrib->_EffRelativeOffset < binding->Stride);
+            assert((GLintptr)attrib->Ptr ==
+                   binding2->_EffOffset + attrib->_EffRelativeOffset);
+         }
+      }
+   }
+#endif
 }
 
 
@@ -592,7 +970,6 @@
     * or to prevent a crash if the VAO being unbound is going to be
     * deleted.
     */
-   _mesa_set_drawing_arrays(ctx, NULL);
    _mesa_set_draw_vao(ctx, ctx->Array._EmptyVAO, 0);
 
    ctx->NewState |= _NEW_ARRAY;
diff --git a/src/mesa/main/arrayobj.h b/src/mesa/main/arrayobj.h
index 8da5c9f..8b11c79 100644
--- a/src/mesa/main/arrayobj.h
+++ b/src/mesa/main/arrayobj.h
@@ -30,6 +30,7 @@
 #include "glheader.h"
 #include "mtypes.h"
 #include "glformats.h"
+#include "vbo/vbo.h"
 
 struct gl_context;
 
@@ -146,6 +147,191 @@
 }
 
 
+/**
+ * Helper functions for consuming backends to walk the
+ * ctx->Array._DrawVAO for driver side array setup.
+ * Note that mesa provides preprocessed minimal binding information
+ * in the VAO. See _mesa_update_vao_derived_arrays for documentation.
+ */
+
+/**
+ * Return enabled vertex attribute bits for draw.
+ */
+static inline GLbitfield
+_mesa_draw_array_bits(const struct gl_context *ctx)
+{
+   return ctx->Array._DrawVAOEnabledAttribs;
+}
+
+
+/**
+ * Return enabled buffer object vertex attribute bits for draw.
+ *
+ * Needs a fully updated VAO ready for draw (asserts vao->NewArrays == 0).
+ */
+static inline GLbitfield
+_mesa_draw_vbo_array_bits(const struct gl_context *ctx)
+{
+   const struct gl_vertex_array_object *const vao = ctx->Array._DrawVAO;
+   assert(vao->NewArrays == 0);
+   return vao->_EffEnabledVBO & ctx->Array._DrawVAOEnabledAttribs;
+}
+
+
+/**
+ * Return enabled user space vertex attribute bits for draw.
+ *
+ * Needs a fully updated VAO ready for draw (asserts vao->NewArrays == 0).
+ */
+static inline GLbitfield
+_mesa_draw_user_array_bits(const struct gl_context *ctx)
+{
+   const struct gl_vertex_array_object *const vao = ctx->Array._DrawVAO;
+   assert(vao->NewArrays == 0);
+   return ~vao->_EffEnabledVBO & ctx->Array._DrawVAOEnabledAttribs;
+}
+
+
+/**
+ * Return the attribute bits not enabled as arrays, i.e. current values.
+ */
+static inline GLbitfield
+_mesa_draw_current_bits(const struct gl_context *ctx)
+{
+   return ~ctx->Array._DrawVAOEnabledAttribs & VERT_BIT_ALL;
+}
+
+
+/**
+ * Return the effective vertex buffer binding for the given attribute struct.
+ *
+ * Needs a fully updated VAO ready for draw (asserts vao->NewArrays == 0).
+ */
+static inline const struct gl_vertex_buffer_binding*
+_mesa_draw_buffer_binding_from_attrib(const struct gl_vertex_array_object *vao,
+                                      const struct gl_array_attributes *attrib)
+{
+   assert(vao->NewArrays == 0);
+   return &vao->BufferBinding[attrib->_EffBufferBindingIndex];
+}
+
+
+/**
+ * Return vertex array attribute for \p attr; needs a fully updated VAO.
+ */
+static inline const struct gl_array_attributes*
+_mesa_draw_array_attrib(const struct gl_vertex_array_object *vao,
+                        gl_vert_attrib attr)
+{
+   assert(vao->NewArrays == 0);
+   const gl_attribute_map_mode map_mode = vao->_AttributeMapMode;
+   return &vao->VertexAttrib[_mesa_vao_attribute_map[map_mode][attr]];
+}
+
+
+/**
+ * Return vertex buffer binding for \p attr; needs a fully updated VAO.
+ */
+static inline const struct gl_vertex_buffer_binding*
+_mesa_draw_buffer_binding(const struct gl_vertex_array_object *vao,
+                          gl_vert_attrib attr)
+{
+   const struct gl_array_attributes *const attrib
+      = _mesa_draw_array_attrib(vao, attr);
+   return _mesa_draw_buffer_binding_from_attrib(vao, attrib);
+}
+
+
+/**
+ * Return vertex attribute bits bound at the provided binding.
+ *
+ * Needs a fully updated VAO ready for draw; reads the derived _EffBoundArrays.
+ */
+static inline GLbitfield
+_mesa_draw_bound_attrib_bits(const struct gl_vertex_buffer_binding *binding)
+{
+   return binding->_EffBoundArrays;
+}
+
+
+/**
+ * Return the vertex offset bound at the provided binding.
+ *
+ * Needs a fully updated VAO ready for draw; reads the derived _EffOffset.
+ */
+static inline GLintptr
+_mesa_draw_binding_offset(const struct gl_vertex_buffer_binding *binding)
+{
+   return binding->_EffOffset;
+}
+
+
+/**
+ * Return the relative offset of the provided attrib.
+ *
+ * Needs a fully updated VAO ready for draw; reads _EffRelativeOffset.
+ */
+static inline GLushort
+_mesa_draw_attributes_relative_offset(const struct gl_array_attributes *attrib)
+{
+   return attrib->_EffRelativeOffset;
+}
+
+
+/**
+ * Return a current value vertex array attribute provided the attribute number.
+ */
+static inline const struct gl_array_attributes*
+_mesa_draw_current_attrib(const struct gl_context *ctx, gl_vert_attrib attr)
+{
+   return _vbo_current_attrib(ctx, attr);
+}
+
+
+/**
+ * Return true if we have the VERT_ATTRIB_EDGEFLAG array enabled.
+ */
+static inline bool
+_mesa_draw_edge_flag_array_enabled(const struct gl_context *ctx)
+{
+   return ctx->Array._DrawVAOEnabledAttribs & VERT_BIT_EDGEFLAG;
+}
+
+
+/**
+ * Return the array attrib if \p attr is enabled, else the current attrib.
+ */
+static inline const struct gl_array_attributes*
+_mesa_draw_attrib(const struct gl_context *ctx, gl_vert_attrib attr)
+{
+   if (ctx->Array._DrawVAOEnabledAttribs & VERT_BIT(attr)) {
+      const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
+      return _mesa_draw_array_attrib(vao, attr);
+   } else {
+      return _vbo_current_attrib(ctx, attr);
+   }
+}
+
+
+/**
+ * Return the attrib, binding pair (array or current value) for \p attr.
+ */
+static inline void
+_mesa_draw_attrib_and_binding(const struct gl_context *ctx, gl_vert_attrib attr,
+                              const struct gl_array_attributes **attrib,
+                              const struct gl_vertex_buffer_binding **binding)
+{
+   if (ctx->Array._DrawVAOEnabledAttribs & VERT_BIT(attr)) {
+      const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
+      *attrib = _mesa_draw_array_attrib(vao, attr);
+      *binding = _mesa_draw_buffer_binding_from_attrib(vao, *attrib);
+   } else {
+      *attrib = _vbo_current_attrib(ctx, attr);
+      *binding = _vbo_current_binding(ctx);
+   }
+}
+
+
 /*
  * API functions
  */
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index 9f0e716..cbe93ab 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -137,6 +137,9 @@
 
    /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
    GLboolean sRGBEnabled;
+
+   /* GL_NV_conservative_raster */
+   GLboolean ConservativeRasterization;
 };
 
 
@@ -177,6 +180,13 @@
 };
 
 
+struct viewport_state
+{
+   struct gl_viewport_attrib ViewportArray[MAX_VIEWPORTS];
+   GLuint SubpixelPrecisionBias[2];
+};
+
+
 /** An unused GL_*_BIT value */
 #define DUMMY_BIT 0x10000000
 
@@ -393,6 +403,9 @@
 
       /* GL_ARB_framebuffer_sRGB / GL_EXT_framebuffer_sRGB */
       attr->sRGBEnabled = ctx->Color.sRGBEnabled;
+
+      /* GL_NV_conservative_raster */
+      attr->ConservativeRasterization = ctx->ConservativeRasterization;
    }
 
    if (mask & GL_EVAL_BIT) {
@@ -544,11 +557,23 @@
    }
 
    if (mask & GL_VIEWPORT_BIT) {
-      if (!push_attrib(ctx, &head, GL_VIEWPORT_BIT,
-                       sizeof(struct gl_viewport_attrib)
-                       * ctx->Const.MaxViewports,
-                       (void*)&ctx->ViewportArray))
+      struct viewport_state *viewstate = CALLOC_STRUCT(viewport_state);
+      if (!viewstate) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glPushAttrib(GL_VIEWPORT_BIT)");
          goto end;
+      }
+
+      if (!save_attrib_data(&head, GL_VIEWPORT_BIT, viewstate)) {
+         free(viewstate);
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glPushAttrib(GL_VIEWPORT_BIT)");
+         goto end;
+      }
+
+      memcpy(&viewstate->ViewportArray, &ctx->ViewportArray,
+             sizeof(struct gl_viewport_attrib)*ctx->Const.MaxViewports);
+
+      viewstate->SubpixelPrecisionBias[0] = ctx->SubpixelPrecisionBias[0];
+      viewstate->SubpixelPrecisionBias[1] = ctx->SubpixelPrecisionBias[1];
    }
 
    /* GL_ARB_multisample */
@@ -713,6 +738,13 @@
    TEST_AND_UPDATE(ctx->Color.sRGBEnabled, enable->sRGBEnabled,
                    GL_FRAMEBUFFER_SRGB);
 
+   /* GL_NV_conservative_raster */
+   if (ctx->Extensions.NV_conservative_raster) {
+      TEST_AND_UPDATE(ctx->ConservativeRasterization,
+                      enable->ConservativeRasterization,
+                      GL_CONSERVATIVE_RASTERIZATION_NV);
+   }
+
    /* texture unit enables */
    for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
       const GLbitfield enabled = enable->Texture[i];
@@ -1126,7 +1158,8 @@
                                       ctx->DriverFlags.NewSampleAlphaToXEnable |
                                       ctx->DriverFlags.NewSampleMask |
                                       ctx->DriverFlags.NewScissorTest |
-                                      ctx->DriverFlags.NewStencil;
+                                      ctx->DriverFlags.NewStencil |
+                                      ctx->DriverFlags.NewNvConservativeRasterization;
             }
             break;
          case GL_EVAL_BIT:
@@ -1418,13 +1451,20 @@
          case GL_VIEWPORT_BIT:
             {
                unsigned i;
-               const struct gl_viewport_attrib *vp;
-               vp = (const struct gl_viewport_attrib *) attr->data;
+               const struct viewport_state *viewstate;
+               viewstate = (const struct viewport_state *) attr->data;
 
                for (i = 0; i < ctx->Const.MaxViewports; i++) {
-                  _mesa_set_viewport(ctx, i, vp[i].X, vp[i].Y, vp[i].Width,
-                                     vp[i].Height);
-                  _mesa_set_depth_range(ctx, i, vp[i].Near, vp[i].Far);
+                  const struct gl_viewport_attrib *vp = &viewstate->ViewportArray[i];
+                  _mesa_set_viewport(ctx, i, vp->X, vp->Y, vp->Width,
+                                     vp->Height);
+                  _mesa_set_depth_range(ctx, i, vp->Near, vp->Far);
+               }
+
+               if (ctx->Extensions.NV_conservative_raster) {
+                  GLuint biasx = viewstate->SubpixelPrecisionBias[0];
+                  GLuint biasy = viewstate->SubpixelPrecisionBias[1];
+                  _mesa_SubpixelPrecisionBiasNV(biasx, biasy);
                }
             }
             break;
@@ -1512,6 +1552,7 @@
 
    /* _Enabled must be the same than on push */
    dest->_Enabled = src->_Enabled;
+   dest->_EffEnabledVBO = src->_EffEnabledVBO;
    /* The bitmask of bound VBOs needs to match the VertexBinding array */
    dest->VertexAttribBufferMask = src->VertexAttribBufferMask;
    dest->_AttributeMapMode = src->_AttributeMapMode;
@@ -1547,7 +1588,6 @@
    /* skip IndexBufferObj */
 
    /* Invalidate array state. It will be updated during the next draw. */
-   _mesa_set_drawing_arrays(ctx, NULL);
    _mesa_set_draw_vao(ctx, ctx->Array._EmptyVAO, 0);
 }
 
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 068c7dd..1d1e51b 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -129,8 +129,7 @@
          return &ctx->QueryBuffer;
       break;
    case GL_DRAW_INDIRECT_BUFFER:
-      if ((ctx->API == API_OPENGL_CORE &&
-           ctx->Extensions.ARB_draw_indirect) ||
+      if ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_draw_indirect) ||
            _mesa_is_gles31(ctx)) {
          return &ctx->DrawIndirectBuffer;
       }
@@ -1167,7 +1166,7 @@
    if (vao->BufferBinding[index].BufferObj == obj) {
       _mesa_bind_vertex_buffer(ctx, vao, index, ctx->Shared->NullBufferObj,
                                vao->BufferBinding[index].Offset,
-                               vao->BufferBinding[index].Stride, true);
+                               vao->BufferBinding[index].Stride);
    }
 }
 
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 7bb5725d..bb85688 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -305,7 +305,9 @@
    /* Call device driver function only if fb is the bound draw buffer */
    if (fb == ctx->DrawBuffer) {
       if (ctx->Driver.DrawBuffer)
-         ctx->Driver.DrawBuffer(ctx, buffer);
+         ctx->Driver.DrawBuffer(ctx);
+      if (ctx->Driver.DrawBufferAllocate)
+         ctx->Driver.DrawBufferAllocate(ctx);
    }
 }
 
@@ -586,7 +588,9 @@
     */
    if (fb == ctx->DrawBuffer) {
       if (ctx->Driver.DrawBuffer)
-         ctx->Driver.DrawBuffer(ctx, n > 0 ? buffers[0] : GL_NONE);
+         ctx->Driver.DrawBuffer(ctx);
+      if (ctx->Driver.DrawBufferAllocate)
+         ctx->Driver.DrawBufferAllocate(ctx);
    }
 }
 
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 81573bf..6a2f766 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -315,4 +315,13 @@
 #define MAX_CLIPPED_VERTICES ((2 * (6 + MAX_CLIP_PLANES))+1)
 
 
+/** For GL_ARB_sample_locations - maximum of SAMPLE_LOCATION_PIXEL_GRID_*_ARB */
+#define MAX_SAMPLE_LOCATION_GRID_SIZE 4
+
+/* It is theoretically possible for Consts.MaxSamples to be >32 but
+ * other code seems to assume that is not the case.
+ */
+#define MAX_SAMPLE_LOCATION_TABLE_SIZE \
+   (MAX_SAMPLE_LOCATION_GRID_SIZE * MAX_SAMPLE_LOCATION_GRID_SIZE * 32)
+
 #endif /* MESA_CONFIG_H_INCLUDED */
diff --git a/src/mesa/main/conservativeraster.c b/src/mesa/main/conservativeraster.c
new file mode 100644
index 0000000..9068a00
--- /dev/null
+++ b/src/mesa/main/conservativeraster.c
@@ -0,0 +1,128 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2018 Rhys Perry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * \file conservativeraster.c
+ * glConservativeRasterParameteriNV and glConservativeRasterParameterfNV functions
+ */
+
+#include "conservativeraster.h"
+#include "context.h"
+#include "enums.h"
+
+/**
+ * Common code for glConservativeRasterParameter{i,f}NV: validates \p pname
+ * and \p param unless \p no_error is set, then updates the context state.
+ */
+static ALWAYS_INLINE void
+conservative_raster_parameter(GLenum pname, GLfloat param,
+                              bool no_error, const char *func)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!no_error && !ctx->Extensions.NV_conservative_raster_dilate &&
+       !ctx->Extensions.NV_conservative_raster_pre_snap_triangles) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s not supported", func);
+      return;
+   }
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "%s(%s, %g)\n",
+                  func, _mesa_enum_to_string(pname), param);
+
+   ASSERT_OUTSIDE_BEGIN_END(ctx);
+   switch (pname) {
+   case GL_CONSERVATIVE_RASTER_DILATE_NV:
+      if (!no_error && !ctx->Extensions.NV_conservative_raster_dilate)
+         goto invalid_pname_enum;
+      if (!no_error && param<0.0) {
+         _mesa_error(ctx, GL_INVALID_VALUE, "%s(param=%g)", func, param);
+         return;
+      }
+      /* Flush queued vertices first; they must see the old state. */
+      FLUSH_VERTICES(ctx, 0);
+      ctx->ConservativeRasterDilate =
+         CLAMP(param, ctx->Const.ConservativeRasterDilateRange[0],
+               ctx->Const.ConservativeRasterDilateRange[1]);
+      break;
+   case GL_CONSERVATIVE_RASTER_MODE_NV:
+      if (!no_error && !ctx->Extensions.NV_conservative_raster_pre_snap_triangles)
+         goto invalid_pname_enum;
+      if (!no_error && param != GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV &&
+          param != GL_CONSERVATIVE_RASTER_MODE_PRE_SNAP_TRIANGLES_NV) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "%s(param=%s)", func, _mesa_enum_to_string(param));
+         return;
+      }
+      FLUSH_VERTICES(ctx, 0);
+      ctx->ConservativeRasterMode = param;
+      break;
+   default:
+      goto invalid_pname_enum;
+   }
+
+   ctx->NewDriverState |=
+      ctx->DriverFlags.NewNvConservativeRasterizationParams;
+   return;
+invalid_pname_enum:
+   if (!no_error)
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)",
+                  func, _mesa_enum_to_string(pname));
+}
+
+void GLAPIENTRY
+_mesa_ConservativeRasterParameteriNV_no_error(GLenum pname, GLint param)
+{
+   conservative_raster_parameter(pname, param, true, 
+                                 "glConservativeRasterParameteriNV");
+}
+
+void GLAPIENTRY
+_mesa_ConservativeRasterParameteriNV(GLenum pname, GLint param)
+{
+   conservative_raster_parameter(pname, param, false,
+                                 "glConservativeRasterParameteriNV");
+}
+
+void GLAPIENTRY
+_mesa_ConservativeRasterParameterfNV_no_error(GLenum pname, GLfloat param)
+{
+   conservative_raster_parameter(pname, param, true, 
+                                 "glConservativeRasterParameterfNV");
+}
+
+void GLAPIENTRY
+_mesa_ConservativeRasterParameterfNV(GLenum pname, GLfloat param)
+{
+   conservative_raster_parameter(pname, param, false, 
+                                 "glConservativeRasterParameterfNV");
+}
+
+void
+_mesa_init_conservative_raster(struct gl_context *ctx)
+{
+   ctx->ConservativeRasterDilate = 0.0;
+   ctx->ConservativeRasterMode = GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV;
+}
diff --git a/src/mesa/main/conservativeraster.h b/src/mesa/main/conservativeraster.h
new file mode 100644
index 0000000..1865cfc
--- /dev/null
+++ b/src/mesa/main/conservativeraster.h
@@ -0,0 +1,48 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2018 Rhys Perry
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef CONSERVATIVERASTER_H
+#define CONSERVATIVERASTER_H
+
+#include "glheader.h"
+
+struct gl_context;
+
+extern void GLAPIENTRY
+_mesa_ConservativeRasterParameteriNV_no_error(GLenum pname, GLint param);
+
+extern void GLAPIENTRY
+_mesa_ConservativeRasterParameteriNV(GLenum pname, GLint param);
+
+extern void GLAPIENTRY
+_mesa_ConservativeRasterParameterfNV_no_error(GLenum pname, GLfloat param);
+
+extern void GLAPIENTRY
+_mesa_ConservativeRasterParameterfNV(GLenum pname, GLfloat param);
+
+extern void
+_mesa_init_conservative_raster(struct gl_context *ctx);
+
+#endif
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index e13343b..0ef8fe3 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -87,6 +87,7 @@
 #include "blend.h"
 #include "buffers.h"
 #include "bufferobj.h"
+#include "conservativeraster.h"
 #include "context.h"
 #include "cpuinfo.h"
 #include "debug.h"
@@ -637,10 +638,6 @@
    consts->MaxGeometryOutputVertices = MAX_GEOMETRY_OUTPUT_VERTICES;
    consts->MaxGeometryTotalOutputComponents = MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS;
 
-   /* Shading language version */
-   consts->GLSLVersion = 120;
-   _mesa_override_glsl_version(consts);
-
 #ifdef DEBUG
    consts->GenerateTemporaryNames = true;
 #else
@@ -739,6 +736,14 @@
    consts->MaxComputeVariableGroupSize[1] = 512;
    consts->MaxComputeVariableGroupSize[2] = 64;
    consts->MaxComputeVariableGroupInvocations = 512;
+
+   /** GL_NV_conservative_raster */
+   consts->MaxSubpixelPrecisionBiasBits = 0;
+
+   /** GL_NV_conservative_raster_dilate */
+   consts->ConservativeRasterDilateRange[0] = 0.0;
+   consts->ConservativeRasterDilateRange[1] = 0.0;
+   consts->ConservativeRasterDilateGranularity = 0.0;
 }
 
 
@@ -828,6 +833,7 @@
    _mesa_init_bbox( ctx );
    _mesa_init_buffer_objects( ctx );
    _mesa_init_color( ctx );
+   _mesa_init_conservative_raster( ctx );
    _mesa_init_current( ctx );
    _mesa_init_depth( ctx );
    _mesa_init_debug( ctx );
diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h
index ef06540..d50438f 100644
--- a/src/mesa/main/context.h
+++ b/src/mesa/main/context.h
@@ -233,6 +233,22 @@
 } while (0)
 
 /**
+ * Flush vertices.
+ *
+ * \param ctx GL context.
+ *
+ * Checks if dd_function_table::NeedFlush is marked to flush stored vertices
+ * or current state and calls vbo_exec_FlushVertices() with those flags if so.
+ */
+#define FLUSH_FOR_DRAW(ctx)                                     \
+do {                                                            \
+   if (MESA_VERBOSE & VERBOSE_STATE)                            \
+      _mesa_debug(ctx, "FLUSH_FOR_DRAW in %s\n", __func__);     \
+   if (ctx->Driver.NeedFlush)                                   \
+      vbo_exec_FlushVertices(ctx, ctx->Driver.NeedFlush);       \
+} while (0)
+
+/**
  * Macro to assert that the API call was made outside the
  * glBegin()/glEnd() pair, with return value.
  * 
@@ -362,6 +378,13 @@
           _mesa_has_OES_texture_cube_map_array(ctx);
 }
 
+static inline bool
+_mesa_has_texture_view(const struct gl_context *ctx)
+{
+   return _mesa_has_ARB_texture_view(ctx) ||
+          _mesa_has_OES_texture_view(ctx);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index d85d89e..f14c3e0 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -429,7 +429,8 @@
 			   struct gl_renderbuffer *rb,
 			   GLuint x, GLuint y, GLuint w, GLuint h,
 			   GLbitfield mode,
-			   GLubyte **mapOut, GLint *rowStrideOut);
+			   GLubyte **mapOut, GLint *rowStrideOut,
+			   bool flip_y);
 
    void (*UnmapRenderbuffer)(struct gl_context *ctx,
 			     struct gl_renderbuffer *rb);
@@ -611,7 +612,9 @@
    /** Specify mapping of depth values from NDC to window coordinates */
    void (*DepthRange)(struct gl_context *ctx);
    /** Specify the current buffer for writing */
-   void (*DrawBuffer)( struct gl_context *ctx, GLenum buffer );
+   void (*DrawBuffer)(struct gl_context *ctx);
+   /** Used to allocate any buffers with on-demand creation */
+   void (*DrawBufferAllocate)(struct gl_context *ctx);
    /** Enable or disable server-side gl capabilities */
    void (*Enable)(struct gl_context *ctx, GLenum cap, GLboolean state);
    /** Specify fog parameters */
@@ -786,6 +789,14 @@
                               const GLenum *attachments);
 
    /**
+    * \name Functions for GL_ARB_sample_locations
+    */
+   void (*GetProgrammableSampleCaps)(struct gl_context *ctx,
+                                     const struct gl_framebuffer *fb,
+                                     GLuint *bits, GLuint *width, GLuint *height);
+   void (*EvaluateDepthValues)(struct gl_context *ctx);
+
+   /**
     * \name Query objects
     */
    /*@{*/
@@ -1208,6 +1219,7 @@
    void (*GetProgramBinaryDriverSHA1)(struct gl_context *ctx, uint8_t *sha1);
 
    void (*ProgramBinarySerializeDriverBlob)(struct gl_context *ctx,
+                                            struct gl_shader_program *shProg,
                                             struct gl_program *prog);
 
    void (*ProgramBinaryDeserializeDriverBlob)(struct gl_context *ctx,
@@ -1272,6 +1284,21 @@
                                 struct gl_semaphore_object *semObj,
                                 int fd);
    /*@}*/
+
+   /**
+    * \name Disk shader cache functions
+    */
+   /*@{*/
+   /**
+    * Called to initialize gl_program::driver_cache_blob (and size) with a
+    * ralloc allocated buffer.
+    *
+    * This buffer will be saved and restored as part of the gl_program
+    * serialization and deserialization.
+    */
+   void (*ShaderCacheSerializeDriverBlob)(struct gl_context *ctx,
+                                          struct gl_program *prog);
+   /*@}*/
 };
 
 
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 565e55d..ae23d29 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -35,7 +35,7 @@
 #include "api_arrayelt.h"
 #include "api_exec.h"
 #include "api_loopback.h"
-#include "api_validate.h"
+#include "draw_validate.h"
 #include "atifragshader.h"
 #include "config.h"
 #include "bufferobj.h"
@@ -290,6 +290,15 @@
    OPCODE_TRANSLATE,
    OPCODE_VIEWPORT,
    OPCODE_WINDOW_POS,
+   /* ARB_viewport_array */
+   OPCODE_VIEWPORT_ARRAY_V,
+   OPCODE_VIEWPORT_INDEXED_F,
+   OPCODE_VIEWPORT_INDEXED_FV,
+   OPCODE_SCISSOR_ARRAY_V,
+   OPCODE_SCISSOR_INDEXED,
+   OPCODE_SCISSOR_INDEXED_V,
+   OPCODE_DEPTH_ARRAY_V,
+   OPCODE_DEPTH_INDEXED,
    /* GL_ARB_multitexture */
    OPCODE_ACTIVE_TEXTURE,
    /* GL_ARB_texture_compression */
@@ -365,6 +374,30 @@
    OPCODE_UNIFORM_3UIV,
    OPCODE_UNIFORM_4UIV,
 
+   /* GL_ARB_gpu_shader_fp64 */
+   OPCODE_UNIFORM_1D,
+   OPCODE_UNIFORM_2D,
+   OPCODE_UNIFORM_3D,
+   OPCODE_UNIFORM_4D,
+   OPCODE_UNIFORM_1DV,
+   OPCODE_UNIFORM_2DV,
+   OPCODE_UNIFORM_3DV,
+   OPCODE_UNIFORM_4DV,
+   OPCODE_UNIFORM_MATRIX22D,
+   OPCODE_UNIFORM_MATRIX33D,
+   OPCODE_UNIFORM_MATRIX44D,
+   OPCODE_UNIFORM_MATRIX23D,
+   OPCODE_UNIFORM_MATRIX32D,
+   OPCODE_UNIFORM_MATRIX24D,
+   OPCODE_UNIFORM_MATRIX42D,
+   OPCODE_UNIFORM_MATRIX34D,
+   OPCODE_UNIFORM_MATRIX43D,
+
+   /* OpenGL 4.0 / GL_ARB_tessellation_shader */
+   OPCODE_PATCH_PARAMETER_I,
+   OPCODE_PATCH_PARAMETER_FV_INNER,
+   OPCODE_PATCH_PARAMETER_FV_OUTER,
+
    /* OpenGL 4.2 / GL_ARB_separate_shader_objects */
    OPCODE_USE_PROGRAM_STAGES,
    OPCODE_PROGRAM_UNIFORM_1F,
@@ -375,6 +408,14 @@
    OPCODE_PROGRAM_UNIFORM_2FV,
    OPCODE_PROGRAM_UNIFORM_3FV,
    OPCODE_PROGRAM_UNIFORM_4FV,
+   OPCODE_PROGRAM_UNIFORM_1D,
+   OPCODE_PROGRAM_UNIFORM_2D,
+   OPCODE_PROGRAM_UNIFORM_3D,
+   OPCODE_PROGRAM_UNIFORM_4D,
+   OPCODE_PROGRAM_UNIFORM_1DV,
+   OPCODE_PROGRAM_UNIFORM_2DV,
+   OPCODE_PROGRAM_UNIFORM_3DV,
+   OPCODE_PROGRAM_UNIFORM_4DV,
    OPCODE_PROGRAM_UNIFORM_1I,
    OPCODE_PROGRAM_UNIFORM_2I,
    OPCODE_PROGRAM_UNIFORM_3I,
@@ -400,6 +441,15 @@
    OPCODE_PROGRAM_UNIFORM_MATRIX42F,
    OPCODE_PROGRAM_UNIFORM_MATRIX34F,
    OPCODE_PROGRAM_UNIFORM_MATRIX43F,
+   OPCODE_PROGRAM_UNIFORM_MATRIX22D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX33D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX44D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX23D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX32D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX24D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX42D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX34D,
+   OPCODE_PROGRAM_UNIFORM_MATRIX43D,
 
    /* GL_ARB_clip_control */
    OPCODE_CLIP_CONTROL,
@@ -421,6 +471,10 @@
    OPCODE_ATTR_2F_ARB,
    OPCODE_ATTR_3F_ARB,
    OPCODE_ATTR_4F_ARB,
+   OPCODE_ATTR_1D,
+   OPCODE_ATTR_2D,
+   OPCODE_ATTR_3D,
+   OPCODE_ATTR_4D,
    OPCODE_MATERIAL,
    OPCODE_BEGIN,
    OPCODE_END,
@@ -460,6 +514,9 @@
    OPCODE_SAMPLER_PARAMETERIIV,
    OPCODE_SAMPLER_PARAMETERUIV,
 
+   /* ARB_compute_shader */
+   OPCODE_DISPATCH_COMPUTE,
+
    /* GL_ARB_sync */
    OPCODE_WAIT_SYNC,
 
@@ -482,12 +539,24 @@
    /* ARB_uniform_buffer_object */
    OPCODE_UNIFORM_BLOCK_BINDING,
 
+   /* ARB_shader_subroutines */
+   OPCODE_UNIFORM_SUBROUTINES,
+
    /* EXT_polygon_offset_clamp */
    OPCODE_POLYGON_OFFSET_CLAMP,
 
    /* EXT_window_rectangles */
    OPCODE_WINDOW_RECTANGLES,
 
+   /* NV_conservative_raster */
+   OPCODE_SUBPIXEL_PRECISION_BIAS,
+
+   /* NV_conservative_raster_dilate */
+   OPCODE_CONSERVATIVE_RASTER_PARAMETER_F,
+
+   /* NV_conservative_raster_pre_snap_triangles */
+   OPCODE_CONSERVATIVE_RASTER_PARAMETER_I,
+
    /* The following three are meta instructions */
    OPCODE_ERROR,                /* raise compiled-in error */
    OPCODE_CONTINUE,
@@ -592,6 +661,22 @@
 };
 
 
+union float64_pair
+{
+   GLdouble d;
+   GLuint uint32[2];
+};
+
+
+#define ASSIGN_DOUBLE_TO_NODES(n, idx, value)                              \
+   do {                                                                    \
+      union float64_pair tmp;                                              \
+      tmp.d = value;                                                       \
+      n[idx].ui = tmp.uint32[0];                                           \
+      n[idx+1].ui = tmp.uint32[1];                                         \
+   } while (0)
+
+
 /**
  * How many nodes to allocate at a time.  Note that bulk vertex data
  * from glBegin/glVertex/glEnd primitives will typically wind up in
@@ -1020,6 +1105,10 @@
          case OPCODE_UNIFORM_2FV:
          case OPCODE_UNIFORM_3FV:
          case OPCODE_UNIFORM_4FV:
+         case OPCODE_UNIFORM_1DV:
+         case OPCODE_UNIFORM_2DV:
+         case OPCODE_UNIFORM_3DV:
+         case OPCODE_UNIFORM_4DV:
          case OPCODE_UNIFORM_1IV:
          case OPCODE_UNIFORM_2IV:
          case OPCODE_UNIFORM_3IV:
@@ -1039,12 +1128,25 @@
          case OPCODE_UNIFORM_MATRIX32:
          case OPCODE_UNIFORM_MATRIX34:
          case OPCODE_UNIFORM_MATRIX43:
+         case OPCODE_UNIFORM_MATRIX22D:
+         case OPCODE_UNIFORM_MATRIX33D:
+         case OPCODE_UNIFORM_MATRIX44D:
+         case OPCODE_UNIFORM_MATRIX24D:
+         case OPCODE_UNIFORM_MATRIX42D:
+         case OPCODE_UNIFORM_MATRIX23D:
+         case OPCODE_UNIFORM_MATRIX32D:
+         case OPCODE_UNIFORM_MATRIX34D:
+         case OPCODE_UNIFORM_MATRIX43D:
             free(get_pointer(&n[4]));
             break;
          case OPCODE_PROGRAM_UNIFORM_1FV:
          case OPCODE_PROGRAM_UNIFORM_2FV:
          case OPCODE_PROGRAM_UNIFORM_3FV:
          case OPCODE_PROGRAM_UNIFORM_4FV:
+         case OPCODE_PROGRAM_UNIFORM_1DV:
+         case OPCODE_PROGRAM_UNIFORM_2DV:
+         case OPCODE_PROGRAM_UNIFORM_3DV:
+         case OPCODE_PROGRAM_UNIFORM_4DV:
          case OPCODE_PROGRAM_UNIFORM_1IV:
          case OPCODE_PROGRAM_UNIFORM_2IV:
          case OPCODE_PROGRAM_UNIFORM_3IV:
@@ -1064,11 +1166,24 @@
          case OPCODE_PROGRAM_UNIFORM_MATRIX32F:
          case OPCODE_PROGRAM_UNIFORM_MATRIX34F:
          case OPCODE_PROGRAM_UNIFORM_MATRIX43F:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX22D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX33D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX44D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX24D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX42D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX23D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX32D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX34D:
+         case OPCODE_PROGRAM_UNIFORM_MATRIX43D:
             free(get_pointer(&n[5]));
             break;
          case OPCODE_PIXEL_MAP:
             free(get_pointer(&n[3]));
             break;
+         case OPCODE_VIEWPORT_ARRAY_V:
+         case OPCODE_SCISSOR_ARRAY_V:
+         case OPCODE_DEPTH_ARRAY_V:
+         case OPCODE_UNIFORM_SUBROUTINES:
          case OPCODE_WINDOW_RECTANGLES:
             free(get_pointer(&n[3]));
             break;
@@ -1802,6 +1917,47 @@
                "glDrawElementsInstancedBaseVertexBaseInstance() during display list compile");
 }
 
+static void GLAPIENTRY
+save_DrawArraysIndirect(UNUSED GLenum mode,
+                        UNUSED const void *indirect)
+{
+   GET_CURRENT_CONTEXT(ctx);   /* nothing is saved; only an error is raised */
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glDrawArraysIndirect() during display list compile");
+}
+
+static void GLAPIENTRY
+save_DrawElementsIndirect(UNUSED GLenum mode,
+                          UNUSED GLenum type,
+                          UNUSED const void *indirect)
+{
+   GET_CURRENT_CONTEXT(ctx);   /* nothing is saved; only an error is raised */
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glDrawElementsIndirect() during display list compile");
+}
+
+static void GLAPIENTRY
+save_MultiDrawArraysIndirect(UNUSED GLenum mode,
+                             UNUSED const void *indirect,
+                             UNUSED GLsizei primcount,
+                             UNUSED GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);   /* nothing is saved; only an error is raised */
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glMultiDrawArraysIndirect() during display list compile");
+}
+
+static void GLAPIENTRY
+save_MultiDrawElementsIndirect(UNUSED GLenum mode,
+                               UNUSED GLenum type,
+                               UNUSED const void *indirect,
+                               UNUSED GLsizei primcount,
+                               UNUSED GLsizei stride)
+{
+   GET_CURRENT_CONTEXT(ctx);   /* nothing is saved; only an error is raised */
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glMultiDrawElementsIndirect() during display list compile");
+}
 
 /**
  * While building a display list we cache some OpenGL state.
@@ -3263,6 +3419,54 @@
 
 
 static void GLAPIENTRY
+save_PatchParameteri(GLenum pname, const GLint value)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PATCH_PARAMETER_I, 2);
+   if (n) {
+      n[1].e = pname;
+      n[2].i = value;
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_PatchParameteri(ctx->Exec, (pname, value));
+   }
+}
+
+
+static void GLAPIENTRY
+save_PatchParameterfv(GLenum pname, const GLfloat *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   /* The outer variant stores params[0..3]; the inner one params[0..1]. */
+   if (pname == GL_PATCH_DEFAULT_OUTER_LEVEL) {
+      n = alloc_instruction(ctx, OPCODE_PATCH_PARAMETER_FV_OUTER, 5);
+   } else {
+      assert(pname == GL_PATCH_DEFAULT_INNER_LEVEL);
+      n = alloc_instruction(ctx, OPCODE_PATCH_PARAMETER_FV_INNER, 3);
+   }
+   if (n) {
+      n[1].e = pname;   /* kept in the node in addition to the opcode */
+      if (pname == GL_PATCH_DEFAULT_OUTER_LEVEL) {
+         n[2].f = params[0];
+         n[3].f = params[1];
+         n[4].f = params[2];
+         n[5].f = params[3];
+      } else {
+         n[2].f = params[0];
+         n[3].f = params[1];
+      }
+   }
+   if (ctx->ExecuteFlag) {   /* also forward the call to ctx->Exec */
+      CALL_PatchParameterfv(ctx->Exec, (pname, params));
+   }
+}
+
+
+static void GLAPIENTRY
 save_PixelMapfv(GLenum map, GLint mapsize, const GLfloat *values)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -3477,14 +3681,6 @@
 
 
 static void GLAPIENTRY
-save_PolygonOffsetEXT(GLfloat factor, GLfloat bias)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   /* XXX mult by DepthMaxF here??? */
-   save_PolygonOffset(factor, ctx->DrawBuffer->_DepthMaxF * bias);
-}
-
-static void GLAPIENTRY
 save_PolygonOffsetClampEXT(GLfloat factor, GLfloat units, GLfloat clamp)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -4476,6 +4672,154 @@
    }
 }
 
+static void GLAPIENTRY
+save_ViewportIndexedf(GLuint index, GLfloat x, GLfloat y, GLfloat width,
+                      GLfloat height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_VIEWPORT_INDEXED_F, 5);
+   if (n) {
+      n[1].ui = index;
+      n[2].f = x;
+      n[3].f = y;
+      n[4].f = width;
+      n[5].f = height;
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ViewportIndexedf(ctx->Exec, (index, x, y, width, height));
+   }
+}
+
+static void GLAPIENTRY
+save_ViewportIndexedfv(GLuint index, const GLfloat *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_VIEWPORT_INDEXED_FV, 5);
+   if (n) {
+      n[1].ui = index;
+      n[2].f = v[0];
+      n[3].f = v[1];
+      n[4].f = v[2];
+      n[5].f = v[3];
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ViewportIndexedfv(ctx->Exec, (index, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ViewportArrayv(GLuint first, GLsizei count, const GLfloat *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_VIEWPORT_ARRAY_V, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = first;
+      n[2].si = count;   /* the copy below takes 4 floats per entry */
+      save_pointer(&n[3], memdup(v, count * 4 * sizeof(GLfloat)));
+   }
+   if (ctx->ExecuteFlag) {   /* forwarded with the caller's array, not the copy */
+      CALL_ViewportArrayv(ctx->Exec, (first, count, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ScissorIndexed(GLuint index, GLint left, GLint bottom, GLsizei width,
+                    GLsizei height)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_SCISSOR_INDEXED, 5);
+   if (n) {
+      n[1].ui = index;
+      n[2].i = left;
+      n[3].i = bottom;
+      n[4].si = width;
+      n[5].si = height;
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ScissorIndexed(ctx->Exec, (index, left, bottom, width, height));
+   }
+}
+
+static void GLAPIENTRY
+save_ScissorIndexedv(GLuint index, const GLint *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_SCISSOR_INDEXED_V, 5);
+   if (n) {
+      n[1].ui = index;
+      n[2].i = v[0];
+      n[3].i = v[1];
+      n[4].si = v[2];
+      n[5].si = v[3];
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ScissorIndexedv(ctx->Exec, (index, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ScissorArrayv(GLuint first, GLsizei count, const GLint *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_SCISSOR_ARRAY_V, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = first;
+      n[2].si = count;   /* the copy below takes 4 ints per entry */
+      save_pointer(&n[3], memdup(v, count * 4 * sizeof(GLint)));
+   }
+   if (ctx->ExecuteFlag) {   /* forwarded with the caller's array, not the copy */
+      CALL_ScissorArrayv(ctx->Exec, (first, count, v));
+   }
+}
+
+static void GLAPIENTRY
+save_DepthRangeIndexed(GLuint index, GLclampd n, GLclampd f)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *node;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   node = alloc_instruction(ctx, OPCODE_DEPTH_INDEXED, 3);
+   if (node) {
+      node[1].ui = index;
+      /* Mesa stores these as floats internally so we deliberately convert
+       * them to a float here.
+       */
+      node[2].f = n;
+      node[3].f = f;
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_DepthRangeIndexed(ctx->Exec, (index, n, f));
+   }
+}
+
+static void GLAPIENTRY
+save_DepthRangeArrayv(GLuint first, GLsizei count, const GLclampd *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_DEPTH_ARRAY_V, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = first;
+      n[2].si = count;   /* the copy below takes 2 GLclampd per entry */
+      save_pointer(&n[3], memdup(v, count * 2 * sizeof(GLclampd)));
+   }
+   if (ctx->ExecuteFlag) {   /* forwarded with the caller's array, not the copy */
+      CALL_DepthRangeArrayv(ctx->Exec, (first, count, v));
+   }
+}
 
 static void GLAPIENTRY
 save_WindowPos4fMESA(GLfloat x, GLfloat y, GLfloat z, GLfloat w)
@@ -6077,6 +6421,152 @@
 }
 
 static void GLAPIENTRY
+save_VertexAttribL1d(GLuint index, GLdouble x)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* Record glVertexAttribL1d in the list being compiled. */
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS) {
+      Node *n;
+      SAVE_FLUSH_VERTICES(ctx);
+      n = alloc_instruction(ctx, OPCODE_ATTR_1D, 3);
+      if (n) {
+         n[1].ui = index;
+         ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+      }
+      /* Copy from the argument: n is NULL when no node was allocated. */
+      ctx->ListState.ActiveAttribSize[index] = 1;
+      memcpy(ctx->ListState.CurrentAttrib[index], &x, sizeof(GLdouble));
+
+      if (ctx->ExecuteFlag) {
+         CALL_VertexAttribL1d(ctx->Exec, (index, x));
+      }
+   } else {
+      index_error();
+   }
+}
+
+static void GLAPIENTRY
+save_VertexAttribL1dv(GLuint index, const GLdouble *v)
+{
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      save_VertexAttribL1d(index, v[0]);
+   else
+      index_error();
+}
+
+static void GLAPIENTRY
+save_VertexAttribL2d(GLuint index, GLdouble x, GLdouble y)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* Record glVertexAttribL2d in the list being compiled. */
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS) {
+      const GLdouble value[2] = { x, y };
+      Node *n;
+      SAVE_FLUSH_VERTICES(ctx);
+      n = alloc_instruction(ctx, OPCODE_ATTR_2D, 5);
+      if (n) {
+         n[1].ui = index;
+         ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+         ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+      }
+      /* Copy from the arguments: n is NULL when no node was allocated. */
+      ctx->ListState.ActiveAttribSize[index] = 2;
+      memcpy(ctx->ListState.CurrentAttrib[index], value, sizeof(value));
+
+      if (ctx->ExecuteFlag) {
+         CALL_VertexAttribL2d(ctx->Exec, (index, x, y));
+      }
+   } else {
+      index_error();
+   }
+}
+
+static void GLAPIENTRY
+save_VertexAttribL2dv(GLuint index, const GLdouble *v)
+{
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      save_VertexAttribL2d(index, v[0], v[1]);
+   else
+      index_error();
+}
+
+static void GLAPIENTRY
+save_VertexAttribL3d(GLuint index, GLdouble x, GLdouble y, GLdouble z)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* Record glVertexAttribL3d in the list being compiled. */
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS) {
+      const GLdouble value[3] = { x, y, z };
+      Node *n;
+      SAVE_FLUSH_VERTICES(ctx);
+      n = alloc_instruction(ctx, OPCODE_ATTR_3D, 7);
+      if (n) {
+         n[1].ui = index;
+         ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+         ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+         ASSIGN_DOUBLE_TO_NODES(n, 6, z);
+      }
+      /* Copy from the arguments: n is NULL when no node was allocated. */
+      ctx->ListState.ActiveAttribSize[index] = 3;
+      memcpy(ctx->ListState.CurrentAttrib[index], value, sizeof(value));
+
+      if (ctx->ExecuteFlag) {
+         CALL_VertexAttribL3d(ctx->Exec, (index, x, y, z));
+      }
+   } else {
+      index_error();
+   }
+}
+
+static void GLAPIENTRY
+save_VertexAttribL3dv(GLuint index, const GLdouble *v)
+{
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      save_VertexAttribL3d(index, v[0], v[1], v[2]);
+   else
+      index_error();
+}
+
+static void GLAPIENTRY
+save_VertexAttribL4d(GLuint index, GLdouble x, GLdouble y, GLdouble z,
+                       GLdouble w)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* Record glVertexAttribL4d in the list being compiled. */
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS) {
+      const GLdouble value[4] = { x, y, z, w };
+      Node *n;
+      SAVE_FLUSH_VERTICES(ctx);
+      n = alloc_instruction(ctx, OPCODE_ATTR_4D, 9);
+      if (n) {
+         n[1].ui = index;
+         ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+         ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+         ASSIGN_DOUBLE_TO_NODES(n, 6, z);
+         ASSIGN_DOUBLE_TO_NODES(n, 8, w);
+      }
+      /* Copy from the arguments: n is NULL when no node was allocated. */
+      ctx->ListState.ActiveAttribSize[index] = 4;
+      memcpy(ctx->ListState.CurrentAttrib[index], value, sizeof(value));
+
+      if (ctx->ExecuteFlag) {
+         CALL_VertexAttribL4d(ctx->Exec, (index, x, y, z, w));
+      }
+   } else {
+      index_error();
+   }
+}
+
+static void GLAPIENTRY
+save_VertexAttribL4dv(GLuint index, const GLdouble *v)
+{
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      save_VertexAttribL4d(index, v[0], v[1], v[2], v[3]);
+   else
+      index_error();
+}
+
+static void GLAPIENTRY
 save_PrimitiveRestartNV(void)
 {
    /* Note: this is used when outside a glBegin/End pair in a display list */
@@ -6275,6 +6765,33 @@
 }
 
 static void GLAPIENTRY
+save_DispatchCompute(GLuint num_groups_x, GLuint num_groups_y,
+                     GLuint num_groups_z)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_DISPATCH_COMPUTE, 3);
+   if (n) {
+      n[1].ui = num_groups_x;
+      n[2].ui = num_groups_y;
+      n[3].ui = num_groups_z;
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_DispatchCompute(ctx->Exec, (num_groups_x, num_groups_y,
+                                       num_groups_z));
+   }
+}
+
+static void GLAPIENTRY
+save_DispatchComputeIndirect(UNUSED GLintptr indirect)
+{
+   GET_CURRENT_CONTEXT(ctx);   /* nothing is saved; only an error is raised */
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "glDispatchComputeIndirect() during display list compile");
+}
+
+static void GLAPIENTRY
 save_UseProgram(GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -6434,6 +6951,152 @@
 
 
 static void GLAPIENTRY
+save_Uniform1d(GLint location, GLdouble x)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_1D, 3);
+   if (n) {
+      n[1].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform1d(ctx->Exec, (location, x));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform2d(GLint location, GLdouble x, GLdouble y)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_2D, 5);
+   if (n) {
+      n[1].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform2d(ctx->Exec, (location, x, y));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform3d(GLint location, GLdouble x, GLdouble y, GLdouble z)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_3D, 7);
+   if (n) {
+      n[1].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+      ASSIGN_DOUBLE_TO_NODES(n, 6, z);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform3d(ctx->Exec, (location, x, y, z));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform4d(GLint location, GLdouble x, GLdouble y, GLdouble z, GLdouble w)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_4D, 9);
+   if (n) {
+      n[1].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 2, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 4, y);
+      ASSIGN_DOUBLE_TO_NODES(n, 6, z);
+      ASSIGN_DOUBLE_TO_NODES(n, 8, w);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform4d(ctx->Exec, (location, x, y, z, w));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform1dv(GLint location, GLsizei count, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_1DV, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      save_pointer(&n[3], memdup(v, count * 1 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform1dv(ctx->Exec, (location, count, v));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform2dv(GLint location, GLsizei count, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_2DV, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      save_pointer(&n[3], memdup(v, count * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform2dv(ctx->Exec, (location, count, v));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform3dv(GLint location, GLsizei count, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_3DV, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      save_pointer(&n[3], memdup(v, count * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform3dv(ctx->Exec, (location, count, v));
+   }
+}
+
+
+static void GLAPIENTRY
+save_Uniform4dv(GLint location, GLsizei count, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_4DV, 2 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      save_pointer(&n[3], memdup(v, count * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_Uniform4dv(ctx->Exec, (location, count, v));
+   }
+}
+
+
+static void GLAPIENTRY
 save_Uniform1iARB(GLint location, GLint x)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -6891,6 +7554,184 @@
    }
 }
 
+
+static void GLAPIENTRY
+save_UniformMatrix2dv(GLint location, GLsizei count, GLboolean transpose,
+                      const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX22D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 2 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix2dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+static void GLAPIENTRY
+save_UniformMatrix3dv(GLint location, GLsizei count, GLboolean transpose,
+                      const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX33D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 3 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix3dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+static void GLAPIENTRY
+save_UniformMatrix4dv(GLint location, GLsizei count, GLboolean transpose,
+                      const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX44D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 4 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix4dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
+static void GLAPIENTRY
+save_UniformMatrix2x3dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX23D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 2 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix2x3dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
+static void GLAPIENTRY
+save_UniformMatrix3x2dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX32D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 3 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix3x2dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
+static void GLAPIENTRY
+save_UniformMatrix2x4dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX24D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 2 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix2x4dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+static void GLAPIENTRY
+save_UniformMatrix4x2dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX42D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 4 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix4x2dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
+static void GLAPIENTRY
+save_UniformMatrix3x4dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX34D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 3 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix3x4dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
+static void GLAPIENTRY
+save_UniformMatrix4x3dv(GLint location, GLsizei count, GLboolean transpose,
+                        const GLdouble *m)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_MATRIX43D, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].i = location;
+      n[2].i = count;
+      n[3].b = transpose;
+      save_pointer(&n[4], memdup(m, count * 4 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformMatrix4x3dv(ctx->Exec, (location, count, transpose, m));
+   }
+}
+
+
 static void GLAPIENTRY
 save_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 {
@@ -7061,6 +7902,158 @@
 }
 
 static void GLAPIENTRY
+save_ProgramUniform1d(GLuint program, GLint location, GLdouble x)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1D, 4);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 3, x);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform1d(ctx->Exec, (program, location, x));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform2d(GLuint program, GLint location, GLdouble x, GLdouble y)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2D, 6);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 3, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 5, y);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform2d(ctx->Exec, (program, location, x, y));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform3d(GLuint program, GLint location,
+                      GLdouble x, GLdouble y, GLdouble z)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3D, 8);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 3, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 5, y);
+      ASSIGN_DOUBLE_TO_NODES(n, 7, z);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform3d(ctx->Exec, (program, location, x, y, z));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform4d(GLuint program, GLint location,
+                      GLdouble x, GLdouble y, GLdouble z, GLdouble w)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4D, 10);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      ASSIGN_DOUBLE_TO_NODES(n, 3, x);
+      ASSIGN_DOUBLE_TO_NODES(n, 5, y);
+      ASSIGN_DOUBLE_TO_NODES(n, 7, z);
+      ASSIGN_DOUBLE_TO_NODES(n, 9, w);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform4d(ctx->Exec, (program, location, x, y, z, w));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform1dv(GLuint program, GLint location, GLsizei count,
+                       const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_1DV, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      save_pointer(&n[4], memdup(v, count * 1 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform1dv(ctx->Exec, (program, location, count, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform2dv(GLuint program, GLint location, GLsizei count,
+                       const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_2DV, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      save_pointer(&n[4], memdup(v, count * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform2dv(ctx->Exec, (program, location, count, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform3dv(GLuint program, GLint location, GLsizei count,
+                       const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_3DV, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      save_pointer(&n[4], memdup(v, count * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform3dv(ctx->Exec, (program, location, count, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniform4dv(GLuint program, GLint location, GLsizei count,
+                       const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_4DV, 3 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      save_pointer(&n[4], memdup(v, count * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniform4dv(ctx->Exec, (program, location, count, v));
+   }
+}
+
+static void GLAPIENTRY
 save_ProgramUniform1i(GLuint program, GLint location, GLint x)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -7563,6 +8556,204 @@
 }
 
 static void GLAPIENTRY
+save_ProgramUniformMatrix2dv(GLuint program, GLint location, GLsizei count,
+                             GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX22D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 2 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix2dv(ctx->Exec,
+                                   (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix2x3dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX23D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 2 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix2x3dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix2x4dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX24D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 2 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix2x4dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix3x2dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX32D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 3 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix3x2dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix3dv(GLuint program, GLint location, GLsizei count,
+                             GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX33D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 3 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix3dv(ctx->Exec,
+                                   (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix3x4dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX34D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 3 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix3x4dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix4x2dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX42D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 4 * 2 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix4x2dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count,
+                               GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX43D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 4 * 3 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix4x3dv(ctx->Exec,
+                                     (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
+save_ProgramUniformMatrix4dv(GLuint program, GLint location, GLsizei count,
+                             GLboolean transpose, const GLdouble *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_PROGRAM_UNIFORM_MATRIX44D,
+                         4 + POINTER_DWORDS);
+   if (n) {
+      n[1].ui = program;
+      n[2].i = location;
+      n[3].i = count;
+      n[4].b = transpose;
+      save_pointer(&n[5], memdup(v, count * 4 * 4 * sizeof(GLdouble)));
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_ProgramUniformMatrix4dv(ctx->Exec,
+                                   (program, location, count, transpose, v));
+   }
+}
+
+static void GLAPIENTRY
 save_ClipControl(GLenum origin, GLenum depth)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -7906,6 +9097,28 @@
    }
 }
 
+/** GL_ARB_shader_subroutine: save UniformSubroutinesuiv and its index array. */
+static void GLAPIENTRY
+save_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
+                           const GLuint *indices)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_UNIFORM_SUBROUTINES, 2 + POINTER_DWORDS);
+   if (n) {
+      GLuint *indices_copy = NULL;   /* left NULL when count <= 0 */
+      if (count > 0)   /* indices[] holds count GLuints, so copy just those */
+         indices_copy = memdup(indices, sizeof(GLuint) * count);
+      n[1].e = shadertype;
+      n[2].si = count;
+      save_pointer(&n[3], indices_copy);
+   }
+   if (ctx->ExecuteFlag) {
+      CALL_UniformSubroutinesuiv(ctx->Exec, (shadertype, count, indices));
+   }
+}
+
 /** GL_EXT_window_rectangles */
 static void GLAPIENTRY
 save_WindowRectanglesEXT(GLenum mode, GLsizei count, const GLint *box)
@@ -7928,6 +9141,59 @@
    }
 }
 
+
+/** GL_NV_conservative_raster: save SubpixelPrecisionBiasNV(xbits, ybits). */
+static void GLAPIENTRY
+save_SubpixelPrecisionBiasNV(GLuint xbits, GLuint ybits)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_SUBPIXEL_PRECISION_BIAS, 2);
+   if (n) {   /* nothing is stored when no node was returned */
+      n[1].ui = xbits;
+      n[2].ui = ybits;
+   }
+   if (ctx->ExecuteFlag) {   /* also forward the call to ctx->Exec */
+      CALL_SubpixelPrecisionBiasNV(ctx->Exec, (xbits, ybits));
+   }
+}
+
+/** GL_NV_conservative_raster_dilate: save ConservativeRasterParameterfNV. */
+static void GLAPIENTRY
+save_ConservativeRasterParameterfNV(GLenum pname, GLfloat param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_CONSERVATIVE_RASTER_PARAMETER_F, 2);
+   if (n) {   /* nothing is stored when no node was returned */
+      n[1].e = pname;
+      n[2].f = param;   /* float parameter kept in the .f member */
+   }
+   if (ctx->ExecuteFlag) {   /* also forward the call to ctx->Exec */
+      CALL_ConservativeRasterParameterfNV(ctx->Exec, (pname, param));
+   }
+}
+
+/** GL_NV_conservative_raster_pre_snap_triangles: save the integer setter. */
+static void GLAPIENTRY
+save_ConservativeRasterParameteriNV(GLenum pname, GLint param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   Node *n;
+   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
+   n = alloc_instruction(ctx, OPCODE_CONSERVATIVE_RASTER_PARAMETER_I, 2);
+   if (n) {   /* nothing is stored when no node was returned */
+      n[1].e = pname;
+      n[2].i = param;   /* integer parameter kept in the .i member */
+   }
+   if (ctx->ExecuteFlag) {   /* also forward the call to ctx->Exec */
+      CALL_ConservativeRasterParameteriNV(ctx->Exec, (pname, param));
+   }
+}
+
+
 /**
  * Save an error-generating command into display list.
  *
@@ -8347,6 +9613,27 @@
          case OPCODE_PASSTHROUGH:
             CALL_PassThrough(ctx->Exec, (n[1].f));
             break;
+         case OPCODE_PATCH_PARAMETER_I:
+            CALL_PatchParameteri(ctx->Exec, (n[1].e, n[2].i));
+            break;
+         case OPCODE_PATCH_PARAMETER_FV_INNER:
+            {
+               GLfloat params[2];
+               params[0] = n[2].f;
+               params[1] = n[3].f;
+               CALL_PatchParameterfv(ctx->Exec, (n[1].e, params));
+            }
+            break;
+         case OPCODE_PATCH_PARAMETER_FV_OUTER:
+            {
+               GLfloat params[4];
+               params[0] = n[2].f;
+               params[1] = n[3].f;
+               params[2] = n[4].f;
+               params[3] = n[5].f;
+               CALL_PatchParameterfv(ctx->Exec, (n[1].e, params));
+            }
+            break;
          case OPCODE_PIXEL_MAP:
             CALL_PixelMapfv(ctx->Exec,
                             (n[1].e, n[2].i, get_pointer(&n[3])));
@@ -8568,6 +9855,47 @@
          case OPCODE_WINDOW_POS:
             CALL_WindowPos4fMESA(ctx->Exec, (n[1].f, n[2].f, n[3].f, n[4].f));
             break;
+         case OPCODE_VIEWPORT_ARRAY_V:
+            CALL_ViewportArrayv(ctx->Exec, (n[1].ui, n[2].si,
+                                            get_pointer(&n[3])));
+            break;
+         case OPCODE_VIEWPORT_INDEXED_F:
+            CALL_ViewportIndexedf(ctx->Exec, (n[1].ui, n[2].f, n[3].f, n[4].f,
+                                              n[5].f));
+            break;
+         case OPCODE_VIEWPORT_INDEXED_FV: {
+            GLfloat v[4];
+            v[0] = n[2].f;
+            v[1] = n[3].f;
+            v[2] = n[4].f;
+            v[3] = n[5].f;
+            CALL_ViewportIndexedfv(ctx->Exec, (n[1].ui, v));
+            break;
+         }
+         case OPCODE_SCISSOR_ARRAY_V:
+            CALL_ScissorArrayv(ctx->Exec, (n[1].ui, n[2].si,
+                                           get_pointer(&n[3])));
+            break;
+         case OPCODE_SCISSOR_INDEXED:
+            CALL_ScissorIndexed(ctx->Exec, (n[1].ui, n[2].i, n[3].i, n[4].si,
+                                            n[5].si));
+            break;
+         case OPCODE_SCISSOR_INDEXED_V: {
+            GLint v[4];
+            v[0] = n[2].i;
+            v[1] = n[3].i;
+            v[2] = n[4].si;
+            v[3] = n[5].si;
+            CALL_ScissorIndexedv(ctx->Exec, (n[1].ui, v));
+            break;
+         }
+         case OPCODE_DEPTH_ARRAY_V:
+            CALL_DepthRangeArrayv(ctx->Exec, (n[1].ui, n[2].si,
+                                              get_pointer(&n[3])));
+            break;
+         case OPCODE_DEPTH_INDEXED:
+            CALL_DepthRangeIndexed(ctx->Exec, (n[1].ui, n[2].f, n[3].f));
+            break;
          case OPCODE_ACTIVE_TEXTURE:   /* GL_ARB_multitexture */
             CALL_ActiveTexture(ctx->Exec, (n[1].e));
             break;
@@ -8697,6 +10025,72 @@
          case OPCODE_UNIFORM_4FV:
             CALL_Uniform4fv(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3])));
             break;
+         case OPCODE_UNIFORM_1D: {
+            union float64_pair x;
+
+            x.uint32[0] = n[2].ui;
+            x.uint32[1] = n[3].ui;
+
+            CALL_Uniform1d(ctx->Exec, (n[1].i, x.d));
+            break;
+         }
+         case OPCODE_UNIFORM_2D: {
+            union float64_pair x;
+            union float64_pair y;
+
+            x.uint32[0] = n[2].ui;
+            x.uint32[1] = n[3].ui;
+            y.uint32[0] = n[4].ui;
+            y.uint32[1] = n[5].ui;
+
+            CALL_Uniform2d(ctx->Exec, (n[1].i, x.d, y.d));
+            break;
+         }
+         case OPCODE_UNIFORM_3D: {
+            union float64_pair x;
+            union float64_pair y;
+            union float64_pair z;
+
+            x.uint32[0] = n[2].ui;
+            x.uint32[1] = n[3].ui;
+            y.uint32[0] = n[4].ui;
+            y.uint32[1] = n[5].ui;
+            z.uint32[0] = n[6].ui;
+            z.uint32[1] = n[7].ui;
+
+            CALL_Uniform3d(ctx->Exec, (n[1].i, x.d, y.d, z.d));
+            break;
+         }
+         case OPCODE_UNIFORM_4D: {
+            union float64_pair x;
+            union float64_pair y;
+            union float64_pair z;
+            union float64_pair w;
+
+            x.uint32[0] = n[2].ui;
+            x.uint32[1] = n[3].ui;
+            y.uint32[0] = n[4].ui;
+            y.uint32[1] = n[5].ui;
+            z.uint32[0] = n[6].ui;
+            z.uint32[1] = n[7].ui;
+            w.uint32[0] = n[8].ui;
+            w.uint32[1] = n[9].ui;
+
+            CALL_Uniform4d(ctx->Exec, (n[1].i, x.d, y.d, z.d, w.d));
+            break;
+         }
+         case OPCODE_UNIFORM_1DV:
+            CALL_Uniform1dv(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3])));
+            break;
+         case OPCODE_UNIFORM_2DV:
+            CALL_Uniform2dv(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3])));
+            break;
+         case OPCODE_UNIFORM_3DV:
+            CALL_Uniform3dv(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3])));
+            break;
+         case OPCODE_UNIFORM_4DV:
+            CALL_Uniform4dv(ctx->Exec, (n[1].i, n[2].i, get_pointer(&n[3])));
+            break;
          case OPCODE_UNIFORM_1I:
             CALL_Uniform1i(ctx->Exec, (n[1].i, n[2].i));
             break;
@@ -8783,6 +10177,42 @@
             CALL_UniformMatrix4x3fv(ctx->Exec,
                                     (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
             break;
+         case OPCODE_UNIFORM_MATRIX22D:
+            CALL_UniformMatrix2dv(ctx->Exec,
+                                  (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX33D:
+            CALL_UniformMatrix3dv(ctx->Exec,
+                                  (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX44D:
+            CALL_UniformMatrix4dv(ctx->Exec,
+                                  (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX23D:
+            CALL_UniformMatrix2x3dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX32D:
+            CALL_UniformMatrix3x2dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX24D:
+            CALL_UniformMatrix2x4dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX42D:
+            CALL_UniformMatrix4x2dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX34D:
+            CALL_UniformMatrix3x4dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
+         case OPCODE_UNIFORM_MATRIX43D:
+            CALL_UniformMatrix4x3dv(ctx->Exec,
+                                    (n[1].i, n[2].i, n[3].b, get_pointer(&n[4])));
+            break;
 
          case OPCODE_USE_PROGRAM_STAGES:
             CALL_UseProgramStages(ctx->Exec, (n[1].ui, n[2].ui, n[3].ui));
@@ -8817,6 +10247,78 @@
             CALL_ProgramUniform4fv(ctx->Exec, (n[1].ui, n[2].i, n[3].i,
                                                get_pointer(&n[4])));
             break;
+         case OPCODE_PROGRAM_UNIFORM_1D: {
+            union float64_pair x;
+
+            x.uint32[0] = n[3].ui;
+            x.uint32[1] = n[4].ui;
+
+            CALL_ProgramUniform1d(ctx->Exec, (n[1].ui, n[2].i, x.d));
+            break;
+         }
+         case OPCODE_PROGRAM_UNIFORM_2D: {
+            union float64_pair x;
+            union float64_pair y;
+
+            x.uint32[0] = n[3].ui;
+            x.uint32[1] = n[4].ui;
+            y.uint32[0] = n[5].ui;
+            y.uint32[1] = n[6].ui;
+
+            CALL_ProgramUniform2d(ctx->Exec, (n[1].ui, n[2].i, x.d, y.d));
+            break;
+         }
+         case OPCODE_PROGRAM_UNIFORM_3D: {
+            union float64_pair x;
+            union float64_pair y;
+            union float64_pair z;
+
+            x.uint32[0] = n[3].ui;
+            x.uint32[1] = n[4].ui;
+            y.uint32[0] = n[5].ui;
+            y.uint32[1] = n[6].ui;
+            z.uint32[0] = n[7].ui;
+            z.uint32[1] = n[8].ui;
+
+            CALL_ProgramUniform3d(ctx->Exec, (n[1].ui, n[2].i,
+                                              x.d, y.d, z.d));
+            break;
+         }
+         case OPCODE_PROGRAM_UNIFORM_4D: {
+            union float64_pair x;
+            union float64_pair y;
+            union float64_pair z;
+            union float64_pair w;
+
+            x.uint32[0] = n[3].ui;
+            x.uint32[1] = n[4].ui;
+            y.uint32[0] = n[5].ui;
+            y.uint32[1] = n[6].ui;
+            z.uint32[0] = n[7].ui;
+            z.uint32[1] = n[8].ui;
+            w.uint32[0] = n[9].ui;
+            w.uint32[1] = n[10].ui;
+
+            CALL_ProgramUniform4d(ctx->Exec, (n[1].ui, n[2].i,
+                                              x.d, y.d, z.d, w.d));
+            break;
+         }
+         case OPCODE_PROGRAM_UNIFORM_1DV:
+            CALL_ProgramUniform1dv(ctx->Exec, (n[1].ui, n[2].i, n[3].i,
+                                               get_pointer(&n[4])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_2DV:
+            CALL_ProgramUniform2dv(ctx->Exec, (n[1].ui, n[2].i, n[3].i,
+                                               get_pointer(&n[4])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_3DV:
+            CALL_ProgramUniform3dv(ctx->Exec, (n[1].ui, n[2].i, n[3].i,
+                                               get_pointer(&n[4])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_4DV:
+            CALL_ProgramUniform4dv(ctx->Exec, (n[1].ui, n[2].i, n[3].i,
+                                               get_pointer(&n[4])));
+            break;
          case OPCODE_PROGRAM_UNIFORM_1I:
             CALL_ProgramUniform1i(ctx->Exec, (n[1].ui, n[2].i, n[3].i));
             break;
@@ -8924,6 +10426,51 @@
                                          (n[1].ui, n[2].i, n[3].i, n[4].b,
                                           get_pointer(&n[5])));
             break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX22D:
+            CALL_ProgramUniformMatrix2dv(ctx->Exec,
+                                         (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                          get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX23D:
+            CALL_ProgramUniformMatrix2x3dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX24D:
+            CALL_ProgramUniformMatrix2x4dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX32D:
+            CALL_ProgramUniformMatrix3x2dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX33D:
+            CALL_ProgramUniformMatrix3dv(ctx->Exec,
+                                         (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                          get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX34D:
+            CALL_ProgramUniformMatrix3x4dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX42D:
+            CALL_ProgramUniformMatrix4x2dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX43D:
+            CALL_ProgramUniformMatrix4x3dv(ctx->Exec,
+                                           (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                            get_pointer(&n[5])));
+            break;
+         case OPCODE_PROGRAM_UNIFORM_MATRIX44D:
+            CALL_ProgramUniformMatrix4dv(ctx->Exec,
+                                         (n[1].ui, n[2].i, n[3].i, n[4].b,
+                                          get_pointer(&n[5])));
+            break;
 
          case OPCODE_CLIP_CONTROL:
             CALL_ClipControl(ctx->Exec, (n[1].e, n[2].e));
@@ -8963,6 +10510,26 @@
          case OPCODE_ATTR_4F_ARB:
             CALL_VertexAttrib4fvARB(ctx->Exec, (n[1].e, &n[2].f));
             break;
+         case OPCODE_ATTR_1D: {
+            GLdouble *d = (GLdouble *) &n[2];
+            CALL_VertexAttribL1d(ctx->Exec, (n[1].ui, *d));
+            break;
+         }
+         case OPCODE_ATTR_2D: {
+            GLdouble *d = (GLdouble *) &n[2];
+            CALL_VertexAttribL2dv(ctx->Exec, (n[1].ui, d));
+            break;
+         }
+         case OPCODE_ATTR_3D: {
+            GLdouble *d = (GLdouble *) &n[2];
+            CALL_VertexAttribL3dv(ctx->Exec, (n[1].ui, d));
+            break;
+         }
+         case OPCODE_ATTR_4D: {
+            GLdouble *d = (GLdouble *) &n[2];
+            CALL_VertexAttribL4dv(ctx->Exec, (n[1].ui, d));
+            break;
+         }
          case OPCODE_MATERIAL:
             CALL_Materialfv(ctx->Exec, (n[1].e, n[2].e, &n[3].f));
             break;
@@ -9103,6 +10670,11 @@
             }
             break;
 
+         /* ARB_compute_shader */
+         case OPCODE_DISPATCH_COMPUTE:
+            CALL_DispatchCompute(ctx->Exec, (n[1].ui, n[2].ui, n[3].ui));
+            break;
+
          /* GL_ARB_sync */
          case OPCODE_WAIT_SYNC:
             {
@@ -9126,12 +10698,32 @@
             CALL_UniformBlockBinding(ctx->Exec, (n[1].ui, n[2].ui, n[3].ui));
             break;
 
+         case OPCODE_UNIFORM_SUBROUTINES:
+            CALL_UniformSubroutinesuiv(ctx->Exec, (n[1].e, n[2].si,
+                                                   get_pointer(&n[3])));
+            break;
+
          /* GL_EXT_window_rectangles */
          case OPCODE_WINDOW_RECTANGLES:
             CALL_WindowRectanglesEXT(
                   ctx->Exec, (n[1].e, n[2].si, get_pointer(&n[3])));
             break;
 
+         /* GL_NV_conservative_raster */
+         case OPCODE_SUBPIXEL_PRECISION_BIAS:
+            CALL_SubpixelPrecisionBiasNV(ctx->Exec, (n[1].ui, n[2].ui));
+            break;
+
+         /* GL_NV_conservative_raster_dilate */
+         case OPCODE_CONSERVATIVE_RASTER_PARAMETER_F:
+            CALL_ConservativeRasterParameterfNV(ctx->Exec, (n[1].e, n[2].f));
+            break;
+
+         /* GL_NV_conservative_raster_pre_snap_triangles */
+         case OPCODE_CONSERVATIVE_RASTER_PARAMETER_I:
+            CALL_ConservativeRasterParameteriNV(ctx->Exec, (n[1].e, n[2].i));
+            break;
+
          case OPCODE_CONTINUE:
             n = (Node *) get_pointer(&n[1]);
             break;
@@ -9762,9 +11354,6 @@
    SET_BlendColorEXT(table, save_BlendColorEXT);
 #endif
 
-   /* 3. GL_EXT_polygon_offset */
-   SET_PolygonOffsetEXT(table, save_PolygonOffsetEXT);
-
    /* 6. GL_EXT_texture3d */
 #if 0
    SET_CopyTexSubImage3DEXT(table, save_CopyTexSubImage3D);
@@ -9781,6 +11370,24 @@
    SET_PointParameterf(table, save_PointParameterfEXT);
    SET_PointParameterfv(table, save_PointParameterfvEXT);
 
+   /* 91. GL_ARB_tessellation_shader */
+   SET_PatchParameteri(table, save_PatchParameteri);
+   SET_PatchParameterfv(table, save_PatchParameterfv);
+
+   /* 100. ARB_viewport_array */
+   SET_ViewportArrayv(table, save_ViewportArrayv);
+   SET_ViewportIndexedf(table, save_ViewportIndexedf);
+   SET_ViewportIndexedfv(table, save_ViewportIndexedfv);
+   SET_ScissorArrayv(table, save_ScissorArrayv);
+   SET_ScissorIndexed(table, save_ScissorIndexed);
+   SET_ScissorIndexedv(table, save_ScissorIndexedv);
+   SET_DepthRangeArrayv(table, save_DepthRangeArrayv);
+   SET_DepthRangeIndexed(table, save_DepthRangeIndexed);
+
+   /* 122. ARB_compute_shader */
+   SET_DispatchCompute(table, save_DispatchCompute);
+   SET_DispatchComputeIndirect(table, save_DispatchComputeIndirect);
+
    /* 173. GL_EXT_blend_func_separate */
    SET_BlendFuncSeparate(table, save_BlendFuncSeparateEXT);
 
@@ -9935,6 +11542,25 @@
    SET_Uniform3uiv(table, save_Uniform3uiv);
    SET_Uniform4uiv(table, save_Uniform4uiv);
 
+   /* GL_ARB_gpu_shader_fp64 */
+   SET_Uniform1d(table, save_Uniform1d);
+   SET_Uniform2d(table, save_Uniform2d);
+   SET_Uniform3d(table, save_Uniform3d);
+   SET_Uniform4d(table, save_Uniform4d);
+   SET_Uniform1dv(table, save_Uniform1dv);
+   SET_Uniform2dv(table, save_Uniform2dv);
+   SET_Uniform3dv(table, save_Uniform3dv);
+   SET_Uniform4dv(table, save_Uniform4dv);
+   SET_UniformMatrix2dv(table, save_UniformMatrix2dv);
+   SET_UniformMatrix3dv(table, save_UniformMatrix3dv);
+   SET_UniformMatrix4dv(table, save_UniformMatrix4dv);
+   SET_UniformMatrix2x3dv(table, save_UniformMatrix2x3dv);
+   SET_UniformMatrix3x2dv(table, save_UniformMatrix3x2dv);
+   SET_UniformMatrix2x4dv(table, save_UniformMatrix2x4dv);
+   SET_UniformMatrix4x2dv(table, save_UniformMatrix4x2dv);
+   SET_UniformMatrix3x4dv(table, save_UniformMatrix3x4dv);
+   SET_UniformMatrix4x3dv(table, save_UniformMatrix4x3dv);
+
    /* These are: */
    SET_BeginTransformFeedback(table, save_BeginTransformFeedback);
    SET_EndTransformFeedback(table, save_EndTransformFeedback);
@@ -9980,6 +11606,9 @@
    /* GL_ARB_uniform_buffer_object */
    SET_UniformBlockBinding(table, save_UniformBlockBinding);
 
+   /* GL_ARB_shader_subroutine */
+   SET_UniformSubroutinesuiv(table, save_UniformSubroutinesuiv);
+
    /* GL_ARB_draw_instanced */
    SET_DrawArraysInstancedARB(table, save_DrawArraysInstancedARB);
    SET_DrawElementsInstancedARB(table, save_DrawElementsInstancedARB);
@@ -9992,6 +11621,12 @@
    SET_DrawElementsInstancedBaseInstance(table, save_DrawElementsInstancedBaseInstance);
    SET_DrawElementsInstancedBaseVertexBaseInstance(table, save_DrawElementsInstancedBaseVertexBaseInstance);
 
+   /* GL_ARB_draw_indirect / GL_ARB_multi_draw_indirect */
+   SET_DrawArraysIndirect(table, save_DrawArraysIndirect);
+   SET_DrawElementsIndirect(table, save_DrawElementsIndirect);
+   SET_MultiDrawArraysIndirect(table, save_MultiDrawArraysIndirect);
+   SET_MultiDrawElementsIndirect(table, save_MultiDrawElementsIndirect);
+
    /* OpenGL 4.2 / GL_ARB_separate_shader_objects */
    SET_UseProgramStages(table, save_UseProgramStages);
    SET_ProgramUniform1f(table, save_ProgramUniform1f);
@@ -10002,6 +11637,14 @@
    SET_ProgramUniform2fv(table, save_ProgramUniform2fv);
    SET_ProgramUniform3fv(table, save_ProgramUniform3fv);
    SET_ProgramUniform4fv(table, save_ProgramUniform4fv);
+   SET_ProgramUniform1d(table, save_ProgramUniform1d);
+   SET_ProgramUniform2d(table, save_ProgramUniform2d);
+   SET_ProgramUniform3d(table, save_ProgramUniform3d);
+   SET_ProgramUniform4d(table, save_ProgramUniform4d);
+   SET_ProgramUniform1dv(table, save_ProgramUniform1dv);
+   SET_ProgramUniform2dv(table, save_ProgramUniform2dv);
+   SET_ProgramUniform3dv(table, save_ProgramUniform3dv);
+   SET_ProgramUniform4dv(table, save_ProgramUniform4dv);
    SET_ProgramUniform1i(table, save_ProgramUniform1i);
    SET_ProgramUniform2i(table, save_ProgramUniform2i);
    SET_ProgramUniform3i(table, save_ProgramUniform3i);
@@ -10027,12 +11670,30 @@
    SET_ProgramUniformMatrix4x2fv(table, save_ProgramUniformMatrix4x2fv);
    SET_ProgramUniformMatrix3x4fv(table, save_ProgramUniformMatrix3x4fv);
    SET_ProgramUniformMatrix4x3fv(table, save_ProgramUniformMatrix4x3fv);
+   SET_ProgramUniformMatrix2dv(table, save_ProgramUniformMatrix2dv);
+   SET_ProgramUniformMatrix3dv(table, save_ProgramUniformMatrix3dv);
+   SET_ProgramUniformMatrix4dv(table, save_ProgramUniformMatrix4dv);
+   SET_ProgramUniformMatrix2x3dv(table, save_ProgramUniformMatrix2x3dv);
+   SET_ProgramUniformMatrix3x2dv(table, save_ProgramUniformMatrix3x2dv);
+   SET_ProgramUniformMatrix2x4dv(table, save_ProgramUniformMatrix2x4dv);
+   SET_ProgramUniformMatrix4x2dv(table, save_ProgramUniformMatrix4x2dv);
+   SET_ProgramUniformMatrix3x4dv(table, save_ProgramUniformMatrix3x4dv);
+   SET_ProgramUniformMatrix4x3dv(table, save_ProgramUniformMatrix4x3dv);
 
    /* GL_{ARB,EXT}_polygon_offset_clamp */
    SET_PolygonOffsetClampEXT(table, save_PolygonOffsetClampEXT);
 
    /* GL_EXT_window_rectangles */
    SET_WindowRectanglesEXT(table, save_WindowRectanglesEXT);
+
+   /* GL_NV_conservative_raster */
+   SET_SubpixelPrecisionBiasNV(table, save_SubpixelPrecisionBiasNV);
+
+   /* GL_NV_conservative_raster_dilate */
+   SET_ConservativeRasterParameterfNV(table, save_ConservativeRasterParameterfNV);
+
+   /* GL_NV_conservative_raster_pre_snap_triangles */
+   SET_ConservativeRasterParameteriNV(table, save_ConservativeRasterParameteriNV);
 }
 
 
@@ -10445,6 +12106,14 @@
    vfmt->VertexAttrib3fvARB = save_VertexAttrib3fvARB;
    vfmt->VertexAttrib4fARB = save_VertexAttrib4fARB;
    vfmt->VertexAttrib4fvARB = save_VertexAttrib4fvARB;
+   vfmt->VertexAttribL1d = save_VertexAttribL1d;
+   vfmt->VertexAttribL1dv = save_VertexAttribL1dv;
+   vfmt->VertexAttribL2d = save_VertexAttribL2d;
+   vfmt->VertexAttribL2dv = save_VertexAttribL2dv;
+   vfmt->VertexAttribL3d = save_VertexAttribL3d;
+   vfmt->VertexAttribL3dv = save_VertexAttribL3dv;
+   vfmt->VertexAttribL4d = save_VertexAttribL4d;
+   vfmt->VertexAttribL4dv = save_VertexAttribL4dv;
 
    vfmt->PrimitiveRestartNV = save_PrimitiveRestartNV;
 }
diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/draw_validate.c
similarity index 97%
rename from src/mesa/main/api_validate.c
rename to src/mesa/main/draw_validate.c
index 7b91fdf..29304bd 100644
--- a/src/mesa/main/api_validate.c
+++ b/src/mesa/main/draw_validate.c
@@ -24,7 +24,7 @@
 
 #include <stdbool.h>
 #include "glheader.h"
-#include "api_validate.h"
+#include "draw_validate.h"
 #include "arrayobj.h"
 #include "bufferobj.h"
 #include "context.h"
@@ -696,8 +696,6 @@
                             GLenum mode, GLsizei count, GLenum type,
                             const GLvoid *indices)
 {
-   FLUSH_CURRENT(ctx, 0);
-
    return validate_DrawElements_common(ctx, mode, count, type, indices,
                                        "glDrawElements");
 }
@@ -716,8 +714,6 @@
 {
    GLsizei i;
 
-   FLUSH_CURRENT(ctx, 0);
-
    /*
     * Section 2.3.1 (Errors) of the OpenGL 4.5 (Core Profile) spec says:
     *
@@ -780,8 +776,6 @@
                                  GLsizei count, GLenum type,
                                  const GLvoid *indices)
 {
-   FLUSH_CURRENT(ctx, 0);
-
    if (end < start) {
       _mesa_error(ctx, GL_INVALID_VALUE, "glDrawRangeElements(end<start)");
       return GL_FALSE;
@@ -895,8 +889,6 @@
 validate_draw_arrays(struct gl_context *ctx, const char *func,
                      GLenum mode, GLsizei count, GLsizei numInstances)
 {
-   FLUSH_CURRENT(ctx, 0);
-
    if (count < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(count)", func);
       return false;
@@ -971,8 +963,6 @@
 {
    int i;
 
-   FLUSH_CURRENT(ctx, 0);
-
    if (!_mesa_valid_prim_mode(ctx, mode, "glMultiDrawArrays"))
       return false;
 
@@ -1018,8 +1008,6 @@
                                      GLenum mode, GLsizei count, GLenum type,
                                      const GLvoid *indices, GLsizei numInstances)
 {
-   FLUSH_CURRENT(ctx, 0);
-
    if (numInstances < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glDrawElementsInstanced(numInstances=%d)", numInstances);
@@ -1039,8 +1027,6 @@
                                      GLuint stream,
                                      GLsizei numInstances)
 {
-   FLUSH_CURRENT(ctx, 0);
-
    if (!_mesa_valid_prim_mode(ctx, mode, "glDrawTransformFeedback*(mode)")) {
       return GL_FALSE;
    }
@@ -1099,7 +1085,8 @@
     *      structure,  be in buffer objects,  and may not be called when
     *      the default vertex array object is bound."
     */
-   if (ctx->Array.VAO == ctx->Array.DefaultVAO) {
+   if (ctx->API != API_OPENGL_COMPAT &&
+       ctx->Array.VAO == ctx->Array.DefaultVAO) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "(no VAO bound)");
       return GL_FALSE;
    }
@@ -1206,10 +1193,10 @@
    return valid_draw_indirect(ctx, mode, indirect, size, name);
 }
 
-static inline GLboolean
-valid_draw_indirect_multi(struct gl_context *ctx,
-                          GLsizei primcount, GLsizei stride,
-                          const char *name)
+GLboolean
+_mesa_valid_draw_indirect_multi(struct gl_context *ctx,
+                                GLsizei primcount, GLsizei stride,
+                                const char *name)
 {
 
    /* From the ARB_multi_draw_indirect specification:
@@ -1244,8 +1231,6 @@
 {
    const unsigned drawArraysNumParams = 4;
 
-   FLUSH_CURRENT(ctx, 0);
-
    return valid_draw_indirect(ctx, mode,
                               indirect, drawArraysNumParams * sizeof(GLuint),
                               "glDrawArraysIndirect");
@@ -1258,8 +1243,6 @@
 {
    const unsigned drawElementsNumParams = 5;
 
-   FLUSH_CURRENT(ctx, 0);
-
    return valid_draw_indirect_elements(ctx, mode, type,
                                        indirect, drawElementsNumParams * sizeof(GLuint),
                                        "glDrawElementsIndirect");
@@ -1274,13 +1257,11 @@
    GLsizeiptr size = 0;
    const unsigned drawArraysNumParams = 4;
 
-   FLUSH_CURRENT(ctx, 0);
-
    /* caller has converted stride==0 to drawArraysNumParams * sizeof(GLuint) */
    assert(stride != 0);
 
-   if (!valid_draw_indirect_multi(ctx, primcount, stride,
-                                  "glMultiDrawArraysIndirect"))
+   if (!_mesa_valid_draw_indirect_multi(ctx, primcount, stride,
+                                        "glMultiDrawArraysIndirect"))
       return GL_FALSE;
 
    /* number of bytes of the indirect buffer which will be read */
@@ -1304,13 +1285,11 @@
    GLsizeiptr size = 0;
    const unsigned drawElementsNumParams = 5;
 
-   FLUSH_CURRENT(ctx, 0);
-
    /* caller has converted stride==0 to drawElementsNumParams * sizeof(GLuint) */
    assert(stride != 0);
 
-   if (!valid_draw_indirect_multi(ctx, primcount, stride,
-                                  "glMultiDrawElementsIndirect"))
+   if (!_mesa_valid_draw_indirect_multi(ctx, primcount, stride,
+                                        "glMultiDrawElementsIndirect"))
       return GL_FALSE;
 
    /* number of bytes of the indirect buffer which will be read */
@@ -1385,13 +1364,11 @@
    GLsizeiptr size = 0;
    const unsigned drawArraysNumParams = 4;
 
-   FLUSH_CURRENT(ctx, 0);
-
    /* caller has converted stride==0 to drawArraysNumParams * sizeof(GLuint) */
    assert(stride != 0);
 
-   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
-                                  "glMultiDrawArraysIndirectCountARB"))
+   if (!_mesa_valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                        "glMultiDrawArraysIndirectCountARB"))
       return GL_FALSE;
 
    /* number of bytes of the indirect buffer which will be read */
@@ -1418,13 +1395,11 @@
    GLsizeiptr size = 0;
    const unsigned drawElementsNumParams = 5;
 
-   FLUSH_CURRENT(ctx, 0);
-
    /* caller has converted stride==0 to drawElementsNumParams * sizeof(GLuint) */
    assert(stride != 0);
 
-   if (!valid_draw_indirect_multi(ctx, maxdrawcount, stride,
-                                  "glMultiDrawElementsIndirectCountARB"))
+   if (!_mesa_valid_draw_indirect_multi(ctx, maxdrawcount, stride,
+                                        "glMultiDrawElementsIndirectCountARB"))
       return GL_FALSE;
 
    /* number of bytes of the indirect buffer which will be read */
diff --git a/src/mesa/main/api_validate.h b/src/mesa/main/draw_validate.h
similarity index 96%
rename from src/mesa/main/api_validate.h
rename to src/mesa/main/draw_validate.h
index 7a18115..d015c7e 100644
--- a/src/mesa/main/api_validate.h
+++ b/src/mesa/main/draw_validate.h
@@ -44,6 +44,9 @@
 extern GLboolean
 _mesa_valid_prim_mode(struct gl_context *ctx, GLenum mode, const char *name);
 
+extern GLboolean
+_mesa_valid_draw_indirect_multi(struct gl_context *ctx, GLsizei primcount,
+                                GLsizei stride, const char *name);
 
 extern GLboolean
 _mesa_validate_DrawArrays(struct gl_context *ctx, GLenum mode, GLsizei count);
diff --git a/src/mesa/main/drawpix.c b/src/mesa/main/drawpix.c
index 4bf14fc..2f55dde 100644
--- a/src/mesa/main/drawpix.c
+++ b/src/mesa/main/drawpix.c
@@ -24,7 +24,7 @@
 
 #include "glheader.h"
 #include "imports.h"
-#include "api_validate.h"
+#include "draw_validate.h"
 #include "bufferobj.h"
 #include "context.h"
 #include "drawpix.h"
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index 7625a4c..d1b2f3a 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -68,9 +68,9 @@
 vao_state(struct gl_context *ctx, gl_vert_attrib attr, GLboolean state)
 {
    if (state)
-      _mesa_enable_vertex_array_attrib(ctx, ctx->Array.VAO, attr, true);
+      _mesa_enable_vertex_array_attrib(ctx, ctx->Array.VAO, attr);
    else
-      _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attr, true);
+      _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attr);
 }
 
 
@@ -482,6 +482,16 @@
             ctx->DriverFlags.NewIntelConservativeRasterization;
          ctx->IntelConservativeRasterization = state;
          break;
+      case GL_CONSERVATIVE_RASTERIZATION_NV:
+         if (!_mesa_has_NV_conservative_raster(ctx))
+            goto invalid_enum_error;
+         if (ctx->ConservativeRasterization == state)
+            return;
+         FLUSH_VERTICES(ctx, 0);
+         ctx->NewDriverState |=
+            ctx->DriverFlags.NewNvConservativeRasterization;
+         ctx->ConservativeRasterization = state;
+         break;
       case GL_COLOR_LOGIC_OP:
          if (!_mesa_is_desktop_gl(ctx) && ctx->API != API_OPENGLES)
             goto invalid_enum_error;
@@ -1750,6 +1760,10 @@
          CHECK_EXTENSION(INTEL_conservative_rasterization);
          return ctx->IntelConservativeRasterization;
 
+      case GL_CONSERVATIVE_RASTERIZATION_NV:
+         CHECK_EXTENSION(NV_conservative_raster);
+         return ctx->ConservativeRasterization;
+
       case GL_TILE_RASTER_ORDER_FIXED_MESA:
          CHECK_EXTENSION(MESA_tile_raster_order);
          return ctx->TileRasterOrderFixed;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 38fc52d..4d95a07 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -129,9 +129,7 @@
    ctx->Extensions.ARB_texture_env_crossbar = GL_TRUE;
    ctx->Extensions.ARB_texture_env_dot3 = GL_TRUE;
    ctx->Extensions.ARB_texture_filter_anisotropic = GL_TRUE;
-#ifdef TEXTURE_FLOAT_ENABLED
    ctx->Extensions.ARB_texture_float = GL_TRUE;
-#endif
    ctx->Extensions.ARB_texture_mirror_clamp_to_edge = GL_TRUE;
    ctx->Extensions.ARB_texture_non_power_of_two = GL_TRUE;
    ctx->Extensions.ARB_texture_rg = GL_TRUE;
@@ -144,7 +142,6 @@
    ctx->Extensions.ATI_texture_compression_3dc = GL_TRUE;
    ctx->Extensions.ATI_texture_env_combine3 = GL_TRUE;
    ctx->Extensions.ATI_texture_mirror_once = GL_TRUE;
-   ctx->Extensions.ATI_separate_stencil = GL_TRUE;
    ctx->Extensions.EXT_blend_color = GL_TRUE;
    ctx->Extensions.EXT_blend_equation_separate = GL_TRUE;
    ctx->Extensions.EXT_blend_func_separate = GL_TRUE;
@@ -338,6 +335,30 @@
    return (ctx->Version >= ext->version[ctx->API]) && base[ext->offset];
 }
 
+/**
+ * qsort() comparator for the extension index array: orders entries of
+ * _mesa_extension_table by ascending year, then by strcmp() of their names.
+ *
+ * p1 and p2 point to extension_index values that index _mesa_extension_table.
+ */
+static int
+extension_compare(const void *p1, const void *p2)
+{
+   extension_index i1 = * (const extension_index *) p1;
+   extension_index i2 = * (const extension_index *) p2;
+   const struct mesa_extension *e1 = &_mesa_extension_table[i1];
+   const struct mesa_extension *e2 = &_mesa_extension_table[i2];
+   int res;
+
+   res = (int)e1->year - (int)e2->year;
+
+   if (res == 0) {
+      res = strcmp(e1->name, e2->name);
+   }
+
+   return res;
+}
+
 
 /**
  * Construct the GL_EXTENSIONS string.  Called the first time that
@@ -375,8 +396,8 @@
 
       if (i->year <= maxYear &&
           _mesa_extension_supported(ctx, k)) {
-         length += strlen(i->name) + 1; /* +1 for space */
-         extension_indices[count++] = k;
+	 length += strlen(i->name) + 1; /* +1 for space */
+	 ++count;
       }
    }
    for (k = 0; k < MAX_UNRECOGNIZED_EXTENSIONS; k++)
@@ -388,6 +409,24 @@
       return NULL;
    }
 
+   /* Sort extensions in chronological order because idTech 2/3 games
+    * (e.g., Quake3 demo) store the extension list in a fixed-size buffer.
+    * Some games truncate the list, while others overflow the buffer,
+    * resulting in misrendering and crashes, respectively.
+    * Address the former here; the latter is addressed by setting
+    * the MESA_EXTENSION_MAX_YEAR environment variable.
+    */
+   j = 0;
+   for (k = 0; k < MESA_EXTENSION_COUNT; ++k) {
+      if (_mesa_extension_table[k].year <= maxYear &&
+         _mesa_extension_supported(ctx, k)) {
+         extension_indices[j++] = k;
+      }
+   }
+   assert(j == count);
+   qsort(extension_indices, count,
+         sizeof *extension_indices, extension_compare);
+
    /* Build the extension string.*/
    for (j = 0; j < count; ++j) {
       const struct mesa_extension *i = &_mesa_extension_table[extension_indices[j]];
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 492f7c3..3f01896 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -10,13 +10,13 @@
 
 EXT(AMD_conservative_depth                  , ARB_conservative_depth                 , GLL, GLC,  x ,  x , 2009)
 EXT(AMD_draw_buffers_blend                  , ARB_draw_buffers_blend                 , GLL, GLC,  x ,  x , 2009)
-EXT(AMD_performance_monitor                 , AMD_performance_monitor                , GLL, GLC,  x ,  x , 2007)
+EXT(AMD_performance_monitor                 , AMD_performance_monitor                , GLL, GLC,  x , ES2, 2007)
 EXT(AMD_pinned_memory                       , AMD_pinned_memory                      , GLL, GLC,  x ,  x , 2013)
 EXT(AMD_seamless_cubemap_per_texture        , AMD_seamless_cubemap_per_texture       , GLL, GLC,  x ,  x , 2009)
 EXT(AMD_shader_stencil_export               , ARB_shader_stencil_export              , GLL, GLC,  x ,  x , 2009)
 EXT(AMD_shader_trinary_minmax               , dummy_true                             , GLL, GLC,  x ,  x , 2012)
-EXT(AMD_vertex_shader_layer                 , AMD_vertex_shader_layer                ,  x , GLC,  x ,  x , 2012)
-EXT(AMD_vertex_shader_viewport_index        , AMD_vertex_shader_viewport_index       ,  x , GLC,  x ,  x , 2012)
+EXT(AMD_vertex_shader_layer                 , AMD_vertex_shader_layer                , GLL, GLC,  x ,  x , 2012)
+EXT(AMD_vertex_shader_viewport_index        , AMD_vertex_shader_viewport_index       , GLL, GLC,  x ,  x , 2012)
 
 EXT(ANDROID_extension_pack_es31a            , ANDROID_extension_pack_es31a           ,  x ,  x ,  x ,  31, 2014)
 
@@ -28,7 +28,7 @@
 EXT(APPLE_texture_max_level                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
 
 EXT(ARB_ES2_compatibility                   , ARB_ES2_compatibility                  , GLL, GLC,  x ,  x , 2009)
-EXT(ARB_ES3_1_compatibility                 , ARB_ES3_1_compatibility                ,  x , GLC,  x ,  x , 2014)
+EXT(ARB_ES3_1_compatibility                 , ARB_ES3_1_compatibility                , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_ES3_2_compatibility                 , ARB_ES3_2_compatibility                ,  x , GLC,  x ,  x , 2015)
 EXT(ARB_ES3_compatibility                   , ARB_ES3_compatibility                  , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_arrays_of_arrays                    , ARB_arrays_of_arrays                   , GLL, GLC,  x ,  x , 2012)
@@ -54,32 +54,33 @@
 EXT(ARB_depth_clamp                         , ARB_depth_clamp                        , GLL, GLC,  x ,  x , 2003)
 EXT(ARB_depth_texture                       , ARB_depth_texture                      , GLL,  x ,  x ,  x , 2001)
 EXT(ARB_derivative_control                  , ARB_derivative_control                 , GLL, GLC,  x ,  x , 2014)
-EXT(ARB_direct_state_access                 , dummy_true                             ,  x , GLC,  x ,  x , 2014)
+EXT(ARB_direct_state_access                 , dummy_true                             ,  45, GLC,  x ,  x , 2014)
 EXT(ARB_draw_buffers                        , dummy_true                             , GLL, GLC,  x ,  x , 2002)
 EXT(ARB_draw_buffers_blend                  , ARB_draw_buffers_blend                 , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          , GLL, GLC,  x ,  x , 2009)
-EXT(ARB_draw_indirect                       , ARB_draw_indirect                      ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_draw_indirect                       , ARB_draw_indirect                      , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_draw_instanced                      , ARB_draw_instanced                     , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_enhanced_layouts                    , ARB_enhanced_layouts                   , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_explicit_attrib_location            , ARB_explicit_attrib_location           , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_explicit_uniform_location           , ARB_explicit_uniform_location          , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_fragment_coord_conventions          , ARB_fragment_coord_conventions         , GLL, GLC,  x ,  x , 2009)
-EXT(ARB_fragment_layer_viewport             , ARB_fragment_layer_viewport            ,  x , GLC,  x ,  x , 2012)
+EXT(ARB_fragment_layer_viewport             , ARB_fragment_layer_viewport            , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_fragment_program                    , ARB_fragment_program                   , GLL,  x ,  x ,  x , 2002)
 EXT(ARB_fragment_program_shadow             , ARB_fragment_program_shadow            , GLL,  x ,  x ,  x , 2003)
 EXT(ARB_fragment_shader                     , ARB_fragment_shader                    , GLL, GLC,  x ,  x , 2002)
+EXT(ARB_fragment_shader_interlock           , ARB_fragment_shader_interlock          , GLL, GLC,  x ,  x , 2015)
 EXT(ARB_framebuffer_no_attachments          , ARB_framebuffer_no_attachments         , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_framebuffer_object                  , ARB_framebuffer_object                 , GLL, GLC,  x ,  x , 2005)
 EXT(ARB_framebuffer_sRGB                    , EXT_framebuffer_sRGB                   , GLL, GLC,  x ,  x , 1998)
 EXT(ARB_get_program_binary                  , dummy_true                             , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_get_texture_sub_image               , dummy_true                             , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_gl_spirv                            , ARB_gl_spirv                           ,  x,  GLC,  x ,  x , 2016)
-EXT(ARB_gpu_shader5                         , ARB_gpu_shader5                        ,  x , GLC,  x ,  x , 2010)
-EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_gpu_shader5                         , ARB_gpu_shader5                        , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_gpu_shader_fp64                     , ARB_gpu_shader_fp64                    ,  32, GLC,  x ,  x , 2010)
 EXT(ARB_gpu_shader_int64                    , ARB_gpu_shader_int64                   ,  x , GLC,  x ,  x , 2015)
 EXT(ARB_half_float_pixel                    , dummy_true                             , GLL, GLC,  x ,  x , 2003)
 EXT(ARB_half_float_vertex                   , ARB_half_float_vertex                  , GLL, GLC,  x ,  x , 2008)
-EXT(ARB_indirect_parameters                 , ARB_indirect_parameters                ,  x , GLC,  x ,  x , 2013)
+EXT(ARB_indirect_parameters                 , ARB_indirect_parameters                , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_instanced_arrays                    , ARB_instanced_arrays                   , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_internalformat_query                , ARB_internalformat_query               , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_internalformat_query2               , ARB_internalformat_query2              , GLL, GLC,  x ,  x , 2013)
@@ -87,7 +88,7 @@
 EXT(ARB_map_buffer_alignment                , dummy_true                             , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_map_buffer_range                    , ARB_map_buffer_range                   , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_multi_bind                          , dummy_true                             , GLL, GLC,  x ,  x , 2013)
-EXT(ARB_multi_draw_indirect                 , ARB_draw_indirect                      ,  x , GLC,  x ,  x , 2012)
+EXT(ARB_multi_draw_indirect                 , ARB_draw_indirect                      , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_multisample                         , dummy_true                             , GLL,  x ,  x ,  x , 1994)
 EXT(ARB_multitexture                        , dummy_true                             , GLL,  x ,  x ,  x , 1998)
 EXT(ARB_occlusion_query                     , ARB_occlusion_query                    , GLL,  x ,  x ,  x , 2001)
@@ -103,6 +104,7 @@
 EXT(ARB_query_buffer_object                 , ARB_query_buffer_object                , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_robust_buffer_access_behavior       , ARB_robust_buffer_access_behavior      , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_robustness                          , dummy_true                             , GLL, GLC,  x ,  x , 2010)
+EXT(ARB_sample_locations                    , ARB_sample_locations                   , GLL, GLC,  x ,  x , 2015)
 EXT(ARB_sample_shading                      , ARB_sample_shading                     , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_sampler_objects                     , dummy_true                             , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_seamless_cube_map                   , ARB_seamless_cube_map                  , GLL, GLC,  x ,  x , 2009)
@@ -121,10 +123,10 @@
 EXT(ARB_shader_precision                    , ARB_shader_precision                   , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_shader_stencil_export               , ARB_shader_stencil_export              , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_shader_storage_buffer_object        , ARB_shader_storage_buffer_object       , GLL, GLC,  x ,  x , 2012)
-EXT(ARB_shader_subroutine                   , dummy_true                             ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_shader_subroutine                   , dummy_true                             ,  31, GLC,  x ,  x , 2010)
 EXT(ARB_shader_texture_image_samples        , ARB_shader_texture_image_samples       , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_shader_texture_lod                  , ARB_shader_texture_lod                 , GLL, GLC,  x ,  x , 2009)
-EXT(ARB_shader_viewport_layer_array         , ARB_shader_viewport_layer_array        ,  x , GLC,  x ,  x , 2015)
+EXT(ARB_shader_viewport_layer_array         , ARB_shader_viewport_layer_array        , GLL, GLC,  x ,  x , 2015)
 EXT(ARB_shading_language_100                , dummy_true                             , GLL,  x ,  x ,  x , 2003)
 EXT(ARB_shading_language_420pack            , ARB_shading_language_420pack           , GLL, GLC,  x ,  x , 2011)
 EXT(ARB_shading_language_packing            , ARB_shading_language_packing           , GLL, GLC,  x ,  x , 2011)
@@ -132,7 +134,7 @@
 EXT(ARB_sparse_buffer                       , ARB_sparse_buffer                      , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_stencil_texturing                   , ARB_stencil_texturing                  , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_sync                                , ARB_sync                               , GLL, GLC,  x ,  x , 2003)
-EXT(ARB_tessellation_shader                 , ARB_tessellation_shader                ,  x , GLC,  x ,  x , 2009)
+EXT(ARB_tessellation_shader                 , ARB_tessellation_shader                , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_texture_barrier                     , NV_texture_barrier                     , GLL, GLC,  x ,  x , 2014)
 EXT(ARB_texture_border_clamp                , ARB_texture_border_clamp               , GLL,  x ,  x ,  x , 2000)
 EXT(ARB_texture_buffer_object               , ARB_texture_buffer_object              , GLL, GLC,  x ,  x , 2008)
@@ -173,21 +175,21 @@
 EXT(ARB_uniform_buffer_object               , ARB_uniform_buffer_object              , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_vertex_array_bgra                   , EXT_vertex_array_bgra                  , GLL, GLC,  x ,  x , 2008)
 EXT(ARB_vertex_array_object                 , dummy_true                             , GLL, GLC,  x ,  x , 2006)
-EXT(ARB_vertex_attrib_64bit                 , ARB_vertex_attrib_64bit                ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_vertex_attrib_64bit                 , ARB_vertex_attrib_64bit                ,  32, GLC,  x ,  x , 2010)
 EXT(ARB_vertex_attrib_binding               , dummy_true                             , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_vertex_buffer_object                , dummy_true                             , GLL,  x ,  x ,  x , 2003)
 EXT(ARB_vertex_program                      , ARB_vertex_program                     , GLL,  x ,  x ,  x , 2002)
 EXT(ARB_vertex_shader                       , ARB_vertex_shader                      , GLL, GLC,  x ,  x , 2002)
 EXT(ARB_vertex_type_10f_11f_11f_rev         , ARB_vertex_type_10f_11f_11f_rev        , GLL, GLC,  x ,  x , 2013)
 EXT(ARB_vertex_type_2_10_10_10_rev          , ARB_vertex_type_2_10_10_10_rev         , GLL, GLC,  x ,  x , 2009)
-EXT(ARB_viewport_array                      , ARB_viewport_array                     ,  x , GLC,  x ,  x , 2010)
+EXT(ARB_viewport_array                      , ARB_viewport_array                     , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_window_pos                          , dummy_true                             , GLL,  x ,  x ,  x , 2001)
 
 EXT(ATI_blend_equation_separate             , EXT_blend_equation_separate            , GLL, GLC,  x ,  x , 2003)
 EXT(ATI_draw_buffers                        , dummy_true                             , GLL,  x ,  x ,  x , 2002)
 EXT(ATI_fragment_shader                     , ATI_fragment_shader                    , GLL,  x ,  x ,  x , 2001)
 EXT(ATI_meminfo                             , ATI_meminfo                            , GLL, GLC,  x ,  x , 2009)
-EXT(ATI_separate_stencil                    , ATI_separate_stencil                   , GLL,  x ,  x ,  x , 2006)
+EXT(ATI_separate_stencil                    , EXT_stencil_two_side                   , GLL,  x ,  x ,  x , 2006)
 EXT(ATI_texture_compression_3dc             , ATI_texture_compression_3dc            , GLL,  x ,  x ,  x , 2004)
 EXT(ATI_texture_env_combine3                , ATI_texture_env_combine3               , GLL,  x ,  x ,  x , 2002)
 EXT(ATI_texture_float                       , ARB_texture_float                      , GLL, GLC,  x ,  x , 2002)
@@ -240,7 +242,6 @@
 EXT(EXT_packed_pixels                       , dummy_true                             , GLL,  x ,  x ,  x , 1997)
 EXT(EXT_pixel_buffer_object                 , EXT_pixel_buffer_object                , GLL, GLC,  x ,  x , 2004)
 EXT(EXT_point_parameters                    , EXT_point_parameters                   , GLL,  x ,  x ,  x , 1997)
-EXT(EXT_polygon_offset                      , dummy_true                             , GLL,  x ,  x ,  x , 1995)
 EXT(EXT_polygon_offset_clamp                , ARB_polygon_offset_clamp               , GLL, GLC, ES1, ES2, 2014)
 EXT(EXT_primitive_bounding_box              , OES_primitive_bounding_box             ,  x ,  x ,  x ,  31, 2014)
 EXT(EXT_provoking_vertex                    , EXT_provoking_vertex                   , GLL, GLC,  x ,  x , 2009)
@@ -283,6 +284,7 @@
 EXT(EXT_texture_integer                     , EXT_texture_integer                    , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_texture_lod_bias                    , dummy_true                             , GLL,  x , ES1,  x , 1999)
 EXT(EXT_texture_mirror_clamp                , EXT_texture_mirror_clamp               , GLL, GLC,  x ,  x , 2004)
+EXT(EXT_texture_norm16                      , dummy_true                             ,  x ,  x ,  x ,  31, 2014)
 EXT(EXT_texture_object                      , dummy_true                             , GLL,  x ,  x ,  x , 1995)
 EXT(EXT_texture_rectangle                   , NV_texture_rectangle                   , GLL,  x ,  x ,  x , 2004)
 EXT(EXT_texture_rg                          , ARB_texture_rg                         ,  x ,  x ,  x , ES2, 2011)
@@ -321,6 +323,7 @@
 EXT(KHR_texture_compression_astc_ldr        , KHR_texture_compression_astc_ldr       , GLL, GLC,  x , ES2, 2012)
 EXT(KHR_texture_compression_astc_sliced_3d  , KHR_texture_compression_astc_sliced_3d , GLL, GLC,  x , ES2, 2015)
 
+EXT(MESA_framebuffer_flip_y                 , MESA_framebuffer_flip_y                ,   x,   x,  x ,  31, 2018)
 EXT(MESA_pack_invert                        , MESA_pack_invert                       , GLL, GLC,  x ,  x , 2002)
 EXT(MESA_shader_integer_functions           , MESA_shader_integer_functions          , GLL, GLC,  x ,  30, 2016)
 EXT(MESA_texture_signed_rgba                , EXT_texture_snorm                      , GLL, GLC,  x ,  x , 2009)
@@ -332,6 +335,10 @@
 
 EXT(NV_blend_square                         , dummy_true                             , GLL,  x ,  x ,  x , 1999)
 EXT(NV_conditional_render                   , NV_conditional_render                  , GLL, GLC,  x ,  x , 2008)
+EXT(NV_conservative_raster                  , NV_conservative_raster                 , GLL, GLC, ES1, ES2, 2015)
+EXT(NV_conservative_raster_dilate           , NV_conservative_raster_dilate          , GLL, GLC, ES1, ES2, 2015)
+EXT(NV_conservative_raster_pre_snap         , NV_conservative_raster_pre_snap        , GLL, GLC, ES1, ES2, 2017)
+EXT(NV_conservative_raster_pre_snap_triangles, NV_conservative_raster_pre_snap_triangles, GLL, GLC, ES1, ES2, 2015)
 EXT(NV_depth_clamp                          , ARB_depth_clamp                        , GLL, GLC,  x ,  x , 2001)
 EXT(NV_draw_buffers                         , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
 EXT(NV_fbo_color_attachments                , dummy_true                             ,  x ,  x ,  x , ES2, 2010)
@@ -346,6 +353,7 @@
 EXT(NV_read_depth                           , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
 EXT(NV_read_depth_stencil                   , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
 EXT(NV_read_stencil                         , dummy_true                             ,  x ,  x ,  x , ES2, 2011)
+EXT(NV_sample_locations                     , ARB_sample_locations                   , GLL, GLC,  x , ES2, 2015)
 EXT(NV_texgen_reflection                    , dummy_true                             , GLL,  x ,  x ,  x , 1999)
 EXT(NV_texture_barrier                      , NV_texture_barrier                     , GLL, GLC,  x ,  x , 2009)
 EXT(NV_texture_env_combine4                 , NV_texture_env_combine4                , GLL,  x ,  x ,  x , 1999)
@@ -416,6 +424,7 @@
 EXT(OES_texture_npot                        , ARB_texture_non_power_of_two           ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_texture_stencil8                    , ARB_texture_stencil8                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_texture_storage_multisample_2d_array, ARB_texture_multisample                ,  x ,  x ,  x ,  31, 2014)
+EXT(OES_texture_view                        , OES_texture_view                       ,  x ,  x ,  x ,  31, 2014)
 EXT(OES_vertex_array_object                 , dummy_true                             ,  x ,  x , ES1, ES2, 2010)
 EXT(OES_vertex_half_float                   , ARB_half_float_vertex                  ,  x ,  x ,  x , ES2, 2005)
 EXT(OES_viewport_array                      , OES_viewport_array                     ,  x ,  x ,  x ,  31, 2010)
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index a63e8b8..a9400d5 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -35,6 +35,7 @@
 
 #include "buffers.h"
 #include "context.h"
+#include "debug_output.h"
 #include "enums.h"
 #include "fbobject.h"
 #include "formats.h"
@@ -1404,14 +1405,46 @@
 }
 
 /**
- * ARB_framebuffer_no_attachment - Application passes requested param's
- * here. NOTE: NumSamples requested need not be _NumSamples which is
- * what the hw supports.
+ * ARB_framebuffer_no_attachments and ARB_sample_locations - the application
+ * passes requested parameters here. NOTE: the NumSamples requested need not
+ * match _NumSamples, which is what the hw supports.
  */
 static void
 framebuffer_parameteri(struct gl_context *ctx, struct gl_framebuffer *fb,
                        GLenum pname, GLint param, const char *func)
 {
+   bool cannot_be_winsys_fbo = false;
+
+   switch (pname) {
+   case GL_FRAMEBUFFER_DEFAULT_WIDTH:
+   case GL_FRAMEBUFFER_DEFAULT_HEIGHT:
+   case GL_FRAMEBUFFER_DEFAULT_LAYERS:
+   case GL_FRAMEBUFFER_DEFAULT_SAMPLES:
+   case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
+      if (!ctx->Extensions.ARB_framebuffer_no_attachments)
+         goto invalid_pname_enum;
+      cannot_be_winsys_fbo = true;
+      break;
+   case GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB:
+   case GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB:
+      if (!ctx->Extensions.ARB_sample_locations)
+         goto invalid_pname_enum;
+      break;
+   case GL_FRAMEBUFFER_FLIP_Y_MESA:
+      if (!ctx->Extensions.MESA_framebuffer_flip_y)
+         goto invalid_pname_enum;
+      cannot_be_winsys_fbo = true;
+      break;
+   default:
+      goto invalid_pname_enum;
+   }
+
+   if (cannot_be_winsys_fbo && _mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(invalid pname=0x%x for default framebuffer)", func, pname);
+      return;
+   }
+
    switch (pname) {
    case GL_FRAMEBUFFER_DEFAULT_WIDTH:
       if (param < 0 || param > ctx->Const.MaxFramebufferWidth)
@@ -1448,13 +1481,33 @@
    case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
       fb->DefaultGeometry.FixedSampleLocations = param;
       break;
-   default:
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "%s(pname=0x%x)", func, pname);
+   case GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB:
+      fb->ProgrammableSampleLocations = !!param;
+      break;
+   case GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB:
+      fb->SampleLocationPixelGrid = !!param;
+      break;
+   case GL_FRAMEBUFFER_FLIP_Y_MESA:
+      fb->FlipY = param;
+      break;
    }
 
-   invalidate_framebuffer(fb);
-   ctx->NewState |= _NEW_BUFFERS;
+   switch (pname) {
+   case GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB:
+   case GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB:
+      if (fb == ctx->DrawBuffer)
+         ctx->NewDriverState |= ctx->DriverFlags.NewSampleLocations;
+      break;
+   default:
+      invalidate_framebuffer(fb);
+      ctx->NewState |= _NEW_BUFFERS;
+      break;
+   }
+
+   return;
+
+invalid_pname_enum:
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname);
 }
 
 void GLAPIENTRY
@@ -1463,10 +1516,12 @@
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
 
-   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments &&
+       !ctx->Extensions.ARB_sample_locations) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glFramebufferParameteriv not supported "
-                  "(ARB_framebuffer_no_attachments not implemented)");
+                  "(neither ARB_framebuffer_no_attachments nor ARB_sample_locations"
+                  " is available)");
       return;
    }
 
@@ -1477,13 +1532,6 @@
       return;
    }
 
-   /* check framebuffer binding */
-   if (_mesa_is_winsys_fbo(fb)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glFramebufferParameteri");
-      return;
-   }
-
    framebuffer_parameteri(ctx, fb, pname, param, "glFramebufferParameteri");
 }
 
@@ -1528,9 +1576,20 @@
        */
       cannot_be_winsys_fbo = !_mesa_is_desktop_gl(ctx);
       break;
+   case GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB:
+   case GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB:
+      if (!ctx->Extensions.ARB_sample_locations)
+         goto invalid_pname_enum;
+      cannot_be_winsys_fbo = false;
+      break;
+   case GL_FRAMEBUFFER_FLIP_Y_MESA:
+      if (!ctx->Extensions.MESA_framebuffer_flip_y) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname);
+         return false;
+      }
+      break;
    default:
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname);
-      return false;
+      goto invalid_pname_enum;
    }
 
    if (cannot_be_winsys_fbo && _mesa_is_winsys_fbo(fb)) {
@@ -1540,6 +1599,10 @@
    }
 
    return true;
+
+invalid_pname_enum:
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=0x%x)", func, pname);
+   return false;
 }
 
 static void
@@ -1583,6 +1646,15 @@
    case GL_STEREO:
       *params = fb->Visual.stereoMode;
       break;
+   case GL_FRAMEBUFFER_PROGRAMMABLE_SAMPLE_LOCATIONS_ARB:
+      *params = fb->ProgrammableSampleLocations;
+      break;
+   case GL_FRAMEBUFFER_SAMPLE_LOCATION_PIXEL_GRID_ARB:
+      *params = fb->SampleLocationPixelGrid;
+      break;
+   case GL_FRAMEBUFFER_FLIP_Y_MESA:
+      *params = fb->FlipY;
+      break;
    }
 }
 
@@ -1592,10 +1664,12 @@
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb;
 
-   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments &&
+       !ctx->Extensions.ARB_sample_locations) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glGetFramebufferParameteriv not supported "
-                  "(ARB_framebuffer_no_attachments not implemented)");
+                  "(neither ARB_framebuffer_no_attachments nor ARB_sample_locations"
+                  " is available)");
       return;
    }
 
@@ -1870,8 +1944,10 @@
    case GL_RGBA:
    case GL_RGBA2:
    case GL_RGBA12:
-   case GL_RGBA16:
       return _mesa_is_desktop_gl(ctx) ? GL_RGBA : 0;
+   case GL_RGBA16:
+      return _mesa_is_desktop_gl(ctx) || _mesa_has_EXT_texture_norm16(ctx)
+         ? GL_RGBA : 0;
    case GL_RGB10_A2:
    case GL_SRGB8_ALPHA8_EXT:
       return _mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx) ? GL_RGBA : 0;
@@ -1906,15 +1982,17 @@
              ctx->Extensions.ARB_depth_buffer_float)
          ? GL_DEPTH_STENCIL : 0;
    case GL_RED:
+      return _mesa_has_ARB_texture_rg(ctx) ? GL_RED : 0;
    case GL_R16:
-      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_rg
+      return _mesa_has_ARB_texture_rg(ctx) || _mesa_has_EXT_texture_norm16(ctx)
          ? GL_RED : 0;
    case GL_R8:
       return ctx->API != API_OPENGLES && ctx->Extensions.ARB_texture_rg
          ? GL_RED : 0;
    case GL_RG:
+      return _mesa_has_ARB_texture_rg(ctx) ? GL_RG : 0;
    case GL_RG16:
-      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_rg
+      return _mesa_has_ARB_texture_rg(ctx) || _mesa_has_EXT_texture_norm16(ctx)
          ? GL_RG : 0;
    case GL_RG8:
       return ctx->API != API_OPENGLES && ctx->Extensions.ARB_texture_rg
@@ -2694,6 +2772,7 @@
 
    if (bindDrawBuf) {
       FLUSH_VERTICES(ctx, _NEW_BUFFERS);
+      ctx->NewDriverState |= ctx->DriverFlags.NewSampleLocations;
 
       /* check if old framebuffer had any texture attachments */
       if (oldDrawFb)
@@ -4234,15 +4313,21 @@
    GET_CURRENT_CONTEXT(ctx);
    struct gl_framebuffer *fb = NULL;
 
-   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments &&
+       !ctx->Extensions.ARB_sample_locations) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glNamedFramebufferParameteri("
-                  "ARB_framebuffer_no_attachments not implemented)");
+                  "neither ARB_framebuffer_no_attachments nor "
+                  "ARB_sample_locations is available)");
       return;
    }
 
-   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
-                                     "glNamedFramebufferParameteri");
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glNamedFramebufferParameteri");
+   } else {
+      fb = ctx->WinSysDrawBuffer;
+   }
 
    if (fb) {
       framebuffer_parameteri(ctx, fb, pname, param,
@@ -4261,16 +4346,16 @@
    if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "glNamedFramebufferParameteriv("
-                  "ARB_framebuffer_no_attachments not implemented)");
+                  "neither ARB_framebuffer_no_attachments nor ARB_sample_locations"
+                  " is available)");
       return;
    }
 
-   if (framebuffer) {
+   if (framebuffer)
       fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
                                         "glGetNamedFramebufferParameteriv");
-   } else {
+   else
       fb = ctx->WinSysDrawBuffer;
-   }
 
    if (fb) {
       get_framebuffer_parameteriv(ctx, fb, pname, param,
@@ -4605,3 +4690,187 @@
                "glDiscardFramebufferEXT(attachment %s)",
               _mesa_enum_to_string(attachments[i]));
 }
+
+/**
+ * Shared implementation of the ARB_sample_locations setters.
+ *
+ * Stores count (x, y) pairs from v in the sample location table of fb,
+ * beginning at entry start.  The table is allocated on first use with
+ * every coordinate set to 0.5.
+ *
+ * \param no_error  if true, skip the extension and range checks
+ * \param name      entry point name to use in error messages
+ */
+static void
+sample_locations(struct gl_context *ctx, struct gl_framebuffer *fb,
+                 GLuint start, GLsizei count, const GLfloat *v, bool no_error,
+                 const char *name)
+{
+   GLsizei i;
+
+   if (!no_error) {
+      if (!ctx->Extensions.ARB_sample_locations) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "%s not supported "
+                     "(ARB_sample_locations not available)", name);
+         return;
+      }
+
+      /* Compare count against the room left after start instead of
+       * evaluating start + count: the sum is computed in unsigned
+       * arithmetic, so a huge start or a negative count would wrap
+       * around and pass a plain "sum > size" test.
+       */
+      if (count < 0 || start > MAX_SAMPLE_LOCATION_TABLE_SIZE ||
+          (GLuint) count > MAX_SAMPLE_LOCATION_TABLE_SIZE - start) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(start+size > sample location table size)", name);
+         return;
+      }
+   }
+
+   if (!fb->SampleLocationTable) {
+      size_t size = MAX_SAMPLE_LOCATION_TABLE_SIZE * 2 * sizeof(GLfloat);
+      fb->SampleLocationTable = malloc(size);
+      if (!fb->SampleLocationTable) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY,
+                     "Cannot allocate sample location table");
+         return;
+      }
+      for (i = 0; i < MAX_SAMPLE_LOCATION_TABLE_SIZE * 2; i++)
+         fb->SampleLocationTable[i] = 0.5f;
+   }
+
+   for (i = 0; i < count * 2; i++) {
+      /* The ARB_sample_locations spec says:
+       *
+       *    Sample locations outside of [0,1] result in undefined
+       *    behavior.
+       *
+       * To simplify driver implementations, we choose to clamp to
+       * [0,1] and change NaN into 0.5.
+       */
+      if (isnan(v[i]) || v[i] < 0.0f || v[i] > 1.0f) {
+         static GLuint msg_id = 0;
+         static const char* msg = "Invalid sample location specified";
+         _mesa_debug_get_id(&msg_id);
+
+         _mesa_log_msg(ctx, MESA_DEBUG_SOURCE_API, MESA_DEBUG_TYPE_UNDEFINED,
+                       msg_id, MESA_DEBUG_SEVERITY_HIGH, strlen(msg), msg);
+      }
+
+      if (isnan(v[i]))
+         fb->SampleLocationTable[start * 2 + i] = 0.5f;
+      else
+         fb->SampleLocationTable[start * 2 + i] = CLAMP(v[i], 0.0f, 1.0f);
+   }
+
+   /* Flag the driver only when the table of the current draw buffer changed. */
+   if (fb == ctx->DrawBuffer)
+      ctx->NewDriverState |= ctx->DriverFlags.NewSampleLocations;
+}
+
+/**
+ * glFramebufferSampleLocationsfvARB: set sample locations on the framebuffer
+ * found for target; raises GL_INVALID_ENUM when the lookup yields none.
+ */
+void GLAPIENTRY
+_mesa_FramebufferSampleLocationsfvARB(GLenum target, GLuint start,
+                                      GLsizei count, const GLfloat *v)
+{
+   struct gl_framebuffer *fb;
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferSampleLocationsfvARB(target %s)",
+                  _mesa_enum_to_string(target));
+      return;
+   }
+
+   sample_locations(ctx, fb, start, count, v, false,
+                    "glFramebufferSampleLocationsfvARB");
+}
+
+/**
+ * glNamedFramebufferSampleLocationsfvARB: set sample locations on the
+ * framebuffer with the given name.  Name 0 selects the window-system draw
+ * buffer; a failed lookup of a non-zero name ends the call.
+ */
+void GLAPIENTRY
+_mesa_NamedFramebufferSampleLocationsfvARB(GLuint framebuffer, GLuint start,
+                                           GLsizei count, const GLfloat *v)
+{
+   struct gl_framebuffer *fb;
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glNamedFramebufferSampleLocationsfvARB");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   sample_locations(ctx, fb, start, count, v, false,
+                    "glNamedFramebufferSampleLocationsfvARB");
+}
+
+/**
+ * No-error variant of glFramebufferSampleLocationsfvARB: the target lookup
+ * result is passed on without being checked.
+ */
+void GLAPIENTRY
+_mesa_FramebufferSampleLocationsfvARB_no_error(GLenum target, GLuint start,
+                                               GLsizei count, const GLfloat *v)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   sample_locations(ctx, get_framebuffer_target(ctx, target), start,
+                    count, v, true, "glFramebufferSampleLocationsfvARB");
+}
+
+/**
+ * No-error variant of glNamedFramebufferSampleLocationsfvARB.  As in the
+ * checked entry point, name 0 selects the window-system draw buffer.
+ */
+void GLAPIENTRY
+_mesa_NamedFramebufferSampleLocationsfvARB_no_error(GLuint framebuffer,
+                                                    GLuint start, GLsizei count,
+                                                    const GLfloat *v)
+{
+   struct gl_framebuffer *fb;
+
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (framebuffer)
+      fb = _mesa_lookup_framebuffer(ctx, framebuffer);
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   sample_locations(ctx, fb, start, count, v, true,
+                    "glNamedFramebufferSampleLocationsfvARB");
+}
+
+/**
+ * glEvaluateDepthValuesARB: forward the request to the driver hook, if the
+ * driver provides one.
+ */
+void GLAPIENTRY
+_mesa_EvaluateDepthValuesARB(void)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (!ctx->Extensions.ARB_sample_locations) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "EvaluateDepthValuesARB not supported (neither "
+                  "ARB_sample_locations nor NV_sample_locations is available)");
+      return;
+   }
+
+   if (ctx->Driver.EvaluateDepthValues)
+      ctx->Driver.EvaluateDepthValues(ctx);
+}
diff --git a/src/mesa/main/fbobject.h b/src/mesa/main/fbobject.h
index 31d743d..5ba62d6 100644
--- a/src/mesa/main/fbobject.h
+++ b/src/mesa/main/fbobject.h
@@ -355,4 +355,24 @@
 extern void GLAPIENTRY
 _mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params);
 
+extern void GLAPIENTRY
+_mesa_FramebufferSampleLocationsfvARB(GLenum target, GLuint start,
+                                      GLsizei count, const GLfloat *v);
+
+extern void GLAPIENTRY
+_mesa_NamedFramebufferSampleLocationsfvARB(GLuint framebuffer, GLuint start,
+                                           GLsizei count, const GLfloat *v);
+
+extern void GLAPIENTRY
+_mesa_FramebufferSampleLocationsfvARB_no_error(GLenum target, GLuint start,
+                                               GLsizei count, const GLfloat *v);
+
+extern void GLAPIENTRY
+_mesa_NamedFramebufferSampleLocationsfvARB_no_error(GLuint framebuffer,
+                                                    GLuint start, GLsizei count,
+                                                    const GLfloat *v);
+
+extern void GLAPIENTRY
+_mesa_EvaluateDepthValuesARB(void);
+
 #endif /* FBOBJECT_H */
diff --git a/src/mesa/main/ff_fragment_shader.cpp b/src/mesa/main/ff_fragment_shader.cpp
index a698931..935a216 100644
--- a/src/mesa/main/ff_fragment_shader.cpp
+++ b/src/mesa/main/ff_fragment_shader.cpp
@@ -229,7 +229,11 @@
     * since vertex shader state validation comes after fragment state
     * validation (see additional comments in state.c).
     */
-   if (vertexShader)
+   if (ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY] != NULL)
+      vprog = ctx->_Shader->CurrentProgram[MESA_SHADER_GEOMETRY];
+   else if (ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL] != NULL)
+      vprog = ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL];
+   else if (vertexShader)
       vprog = ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX];
    else
       vprog = ctx->VertexProgram.Current;
diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index 1c75559..dfb494b 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -1297,12 +1297,14 @@
    struct ureg input;
 
    switch (p->state->fog_distance_mode) {
-   case FDM_EYE_RADIAL: /* Z = sqrt(Xe*Xe + Ye*Ye + Ze*Ze) */
+   case FDM_EYE_RADIAL: { /* Z = sqrt(Xe*Xe + Ye*Ye + Ze*Ze) */
+      struct ureg tmp = get_temp(p);
       input = get_eye_position(p);
-      emit_op2(p, OPCODE_DP3, fog, WRITEMASK_X, input, input);
-      emit_op1(p, OPCODE_RSQ, fog, WRITEMASK_X, fog);
-      emit_op1(p, OPCODE_RCP, fog, WRITEMASK_X, fog);
+      emit_op2(p, OPCODE_DP3, tmp, WRITEMASK_X, input, input);
+      emit_op1(p, OPCODE_RSQ, tmp, WRITEMASK_X, tmp);
+      emit_op1(p, OPCODE_RCP, fog, WRITEMASK_X, tmp);
       break;
+   }
    case FDM_EYE_PLANE: /* Z = Ze */
       input = get_eye_position_z(p);
       emit_op1(p, OPCODE_MOV, fog, WRITEMASK_X, input);
diff --git a/src/mesa/main/format_info.py b/src/mesa/main/format_info.py
index b0308ef..bbecaa2 100644
--- a/src/mesa/main/format_info.py
+++ b/src/mesa/main/format_info.py
@@ -21,6 +21,8 @@
 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
+from __future__ import print_function
+
 import format_parser as parser
 import sys
 
@@ -135,7 +137,7 @@
 
 formats = parser.parse(sys.argv[1])
 
-print '''
+print('''
 /*
  * Mesa 3-D graphics library
  *
@@ -167,35 +169,35 @@
 
 static const struct gl_format_info format_info[MESA_FORMAT_COUNT] =
 {
-'''
+''')
 
 def format_channel_bits(fmat, tuple_list):
    return ['.%s = %s' % (field, str(get_channel_bits(fmat, name))) for (field, name) in tuple_list]
 
 
 for fmat in formats:
-   print '   {'
-   print '      .Name = {0},'.format(fmat.name)
-   print '      .StrName = "{0}",'.format(fmat.name)
-   print '      .Layout = {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper())
-   print '      .BaseFormat = {0},'.format(get_gl_base_format(fmat))
-   print '      .DataType = {0},'.format(get_gl_data_type(fmat))
+   print('   {')
+   print('      .Name = {0},'.format(fmat.name))
+   print('      .StrName = "{0}",'.format(fmat.name))
+   print('      .Layout = {0},'.format('MESA_FORMAT_LAYOUT_' + fmat.layout.upper()))
+   print('      .BaseFormat = {0},'.format(get_gl_base_format(fmat)))
+   print('      .DataType = {0},'.format(get_gl_data_type(fmat)))
 
    bits = [('RedBits', 'r'), ('GreenBits', 'g'), ('BlueBits', 'b'), ('AlphaBits', 'a')]
-   print '      {0},'.format(', '.join(format_channel_bits(fmat, bits)))
+   print('      {0},'.format(', '.join(format_channel_bits(fmat, bits))))
    bits = [('LuminanceBits', 'l'), ('IntensityBits', 'i'), ('DepthBits', 'z'), ('StencilBits', 's')]
-   print '      {0},'.format(', '.join(format_channel_bits(fmat, bits)))
+   print('      {0},'.format(', '.join(format_channel_bits(fmat, bits))))
 
-   print '      .IsSRGBFormat = {0:d},'.format(fmat.colorspace == 'srgb')
+   print('      .IsSRGBFormat = {0:d},'.format(fmat.colorspace == 'srgb'))
 
-   print '      .BlockWidth = {0}, .BlockHeight = {1}, .BlockDepth = {2},'.format(fmat.block_width, fmat.block_height, fmat.block_depth)
-   print '      .BytesPerBlock = {0},'.format(int(fmat.block_size() / 8))
+   print('      .BlockWidth = {0}, .BlockHeight = {1}, .BlockDepth = {2},'.format(fmat.block_width, fmat.block_height, fmat.block_depth))
+   print('      .BytesPerBlock = {0},'.format(int(fmat.block_size() / 8)))
 
-   print '      .Swizzle = {{ {0} }},'.format(', '.join(map(str, fmat.swizzle)))
+   print('      .Swizzle = {{ {0} }},'.format(', '.join(map(str, fmat.swizzle))))
    if fmat.is_array():
       chan = fmat.array_element()
       norm = chan.norm or chan.type == parser.FLOAT
-      print '      .ArrayFormat = MESA_ARRAY_FORMAT({0}),'.format(', '.join([
+      print('      .ArrayFormat = MESA_ARRAY_FORMAT({0}),'.format(', '.join([
          str(chan.size / 8),
          str(int(chan.sign)),
          str(int(chan.type == parser.FLOAT)),
@@ -205,9 +207,9 @@
          str(fmat.swizzle[1]),
          str(fmat.swizzle[2]),
          str(fmat.swizzle[3]),
-      ]))
+      ])))
    else:
-      print '      .ArrayFormat = 0,'
-   print '   },'
+      print('      .ArrayFormat = 0,')
+   print('   },')
 
-print '};'
+print('};')
diff --git a/src/mesa/main/format_pack.py b/src/mesa/main/format_pack.py
index 77ab166..d3c8d24 100644
--- a/src/mesa/main/format_pack.py
+++ b/src/mesa/main/format_pack.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 from mako.template import Template
 from sys import argv
@@ -1001,4 +1002,4 @@
 
 template = Template(string);
 
-print template.render(argv = argv[0:])
+print(template.render(argv = argv[0:]))
diff --git a/src/mesa/main/format_parser.py b/src/mesa/main/format_parser.py
index 4c36c3c..3321ad3 100644
--- a/src/mesa/main/format_parser.py
+++ b/src/mesa/main/format_parser.py
@@ -216,8 +216,8 @@
       component, exactly as you would expect.
       """
       rev = [Swizzle.SWIZZLE_NONE] * 4
-      for i in xrange(4):
-         for j in xrange(4):
+      for i in range(4):
+         for j in range(4):
             if self.__list[j] == i and rev[i] == Swizzle.SWIZZLE_NONE:
                rev[i] = j
       return Swizzle(rev)
diff --git a/src/mesa/main/format_unpack.py b/src/mesa/main/format_unpack.py
index 87f64cc..286c08e 100644
--- a/src/mesa/main/format_unpack.py
+++ b/src/mesa/main/format_unpack.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 from mako.template import Template
 from sys import argv
@@ -891,4 +892,4 @@
 
 template = Template(string);
 
-print template.render(argv = argv[0:])
+print(template.render(argv = argv[0:]))
diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 3b000fa..84b5f51 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -501,8 +501,7 @@
 
       /* additional checks for compressed textures */
       if (_mesa_is_compressed_format(ctx, internalformat) &&
-          (!_mesa_target_can_be_compressed(ctx, target, internalformat, NULL) ||
-           _mesa_format_no_online_compression(internalformat)))
+          !_mesa_target_can_be_compressed(ctx, target, internalformat, NULL))
          return false;
 
       break;
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index fdb53af..d4cd5d2 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -628,6 +628,48 @@
 
 
 /**
+ * Return true if format is a 2D ASTC format (RGBA or SRGB8_ALPHA8 variant).
+ */
+bool
+_mesa_is_format_astc_2d(mesa_format format)
+{
+   switch (format) {
+   case MESA_FORMAT_RGBA_ASTC_4x4:
+   case MESA_FORMAT_RGBA_ASTC_5x4:
+   case MESA_FORMAT_RGBA_ASTC_5x5:
+   case MESA_FORMAT_RGBA_ASTC_6x5:
+   case MESA_FORMAT_RGBA_ASTC_6x6:
+   case MESA_FORMAT_RGBA_ASTC_8x5:
+   case MESA_FORMAT_RGBA_ASTC_8x6:
+   case MESA_FORMAT_RGBA_ASTC_8x8:
+   case MESA_FORMAT_RGBA_ASTC_10x5:
+   case MESA_FORMAT_RGBA_ASTC_10x6:
+   case MESA_FORMAT_RGBA_ASTC_10x8:
+   case MESA_FORMAT_RGBA_ASTC_10x10:
+   case MESA_FORMAT_RGBA_ASTC_12x10:
+   case MESA_FORMAT_RGBA_ASTC_12x12:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
+      return true;
+   default:
+      return false;
+   }
+}
+
+
+/**
  * If the given format is a compressed format, return a corresponding
  * uncompressed format.
  */
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index 2afa886..335e4de 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -721,6 +721,9 @@
 extern bool
 _mesa_is_format_etc2(mesa_format format);
 
+bool
+_mesa_is_format_astc_2d(mesa_format format);
+
 GLenum
 _mesa_is_format_color_format(mesa_format format);
 
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 249e775..10dd2fd 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -159,6 +159,11 @@
    fb->_AllColorBuffersFixedPoint = !visual->floatMode;
    fb->_HasSNormOrFloatColorBuffer = visual->floatMode;
    fb->_HasAttachments = true;
+   fb->FlipY = true;
+
+   fb->SampleLocationTable = NULL;
+   fb->ProgrammableSampleLocations = 0;
+   fb->SampleLocationPixelGrid = 0;
 
    compute_depth_max(fb);
 }
@@ -183,6 +188,9 @@
    fb->_ColorDrawBufferIndexes[0] = BUFFER_COLOR0;
    fb->ColorReadBuffer = GL_COLOR_ATTACHMENT0_EXT;
    fb->_ColorReadBufferIndex = BUFFER_COLOR0;
+   fb->SampleLocationTable = NULL;
+   fb->ProgrammableSampleLocations = 0;
+   fb->SampleLocationPixelGrid = 0;
    fb->Delete = _mesa_destroy_framebuffer;
    simple_mtx_init(&fb->Mutex, mtx_plain);
 }
@@ -210,14 +218,12 @@
 void
 _mesa_free_framebuffer_data(struct gl_framebuffer *fb)
 {
-   GLuint i;
-
    assert(fb);
    assert(fb->RefCount == 0);
 
    simple_mtx_destroy(&fb->Mutex);
 
-   for (i = 0; i < BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < BUFFER_COUNT; i++) {
       struct gl_renderbuffer_attachment *att = &fb->Attachment[i];
       if (att->Renderbuffer) {
          _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
@@ -229,6 +235,9 @@
       assert(!att->Texture);
       att->Type = GL_NONE;
    }
+
+   free(fb->SampleLocationTable);
+   fb->SampleLocationTable = NULL;
 }
 
 
@@ -280,8 +289,6 @@
 _mesa_resize_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
                          GLuint width, GLuint height)
 {
-   GLuint i;
-
    /* XXX I think we could check if the size is not changing
     * and return early.
     */
@@ -289,7 +296,7 @@
    /* Can only resize win-sys framebuffer objects */
    assert(_mesa_is_winsys_fbo(fb));
 
-   for (i = 0; i < BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < BUFFER_COUNT; i++) {
       struct gl_renderbuffer_attachment *att = &fb->Attachment[i];
       if (att->Type == GL_RENDERBUFFER_EXT && att->Renderbuffer) {
          struct gl_renderbuffer *rb = att->Renderbuffer;
@@ -427,13 +434,11 @@
 _mesa_update_framebuffer_visual(struct gl_context *ctx,
 				struct gl_framebuffer *fb)
 {
-   GLuint i;
-
    memset(&fb->Visual, 0, sizeof(fb->Visual));
    fb->Visual.rgbMode = GL_TRUE; /* assume this */
 
    /* find first RGB renderbuffer */
-   for (i = 0; i < BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < BUFFER_COUNT; i++) {
       if (fb->Attachment[i].Renderbuffer) {
          const struct gl_renderbuffer *rb = fb->Attachment[i].Renderbuffer;
          const GLenum baseFormat = _mesa_get_format_base_format(rb->Format);
@@ -461,7 +466,7 @@
    }
 
    fb->Visual.floatMode = GL_FALSE;
-   for (i = 0; i < BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < BUFFER_COUNT; i++) {
       if (fb->Attachment[i].Renderbuffer) {
          const struct gl_renderbuffer *rb = fb->Attachment[i].Renderbuffer;
          const mesa_format fmt = rb->Format;
@@ -555,7 +560,7 @@
  * writing colors.
  */
 static void
-update_color_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb)
+update_color_draw_buffers(struct gl_framebuffer *fb)
 {
    GLuint output;
 
@@ -579,9 +584,8 @@
  * Unlike the DrawBuffer, we can only read from one (or zero) color buffers.
  */
 static void
-update_color_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb)
+update_color_read_buffer(struct gl_framebuffer *fb)
 {
-   (void) ctx;
    if (fb->_ColorReadBufferIndex == BUFFER_NONE ||
        fb->DeletePending ||
        fb->Width == 0 ||
@@ -623,6 +627,12 @@
          _mesa_drawbuffers(ctx, fb, ctx->Const.MaxDrawBuffers,
                            ctx->Color.DrawBuffer, NULL);
       }
+
+      /* Call device driver function if fb is the bound draw buffer. */
+      if (fb == ctx->DrawBuffer) {
+         if (ctx->Driver.DrawBufferAllocate)
+            ctx->Driver.DrawBufferAllocate(ctx);
+      }
    }
    else {
       /* This is a user-created framebuffer.
@@ -638,8 +648,8 @@
     * read-state if this FB is bound as ctx->DrawBuffer), but no
     * harm.
     */
-   update_color_draw_buffers(ctx, fb);
-   update_color_read_buffer(ctx, fb);
+   update_color_draw_buffers(fb);
+   update_color_read_buffer(fb);
 
    compute_depth_max(fb);
 }
@@ -835,22 +845,49 @@
    }
    else {
       const mesa_format format = fb->_ColorReadBuffer->Format;
-      const GLenum data_type = _mesa_get_format_datatype(format);
 
-      if (format == MESA_FORMAT_B8G8R8A8_UNORM)
-         return GL_BGRA;
-      else if (format == MESA_FORMAT_B5G6R5_UNORM)
-         return GL_RGB;
-      else if (format == MESA_FORMAT_R_UNORM8)
-         return GL_RED;
-
-      switch (data_type) {
-      case GL_UNSIGNED_INT:
-      case GL_INT:
+      switch (format) {
+      case MESA_FORMAT_RGBA_UINT8:
          return GL_RGBA_INTEGER;
+      case MESA_FORMAT_B8G8R8A8_UNORM:
+         return GL_BGRA;
+      case MESA_FORMAT_B5G6R5_UNORM:
+      case MESA_FORMAT_R11G11B10_FLOAT:
+         return GL_RGB;
+      case MESA_FORMAT_RG_FLOAT32:
+      case MESA_FORMAT_RG_FLOAT16:
+      case MESA_FORMAT_R8G8_UNORM:
+      case MESA_FORMAT_R8G8_SNORM:
+         return GL_RG;
+      case MESA_FORMAT_RG_SINT32:
+      case MESA_FORMAT_RG_UINT32:
+      case MESA_FORMAT_RG_SINT16:
+      case MESA_FORMAT_RG_UINT16:
+      case MESA_FORMAT_RG_SINT8:
+      case MESA_FORMAT_RG_UINT8:
+         return GL_RG_INTEGER;
+      case MESA_FORMAT_R_FLOAT32:
+      case MESA_FORMAT_R_FLOAT16:
+      case MESA_FORMAT_R_UNORM16:
+      case MESA_FORMAT_R_UNORM8:
+      case MESA_FORMAT_R_SNORM16:
+      case MESA_FORMAT_R_SNORM8:
+         return GL_RED;
+      case MESA_FORMAT_R_SINT32:
+      case MESA_FORMAT_R_UINT32:
+      case MESA_FORMAT_R_SINT16:
+      case MESA_FORMAT_R_UINT16:
+      case MESA_FORMAT_R_SINT8:
+      case MESA_FORMAT_R_UINT8:
+         return GL_RED_INTEGER;
       default:
-         return GL_RGBA;
+         break;
       }
+
+      if (_mesa_is_format_integer(format))
+         return GL_RGBA_INTEGER;
+      else
+         return GL_RGBA;
    }
 }
 
@@ -883,29 +920,13 @@
       return GL_NONE;
    }
    else {
-      const GLenum format = fb->_ColorReadBuffer->Format;
-      const GLenum data_type = _mesa_get_format_datatype(format);
+      const mesa_format format = fb->_ColorReadBuffer->Format;
+      GLenum data_type;
+      GLuint comps;
 
-      if (format == MESA_FORMAT_B5G6R5_UNORM)
-         return GL_UNSIGNED_SHORT_5_6_5;
+      _mesa_uncompressed_format_to_type_and_comps(format, &data_type, &comps);
 
-      if (format == MESA_FORMAT_B10G10R10A2_UNORM ||
-          format == MESA_FORMAT_B10G10R10X2_UNORM ||
-          format == MESA_FORMAT_R10G10B10A2_UNORM ||
-          format == MESA_FORMAT_R10G10B10X2_UNORM)
-         return GL_UNSIGNED_INT_2_10_10_10_REV;
-
-      switch (data_type) {
-      case GL_SIGNED_NORMALIZED:
-         return GL_BYTE;
-      case GL_UNSIGNED_INT:
-      case GL_INT:
-      case GL_FLOAT:
-         return data_type;
-      case GL_UNSIGNED_NORMALIZED:
-      default:
-         return GL_UNSIGNED_BYTE;
-      }
+      return data_type;
    }
 }
 
@@ -936,14 +957,12 @@
 void
 _mesa_print_framebuffer(const struct gl_framebuffer *fb)
 {
-   GLuint i;
-
    fprintf(stderr, "Mesa Framebuffer %u at %p\n", fb->Name, (void *) fb);
    fprintf(stderr, "  Size: %u x %u  Status: %s\n", fb->Width, fb->Height,
            _mesa_enum_to_string(fb->_Status));
    fprintf(stderr, "  Attachments:\n");
 
-   for (i = 0; i < BUFFER_COUNT; i++) {
+   for (unsigned i = 0; i < BUFFER_COUNT; i++) {
       const struct gl_renderbuffer_attachment *att = &fb->Attachment[i];
       if (att->Type == GL_TEXTURE) {
          const struct gl_texture_image *texImage = att->Renderbuffer->TexImage;
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 488c32f..fd20ea2 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -93,7 +93,7 @@
              internalformat == GL_LUMINANCE_ALPHA ||
              internalformat == GL_LUMINANCE || internalformat == GL_ALPHA ||
              internalformat == GL_BGRA_EXT ||
-             (_mesa_is_es3_color_renderable(internalformat) &&
+             (_mesa_is_es3_color_renderable(ctx, internalformat) &&
               _mesa_is_es3_texture_filterable(ctx, internalformat));
    }
 
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 56d3421..db0079b 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -192,6 +192,7 @@
    GLenum value_enum;
    GLubyte value_ubyte;
    GLshort value_short;
+   GLuint value_uint;
 
    /* Sigh, see GL_COMPRESSED_TEXTURE_FORMATS_ARB handling */
    struct {
@@ -513,6 +514,10 @@
 EXTRA_EXT(ARB_compute_variable_group_size);
 EXTRA_EXT(KHR_robustness);
 EXTRA_EXT(ARB_sparse_buffer);
+EXTRA_EXT(NV_conservative_raster);
+EXTRA_EXT(NV_conservative_raster_dilate);
+EXTRA_EXT(NV_conservative_raster_pre_snap_triangles);
+EXTRA_EXT(ARB_sample_locations);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -1194,6 +1199,35 @@
          simple_mtx_unlock(&ctx->Shared->Mutex);
       }
       break;
+   /* GL_ARB_sample_locations */
+   case GL_SAMPLE_LOCATION_SUBPIXEL_BITS_ARB:
+   case GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB:
+   case GL_SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB:
+      {
+         GLuint bits, width, height;
+
+         if (ctx->NewState & _NEW_BUFFERS)
+            _mesa_update_state(ctx);
+
+         if (ctx->DrawBuffer->_Status != GL_FRAMEBUFFER_COMPLETE) {
+            v->value_uint = 0;
+            break;
+         }
+
+         ctx->Driver.GetProgrammableSampleCaps(ctx, ctx->DrawBuffer,
+                                               &bits, &width, &height);
+
+         if (d->pname == GL_SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB)
+            v->value_uint = width;
+         else if (d->pname == GL_SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB)
+            v->value_uint = height;
+         else
+            v->value_uint = bits;
+      }
+      break;
+   case GL_PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB:
+      v->value_uint = MAX_SAMPLE_LOCATION_TABLE_SIZE;
+      break;
    }
 }
 
@@ -3065,7 +3099,7 @@
       break;
 
    case TYPE_ENUM16:
-      params[0] = INT_TO_FIXED(((GLenum16 *) p)[0]);
+      params[0] = INT_TO_FIXED((GLint)(((GLenum16 *) p)[0]));
       break;
 
    case TYPE_INT_N:
diff --git a/src/mesa/main/get_hash_generator.py b/src/mesa/main/get_hash_generator.py
index ddd498f..facdccd 100644
--- a/src/mesa/main/get_hash_generator.py
+++ b/src/mesa/main/get_hash_generator.py
@@ -28,6 +28,8 @@
 # Generate a C header file containing hash tables of glGet parameter
 # names for each GL API. The generated file is to be included by glGet.c
 
+from __future__ import print_function
+
 import os, sys, imp, getopt
 from collections import defaultdict
 import get_hash_params
@@ -46,16 +48,16 @@
 gl_apis=set(["GL", "GL_CORE", "GLES", "GLES2", "GLES3", "GLES31", "GLES32"])
 
 def print_header():
-   print "typedef const unsigned short table_t[%d];\n" % (hash_table_size)
-   print "static const int prime_factor = %d, prime_step = %d;\n" % \
-          (prime_factor, prime_step)
+   print("typedef const unsigned short table_t[%d];\n" % (hash_table_size))
+   print("static const int prime_factor = %d, prime_step = %d;\n" % \
+          (prime_factor, prime_step))
 
 def print_params(params):
-   print "static const struct value_desc values[] = {"
+   print("static const struct value_desc values[] = {")
    for p in params:
-      print "    { %s, %s }," % (p[0], p[1])
+      print("    { %s, %s }," % (p[0], p[1]))
 
-   print "};\n"
+   print("};\n")
 
 def api_name(api):
    return "API_OPEN%s" % api
@@ -78,7 +80,7 @@
    return "table_" + api_name(api)
 
 def print_table(api, table):
-   print "static table_t %s = {" % (table_name(api))
+   print("static table_t %s = {" % (table_name(api)))
 
    # convert sparse (index, value) table into a dense table
    dense_table = [0] * hash_table_size
@@ -89,9 +91,9 @@
    for i in range(0, hash_table_size, row_size):
       row = dense_table[i : i + row_size]
       idx_val = ["%4d" % v for v in row]
-      print " " * 4 + ", ".join(idx_val) + ","
+      print(" " * 4 + ", ".join(idx_val) + ",")
 
-   print "};\n"
+   print("};\n")
 
 def print_tables(tables):
    for table in tables:
@@ -104,12 +106,12 @@
          i = api_index(api)
          dense_tables[i] = "&%s" % (tname)
 
-   print "static table_t *table_set[] = {"
+   print("static table_t *table_set[] = {")
    for expr in dense_tables:
-      print "   %s," % expr
-   print "};\n"
+      print("   %s," % expr)
+   print("};\n")
 
-   print "#define table(api) (*table_set[api])"
+   print("#define table(api) (*table_set[api])")
 
 # Merge tables with matching parameter lists (i.e. GL and GL_CORE)
 def merge_tables(tables):
@@ -199,7 +201,7 @@
 if __name__ == '__main__':
    try:
       (opts, args) = getopt.getopt(sys.argv[1:], "f:")
-   except Exception,e:
+   except Exception:
       show_usage()
 
    if len(args) != 0:
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 4af12d4..618e306 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -355,6 +355,19 @@
 # GL_ARB_robustness / GL_KHR_robustness
   [ "CONTEXT_ROBUST_ACCESS", "CONTEXT_ENUM16(Const.RobustAccess), extra_KHR_robustness" ],
   [ "RESET_NOTIFICATION_STRATEGY_ARB", "CONTEXT_ENUM16(Const.ResetStrategy), extra_KHR_robustness_or_GL" ],
+
+# GL_NV_conservative_raster
+  [ "SUBPIXEL_PRECISION_BIAS_X_BITS_NV", "CONTEXT_UINT(SubpixelPrecisionBias[0]), extra_NV_conservative_raster" ],
+  [ "SUBPIXEL_PRECISION_BIAS_Y_BITS_NV", "CONTEXT_UINT(SubpixelPrecisionBias[1]), extra_NV_conservative_raster" ],
+  [ "MAX_SUBPIXEL_PRECISION_BIAS_BITS_NV", "CONTEXT_UINT(Const.MaxSubpixelPrecisionBiasBits), extra_NV_conservative_raster" ],
+
+# GL_NV_conservative_raster_dilate
+  [ "CONSERVATIVE_RASTER_DILATE_RANGE_NV", "CONTEXT_FLOAT2(Const.ConservativeRasterDilateRange), extra_NV_conservative_raster_dilate" ],
+  [ "CONSERVATIVE_RASTER_DILATE_GRANULARITY_NV", "CONTEXT_FLOAT(Const.ConservativeRasterDilateGranularity), extra_NV_conservative_raster_dilate" ],
+  [ "CONSERVATIVE_RASTER_DILATE_NV", "CONTEXT_FLOAT(ConservativeRasterDilate), extra_NV_conservative_raster_dilate" ],
+
+# GL_NV_conservative_raster_pre_snap_triangles
+  [ "CONSERVATIVE_RASTER_MODE_NV", "CONTEXT_ENUM16(ConservativeRasterMode), extra_NV_conservative_raster_pre_snap_triangles" ],
 ]},
 
 # GLES3 is not a typo.
@@ -438,6 +451,11 @@
   [ "MAX_WINDOW_RECTANGLES_EXT", "CONTEXT_INT(Const.MaxWindowRectangles), extra_EXT_window_rectangles" ],
   [ "NUM_WINDOW_RECTANGLES_EXT", "CONTEXT_INT(Scissor.NumWindowRects), extra_EXT_window_rectangles" ],
   [ "WINDOW_RECTANGLE_MODE_EXT", "CONTEXT_ENUM16(Scissor.WindowRectMode), extra_EXT_window_rectangles" ],
+
+  # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation
+  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
 ]},
 
 { "apis": ["GLES", "GLES2"], "params": [
@@ -545,18 +563,13 @@
 
   # GL_NUM_SHADING_LANGUAGE_VERSIONS
   [ "NUM_SHADING_LANGUAGE_VERSIONS", "LOC_CUSTOM, TYPE_INT, 0, extra_version_43" ],
-]},
 
-# Enums in OpenGL Core profile and ES 3.0
-{ "apis": ["GL_CORE", "GLES3"], "params": [
-  # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation
-  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
-  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
-  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
-]},
+  # GL_ARB_sample_locations
+  [ "SAMPLE_LOCATION_SUBPIXEL_BITS_ARB", "LOC_CUSTOM, TYPE_UINT, 0, extra_ARB_sample_locations" ],
+  [ "SAMPLE_LOCATION_PIXEL_GRID_WIDTH_ARB", "LOC_CUSTOM, TYPE_UINT, 0, extra_ARB_sample_locations" ],
+  [ "SAMPLE_LOCATION_PIXEL_GRID_HEIGHT_ARB", "LOC_CUSTOM, TYPE_UINT, 0, extra_ARB_sample_locations" ],
+  [ "PROGRAMMABLE_SAMPLE_LOCATION_TABLE_SIZE_ARB", "LOC_CUSTOM, TYPE_UINT, 0, extra_ARB_sample_locations" ],
 
-# Enums in OpenGL Core profile and ES 3.1
-{ "apis": ["GL_CORE", "GLES31"], "params": [
 # GL_ARB_draw_indirect / GLES 3.1
   [ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect" ],
 
@@ -637,12 +650,11 @@
   [ "CONSERVATIVE_RASTERIZATION_INTEL", "CONTEXT_BOOL(IntelConservativeRasterization), extra_INTEL_conservative_rasterization" ],
 ]},
 
-{ "apis": ["GL_CORE", "GLES32"], "params": [
+# Enums in OpenGL and ES 3.2
+{ "apis": ["GL", "GL_CORE", "GLES32"], "params": [
   [ "MULTISAMPLE_LINE_WIDTH_RANGE_ARB", "CONTEXT_FLOAT2(Const.MinLineWidthAA), extra_ES32" ],
   [ "MULTISAMPLE_LINE_WIDTH_GRANULARITY_ARB", "CONTEXT_FLOAT(Const.LineWidthGranularity), extra_ES32" ],
-]},
 
-{ "apis": ["GL", "GL_CORE", "GLES32"], "params": [
 # GL 3.0 or ES 3.2
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
 ]},
@@ -746,7 +758,6 @@
   [ "PIXEL_MAP_S_TO_S_SIZE", "CONTEXT_INT(PixelMaps.StoS.Size), NO_EXTRA" ],
   [ "POINT_SIZE_GRANULARITY", "CONTEXT_FLOAT(Const.PointSizeGranularity), NO_EXTRA" ],
   [ "POLYGON_MODE", "CONTEXT_ENUM2(Polygon.FrontMode), NO_EXTRA" ],
-  [ "POLYGON_OFFSET_BIAS_EXT", "CONTEXT_FLOAT(Polygon.OffsetUnits), NO_EXTRA" ],
   [ "POLYGON_OFFSET_POINT", "CONTEXT_BOOL(Polygon.OffsetPoint), NO_EXTRA" ],
   [ "POLYGON_OFFSET_LINE", "CONTEXT_BOOL(Polygon.OffsetLine), NO_EXTRA" ],
   [ "POLYGON_SMOOTH", "CONTEXT_BOOL(Polygon.SmoothFlag), NO_EXTRA" ],
@@ -964,17 +975,13 @@
 
 # GL_ARB_sparse_buffer
   [ "SPARSE_BUFFER_PAGE_SIZE_ARB", "CONTEXT_INT(Const.SparseBufferPageSize), extra_ARB_sparse_buffer" ],
-]},
 
-# Enums restricted to OpenGL Core profile
-{ "apis": ["GL_CORE"], "params": [
 # GL_ARB_shader_subroutine
   [ "MAX_SUBROUTINES", "CONST(MAX_SUBROUTINES), NO_EXTRA" ],
   [ "MAX_SUBROUTINE_UNIFORM_LOCATIONS", "CONST(MAX_SUBROUTINE_UNIFORM_LOCATIONS), NO_EXTRA" ],
 
 # GL_ARB_indirect_parameters
   [ "PARAMETER_BUFFER_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_indirect_parameters" ],
-
-]}
+]},
 
 ]
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 2100fcf..667020c 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -2859,6 +2859,17 @@
             return GL_INVALID_OPERATION;
          break;
 
+      case GL_UNSIGNED_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) || internalFormat != GL_RGBA16)
+            return GL_INVALID_OPERATION;
+         break;
+
+      case GL_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) ||
+             internalFormat != GL_RGBA16_SNORM)
+            return GL_INVALID_OPERATION;
+         break;
+
       case GL_UNSIGNED_SHORT_4_4_4_4:
          switch (internalFormat) {
          case GL_RGBA:
@@ -2986,6 +2997,17 @@
             return GL_INVALID_OPERATION;
          break;
 
+      case GL_UNSIGNED_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) || internalFormat != GL_RGB16)
+            return GL_INVALID_OPERATION;
+         break;
+
+      case GL_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) ||
+             internalFormat != GL_RGB16_SNORM)
+            return GL_INVALID_OPERATION;
+         break;
+
       case GL_UNSIGNED_SHORT_5_6_5:
          switch (internalFormat) {
          case GL_RGB:
@@ -3117,6 +3139,17 @@
             return GL_INVALID_OPERATION;
          break;
 
+      case GL_UNSIGNED_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) || internalFormat != GL_RG16)
+            return GL_INVALID_OPERATION;
+         break;
+
+      case GL_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) ||
+             internalFormat != GL_RG16_SNORM)
+            return GL_INVALID_OPERATION;
+         break;
+
       case GL_HALF_FLOAT:
       case GL_HALF_FLOAT_OES:
          switch (internalFormat) {
@@ -3207,6 +3240,17 @@
             return GL_INVALID_OPERATION;
          break;
 
+      case GL_UNSIGNED_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) || internalFormat != GL_R16)
+            return GL_INVALID_OPERATION;
+         break;
+
+      case GL_SHORT:
+         if (!_mesa_has_EXT_texture_norm16(ctx) ||
+             internalFormat != GL_R16_SNORM)
+            return GL_INVALID_OPERATION;
+         break;
+
       case GL_HALF_FLOAT:
       case GL_HALF_FLOAT_OES:
          switch (internalFormat) {
@@ -3706,7 +3750,8 @@
  * is marked "Color Renderable" in Table 8.10 of the ES 3.2 specification.
  */
 bool
-_mesa_is_es3_color_renderable(GLenum internal_format)
+_mesa_is_es3_color_renderable(const struct gl_context *ctx,
+                              GLenum internal_format)
 {
    switch (internal_format) {
    case GL_R8:
@@ -3745,6 +3790,10 @@
    case GL_RGBA32I:
    case GL_RGBA32UI:
       return true;
+   case GL_R16:
+   case GL_RG16:
+   case GL_RGBA16:
+      return _mesa_has_EXT_texture_norm16(ctx);
    default:
       return false;
    }
@@ -3780,6 +3829,15 @@
    case GL_R11F_G11F_B10F:
    case GL_RGB9_E5:
       return true;
+   case GL_R16:
+   case GL_R16_SNORM:
+   case GL_RG16:
+   case GL_RG16_SNORM:
+   case GL_RGB16:
+   case GL_RGB16_SNORM:
+   case GL_RGBA16:
+   case GL_RGBA16_SNORM:
+      return _mesa_has_EXT_texture_norm16(ctx);
    case GL_R32F:
    case GL_RG32F:
    case GL_RGB32F:
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 844f1e2..5a21317 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -155,7 +155,8 @@
                                       GLenum gl_format, GLenum type);
 
 extern bool
-_mesa_is_es3_color_renderable(GLenum internal_format);
+_mesa_is_es3_color_renderable(const struct gl_context *ctx,
+                              GLenum internal_format);
 
 extern bool
 _mesa_is_es3_texture_filterable(const struct gl_context *ctx,
diff --git a/src/mesa/main/glheader.h b/src/mesa/main/glheader.h
index 1664882..1a91d54 100644
--- a/src/mesa/main/glheader.h
+++ b/src/mesa/main/glheader.h
@@ -160,6 +160,9 @@
 #define GL_HALF_FLOAT_OES                                       0x8D61
 #endif
 
+#ifndef GL_MESA_framebuffer_flip_y
+#define GL_FRAMEBUFFER_FLIP_Y_MESA                              0x8BBB
+#endif
 
 /**
  * Internal token to represent a GLSL shader program (a collection of
diff --git a/src/mesa/main/glspirv.c b/src/mesa/main/glspirv.c
index c585bc5..87075a5 100644
--- a/src/mesa/main/glspirv.c
+++ b/src/mesa/main/glspirv.c
@@ -173,6 +173,13 @@
       prog->_LinkedShaders[shader_type] = linked;
       prog->data->linked_stages |= 1 << shader_type;
    }
+
+   int last_vert_stage =
+      util_last_bit(prog->data->linked_stages &
+                    ((1 << (MESA_SHADER_GEOMETRY + 1)) - 1));
+
+   if (last_vert_stage)
+      prog->last_vert_prog = prog->_LinkedShaders[last_vert_stage - 1]->Program;
 }
 
 nir_shader *
@@ -206,6 +213,7 @@
    }
 
    const struct spirv_to_nir_options spirv_options = {
+      .lower_workgroup_access_to_offsets = true,
       .caps = ctx->Const.SpirVCapabilities
    };
 
@@ -230,6 +238,14 @@
                       prog->Name);
    nir_validate_shader(nir);
 
+   NIR_PASS_V(nir, nir_copy_prop);
+
+   /* Split member structs.  We do this before lower_io_to_temporaries so that
+    * it doesn't lower system values to temporaries by accident.
+    */
+   NIR_PASS_V(nir, nir_split_var_copies);
+   NIR_PASS_V(nir, nir_split_per_member_structs);
+
    return nir;
 }
 
diff --git a/src/mesa/main/glspirv.h b/src/mesa/main/glspirv.h
index cbcd3c0..8025c17 100644
--- a/src/mesa/main/glspirv.h
+++ b/src/mesa/main/glspirv.h
@@ -30,6 +30,7 @@
 extern "C" {
 #endif
 
+struct gl_shader_program;
 struct gl_context;
 struct gl_shader;
 
diff --git a/src/mesa/main/glthread.c b/src/mesa/main/glthread.c
index c71c037..18a83bb 100644
--- a/src/mesa/main/glthread.c
+++ b/src/mesa/main/glthread.c
@@ -73,7 +73,7 @@
    if (!glthread)
       return;
 
-   if (!util_queue_init(&glthread->queue, "glthread", MARSHAL_MAX_BATCHES - 2,
+   if (!util_queue_init(&glthread->queue, "gl", MARSHAL_MAX_BATCHES - 2,
                         1, 0)) {
       free(glthread);
       return;
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 9b6eee4..88e6baa 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1470,9 +1470,9 @@
 static void
 make_2d_mipmap(GLenum datatype, GLuint comps, GLint border,
                GLint srcWidth, GLint srcHeight,
-	       const GLubyte *srcPtr, GLint srcRowStride,
+               const GLubyte *srcPtr, GLint srcRowStride,
                GLint dstWidth, GLint dstHeight,
-	       GLubyte *dstPtr, GLint dstRowStride)
+               GLubyte *dstPtr, GLint dstRowStride)
 {
    const GLint bpt = bytes_per_pixel(datatype, comps);
    const GLint srcWidthNB = srcWidth - 2 * border;  /* sizes w/out border */
@@ -1761,17 +1761,17 @@
       assert(srcHeight == 1);
       assert(dstHeight == 1);
       for (i = 0; i < dstDepth; i++) {
-	 make_1d_mipmap(datatype, comps, border,
-			srcWidth, srcData[i],
-			dstWidth, dstData[i]);
+         make_1d_mipmap(datatype, comps, border,
+                        srcWidth, srcData[i],
+                        dstWidth, dstData[i]);
       }
       break;
    case GL_TEXTURE_2D_ARRAY_EXT:
    case GL_TEXTURE_CUBE_MAP_ARRAY:
       for (i = 0; i < dstDepth; i++) {
-	 make_2d_mipmap(datatype, comps, border,
-			srcWidth, srcHeight, srcData[i], srcRowStride,
-			dstWidth, dstHeight, dstData[i], dstRowStride);
+         make_2d_mipmap(datatype, comps, border,
+                        srcWidth, srcHeight, srcData[i], srcRowStride,
+                        dstWidth, dstHeight, dstData[i], dstRowStride);
       }
       break;
    case GL_TEXTURE_RECTANGLE_NV:
@@ -1800,7 +1800,7 @@
       *dstWidth = srcWidth; /* can't go smaller */
    }
 
-   if ((srcHeight - 2 * border > 1) && 
+   if ((srcHeight - 2 * border > 1) &&
        target != GL_TEXTURE_1D_ARRAY_EXT &&
        target != GL_PROXY_TEXTURE_1D_ARRAY_EXT) {
       *dstHeight = (srcHeight - 2 * border) / 2 + 2 * border;
@@ -1944,9 +1944,9 @@
 
 static void
 generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
-			     struct gl_texture_object *texObj,
-			     const struct gl_texture_image *srcImage,
-			     GLuint maxLevel)
+                             struct gl_texture_object *texObj,
+                             const struct gl_texture_image *srcImage,
+                             GLuint maxLevel)
 {
    GLuint level;
    GLenum datatype;
@@ -1983,10 +1983,10 @@
       dstDepth = dstImage->Depth;
 
       if (target == GL_TEXTURE_1D_ARRAY) {
-	 srcDepth = srcHeight;
-	 dstDepth = dstHeight;
-	 srcHeight = 1;
-	 dstHeight = 1;
+         srcDepth = srcHeight;
+         dstDepth = dstHeight;
+         srcHeight = 1;
+         dstHeight = 1;
       }
 
       /* Map src texture image slices */
@@ -2064,9 +2064,9 @@
 
 static void
 generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *srcImage,
-			   GLuint maxLevel)
+                           struct gl_texture_object *texObj,
+                           struct gl_texture_image *srcImage,
+                           GLuint maxLevel)
 {
    GLuint level;
    mesa_format temp_format;
@@ -2079,8 +2079,8 @@
 
    /* only two types of compressed textures at this time */
    assert(texObj->Target == GL_TEXTURE_2D ||
-	  texObj->Target == GL_TEXTURE_2D_ARRAY ||
-	  texObj->Target == GL_TEXTURE_CUBE_MAP ||
+          texObj->Target == GL_TEXTURE_2D_ARRAY ||
+          texObj->Target == GL_TEXTURE_CUBE_MAP ||
           texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY);
 
    /*
@@ -2173,11 +2173,11 @@
       temp_dst_img_stride = _mesa_format_image_size(temp_format, dstWidth,
                                                     dstHeight, 1);
       if (!temp_dst) {
-	 temp_dst = malloc(temp_dst_img_stride * dstDepth);
-	 if (!temp_dst) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "generate mipmaps");
+         temp_dst = malloc(temp_dst_img_stride * dstDepth);
+         if (!temp_dst) {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "generate mipmaps");
             goto end;
-	 }
+         }
       }
 
       /* for 2D arrays, setup array[depth] of slice pointers */
@@ -2206,9 +2206,9 @@
 
       /* swap src and dest pointers */
       {
-	 GLubyte *temp = temp_src;
-	 temp_src = temp_dst;
-	 temp_dst = temp;
+         GLubyte *temp = temp_src;
+         temp_src = temp_dst;
+         temp_dst = temp;
          temp_src_row_stride = temp_dst_row_stride;
          temp_src_img_stride = temp_dst_img_stride;
       }
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index e8576bf..a9e37dd 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1425,6 +1425,32 @@
    unsigned _ElementSize:8; /**< Size of each element in bytes */
    /** Index into gl_vertex_array_object::BufferBinding[] array */
    unsigned BufferBindingIndex:6;
+
+   /**
+    * Derived effective buffer binding index
+    *
+    * Index into the gl_vertex_buffer_binding array of the vao.
+    * Similar to BufferBindingIndex, but with the mapping of the
+    * position/generic0 attributes applied and with identical
+    * gl_vertex_buffer_binding entries collapsed to a single
+    * entry within the vao.
+    *
+    * The value is valid past calling _mesa_update_vao_derived_arrays.
+    * Note that _mesa_update_vao_derived_arrays is called when binding
+    * the VAO to Array._DrawVAO.
+    */
+   unsigned _EffBufferBindingIndex:6;
+   /**
+    * Derived effective relative offset.
+    *
+    * Relative offset to the effective buffers offset in
+    * gl_vertex_buffer_binding::_EffOffset.
+    *
+    * The value is valid past calling _mesa_update_vao_derived_arrays.
+    * Note that _mesa_update_vao_derived_arrays is called when binding
+    * the VAO to Array._DrawVAO.
+    */
+   GLushort _EffRelativeOffset;
 };
 
 
@@ -1440,20 +1466,35 @@
    GLuint InstanceDivisor;             /**< GL_ARB_instanced_arrays */
    struct gl_buffer_object *BufferObj; /**< GL_ARB_vertex_buffer_object */
    GLbitfield _BoundArrays;            /**< Arrays bound to this binding point */
-};
 
-
-/**
- * Vertex array information which is derived from gl_array_attributes
- * and gl_vertex_buffer_binding information.  Used by the VBO module and
- * device drivers.
- */
-struct gl_vertex_array
-{
-   /** Vertex attribute array */
-   const struct gl_array_attributes *VertexAttrib;
-   /** Vertex buffer binding */
-   const struct gl_vertex_buffer_binding *BufferBinding;
+   /**
+    * Derived effective bound arrays.
+    *
+    * The effective binding handles enabled arrays past the
+    * position/generic0 attribute mapping and reduces the referred
+    * gl_vertex_buffer_binding entries to a unique subset.
+    *
+    * The value is valid past calling _mesa_update_vao_derived_arrays.
+    * Note that _mesa_update_vao_derived_arrays is called when binding
+    * the VAO to Array._DrawVAO.
+    */
+   GLbitfield _EffBoundArrays;
+   /**
+    * Derived offset.
+    *
+    * The absolute offset relative to which some attributes can be
+    * collapsed into this unique effective binding.
+    * For user space array bindings this contains the smallest pointer value
+    * in the bound and interleaved arrays.
+    * For VBO bindings this contains an offset that lets the attributes'
+    * _EffRelativeOffset stay positive and in bounds with
+    * Const.MaxVertexAttribRelativeOffset.
+    *
+    * The value is valid past calling _mesa_update_vao_derived_arrays.
+    * Note that _mesa_update_vao_derived_arrays is called when binding
+    * the VAO to Array._DrawVAO.
+    */
+   GLintptr _EffOffset;
 };
 
 
@@ -1494,6 +1535,15 @@
    /** Mask of VERT_BIT_* values indicating which arrays are enabled */
    GLbitfield _Enabled;
 
+   /**
+    * Mask of VERT_BIT_* enabled arrays past position/generic0 mapping
+    *
+    * The value is valid past calling _mesa_update_vao_derived_arrays.
+    * Note that _mesa_update_vao_derived_arrays is called when binding
+    * the VAO to Array._DrawVAO.
+    */
+   GLbitfield _EffEnabledVBO;
+
    /** Denotes the way the position/generic0 attribute is mapped */
    gl_attribute_map_mode _AttributeMapMode;
 
@@ -1564,12 +1614,6 @@
     */
    struct gl_vertex_array_object *_EmptyVAO;
 
-   /**
-    * Vertex arrays as consumed by a driver.
-    * The array pointer is set up only by the VBO module.
-    */
-   const struct gl_vertex_array *_DrawArrays; /**< 0..VERT_ATTRIB_MAX-1 */
-
    /** Legal array datatypes and the API for which they have been computed */
    GLbitfield LegalTypesMask;
    gl_api LegalTypesMaskAPI;
@@ -2554,6 +2598,10 @@
    bool uses_gl_fragcoord;
 
    bool PostDepthCoverage;
+   bool PixelInterlockOrdered;
+   bool PixelInterlockUnordered;
+   bool SampleInterlockOrdered;
+   bool SampleInterlockUnordered;
    bool InnerCoverage;
 
    /**
@@ -2844,6 +2892,12 @@
 
    /* Mask of stages this program was linked against */
    unsigned linked_stages;
+
+   /* Whether the shaders of this program are loaded from SPIR-V binaries
+    * (all have the SPIR_V_BINARY_ARB state). This was introduced by the
+    * ARB_gl_spirv extension.
+    */
+   bool spirv;
 };
 
 /**
@@ -3440,6 +3494,11 @@
    GLenum16 ColorDrawBuffer[MAX_DRAW_BUFFERS];
    GLenum16 ColorReadBuffer;
 
+   /* GL_ARB_sample_locations */
+   GLfloat *SampleLocationTable; /**< If NULL, no table has been specified */
+   GLboolean ProgrammableSampleLocations;
+   GLboolean SampleLocationPixelGrid;
+
    /** Computed from ColorDraw/ReadBuffer above */
    GLuint _NumColorDrawBuffers;
    gl_buffer_index _ColorDrawBufferIndexes[MAX_DRAW_BUFFERS];
@@ -3447,6 +3506,9 @@
    struct gl_renderbuffer *_ColorDrawBuffers[MAX_DRAW_BUFFERS];
    struct gl_renderbuffer *_ColorReadBuffer;
 
+   /* GL_MESA_framebuffer_flip_y */
+   bool FlipY;
+
    /** Delete this framebuffer */
    void (*Delete)(struct gl_framebuffer *fb);
 };
@@ -3648,6 +3710,7 @@
    GLuint MaxGeometryTotalOutputComponents;
 
    GLuint GLSLVersion;  /**< Desktop GLSL version supported (ex: 120 = 1.20) */
+   GLuint GLSLVersionCompat;  /**< Desktop compat GLSL version supported  */
 
    /**
     * Changes default GLSL extension behavior from "error" to "warn".  It's out
@@ -3667,6 +3730,18 @@
    GLboolean AllowGLSLExtensionDirectiveMidShader;
 
    /**
+    * Allow builtins as part of constant expressions. This was not allowed
+    * until GLSL 1.20; this option allows it everywhere.
+    */
+   GLboolean AllowGLSLBuiltinConstantExpression;
+
+   /**
+    * Allow some relaxation of GLSL ES shader restrictions. This encompasses
+    * a number of relaxations to the ES shader rules.
+    */
+   GLboolean AllowGLSLRelaxedES;
+
+   /**
     * Allow GLSL built-in variables to be redeclared verbatim
     */
    GLboolean AllowGLSLBuiltinVariableRedeclaration;
@@ -3969,6 +4044,13 @@
    /** GL_ARB_get_program_binary */
    GLuint NumProgramBinaryFormats;
 
+   /** GL_NV_conservative_raster */
+   GLuint MaxSubpixelPrecisionBiasBits;
+
+   /** GL_NV_conservative_raster_dilate */
+   GLfloat ConservativeRasterDilateRange[2];
+   GLfloat ConservativeRasterDilateGranularity;
+
    /** Is the drivers uniform storage packed or padded to 16 bytes. */
    bool PackedDriverUniformStorage;
 
@@ -4021,6 +4103,7 @@
    GLboolean ARB_fragment_shader;
    GLboolean ARB_framebuffer_no_attachments;
    GLboolean ARB_framebuffer_object;
+   GLboolean ARB_fragment_shader_interlock;
    GLboolean ARB_enhanced_layouts;
    GLboolean ARB_explicit_attrib_location;
    GLboolean ARB_explicit_uniform_location;
@@ -4042,6 +4125,7 @@
    GLboolean ARB_post_depth_coverage;
    GLboolean ARB_query_buffer_object;
    GLboolean ARB_robust_buffer_access_behavior;
+   GLboolean ARB_sample_locations;
    GLboolean ARB_sample_shading;
    GLboolean ARB_seamless_cube_map;
    GLboolean ARB_shader_atomic_counter_ops;
@@ -4147,6 +4231,7 @@
    GLboolean OES_standard_derivatives;
    GLboolean OES_texture_buffer;
    GLboolean OES_texture_cube_map_array;
+   GLboolean OES_texture_view;
    GLboolean OES_viewport_array;
    /* vendor extensions */
    GLboolean AMD_performance_monitor;
@@ -4161,7 +4246,6 @@
    GLboolean ATI_texture_mirror_once;
    GLboolean ATI_texture_env_combine3;
    GLboolean ATI_fragment_shader;
-   GLboolean ATI_separate_stencil;
    GLboolean GREMEDY_string_marker;
    GLboolean INTEL_conservative_rasterization;
    GLboolean INTEL_performance_query;
@@ -4171,6 +4255,7 @@
    GLboolean KHR_texture_compression_astc_hdr;
    GLboolean KHR_texture_compression_astc_ldr;
    GLboolean KHR_texture_compression_astc_sliced_3d;
+   GLboolean MESA_framebuffer_flip_y;
    GLboolean MESA_tile_raster_order;
    GLboolean MESA_pack_invert;
    GLboolean EXT_shader_framebuffer_fetch;
@@ -4186,6 +4271,10 @@
    GLboolean NV_texture_env_combine4;
    GLboolean NV_texture_rectangle;
    GLboolean NV_vdpau_interop;
+   GLboolean NV_conservative_raster;
+   GLboolean NV_conservative_raster_dilate;
+   GLboolean NV_conservative_raster_pre_snap_triangles;
+   GLboolean NV_conservative_raster_pre_snap;
    GLboolean NVX_gpu_memory_info;
    GLboolean TDFX_texture_compression_FXT1;
    GLboolean OES_EGL_image;
@@ -4346,7 +4435,7 @@
    GLvertexformat ListVtxfmt;
 
    GLubyte ActiveAttribSize[VERT_ATTRIB_MAX];
-   GLfloat CurrentAttrib[VERT_ATTRIB_MAX][4];
+   GLfloat CurrentAttrib[VERT_ATTRIB_MAX][8];
 
    GLubyte ActiveMaterialSize[MAT_ATTRIB_MAX];
    GLfloat CurrentMaterial[MAT_ATTRIB_MAX][4];
@@ -4418,6 +4507,17 @@
    uint64_t NewIntelConservativeRasterization;
 
    /**
+    * gl_context::NvConservativeRasterization
+    */
+   uint64_t NewNvConservativeRasterization;
+
+   /**
+    * gl_context::ConservativeRasterMode/ConservativeRasterDilate
+    * gl_context::SubpixelPrecisionBias
+    */
+   uint64_t NewNvConservativeRasterizationParams;
+
+   /**
     * gl_context::Scissor::WindowRects
     */
    uint64_t NewWindowRectangles;
@@ -4490,6 +4590,9 @@
 
    /** Shader constants (uniforms, program parameters, state constants) */
    uint64_t NewShaderConstants[MESA_SHADER_STAGES];
+
+   /** Programmable sample location state for gl_context::DrawBuffer */
+   uint64_t NewSampleLocations;
 };
 
 struct gl_buffer_binding
@@ -4725,6 +4828,7 @@
    struct gl_texture_attrib	Texture;	/**< Texture attributes */
    struct gl_transform_attrib	Transform;	/**< Transformation attributes */
    struct gl_viewport_attrib	ViewportArray[MAX_VIEWPORTS];	/**< Viewport attributes */
+   GLuint SubpixelPrecisionBias[2];	/**< Viewport attributes */
    /*@}*/
 
    /** \name Client attribute stack */
@@ -4905,7 +5009,10 @@
    GLboolean TextureFormatSupported[MESA_FORMAT_COUNT];
 
    GLboolean RasterDiscard;  /**< GL_RASTERIZER_DISCARD */
-   GLboolean IntelConservativeRasterization; /**< GL_INTEL_CONSERVATIVE_RASTERIZATION */
+   GLboolean IntelConservativeRasterization; /**< GL_CONSERVATIVE_RASTERIZATION_INTEL */
+   GLboolean ConservativeRasterization; /**< GL_CONSERVATIVE_RASTERIZATION_NV */
+   GLfloat ConservativeRasterDilate;
+   GLenum16 ConservativeRasterMode;
 
    /** Does glVertexAttrib(0) alias glVertex()? */
    bool _AttribZeroAliasesVertex;
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index dfe6a37..d30e5c9 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -94,13 +94,31 @@
 
       ctx->Driver.GetSamplePosition(ctx, ctx->DrawBuffer, index, val);
 
-      /* winsys FBOs are upside down */
-      if (_mesa_is_winsys_fbo(ctx->DrawBuffer))
+      /* FBOs can be upside down (winsys always are) */
+      if (ctx->DrawBuffer->FlipY)
          val[1] = 1.0f - val[1];
 
       return;
    }
 
+   case GL_PROGRAMMABLE_SAMPLE_LOCATION_ARB:
+      if (!ctx->Extensions.ARB_sample_locations) {
+         _mesa_error( ctx, GL_INVALID_ENUM, "glGetMultisamplefv(pname)" );
+         return;
+      }
+
+      if (index >= MAX_SAMPLE_LOCATION_TABLE_SIZE * 2) {
+         _mesa_error( ctx, GL_INVALID_VALUE, "glGetMultisamplefv(index)" );
+         return;
+      }
+
+      if (ctx->DrawBuffer->SampleLocationTable)
+         *val = ctx->DrawBuffer->SampleLocationTable[index];
+      else
+         *val = 0.5f;
+
+      return;
+
    default:
       _mesa_error( ctx, GL_INVALID_ENUM, "glGetMultisamplefv(pname)" );
       return;
diff --git a/src/mesa/main/performance_monitor.c b/src/mesa/main/performance_monitor.c
index 65ea843..253d42d 100644
--- a/src/mesa/main/performance_monitor.c
+++ b/src/mesa/main/performance_monitor.c
@@ -480,14 +480,18 @@
    if (enable) {
       /* Enable the counters */
       for (i = 0; i < numCounters; i++) {
-         ++m->ActiveGroups[group];
-         BITSET_SET(m->ActiveCounters[group], counterList[i]);
+         if (!BITSET_TEST(m->ActiveCounters[group], counterList[i])) {
+            ++m->ActiveGroups[group];
+            BITSET_SET(m->ActiveCounters[group], counterList[i]);
+         }
       }
    } else {
       /* Disable the counters */
       for (i = 0; i < numCounters; i++) {
-         --m->ActiveGroups[group];
-         BITSET_CLEAR(m->ActiveCounters[group], counterList[i]);
+         if (BITSET_TEST(m->ActiveCounters[group], counterList[i])) {
+            --m->ActiveGroups[group];
+            BITSET_CLEAR(m->ActiveCounters[group], counterList[i]);
+         }
       }
    }
 }
@@ -542,7 +546,7 @@
     *  when a performance monitor is not currently started."
     */
    if (!m->Active) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBeginPerfMonitor(not active)");
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glEndPerfMonitor(not active)");
       return;
    }
 
diff --git a/src/mesa/main/polygon.c b/src/mesa/main/polygon.c
index effd4d2..ae8f2a8 100644
--- a/src/mesa/main/polygon.c
+++ b/src/mesa/main/polygon.c
@@ -329,14 +329,6 @@
 }
 
 void GLAPIENTRY
-_mesa_PolygonOffsetEXT( GLfloat factor, GLfloat bias )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   /* XXX mult by DepthMaxF here??? */
-   _mesa_PolygonOffset(factor, bias * ctx->DrawBuffer->_DepthMaxF );
-}
-
-void GLAPIENTRY
 _mesa_PolygonOffsetClampEXT( GLfloat factor, GLfloat units, GLfloat clamp )
 {
    GET_CURRENT_CONTEXT(ctx);
diff --git a/src/mesa/main/polygon.h b/src/mesa/main/polygon.h
index 1b81868..a7e383d 100644
--- a/src/mesa/main/polygon.h
+++ b/src/mesa/main/polygon.h
@@ -61,9 +61,6 @@
 _mesa_PolygonOffset( GLfloat factor, GLfloat units );
 
 extern void GLAPIENTRY
-_mesa_PolygonOffsetEXT( GLfloat factor, GLfloat bias );
-
-extern void GLAPIENTRY
 _mesa_PolygonOffsetClampEXT( GLfloat factor, GLfloat units, GLfloat clamp );
 
 extern void GLAPIENTRY
diff --git a/src/mesa/main/program_binary.c b/src/mesa/main/program_binary.c
index af94b79..7390fef 100644
--- a/src/mesa/main/program_binary.c
+++ b/src/mesa/main/program_binary.c
@@ -171,24 +171,23 @@
 write_program_payload(struct gl_context *ctx, struct blob *blob,
                       struct gl_shader_program *sh_prog)
 {
-   bool serialize[MESA_SHADER_STAGES];
    for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
       struct gl_linked_shader *shader = sh_prog->_LinkedShaders[stage];
-      serialize[stage] = shader && shader->Program->driver_cache_blob == NULL;
-      if (serialize[stage])
-         ctx->Driver.ProgramBinarySerializeDriverBlob(ctx, shader->Program);
+      if (shader)
+         ctx->Driver.ProgramBinarySerializeDriverBlob(ctx, sh_prog,
+                                                      shader->Program);
    }
 
    serialize_glsl_program(blob, ctx, sh_prog);
 
    for (unsigned stage = 0; stage < MESA_SHADER_STAGES; stage++) {
-      if (!serialize[stage])
-         continue;
-
-      struct gl_program *prog = sh_prog->_LinkedShaders[stage]->Program;
-      ralloc_free(prog->driver_cache_blob);
-      prog->driver_cache_blob = NULL;
-      prog->driver_cache_blob_size = 0;
+      struct gl_linked_shader *shader = sh_prog->_LinkedShaders[stage];
+      if (shader) {
+         struct gl_program *prog = sh_prog->_LinkedShaders[stage]->Program;
+         ralloc_free(prog->driver_cache_blob);
+         prog->driver_cache_blob = NULL;
+         prog->driver_cache_blob_size = 0;
+      }
    }
 }
 
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 41024d6..fedd1f1 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -31,6 +31,7 @@
 #include "main/context.h"
 #include "program_resource.h"
 #include "compiler/glsl/ir_uniform.h"
+
 static bool
 supported_interface_enum(struct gl_context *ctx, GLenum iface)
 {
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index 7ee820a..7547fa1 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -698,6 +698,7 @@
             *params = ctx->Const.QueryCounterBits.SamplesPassed;
             break;
          case GL_ANY_SAMPLES_PASSED:
+         case GL_ANY_SAMPLES_PASSED_CONSERVATIVE:
             /* The minimum value of this is 1 if it's nonzero, and the value
              * is only ever GL_TRUE or GL_FALSE, so no sense in reporting more
              * bits.
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index 6ce340d..e8c28d8 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -106,7 +106,8 @@
       /* For blit-based ReadPixels packing, the clamping is done automatically
        * unless the type is float. */
       if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) &&
-          (type == GL_FLOAT || type == GL_HALF_FLOAT)) {
+          (type == GL_FLOAT || type == GL_HALF_FLOAT ||
+           type == GL_UNSIGNED_INT_10F_11F_11F_REV)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
    }
@@ -114,7 +115,8 @@
       /* For CPU-based ReadPixels packing, the clamping must always be done
        * for non-float types, */
       if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) ||
-          (type != GL_FLOAT && type != GL_HALF_FLOAT)) {
+          (type != GL_FLOAT && type != GL_HALF_FLOAT &&
+           type != GL_UNSIGNED_INT_10F_11F_11F_REV)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
    }
@@ -232,7 +234,7 @@
 					   format, type, 0, 0);
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-			       &map, &stride);
+			       &map, &stride, ctx->ReadBuffer->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return GL_TRUE;  /* don't bother trying the slow path */
@@ -283,7 +285,7 @@
       return GL_FALSE;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-			       &map, &stride);
+			       &map, &stride, fb->FlipY);
 
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
@@ -341,7 +343,7 @@
 					   GL_DEPTH_COMPONENT, type, 0, 0);
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-			       &map, &stride);
+			       &map, &stride, fb->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return;
@@ -389,7 +391,7 @@
       return;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-			       &map, &stride);
+			       &map, &stride, fb->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return;
@@ -460,7 +462,7 @@
 
    /* Map the source render buffer */
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-                               &map, &rb_stride);
+                               &map, &rb_stride, fb->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return;
@@ -650,7 +652,7 @@
       return GL_FALSE;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height, GL_MAP_READ_BIT,
-			       &map, &stride);
+			       &map, &stride, fb->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return GL_TRUE;  /* don't bother trying the slow path */
@@ -690,14 +692,14 @@
       return GL_FALSE;
 
    ctx->Driver.MapRenderbuffer(ctx, depthRb, x, y, width, height,
-			       GL_MAP_READ_BIT, &depthMap, &depthStride);
+			       GL_MAP_READ_BIT, &depthMap, &depthStride, fb->FlipY);
    if (!depthMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return GL_TRUE;  /* don't bother trying the slow path */
    }
 
    ctx->Driver.MapRenderbuffer(ctx, stencilRb, x, y, width, height,
-			       GL_MAP_READ_BIT, &stencilMap, &stencilStride);
+			       GL_MAP_READ_BIT, &stencilMap, &stencilStride, fb->FlipY);
    if (!stencilMap) {
       ctx->Driver.UnmapRenderbuffer(ctx, depthRb);
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
@@ -754,7 +756,7 @@
     * If one buffer, only map it once.
     */
    ctx->Driver.MapRenderbuffer(ctx, depthRb, x, y, width, height,
-			       GL_MAP_READ_BIT, &depthMap, &depthStride);
+			       GL_MAP_READ_BIT, &depthMap, &depthStride, fb->FlipY);
    if (!depthMap) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
       return;
@@ -763,7 +765,7 @@
    if (stencilRb != depthRb) {
       ctx->Driver.MapRenderbuffer(ctx, stencilRb, x, y, width, height,
                                   GL_MAP_READ_BIT, &stencilMap,
-                                  &stencilStride);
+                                  &stencilStride, fb->FlipY);
       if (!stencilMap) {
          ctx->Driver.UnmapRenderbuffer(ctx, depthRb);
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
@@ -901,7 +903,7 @@
 
 
 static GLenum
-read_pixels_es3_error_check(GLenum format, GLenum type,
+read_pixels_es3_error_check(struct gl_context *ctx, GLenum format, GLenum type,
                             const struct gl_renderbuffer *rb)
 {
    const GLenum internalFormat = rb->InternalFormat;
@@ -927,6 +929,16 @@
          return GL_NO_ERROR;
       if (internalFormat == GL_RGB10_A2UI && type == GL_UNSIGNED_BYTE)
          return GL_NO_ERROR;
+      if (type == GL_UNSIGNED_SHORT) {
+         switch (internalFormat) {
+         case GL_R16:
+         case GL_RG16:
+         case GL_RGB16:
+         case GL_RGBA16:
+            if (_mesa_has_EXT_texture_norm16(ctx))
+               return GL_NO_ERROR;
+         }
+      }
       break;
    case GL_BGRA:
       /* GL_EXT_read_format_bgra */
@@ -1049,7 +1061,7 @@
                }
             }
          } else {
-            err = read_pixels_es3_error_check(format, type, rb);
+            err = read_pixels_es3_error_check(ctx, format, type, rb);
          }
 
          if (err != GL_NO_ERROR) {
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 86064a9..11ecd71 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -533,6 +533,11 @@
 
       /* Resource basename. */
       const char *rname = _mesa_program_resource_name(res);
+
+      /* With ARB_gl_spirv, a resource may have no reflected name. */
+      if (rname == NULL)
+         continue;
+
       unsigned baselen = strlen(rname);
       unsigned baselen_without_array_index = baselen;
       const char *rname_last_square_bracket = strrchr(rname, '[');
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index c77591b..f708084 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -840,7 +840,7 @@
       *params = shProg->BinaryRetreivableHint;
       return;
    case GL_PROGRAM_BINARY_LENGTH:
-      if (ctx->Const.NumProgramBinaryFormats == 0) {
+      if (ctx->Const.NumProgramBinaryFormats == 0 || !shProg->data->LinkStatus) {
          *params = 0;
       } else {
          _mesa_get_program_binary_length(ctx, shProg, params);
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index 054ab1d..31ac852 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -430,9 +430,8 @@
     * ARB_shader_image_load_store extension, c.f. table 3.21 of the OpenGL 4.2
     * specification.
     *
-    * These can be supported by GLES 3.1 with GL_NV_image_formats &
-    * GL_EXT_texture_norm16 extensions but we don't have support for the
-    * latter in Mesa yet.
+    * The following formats are supported by GLES 3.1 with the
+    * GL_NV_image_formats & GL_EXT_texture_norm16 extensions.
     */
    case GL_RGBA16:
    case GL_RGBA16_SNORM:
@@ -440,7 +439,7 @@
    case GL_RG16_SNORM:
    case GL_R16:
    case GL_R16_SNORM:
-      return _mesa_is_desktop_gl(ctx);
+      return _mesa_is_desktop_gl(ctx) || _mesa_has_EXT_texture_norm16(ctx);
 
    default:
       return false;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index be8f3f3..097cd9e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -222,41 +222,49 @@
 }
 
 
+static GLbitfield
+update_single_program_constants(struct gl_context *ctx,
+                                struct gl_program *prog,
+                                gl_shader_stage stage)
+{
+   if (prog) {
+      const struct gl_program_parameter_list *params = prog->Parameters;
+      if (params && params->StateFlags & ctx->NewState) {
+         if (ctx->DriverFlags.NewShaderConstants[stage])
+            ctx->NewDriverState |= ctx->DriverFlags.NewShaderConstants[stage];
+         else
+            return _NEW_PROGRAM_CONSTANTS;
+      }
+   }
+   return 0;
+}
+
+
 /**
+ * This updates fixed-func state constants such as gl_ModelViewMatrix.
  * Examine shader constants and return either _NEW_PROGRAM_CONSTANTS or 0.
  */
 static GLbitfield
 update_program_constants(struct gl_context *ctx)
 {
-   GLbitfield new_state = 0x0;
+   GLbitfield new_state =
+      update_single_program_constants(ctx, ctx->VertexProgram._Current,
+                                      MESA_SHADER_VERTEX) |
+      update_single_program_constants(ctx, ctx->FragmentProgram._Current,
+                                      MESA_SHADER_FRAGMENT);
 
-   if (ctx->FragmentProgram._Current) {
-      const struct gl_program_parameter_list *params =
-         ctx->FragmentProgram._Current->Parameters;
-      if (params && params->StateFlags & ctx->NewState) {
-         if (ctx->DriverFlags.NewShaderConstants[MESA_SHADER_FRAGMENT]) {
-            ctx->NewDriverState |=
-               ctx->DriverFlags.NewShaderConstants[MESA_SHADER_FRAGMENT];
-         } else {
-            new_state |= _NEW_PROGRAM_CONSTANTS;
-         }
-      }
-   }
+   if (ctx->API == API_OPENGL_COMPAT &&
+       ctx->Const.GLSLVersionCompat >= 150) {
+      new_state |=
+         update_single_program_constants(ctx, ctx->GeometryProgram._Current,
+                                         MESA_SHADER_GEOMETRY);
 
-   /* Don't handle tessellation and geometry shaders here. They don't use
-    * any state constants.
-    */
-
-   if (ctx->VertexProgram._Current) {
-      const struct gl_program_parameter_list *params =
-         ctx->VertexProgram._Current->Parameters;
-      if (params && params->StateFlags & ctx->NewState) {
-         if (ctx->DriverFlags.NewShaderConstants[MESA_SHADER_VERTEX]) {
-            ctx->NewDriverState |=
-               ctx->DriverFlags.NewShaderConstants[MESA_SHADER_VERTEX];
-         } else {
-            new_state |= _NEW_PROGRAM_CONSTANTS;
-         }
+      if (_mesa_has_ARB_tessellation_shader(ctx)) {
+         new_state |=
+            update_single_program_constants(ctx, ctx->TessCtrlProgram._Current,
+                                            MESA_SHADER_TESS_CTRL) |
+            update_single_program_constants(ctx, ctx->TessEvalProgram._Current,
+                                            MESA_SHADER_TESS_EVAL);
       }
    }
 
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 83a4b04..82eb61d 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -575,6 +575,46 @@
    { "glBlendFunci", 40, -1 },
    { "glBlendFuncSeparatei", 40, -1 },
 
+   { "glGetSubroutineUniformLocation", 40, -1 },
+   { "glGetSubroutineIndex", 40, -1 },
+   { "glGetActiveSubroutineUniformiv", 40, -1 },
+   { "glGetActiveSubroutineUniformName", 40, -1 },
+   { "glGetActiveSubroutineName", 40, -1 },
+   { "glUniformSubroutinesuiv", 40, -1 },
+   { "glGetUniformSubroutineuiv", 40, -1 },
+   { "glGetProgramStageiv", 40, -1 },
+
+   { "glUniform1d", 40, -1 },
+   { "glUniform2d", 40, -1 },
+   { "glUniform3d", 40, -1 },
+   { "glUniform4d", 40, -1 },
+   { "glUniform1dv", 40, -1 },
+   { "glUniform2dv", 40, -1 },
+   { "glUniform3dv", 40, -1 },
+   { "glUniform4dv", 40, -1 },
+   { "glUniformMatrix2dv", 40, -1 },
+   { "glUniformMatrix3dv", 40, -1 },
+   { "glUniformMatrix4dv", 40, -1 },
+   { "glUniformMatrix2x3dv", 40, -1 },
+   { "glUniformMatrix2x4dv", 40, -1 },
+   { "glUniformMatrix3x2dv", 40, -1 },
+   { "glUniformMatrix3x4dv", 40, -1 },
+   { "glUniformMatrix4x2dv", 40, -1 },
+   { "glUniformMatrix4x3dv", 40, -1 },
+   { "glGetUniformdv", 43, -1 },
+
+   /* GL 4.1 */
+   { "glVertexAttribL1d", 41, -1 },
+   { "glVertexAttribL2d", 41, -1 },
+   { "glVertexAttribL3d", 41, -1 },
+   { "glVertexAttribL4d", 41, -1 },
+   { "glVertexAttribL1dv", 41, -1 },
+   { "glVertexAttribL2dv", 41, -1 },
+   { "glVertexAttribL3dv", 41, -1 },
+   { "glVertexAttribL4dv", 41, -1 },
+   { "glVertexAttribLPointer", 41, -1 },
+   { "glGetVertexAttribLdv", 41, -1 },
+
    /* GL 4.3 */
    { "glIsRenderbuffer", 43, -1 },
    { "glBindRenderbuffer", 43, -1 },
@@ -856,6 +896,15 @@
 // { "glTextureStorage2DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 // { "glTextureStorage3DMultisampleEXT", 43, -1 },      // XXX: Add to xml
 
+   { "glViewportArrayv", 43, -1 },
+   { "glViewportIndexedf", 43, -1 },
+   { "glViewportIndexedfv", 43, -1 },
+   { "glScissorArrayv", 43, -1 },
+   { "glScissorIndexed", 43, -1 },
+   { "glScissorIndexedv", 43, -1 },
+   { "glDepthRangeArrayv", 43, -1 },
+   { "glDepthRangeIndexed", 43, -1 },
+
 /* GL 4.5 */
    /* aliased versions checked above */
    //{ "glGetGraphicsResetStatus", 45, -1 },
@@ -865,6 +914,105 @@
    //{ "glGetnUniformuiv", 45, -1 },
    { "glMemoryBarrierByRegion", 45, -1 },
 
+   /* GL_ARB_direct_state_access */
+   { "glCreateTransformFeedbacks", 45, -1 },
+   { "glTransformFeedbackBufferBase", 45, -1 },
+   { "glTransformFeedbackBufferRange", 45, -1 },
+   { "glGetTransformFeedbackiv", 45, -1 },
+   { "glGetTransformFeedbacki_v", 45, -1 },
+   { "glGetTransformFeedbacki64_v", 45, -1 },
+   { "glCreateBuffers", 45, -1 },
+   { "glNamedBufferStorage", 45, -1 },
+   { "glNamedBufferData", 45, -1 },
+   { "glNamedBufferSubData", 45, -1 },
+   { "glCopyNamedBufferSubData", 45, -1 },
+   { "glClearNamedBufferData", 45, -1 },
+   { "glClearNamedBufferSubData", 45, -1 },
+   { "glMapNamedBuffer", 45, -1 },
+   { "glMapNamedBufferRange", 45, -1 },
+   { "glUnmapNamedBuffer", 45, -1 },
+   { "glFlushMappedNamedBufferRange", 45, -1 },
+   { "glGetNamedBufferParameteriv", 45, -1 },
+   { "glGetNamedBufferParameteri64v", 45, -1 },
+   { "glGetNamedBufferPointerv", 45, -1 },
+   { "glGetNamedBufferSubData", 45, -1 },
+   { "glCreateFramebuffers", 45, -1 },
+   { "glNamedFramebufferRenderbuffer", 45, -1 },
+   { "glNamedFramebufferParameteri", 45, -1 },
+   { "glNamedFramebufferTexture", 45, -1 },
+   { "glNamedFramebufferTextureLayer", 45, -1 },
+   { "glNamedFramebufferDrawBuffer", 45, -1 },
+   { "glNamedFramebufferDrawBuffers", 45, -1 },
+   { "glNamedFramebufferReadBuffer", 45, -1 },
+   { "glInvalidateNamedFramebufferSubData", 45, -1 },
+   { "glInvalidateNamedFramebufferData", 45, -1 },
+   { "glClearNamedFramebufferiv", 45, -1 },
+   { "glClearNamedFramebufferuiv", 45, -1 },
+   { "glClearNamedFramebufferfv", 45, -1 },
+   { "glClearNamedFramebufferfi", 45, -1 },
+   { "glBlitNamedFramebuffer", 45, -1 },
+   { "glCheckNamedFramebufferStatus", 45, -1 },
+   { "glGetNamedFramebufferParameteriv", 45, -1 },
+   { "glGetNamedFramebufferAttachmentParameteriv", 45, -1 },
+   { "glCreateRenderbuffers", 45, -1 },
+   { "glNamedRenderbufferStorage", 45, -1 },
+   { "glNamedRenderbufferStorageMultisample", 45, -1 },
+   { "glGetNamedRenderbufferParameteriv", 45, -1 },
+   { "glCreateTextures", 45, -1 },
+   { "glTextureStorage1D", 45, -1 },
+   { "glTextureStorage2D", 45, -1 },
+   { "glTextureStorage3D", 45, -1 },
+   { "glTextureSubImage1D", 45, -1 },
+   { "glTextureSubImage2D", 45, -1 },
+   { "glTextureSubImage3D", 45, -1 },
+   { "glBindTextureUnit", 45, -1 },
+   { "glTextureParameterf", 45, -1 },
+   { "glTextureParameterfv", 45, -1 },
+   { "glTextureParameteri", 45, -1 },
+   { "glTextureParameterIiv", 45, -1 },
+   { "glTextureParameterIuiv", 45, -1 },
+   { "glTextureParameteriv", 45, -1 },
+   { "glGetTextureLevelParameterfv", 45, -1 },
+   { "glGetTextureLevelParameteriv", 45, -1 },
+   { "glGetTextureParameterfv", 45, -1 },
+   { "glGetTextureParameterIiv", 45, -1 },
+   { "glGetTextureParameterIuiv", 45, -1 },
+   { "glGetTextureParameteriv", 45, -1 },
+   { "glCopyTextureSubImage1D", 45, -1 },
+   { "glCopyTextureSubImage2D", 45, -1 },
+   { "glCopyTextureSubImage3D", 45, -1 },
+   { "glGetTextureImage", 45, -1 },
+   { "glGetCompressedTextureImage", 45, -1 },
+   { "glCompressedTextureSubImage1D", 45, -1 },
+   { "glCompressedTextureSubImage2D", 45, -1 },
+   { "glCompressedTextureSubImage3D", 45, -1 },
+   { "glGenerateTextureMipmap", 45, -1 },
+   { "glTextureStorage2DMultisample", 45, -1 },
+   { "glTextureStorage3DMultisample", 45, -1 },
+   { "glTextureBuffer", 45, -1 },
+   { "glTextureBufferRange", 45, -1 },
+   { "glCreateVertexArrays", 45, -1 },
+   { "glDisableVertexArrayAttrib", 45, -1 },
+   { "glEnableVertexArrayAttrib", 45, -1 },
+   { "glVertexArrayElementBuffer", 45, -1 },
+   { "glVertexArrayVertexBuffer", 45, -1 },
+   { "glVertexArrayVertexBuffers", 45, -1 },
+   { "glVertexArrayAttribFormat", 45, -1 },
+   { "glVertexArrayAttribIFormat", 45, -1 },
+   { "glVertexArrayAttribLFormat", 45, -1 },
+   { "glVertexArrayAttribBinding", 45, -1 },
+   { "glVertexArrayBindingDivisor", 45, -1 },
+   { "glGetVertexArrayiv", 45, -1 },
+   { "glGetVertexArrayIndexediv", 45, -1 },
+   { "glGetVertexArrayIndexed64iv", 45, -1 },
+   { "glCreateSamplers", 45, -1 },
+   { "glCreateProgramPipelines", 45, -1 },
+   { "glCreateQueries", 45, -1 },
+   { "glGetQueryBufferObjectiv", 45, -1 },
+   { "glGetQueryBufferObjectuiv", 45, -1 },
+   { "glGetQueryBufferObjecti64v", 45, -1 },
+   { "glGetQueryBufferObjectui64v", 45, -1 },
+
    /* GL_ARB_internalformat_query */
    { "glGetInternalformativ", 30, -1 },
 
@@ -1026,6 +1174,24 @@
    /* GL_EXT_shader_framebuffer_fetch_non_coherent */
    { "glFramebufferFetchBarrierEXT", 20, -1 },
 
+   /* GL_NV_conservative_raster */
+   { "glSubpixelPrecisionBiasNV", 10, -1 },
+
+   /* GL_NV_conservative_raster_dilate */
+   { "glConservativeRasterParameterfNV", 10, -1 },
+
+   /* GL_NV_conservative_raster_pre_snap_triangles */
+   { "glConservativeRasterParameteriNV", 10, -1 },
+
+   /* GL_ARB_sample_locations */
+   { "glFramebufferSampleLocationsfvARB", 30, -1 },
+   { "glNamedFramebufferSampleLocationsfvARB", 30, -1 },
+   { "glEvaluateDepthValuesARB", 30, -1 },
+
+   /* GL_ARB_indirect_parameters */
+   { "glMultiDrawArraysIndirectCountARB", 11, -1 },
+   { "glMultiDrawElementsIndirectCountARB", 11, -1 },
+
    { NULL, 0, -1 }
 };
 
@@ -1428,7 +1594,6 @@
    { "glGetProgramLocalParameterfvARB", 10, -1 },
    { "glGetProgramivARB", 10, -1 },
    { "glGetProgramStringARB", 10, -1 },
-   { "glPolygonOffsetEXT", 10, -1 },
    { "glColorPointerEXT", 10, -1 },
    { "glEdgeFlagPointerEXT", 10, -1 },
    { "glIndexPointerEXT", 10, -1 },
@@ -1515,16 +1680,6 @@
    /* GL 3.2 */
    { "glFramebufferTexture", 32, -1 },
 
-   /* GL 4.0 */
-   { "glGetSubroutineUniformLocation", 40, -1 },
-   { "glGetSubroutineIndex", 40, -1 },
-   { "glGetActiveSubroutineUniformiv", 40, -1 },
-   { "glGetActiveSubroutineUniformName", 40, -1 },
-   { "glGetActiveSubroutineName", 40, -1 },
-   { "glUniformSubroutinesuiv", 40, -1 },
-   { "glGetUniformSubroutineuiv", 40, -1 },
-   { "glGetProgramStageiv", 40, -1 },
-
    /* GL 4.3 */
    { "glIsRenderbuffer", 43, -1 },
    { "glBindRenderbuffer", 43, -1 },
@@ -1645,25 +1800,6 @@
    { "glDrawArraysIndirect", 43, -1 },
    { "glDrawElementsIndirect", 43, -1 },
 
-   { "glUniform1d", 40, -1 },
-   { "glUniform2d", 40, -1 },
-   { "glUniform3d", 40, -1 },
-   { "glUniform4d", 40, -1 },
-   { "glUniform1dv", 40, -1 },
-   { "glUniform2dv", 40, -1 },
-   { "glUniform3dv", 40, -1 },
-   { "glUniform4dv", 40, -1 },
-   { "glUniformMatrix2dv", 40, -1 },
-   { "glUniformMatrix3dv", 40, -1 },
-   { "glUniformMatrix4dv", 40, -1 },
-   { "glUniformMatrix2x3dv", 40, -1 },
-   { "glUniformMatrix2x4dv", 40, -1 },
-   { "glUniformMatrix3x2dv", 40, -1 },
-   { "glUniformMatrix3x4dv", 40, -1 },
-   { "glUniformMatrix4x2dv", 40, -1 },
-   { "glUniformMatrix4x3dv", 40, -1 },
-   { "glGetUniformdv", 43, -1 },
-
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },
    { "glGenTransformFeedbacks", 43, -1 },
@@ -1744,25 +1880,6 @@
    { "glValidateProgramPipeline", 43, -1 },
    { "glGetProgramPipelineInfoLog", 43, -1 },
 
-   { "glVertexAttribL1d", 41, -1 },
-   { "glVertexAttribL2d", 41, -1 },
-   { "glVertexAttribL3d", 41, -1 },
-   { "glVertexAttribL4d", 41, -1 },
-   { "glVertexAttribL1dv", 41, -1 },
-   { "glVertexAttribL2dv", 41, -1 },
-   { "glVertexAttribL3dv", 41, -1 },
-   { "glVertexAttribL4dv", 41, -1 },
-   { "glVertexAttribLPointer", 41, -1 },
-   { "glGetVertexAttribLdv", 41, -1 },
-   { "glViewportArrayv", 43, -1 },
-   { "glViewportIndexedf", 43, -1 },
-   { "glViewportIndexedfv", 43, -1 },
-   { "glScissorArrayv", 43, -1 },
-   { "glScissorIndexed", 43, -1 },
-   { "glScissorIndexedv", 43, -1 },
-   { "glDepthRangeArrayv", 43, -1 },
-   { "glDepthRangeIndexed", 43, -1 },
-
 // { "glCreateSyncFromCLeventARB", 43, -1 },            // XXX: Add to xml
 
    { "glDrawArraysInstancedBaseInstance", 43, -1 },
@@ -1825,109 +1942,6 @@
 /* GL 4.5 */
    { "glMemoryBarrierByRegion", 45, -1 },
 
-   /* GL_ARB_direct_state_access */
-   { "glCreateTransformFeedbacks", 45, -1 },
-   { "glTransformFeedbackBufferBase", 45, -1 },
-   { "glTransformFeedbackBufferRange", 45, -1 },
-   { "glGetTransformFeedbackiv", 45, -1 },
-   { "glGetTransformFeedbacki_v", 45, -1 },
-   { "glGetTransformFeedbacki64_v", 45, -1 },
-   { "glCreateBuffers", 45, -1 },
-   { "glNamedBufferStorage", 45, -1 },
-   { "glNamedBufferData", 45, -1 },
-   { "glNamedBufferSubData", 45, -1 },
-   { "glCopyNamedBufferSubData", 45, -1 },
-   { "glClearNamedBufferData", 45, -1 },
-   { "glClearNamedBufferSubData", 45, -1 },
-   { "glMapNamedBuffer", 45, -1 },
-   { "glMapNamedBufferRange", 45, -1 },
-   { "glUnmapNamedBuffer", 45, -1 },
-   { "glFlushMappedNamedBufferRange", 45, -1 },
-   { "glGetNamedBufferParameteriv", 45, -1 },
-   { "glGetNamedBufferParameteri64v", 45, -1 },
-   { "glGetNamedBufferPointerv", 45, -1 },
-   { "glGetNamedBufferSubData", 45, -1 },
-   { "glCreateFramebuffers", 45, -1 },
-   { "glNamedFramebufferRenderbuffer", 45, -1 },
-   { "glNamedFramebufferParameteri", 45, -1 },
-   { "glNamedFramebufferTexture", 45, -1 },
-   { "glNamedFramebufferTextureLayer", 45, -1 },
-   { "glNamedFramebufferDrawBuffer", 45, -1 },
-   { "glNamedFramebufferDrawBuffers", 45, -1 },
-   { "glNamedFramebufferReadBuffer", 45, -1 },
-   { "glInvalidateNamedFramebufferSubData", 45, -1 },
-   { "glInvalidateNamedFramebufferData", 45, -1 },
-   { "glClearNamedFramebufferiv", 45, -1 },
-   { "glClearNamedFramebufferuiv", 45, -1 },
-   { "glClearNamedFramebufferfv", 45, -1 },
-   { "glClearNamedFramebufferfi", 45, -1 },
-   { "glBlitNamedFramebuffer", 45, -1 },
-   { "glCheckNamedFramebufferStatus", 45, -1 },
-   { "glGetNamedFramebufferParameteriv", 45, -1 },
-   { "glGetNamedFramebufferAttachmentParameteriv", 45, -1 },
-   { "glCreateRenderbuffers", 45, -1 },
-   { "glNamedRenderbufferStorage", 45, -1 },
-   { "glNamedRenderbufferStorageMultisample", 45, -1 },
-   { "glGetNamedRenderbufferParameteriv", 45, -1 },
-   { "glCreateTextures", 45, -1 },
-   { "glTextureStorage1D", 45, -1 },
-   { "glTextureStorage2D", 45, -1 },
-   { "glTextureStorage3D", 45, -1 },
-   { "glTextureSubImage1D", 45, -1 },
-   { "glTextureSubImage2D", 45, -1 },
-   { "glTextureSubImage3D", 45, -1 },
-   { "glBindTextureUnit", 45, -1 },
-   { "glTextureParameterf", 45, -1 },
-   { "glTextureParameterfv", 45, -1 },
-   { "glTextureParameteri", 45, -1 },
-   { "glTextureParameterIiv", 45, -1 },
-   { "glTextureParameterIuiv", 45, -1 },
-   { "glTextureParameteriv", 45, -1 },
-   { "glGetTextureLevelParameterfv", 45, -1 },
-   { "glGetTextureLevelParameteriv", 45, -1 },
-   { "glGetTextureParameterfv", 45, -1 },
-   { "glGetTextureParameterIiv", 45, -1 },
-   { "glGetTextureParameterIuiv", 45, -1 },
-   { "glGetTextureParameteriv", 45, -1 },
-   { "glCopyTextureSubImage1D", 45, -1 },
-   { "glCopyTextureSubImage2D", 45, -1 },
-   { "glCopyTextureSubImage3D", 45, -1 },
-   { "glGetTextureImage", 45, -1 },
-   { "glGetCompressedTextureImage", 45, -1 },
-   { "glCompressedTextureSubImage1D", 45, -1 },
-   { "glCompressedTextureSubImage2D", 45, -1 },
-   { "glCompressedTextureSubImage3D", 45, -1 },
-   { "glGenerateTextureMipmap", 45, -1 },
-   { "glTextureStorage2DMultisample", 45, -1 },
-   { "glTextureStorage3DMultisample", 45, -1 },
-   { "glTextureBuffer", 45, -1 },
-   { "glTextureBufferRange", 45, -1 },
-   { "glCreateVertexArrays", 45, -1 },
-   { "glDisableVertexArrayAttrib", 45, -1 },
-   { "glEnableVertexArrayAttrib", 45, -1 },
-   { "glVertexArrayElementBuffer", 45, -1 },
-   { "glVertexArrayVertexBuffer", 45, -1 },
-   { "glVertexArrayVertexBuffers", 45, -1 },
-   { "glVertexArrayAttribFormat", 45, -1 },
-   { "glVertexArrayAttribIFormat", 45, -1 },
-   { "glVertexArrayAttribLFormat", 45, -1 },
-   { "glVertexArrayAttribBinding", 45, -1 },
-   { "glVertexArrayBindingDivisor", 45, -1 },
-   { "glGetVertexArrayiv", 45, -1 },
-   { "glGetVertexArrayIndexediv", 45, -1 },
-   { "glGetVertexArrayIndexed64iv", 45, -1 },
-   { "glCreateSamplers", 45, -1 },
-   { "glCreateProgramPipelines", 45, -1 },
-   { "glCreateQueries", 45, -1 },
-   { "glGetQueryBufferObjectiv", 45, -1 },
-   { "glGetQueryBufferObjectuiv", 45, -1 },
-   { "glGetQueryBufferObjecti64v", 45, -1 },
-   { "glGetQueryBufferObjectui64v", 45, -1 },
-
-   /* GL_ARB_indirect_parameters */
-   { "glMultiDrawArraysIndirectCountARB", 31, -1 },
-   { "glMultiDrawElementsIndirectCountARB", 31, -1 },
-
    /* GL_ARB_ES3_2_compatibility */
    { "glPrimitiveBoundingBoxARB", 45, -1 },
 
@@ -2185,6 +2199,15 @@
    /* GL_EXT_polygon_offset_clamp */
    { "glPolygonOffsetClampEXT", 11, -1 },
 
+   /* GL_NV_conservative_raster */
+   { "glSubpixelPrecisionBiasNV", 20, -1 },
+
+   /* GL_NV_conservative_raster_dilate */
+   { "glConservativeRasterParameterfNV", 20, -1 },
+
+   /* GL_NV_conservative_raster_pre_snap_triangles */
+   { "glConservativeRasterParameteriNV", 20, -1 },
+
    { NULL, 0, -1 }
 };
 
@@ -2396,6 +2419,19 @@
    { "glValidateProgramPipelineEXT", 20, -1 },
    { "glGetProgramPipelineInfoLogEXT", 20, -1 },
 
+   /* GL_AMD_performance_monitor */
+   { "glGetPerfMonitorGroupsAMD", 20, -1 },
+   { "glGetPerfMonitorCountersAMD", 20, -1 },
+   { "glGetPerfMonitorGroupStringAMD", 20, -1 },
+   { "glGetPerfMonitorCounterStringAMD", 20, -1 },
+   { "glGetPerfMonitorCounterInfoAMD", 20, -1 },
+   { "glGenPerfMonitorsAMD", 20, -1 },
+   { "glDeletePerfMonitorsAMD", 20, -1 },
+   { "glSelectPerfMonitorCountersAMD", 20, -1 },
+   { "glBeginPerfMonitorAMD", 20, -1 },
+   { "glEndPerfMonitorAMD", 20, -1 },
+   { "glGetPerfMonitorCounterDataAMD", 20, -1 },
+
    /* GL_INTEL_performance_query */
    { "glGetFirstPerfQueryIdINTEL", 20, -1 },
    { "glGetNextPerfQueryIdINTEL", 20, -1 },
@@ -2452,6 +2488,15 @@
    /* GL_EXT_shader_framebuffer_fetch_non_coherent */
    { "glFramebufferFetchBarrierEXT", 20, -1 },
 
+   /* GL_NV_conservative_raster */
+   { "glSubpixelPrecisionBiasNV", 20, -1 },
+
+   /* GL_NV_conservative_raster_dilate */
+   { "glConservativeRasterParameterfNV", 20, -1 },
+
+   /* GL_NV_conservative_raster_pre_snap_triangles */
+   { "glConservativeRasterParameteriNV", 20, -1 },
+
    { NULL, 0, -1 }
 };
 
@@ -2722,6 +2767,9 @@
    /* GL_OES_texture_storage_multisample_2d_array */
    { "glTexStorage3DMultisampleOES", 31, -1 },
 
+   /* GL_OES_texture_view */
+   { "glTextureViewOES", 31, -1 },
+
    /* GL_EXT_buffer_storage */
    { "glBufferStorageEXT", 31, -1 },
 
@@ -2752,5 +2800,10 @@
    { "glDepthRangeIndexedfOES", 31, -1 },
    { "glGetFloati_vOES", 31, -1 },
 
+   /* GL_ARB_sample_locations */
+   { "glFramebufferSampleLocationsfvARB", 31, -1 },
+   { "glNamedFramebufferSampleLocationsfvARB", 31, -1 },
+   { "glEvaluateDepthValuesARB", 31, -1 },
+
    { NULL, 0, -1 },
  };
diff --git a/src/mesa/main/texcompress_astc.cpp b/src/mesa/main/texcompress_astc.cpp
new file mode 100644
index 0000000..23540c4
--- /dev/null
+++ b/src/mesa/main/texcompress_astc.cpp
@@ -0,0 +1,1871 @@
+/*
+ * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file texcompress_astc.cpp
+ *
+ * Decompression code for GL_KHR_texture_compression_astc_ldr, which is just
+ * ASTC 2D LDR.
+ *
+ * The ASTC 2D LDR decoder (without the sRGB part) was copied from the OASTC
+ * library written by Philip Taylor. I added sRGB support and adjusted it for
+ * Mesa. - Marek
+ */
+
+#include "texcompress_astc.h"
+#include "macros.h"
+#include "util/half_float.h"
+#include <stdio.h>
+
+static bool VERBOSE_DECODE = false;
+static bool VERBOSE_WRITE = false;
+
+static inline uint8_t
+uint16_div_64k_to_half_to_unorm8(uint16_t v)
+{
+   return _mesa_half_to_unorm8(_mesa_uint16_div_64k_to_half(v));
+}
+
+class decode_error
+{
+public:
+   enum type {
+      ok,
+      unsupported_hdr_void_extent,
+      reserved_block_mode_1,
+      reserved_block_mode_2,
+      dual_plane_and_too_many_partitions,
+      invalid_range_in_void_extent,
+      weight_grid_exceeds_block_size,
+      invalid_colour_endpoints_size,
+      invalid_colour_endpoints_count,
+      invalid_weight_bits,
+      invalid_num_weights,
+   };
+};
+
+
+struct cem_range {
+   uint8_t max;
+   uint8_t t, q, b;
+};
+
+/* Based on the Color Unquantization Parameters table,
+ * plus the bit-only representations, sorted by increasing size
+ */
+static cem_range cem_ranges[] = {
+   { 5, 1, 0, 1 },
+   { 7, 0, 0, 3 },
+   { 9, 0, 1, 1 },
+   { 11, 1, 0, 2 },
+   { 15, 0, 0, 4 },
+   { 19, 0, 1, 2 },
+   { 23, 1, 0, 3 },
+   { 31, 0, 0, 5 },
+   { 39, 0, 1, 3 },
+   { 47, 1, 0, 4 },
+   { 63, 0, 0, 6 },
+   { 79, 0, 1, 4 },
+   { 95, 1, 0, 5 },
+   { 127, 0, 0, 7 },
+   { 159, 0, 1, 5 },
+   { 191, 1, 0, 6 },
+   { 255, 0, 0, 8 },
+};
+
+#define CAT_BITS_2(a, b)          ( ((a) << 1) | (b) )
+#define CAT_BITS_3(a, b, c)       ( ((a) << 2) | ((b) << 1) | (c) )
+#define CAT_BITS_4(a, b, c, d)    ( ((a) << 3) | ((b) << 2) | ((c) << 1) | (d) )
+#define CAT_BITS_5(a, b, c, d, e) ( ((a) << 4) | ((b) << 3) | ((c) << 2) | ((d) << 1) | (e) )
+
+/**
+ * Unpack 5n+8 bits from 'in' into 5 output values.
+ * If n <= 4 then T should be uint32_t, else it must be uint64_t.
+ */
+template <typename T>
+static void unpack_trit_block(int n, T in, uint8_t *out)
+{
+   assert(n <= 6); /* else output will overflow uint8_t */
+
+   uint8_t T0 = (in >> (n)) & 0x1;
+   uint8_t T1 = (in >> (n+1)) & 0x1;
+   uint8_t T2 = (in >> (2*n+2)) & 0x1;
+   uint8_t T3 = (in >> (2*n+3)) & 0x1;
+   uint8_t T4 = (in >> (3*n+4)) & 0x1;
+   uint8_t T5 = (in >> (4*n+5)) & 0x1;
+   uint8_t T6 = (in >> (4*n+6)) & 0x1;
+   uint8_t T7 = (in >> (5*n+7)) & 0x1;
+   uint8_t mmask = (1 << n) - 1;
+   uint8_t m0 = (in >> (0)) & mmask;
+   uint8_t m1 = (in >> (n+2)) & mmask;
+   uint8_t m2 = (in >> (2*n+4)) & mmask;
+   uint8_t m3 = (in >> (3*n+5)) & mmask;
+   uint8_t m4 = (in >> (4*n+7)) & mmask;
+
+   uint8_t C;
+   uint8_t t4, t3, t2, t1, t0;
+   if (CAT_BITS_3(T4, T3, T2) == 0x7) {
+      C = CAT_BITS_5(T7, T6, T5, T1, T0);
+      t4 = t3 = 2;
+   } else {
+      C = CAT_BITS_5(T4, T3, T2, T1, T0);
+      if (CAT_BITS_2(T6, T5) == 0x3) {
+         t4 = 2;
+         t3 = T7;
+      } else {
+         t4 = T7;
+         t3 = CAT_BITS_2(T6, T5);
+      }
+   }
+
+   if ((C & 0x3) == 0x3) {
+      t2 = 2;
+      t1 = (C >> 4) & 0x1;
+      uint8_t C3 = (C >> 3) & 0x1;
+      uint8_t C2 = (C >> 2) & 0x1;
+      t0 = (C3 << 1) | (C2 & ~C3);
+   } else if (((C >> 2) & 0x3) == 0x3) {
+      t2 = 2;
+      t1 = 2;
+      t0 = C & 0x3;
+   } else {
+      t2 = (C >> 4) & 0x1;
+      t1 = (C >> 2) & 0x3;
+      uint8_t C1 = (C >> 1) & 0x1;
+      uint8_t C0 = (C >> 0) & 0x1;
+      t0 = (C1 << 1) | (C0 & ~C1);
+   }
+
+   out[0] = (t0 << n) | m0;
+   out[1] = (t1 << n) | m1;
+   out[2] = (t2 << n) | m2;
+   out[3] = (t3 << n) | m3;
+   out[4] = (t4 << n) | m4;
+}
+
+/**
+ * Unpack 3n+7 bits from 'in' into 3 output values
+ */
+static void unpack_quint_block(int n, uint32_t in, uint8_t *out)
+{
+   assert(n <= 5); /* else output will overflow uint8_t */
+
+   uint8_t Q0 = (in >> (n)) & 0x1;
+   uint8_t Q1 = (in >> (n+1)) & 0x1;
+   uint8_t Q2 = (in >> (n+2)) & 0x1;
+   uint8_t Q3 = (in >> (2*n+3)) & 0x1;
+   uint8_t Q4 = (in >> (2*n+4)) & 0x1;
+   uint8_t Q5 = (in >> (3*n+5)) & 0x1;
+   uint8_t Q6 = (in >> (3*n+6)) & 0x1;
+   uint8_t mmask = (1 << n) - 1;
+   uint8_t m0 = (in >> (0)) & mmask;
+   uint8_t m1 = (in >> (n+3)) & mmask;
+   uint8_t m2 = (in >> (2*n+5)) & mmask;
+
+   uint8_t C;
+   uint8_t q2, q1, q0;
+   if (CAT_BITS_4(Q6, Q5, Q2, Q1) == 0x3) {
+      q2 = CAT_BITS_3(Q0, Q4 & ~Q0, Q3 & ~Q0);
+      q1 = 4;
+      q0 = 4;
+   } else {
+      if (CAT_BITS_2(Q2, Q1) == 0x3) {
+         q2 = 4;
+         C = CAT_BITS_5(Q4, Q3, 0x1 & ~Q6, 0x1 & ~Q5, Q0);
+      } else {
+         q2 = CAT_BITS_2(Q6, Q5);
+         C = CAT_BITS_5(Q4, Q3, Q2, Q1, Q0);
+      }
+      if ((C & 0x7) == 0x5) {
+         q1 = 4;
+         q0 = (C >> 3) & 0x3;
+      } else {
+         q1 = (C >> 3) & 0x3;
+         q0 = C & 0x7;
+      }
+   }
+   out[0] = (q0 << n) | m0;
+   out[1] = (q1 << n) | m1;
+   out[2] = (q2 << n) | m2;
+}
+
+
+struct uint8x4_t
+{
+   uint8_t v[4];
+
+   uint8x4_t() { }
+
+   uint8x4_t(int a, int b, int c, int d)
+   {
+      assert(0 <= a && a <= 255);
+      assert(0 <= b && b <= 255);
+      assert(0 <= c && c <= 255);
+      assert(0 <= d && d <= 255);
+      v[0] = a;
+      v[1] = b;
+      v[2] = c;
+      v[3] = d;
+   }
+
+   static uint8x4_t clamped(int a, int b, int c, int d)
+   {
+      uint8x4_t r;
+      r.v[0] = MAX2(0, MIN2(255, a));
+      r.v[1] = MAX2(0, MIN2(255, b));
+      r.v[2] = MAX2(0, MIN2(255, c));
+      r.v[3] = MAX2(0, MIN2(255, d));
+      return r;
+   }
+};
+
+static uint8x4_t blue_contract(int r, int g, int b, int a)
+{
+   return uint8x4_t((r+b) >> 1, (g+b) >> 1, b, a);
+}
+
+static uint8x4_t blue_contract_clamped(int r, int g, int b, int a)
+{
+   return uint8x4_t::clamped((r+b) >> 1, (g+b) >> 1, b, a);
+}
+
+static void bit_transfer_signed(int &a, int &b)
+{
+   b >>= 1;
+   b |= a & 0x80;
+   a >>= 1;
+   a &= 0x3f;
+   if (a & 0x20)
+      a -= 0x40;
+}
+
+static uint32_t hash52(uint32_t p)
+{
+   p ^= p >> 15;
+   p -= p << 17;
+   p += p << 7;
+   p += p << 4;
+   p ^= p >> 5;
+   p += p << 16;
+   p ^= p >> 7;
+   p ^= p >> 3;
+   p ^= p << 6;
+   p ^= p >> 17;
+   return p;
+}
+
+/* Return the partition (0..3) that the texel at (x, y, z) belongs to.
+ *
+ * seed           - partition pattern seed; offset by the partition count
+ *                  before being hashed
+ * partitioncount - number of partitions; with fewer than 4 (3) partitions
+ *                  index 3 (and 2) can never win
+ * small_block    - non-zero to double the coordinates before use
+ */
+static int select_partition(int seed, int x, int y, int z, int partitioncount,
+                            int small_block)
+{
+   if (small_block) {
+      x = x << 1;
+      y = y << 1;
+      z = z << 1;
+   }
+   seed += (partitioncount - 1) * 1024;
+   const uint32_t rnum = hash52(seed);
+
+   /* Twelve 4-bit fields cut out of the hash. The first eight tile the word;
+    * the remaining ones overlap them, and the last one wraps around the top.
+    */
+   static const int field_shift[11] = { 0, 4, 8, 12, 16, 20, 24, 28, 18, 22, 26 };
+   uint8_t s[12];
+   for (int k = 0; k < 11; ++k)
+      s[k] = (rnum >> field_shift[k]) & 0xF;
+   s[11] = ((rnum >> 30) | (rnum << 2)) & 0xF;
+
+   /* Square every field (at most 15 * 15 = 225, so it still fits a byte). */
+   for (int k = 0; k < 12; ++k)
+      s[k] *= s[k];
+
+   /* Two shift amounts, one driven by the seed and one by the partition
+    * count; bit 0 of the seed decides which is which.
+    */
+   const int sh_seed = (seed & 2) ? 4 : 5;
+   const int sh_count = (partitioncount == 3) ? 6 : 5;
+   const int sh1 = (seed & 1) ? sh_seed : sh_count;
+   const int sh2 = (seed & 1) ? sh_count : sh_seed;
+   const int sh3 = (seed & 0x10) ? sh1 : sh2;
+
+   /* Fields 0..7 alternate between sh1 and sh2, fields 8..11 all use sh3. */
+   for (int k = 0; k < 8; ++k)
+      s[k] >>= (k & 1) ? sh2 : sh1;
+   for (int k = 8; k < 12; ++k)
+      s[k] >>= sh3;
+
+   int a = s[0] * x + s[1] * y + s[10] * z + (rnum >> 14);
+   int b = s[2] * x + s[3] * y + s[11] * z + (rnum >> 10);
+   int c = s[4] * x + s[5] * y + s[8] * z + (rnum >> 6);
+   int d = s[6] * x + s[7] * y + s[9] * z + (rnum >> 2);
+
+   a &= 0x3F;
+   b &= 0x3F;
+   c &= 0x3F;
+   d &= 0x3F;
+
+   /* Knock out the candidates that do not exist for this partition count. */
+   if (partitioncount < 4)
+      d = 0;
+   if (partitioncount < 3)
+      c = 0;
+
+   /* Largest value wins; ties go to the lower partition index. */
+   if (a >= b && a >= c && a >= d)
+      return 0;
+   if (b >= c && b >= d)
+      return 1;
+   return (c >= d) ? 2 : 3;
+}
+
+
+/* Read-only view of one 128-bit block as a little-endian bit string. */
+struct InputBitVector
+{
+   /* Bit i of the block is bit (i & 31) of data[i >> 5]. */
+   uint32_t data[4];
+
+   /* Debug aid: print a 128-column row in which bit `idx` is shown in column
+    * (127 - idx). The `count` bits starting at `offset` are printed as
+    * '0'/'1' and every other column as '.'. The row is followed by the
+    * printf-style message built from `fmt`, then a newline.
+    */
+   void printf_bits(int offset, int count, const char *fmt = "", ...)
+   {
+      char out[129];
+      memset(out, '.', 128);
+      out[128] = '\0';
+      int idx = offset;
+      for (int i = 0; i < count; ++i) {
+         out[127 - idx] = ((data[idx >> 5] >> (idx & 31)) & 1) ? '1' : '0';
+         ++idx;
+      }
+      printf("%s ", out);
+      va_list ap;
+      va_start(ap, fmt);
+      vprintf(fmt, ap);
+      va_end(ap);
+      printf("\n");
+   }
+
+   /* Return `count` (0..31) bits starting at bit `offset`; the bit at
+    * `offset` ends up in bit 0 of the result.
+    */
+   uint32_t get_bits(int offset, int count)
+   {
+      assert(count >= 0 && count < 32);
+
+      /* Each word contributes either its high part (shifted down) or its low
+       * part (shifted up), depending on where `offset` falls relative to it.
+       */
+      uint32_t out = 0;
+      if (offset < 32)
+         out |= data[0] >> offset;
+
+      if (0 < offset && offset <= 32)
+         out |= data[1] << (32 - offset);
+      if (32 < offset && offset < 64)
+         out |= data[1] >> (offset - 32);
+
+      if (32 < offset && offset <= 64)
+         out |= data[2] << (64 - offset);
+      if (64 < offset && offset < 96)
+         out |= data[2] >> (offset - 64);
+
+      if (64 < offset && offset <= 96)
+         out |= data[3] << (96 - offset);
+      if (96 < offset && offset < 128)
+         out |= data[3] >> (offset - 96);
+
+      /* Build the mask in unsigned arithmetic: count == 31 is allowed by the
+       * assert above and a signed 1 << 31 would overflow.
+       */
+      out &= ((uint32_t)1 << count) - 1;
+      return out;
+   }
+
+   /* 64-bit variant of get_bits(): return `count` (0..63) bits starting at
+    * bit `offset`.
+    */
+   uint64_t get_bits64(int offset, int count)
+   {
+      assert(count >= 0 && count < 64);
+
+      uint64_t out = 0;
+      if (offset < 32)
+         out |= data[0] >> offset;
+
+      if (offset <= 32)
+         out |= (uint64_t)data[1] << (32 - offset);
+      if (32 < offset && offset < 64)
+         out |= data[1] >> (offset - 32);
+
+      if (0 < offset && offset <= 64)
+         out |= (uint64_t)data[2] << (64 - offset);
+      if (64 < offset && offset < 96)
+         out |= data[2] >> (offset - 64);
+
+      if (32 < offset && offset <= 96)
+         out |= (uint64_t)data[3] << (96 - offset);
+      if (96 < offset && offset < 128)
+         out |= data[3] >> (offset - 96);
+
+      out &= ((uint64_t)1 << count) - 1;
+      return out;
+   }
+
+   /* Return the `count` bits that end just below `offset`, in reversed
+    * order: block bit (offset - 1) becomes bit 0 of the result.
+    */
+   uint32_t get_bits_rev(int offset, int count)
+   {
+      assert(offset >= count);
+      uint32_t tmp = get_bits(offset - count, count);
+      uint32_t out = 0;
+      for (int i = 0; i < count; ++i)
+         out |= ((tmp >> i) & 1) << (count - 1 - i);
+      return out;
+   }
+};
+
+/* Write-side counterpart of a 128-bit block: bits are appended from bit 0
+ * upwards, with `offset` tracking the next free bit position.
+ */
+struct OutputBitVector
+{
+   /* Bit i of the block is bit (i & 31) of data[i >> 5]. */
+   uint32_t data[4];
+   int offset;
+
+   OutputBitVector()
+      : offset(0)
+   {
+      memset(data, 0, sizeof(data));
+   }
+
+   /* Append the low `size` (0..32) bits of `value`. Any bits of `value`
+    * above `size` must be zero.
+    */
+   void append(uint32_t value, int size)
+   {
+      if (VERBOSE_WRITE)
+         printf("append offset=%d size=%d values=0x%x\n", offset, size, value);
+
+      assert(offset + size <= 128);
+
+      assert(size <= 32);
+      if (size < 32)
+         assert((value >> size) == 0);
+
+      while (size) {
+         /* Write at most up to the end of the current 32-bit word. */
+         int c = MIN2(size, 32 - (offset & 31));
+         data[offset >> 5] |= (value << (offset & 31));
+         offset += c;
+         size -= c;
+         /* c is 32 for a full, word-aligned write; shifting a 32-bit value
+          * by 32 is undefined, so clear it explicitly in that case.
+          */
+         value = (c < 32) ? (value >> c) : 0;
+      }
+   }
+
+   /* Append the low `size` (0..64) bits of `value`. Any bits of `value`
+    * above `size` must be zero.
+    */
+   void append64(uint64_t value, int size)
+   {
+      if (VERBOSE_WRITE)
+         printf("append offset=%d size=%d values=0x%llx\n", offset, size, (unsigned long long)value);
+
+      assert(offset + size <= 128);
+
+      assert(size <= 64);
+      if (size < 64)
+         assert((value >> size) == 0);
+
+      while (size) {
+         int c = MIN2(size, 32 - (offset & 31));
+         /* Only the low word of the shifted value belongs to this slot. */
+         data[offset >> 5] |= (uint32_t)(value << (offset & 31));
+         offset += c;
+         size -= c;
+         value >>= c;
+      }
+   }
+
+   /* Append the first `size` bits of another vector. */
+   void append(OutputBitVector &v, int size)
+   {
+      if (VERBOSE_WRITE)
+         printf("append vector offset=%d size=%d\n", offset, size);
+
+      assert(offset + size <= 128);
+      int i = 0;
+      while (size >= 32) {
+         append(v.data[i++], 32);
+         size -= 32;
+      }
+      /* Unsigned mask: size can be 31 here and a signed 1 << 31 overflows. */
+      if (size > 0)
+         append(v.data[i] & (((uint32_t)1 << size) - 1), size);
+   }
+
+   /* OR the first `size` bits of `v` into the top of this vector in reverse
+    * order: bit i of `v` lands on bit (127 - i). `offset` is not changed.
+    */
+   void append_end(OutputBitVector &v, int size)
+   {
+      for (int i = 0; i < size; ++i)
+         data[(127 - i) >> 5] |= ((v.data[i >> 5] >> (i & 31)) & 1) << ((127 - i) & 31);
+   }
+
+   /* Insert the given number of '1' bits. (We could use 0s instead, but 1s are
+    * more likely to flush out bugs where we accidentally read undefined bits.)
+    */
+   void skip(int size)
+   {
+      if (VERBOSE_WRITE)
+         printf("skip offset=%d size=%d\n", offset, size);
+
+      assert(offset + size <= 128);
+      while (size >= 32) {
+         append(0xffffffff, 32);
+         size -= 32;
+      }
+      if (size > 0)
+         append(0xffffffff >> (32 - size), size);
+   }
+};
+
+
+/* Decoding parameters for one block format: the block footprint plus two
+ * flags selecting the output mode. The actual work is done by decode(),
+ * which is defined out of line.
+ */
+class Decoder
+{
+public:
+   Decoder(int block_w, int block_h, int block_d, bool srgb, bool output_unorm8)
+      : block_w(block_w), block_h(block_h), block_d(block_d), srgb(srgb),
+        output_unorm8(output_unorm8) {}
+
+   /* Decode the 16-byte block at `in` into `output` and return the decode
+    * status. On failure every texel of the block is filled with an error
+    * colour, four uint16_t values per texel.
+    */
+   decode_error::type decode(const uint8_t *in, uint16_t *output) const;
+
+   /* Block footprint; a block holds block_w * block_h * block_d texels. */
+   int block_w, block_h, block_d;
+   /* When output_unorm8 is set, texels are written as 8-bit values stored in
+    * uint16_t slots; otherwise FP16 constants are used. srgb requires
+    * output_unorm8.
+    */
+   bool srgb, output_unorm8;
+};
+
+/* Decode state for a single block. The fields are filled in step by step by
+ * the member functions declared at the bottom; the comments on each group
+ * name the function that computes it.
+ */
+struct Block
+{
+   bool is_error;
+   bool bogus_colour_endpoints;
+   bool bogus_weights;
+
+   /* Set by decode_block_mode(): */
+   int high_prec;
+   int dual_plane;
+   int colour_component_selector;
+   int wt_range;
+   int wt_w, wt_h, wt_d;
+   int num_parts;
+   /* Set by decode_cem(); -1 when there is only one partition. */
+   int partition_index;
+
+   /* Set by decode_void_extent(): */
+   bool is_void_extent;
+   int void_extent_d;
+   int void_extent_min_s;
+   int void_extent_max_s;
+   int void_extent_min_t;
+   int void_extent_max_t;
+   uint16_t void_extent_colour_r;
+   uint16_t void_extent_colour_g;
+   uint16_t void_extent_colour_b;
+   uint16_t void_extent_colour_a;
+
+   /* Set by decode_cem(): */
+   bool is_multi_cem;
+   int num_extra_cem_bits;
+   /* Bit position where the colour endpoint data starts: 17 for a single
+    * partition, 29 otherwise.
+    */
+   int colour_endpoint_data_offset;
+   int extra_cem_bits;
+   int cem_base_class;
+   /* Colour endpoint mode per partition; unused entries are -1. */
+   int cems[4];
+
+   int num_cem_values;
+
+   /* Calculated by unpack_weights(): */
+   uint8_t weights_quant[64 + 4]; /* max 64 values, plus padding for overflows in trit parsing */
+
+   /* Calculated by unquantise_weights(): */
+   uint8_t weights[64 + 18]; /* max 64 values, plus padding for the infill interpolation */
+
+   /* Calculated by unpack_colour_endpoints(): */
+   uint8_t colour_endpoints_quant[18 + 4]; /* max 18 values, plus padding for overflows in trit parsing */
+
+   /* Calculated by unquantise_colour_endpoints(): */
+   uint8_t colour_endpoints[18];
+
+   /* Calculated by calculate_from_weights(): */
+   int wt_trits;
+   int wt_quints;
+   int wt_bits;
+   int wt_max;
+   int num_weights;
+   int weight_bits;
+
+   /* Calculated by calculate_remaining_bits(): */
+   int remaining_bits;
+
+   /* Calculated by calculate_colour_endpoints_size(): */
+   int colour_endpoint_bits;
+   int ce_max;
+   int ce_trits;
+   int ce_quints;
+   int ce_bits;
+
+   /* Calculated by compute_infill_weights(); */
+   uint8_t infill_weights[2][216]; /* large enough for 6x6x6 */
+
+   /* Calculated by decode_colour_endpoints(); indexed [endpoint][partition] */
+   uint8x4_t endpoints_decoded[2][4];
+
+   void calculate_from_weights();
+   void calculate_remaining_bits();
+   decode_error::type calculate_colour_endpoints_size();
+
+   void unquantise_weights();
+   void unquantise_colour_endpoints();
+
+   /* Top-level entry point, called by Decoder::decode(). */
+   decode_error::type decode(const Decoder &decoder, InputBitVector in);
+
+   decode_error::type decode_block_mode(InputBitVector in);
+   decode_error::type decode_void_extent(InputBitVector in);
+   void decode_cem(InputBitVector in);
+   void unpack_colour_endpoints(InputBitVector in);
+   void decode_colour_endpoints();
+   void unpack_weights(InputBitVector in);
+   void compute_infill_weights(int block_w, int block_h, int block_d);
+
+   /* Called by Decoder::decode() after a successful decode(). */
+   void write_decoded(const Decoder &decoder, uint16_t *output);
+};
+
+
+/* Decode one 16-byte block from `in` into `output`.
+ *
+ * On success the texels are produced by Block::write_decoded(). On any error
+ * every texel of the block is set to the error colour (full red, blue and
+ * alpha; zero green), four uint16_t values per texel. The decode status is
+ * returned in both cases.
+ */
+decode_error::type Decoder::decode(const uint8_t *in, uint16_t *output) const
+{
+   InputBitVector in_vec;
+   memcpy(&in_vec.data, in, 16);
+
+   Block blk;
+   const decode_error::type err = blk.decode(*this, in_vec);
+   if (err == decode_error::ok) {
+      blk.write_decoded(*this, output);
+      return err;
+   }
+
+   /* Fill output with the error colour */
+   const int num_texels = block_w * block_h * block_d;
+   for (int texel = 0; texel < num_texels; ++texel) {
+      uint16_t *px = &output[texel * 4];
+      if (output_unorm8) {
+         px[0] = 0xff;
+         px[1] = 0;
+         px[2] = 0xff;
+         px[3] = 0xff;
+      } else {
+         assert(!srgb); /* srgb must use unorm8 */
+
+         px[0] = FP16_ONE;
+         px[1] = FP16_ZERO;
+         px[2] = FP16_ONE;
+         px[3] = FP16_ONE;
+      }
+   }
+   return err;
+}
+
+
+/* Parse a void-extent block: one flag bit, four 13-bit extent coordinates
+ * and four 16-bit colour channels.
+ *
+ * Returns unsupported_hdr_void_extent when the flag at bit 9 is set, and
+ * invalid_range_in_void_extent when the extents are not all-ones and a
+ * minimum is not strictly below its maximum.
+ */
+decode_error::type Block::decode_void_extent(InputBitVector block)
+{
+   /* TODO: 3D */
+
+   is_void_extent = true;
+   void_extent_d = block.get_bits(9, 1);
+
+   /* Extent coordinates, 13 bits each, packed back to back from bit 12. */
+   void_extent_min_s = block.get_bits(12, 13);
+   void_extent_max_s = block.get_bits(25, 13);
+   void_extent_min_t = block.get_bits(38, 13);
+   void_extent_max_t = block.get_bits(51, 13);
+
+   /* Colour channels, 16 bits each, filling the upper 64 bits. */
+   void_extent_colour_r = block.get_bits(64, 16);
+   void_extent_colour_g = block.get_bits(80, 16);
+   void_extent_colour_b = block.get_bits(96, 16);
+   void_extent_colour_a = block.get_bits(112, 16);
+
+   /* TODO: maybe we should do something useful with the extent coordinates? */
+
+   if (void_extent_d)
+      return decode_error::unsupported_hdr_void_extent;
+
+   /* All-ones in every coordinate means "no extents"; nothing to validate. */
+   const bool no_extents =
+      void_extent_min_s == 0x1fff && void_extent_max_s == 0x1fff &&
+      void_extent_min_t == 0x1fff && void_extent_max_t == 0x1fff;
+   if (no_extents)
+      return decode_error::ok;
+
+   /* Check for illegal encoding */
+   if (void_extent_min_s >= void_extent_max_s ||
+       void_extent_min_t >= void_extent_max_t)
+      return decode_error::invalid_range_in_void_extent;
+
+   return decode_error::ok;
+}
+
+/* Decode the 11-bit block mode field (bits 0..10).
+ *
+ * Sets dual_plane, high_prec, wt_range and the weight grid size wt_w/wt_h.
+ * Void-extent blocks are handed over to decode_void_extent(); the two
+ * reserved encodings are reported as errors.
+ */
+decode_error::type Block::decode_block_mode(InputBitVector in)
+{
+   dual_plane = in.get_bits(10, 1);
+   high_prec = in.get_bits(9, 1);
+
+   const uint32_t low2 = in.get_bits(0, 2);
+
+   /* First family: the two lowest bits are non-zero and carry the upper
+    * bits of the weight range.
+    */
+   if (low2 != 0x0) {
+      wt_range = (low2 << 1) | in.get_bits(4, 1);
+      const int a = in.get_bits(5, 2);
+      const int b = in.get_bits(7, 2);
+      const uint32_t layout = in.get_bits(2, 2);
+
+      if (layout == 0x0) {
+         if (VERBOSE_DECODE)
+            in.printf_bits(0, 11, "DHBBAAR00RR");
+         wt_w = b + 4;
+         wt_h = a + 2;
+      } else if (layout == 0x1) {
+         if (VERBOSE_DECODE)
+            in.printf_bits(0, 11, "DHBBAAR01RR");
+         wt_w = b + 8;
+         wt_h = a + 2;
+      } else if (layout == 0x2) {
+         if (VERBOSE_DECODE)
+            in.printf_bits(0, 11, "DHBBAAR10RR");
+         wt_w = a + 2;
+         wt_h = b + 8;
+      } else if ((b & 0x2) == 0) {
+         /* layout == 0x3, upper B bit clear */
+         if (VERBOSE_DECODE)
+            in.printf_bits(0, 11, "DH0BAAR11RR");
+         wt_w = a + 2;
+         wt_h = b + 6;
+      } else {
+         /* layout == 0x3, upper B bit set */
+         if (VERBOSE_DECODE)
+            in.printf_bits(0, 11, "DH1BAAR11RR");
+         wt_w = (b & 0x1) + 2;
+         wt_h = a + 2;
+      }
+      return decode_error::ok;
+   }
+
+   /* Second family: the two lowest bits are zero. */
+   if (in.get_bits(6, 3) == 0x7) {
+      const bool is_void = in.get_bits(0, 9) == 0x1fc;
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, is_void ? "xx111111100 (void extent)" : "xx111xxxx00");
+      if (is_void)
+         return decode_void_extent(in);
+      return decode_error::reserved_block_mode_1;
+   }
+   if (in.get_bits(0, 4) == 0x0) {
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "xxxxxxx0000");
+      return decode_error::reserved_block_mode_2;
+   }
+
+   /* Bit 1 is known to be zero here, so bits 1..3 already have the range's
+    * low bit clear and bit 4 can simply be OR-ed into it.
+    */
+   wt_range = in.get_bits(1, 3) | in.get_bits(4, 1);
+   const int a = in.get_bits(5, 2);
+   const uint32_t layout = in.get_bits(7, 2);
+
+   if (layout == 0x0) {
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "DH00AARRR00");
+      wt_w = 12;
+      wt_h = a + 2;
+   } else if (layout == 0x1) {
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "DH01AARRR00");
+      wt_w = a + 2;
+      wt_h = 12;
+   } else if (layout == 0x2) {
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "BB10AARRR00");
+      /* Bits 9..10 hold B in this mode, so D and H are forced to zero. */
+      const int b = in.get_bits(9, 2);
+      wt_w = a + 6;
+      wt_h = b + 6;
+      dual_plane = 0;
+      high_prec = 0;
+   } else if (in.get_bits(5, 1) == 0) {
+      /* layout == 0x3, bit 5 clear */
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "DH1100RRR00");
+      wt_w = 6;
+      wt_h = 10;
+   } else {
+      /* layout == 0x3, bit 5 set */
+      if (VERBOSE_DECODE)
+         in.printf_bits(0, 11, "DH1101RRR00");
+      wt_w = 10;
+      wt_h = 6;
+   }
+   return decode_error::ok;
+}
+
+/* Decode the colour endpoint mode (CEM) of every partition.
+ *
+ * Fills cems[] (unused entries stay -1), cem_base_class, is_multi_cem,
+ * partition_index, num_extra_cem_bits, extra_cem_bits and
+ * colour_endpoint_data_offset. Reads num_parts and weight_bits, which must
+ * already be set.
+ */
+void Block::decode_cem(InputBitVector in)
+{
+   for (int i = 0; i < 4; ++i)
+      cems[i] = -1;
+
+   num_extra_cem_bits = 0;
+   extra_cem_bits = 0;
+
+   /* Single partition: a plain 4-bit mode at bit 13, data follows at 17. */
+   if (num_parts <= 1) {
+      uint32_t cem = in.get_bits(13, 4);
+
+      cem_base_class = cem >> 2;
+      is_multi_cem = false;
+      cems[0] = cem;
+      partition_index = -1;
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(13, 4, "CEM = %d (class %d)", cem, cem_base_class);
+
+      colour_endpoint_data_offset = 17;
+      return;
+   }
+
+   partition_index = in.get_bits(13, 10);
+   if (VERBOSE_DECODE)
+      in.printf_bits(13, 10, "partition ID (%d)", partition_index);
+
+   uint32_t cem = in.get_bits(23, 6);
+   const uint32_t selector = cem & 0x3;
+
+   if (selector == 0x0) {
+      /* All partitions share the mode held in the upper four bits. */
+      cem >>= 2;
+      cem_base_class = cem >> 2;
+      is_multi_cem = false;
+
+      for (int i = 0; i < num_parts; ++i)
+         cems[i] = cem;
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(23, 6, "CEM (single, %d)", cem);
+   } else {
+      /* Each partition has a class bit C (relative to the base class) and a
+       * 2-bit mode M. The C bits start at bit 25; the M bits that do not fit
+       * below bit 29 are stored just below the weight data.
+       */
+      cem_base_class = selector - 1;
+      is_multi_cem = true;
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(23, 6, "CEM (multi, base class %d)", cem_base_class);
+
+      const int offset = 128 - weight_bits;
+      uint32_t m[4] = { 0, 0, 0, 0 };
+
+      switch (num_parts) {
+      case 2:
+         if (VERBOSE_DECODE) {
+            in.printf_bits(25, 4, "M0M0 C1 C0");
+            in.printf_bits(offset - 2, 2, "M1M1");
+         }
+         num_extra_cem_bits = 2;
+         m[0] = in.get_bits(27, 2);
+         m[1] = in.get_bits(offset - 2, 2);
+         break;
+      case 3:
+         if (VERBOSE_DECODE) {
+            in.printf_bits(25, 4, "M0 C2 C1 C0");
+            in.printf_bits(offset - 5, 5, "M2M2 M1M1 M0");
+         }
+         num_extra_cem_bits = 5;
+         /* M0 is split: low bit at 28, high bit with the extra bits. */
+         m[0] = in.get_bits(28, 1) | (in.get_bits(offset - 5, 1) << 1);
+         m[1] = in.get_bits(offset - 4, 2);
+         m[2] = in.get_bits(offset - 2, 2);
+         break;
+      case 4:
+         if (VERBOSE_DECODE) {
+            in.printf_bits(25, 4, "C3 C2 C1 C0");
+            in.printf_bits(offset - 8, 8, "M3M3 M2M2 M1M1 M0M0");
+         }
+         num_extra_cem_bits = 8;
+         m[0] = in.get_bits(offset - 8, 2);
+         m[1] = in.get_bits(offset - 6, 2);
+         m[2] = in.get_bits(offset - 4, 2);
+         m[3] = in.get_bits(offset - 2, 2);
+         break;
+      default:
+         unreachable("");
+      }
+
+      /* Combine class and mode per partition; extra_cem_bits ends up as the
+       * sum of the C bits.
+       */
+      for (int i = 0; i < num_parts; ++i) {
+         const uint32_t c = in.get_bits(25 + i, 1);
+         extra_cem_bits += c;
+         cems[i] = ((cem_base_class + c) << 2) | m[i];
+      }
+   }
+
+   colour_endpoint_data_offset = 29;
+}
+
+/* Extract the quantised colour endpoint values into
+ * colour_endpoints_quant[], reading upwards from
+ * colour_endpoint_data_offset. Values are stored as groups of five
+ * (ce_trits), groups of three (ce_quints) or as plain ce_bits-wide fields.
+ */
+void Block::unpack_colour_endpoints(InputBitVector in)
+{
+   int offset = colour_endpoint_data_offset;
+
+   if (ce_trits) {
+      /* Five values per group: 8 shared bits plus ce_bits each. */
+      const int group_bits = 8 + ce_bits * 5;
+      int bits_left = colour_endpoint_bits;
+
+      for (int i = 0; i < num_cem_values; i += 5) {
+         /* The final group may be truncated. */
+         const int bits_to_read = MIN2(bits_left, group_bits);
+         /* If ce_trits then ce_bits <= 6, so bits_to_read <= 38 and we have to use uint64_t */
+         uint64_t raw = in.get_bits64(offset, bits_to_read);
+         uint8_t *dst = &colour_endpoints_quant[i];
+         unpack_trit_block(ce_bits, raw, dst);
+
+         if (VERBOSE_DECODE)
+            in.printf_bits(offset, bits_to_read,
+                           "trits [%d,%d,%d,%d,%d]",
+                           dst[0], dst[1], dst[2], dst[3], dst[4]);
+
+         offset += group_bits;
+         bits_left -= group_bits;
+      }
+      return;
+   }
+
+   if (ce_quints) {
+      /* Three values per group: 7 shared bits plus ce_bits each. */
+      const int group_bits = 7 + ce_bits * 3;
+      int bits_left = colour_endpoint_bits;
+
+      for (int i = 0; i < num_cem_values; i += 3) {
+         const int bits_to_read = MIN2(bits_left, group_bits);
+         /* If ce_quints then ce_bits <= 5, so bits_to_read <= 22 and we can use uint32_t */
+         uint32_t raw = in.get_bits(offset, bits_to_read);
+         uint8_t *dst = &colour_endpoints_quant[i];
+         unpack_quint_block(ce_bits, raw, dst);
+
+         if (VERBOSE_DECODE)
+            in.printf_bits(offset, bits_to_read,
+                           "quints [%d,%d,%d]",
+                           dst[0], dst[1], dst[2]);
+
+         offset += group_bits;
+         bits_left -= group_bits;
+      }
+      return;
+   }
+
+   /* Plain binary fields, ce_bits wide each. */
+   assert((colour_endpoint_bits % ce_bits) == 0);
+   for (int i = 0; i < num_cem_values; i++, offset += ce_bits) {
+      colour_endpoints_quant[i] = in.get_bits(offset, ce_bits);
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(offset, ce_bits, "bits [%d]", colour_endpoints_quant[i]);
+   }
+}
+
+/* Turn the unquantised endpoint values of every partition into a pair of
+ * RGBA endpoints, stored in endpoints_decoded[0][part] and
+ * endpoints_decoded[1][part].
+ *
+ * Each partition consumes ((cem >> 2) + 1) * 2 consecutive values from
+ * colour_endpoints[]. Modes without a case below (the HDR ones) produce the
+ * error colour for both endpoints.
+ */
+void Block::decode_colour_endpoints()
+{
+   int cem_values_idx = 0;
+   for (int part = 0; part < num_parts; ++part) {
+      /* Number of endpoint values this partition's mode consumes. */
+      const int num_values = ((cems[part] >> 2) + 1) * 2;
+
+      /* Load only the values that belong to this partition and exist in the
+       * array; the rest are zero. Reading a fixed eight values would run
+       * past the end of colour_endpoints[] for partitions that start near
+       * its end.
+       */
+      int val[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+      for (int i = 0; i < num_values && i < 8; ++i) {
+         const int idx = cem_values_idx + i;
+         if (idx < (int)ARRAY_SIZE(colour_endpoints))
+            val[i] = colour_endpoints[idx];
+      }
+      int v0 = val[0];
+      int v1 = val[1];
+      int v2 = val[2];
+      int v3 = val[3];
+      int v4 = val[4];
+      int v5 = val[5];
+      int v6 = val[6];
+      int v7 = val[7];
+      cem_values_idx += num_values;
+
+      uint8x4_t e0, e1;
+      int s0, s1, L0, L1;
+
+      switch (cems[part])
+      {
+      case 0:
+         /* Luminance, direct */
+         e0 = uint8x4_t(v0, v0, v0, 0xff);
+         e1 = uint8x4_t(v1, v1, v1, 0xff);
+         break;
+      case 1:
+         /* Luminance, base + offset */
+         L0 = (v0 >> 2) | (v1 & 0xc0);
+         L1 = L0 + (v1 & 0x3f);
+         if (L1 > 0xff)
+            L1 = 0xff;
+         e0 = uint8x4_t(L0, L0, L0, 0xff);
+         e1 = uint8x4_t(L1, L1, L1, 0xff);
+         break;
+      case 4:
+         /* Luminance + alpha, direct */
+         e0 = uint8x4_t(v0, v0, v0, v2);
+         e1 = uint8x4_t(v1, v1, v1, v3);
+         break;
+      case 5:
+         /* Luminance + alpha, base + offset */
+         bit_transfer_signed(v1, v0);
+         bit_transfer_signed(v3, v2);
+         e0 = uint8x4_t(v0, v0, v0, v2);
+         e1 = uint8x4_t::clamped(v0+v1, v0+v1, v0+v1, v2+v3);
+         break;
+      case 6:
+         /* RGB, base + scale */
+         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, 0xff);
+         e1 = uint8x4_t(v0, v1, v2, 0xff);
+         break;
+      case 8:
+         /* RGB, direct; endpoints swap with blue contraction if s1 < s0 */
+         s0 = v0 + v2 + v4;
+         s1 = v1 + v3 + v5;
+         if (s1 >= s0) {
+            e0 = uint8x4_t(v0, v2, v4, 0xff);
+            e1 = uint8x4_t(v1, v3, v5, 0xff);
+         } else {
+            e0 = blue_contract(v1, v3, v5, 0xff);
+            e1 = blue_contract(v0, v2, v4, 0xff);
+         }
+         break;
+      case 9:
+         /* RGB, base + offset */
+         bit_transfer_signed(v1, v0);
+         bit_transfer_signed(v3, v2);
+         bit_transfer_signed(v5, v4);
+         if (v1 + v3 + v5 >= 0) {
+            e0 = uint8x4_t(v0, v2, v4, 0xff);
+            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, 0xff);
+         } else {
+            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, 0xff);
+            e1 = blue_contract(v0, v2, v4, 0xff);
+         }
+         break;
+      case 10:
+         /* RGB, base + scale, plus two alpha values */
+         e0 = uint8x4_t(v0*v3 >> 8, v1*v3 >> 8, v2*v3 >> 8, v4);
+         e1 = uint8x4_t(v0, v1, v2, v5);
+         break;
+      case 12:
+         /* RGBA, direct */
+         s0 = v0 + v2 + v4;
+         s1 = v1 + v3 + v5;
+         if (s1 >= s0) {
+            e0 = uint8x4_t(v0, v2, v4, v6);
+            e1 = uint8x4_t(v1, v3, v5, v7);
+         } else {
+            e0 = blue_contract(v1, v3, v5, v7);
+            e1 = blue_contract(v0, v2, v4, v6);
+         }
+         break;
+      case 13:
+         /* RGBA, base + offset */
+         bit_transfer_signed(v1, v0);
+         bit_transfer_signed(v3, v2);
+         bit_transfer_signed(v5, v4);
+         bit_transfer_signed(v7, v6);
+         if (v1 + v3 + v5 >= 0) {
+            e0 = uint8x4_t(v0, v2, v4, v6);
+            e1 = uint8x4_t::clamped(v0+v1, v2+v3, v4+v5, v6+v7);
+         } else {
+            e0 = blue_contract_clamped(v0+v1, v2+v3, v4+v5, v6+v7);
+            e1 = blue_contract(v0, v2, v4, v6);
+         }
+         break;
+      default:
+         /* HDR endpoints not supported; return error colour */
+         e0 = uint8x4_t(255, 0, 255, 255);
+         e1 = uint8x4_t(255, 0, 255, 255);
+         break;
+      }
+
+      endpoints_decoded[0][part] = e0;
+      endpoints_decoded[1][part] = e1;
+
+      if (VERBOSE_DECODE) {
+         printf("cems[%d]=%d v=[", part, cems[part]);
+         /* Dump every raw input value of this partition (val[] still holds
+          * the values as loaded, before any bit transfer).
+          */
+         for (int i = 0; i < num_values && i < 8; ++i) {
+            if (i)
+               printf(", ");
+            printf("%3d", val[i]);
+         }
+         printf("] e0=[%3d,%4d,%4d,%4d] e1=[%3d,%4d,%4d,%4d]\n",
+                e0.v[0], e0.v[1], e0.v[2], e0.v[3],
+                e1.v[0], e1.v[1], e1.v[2], e1.v[3]);
+      }
+   }
+}
+
+/* Extract the quantised weights into weights_quant[]. The weight data is
+ * stored bit-reversed and grows downwards from bit 128, so every read goes
+ * through get_bits_rev() and the offset decreases. Values are stored as
+ * groups of five (wt_trits), groups of three (wt_quints) or as plain
+ * wt_bits-wide fields.
+ */
+void Block::unpack_weights(InputBitVector in)
+{
+   int offset = 128;
+
+   if (wt_trits) {
+      /* Five values per group: 8 shared bits plus wt_bits each. */
+      const int group_bits = 8 + wt_bits * 5;
+      int bits_left = weight_bits;
+
+      for (int i = 0; i < num_weights; i += 5) {
+         /* The final group may be truncated. */
+         const int bits_to_read = MIN2(bits_left, group_bits);
+         /* If wt_trits then wt_bits <= 3, so bits_to_read <= 23 and we can use uint32_t */
+         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
+         uint8_t *dst = &weights_quant[i];
+         unpack_trit_block(wt_bits, raw, dst);
+
+         if (VERBOSE_DECODE)
+            in.printf_bits(offset - bits_to_read, bits_to_read, "weight trits [%d,%d,%d,%d,%d]",
+                           dst[0], dst[1], dst[2], dst[3], dst[4]);
+
+         offset -= group_bits;
+         bits_left -= group_bits;
+      }
+      return;
+   }
+
+   if (wt_quints) {
+      /* Three values per group: 7 shared bits plus wt_bits each. */
+      const int group_bits = 7 + wt_bits * 3;
+      int bits_left = weight_bits;
+
+      for (int i = 0; i < num_weights; i += 3) {
+         const int bits_to_read = MIN2(bits_left, group_bits);
+         /* If wt_quints then wt_bits <= 2, so bits_to_read <= 13 and we can use uint32_t */
+         uint32_t raw = in.get_bits_rev(offset, bits_to_read);
+         uint8_t *dst = &weights_quant[i];
+         unpack_quint_block(wt_bits, raw, dst);
+
+         if (VERBOSE_DECODE)
+            in.printf_bits(offset - bits_to_read, bits_to_read, "weight quints [%d,%d,%d]",
+                           dst[0], dst[1], dst[2]);
+
+         offset -= group_bits;
+         bits_left -= group_bits;
+      }
+      return;
+   }
+
+   /* Plain binary fields, wt_bits wide each. */
+   assert((weight_bits % wt_bits) == 0);
+   for (int i = 0; i < num_weights; ++i, offset -= wt_bits) {
+      weights_quant[i] = in.get_bits_rev(offset, wt_bits);
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(offset - wt_bits, wt_bits, "weight bits [%d]", weights_quant[i]);
+   }
+}
+
+/* Expand every quantised weight in weights_quant[] to the range 0..64 and
+ * store it in weights[]. The whole weights[] array is cleared first, so the
+ * padding past num_weights reads as zero.
+ */
+void Block::unquantise_weights()
+{
+   assert(num_weights <= (int)ARRAY_SIZE(weights_quant));
+   assert(num_weights <= (int)ARRAY_SIZE(weights));
+
+   memset(weights, 0, sizeof(weights));
+
+   for (int i = 0; i < num_weights; ++i) {
+      const uint8_t v = weights_quant[i];
+      uint8_t w;
+
+      if (wt_trits || wt_quints) {
+         /* Trit and quint encodings share one formula and differ only in
+          * their constants. Trits take precedence if both flags are set.
+          */
+         if (wt_bits == 0) {
+            /* Pure trit (0..2) or quint (0..4): spread evenly over 0..64. */
+            w = v * (wt_trits ? 32 : 16);
+         } else {
+            const uint8_t A = (v & 0x1) ? 0x7F : 0x00;
+            uint8_t B, C, D;
+
+            if (wt_trits) {
+               switch (wt_bits) {
+               case 1:
+                  B = 0;
+                  C = 50;
+                  D = v >> 1;
+                  break;
+               case 2:
+                  B = (v & 0x2) ? 0x45 : 0x00;
+                  C = 23;
+                  D = v >> 2;
+                  break;
+               case 3:
+                  B = ((v & 0x6) >> 1) | ((v & 0x6) << 4);
+                  C = 11;
+                  D = v >> 3;
+                  break;
+               default:
+                  unreachable("");
+               }
+            } else {
+               switch (wt_bits) {
+               case 1:
+                  B = 0;
+                  C = 28;
+                  D = v >> 1;
+                  break;
+               case 2:
+                  B = (v & 0x2) ? 0x42 : 0x00;
+                  C = 13;
+                  D = v >> 2;
+                  break;
+               default:
+                  unreachable("");
+               }
+            }
+
+            uint16_t T = D * C + B;
+            T ^= A;
+            T = (A & 0x20) | (T >> 2);
+            assert(T < 64);
+            /* Skip one step above 32 so that the top of the range maps to 64. */
+            if (T > 32)
+               T++;
+            w = T;
+         }
+      } else {
+         /* Plain binary weights: replicate the bits up to six bits wide. */
+         switch (wt_bits) {
+         case 1:
+            w = v ? 0x3F : 0x00;
+            break;
+         case 2:
+            w = v | (v << 2) | (v << 4);
+            break;
+         case 3:
+            w = v | (v << 3);
+            break;
+         case 4:
+            w = (v >> 2) | (v << 2);
+            break;
+         case 5:
+            w = (v >> 4) | (v << 1);
+            break;
+         default:
+            unreachable("");
+         }
+         assert(w < 64);
+         if (w > 32)
+            w++;
+      }
+
+      weights[i] = w;
+   }
+}
+
+/* Interpolate the wt_w x wt_h weight grid up to one weight per texel of a
+ * block_w x block_h x block_d block.
+ *
+ * Results (0..64) go to infill_weights[0][], and also to infill_weights[1][]
+ * for the second plane when dual_plane is set, indexed by
+ * s + t*block_w + r*block_w*block_h. Only the s/t axes are interpolated;
+ * the r axis is not handled yet.
+ */
+void Block::compute_infill_weights(int block_w, int block_h, int block_d)
+{
+   /* Per-axis scale factors mapping a texel coordinate onto 0..1024. */
+   int Ds = block_w <= 1 ? 0 : (1024 + block_w / 2) / (block_w - 1);
+   int Dt = block_h <= 1 ? 0 : (1024 + block_h / 2) / (block_h - 1);
+   int Dr = block_d <= 1 ? 0 : (1024 + block_d / 2) / (block_d - 1);
+   for (int r = 0; r < block_d; ++r) {
+      for (int t = 0; t < block_h; ++t) {
+         for (int s = 0; s < block_w; ++s) {
+            int cs = Ds * s;
+            int ct = Dt * t;
+            int cr = Dr * r;
+            /* Grid position in 1/16ths of a grid cell. */
+            int gs = (cs * (wt_w - 1) + 32) >> 6;
+            int gt = (ct * (wt_h - 1) + 32) >> 6;
+            int gr = (cr * (wt_d - 1) + 32) >> 6;
+            assert(gs >= 0 && gs <= 176);
+            assert(gt >= 0 && gt <= 176);
+            assert(gr >= 0 && gr <= 176);
+            /* Integer cell index (j*) and 4-bit fraction (f*) per axis. */
+            int js = gs >> 4;
+            int fs = gs & 0xf;
+            int jt = gt >> 4;
+            int ft = gt & 0xf;
+            int jr = gr >> 4;
+            int fr = gr & 0xf;
+
+            /* TODO: 3D */
+            (void)jr;
+            (void)fr;
+
+            /* Bilinear blend factors; the four of them sum to 16. */
+            int w11 = (fs * ft + 8) >> 4;
+            int w10 = ft - w11;
+            int w01 = fs - w11;
+            int w00 = 16 - fs - ft + w11;
+
+            const int texel = s + t*block_w + r*block_w*block_h;
+            /* Index of the top-left grid weight of the cell. */
+            const int v0 = js + jt * wt_w;
+
+            if (dual_plane) {
+               int p00, p01, p10, p11, i0, i1;
+               /* Check the largest index before reading anything. The
+                * weights of the two planes are interleaved.
+                */
+               assert((v0 + wt_w + 1) * 2 + 1 < (int)ARRAY_SIZE(weights));
+               p00 = weights[(v0) * 2];
+               p01 = weights[(v0 + 1) * 2];
+               p10 = weights[(v0 + wt_w) * 2];
+               p11 = weights[(v0 + wt_w + 1) * 2];
+               i0 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
+               p00 = weights[(v0) * 2 + 1];
+               p01 = weights[(v0 + 1) * 2 + 1];
+               p10 = weights[(v0 + wt_w) * 2 + 1];
+               p11 = weights[(v0 + wt_w + 1) * 2 + 1];
+               i1 = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
+               assert(0 <= i0 && i0 <= 64);
+               assert(0 <= i1 && i1 <= 64);
+               infill_weights[0][texel] = i0;
+               infill_weights[1][texel] = i1;
+            } else {
+               int p00, p01, p10, p11, i;
+               /* Check the largest index before reading anything. */
+               assert(v0 + wt_w + 1 < (int)ARRAY_SIZE(weights));
+               p00 = weights[v0];
+               p01 = weights[v0 + 1];
+               p10 = weights[v0 + wt_w];
+               p11 = weights[v0 + wt_w + 1];
+               i = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
+               assert(0 <= i && i <= 64);
+               infill_weights[0][texel] = i;
+            }
+         }
+      }
+   }
+}
+
+void Block::unquantise_colour_endpoints()
+{
+   /* Expand each quantised endpoint in colour_endpoints_quant[] to 8 bits in
+    * colour_endpoints[], using the trit, quint or plain-bit rule as needed. */
+   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints_quant));
+   assert(num_cem_values <= (int)ARRAY_SIZE(colour_endpoints));
+
+   for (int i = 0; i < num_cem_values; ++i) {
+      uint8_t v = colour_endpoints_quant[i];
+      if (ce_trits) {
+         uint16_t A, B, C, D, t;
+         A = (v & 0x1) ? 0x1FF : 0x000;
+         switch (ce_bits) {
+         case 1:
+            B = 0;
+            C = 204;
+            D = v >> 1;
+            break;
+         case 2:
+            B = (v & 0x2) ? 0x116 : 0x000;     /* b000b0bb0 */
+            C = 93;
+            D = v >> 2;
+            break;
+         case 3:
+            t = ((v >> 1) & 0x3);
+            B = t | (t << 2) | (t << 7);        /* cb000cbcb */
+            C = 44;
+            D = v >> 3;
+            break;
+         case 4:
+            t = ((v >> 1) & 0x7);
+            B = t | (t << 6);                   /* dcb000dcb */
+            C = 22;
+            D = v >> 4;
+            break;
+         case 5:
+            t = ((v >> 1) & 0xF);
+            B = (t >> 2) | (t << 5);            /* edcb000ed */
+            C = 11;
+            D = v >> 5;
+            break;
+         case 6:
+            B = ((v & 0x3E) << 3) | ((v >> 5) & 0x1);   /* fedcb000f */
+            C = 5;
+            D = v >> 6;
+            break;
+         default:
+            unreachable("");
+         }
+         uint16_t T = D * C + B;
+         T = T ^ A;
+         T = (A & 0x80) | (T >> 2);
+         assert(T < 256);
+         colour_endpoints[i] = T;
+      } else if (ce_quints) {
+         uint16_t A, B, C, D, t;
+         A = (v & 0x1) ? 0x1FF : 0x000;
+         switch (ce_bits) {
+         case 1:
+            B = 0;
+            C = 113;
+            D = v >> 1;
+            break;
+         case 2:
+            B = (v & 0x2) ? 0x10C : 0x000;     /* b0000bb00 */
+            C = 54;
+            D = v >> 2;
+            break;
+         case 3:
+            t = ((v >> 1) & 0x3);
+            B = (t >> 1) | (t << 1) | (t << 7); /* cb0000cbc */
+            C = 26;
+            D = v >> 3;
+            break;
+         case 4:
+            t = ((v >> 1) & 0x7);
+            B = (t >> 1) | (t << 6);            /* dcb0000dc */
+            C = 13;
+            D = v >> 4;
+            break;
+         case 5:
+            t = ((v >> 1) & 0xF);
+            /* t is 4 bits wide, so its top bit 'e' is reached with >> 3. */
+            B = (t >> 3) | (t << 5);            /* edcb0000e */
+            C = 6;
+            D = v >> 5;
+            break;
+         default:
+            unreachable("");
+         }
+         uint16_t T = D * C + B;
+         T = T ^ A;
+         T = (A & 0x80) | (T >> 2);
+         assert(T < 256);
+         colour_endpoints[i] = T;
+      } else {
+         switch (ce_bits) {
+         case 1: v = v ? 0xFF : 0x00; break;
+         case 2: v = (v << 6) | (v << 4) | (v << 2) | v; break;
+         case 3: v = (v << 5) | (v << 2) | (v >> 1); break;
+         case 4: v = (v << 4) | v; break;
+         case 5: v = (v << 3) | (v >> 2); break;
+         case 6: v = (v << 2) | (v >> 4); break;
+         case 7: v = (v << 1) | (v >> 6); break;
+         case 8: break;
+         default: unreachable("");
+         }
+         colour_endpoints[i] = v;
+      }
+   }
+}
+
+decode_error::type Block::decode(const Decoder &decoder, InputBitVector in)
+{
+   /* Parse one ASTC block from 'in' into this object's fields.  Returns
+    * decode_error::ok on success, otherwise the first error detected. */
+   is_error = false;
+   bogus_colour_endpoints = false;
+   bogus_weights = false;
+   is_void_extent = false;
+
+   wt_d = 1;
+   /* TODO: 3D */
+
+   /* TODO: test for all the illegal encodings */
+
+   if (VERBOSE_DECODE)
+      in.printf_bits(0, 128);
+
+   decode_error::type err = decode_block_mode(in);
+   if (err != decode_error::ok)
+      return err;
+
+   if (is_void_extent)
+      return decode_error::ok;
+
+   /* TODO: 3D */
+
+   calculate_from_weights();
+
+   if (VERBOSE_DECODE)
+      printf("weights_grid=%dx%dx%d dual_plane=%d num_weights=%d high_prec=%d r=%d range=0..%d (%dt %dq %db) weight_bits=%d\n",
+             wt_w, wt_h, wt_d, dual_plane, num_weights, high_prec, wt_range, wt_max, wt_trits, wt_quints, wt_bits, weight_bits);
+
+   if (wt_w > decoder.block_w || wt_h > decoder.block_h || wt_d > decoder.block_d)
+      return decode_error::weight_grid_exceeds_block_size;
+
+   num_parts = in.get_bits(11, 2) + 1;
+
+   if (VERBOSE_DECODE)
+      in.printf_bits(11, 2, "partitions = %d", num_parts);
+
+   if (dual_plane && num_parts > 3)
+      return decode_error::dual_plane_and_too_many_partitions;
+
+   decode_cem(in);
+
+   if (VERBOSE_DECODE)
+      printf("cem=[%d,%d,%d,%d] base_cem_class=%d\n", cems[0], cems[1], cems[2], cems[3], cem_base_class);
+
+   int num_cem_pairs = (cem_base_class + 1) * num_parts + extra_cem_bits;
+   num_cem_values = num_cem_pairs * 2;
+
+   calculate_remaining_bits();
+   err = calculate_colour_endpoints_size();
+   if (err != decode_error::ok)
+      return err;
+
+   /* Reject oversized endpoint counts before anything indexes by them. */
+   if (num_cem_values > 18)
+      return decode_error::invalid_colour_endpoints_count;
+
+   if (VERBOSE_DECODE)
+      in.printf_bits(colour_endpoint_data_offset, colour_endpoint_bits,
+                     "endpoint data (%d bits, %d vals, %dt %dq %db)",
+                     colour_endpoint_bits, num_cem_values, ce_trits, ce_quints, ce_bits);
+   unpack_colour_endpoints(in);
+
+   if (VERBOSE_DECODE) {
+      printf("cem values raw =[");
+      for (int i = 0; i < num_cem_values; i++) {
+         if (i)
+            printf(", ");
+         printf("%3d", colour_endpoints_quant[i]);
+      }
+      printf("]\n");
+   }
+
+   unquantise_colour_endpoints();
+
+   if (VERBOSE_DECODE) {
+      printf("cem values norm=[");
+      for (int i = 0; i < num_cem_values; i++) {
+         if (i)
+            printf(", ");
+         printf("%3d", colour_endpoints[i]);
+      }
+      printf("]\n");
+   }
+
+   decode_colour_endpoints();
+
+   if (dual_plane) {
+      int ccs_offset = 128 - weight_bits - num_extra_cem_bits - 2;
+      colour_component_selector = in.get_bits(ccs_offset, 2);
+
+      if (VERBOSE_DECODE)
+         in.printf_bits(ccs_offset, 2, "colour component selector = %d", colour_component_selector);
+   } else {
+      colour_component_selector = 0;
+   }
+
+   if (VERBOSE_DECODE)
+      in.printf_bits(128 - weight_bits, weight_bits, "weights (%d bits)", weight_bits);
+
+   /* Validate the weight layout before unpacking it. */
+   if (num_weights > 64)
+      return decode_error::invalid_num_weights;
+
+   if (weight_bits < 24 || weight_bits > 96)
+      return decode_error::invalid_weight_bits;
+
+   unpack_weights(in);
+
+   unquantise_weights();
+
+   if (VERBOSE_DECODE) {
+      printf("weights=[");
+      for (int i = 0; i < num_weights; ++i) {
+         if (i)
+            printf(", ");
+         printf("%d", weights[i]);
+      }
+      printf("]\n");
+
+      for (int plane = 0; plane <= dual_plane; ++plane) {
+         printf("weights (plane %d):\n", plane);
+         int i = 0;
+         (void)i;
+
+         for (int r = 0; r < wt_d; ++r) {
+            for (int t = 0; t < wt_h; ++t) {
+               for (int s = 0; s < wt_w; ++s) {
+                  printf("%3d", weights[i++ * (1 + dual_plane) + plane]);
+               }
+               printf("\n");
+            }
+            if (r < wt_d - 1)
+               printf("\n");
+         }
+      }
+   }
+
+   compute_infill_weights(decoder.block_w, decoder.block_h, decoder.block_d);
+
+   if (VERBOSE_DECODE) {
+      for (int plane = 0; plane <= dual_plane; ++plane) {
+         printf("infilled weights (plane %d):\n", plane);
+         int i = 0;
+         (void)i;
+
+         for (int r = 0; r < decoder.block_d; ++r) {
+            for (int t = 0; t < decoder.block_h; ++t) {
+               for (int s = 0; s < decoder.block_w; ++s) {
+                  printf("%3d", infill_weights[plane][i++]);
+               }
+               printf("\n");
+            }
+            if (r < decoder.block_d - 1)
+               printf("\n");
+         }
+      }
+   }
+   if (VERBOSE_DECODE)
+      printf("\n");
+
+   return decode_error::ok;
+}
+
+void Block::write_decoded(const Decoder &decoder, uint16_t *output)
+{
+   /* Write this block's texels to 'output', four values per texel, walking x
+    * fastest, then y, then z.  Values are unorm8 when decoder.output_unorm8
+    * is set; otherwise they are FP16 bit patterns. */
+
+   /* sRGB can only be stored as unorm8. */
+   assert(!decoder.srgb || decoder.output_unorm8);
+
+   const int num_texels = decoder.block_w * decoder.block_h * decoder.block_d;
+
+   if (is_void_extent) {
+      /* Every texel gets the same void-extent colour. */
+      for (int t = 0; t < num_texels; ++t) {
+         uint16_t *texel = &output[t * 4];
+
+         if (!decoder.output_unorm8) {
+            /* Store the color as FP16. */
+            texel[0] = _mesa_uint16_div_64k_to_half(void_extent_colour_r);
+            texel[1] = _mesa_uint16_div_64k_to_half(void_extent_colour_g);
+            texel[2] = _mesa_uint16_div_64k_to_half(void_extent_colour_b);
+            texel[3] = _mesa_uint16_div_64k_to_half(void_extent_colour_a);
+            continue;
+         }
+
+         if (decoder.srgb) {
+            texel[0] = void_extent_colour_r >> 8;
+            texel[1] = void_extent_colour_g >> 8;
+            texel[2] = void_extent_colour_b >> 8;
+         } else {
+            texel[0] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_r);
+            texel[1] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_g);
+            texel[2] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_b);
+         }
+         /* Alpha uses the same conversion whether or not srgb is set. */
+         texel[3] = uint16_div_64k_to_half_to_unorm8(void_extent_colour_a);
+      }
+      return;
+   }
+
+   /* select_partition() takes a flag for blocks of fewer than 31 texels. */
+   const int small_block = num_texels < 31;
+   int idx = 0;
+
+   for (int z = 0; z < decoder.block_d; ++z) {
+      for (int y = 0; y < decoder.block_h; ++y) {
+         for (int x = 0; x < decoder.block_w; ++x, ++idx) {
+            /* Single-partition blocks always use endpoint pair 0. */
+            int partition = 0;
+            if (num_parts > 1) {
+               partition = select_partition(partition_index, x, y, z, num_parts, small_block);
+               assert(partition < num_parts);
+            }
+
+            /* TODO: HDR */
+
+            const uint8x4_t lo8 = endpoints_decoded[0][partition];
+            const uint8x4_t hi8 = endpoints_decoded[1][partition];
+
+            /* Plane 0 weights every channel; in dual-plane mode the selected
+             * channel takes the plane 1 weight instead. */
+            int w[4];
+            w[0] = w[1] = w[2] = w[3] = infill_weights[0][idx];
+            if (dual_plane)
+               w[colour_component_selector] = infill_weights[1][idx];
+
+            uint16_t c[4];
+            for (int ch = 0; ch < 4; ++ch) {
+               uint16_t c0, c1;
+
+               /* Expand to 16 bits. */
+               if (decoder.srgb) {
+                  c0 = (uint16_t)((lo8.v[ch] << 8) | 0x80);
+                  c1 = (uint16_t)((hi8.v[ch] << 8) | 0x80);
+               } else {
+                  c0 = (uint16_t)((lo8.v[ch] << 8) | lo8.v[ch]);
+                  c1 = (uint16_t)((hi8.v[ch] << 8) | hi8.v[ch]);
+               }
+
+               /* Interpolate to produce UNORM16, applying weights. */
+               c[ch] = (uint16_t)((c0 * (64 - w[ch]) + c1 * w[ch] + 32) >> 6);
+            }
+
+            uint16_t *texel = &output[idx * 4];
+
+            if (!decoder.output_unorm8) {
+               /* Store the color as FP16. */
+               for (int ch = 0; ch < 4; ++ch) {
+                  if (c[ch] == 65535)
+                     texel[ch] = FP16_ONE;
+                  else
+                     texel[ch] = _mesa_uint16_div_64k_to_half(c[ch]);
+               }
+            } else {
+               /* RGB: sRGB output keeps the top byte; linear output goes
+                * through uint16_div_64k_to_half_to_unorm8(), with 65535
+                * pinned to 0xff. */
+               for (int ch = 0; ch < 3; ++ch) {
+                  if (decoder.srgb)
+                     texel[ch] = c[ch] >> 8;
+                  else if (c[ch] == 65535)
+                     texel[ch] = 0xff;
+                  else
+                     texel[ch] = uint16_div_64k_to_half_to_unorm8(c[ch]);
+               }
+
+               /* Alpha always takes the linear path. */
+               if (c[3] == 65535)
+                  texel[3] = 0xff;
+               else
+                  texel[3] = uint16_div_64k_to_half_to_unorm8(c[3]);
+            }
+         }
+      }
+   }
+}
+
+void Block::calculate_from_weights()
+{
+   /* Weight encodings indexed by [high_prec][wt_range - 2]. */
+   static const struct {
+      uint8_t max, trits, quints, bits;
+   } encodings[2][6] = {
+      { {  1, 0, 0, 1 }, {  2, 1, 0, 0 }, {  3, 0, 0, 2 },
+        {  4, 0, 1, 0 }, {  5, 1, 0, 1 }, {  7, 0, 0, 3 } },
+      { {  9, 0, 1, 1 }, { 11, 1, 0, 2 }, { 15, 0, 0, 4 },
+        { 19, 0, 1, 2 }, { 23, 1, 0, 3 }, { 31, 0, 0, 5 } },
+   };
+
+   wt_trits = 0;
+   wt_quints = 0;
+   wt_bits = 0;
+
+   /* Any other high_prec value leaves the fields zeroed, which trips the
+    * assert below; a wt_range outside the table aborts outright. */
+   if (high_prec == 0 || high_prec == 1) {
+      if (wt_range < 0x2 || wt_range > 0x7)
+         abort();
+
+      wt_max = encodings[high_prec][wt_range - 0x2].max;
+      wt_trits = encodings[high_prec][wt_range - 0x2].trits;
+      wt_quints = encodings[high_prec][wt_range - 0x2].quints;
+      wt_bits = encodings[high_prec][wt_range - 0x2].bits;
+   }
+
+   assert(wt_trits || wt_quints || wt_bits);
+
+   /* Dual-plane blocks store two weights per grid point. */
+   num_weights = wt_w * wt_h * wt_d;
+   if (dual_plane)
+      num_weights *= 2;
+
+   /* Packed size: 5 trits take 8 bits and 3 quints take 7 bits, rounded up,
+    * plus the plain bits of every weight. */
+   const int trit_bits = (num_weights * 8 * wt_trits + 4) / 5;
+   const int quint_bits = (num_weights * 7 * wt_quints + 2) / 3;
+
+   weight_bits = trit_bits + quint_bits + num_weights * wt_bits;
+}
+
+void Block::calculate_remaining_bits()
+{
+   /* Bits left for colour endpoint data: the 128-bit block minus the
+    * configuration bits and the weight data. */
+   int used_bits = 17;              /* value used when num_parts <= 1 */
+
+   if (num_parts > 1) {
+      /* With is_multi_cem set, 3 bits per partition are added to 25. */
+      used_bits = is_multi_cem ? 25 + 3 * num_parts : 29;
+   }
+
+   /* Dual-plane blocks spend two more configuration bits. */
+   if (dual_plane)
+      used_bits += 2;
+
+   remaining_bits = 128 - used_bits - weight_bits;
+}
+
+decode_error::type Block::calculate_colour_endpoints_size()
+{
+   /* Choose the colour endpoint encoding: scanning cem_ranges[] from its last
+    * entry down, take the first whose packed size fits in remaining_bits. */
+   const int min_bits = (13 * num_cem_values + 4) / 5;
+
+   /* Specified as illegal */
+   if (remaining_bits < min_bits) {
+      colour_endpoint_bits = ce_max = ce_trits = ce_quints = ce_bits = 0;
+      return decode_error::invalid_colour_endpoints_size;
+   }
+
+   for (int idx = (int)ARRAY_SIZE(cem_ranges); idx-- > 0; ) {
+      const int trit_bits = (num_cem_values * 8 * cem_ranges[idx].t + 4) / 5;
+      const int quint_bits = (num_cem_values * 7 * cem_ranges[idx].q + 2) / 3;
+      const int packed = trit_bits + quint_bits + num_cem_values * cem_ranges[idx].b;
+      if (packed > remaining_bits)
+         continue;
+      colour_endpoint_bits = packed;
+      ce_max = cem_ranges[idx].max;
+      ce_trits = cem_ranges[idx].t;
+      ce_quints = cem_ranges[idx].q;
+      ce_bits = cem_ranges[idx].b;
+      return decode_error::ok;
+   }
+
+   assert(0);
+   return decode_error::invalid_colour_endpoints_size;
+}
+
+/**
+ * Decode a 2D LDR ASTC image into 4 bytes per texel.
+ *
+ * \param dst_row     first output row; 4 bytes are written per texel
+ * \param dst_stride  distance between output rows, in bytes
+ * \param src_row     first row of compressed blocks (16 bytes per block)
+ * \param src_stride  distance between rows of blocks, in bytes
+ * \param src_width   image width in pixels
+ * \param src_height  image height in pixels
+ * \param format      an ASTC 2D format; selects block size and sRGB decoding
+ */
+extern "C" void
+_mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
+                         unsigned dst_stride,
+                         const uint8_t *src_row,
+                         unsigned src_stride,
+                         unsigned src_width,
+                         unsigned src_height,
+                         mesa_format format)
+{
+   assert(_mesa_is_format_astc_2d(format));
+
+   const unsigned bytes_per_block = 16;
+   const bool srgb = _mesa_get_format_color_encoding(format) == GL_SRGB;
+   unsigned blk_w, blk_h;
+   _mesa_get_format_block_size(format, &blk_w, &blk_h);
+
+   const unsigned blocks_across = (src_width + blk_w - 1) / blk_w;
+   const unsigned blocks_down = (src_height + blk_h - 1) / blk_h;
+   Decoder dec(blk_w, blk_h, 1, srgb, true);
+
+   for (unsigned by = 0; by < blocks_down; ++by) {
+      /* The last row/column of blocks may be cropped by the image edge. */
+      const unsigned rows = MIN2(blk_h, src_height - by * blk_h);
+
+      for (unsigned bx = 0; bx < blocks_across; ++bx) {
+         /* Same size as the largest block. */
+         uint16_t texels[12 * 12 * 4];
+         dec.decode(src_row + bx * bytes_per_block, texels);
+
+         const unsigned cols = MIN2(blk_w, src_width - bx * blk_w);
+         uint8_t *dst_block = dst_row + bx * blk_w * 4;
+
+         for (unsigned ty = 0; ty < rows; ++ty) {
+            uint8_t *dst = dst_block + ty * dst_stride;
+            const uint16_t *src = &texels[ty * blk_w * 4];
+
+            /* Narrow each 16-bit component to a byte, 4 per texel. */
+            for (unsigned n = 0; n < cols * 4; ++n)
+               dst[n] = src[n];
+         }
+      }
+      src_row += src_stride;
+      dst_row += dst_stride * blk_h;
+   }
+}
diff --git a/src/intel/tools/gen_disasm.h b/src/mesa/main/texcompress_astc.h
similarity index 68%
copy from src/intel/tools/gen_disasm.h
copy to src/mesa/main/texcompress_astc.h
index c8c18b2..9f9c528 100644
--- a/src/intel/tools/gen_disasm.h
+++ b/src/mesa/main/texcompress_astc.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Intel Corporation
+ * Copyright 2018 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -17,29 +17,31 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
  */
 
-#ifndef GEN_DISASM_H
-#define GEN_DISASM_H
+#ifndef TEXCOMPRESS_ASTC_H
+#define TEXCOMPRESS_ASTC_H
 
-#include "intel/dev/gen_device_info.h"
+#include <inttypes.h>
+#include "texcompress.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct gen_disasm;
-
-struct gen_disasm *gen_disasm_create(const struct gen_device_info *devinfo);
-void gen_disasm_disassemble(struct gen_disasm *disasm,
-                            void *assembly, int start, FILE *out);
-
-void gen_disasm_destroy(struct gen_disasm *disasm);
+void
+_mesa_unpack_astc_2d_ldr(uint8_t *dst_row,
+                         unsigned dst_stride,
+                         const uint8_t *src_row,
+                         unsigned src_stride,
+                         unsigned src_width,
+                         unsigned src_height,
+                         mesa_format format);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* GEN_DISASM_H */
+#endif
diff --git a/src/mesa/main/texcompress_bptc.c b/src/mesa/main/texcompress_bptc.c
index fd37be9..46279f1 100644
--- a/src/mesa/main/texcompress_bptc.c
+++ b/src/mesa/main/texcompress_bptc.c
@@ -29,632 +29,38 @@
 #include <stdbool.h>
 #include "texcompress.h"
 #include "texcompress_bptc.h"
-#include "util/format_srgb.h"
-#include "util/half_float.h"
+#include "texcompress_bptc_tmp.h"
 #include "texstore.h"
-#include "macros.h"
 #include "image.h"
 #include "mtypes.h"
 
-#define BLOCK_SIZE 4
-#define N_PARTITIONS 64
-#define BLOCK_BYTES 16
-
-struct bptc_unorm_mode {
-   int n_subsets;
-   int n_partition_bits;
-   bool has_rotation_bits;
-   bool has_index_selection_bit;
-   int n_color_bits;
-   int n_alpha_bits;
-   bool has_endpoint_pbits;
-   bool has_shared_pbits;
-   int n_index_bits;
-   int n_secondary_index_bits;
-};
-
-struct bptc_float_bitfield {
-   int8_t endpoint;
-   uint8_t component;
-   uint8_t offset;
-   uint8_t n_bits;
-   bool reverse;
-};
-
-struct bptc_float_mode {
-   bool reserved;
-   bool transformed_endpoints;
-   int n_partition_bits;
-   int n_endpoint_bits;
-   int n_index_bits;
-   int n_delta_bits[3];
-   struct bptc_float_bitfield bitfields[24];
-};
-
-struct bit_writer {
-   uint8_t buf;
-   int pos;
-   uint8_t *dst;
-};
-
-static const struct bptc_unorm_mode
-bptc_unorm_modes[] = {
-   /* 0 */ { 3, 4, false, false, 4, 0, true,  false, 3, 0 },
-   /* 1 */ { 2, 6, false, false, 6, 0, false, true,  3, 0 },
-   /* 2 */ { 3, 6, false, false, 5, 0, false, false, 2, 0 },
-   /* 3 */ { 2, 6, false, false, 7, 0, true,  false, 2, 0 },
-   /* 4 */ { 1, 0, true,  true,  5, 6, false, false, 2, 3 },
-   /* 5 */ { 1, 0, true,  false, 7, 8, false, false, 2, 2 },
-   /* 6 */ { 1, 0, false, false, 7, 7, true,  false, 4, 0 },
-   /* 7 */ { 2, 6, false, false, 5, 5, true,  false, 2, 0 }
-};
-
-static const struct bptc_float_mode
-bptc_float_modes[] = {
-   /* 00 */
-   { false, true, 5, 10, 3, { 5, 5, 5 },
-     { { 2, 1, 4, 1, false }, { 2, 2, 4, 1, false }, { 3, 2, 4, 1, false },
-       { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
-       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
-       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
-       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
-       { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 01 */
-   { false, true, 5, 7, 3, { 6, 6, 6 },
-     { { 2, 1, 5, 1, false }, { 3, 1, 4, 1, false }, { 3, 1, 5, 1, false },
-       { 0, 0, 0, 7, false }, { 3, 2, 0, 1, false }, { 3, 2, 1, 1, false },
-       { 2, 2, 4, 1, false }, { 0, 1, 0, 7, false }, { 2, 2, 5, 1, false },
-       { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false }, { 0, 2, 0, 7, false },
-       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
-       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
-       { 2, 0, 0, 6, false },
-       { 3, 0, 0, 6, false },
-       { -1 } }
-   },
-   /* 00010 */
-   { false, true, 5, 11, 3, { 5, 4, 4 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 5, false }, { 0, 0, 10, 1, false }, { 2, 1, 0, 4, false },
-       { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false }, { 3, 2, 0, 1, false },
-       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
-       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
-       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 00011 */
-   { false, false, 0, 10, 4, { 10, 10, 10 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 10, false }, { 1, 1, 0, 10, false }, { 1, 2, 0, 10, false },
-       { -1 } }
-   },
-   /* 00110 */
-   { false, true, 5, 11, 3, { 4, 5, 4 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 3, 1, 4, 1, false },
-       { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false }, { 0, 1, 10, 1, false },
-       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
-       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
-       { 3, 2, 0, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
-       { 2, 1, 4, 1, false }, { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 00111 */
-   { false, true, 0, 11, 4, { 9, 9, 9 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 9, false }, { 0, 0, 10, 1, false }, { 1, 1, 0, 9, false },
-       { 0, 1, 10, 1, false }, { 1, 2, 0, 9, false }, { 0, 2, 10, 1, false },
-       { -1 } }
-   },
-   /* 01010 */
-   { false, true, 5, 11, 3, { 4, 4, 5 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 2, 2, 4, 1, false },
-       { 2, 1, 0, 4, false }, { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false },
-       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
-       { 0, 2, 10, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
-       { 3, 2, 1, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
-       { 3, 2, 4, 1, false }, { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 01011 */
-   { false, true, 0, 12, 4, { 8, 8, 8 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 8, false }, { 0, 0, 10, 2, true }, { 1, 1, 0, 8, false },
-       { 0, 1, 10, 2, true }, { 1, 2, 0, 8, false }, { 0, 2, 10, 2, true },
-       { -1 } }
-   },
-   /* 01110 */
-   { false, true, 5, 9, 3, { 5, 5, 5 },
-     { { 0, 0, 0, 9, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 9, false },
-       { 2, 1, 4, 1, false }, { 0, 2, 0, 9, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
-       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
-       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
-       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
-       { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 01111 */
-   { false, true, 0, 16, 4, { 4, 4, 4 },
-     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
-       { 1, 0, 0, 4, false }, { 0, 0, 10, 6, true }, { 1, 1, 0, 4, false },
-       { 0, 1, 10, 6, true }, { 1, 2, 0, 4, false }, { 0, 2, 10, 6, true },
-       { -1 } }
-   },
-   /* 10010 */
-   { false, true, 5, 8, 3, { 6, 5, 5 },
-     { { 0, 0, 0, 8, false }, { 3, 1, 4, 1, false }, { 2, 2, 4, 1, false },
-       { 0, 1, 0, 8, false }, { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false },
-       { 0, 2, 0, 8, false }, { 3, 2, 3, 1, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false },
-       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
-       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 6, false },
-       { 3, 0, 0, 6, false },
-       { -1 } }
-   },
-   /* 10011 */
-   { true /* reserved */ },
-   /* 10110 */
-   { false, true, 5, 8, 3, { 5, 6, 5 },
-     { { 0, 0, 0, 8, false }, { 3, 2, 0, 1, false }, { 2, 2, 4, 1, false },
-       { 0, 1, 0, 8, false }, { 2, 1, 5, 1, false }, { 2, 1, 4, 1, false },
-       { 0, 2, 0, 8, false }, { 3, 1, 5, 1, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
-       { 1, 1, 0, 6, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
-       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
-       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 10111 */
-   { true /* reserved */ },
-   /* 11010 */
-   { false, true, 5, 8, 3, { 5, 5, 6 },
-     { { 0, 0, 0, 8, false }, { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false },
-       { 0, 1, 0, 8, false }, { 2, 2, 5, 1, false }, { 2, 1, 4, 1, false },
-       { 0, 2, 0, 8, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
-       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
-       { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
-       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
-       { -1 } }
-   },
-   /* 11011 */
-   { true /* reserved */ },
-   /* 11110 */
-   { false, false, 5, 6, 3, { 6, 6, 6 },
-     { { 0, 0, 0, 6, false }, { 3, 1, 4, 1, false }, { 3, 2, 0, 1, false },
-       { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 6, false },
-       { 2, 1, 5, 1, false }, { 2, 2, 5, 1, false }, { 3, 2, 2, 1, false },
-       { 2, 1, 4, 1, false }, { 0, 2, 0, 6, false }, { 3, 1, 5, 1, false },
-       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
-       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
-       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
-       { 2, 0, 0, 6, false }, { 3, 0, 0, 6, false },
-       { -1 } }
-   },
-   /* 11111 */
-   { true /* reserved */ },
-};
-
-/* This partition table is used when the mode has two subsets. Each
- * partition is represented by a 32-bit value which gives 2 bits per texel
- * within the block. The value of the two bits represents which subset to use
- * (0 or 1).
- */
-static const uint32_t
-partition_table1[N_PARTITIONS] = {
-   0x50505050U, 0x40404040U, 0x54545454U, 0x54505040U,
-   0x50404000U, 0x55545450U, 0x55545040U, 0x54504000U,
-   0x50400000U, 0x55555450U, 0x55544000U, 0x54400000U,
-   0x55555440U, 0x55550000U, 0x55555500U, 0x55000000U,
-   0x55150100U, 0x00004054U, 0x15010000U, 0x00405054U,
-   0x00004050U, 0x15050100U, 0x05010000U, 0x40505054U,
-   0x00404050U, 0x05010100U, 0x14141414U, 0x05141450U,
-   0x01155440U, 0x00555500U, 0x15014054U, 0x05414150U,
-   0x44444444U, 0x55005500U, 0x11441144U, 0x05055050U,
-   0x05500550U, 0x11114444U, 0x41144114U, 0x44111144U,
-   0x15055054U, 0x01055040U, 0x05041050U, 0x05455150U,
-   0x14414114U, 0x50050550U, 0x41411414U, 0x00141400U,
-   0x00041504U, 0x00105410U, 0x10541000U, 0x04150400U,
-   0x50410514U, 0x41051450U, 0x05415014U, 0x14054150U,
-   0x41050514U, 0x41505014U, 0x40011554U, 0x54150140U,
-   0x50505500U, 0x00555050U, 0x15151010U, 0x54540404U,
-};
-
-/* This partition table is used when the mode has three subsets. In this case
- * the values can be 0, 1 or 2.
- */
-static const uint32_t
-partition_table2[N_PARTITIONS] = {
-   0xaa685050U, 0x6a5a5040U, 0x5a5a4200U, 0x5450a0a8U,
-   0xa5a50000U, 0xa0a05050U, 0x5555a0a0U, 0x5a5a5050U,
-   0xaa550000U, 0xaa555500U, 0xaaaa5500U, 0x90909090U,
-   0x94949494U, 0xa4a4a4a4U, 0xa9a59450U, 0x2a0a4250U,
-   0xa5945040U, 0x0a425054U, 0xa5a5a500U, 0x55a0a0a0U,
-   0xa8a85454U, 0x6a6a4040U, 0xa4a45000U, 0x1a1a0500U,
-   0x0050a4a4U, 0xaaa59090U, 0x14696914U, 0x69691400U,
-   0xa08585a0U, 0xaa821414U, 0x50a4a450U, 0x6a5a0200U,
-   0xa9a58000U, 0x5090a0a8U, 0xa8a09050U, 0x24242424U,
-   0x00aa5500U, 0x24924924U, 0x24499224U, 0x50a50a50U,
-   0x500aa550U, 0xaaaa4444U, 0x66660000U, 0xa5a0a5a0U,
-   0x50a050a0U, 0x69286928U, 0x44aaaa44U, 0x66666600U,
-   0xaa444444U, 0x54a854a8U, 0x95809580U, 0x96969600U,
-   0xa85454a8U, 0x80959580U, 0xaa141414U, 0x96960000U,
-   0xaaaa1414U, 0xa05050a0U, 0xa0a5a5a0U, 0x96000000U,
-   0x40804080U, 0xa9a8a9a8U, 0xaaaaaa44U, 0x2a4a5254U
-};
-
-static const uint8_t
-anchor_indices[][N_PARTITIONS] = {
-   /* Anchor index values for the second subset of two-subset partitioning */
-   {
-      0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,
-      0xf,0x2,0x8,0x2,0x2,0x8,0x8,0xf,0x2,0x8,0x2,0x2,0x8,0x8,0x2,0x2,
-      0xf,0xf,0x6,0x8,0x2,0x8,0xf,0xf,0x2,0x8,0x2,0x2,0x2,0xf,0xf,0x6,
-      0x6,0x2,0x6,0x8,0xf,0xf,0x2,0x2,0xf,0xf,0xf,0xf,0xf,0x2,0x2,0xf
-   },
-
-   /* Anchor index values for the second subset of three-subset partitioning */
-   {
-      0x3,0x3,0xf,0xf,0x8,0x3,0xf,0xf,0x8,0x8,0x6,0x6,0x6,0x5,0x3,0x3,
-      0x3,0x3,0x8,0xf,0x3,0x3,0x6,0xa,0x5,0x8,0x8,0x6,0x8,0x5,0xf,0xf,
-      0x8,0xf,0x3,0x5,0x6,0xa,0x8,0xf,0xf,0x3,0xf,0x5,0xf,0xf,0xf,0xf,
-      0x3,0xf,0x5,0x5,0x5,0x8,0x5,0xa,0x5,0xa,0x8,0xd,0xf,0xc,0x3,0x3
-   },
-
-   /* Anchor index values for the third subset of three-subset
-    * partitioning
-    */
-   {
-      0xf,0x8,0x8,0x3,0xf,0xf,0x3,0x8,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x8,
-      0xf,0x8,0xf,0x3,0xf,0x8,0xf,0x8,0x3,0xf,0x6,0xa,0xf,0xf,0xa,0x8,
-      0xf,0x3,0xf,0xa,0xa,0x8,0x9,0xa,0x6,0xf,0x8,0xf,0x3,0x6,0x6,0x8,
-      0xf,0x3,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x3,0xf,0xf,0x8
-   }
-};
-
-static int
-extract_bits(const uint8_t *block,
-             int offset,
-             int n_bits)
+static void
+fetch_bptc_rgb_float(const GLubyte *map,
+                     GLint rowStride, GLint i, GLint j,
+                     GLfloat *texel,
+                     bool is_signed)
 {
-   int byte_index = offset / 8;
-   int bit_index = offset % 8;
-   int n_bits_in_byte = MIN2(n_bits, 8 - bit_index);
-   int result = 0;
-   int bit = 0;
+   const GLubyte *block;
 
-   while (true) {
-      result |= ((block[byte_index] >> bit_index) &
-                 ((1 << n_bits_in_byte) - 1)) << bit;
+   block = map + (((rowStride + 3) / 4) * (j / 4) + (i / 4)) * 16;
 
-      n_bits -= n_bits_in_byte;
-
-      if (n_bits <= 0)
-         return result;
-
-      bit += n_bits_in_byte;
-      byte_index++;
-      bit_index = 0;
-      n_bits_in_byte = MIN2(n_bits, 8);
-   }
-}
-
-static uint8_t
-expand_component(uint8_t byte,
-                 int n_bits)
-{
-   /* Expands a n-bit quantity into a byte by copying the most-significant
-    * bits into the unused least-significant bits.
-    */
-   return byte << (8 - n_bits) | (byte >> (2 * n_bits - 8));
-}
-
-static int
-extract_unorm_endpoints(const struct bptc_unorm_mode *mode,
-                        const uint8_t *block,
-                        int bit_offset,
-                        uint8_t endpoints[][4])
-{
-   int component;
-   int subset;
-   int endpoint;
-   int pbit;
-   int n_components;
-
-   /* Extract each color component */
-   for (component = 0; component < 3; component++) {
-      for (subset = 0; subset < mode->n_subsets; subset++) {
-         for (endpoint = 0; endpoint < 2; endpoint++) {
-            endpoints[subset * 2 + endpoint][component] =
-               extract_bits(block, bit_offset, mode->n_color_bits);
-            bit_offset += mode->n_color_bits;
-         }
-      }
-   }
-
-   /* Extract the alpha values */
-   if (mode->n_alpha_bits > 0) {
-      for (subset = 0; subset < mode->n_subsets; subset++) {
-         for (endpoint = 0; endpoint < 2; endpoint++) {
-            endpoints[subset * 2 + endpoint][3] =
-               extract_bits(block, bit_offset, mode->n_alpha_bits);
-            bit_offset += mode->n_alpha_bits;
-         }
-      }
-
-      n_components = 4;
-   } else {
-      for (subset = 0; subset < mode->n_subsets; subset++)
-         for (endpoint = 0; endpoint < 2; endpoint++)
-            endpoints[subset * 2 + endpoint][3] = 255;
-
-      n_components = 3;
-   }
-
-   /* Add in the p-bits */
-   if (mode->has_endpoint_pbits) {
-      for (subset = 0; subset < mode->n_subsets; subset++) {
-         for (endpoint = 0; endpoint < 2; endpoint++) {
-            pbit = extract_bits(block, bit_offset, 1);
-            bit_offset += 1;
-
-            for (component = 0; component < n_components; component++) {
-               endpoints[subset * 2 + endpoint][component] <<= 1;
-               endpoints[subset * 2 + endpoint][component] |= pbit;
-            }
-         }
-      }
-   } else if (mode->has_shared_pbits) {
-      for (subset = 0; subset < mode->n_subsets; subset++) {
-         pbit = extract_bits(block, bit_offset, 1);
-         bit_offset += 1;
-
-         for (endpoint = 0; endpoint < 2; endpoint++) {
-            for (component = 0; component < n_components; component++) {
-               endpoints[subset * 2 + endpoint][component] <<= 1;
-               endpoints[subset * 2 + endpoint][component] |= pbit;
-            }
-         }
-      }
-   }
-
-   /* Expand the n-bit values to a byte */
-   for (subset = 0; subset < mode->n_subsets; subset++) {
-      for (endpoint = 0; endpoint < 2; endpoint++) {
-         for (component = 0; component < 3; component++) {
-            endpoints[subset * 2 + endpoint][component] =
-               expand_component(endpoints[subset * 2 + endpoint][component],
-                                mode->n_color_bits +
-                                mode->has_endpoint_pbits +
-                                mode->has_shared_pbits);
-         }
-
-         if (mode->n_alpha_bits > 0) {
-            endpoints[subset * 2 + endpoint][3] =
-               expand_component(endpoints[subset * 2 + endpoint][3],
-                                mode->n_alpha_bits +
-                                mode->has_endpoint_pbits +
-                                mode->has_shared_pbits);
-         }
-      }
-   }
-
-   return bit_offset;
-}
-
-static bool
-is_anchor(int n_subsets,
-          int partition_num,
-          int texel)
-{
-   if (texel == 0)
-      return true;
-
-   switch (n_subsets) {
-   case 1:
-      return false;
-   case 2:
-      return anchor_indices[0][partition_num] == texel;
-   case 3:
-      return (anchor_indices[1][partition_num] == texel ||
-              anchor_indices[2][partition_num] == texel);
-   default:
-      assert(false);
-      return false;
-   }
-}
-
-static int
-count_anchors_before_texel(int n_subsets,
-                           int partition_num,
-                           int texel)
-{
-   int count = 1;
-
-   if (texel == 0)
-      return 0;
-
-   switch (n_subsets) {
-   case 1:
-      break;
-   case 2:
-      if (texel > anchor_indices[0][partition_num])
-         count++;
-      break;
-   case 3:
-      if (texel > anchor_indices[1][partition_num])
-         count++;
-      if (texel > anchor_indices[2][partition_num])
-         count++;
-      break;
-   default:
-      assert(false);
-      return 0;
-   }
-
-   return count;
-}
-
-static int32_t
-interpolate(int32_t a, int32_t b,
-            int index,
-            int index_bits)
-{
-   static const uint8_t weights2[] = { 0, 21, 43, 64 };
-   static const uint8_t weights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
-   static const uint8_t weights4[] =
-      { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
-   static const uint8_t *weights[] = {
-      NULL, NULL, weights2, weights3, weights4
-   };
-   int weight;
-
-   weight = weights[index_bits][index];
-
-   return ((64 - weight) * a + weight * b + 32) >> 6;
+   fetch_rgb_float_from_block(block, texel, (i % 4) + (j % 4) * 4, is_signed);
 }
 
 static void
-apply_rotation(int rotation,
-               uint8_t *result)
+fetch_bptc_rgb_signed_float(const GLubyte *map,
+                            GLint rowStride, GLint i, GLint j,
+                            GLfloat *texel)
 {
-   uint8_t t;
-
-   if (rotation == 0)
-      return;
-
-   rotation--;
-
-   t = result[rotation];
-   result[rotation] = result[3];
-   result[3] = t;
+   fetch_bptc_rgb_float(map, rowStride, i, j, texel, true);
 }
 
 static void
-fetch_rgba_unorm_from_block(const uint8_t *block,
-                            uint8_t *result,
-                            int texel)
+fetch_bptc_rgb_unsigned_float(const GLubyte *map,
+                              GLint rowStride, GLint i, GLint j,
+                              GLfloat *texel)
 {
-   int mode_num = ffs(block[0]);
-   const struct bptc_unorm_mode *mode;
-   int bit_offset, secondary_bit_offset;
-   int partition_num;
-   int subset_num;
-   int rotation;
-   int index_selection;
-   int index_bits;
-   int indices[2];
-   int index;
-   int anchors_before_texel;
-   bool anchor;
-   uint8_t endpoints[3 * 2][4];
-   uint32_t subsets;
-   int component;
-
-   if (mode_num == 0) {
-      /* According to the spec this mode is reserved and shouldn't be used. */
-      memset(result, 0, 3);
-      result[3] = 0xff;
-      return;
-   }
-
-   mode = bptc_unorm_modes + mode_num - 1;
-   bit_offset = mode_num;
-
-   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
-   bit_offset += mode->n_partition_bits;
-
-   switch (mode->n_subsets) {
-   case 1:
-      subsets = 0;
-      break;
-   case 2:
-      subsets = partition_table1[partition_num];
-      break;
-   case 3:
-      subsets = partition_table2[partition_num];
-      break;
-   default:
-      assert(false);
-      return;
-   }
-
-   if (mode->has_rotation_bits) {
-      rotation = extract_bits(block, bit_offset, 2);
-      bit_offset += 2;
-   } else {
-      rotation = 0;
-   }
-
-   if (mode->has_index_selection_bit) {
-      index_selection = extract_bits(block, bit_offset, 1);
-      bit_offset++;
-   } else {
-      index_selection = 0;
-   }
-
-   bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
-
-   anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
-                                                     partition_num, texel);
-
-   /* Calculate the offset to the secondary index */
-   secondary_bit_offset = (bit_offset +
-                           BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
-                           mode->n_subsets +
-                           mode->n_secondary_index_bits * texel -
-                           anchors_before_texel);
-
-   /* Calculate the offset to the primary index for this texel */
-   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
-
-   subset_num = (subsets >> (texel * 2)) & 3;
-
-   anchor = is_anchor(mode->n_subsets, partition_num, texel);
-
-   index_bits = mode->n_index_bits;
-   if (anchor)
-      index_bits--;
-   indices[0] = extract_bits(block, bit_offset, index_bits);
-
-   if (mode->n_secondary_index_bits) {
-      index_bits = mode->n_secondary_index_bits;
-      if (anchor)
-         index_bits--;
-      indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
-   }
-
-   index = indices[index_selection];
-   index_bits = (index_selection ?
-                 mode->n_secondary_index_bits :
-                 mode->n_index_bits);
-
-   for (component = 0; component < 3; component++)
-      result[component] = interpolate(endpoints[subset_num * 2][component],
-                                      endpoints[subset_num * 2 + 1][component],
-                                      index,
-                                      index_bits);
-
-   /* Alpha uses the opposite index from the color components */
-   if (mode->n_secondary_index_bits && !index_selection) {
-      index = indices[1];
-      index_bits = mode->n_secondary_index_bits;
-   } else {
-      index = indices[0];
-      index_bits = mode->n_index_bits;
-   }
-
-   result[3] = interpolate(endpoints[subset_num * 2][3],
-                           endpoints[subset_num * 2 + 1][3],
-                           index,
-                           index_bits);
-
-   apply_rotation(rotation, result);
+   fetch_bptc_rgb_float(map, rowStride, i, j, texel, false);
 }
 
 static void
@@ -699,257 +105,6 @@
    texel[ACOMP] = UBYTE_TO_FLOAT(texel_bytes[3]);
 }
 
-static int32_t
-sign_extend(int32_t value,
-            int n_bits)
-{
-   if ((value & (1 << (n_bits - 1)))) {
-      value |= (~(int32_t) 0) << n_bits;
-   }
-
-   return value;
-}
-
-static int
-signed_unquantize(int value, int n_endpoint_bits)
-{
-   bool sign;
-
-   if (n_endpoint_bits >= 16)
-      return value;
-
-   if (value == 0)
-      return 0;
-
-   sign = false;
-
-   if (value < 0) {
-      sign = true;
-      value = -value;
-   }
-
-   if (value >= (1 << (n_endpoint_bits - 1)) - 1)
-      value = 0x7fff;
-   else
-      value = ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
-
-   if (sign)
-      value = -value;
-
-   return value;
-}
-
-static int
-unsigned_unquantize(int value, int n_endpoint_bits)
-{
-   if (n_endpoint_bits >= 15)
-      return value;
-
-   if (value == 0)
-      return 0;
-
-   if (value == (1 << n_endpoint_bits) - 1)
-      return 0xffff;
-
-   return ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
-}
-
-static int
-extract_float_endpoints(const struct bptc_float_mode *mode,
-                        const uint8_t *block,
-                        int bit_offset,
-                        int32_t endpoints[][3],
-                        bool is_signed)
-{
-   const struct bptc_float_bitfield *bitfield;
-   int endpoint, component;
-   int n_endpoints;
-   int value;
-   int i;
-
-   if (mode->n_partition_bits)
-      n_endpoints = 4;
-   else
-      n_endpoints = 2;
-
-   memset(endpoints, 0, sizeof endpoints[0][0] * n_endpoints * 3);
-
-   for (bitfield = mode->bitfields; bitfield->endpoint != -1; bitfield++) {
-      value = extract_bits(block, bit_offset, bitfield->n_bits);
-      bit_offset += bitfield->n_bits;
-
-      if (bitfield->reverse) {
-         for (i = 0; i < bitfield->n_bits; i++) {
-            if (value & (1 << i))
-               endpoints[bitfield->endpoint][bitfield->component] |=
-                  1 << ((bitfield->n_bits - 1 - i) + bitfield->offset);
-         }
-      } else {
-         endpoints[bitfield->endpoint][bitfield->component] |=
-            value << bitfield->offset;
-      }
-   }
-
-   if (mode->transformed_endpoints) {
-      /* The endpoints are specified as signed offsets from e0 */
-      for (endpoint = 1; endpoint < n_endpoints; endpoint++) {
-         for (component = 0; component < 3; component++) {
-            value = sign_extend(endpoints[endpoint][component],
-                                mode->n_delta_bits[component]);
-            endpoints[endpoint][component] =
-               ((endpoints[0][component] + value) &
-                ((1 << mode->n_endpoint_bits) - 1));
-         }
-      }
-   }
-
-   if (is_signed) {
-      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
-         for (component = 0; component < 3; component++) {
-            value = sign_extend(endpoints[endpoint][component],
-                                mode->n_endpoint_bits);
-            endpoints[endpoint][component] =
-               signed_unquantize(value, mode->n_endpoint_bits);
-         }
-      }
-   } else {
-      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
-         for (component = 0; component < 3; component++) {
-            endpoints[endpoint][component] =
-               unsigned_unquantize(endpoints[endpoint][component],
-                                   mode->n_endpoint_bits);
-         }
-      }
-   }
-
-   return bit_offset;
-}
-
-static int32_t
-finish_unsigned_unquantize(int32_t value)
-{
-   return value * 31 / 64;
-}
-
-static int32_t
-finish_signed_unquantize(int32_t value)
-{
-   if (value < 0)
-      return (-value * 31 / 32) | 0x8000;
-   else
-      return value * 31 / 32;
-}
-
-static void
-fetch_rgb_float_from_block(const uint8_t *block,
-                           float *result,
-                           int texel,
-                           bool is_signed)
-{
-   int mode_num;
-   const struct bptc_float_mode *mode;
-   int bit_offset;
-   int partition_num;
-   int subset_num;
-   int index_bits;
-   int index;
-   int anchors_before_texel;
-   int32_t endpoints[2 * 2][3];
-   uint32_t subsets;
-   int n_subsets;
-   int component;
-   int32_t value;
-
-   if (block[0] & 0x2) {
-      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
-      bit_offset = 5;
-   } else {
-      mode_num = block[0] & 3;
-      bit_offset = 2;
-   }
-
-   mode = bptc_float_modes + mode_num;
-
-   if (mode->reserved) {
-      memset(result, 0, sizeof result[0] * 3);
-      result[3] = 1.0f;
-      return;
-   }
-
-   bit_offset = extract_float_endpoints(mode, block, bit_offset,
-                                        endpoints, is_signed);
-
-   if (mode->n_partition_bits) {
-      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
-      bit_offset += mode->n_partition_bits;
-
-      subsets = partition_table1[partition_num];
-      n_subsets = 2;
-   } else {
-      partition_num = 0;
-      subsets = 0;
-      n_subsets = 1;
-   }
-
-   anchors_before_texel =
-      count_anchors_before_texel(n_subsets, partition_num, texel);
-
-   /* Calculate the offset to the primary index for this texel */
-   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
-
-   subset_num = (subsets >> (texel * 2)) & 3;
-
-   index_bits = mode->n_index_bits;
-   if (is_anchor(n_subsets, partition_num, texel))
-      index_bits--;
-   index = extract_bits(block, bit_offset, index_bits);
-
-   for (component = 0; component < 3; component++) {
-      value = interpolate(endpoints[subset_num * 2][component],
-                          endpoints[subset_num * 2 + 1][component],
-                          index,
-                          mode->n_index_bits);
-
-      if (is_signed)
-         value = finish_signed_unquantize(value);
-      else
-         value = finish_unsigned_unquantize(value);
-
-      result[component] = _mesa_half_to_float(value);
-   }
-
-   result[3] = 1.0f;
-}
-
-static void
-fetch_bptc_rgb_float(const GLubyte *map,
-                     GLint rowStride, GLint i, GLint j,
-                     GLfloat *texel,
-                     bool is_signed)
-{
-   const GLubyte *block;
-
-   block = map + (((rowStride + 3) / 4) * (j / 4) + (i / 4)) * 16;
-
-   fetch_rgb_float_from_block(block, texel, (i % 4) + (j % 4) * 4, is_signed);
-}
-
-static void
-fetch_bptc_rgb_signed_float(const GLubyte *map,
-                            GLint rowStride, GLint i, GLint j,
-                            GLfloat *texel)
-{
-   fetch_bptc_rgb_float(map, rowStride, i, j, texel, true);
-}
-
-static void
-fetch_bptc_rgb_unsigned_float(const GLubyte *map,
-                              GLint rowStride, GLint i, GLint j,
-                              GLfloat *texel)
-{
-   fetch_bptc_rgb_float(map, rowStride, i, j, texel, false);
-}
-
 compressed_fetch_func
 _mesa_get_bptc_fetch_func(mesa_format format)
 {
@@ -967,312 +122,6 @@
    }
 }
 
-static void
-write_bits(struct bit_writer *writer, int n_bits, int value)
-{
-   do {
-      if (n_bits + writer->pos >= 8) {
-         *(writer->dst++) = writer->buf | (value << writer->pos);
-         writer->buf = 0;
-         value >>= (8 - writer->pos);
-         n_bits -= (8 - writer->pos);
-         writer->pos = 0;
-      } else {
-         writer->buf |= value << writer->pos;
-         writer->pos += n_bits;
-         break;
-      }
-   } while (n_bits > 0);
-}
-
-static void
-get_average_luminance_alpha_unorm(int width, int height,
-                                  const uint8_t *src, int src_rowstride,
-                                  int *average_luminance, int *average_alpha)
-{
-   int luminance_sum = 0, alpha_sum = 0;
-   int y, x;
-
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-         luminance_sum += src[0] + src[1] + src[2];
-         alpha_sum += src[3];
-         src += 4;
-      }
-      src += src_rowstride - width * 4;
-   }
-
-   *average_luminance = luminance_sum / (width * height);
-   *average_alpha = alpha_sum / (width * height);
-}
-
-static void
-get_rgba_endpoints_unorm(int width, int height,
-                         const uint8_t *src, int src_rowstride,
-                         int average_luminance, int average_alpha,
-                         uint8_t endpoints[][4])
-{
-   int endpoint_luminances[2];
-   int midpoint;
-   int sums[2][4];
-   int endpoint;
-   int luminance;
-   uint8_t temp[3];
-   const uint8_t *p = src;
-   int rgb_left_endpoint_count = 0;
-   int alpha_left_endpoint_count = 0;
-   int y, x, i;
-
-   memset(sums, 0, sizeof sums);
-
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-         luminance = p[0] + p[1] + p[2];
-         if (luminance < average_luminance) {
-            endpoint = 0;
-            rgb_left_endpoint_count++;
-         } else {
-            endpoint = 1;
-         }
-         for (i = 0; i < 3; i++)
-            sums[endpoint][i] += p[i];
-
-         if (p[2] < average_alpha) {
-            endpoint = 0;
-            alpha_left_endpoint_count++;
-         } else {
-            endpoint = 1;
-         }
-         sums[endpoint][3] += p[3];
-
-         p += 4;
-      }
-
-      p += src_rowstride - width * 4;
-   }
-
-   if (rgb_left_endpoint_count == 0 ||
-       rgb_left_endpoint_count == width * height) {
-      for (i = 0; i < 3; i++)
-         endpoints[0][i] = endpoints[1][i] =
-            (sums[0][i] + sums[1][i]) / (width * height);
-   } else {
-      for (i = 0; i < 3; i++) {
-         endpoints[0][i] = sums[0][i] / rgb_left_endpoint_count;
-         endpoints[1][i] = (sums[1][i] /
-                            (width * height - rgb_left_endpoint_count));
-      }
-   }
-
-   if (alpha_left_endpoint_count == 0 ||
-       alpha_left_endpoint_count == width * height) {
-      endpoints[0][3] = endpoints[1][3] =
-         (sums[0][3] + sums[1][3]) / (width * height);
-   } else {
-         endpoints[0][3] = sums[0][3] / alpha_left_endpoint_count;
-         endpoints[1][3] = (sums[1][3] /
-                            (width * height - alpha_left_endpoint_count));
-   }
-
-   /* We may need to swap the endpoints to ensure the most-significant bit of
-    * the first index is zero */
-
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      endpoint_luminances[endpoint] =
-         endpoints[endpoint][0] +
-         endpoints[endpoint][1] +
-         endpoints[endpoint][2];
-   }
-   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2;
-
-   if ((src[0] + src[1] + src[2] <= midpoint) !=
-       (endpoint_luminances[0] <= midpoint)) {
-      memcpy(temp, endpoints[0], 3);
-      memcpy(endpoints[0], endpoints[1], 3);
-      memcpy(endpoints[1], temp, 3);
-   }
-
-   /* Same for the alpha endpoints */
-
-   midpoint = (endpoints[0][3] + endpoints[1][3]) / 2;
-
-   if ((src[3] <= midpoint) != (endpoints[0][3] <= midpoint)) {
-      temp[0] = endpoints[0][3];
-      endpoints[0][3] = endpoints[1][3];
-      endpoints[1][3] = temp[0];
-   }
-}
-
-static void
-write_rgb_indices_unorm(struct bit_writer *writer,
-                        int src_width, int src_height,
-                        const uint8_t *src, int src_rowstride,
-                        uint8_t endpoints[][4])
-{
-   int luminance;
-   int endpoint_luminances[2];
-   int endpoint;
-   int index;
-   int y, x;
-
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      endpoint_luminances[endpoint] =
-         endpoints[endpoint][0] +
-         endpoints[endpoint][1] +
-         endpoints[endpoint][2];
-   }
-
-   /* If the endpoints have the same luminance then we'll just use index 0 for
-    * all of the texels */
-   if (endpoint_luminances[0] == endpoint_luminances[1]) {
-      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 2 - 1, 0);
-      return;
-   }
-
-   for (y = 0; y < src_height; y++) {
-      for (x = 0; x < src_width; x++) {
-         luminance = src[0] + src[1] + src[2];
-
-         index = ((luminance - endpoint_luminances[0]) * 3 /
-                  (endpoint_luminances[1] - endpoint_luminances[0]));
-         if (index < 0)
-            index = 0;
-         else if (index > 3)
-            index = 3;
-
-         assert(x != 0 || y != 0 || index < 2);
-
-         write_bits(writer, (x == 0 && y == 0) ? 1 : 2, index);
-
-         src += 4;
-      }
-
-      /* Pad the indices out to the block size */
-      if (src_width < BLOCK_SIZE)
-         write_bits(writer, 2 * (BLOCK_SIZE - src_width), 0);
-
-      src += src_rowstride - src_width * 4;
-   }
-
-   /* Pad the indices out to the block size */
-   if (src_height < BLOCK_SIZE)
-      write_bits(writer, 2 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
-}
-
-static void
-write_alpha_indices_unorm(struct bit_writer *writer,
-                          int src_width, int src_height,
-                          const uint8_t *src, int src_rowstride,
-                          uint8_t endpoints[][4])
-{
-   int index;
-   int y, x;
-
-   /* If the endpoints have the same alpha then we'll just use index 0 for
-    * all of the texels */
-   if (endpoints[0][3] == endpoints[1][3]) {
-      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 3 - 1, 0);
-      return;
-   }
-
-   for (y = 0; y < src_height; y++) {
-      for (x = 0; x < src_width; x++) {
-         index = (((int) src[3] - (int) endpoints[0][3]) * 7 /
-                  ((int) endpoints[1][3] - endpoints[0][3]));
-         if (index < 0)
-            index = 0;
-         else if (index > 7)
-            index = 7;
-
-         assert(x != 0 || y != 0 || index < 4);
-
-         /* The first index has one less bit */
-         write_bits(writer, (x == 0 && y == 0) ? 2 : 3, index);
-
-         src += 4;
-      }
-
-      /* Pad the indices out to the block size */
-      if (src_width < BLOCK_SIZE)
-         write_bits(writer, 3 * (BLOCK_SIZE - src_width), 0);
-
-      src += src_rowstride - src_width * 4;
-   }
-
-   /* Pad the indices out to the block size */
-   if (src_height < BLOCK_SIZE)
-      write_bits(writer, 3 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
-}
-
-static void
-compress_rgba_unorm_block(int src_width, int src_height,
-                          const uint8_t *src, int src_rowstride,
-                          uint8_t *dst)
-{
-   int average_luminance, average_alpha;
-   uint8_t endpoints[2][4];
-   struct bit_writer writer;
-   int component, endpoint;
-
-   get_average_luminance_alpha_unorm(src_width, src_height, src, src_rowstride,
-                                     &average_luminance, &average_alpha);
-   get_rgba_endpoints_unorm(src_width, src_height, src, src_rowstride,
-                            average_luminance, average_alpha,
-                            endpoints);
-
-   writer.dst = dst;
-   writer.pos = 0;
-   writer.buf = 0;
-
-   write_bits(&writer, 5, 0x10); /* mode 4 */
-   write_bits(&writer, 2, 0); /* rotation 0 */
-   write_bits(&writer, 1, 0); /* index selection bit */
-
-   /* Write the color endpoints */
-   for (component = 0; component < 3; component++)
-      for (endpoint = 0; endpoint < 2; endpoint++)
-         write_bits(&writer, 5, endpoints[endpoint][component] >> 3);
-
-   /* Write the alpha endpoints */
-   for (endpoint = 0; endpoint < 2; endpoint++)
-      write_bits(&writer, 6, endpoints[endpoint][3] >> 2);
-
-   write_rgb_indices_unorm(&writer,
-                           src_width, src_height,
-                           src, src_rowstride,
-                           endpoints);
-   write_alpha_indices_unorm(&writer,
-                             src_width, src_height,
-                             src, src_rowstride,
-                             endpoints);
-}
-
-static void
-compress_rgba_unorm(int width, int height,
-                    const uint8_t *src, int src_rowstride,
-                    uint8_t *dst, int dst_rowstride)
-{
-   int dst_row_diff;
-   int y, x;
-
-   if (dst_rowstride >= width * 4)
-      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
-   else
-      dst_row_diff = 0;
-
-   for (y = 0; y < height; y += BLOCK_SIZE) {
-      for (x = 0; x < width; x += BLOCK_SIZE) {
-         compress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
-                                   MIN2(height - y, BLOCK_SIZE),
-                                   src + x * 4 + y * src_rowstride,
-                                   src_rowstride,
-                                   dst);
-         dst += BLOCK_BYTES;
-      }
-      dst += dst_row_diff;
-   }
-}
-
 GLboolean
 _mesa_texstore_bptc_rgba_unorm(TEXSTORE_PARAMS)
 {
@@ -1318,272 +167,6 @@
    return GL_TRUE;
 }
 
-static float
-get_average_luminance_float(int width, int height,
-                            const float *src, int src_rowstride)
-{
-   float luminance_sum = 0;
-   int y, x;
-
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-         luminance_sum += src[0] + src[1] + src[2];
-         src += 3;
-      }
-      src += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
-   }
-
-   return luminance_sum / (width * height);
-}
-
-static float
-clamp_value(float value, bool is_signed)
-{
-   if (value > 65504.0f)
-      return 65504.0f;
-
-   if (is_signed) {
-      if (value < -65504.0f)
-         return -65504.0f;
-      else
-         return value;
-   }
-
-   if (value < 0.0f)
-      return 0.0f;
-
-   return value;
-}
-
-static void
-get_endpoints_float(int width, int height,
-                    const float *src, int src_rowstride,
-                    float average_luminance, float endpoints[][3],
-                    bool is_signed)
-{
-   float endpoint_luminances[2];
-   float midpoint;
-   float sums[2][3];
-   int endpoint, component;
-   float luminance;
-   float temp[3];
-   const float *p = src;
-   int left_endpoint_count = 0;
-   int y, x, i;
-
-   memset(sums, 0, sizeof sums);
-
-   for (y = 0; y < height; y++) {
-      for (x = 0; x < width; x++) {
-         luminance = p[0] + p[1] + p[2];
-         if (luminance < average_luminance) {
-            endpoint = 0;
-            left_endpoint_count++;
-         } else {
-            endpoint = 1;
-         }
-         for (i = 0; i < 3; i++)
-            sums[endpoint][i] += p[i];
-
-         p += 3;
-      }
-
-      p += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
-   }
-
-   if (left_endpoint_count == 0 ||
-       left_endpoint_count == width * height) {
-      for (i = 0; i < 3; i++)
-         endpoints[0][i] = endpoints[1][i] =
-            (sums[0][i] + sums[1][i]) / (width * height);
-   } else {
-      for (i = 0; i < 3; i++) {
-         endpoints[0][i] = sums[0][i] / left_endpoint_count;
-         endpoints[1][i] = sums[1][i] / (width * height - left_endpoint_count);
-      }
-   }
-
-   /* Clamp the endpoints to the range of a half float and strip out
-    * infinities */
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      for (component = 0; component < 3; component++) {
-         endpoints[endpoint][component] =
-            clamp_value(endpoints[endpoint][component], is_signed);
-      }
-   }
-
-   /* We may need to swap the endpoints to ensure the most-significant bit of
-    * the first index is zero */
-
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      endpoint_luminances[endpoint] =
-         endpoints[endpoint][0] +
-         endpoints[endpoint][1] +
-         endpoints[endpoint][2];
-   }
-   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2.0f;
-
-   if ((src[0] + src[1] + src[2] <= midpoint) !=
-       (endpoint_luminances[0] <= midpoint)) {
-      memcpy(temp, endpoints[0], sizeof temp);
-      memcpy(endpoints[0], endpoints[1], sizeof temp);
-      memcpy(endpoints[1], temp, sizeof temp);
-   }
-}
-
-static void
-write_rgb_indices_float(struct bit_writer *writer,
-                        int src_width, int src_height,
-                        const float *src, int src_rowstride,
-                        float endpoints[][3])
-{
-   float luminance;
-   float endpoint_luminances[2];
-   int endpoint;
-   int index;
-   int y, x;
-
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      endpoint_luminances[endpoint] =
-         endpoints[endpoint][0] +
-         endpoints[endpoint][1] +
-         endpoints[endpoint][2];
-   }
-
-   /* If the endpoints have the same luminance then we'll just use index 0 for
-    * all of the texels */
-   if (endpoint_luminances[0] == endpoint_luminances[1]) {
-      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 4 - 1, 0);
-      return;
-   }
-
-   for (y = 0; y < src_height; y++) {
-      for (x = 0; x < src_width; x++) {
-         luminance = src[0] + src[1] + src[2];
-
-         index = ((luminance - endpoint_luminances[0]) * 15 /
-                  (endpoint_luminances[1] - endpoint_luminances[0]));
-         if (index < 0)
-            index = 0;
-         else if (index > 15)
-            index = 15;
-
-         assert(x != 0 || y != 0 || index < 8);
-
-         write_bits(writer, (x == 0 && y == 0) ? 3 : 4, index);
-
-         src += 3;
-      }
-
-      /* Pad the indices out to the block size */
-      if (src_width < BLOCK_SIZE)
-         write_bits(writer, 4 * (BLOCK_SIZE - src_width), 0);
-
-      src += (src_rowstride - src_width * 3 * sizeof (float)) / sizeof (float);
-   }
-
-   /* Pad the indices out to the block size */
-   if (src_height < BLOCK_SIZE)
-      write_bits(writer, 4 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
-}
-
-static int
-get_endpoint_value(float value, bool is_signed)
-{
-   bool sign = false;
-   int half;
-
-   if (is_signed) {
-      half = _mesa_float_to_half(value);
-
-      if (half & 0x8000) {
-         half &= 0x7fff;
-         sign = true;
-      }
-
-      half = (32 * half / 31) >> 6;
-
-      if (sign)
-         half = -half & ((1 << 10) - 1);
-
-      return half;
-   } else {
-      if (value <= 0.0f)
-         return 0;
-
-      half = _mesa_float_to_half(value);
-
-      return (64 * half / 31) >> 6;
-   }
-}
-
-static void
-compress_rgb_float_block(int src_width, int src_height,
-                         const float *src, int src_rowstride,
-                         uint8_t *dst,
-                         bool is_signed)
-{
-   float average_luminance;
-   float endpoints[2][3];
-   struct bit_writer writer;
-   int component, endpoint;
-   int endpoint_value;
-
-   average_luminance =
-      get_average_luminance_float(src_width, src_height, src, src_rowstride);
-   get_endpoints_float(src_width, src_height, src, src_rowstride,
-                       average_luminance, endpoints, is_signed);
-
-   writer.dst = dst;
-   writer.pos = 0;
-   writer.buf = 0;
-
-   write_bits(&writer, 5, 3); /* mode 3 */
-
-   /* Write the endpoints */
-   for (endpoint = 0; endpoint < 2; endpoint++) {
-      for (component = 0; component < 3; component++) {
-         endpoint_value =
-            get_endpoint_value(endpoints[endpoint][component], is_signed);
-         write_bits(&writer, 10, endpoint_value);
-      }
-   }
-
-   write_rgb_indices_float(&writer,
-                           src_width, src_height,
-                           src, src_rowstride,
-                           endpoints);
-}
-
-static void
-compress_rgb_float(int width, int height,
-                   const float *src, int src_rowstride,
-                   uint8_t *dst, int dst_rowstride,
-                   bool is_signed)
-{
-   int dst_row_diff;
-   int y, x;
-
-   if (dst_rowstride >= width * 4)
-      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
-   else
-      dst_row_diff = 0;
-
-   for (y = 0; y < height; y += BLOCK_SIZE) {
-      for (x = 0; x < width; x += BLOCK_SIZE) {
-         compress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
-                                  MIN2(height - y, BLOCK_SIZE),
-                                  src + x * 3 +
-                                  y * src_rowstride / sizeof (float),
-                                  src_rowstride,
-                                  dst,
-                                  is_signed);
-         dst += BLOCK_BYTES;
-      }
-      dst += dst_row_diff;
-   }
-}
-
 static GLboolean
 texstore_bptc_rgb_float(TEXSTORE_PARAMS,
                         bool is_signed)
diff --git a/src/mesa/main/texcompress_bptc_tmp.h b/src/mesa/main/texcompress_bptc_tmp.h
new file mode 100644
index 0000000..3c4ea2c
--- /dev/null
+++ b/src/mesa/main/texcompress_bptc_tmp.h
@@ -0,0 +1,1743 @@
+/*
+ * Copyright (C) 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Included by texcompress_bptc and gallium to define BPTC decoding routines.
+ */
+
+#include "util/format_srgb.h"
+#include "util/half_float.h"
+#include "macros.h"
+
+#define BLOCK_SIZE 4
+#define N_PARTITIONS 64
+#define BLOCK_BYTES 16
+
+struct bptc_unorm_mode {
+   int n_subsets;
+   int n_partition_bits;
+   bool has_rotation_bits;
+   bool has_index_selection_bit;
+   int n_color_bits;
+   int n_alpha_bits;
+   bool has_endpoint_pbits;
+   bool has_shared_pbits;
+   int n_index_bits;
+   int n_secondary_index_bits;
+};
+
+struct bptc_float_bitfield {
+   int8_t endpoint;
+   uint8_t component;
+   uint8_t offset;
+   uint8_t n_bits;
+   bool reverse;
+};
+
+struct bptc_float_mode {
+   bool reserved;
+   bool transformed_endpoints;
+   int n_partition_bits;
+   int n_endpoint_bits;
+   int n_index_bits;
+   int n_delta_bits[3];
+   struct bptc_float_bitfield bitfields[24];
+};
+
+struct bit_writer {
+   uint8_t buf;
+   int pos;
+   uint8_t *dst;
+};
+
+static const struct bptc_unorm_mode
+bptc_unorm_modes[] = {
+   /* 0 */ { 3, 4, false, false, 4, 0, true,  false, 3, 0 },
+   /* 1 */ { 2, 6, false, false, 6, 0, false, true,  3, 0 },
+   /* 2 */ { 3, 6, false, false, 5, 0, false, false, 2, 0 },
+   /* 3 */ { 2, 6, false, false, 7, 0, true,  false, 2, 0 },
+   /* 4 */ { 1, 0, true,  true,  5, 6, false, false, 2, 3 },
+   /* 5 */ { 1, 0, true,  false, 7, 8, false, false, 2, 2 },
+   /* 6 */ { 1, 0, false, false, 7, 7, true,  false, 4, 0 },
+   /* 7 */ { 2, 6, false, false, 5, 5, true,  false, 2, 0 }
+};
+
+static const struct bptc_float_mode
+bptc_float_modes[] = {
+   /* 00 */
+   { false, true, 5, 10, 3, { 5, 5, 5 },
+     { { 2, 1, 4, 1, false }, { 2, 2, 4, 1, false }, { 3, 2, 4, 1, false },
+       { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
+       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
+       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
+       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
+       { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 01 */
+   { false, true, 5, 7, 3, { 6, 6, 6 },
+     { { 2, 1, 5, 1, false }, { 3, 1, 4, 1, false }, { 3, 1, 5, 1, false },
+       { 0, 0, 0, 7, false }, { 3, 2, 0, 1, false }, { 3, 2, 1, 1, false },
+       { 2, 2, 4, 1, false }, { 0, 1, 0, 7, false }, { 2, 2, 5, 1, false },
+       { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false }, { 0, 2, 0, 7, false },
+       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
+       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
+       { 2, 0, 0, 6, false },
+       { 3, 0, 0, 6, false },
+       { -1 } }
+   },
+   /* 00010 */
+   { false, true, 5, 11, 3, { 5, 4, 4 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 5, false }, { 0, 0, 10, 1, false }, { 2, 1, 0, 4, false },
+       { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false }, { 3, 2, 0, 1, false },
+       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
+       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
+       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 00011 */
+   { false, false, 0, 10, 4, { 10, 10, 10 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 10, false }, { 1, 1, 0, 10, false }, { 1, 2, 0, 10, false },
+       { -1 } }
+   },
+   /* 00110 */
+   { false, true, 5, 11, 3, { 4, 5, 4 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 3, 1, 4, 1, false },
+       { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false }, { 0, 1, 10, 1, false },
+       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
+       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
+       { 3, 2, 0, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
+       { 2, 1, 4, 1, false }, { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 00111 */
+   { false, true, 0, 11, 4, { 9, 9, 9 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 9, false }, { 0, 0, 10, 1, false }, { 1, 1, 0, 9, false },
+       { 0, 1, 10, 1, false }, { 1, 2, 0, 9, false }, { 0, 2, 10, 1, false },
+       { -1 } }
+   },
+   /* 01010 */
+   { false, true, 5, 11, 3, { 4, 4, 5 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 2, 2, 4, 1, false },
+       { 2, 1, 0, 4, false }, { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false },
+       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
+       { 0, 2, 10, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
+       { 3, 2, 1, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
+       { 3, 2, 4, 1, false }, { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 01011 */
+   { false, true, 0, 12, 4, { 8, 8, 8 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 8, false }, { 0, 0, 10, 2, true }, { 1, 1, 0, 8, false },
+       { 0, 1, 10, 2, true }, { 1, 2, 0, 8, false }, { 0, 2, 10, 2, true },
+       { -1 } }
+   },
+   /* 01110 */
+   { false, true, 5, 9, 3, { 5, 5, 5 },
+     { { 0, 0, 0, 9, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 9, false },
+       { 2, 1, 4, 1, false }, { 0, 2, 0, 9, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
+       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
+       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
+       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
+       { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 01111 */
+   { false, true, 0, 16, 4, { 4, 4, 4 },
+     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
+       { 1, 0, 0, 4, false }, { 0, 0, 10, 6, true }, { 1, 1, 0, 4, false },
+       { 0, 1, 10, 6, true }, { 1, 2, 0, 4, false }, { 0, 2, 10, 6, true },
+       { -1 } }
+   },
+   /* 10010 */
+   { false, true, 5, 8, 3, { 6, 5, 5 },
+     { { 0, 0, 0, 8, false }, { 3, 1, 4, 1, false }, { 2, 2, 4, 1, false },
+       { 0, 1, 0, 8, false }, { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false },
+       { 0, 2, 0, 8, false }, { 3, 2, 3, 1, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false },
+       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
+       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 6, false },
+       { 3, 0, 0, 6, false },
+       { -1 } }
+   },
+   /* 10011 */
+   { true /* reserved */ },
+   /* 10110 */
+   { false, true, 5, 8, 3, { 5, 6, 5 },
+     { { 0, 0, 0, 8, false }, { 3, 2, 0, 1, false }, { 2, 2, 4, 1, false },
+       { 0, 1, 0, 8, false }, { 2, 1, 5, 1, false }, { 2, 1, 4, 1, false },
+       { 0, 2, 0, 8, false }, { 3, 1, 5, 1, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
+       { 1, 1, 0, 6, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
+       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
+       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 10111 */
+   { true /* reserved */ },
+   /* 11010 */
+   { false, true, 5, 8, 3, { 5, 5, 6 },
+     { { 0, 0, 0, 8, false }, { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false },
+       { 0, 1, 0, 8, false }, { 2, 2, 5, 1, false }, { 2, 1, 4, 1, false },
+       { 0, 2, 0, 8, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
+       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
+       { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
+       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
+       { -1 } }
+   },
+   /* 11011 */
+   { true /* reserved */ },
+   /* 11110 */
+   { false, false, 5, 6, 3, { 6, 6, 6 },
+     { { 0, 0, 0, 6, false }, { 3, 1, 4, 1, false }, { 3, 2, 0, 1, false },
+       { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 6, false },
+       { 2, 1, 5, 1, false }, { 2, 2, 5, 1, false }, { 3, 2, 2, 1, false },
+       { 2, 1, 4, 1, false }, { 0, 2, 0, 6, false }, { 3, 1, 5, 1, false },
+       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
+       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
+       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
+       { 2, 0, 0, 6, false }, { 3, 0, 0, 6, false },
+       { -1 } }
+   },
+   /* 11111 */
+   { true /* reserved */ },
+};
+
+/* This partition table is used when the mode has two subsets. Each
+ * partition is represented by a 32-bit value which gives 2 bits per texel
+ * within the block. The value of the two bits represents which subset to use
+ * (0 or 1).
+ */
+static const uint32_t
+partition_table1[N_PARTITIONS] = {
+   0x50505050U, 0x40404040U, 0x54545454U, 0x54505040U,
+   0x50404000U, 0x55545450U, 0x55545040U, 0x54504000U,
+   0x50400000U, 0x55555450U, 0x55544000U, 0x54400000U,
+   0x55555440U, 0x55550000U, 0x55555500U, 0x55000000U,
+   0x55150100U, 0x00004054U, 0x15010000U, 0x00405054U,
+   0x00004050U, 0x15050100U, 0x05010000U, 0x40505054U,
+   0x00404050U, 0x05010100U, 0x14141414U, 0x05141450U,
+   0x01155440U, 0x00555500U, 0x15014054U, 0x05414150U,
+   0x44444444U, 0x55005500U, 0x11441144U, 0x05055050U,
+   0x05500550U, 0x11114444U, 0x41144114U, 0x44111144U,
+   0x15055054U, 0x01055040U, 0x05041050U, 0x05455150U,
+   0x14414114U, 0x50050550U, 0x41411414U, 0x00141400U,
+   0x00041504U, 0x00105410U, 0x10541000U, 0x04150400U,
+   0x50410514U, 0x41051450U, 0x05415014U, 0x14054150U,
+   0x41050514U, 0x41505014U, 0x40011554U, 0x54150140U,
+   0x50505500U, 0x00555050U, 0x15151010U, 0x54540404U,
+};
+
+/* This partition table is used when the mode has three subsets. In this case
+ * the values can be 0, 1 or 2.
+ */
+static const uint32_t
+partition_table2[N_PARTITIONS] = {
+   0xaa685050U, 0x6a5a5040U, 0x5a5a4200U, 0x5450a0a8U,
+   0xa5a50000U, 0xa0a05050U, 0x5555a0a0U, 0x5a5a5050U,
+   0xaa550000U, 0xaa555500U, 0xaaaa5500U, 0x90909090U,
+   0x94949494U, 0xa4a4a4a4U, 0xa9a59450U, 0x2a0a4250U,
+   0xa5945040U, 0x0a425054U, 0xa5a5a500U, 0x55a0a0a0U,
+   0xa8a85454U, 0x6a6a4040U, 0xa4a45000U, 0x1a1a0500U,
+   0x0050a4a4U, 0xaaa59090U, 0x14696914U, 0x69691400U,
+   0xa08585a0U, 0xaa821414U, 0x50a4a450U, 0x6a5a0200U,
+   0xa9a58000U, 0x5090a0a8U, 0xa8a09050U, 0x24242424U,
+   0x00aa5500U, 0x24924924U, 0x24499224U, 0x50a50a50U,
+   0x500aa550U, 0xaaaa4444U, 0x66660000U, 0xa5a0a5a0U,
+   0x50a050a0U, 0x69286928U, 0x44aaaa44U, 0x66666600U,
+   0xaa444444U, 0x54a854a8U, 0x95809580U, 0x96969600U,
+   0xa85454a8U, 0x80959580U, 0xaa141414U, 0x96960000U,
+   0xaaaa1414U, 0xa05050a0U, 0xa0a5a5a0U, 0x96000000U,
+   0x40804080U, 0xa9a8a9a8U, 0xaaaaaa44U, 0x2a4a5254U
+};
+
+static const uint8_t
+anchor_indices[][N_PARTITIONS] = {
+   /* Anchor index values for the second subset of two-subset partitioning */
+   {
+      0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,
+      0xf,0x2,0x8,0x2,0x2,0x8,0x8,0xf,0x2,0x8,0x2,0x2,0x8,0x8,0x2,0x2,
+      0xf,0xf,0x6,0x8,0x2,0x8,0xf,0xf,0x2,0x8,0x2,0x2,0x2,0xf,0xf,0x6,
+      0x6,0x2,0x6,0x8,0xf,0xf,0x2,0x2,0xf,0xf,0xf,0xf,0xf,0x2,0x2,0xf
+   },
+
+   /* Anchor index values for the second subset of three-subset partitioning */
+   {
+      0x3,0x3,0xf,0xf,0x8,0x3,0xf,0xf,0x8,0x8,0x6,0x6,0x6,0x5,0x3,0x3,
+      0x3,0x3,0x8,0xf,0x3,0x3,0x6,0xa,0x5,0x8,0x8,0x6,0x8,0x5,0xf,0xf,
+      0x8,0xf,0x3,0x5,0x6,0xa,0x8,0xf,0xf,0x3,0xf,0x5,0xf,0xf,0xf,0xf,
+      0x3,0xf,0x5,0x5,0x5,0x8,0x5,0xa,0x5,0xa,0x8,0xd,0xf,0xc,0x3,0x3
+   },
+
+   /* Anchor index values for the third subset of three-subset
+    * partitioning
+    */
+   {
+      0xf,0x8,0x8,0x3,0xf,0xf,0x3,0x8,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x8,
+      0xf,0x8,0xf,0x3,0xf,0x8,0xf,0x8,0x3,0xf,0x6,0xa,0xf,0xf,0xa,0x8,
+      0xf,0x3,0xf,0xa,0xa,0x8,0x9,0xa,0x6,0xf,0x8,0xf,0x3,0x6,0x6,0x8,
+      0xf,0x3,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x3,0xf,0xf,0x8
+   }
+};
+
+static int
+extract_bits(const uint8_t *block,
+             int offset,
+             int n_bits)
+{
+   int byte_index = offset / 8;
+   int bit_index = offset % 8;
+   int n_bits_in_byte = MIN2(n_bits, 8 - bit_index); /* bits taken this step */
+   int result = 0;
+   int bit = 0; /* position in result where the next chunk is placed */
+   /* Gather n_bits starting at bit `offset` of block, lowest bit first. */
+   while (true) {
+      result |= ((block[byte_index] >> bit_index) &
+                 ((1 << n_bits_in_byte) - 1)) << bit;
+
+      n_bits -= n_bits_in_byte;
+
+      if (n_bits <= 0)
+         return result;
+
+      bit += n_bits_in_byte;
+      byte_index++;
+      bit_index = 0; /* every byte after the first is read from its bit 0 */
+      n_bits_in_byte = MIN2(n_bits, 8);
+   }
+}
+
+static uint8_t
+expand_component(uint8_t byte,
+                 int n_bits)
+{
+   /* Widens an n_bits-wide value to a byte by copying its most-significant
+    * bits into the unused low bits; both shifts need 4 <= n_bits <= 8.
+    */
+   return byte << (8 - n_bits) | (byte >> (2 * n_bits - 8));
+}
+
+static int
+extract_unorm_endpoints(const struct bptc_unorm_mode *mode,
+                        const uint8_t *block,
+                        int bit_offset,
+                        uint8_t endpoints[][4])
+{
+   int component;
+   int subset;
+   int endpoint;
+   int pbit;
+   int n_components;
+
+   /* Extract each color component */
+   for (component = 0; component < 3; component++) {
+      for (subset = 0; subset < mode->n_subsets; subset++) {
+         for (endpoint = 0; endpoint < 2; endpoint++) {
+            endpoints[subset * 2 + endpoint][component] =
+               extract_bits(block, bit_offset, mode->n_color_bits);
+            bit_offset += mode->n_color_bits;
+         }
+      }
+   }
+
+   /* Extract the alpha values */
+   if (mode->n_alpha_bits > 0) {
+      for (subset = 0; subset < mode->n_subsets; subset++) {
+         for (endpoint = 0; endpoint < 2; endpoint++) {
+            endpoints[subset * 2 + endpoint][3] =
+               extract_bits(block, bit_offset, mode->n_alpha_bits);
+            bit_offset += mode->n_alpha_bits;
+         }
+      }
+
+      n_components = 4;
+   } else {
+      for (subset = 0; subset < mode->n_subsets; subset++)
+         for (endpoint = 0; endpoint < 2; endpoint++)
+            endpoints[subset * 2 + endpoint][3] = 255;
+
+      n_components = 3;
+   }
+
+   /* Add in the p-bits */
+   if (mode->has_endpoint_pbits) {
+      for (subset = 0; subset < mode->n_subsets; subset++) {
+         for (endpoint = 0; endpoint < 2; endpoint++) {
+            pbit = extract_bits(block, bit_offset, 1);
+            bit_offset += 1;
+
+            for (component = 0; component < n_components; component++) {
+               endpoints[subset * 2 + endpoint][component] <<= 1;
+               endpoints[subset * 2 + endpoint][component] |= pbit;
+            }
+         }
+      }
+   } else if (mode->has_shared_pbits) {
+      for (subset = 0; subset < mode->n_subsets; subset++) {
+         pbit = extract_bits(block, bit_offset, 1);
+         bit_offset += 1;
+
+         for (endpoint = 0; endpoint < 2; endpoint++) {
+            for (component = 0; component < n_components; component++) {
+               endpoints[subset * 2 + endpoint][component] <<= 1;
+               endpoints[subset * 2 + endpoint][component] |= pbit;
+            }
+         }
+      }
+   }
+
+   /* Expand the n-bit values to a byte */
+   for (subset = 0; subset < mode->n_subsets; subset++) {
+      for (endpoint = 0; endpoint < 2; endpoint++) {
+         for (component = 0; component < 3; component++) {
+            endpoints[subset * 2 + endpoint][component] =
+               expand_component(endpoints[subset * 2 + endpoint][component],
+                                mode->n_color_bits +
+                                mode->has_endpoint_pbits +
+                                mode->has_shared_pbits);
+         }
+
+         if (mode->n_alpha_bits > 0) {
+            endpoints[subset * 2 + endpoint][3] =
+               expand_component(endpoints[subset * 2 + endpoint][3],
+                                mode->n_alpha_bits +
+                                mode->has_endpoint_pbits +
+                                mode->has_shared_pbits);
+         }
+      }
+   }
+
+   return bit_offset;
+}
+
+static bool
+is_anchor(int n_subsets,
+          int partition_num,
+          int texel)
+{
+   if (texel == 0)
+      return true;
+
+   switch (n_subsets) {
+   case 1:
+      return false;
+   case 2:
+      return anchor_indices[0][partition_num] == texel;
+   case 3:
+      return (anchor_indices[1][partition_num] == texel ||
+              anchor_indices[2][partition_num] == texel);
+   default:
+      assert(false);
+      return false;
+   }
+}
+
+static int
+count_anchors_before_texel(int n_subsets,
+                           int partition_num,
+                           int texel)
+{
+   int count = 1;
+
+   if (texel == 0)
+      return 0;
+
+   switch (n_subsets) {
+   case 1:
+      break;
+   case 2:
+      if (texel > anchor_indices[0][partition_num])
+         count++;
+      break;
+   case 3:
+      if (texel > anchor_indices[1][partition_num])
+         count++;
+      if (texel > anchor_indices[2][partition_num])
+         count++;
+      break;
+   default:
+      assert(false);
+      return 0;
+   }
+
+   return count;
+}
+
+static int32_t
+interpolate(int32_t a, int32_t b,
+            int index,
+            int index_bits)
+{
+   static const uint8_t weights2[] = { 0, 21, 43, 64 };
+   static const uint8_t weights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
+   static const uint8_t weights4[] =
+      { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
+   static const uint8_t *weights[] = { /* indexed by index_bits, 2..4 only */
+      NULL, NULL, weights2, weights3, weights4
+   };
+   int weight;
+   /* Returns the mix of a and b selected by index: weight 0 is a, 64 is b. */
+   weight = weights[index_bits][index];
+   /* Weights are out of 64; the + 32 rounds before the >> 6. */
+   return ((64 - weight) * a + weight * b + 32) >> 6;
+}
+
+static void
+apply_rotation(int rotation,
+               uint8_t *result)
+{
+   uint8_t t;
+
+   if (rotation == 0)
+      return;
+
+   rotation--;
+
+   t = result[rotation];
+   result[rotation] = result[3];
+   result[3] = t;
+}
+
+static void
+fetch_rgba_unorm_from_block(const uint8_t *block,
+                            uint8_t *result,
+                            int texel)
+{
+   int mode_num = ffs(block[0]);
+   const struct bptc_unorm_mode *mode;
+   int bit_offset, secondary_bit_offset;
+   int partition_num;
+   int subset_num;
+   int rotation;
+   int index_selection;
+   int index_bits;
+   int indices[2];
+   int index;
+   int anchors_before_texel;
+   bool anchor;
+   uint8_t endpoints[3 * 2][4];
+   uint32_t subsets;
+   int component;
+
+   if (mode_num == 0) {
+      /* According to the spec this mode is reserved and shouldn't be used. */
+      memset(result, 0, 3);
+      result[3] = 0xff;
+      return;
+   }
+
+   mode = bptc_unorm_modes + mode_num - 1;
+   bit_offset = mode_num;
+
+   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
+   bit_offset += mode->n_partition_bits;
+
+   switch (mode->n_subsets) {
+   case 1:
+      subsets = 0;
+      break;
+   case 2:
+      subsets = partition_table1[partition_num];
+      break;
+   case 3:
+      subsets = partition_table2[partition_num];
+      break;
+   default:
+      assert(false);
+      return;
+   }
+
+   if (mode->has_rotation_bits) {
+      rotation = extract_bits(block, bit_offset, 2);
+      bit_offset += 2;
+   } else {
+      rotation = 0;
+   }
+
+   if (mode->has_index_selection_bit) {
+      index_selection = extract_bits(block, bit_offset, 1);
+      bit_offset++;
+   } else {
+      index_selection = 0;
+   }
+
+   bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
+
+   anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
+                                                     partition_num, texel);
+
+   /* Calculate the offset to the secondary index */
+   secondary_bit_offset = (bit_offset +
+                           BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
+                           mode->n_subsets +
+                           mode->n_secondary_index_bits * texel -
+                           anchors_before_texel);
+
+   /* Calculate the offset to the primary index for this texel */
+   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
+
+   subset_num = (subsets >> (texel * 2)) & 3;
+
+   anchor = is_anchor(mode->n_subsets, partition_num, texel);
+
+   index_bits = mode->n_index_bits;
+   if (anchor)
+      index_bits--;
+   indices[0] = extract_bits(block, bit_offset, index_bits);
+
+   if (mode->n_secondary_index_bits) {
+      index_bits = mode->n_secondary_index_bits;
+      if (anchor)
+         index_bits--;
+      indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
+   }
+
+   index = indices[index_selection];
+   index_bits = (index_selection ?
+                 mode->n_secondary_index_bits :
+                 mode->n_index_bits);
+
+   for (component = 0; component < 3; component++)
+      result[component] = interpolate(endpoints[subset_num * 2][component],
+                                      endpoints[subset_num * 2 + 1][component],
+                                      index,
+                                      index_bits);
+
+   /* Alpha uses the opposite index from the color components */
+   if (mode->n_secondary_index_bits && !index_selection) {
+      index = indices[1];
+      index_bits = mode->n_secondary_index_bits;
+   } else {
+      index = indices[0];
+      index_bits = mode->n_index_bits;
+   }
+
+   result[3] = interpolate(endpoints[subset_num * 2][3],
+                           endpoints[subset_num * 2 + 1][3],
+                           index,
+                           index_bits);
+
+   apply_rotation(rotation, result);
+}
+
+#ifdef BPTC_BLOCK_DECODE
+/* Decode the top-left src_width x src_height texels of one BPTC UNORM block
+ * into 4-byte RGBA texels at dst_row, with rows dst_rowstride bytes apart.
+ * index_start (bit offset of the first index) must stay fixed: each texel's
+ * index offsets are derived from it afresh rather than accumulated.
+ */
+static void
+decompress_rgba_unorm_block(int src_width, int src_height,
+                            const uint8_t *block,
+                            uint8_t *dst_row, int dst_rowstride)
+{
+   int mode_num = ffs(block[0]);
+   const struct bptc_unorm_mode *mode;
+   int bit_offset, secondary_bit_offset, index_start;
+   int partition_num, subset_num;
+   int rotation, index_selection;
+   int index, index_bits, indices[2];
+   int anchors_before_texel, component;
+   bool anchor;
+   uint8_t endpoints[3 * 2][4];
+   uint32_t subsets;
+   unsigned x, y;
+
+   if (mode_num == 0) {
+      /* According to the spec this mode is reserved and shouldn't be used. */
+      for(y = 0; y < src_height; y += 1) {
+         uint8_t *result = dst_row;
+         memset(result, 0, 4 * src_width);
+         for(x = 0; x < src_width; x += 1) {
+            result[3] = 0xff;
+            result += 4;
+         }
+         dst_row += dst_rowstride;
+      }
+      return;
+   }
+
+   mode = bptc_unorm_modes + mode_num - 1;
+   bit_offset = mode_num;
+
+   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
+   bit_offset += mode->n_partition_bits;
+
+   switch (mode->n_subsets) {
+   case 1:
+      subsets = 0;
+      break;
+   case 2:
+      subsets = partition_table1[partition_num];
+      break;
+   case 3:
+      subsets = partition_table2[partition_num];
+      break;
+   default:
+      assert(false);
+      return;
+   }
+
+   if (mode->has_rotation_bits) {
+      rotation = extract_bits(block, bit_offset, 2);
+      bit_offset += 2;
+   } else {
+      rotation = 0;
+   }
+
+   if (mode->has_index_selection_bit) {
+      index_selection = extract_bits(block, bit_offset, 1);
+      bit_offset++;
+   } else {
+      index_selection = 0;
+   }
+
+   index_start = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
+
+   for(y = 0; y < src_height; y += 1) {
+      uint8_t *result = dst_row;
+      for(x = 0; x < src_width; x += 1) {
+         int texel = x + y * 4;
+
+         anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
+                                                           partition_num,
+                                                           texel);
+
+         /* Calculate the offset to the secondary index */
+         secondary_bit_offset = (index_start +
+                                 BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
+                                 mode->n_subsets +
+                                 mode->n_secondary_index_bits * texel -
+                                 anchors_before_texel);
+
+         /* Calculate the offset to the primary index for this texel */
+         bit_offset = (index_start + mode->n_index_bits * texel -
+                       anchors_before_texel);
+
+         subset_num = (subsets >> (texel * 2)) & 3;
+
+         anchor = is_anchor(mode->n_subsets, partition_num, texel);
+
+         index_bits = mode->n_index_bits;
+         if (anchor)
+            index_bits--;
+         indices[0] = extract_bits(block, bit_offset, index_bits);
+
+         if (mode->n_secondary_index_bits) {
+            index_bits = mode->n_secondary_index_bits;
+            if (anchor)
+               index_bits--;
+            indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
+         }
+
+         index = indices[index_selection];
+         index_bits = (index_selection ?
+                       mode->n_secondary_index_bits :
+                       mode->n_index_bits);
+
+         for (component = 0; component < 3; component++)
+            result[component] = interpolate(endpoints[subset_num * 2][component],
+                                            endpoints[subset_num * 2 + 1][component],
+                                            index,
+                                            index_bits);
+
+         /* Alpha uses the opposite index from the color components */
+         if (mode->n_secondary_index_bits && !index_selection) {
+            index = indices[1];
+            index_bits = mode->n_secondary_index_bits;
+         } else {
+            index = indices[0];
+            index_bits = mode->n_index_bits;
+         }
+
+         result[3] = interpolate(endpoints[subset_num * 2][3],
+                                 endpoints[subset_num * 2 + 1][3],
+                                 index,
+                                 index_bits);
+
+         apply_rotation(rotation, result);
+         result += 4;
+      }
+      dst_row += dst_rowstride;
+   }
+}
+
+static void
+decompress_rgba_unorm(int width, int height,
+                      const uint8_t *src, int src_rowstride,
+                      uint8_t *dst, int dst_rowstride)
+{
+   int src_row_diff;
+   int y, x;
+
+   if (src_rowstride >= width * 4)
+      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
+   else
+      src_row_diff = 0;
+
+   for (y = 0; y < height; y += BLOCK_SIZE) {
+      for (x = 0; x < width; x += BLOCK_SIZE) {
+         decompress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
+                                     MIN2(height - y, BLOCK_SIZE),
+                                     src,
+                                     dst + x * 4 + y * dst_rowstride,
+                                     dst_rowstride);
+         src += BLOCK_BYTES;
+      }
+      src += src_row_diff;
+   }
+}
+#endif // BPTC_BLOCK_DECODE
+
+static int32_t
+sign_extend(int32_t value,
+            int n_bits)
+{
+   /* Sign-extend the low n_bits of value (0 < n_bits < 32).  The fill mask is
+    * shifted as unsigned: left-shifting the negative ~0 is undefined in C. */
+   if (value & (1 << (n_bits - 1)))
+      value |= (int32_t) (~(uint32_t) 0 << n_bits);
+   return value;
+}
+
+static int
+signed_unquantize(int value, int n_endpoint_bits)
+{
+   bool sign;
+
+   if (n_endpoint_bits >= 16)
+      return value;
+
+   if (value == 0)
+      return 0;
+
+   sign = false;
+
+   if (value < 0) {
+      sign = true;
+      value = -value;
+   }
+
+   if (value >= (1 << (n_endpoint_bits - 1)) - 1)
+      value = 0x7fff;
+   else
+      value = ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
+
+   if (sign)
+      value = -value;
+
+   return value;
+}
+
+static int
+unsigned_unquantize(int value, int n_endpoint_bits)
+{
+   if (n_endpoint_bits >= 15)
+      return value;
+
+   if (value == 0)
+      return 0;
+
+   if (value == (1 << n_endpoint_bits) - 1)
+      return 0xffff;
+
+   return ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
+}
+
+static int
+extract_float_endpoints(const struct bptc_float_mode *mode,
+                        const uint8_t *block,
+                        int bit_offset,
+                        int32_t endpoints[][3],
+                        bool is_signed)
+{
+   const struct bptc_float_bitfield *bitfield;
+   int endpoint, component;
+   int n_endpoints;
+   int value;
+   int i;
+
+   if (mode->n_partition_bits)
+      n_endpoints = 4;
+   else
+      n_endpoints = 2;
+
+   memset(endpoints, 0, sizeof endpoints[0][0] * n_endpoints * 3);
+
+   for (bitfield = mode->bitfields; bitfield->endpoint != -1; bitfield++) {
+      value = extract_bits(block, bit_offset, bitfield->n_bits);
+      bit_offset += bitfield->n_bits;
+
+      if (bitfield->reverse) {
+         for (i = 0; i < bitfield->n_bits; i++) {
+            if (value & (1 << i))
+               endpoints[bitfield->endpoint][bitfield->component] |=
+                  1 << ((bitfield->n_bits - 1 - i) + bitfield->offset);
+         }
+      } else {
+         endpoints[bitfield->endpoint][bitfield->component] |=
+            value << bitfield->offset;
+      }
+   }
+
+   if (mode->transformed_endpoints) {
+      /* The endpoints are specified as signed offsets from e0 */
+      for (endpoint = 1; endpoint < n_endpoints; endpoint++) {
+         for (component = 0; component < 3; component++) {
+            value = sign_extend(endpoints[endpoint][component],
+                                mode->n_delta_bits[component]);
+            endpoints[endpoint][component] =
+               ((endpoints[0][component] + value) &
+                ((1 << mode->n_endpoint_bits) - 1));
+         }
+      }
+   }
+
+   if (is_signed) {
+      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
+         for (component = 0; component < 3; component++) {
+            value = sign_extend(endpoints[endpoint][component],
+                                mode->n_endpoint_bits);
+            endpoints[endpoint][component] =
+               signed_unquantize(value, mode->n_endpoint_bits);
+         }
+      }
+   } else {
+      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
+         for (component = 0; component < 3; component++) {
+            endpoints[endpoint][component] =
+               unsigned_unquantize(endpoints[endpoint][component],
+                                   mode->n_endpoint_bits);
+         }
+      }
+   }
+
+   return bit_offset;
+}
+
+static int32_t
+finish_unsigned_unquantize(int32_t value)
+{
+   return value * 31 / 64;
+}
+
+static int32_t
+finish_signed_unquantize(int32_t value)
+{
+   if (value < 0)
+      return (-value * 31 / 32) | 0x8000;
+   else
+      return value * 31 / 32;
+}
+
+static void
+fetch_rgb_float_from_block(const uint8_t *block,
+                           float *result,
+                           int texel,
+                           bool is_signed)
+{
+   int mode_num;
+   const struct bptc_float_mode *mode;
+   int bit_offset;
+   int partition_num;
+   int subset_num;
+   int index_bits;
+   int index;
+   int anchors_before_texel;
+   int32_t endpoints[2 * 2][3];
+   uint32_t subsets;
+   int n_subsets;
+   int component;
+   int32_t value;
+
+   if (block[0] & 0x2) {
+      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
+      bit_offset = 5;
+   } else {
+      mode_num = block[0] & 3;
+      bit_offset = 2;
+   }
+
+   mode = bptc_float_modes + mode_num;
+
+   if (mode->reserved) {
+      memset(result, 0, sizeof result[0] * 3);
+      result[3] = 1.0f;
+      return;
+   }
+
+   bit_offset = extract_float_endpoints(mode, block, bit_offset,
+                                        endpoints, is_signed);
+
+   if (mode->n_partition_bits) {
+      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
+      bit_offset += mode->n_partition_bits;
+
+      subsets = partition_table1[partition_num];
+      n_subsets = 2;
+   } else {
+      partition_num = 0;
+      subsets = 0;
+      n_subsets = 1;
+   }
+
+   anchors_before_texel =
+      count_anchors_before_texel(n_subsets, partition_num, texel);
+
+   /* Calculate the offset to the primary index for this texel */
+   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
+
+   subset_num = (subsets >> (texel * 2)) & 3;
+
+   index_bits = mode->n_index_bits;
+   if (is_anchor(n_subsets, partition_num, texel))
+      index_bits--;
+   index = extract_bits(block, bit_offset, index_bits);
+
+   for (component = 0; component < 3; component++) {
+      value = interpolate(endpoints[subset_num * 2][component],
+                          endpoints[subset_num * 2 + 1][component],
+                          index,
+                          mode->n_index_bits);
+
+      if (is_signed)
+         value = finish_signed_unquantize(value);
+      else
+         value = finish_unsigned_unquantize(value);
+
+      result[component] = _mesa_half_to_float(value);
+   }
+
+   result[3] = 1.0f;
+}
+
+#ifdef BPTC_BLOCK_DECODE
+static void
+decompress_rgb_float_block(unsigned src_width, unsigned src_height,
+                           const uint8_t *block,
+                           float *dst_row, unsigned dst_rowstride,
+                           bool is_signed)
+{
+   int mode_num;
+   const struct bptc_float_mode *mode;
+   int bit_offset;
+   int partition_num;
+   int subset_num;
+   int index_bits;
+   int index;
+   int anchors_before_texel;
+   int32_t endpoints[2 * 2][3];
+   uint32_t subsets;
+   int n_subsets;
+   int component;
+   int32_t value;
+   unsigned x, y;
+   /* The mode code occupies either 5 or 2 bits, selected by bit 1 */
+   if (block[0] & 0x2) {
+      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
+      bit_offset = 5;
+   } else {
+      mode_num = block[0] & 3;
+      bit_offset = 2;
+   }
+
+   mode = bptc_float_modes + mode_num;
+
+   if (mode->reserved) {
+      for(y = 0; y < src_height; y += 1) {
+         float *result = dst_row;
+         memset(result, 0, sizeof result[0] * 4 * src_width);
+         for(x = 0; x < src_width; x += 1) {
+            result[3] = 1.0f;
+            result += 4;
+         }
+         dst_row += dst_rowstride / sizeof dst_row[0];
+      }
+      return;
+   }
+
+   bit_offset = extract_float_endpoints(mode, block, bit_offset,
+                                        endpoints, is_signed);
+
+   if (mode->n_partition_bits) {
+      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
+      bit_offset += mode->n_partition_bits;
+
+      subsets = partition_table1[partition_num];
+      n_subsets = 2;
+   } else {
+      partition_num = 0;
+      subsets = 0;
+      n_subsets = 1;
+   }
+   /* bit_offset now marks the first index and must stay fixed per texel */
+   for(y = 0; y < src_height; y += 1) {
+      float *result = dst_row;
+      for(x = 0; x < src_width; x += 1) {
+         int texel = x + y * 4;
+         int index_offset;
+
+         anchors_before_texel =
+            count_anchors_before_texel(n_subsets, partition_num, texel);
+
+         /* Calculate the offset to the primary index for this texel */
+         index_offset = (bit_offset + mode->n_index_bits * texel -
+                         anchors_before_texel);
+
+         subset_num = (subsets >> (texel * 2)) & 3;
+
+         index_bits = mode->n_index_bits;
+         if (is_anchor(n_subsets, partition_num, texel))
+            index_bits--;
+         index = extract_bits(block, index_offset, index_bits);
+
+         for (component = 0; component < 3; component++) {
+            value = interpolate(endpoints[subset_num * 2][component],
+                                endpoints[subset_num * 2 + 1][component],
+                                index,
+                                mode->n_index_bits);
+
+            if (is_signed)
+               value = finish_signed_unquantize(value);
+            else
+               value = finish_unsigned_unquantize(value);
+
+            result[component] = _mesa_half_to_float(value);
+         }
+
+         result[3] = 1.0f;
+         result += 4;
+      }
+      dst_row += dst_rowstride / sizeof dst_row[0];
+   }
+}
+
+static void
+decompress_rgb_float(int width, int height,
+                      const uint8_t *src, int src_rowstride,
+                      float *dst, int dst_rowstride, bool is_signed)
+{
+   int src_row_diff;
+   int y, x;
+
+   if (src_rowstride >= width * 4)
+      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
+   else
+      src_row_diff = 0;
+
+   for (y = 0; y < height; y += BLOCK_SIZE) {
+      for (x = 0; x < width; x += BLOCK_SIZE) {
+         decompress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
+                                    MIN2(height - y, BLOCK_SIZE),
+                                    src,
+                                    (dst + x * 4 +
+                                     (y * dst_rowstride / sizeof dst[0])),
+                                    dst_rowstride, is_signed);
+         src += BLOCK_BYTES;
+      }
+      src += src_row_diff;
+   }
+}
+#endif // BPTC_BLOCK_DECODE
+
+static void
+write_bits(struct bit_writer *writer, int n_bits, int value)
+{
+   do {
+      if (n_bits + writer->pos >= 8) {
+         *(writer->dst++) = writer->buf | (value << writer->pos);
+         writer->buf = 0;
+         value >>= (8 - writer->pos);
+         n_bits -= (8 - writer->pos);
+         writer->pos = 0;
+      } else {
+         writer->buf |= value << writer->pos;
+         writer->pos += n_bits;
+         break;
+      }
+   } while (n_bits > 0);
+}
+
+static void
+get_average_luminance_alpha_unorm(int width, int height,
+                                  const uint8_t *src, int src_rowstride,
+                                  int *average_luminance, int *average_alpha)
+{
+   int luminance_sum = 0, alpha_sum = 0;
+   int y, x;
+
+   for (y = 0; y < height; y++) {
+      for (x = 0; x < width; x++) {
+         luminance_sum += src[0] + src[1] + src[2];
+         alpha_sum += src[3];
+         src += 4;
+      }
+      src += src_rowstride - width * 4;
+   }
+
+   *average_luminance = luminance_sum / (width * height);
+   *average_alpha = alpha_sum / (width * height);
+}
+
+static void
+get_rgba_endpoints_unorm(int width, int height,
+                         const uint8_t *src, int src_rowstride,
+                         int average_luminance, int average_alpha,
+                         uint8_t endpoints[][4])
+{
+   int endpoint_luminances[2];
+   int midpoint;
+   int sums[2][4];
+   int endpoint;
+   int luminance;
+   uint8_t temp[3];
+   const uint8_t *p = src;
+   int rgb_left_endpoint_count = 0;
+   int alpha_left_endpoint_count = 0;
+   int y, x, i;
+
+   memset(sums, 0, sizeof sums);
+
+   for (y = 0; y < height; y++) {
+      for (x = 0; x < width; x++) {
+         luminance = p[0] + p[1] + p[2];
+         if (luminance < average_luminance) {
+            endpoint = 0;
+            rgb_left_endpoint_count++;
+         } else {
+            endpoint = 1;
+         }
+         for (i = 0; i < 3; i++)
+            sums[endpoint][i] += p[i];
+         /* Alpha is bucketed on its own channel, independently of color */
+         if (p[3] < average_alpha) {
+            endpoint = 0;
+            alpha_left_endpoint_count++;
+         } else {
+            endpoint = 1;
+         }
+         sums[endpoint][3] += p[3];
+
+         p += 4;
+      }
+
+      p += src_rowstride - width * 4;
+   }
+
+   if (rgb_left_endpoint_count == 0 ||
+       rgb_left_endpoint_count == width * height) {
+      for (i = 0; i < 3; i++)
+         endpoints[0][i] = endpoints[1][i] =
+            (sums[0][i] + sums[1][i]) / (width * height);
+   } else {
+      for (i = 0; i < 3; i++) {
+         endpoints[0][i] = sums[0][i] / rgb_left_endpoint_count;
+         endpoints[1][i] = (sums[1][i] /
+                            (width * height - rgb_left_endpoint_count));
+      }
+   }
+
+   if (alpha_left_endpoint_count == 0 ||
+       alpha_left_endpoint_count == width * height) {
+      endpoints[0][3] = endpoints[1][3] =
+         (sums[0][3] + sums[1][3]) / (width * height);
+   } else {
+      endpoints[0][3] = sums[0][3] / alpha_left_endpoint_count;
+      endpoints[1][3] = (sums[1][3] /
+                         (width * height - alpha_left_endpoint_count));
+   }
+
+   /* We may need to swap the endpoints to ensure the most-significant bit of
+    * the first index is zero */
+
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      endpoint_luminances[endpoint] =
+         endpoints[endpoint][0] +
+         endpoints[endpoint][1] +
+         endpoints[endpoint][2];
+   }
+   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2;
+
+   if ((src[0] + src[1] + src[2] <= midpoint) !=
+       (endpoint_luminances[0] <= midpoint)) {
+      memcpy(temp, endpoints[0], 3);
+      memcpy(endpoints[0], endpoints[1], 3);
+      memcpy(endpoints[1], temp, 3);
+   }
+
+   /* Same for the alpha endpoints */
+
+   midpoint = (endpoints[0][3] + endpoints[1][3]) / 2;
+
+   if ((src[3] <= midpoint) != (endpoints[0][3] <= midpoint)) {
+      temp[0] = endpoints[0][3];
+      endpoints[0][3] = endpoints[1][3];
+      endpoints[1][3] = temp[0];
+   }
+}
+
+static void
+write_rgb_indices_unorm(struct bit_writer *writer,
+                        int src_width, int src_height,
+                        const uint8_t *src, int src_rowstride,
+                        uint8_t endpoints[][4])
+{
+   int luminance;
+   int endpoint_luminances[2];
+   int endpoint;
+   int index;
+   int y, x;
+
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      endpoint_luminances[endpoint] =
+         endpoints[endpoint][0] +
+         endpoints[endpoint][1] +
+         endpoints[endpoint][2];
+   }
+
+   /* If the endpoints have the same luminance then we'll just use index 0 for
+    * all of the texels */
+   if (endpoint_luminances[0] == endpoint_luminances[1]) {
+      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 2 - 1, 0);
+      return;
+   }
+
+   for (y = 0; y < src_height; y++) {
+      for (x = 0; x < src_width; x++) {
+         luminance = src[0] + src[1] + src[2];
+
+         index = ((luminance - endpoint_luminances[0]) * 3 /
+                  (endpoint_luminances[1] - endpoint_luminances[0]));
+         if (index < 0)
+            index = 0;
+         else if (index > 3)
+            index = 3;
+
+         assert(x != 0 || y != 0 || index < 2);
+
+         write_bits(writer, (x == 0 && y == 0) ? 1 : 2, index);
+
+         src += 4;
+      }
+
+      /* Pad the indices out to the block size */
+      if (src_width < BLOCK_SIZE)
+         write_bits(writer, 2 * (BLOCK_SIZE - src_width), 0);
+
+      src += src_rowstride - src_width * 4;
+   }
+
+   /* Pad the indices out to the block size */
+   if (src_height < BLOCK_SIZE)
+      write_bits(writer, 2 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
+}
+
+static void
+write_alpha_indices_unorm(struct bit_writer *writer,
+                          int src_width, int src_height,
+                          const uint8_t *src, int src_rowstride,
+                          uint8_t endpoints[][4])
+{
+   int index;
+   int y, x;
+
+   /* If the endpoints have the same alpha then we'll just use index 0 for
+    * all of the texels */
+   if (endpoints[0][3] == endpoints[1][3]) {
+      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 3 - 1, 0);
+      return;
+   }
+
+   for (y = 0; y < src_height; y++) {
+      for (x = 0; x < src_width; x++) {
+         index = (((int) src[3] - (int) endpoints[0][3]) * 7 /
+                  ((int) endpoints[1][3] - endpoints[0][3]));
+         if (index < 0)
+            index = 0;
+         else if (index > 7)
+            index = 7;
+
+         assert(x != 0 || y != 0 || index < 4);
+
+         /* The first index has one less bit */
+         write_bits(writer, (x == 0 && y == 0) ? 2 : 3, index);
+
+         src += 4;
+      }
+
+      /* Pad the indices out to the block size */
+      if (src_width < BLOCK_SIZE)
+         write_bits(writer, 3 * (BLOCK_SIZE - src_width), 0);
+
+      src += src_rowstride - src_width * 4;
+   }
+
+   /* Pad the indices out to the block size */
+   if (src_height < BLOCK_SIZE)
+      write_bits(writer, 3 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
+}
+
+static void
+compress_rgba_unorm_block(int src_width, int src_height,
+                          const uint8_t *src, int src_rowstride,
+                          uint8_t *dst)
+{
+   int average_luminance, average_alpha;
+   uint8_t endpoints[2][4];
+   struct bit_writer writer;
+   int component, endpoint;
+
+   get_average_luminance_alpha_unorm(src_width, src_height, src, src_rowstride,
+                                     &average_luminance, &average_alpha);
+   get_rgba_endpoints_unorm(src_width, src_height, src, src_rowstride,
+                            average_luminance, average_alpha,
+                            endpoints);
+
+   writer.dst = dst;
+   writer.pos = 0;
+   writer.buf = 0;
+
+   write_bits(&writer, 5, 0x10); /* mode 4 */
+   write_bits(&writer, 2, 0); /* rotation 0 */
+   write_bits(&writer, 1, 0); /* index selection bit */
+
+   /* Write the color endpoints */
+   for (component = 0; component < 3; component++)
+      for (endpoint = 0; endpoint < 2; endpoint++)
+         write_bits(&writer, 5, endpoints[endpoint][component] >> 3);
+
+   /* Write the alpha endpoints */
+   for (endpoint = 0; endpoint < 2; endpoint++)
+      write_bits(&writer, 6, endpoints[endpoint][3] >> 2);
+
+   write_rgb_indices_unorm(&writer,
+                           src_width, src_height,
+                           src, src_rowstride,
+                           endpoints);
+   write_alpha_indices_unorm(&writer,
+                             src_width, src_height,
+                             src, src_rowstride,
+                             endpoints);
+}
+
+static void
+compress_rgba_unorm(int width, int height,
+                    const uint8_t *src, int src_rowstride,
+                    uint8_t *dst, int dst_rowstride)
+{
+   int dst_row_diff;
+   int y, x;
+
+   if (dst_rowstride >= width * 4)
+      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
+   else
+      dst_row_diff = 0;
+
+   for (y = 0; y < height; y += BLOCK_SIZE) {
+      for (x = 0; x < width; x += BLOCK_SIZE) {
+         compress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
+                                   MIN2(height - y, BLOCK_SIZE),
+                                   src + x * 4 + y * src_rowstride,
+                                   src_rowstride,
+                                   dst);
+         dst += BLOCK_BYTES;
+      }
+      dst += dst_row_diff;
+   }
+}
+
+static float
+get_average_luminance_float(int width, int height,
+                            const float *src, int src_rowstride)
+{
+   float luminance_sum = 0;
+   int y, x;
+
+   for (y = 0; y < height; y++) {
+      for (x = 0; x < width; x++) {
+         luminance_sum += src[0] + src[1] + src[2];
+         src += 3;
+      }
+      src += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
+   }
+
+   return luminance_sum / (width * height);
+}
+
+static float
+clamp_value(float value, bool is_signed)
+{
+   if (value > 65504.0f)
+      return 65504.0f;
+
+   if (is_signed) {
+      if (value < -65504.0f)
+         return -65504.0f;
+      else
+         return value;
+   }
+
+   if (value < 0.0f)
+      return 0.0f;
+
+   return value;
+}
+
+static void
+get_endpoints_float(int width, int height,
+                    const float *src, int src_rowstride,
+                    float average_luminance, float endpoints[][3],
+                    bool is_signed)
+{
+   float endpoint_luminances[2];
+   float midpoint;
+   float sums[2][3];
+   int endpoint, component;
+   float luminance;
+   float temp[3];
+   const float *p = src;
+   int left_endpoint_count = 0;
+   int y, x, i;
+
+   memset(sums, 0, sizeof sums);
+
+   for (y = 0; y < height; y++) {
+      for (x = 0; x < width; x++) {
+         luminance = p[0] + p[1] + p[2];
+         if (luminance < average_luminance) {
+            endpoint = 0;
+            left_endpoint_count++;
+         } else {
+            endpoint = 1;
+         }
+         for (i = 0; i < 3; i++)
+            sums[endpoint][i] += p[i];
+
+         p += 3;
+      }
+
+      p += (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
+   }
+
+   if (left_endpoint_count == 0 ||
+       left_endpoint_count == width * height) {
+      for (i = 0; i < 3; i++)
+         endpoints[0][i] = endpoints[1][i] =
+            (sums[0][i] + sums[1][i]) / (width * height);
+   } else {
+      for (i = 0; i < 3; i++) {
+         endpoints[0][i] = sums[0][i] / left_endpoint_count;
+         endpoints[1][i] = sums[1][i] / (width * height - left_endpoint_count);
+      }
+   }
+
+   /* Clamp the endpoints to the range of a half float and strip out
+    * infinities */
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      for (component = 0; component < 3; component++) {
+         endpoints[endpoint][component] =
+            clamp_value(endpoints[endpoint][component], is_signed);
+      }
+   }
+
+   /* We may need to swap the endpoints to ensure the most-significant bit of
+    * the first index is zero */
+
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      endpoint_luminances[endpoint] =
+         endpoints[endpoint][0] +
+         endpoints[endpoint][1] +
+         endpoints[endpoint][2];
+   }
+   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2.0f;
+
+   if ((src[0] + src[1] + src[2] <= midpoint) !=
+       (endpoint_luminances[0] <= midpoint)) {
+      memcpy(temp, endpoints[0], sizeof temp);
+      memcpy(endpoints[0], endpoints[1], sizeof temp);
+      memcpy(endpoints[1], temp, sizeof temp);
+   }
+}
+
+static void
+write_rgb_indices_float(struct bit_writer *writer,
+                        int src_width, int src_height,
+                        const float *src, int src_rowstride,
+                        float endpoints[][3])
+{
+   float luminance, scaled;
+   float endpoint_luminances[2];
+   int endpoint;
+   int index;
+   int y, x;
+
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      endpoint_luminances[endpoint] =
+         endpoints[endpoint][0] +
+         endpoints[endpoint][1] +
+         endpoints[endpoint][2];
+   }
+
+   /* If the endpoints have the same luminance then we'll just use index 0 for
+    * all of the texels */
+   if (endpoint_luminances[0] == endpoint_luminances[1]) {
+      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 4 - 1, 0);
+      return;
+   }
+
+   for (y = 0; y < src_height; y++) {
+      for (x = 0; x < src_width; x++) {
+         luminance = src[0] + src[1] + src[2];
+
+         /* Clamp while still a float: converting an infinite, NaN or
+          * out-of-range value to int is undefined behaviour */
+         scaled = ((luminance - endpoint_luminances[0]) * 15 /
+                   (endpoint_luminances[1] - endpoint_luminances[0]));
+         index = (scaled >= 15.0f ? 15 :
+                  scaled > 0.0f ? (int) scaled : 0);
+
+         assert(x != 0 || y != 0 || index < 8);
+
+         write_bits(writer, (x == 0 && y == 0) ? 3 : 4, index);
+
+         src += 3;
+      }
+
+      /* Pad the indices out to the block size */
+      if (src_width < BLOCK_SIZE)
+         write_bits(writer, 4 * (BLOCK_SIZE - src_width), 0);
+
+      src += (src_rowstride - src_width * 3 * sizeof (float)) / sizeof (float);
+   }
+
+   /* Pad the indices out to the block size */
+   if (src_height < BLOCK_SIZE)
+      write_bits(writer, 4 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
+}
+
+static int
+get_endpoint_value(float value, bool is_signed)
+{
+   bool sign = false;
+   int half;
+
+   if (is_signed) {
+      half = _mesa_float_to_half(value);
+
+      if (half & 0x8000) {
+         half &= 0x7fff;
+         sign = true;
+      }
+
+      half = (32 * half / 31) >> 6;
+
+      if (sign)
+         half = -half & ((1 << 10) - 1);
+
+      return half;
+   } else {
+      if (value <= 0.0f)
+         return 0;
+
+      half = _mesa_float_to_half(value);
+
+      return (64 * half / 31) >> 6;
+   }
+}
+
+static void
+compress_rgb_float_block(int src_width, int src_height,
+                         const float *src, int src_rowstride,
+                         uint8_t *dst,
+                         bool is_signed)
+{
+   float average_luminance;
+   float endpoints[2][3];
+   struct bit_writer writer;
+   int component, endpoint;
+   int endpoint_value;
+
+   average_luminance =
+      get_average_luminance_float(src_width, src_height, src, src_rowstride);
+   get_endpoints_float(src_width, src_height, src, src_rowstride,
+                       average_luminance, endpoints, is_signed);
+
+   writer.dst = dst;
+   writer.pos = 0;
+   writer.buf = 0;
+
+   write_bits(&writer, 5, 3); /* mode 3 */
+
+   /* Write the endpoints */
+   for (endpoint = 0; endpoint < 2; endpoint++) {
+      for (component = 0; component < 3; component++) {
+         endpoint_value =
+            get_endpoint_value(endpoints[endpoint][component], is_signed);
+         write_bits(&writer, 10, endpoint_value);
+      }
+   }
+
+   write_rgb_indices_float(&writer,
+                           src_width, src_height,
+                           src, src_rowstride,
+                           endpoints);
+}
+
+static void
+compress_rgb_float(int width, int height,
+                   const float *src, int src_rowstride,
+                   uint8_t *dst, int dst_rowstride,
+                   bool is_signed)
+{
+   int dst_row_diff;
+   int y, x;
+
+   if (dst_rowstride >= width * 4)
+      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
+   else
+      dst_row_diff = 0;
+
+   for (y = 0; y < height; y += BLOCK_SIZE) {
+      for (x = 0; x < width; x += BLOCK_SIZE) {
+         compress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
+                                  MIN2(height - y, BLOCK_SIZE),
+                                  src + x * 3 +
+                                  y * src_rowstride / sizeof (float),
+                                  src_rowstride,
+                                  dst,
+                                  is_signed);
+         dst += BLOCK_BYTES;
+      }
+      dst += dst_row_diff;
+   }
+}
diff --git a/src/mesa/main/texcompress_etc.c b/src/mesa/main/texcompress_etc.c
index 099787b..b39ab33 100644
--- a/src/mesa/main/texcompress_etc.c
+++ b/src/mesa/main/texcompress_etc.c
@@ -719,7 +719,8 @@
                   const uint8_t *src_row,
                   unsigned src_stride,
                   unsigned width,
-                  unsigned height)
+		  unsigned height,
+		  bool bgra)
 {
    const unsigned bw = 4, bh = 4, bs = 8, comps = 4;
    struct etc2_block block;
@@ -741,11 +742,14 @@
             for (i = 0; i < w; i++) {
                etc2_rgb8_fetch_texel(&block, i, j, dst,
                                      false /* punchthrough_alpha */);
-               /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
-               tmp = dst[0];
-               dst[0] = dst[2];
-               dst[2] = tmp;
-               dst[3] = 255;
+
+               if (bgra) {
+                  /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
+                  tmp = dst[0];
+                  dst[0] = dst[2];
+                  dst[2] = tmp;
+               }
+               dst[3] = 255; /* alpha is opaque whichever channel order */
 
                dst += comps;
             }
@@ -801,7 +805,8 @@
                          const uint8_t *src_row,
                          unsigned src_stride,
                          unsigned width,
-                         unsigned height)
+			 unsigned height,
+			 bool bgra)
 {
    /* If internalformat is COMPRESSED_SRGB8_ALPHA8_ETC2_EAC, each 4 × 4 block
     * of RGBA8888 information is compressed to 128 bits. To decode a block, the
@@ -825,11 +830,13 @@
             for (i = 0; i < w; i++) {
                etc2_rgba8_fetch_texel(&block, i, j, dst);
 
-               /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
-               tmp = dst[0];
-               dst[0] = dst[2];
-               dst[2] = tmp;
-               dst[3] = dst[3];
+	       if (bgra) {
+		  /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
+		  tmp = dst[0];
+		  dst[0] = dst[2];
+		  dst[2] = tmp;
+		  dst[3] = dst[3];
+	       }
 
                dst += comps;
             }
@@ -1058,7 +1065,8 @@
                                      const uint8_t *src_row,
                                      unsigned src_stride,
                                      unsigned width,
-                                     unsigned height)
+				     unsigned height,
+				     bool bgra)
 {
    const unsigned bw = 4, bh = 4, bs = 8, comps = 4;
    struct etc2_block block;
@@ -1078,11 +1086,14 @@
             for (i = 0; i < w; i++) {
                etc2_rgb8_fetch_texel(&block, i, j, dst,
                                      true /* punchthrough_alpha */);
-               /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
-               tmp = dst[0];
-               dst[0] = dst[2];
-               dst[2] = tmp;
-               dst[3] = dst[3];
+
+	       if (bgra) {
+		  /* Convert to MESA_FORMAT_B8G8R8A8_SRGB */
+		  tmp = dst[0];
+		  dst[0] = dst[2];
+		  dst[2] = tmp;
+		  dst[3] = dst[3];
+	       }
 
                dst += comps;
             }
@@ -1206,7 +1217,8 @@
                          unsigned src_stride,
                          unsigned src_width,
                          unsigned src_height,
-                         mesa_format format)
+			 mesa_format format,
+			 bool bgra)
 {
    if (format == MESA_FORMAT_ETC2_RGB8)
       etc2_unpack_rgb8(dst_row, dst_stride,
@@ -1215,7 +1227,7 @@
    else if (format == MESA_FORMAT_ETC2_SRGB8)
       etc2_unpack_srgb8(dst_row, dst_stride,
                         src_row, src_stride,
-                        src_width, src_height);
+			src_width, src_height, bgra);
    else if (format == MESA_FORMAT_ETC2_RGBA8_EAC)
       etc2_unpack_rgba8(dst_row, dst_stride,
                         src_row, src_stride,
@@ -1223,7 +1235,7 @@
    else if (format == MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC)
       etc2_unpack_srgb8_alpha8(dst_row, dst_stride,
                                src_row, src_stride,
-                               src_width, src_height);
+			       src_width, src_height, bgra);
    else if (format == MESA_FORMAT_ETC2_R11_EAC)
       etc2_unpack_r11(dst_row, dst_stride,
                       src_row, src_stride,
@@ -1247,7 +1259,7 @@
    else if (format == MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1)
       etc2_unpack_srgb8_punchthrough_alpha1(dst_row, dst_stride,
                                             src_row, src_stride,
-                                            src_width, src_height);
+					    src_width, src_height, bgra);
 }
 
 
diff --git a/src/mesa/main/texcompress_etc.h b/src/mesa/main/texcompress_etc.h
index 319b7be..2c764b8 100644
--- a/src/mesa/main/texcompress_etc.h
+++ b/src/mesa/main/texcompress_etc.h
@@ -77,7 +77,8 @@
                          unsigned src_stride,
                          unsigned src_width,
                          unsigned src_height,
-                         mesa_format format);
+			 mesa_format format,
+			 bool bgra);
 
 compressed_fetch_func
 _mesa_get_etc_fetch_func(mesa_format format);
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index 22fc8da..a69c8dd 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -103,7 +103,7 @@
 
 
 /** Set an RGB or A combiner mode/function */
-static void
+static bool
 set_combiner_mode(struct gl_context *ctx,
                   struct gl_fixedfunc_texture_unit *texUnit,
                   GLenum pname, GLenum mode)
@@ -144,32 +144,35 @@
 
    if (!legal) {
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(param=%s)", mode);
-      return;
+      return false;
    }
 
    switch (pname) {
    case GL_COMBINE_RGB:
       if (texUnit->Combine.ModeRGB == mode)
-         return;
+         return true;
       FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
       texUnit->Combine.ModeRGB = mode;
       break;
 
    case GL_COMBINE_ALPHA:
       if (texUnit->Combine.ModeA == mode)
-         return;
+         return true;
       FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
       texUnit->Combine.ModeA = mode;
       break;
    default:
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
+      return false;
    }
+
+   return true;
 }
 
 
 
 /** Set an RGB or A combiner source term */
-static void
+static bool
 set_combiner_source(struct gl_context *ctx,
                     struct gl_fixedfunc_texture_unit *texUnit,
                     GLenum pname, GLenum param)
@@ -199,13 +202,13 @@
       break;
    default:
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
-      return;
+      return false;
    }
 
    if ((term == 3) && (ctx->API != API_OPENGL_COMPAT
                        || !ctx->Extensions.NV_texture_env_combine4)) {
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
-      return;
+      return false;
    }
 
    assert(term < MAX_COMBINER_TERMS);
@@ -246,7 +249,7 @@
 
    if (!legal) {
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(param=%s)", param);
-      return;
+      return false;
    }
 
    FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
@@ -255,11 +258,13 @@
       texUnit->Combine.SourceA[term] = param;
    else
       texUnit->Combine.SourceRGB[term] = param;
+
+   return true;
 }
 
 
 /** Set an RGB or A combiner operand term */
-static void
+static bool
 set_combiner_operand(struct gl_context *ctx,
                      struct gl_fixedfunc_texture_unit *texUnit,
                      GLenum pname, GLenum param)
@@ -286,13 +291,13 @@
       break;
    default:
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
-      return;
+      return false;
    }
 
    if ((term == 3) && (ctx->API != API_OPENGL_COMPAT
                        || !ctx->Extensions.NV_texture_env_combine4)) {
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
-      return;
+      return false;
    }
 
    assert(term < MAX_COMBINER_TERMS);
@@ -328,7 +333,7 @@
 
    if (!legal) {
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(param=%s)", param);
-      return;
+      return false;
    }
 
    FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
@@ -337,10 +342,12 @@
       texUnit->Combine.OperandA[term] = param;
    else
       texUnit->Combine.OperandRGB[term] = param;
+
+   return true;
 }
 
 
-static void
+static bool
 set_combiner_scale(struct gl_context *ctx,
                    struct gl_fixedfunc_texture_unit *texUnit,
                    GLenum pname, GLfloat scale)
@@ -359,25 +366,28 @@
    else {
       _mesa_error( ctx, GL_INVALID_VALUE,
                    "glTexEnv(GL_RGB_SCALE not 1, 2 or 4)" );
-      return;
+      return false;
    }
 
    switch (pname) {
    case GL_RGB_SCALE:
       if (texUnit->Combine.ScaleShiftRGB == shift)
-         return;
+         return true;
       FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
       texUnit->Combine.ScaleShiftRGB = shift;
       break;
    case GL_ALPHA_SCALE:
       if (texUnit->Combine.ScaleShiftA == shift)
-         return;
+         return true;
       FLUSH_VERTICES(ctx, _NEW_TEXTURE_STATE);
       texUnit->Combine.ScaleShiftA = shift;
       break;
    default:
       TE_ERROR(GL_INVALID_ENUM, "glTexEnv(pname=%s)", pname);
+      return false;
    }
+
+   return true;
 }
 
 
@@ -418,7 +428,8 @@
          break;
       case GL_COMBINE_RGB:
       case GL_COMBINE_ALPHA:
-         set_combiner_mode(ctx, texUnit, pname, (GLenum) iparam0);
+         if (!set_combiner_mode(ctx, texUnit, pname, (GLenum) iparam0))
+            return;
 	 break;
       case GL_SOURCE0_RGB:
       case GL_SOURCE1_RGB:
@@ -428,7 +439,8 @@
       case GL_SOURCE1_ALPHA:
       case GL_SOURCE2_ALPHA:
       case GL_SOURCE3_ALPHA_NV:
-         set_combiner_source(ctx, texUnit, pname, (GLenum) iparam0);
+         if (!set_combiner_source(ctx, texUnit, pname, (GLenum) iparam0))
+            return;
 	 break;
       case GL_OPERAND0_RGB:
       case GL_OPERAND1_RGB:
@@ -438,11 +450,13 @@
       case GL_OPERAND1_ALPHA:
       case GL_OPERAND2_ALPHA:
       case GL_OPERAND3_ALPHA_NV:
-         set_combiner_operand(ctx, texUnit, pname, (GLenum) iparam0);
+         if (!set_combiner_operand(ctx, texUnit, pname, (GLenum) iparam0))
+            return;
 	 break;
       case GL_RGB_SCALE:
       case GL_ALPHA_SCALE:
-         set_combiner_scale(ctx, texUnit, pname, param[0]);
+         if (!set_combiner_scale(ctx, texUnit, pname, param[0]))
+            return;
 	 break;
       default:
 	 _mesa_error( ctx, GL_INVALID_ENUM, "glTexEnv(pname)" );
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 0ab9ed4..0c1e5d2 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -900,8 +900,7 @@
 
 /**
  * Error-check the offset and size arguments to
- * glGet[Compressed]TextureSubImage().  Also checks if the specified
- * texture image is missing.
+ * glGet[Compressed]TextureSubImage().
  * \return true if error, false if no error.
  */
 static bool
@@ -913,6 +912,7 @@
                        const char *caller)
 {
    const struct gl_texture_image *texImage;
+   GLuint imageWidth = 0, imageHeight = 0, imageDepth = 0;
 
    if (xoffset < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE, "%s(xoffset = %d)", caller, xoffset);
@@ -1002,61 +1002,38 @@
    }
 
    texImage = select_tex_image(texObj, target, level, zoffset);
-   if (!texImage) {
-      /* Trying to return a non-defined level is a valid operation per se, as
-       * OpenGL 4.6 spec, section 8.11.4 ("Texture Image Queries") does not
-       * handle this case as an error.
-       *
-       * Rather, we need to look at section 8.22 ("Texture State and Proxy
-       * State"):
-       *
-       *   "Each initial texture image is null. It has zero width, height, and
-       *    depth, internal format RGBA, or R8 for buffer textures, component
-       *    sizes set to zero and component types set to NONE, the compressed
-       *    flag set to FALSE, a zero compressed size, and the bound buffer
-       *    object name is zero."
-       *
-       * This means we need to assume the image for the non-defined level is
-       * an empty image. With this assumption, we can go back to section
-       * 8.11.4 and checking again the errors:
-       *
-       *   "An INVALID_VALUE error is generated if xoffset + width is greater
-       *    than the texture’s width, yoffset + height is greater than the
-       *    texture’s height, or zoffset + depth is greater than the texture’s
-       *    depth."
-       *
-       * Thus why we return INVALID_VALUE.
-       */
-      _mesa_error(ctx, GL_INVALID_VALUE, "%s(missing image)", caller);
-      return true;
+   if (texImage) {
+      imageWidth = texImage->Width;
+      imageHeight = texImage->Height;
+      imageDepth = texImage->Depth;
    }
 
-   if (xoffset + width > texImage->Width) {
+   if (xoffset + width > imageWidth) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(xoffset %d + width %d > %u)",
-                  caller, xoffset, width, texImage->Width);
+                  caller, xoffset, width, imageWidth);
       return true;
    }
 
-   if (yoffset + height > texImage->Height) {
+   if (yoffset + height > imageHeight) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "%s(yoffset %d + height %d > %u)",
-                  caller, yoffset, height, texImage->Height);
+                  caller, yoffset, height, imageHeight);
       return true;
    }
 
    if (target != GL_TEXTURE_CUBE_MAP) {
       /* Cube map error checking was done above */
-      if (zoffset + depth > texImage->Depth) {
+      if (zoffset + depth > imageDepth) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "%s(zoffset %d + depth %d > %u)",
-                     caller, zoffset, depth, texImage->Depth);
+                     caller, zoffset, depth, imageDepth);
          return true;
       }
    }
 
    /* Extra checks for compressed textures */
-   {
+   if (texImage) {
       GLuint bw, bh, bd;
       _mesa_get_format_block_size_3d(texImage->TexFormat, &bw, &bh, &bd);
       if (bw > 1 || bh > 1 || bd > 1) {
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 5284e60..948c7df 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -4008,7 +4008,7 @@
        */
       else if (formats_differ_in_component_sizes (texFormat, rb->Format)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glCopyTexImage%uD(componenet size changed in"
+                        "glCopyTexImage%uD(component size changed in"
                         " internal format)", dims);
             return;
       }
@@ -5345,7 +5345,7 @@
    case GL_RGBA8:
       return MESA_FORMAT_R8G8B8A8_UNORM;
    case GL_RGBA16:
-      if (_mesa_is_gles(ctx))
+      if (_mesa_is_gles(ctx) && !_mesa_has_EXT_texture_norm16(ctx))
          return MESA_FORMAT_NONE;
       return MESA_FORMAT_RGBA_UNORM16;
    case GL_RGBA16F_ARB:
@@ -5368,7 +5368,7 @@
    case GL_RG8:
       return MESA_FORMAT_R8G8_UNORM;
    case GL_RG16:
-      if (_mesa_is_gles(ctx))
+      if (_mesa_is_gles(ctx) && !_mesa_has_EXT_texture_norm16(ctx))
          return MESA_FORMAT_NONE;
       return MESA_FORMAT_R16G16_UNORM;
    case GL_RG16F:
@@ -5391,7 +5391,7 @@
    case GL_R8:
       return MESA_FORMAT_R_UNORM8;
    case GL_R16:
-      if (_mesa_is_gles(ctx))
+      if (_mesa_is_gles(ctx) && !_mesa_has_EXT_texture_norm16(ctx))
          return MESA_FORMAT_NONE;
       return MESA_FORMAT_R_UNORM16;
    case GL_R16F:
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 301407e..a3ec724 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1426,6 +1426,11 @@
                               _mesa_get_format_bits(texFormat,
                                                     GL_TEXTURE_GREEN_SIZE));
             }
+            if (*params == 0 && pname == GL_TEXTURE_INTENSITY_SIZE) {
+               /* Gallium may store intensity as LA */
+               *params = _mesa_get_format_bits(texFormat, 
+                                               GL_TEXTURE_ALPHA_SIZE);
+            }
          }
          else {
             *params = 0;
@@ -1979,33 +1984,32 @@
          break;
 
       case GL_TEXTURE_IMMUTABLE_LEVELS:
-         if (_mesa_is_gles3(ctx) ||
-             (_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_view))
+         if (_mesa_is_gles3(ctx) || _mesa_has_texture_view(ctx))
             *params = (GLfloat) obj->ImmutableLevels;
          else
             goto invalid_pname;
          break;
 
       case GL_TEXTURE_VIEW_MIN_LEVEL:
-         if (!ctx->Extensions.ARB_texture_view)
+         if (!_mesa_has_texture_view(ctx))
             goto invalid_pname;
          *params = (GLfloat) obj->MinLevel;
          break;
 
       case GL_TEXTURE_VIEW_NUM_LEVELS:
-         if (!ctx->Extensions.ARB_texture_view)
+         if (!_mesa_has_texture_view(ctx))
             goto invalid_pname;
          *params = (GLfloat) obj->NumLevels;
          break;
 
       case GL_TEXTURE_VIEW_MIN_LAYER:
-         if (!ctx->Extensions.ARB_texture_view)
+         if (!_mesa_has_texture_view(ctx))
             goto invalid_pname;
          *params = (GLfloat) obj->MinLayer;
          break;
 
       case GL_TEXTURE_VIEW_NUM_LAYERS:
-         if (!ctx->Extensions.ARB_texture_view)
+         if (!_mesa_has_texture_view(ctx))
             goto invalid_pname;
          *params = (GLfloat) obj->NumLayers;
          break;
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index 44edba3..9cb8b90 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -445,6 +445,7 @@
             _mesa_error(ctx, GL_OUT_OF_MEMORY,
                         "glTex%sStorage%uD(texture too large)",
                         suffix, dims);
+            return;
          }
       }
 
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 31163f6..2913d4b 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -79,7 +79,7 @@
 
 
 enum {
-   ZERO = 4, 
+   ZERO = 4,
    ONE = 5
 };
 
@@ -431,47 +431,47 @@
    for (img = 0; img < srcDepth; img++) {
       GLuint *dstRow = (GLuint *) dstSlices[img];
       const GLubyte *src
-	 = (const GLubyte *) _mesa_image_address(dims, srcPacking, srcAddr,
-						srcWidth, srcHeight,
-						srcFormat, srcType,
-						img, 0, 0);
+         = (const GLubyte *) _mesa_image_address(dims, srcPacking, srcAddr,
+                                                srcWidth, srcHeight,
+                                                srcFormat, srcType,
+                                                img, 0, 0);
       for (row = 0; row < srcHeight; row++) {
-	 GLint i;
-	 GLboolean keepdepth = GL_FALSE, keepstencil = GL_FALSE;
-	 
-	 if (srcFormat == GL_DEPTH_COMPONENT) { /* preserve stencil */
-	    keepstencil = GL_TRUE;
-	 }
+         GLint i;
+         GLboolean keepdepth = GL_FALSE, keepstencil = GL_FALSE;
+
+         if (srcFormat == GL_DEPTH_COMPONENT) { /* preserve stencil */
+            keepstencil = GL_TRUE;
+         }
          else if (srcFormat == GL_STENCIL_INDEX) { /* preserve depth */
-	    keepdepth = GL_TRUE;
-	 }
+            keepdepth = GL_TRUE;
+         }
 
-	 if (keepdepth == GL_FALSE)
-	    /* the 24 depth bits will be in the low position: */
-	    _mesa_unpack_depth_span(ctx, srcWidth,
-				    GL_UNSIGNED_INT, /* dst type */
-				    keepstencil ? depth : dstRow, /* dst addr */
-				    depthScale,
-				    srcType, src, srcPacking);	 
+         if (keepdepth == GL_FALSE)
+            /* the 24 depth bits will be in the low position: */
+            _mesa_unpack_depth_span(ctx, srcWidth,
+                                    GL_UNSIGNED_INT, /* dst type */
+                                    keepstencil ? depth : dstRow, /* dst addr */
+                                    depthScale,
+                                    srcType, src, srcPacking);
 
-	 if (keepstencil == GL_FALSE)
-	    /* get the 8-bit stencil values */
-	    _mesa_unpack_stencil_span(ctx, srcWidth,
-				      GL_UNSIGNED_BYTE, /* dst type */
-				      stencil, /* dst addr */
-				      srcType, src, srcPacking,
-				      ctx->_ImageTransferState);
+         if (keepstencil == GL_FALSE)
+            /* get the 8-bit stencil values */
+            _mesa_unpack_stencil_span(ctx, srcWidth,
+                                      GL_UNSIGNED_BYTE, /* dst type */
+                                      stencil, /* dst addr */
+                                      srcType, src, srcPacking,
+                                      ctx->_ImageTransferState);
 
-	 /* merge stencil values into depth values */
-	 for (i = 0; i < srcWidth; i++) {
-	    if (keepstencil)
-	       dstRow[i] = depth[i] | (dstRow[i] & 0xFF000000);
-	    else
-	       dstRow[i] = (dstRow[i] & 0xFFFFFF) | (stencil[i] << 24);
+         /* merge stencil values into depth values */
+         for (i = 0; i < srcWidth; i++) {
+            if (keepstencil)
+               dstRow[i] = depth[i] | (dstRow[i] & 0xFF000000);
+            else
+               dstRow[i] = (dstRow[i] & 0xFFFFFF) | (stencil[i] << 24);
 
-	 }
-	 src += srcRowStride;
-	 dstRow += dstRowStride / sizeof(GLuint);
+         }
+         src += srcRowStride;
+         dstRow += dstRowStride / sizeof(GLuint);
       }
    }
 
@@ -493,7 +493,7 @@
 
    {
       const GLint srcRowStride
-	 = _mesa_image_row_stride(srcPacking, srcWidth, srcFormat, srcType);
+         = _mesa_image_row_stride(srcPacking, srcWidth, srcFormat, srcType);
       GLint img, row;
       GLubyte *stencil = malloc(srcWidth * sizeof(GLubyte));
 
@@ -673,12 +673,10 @@
 static GLboolean
 texstore_rgba(TEXSTORE_PARAMS)
 {
-   void *tempImage = NULL, *tempRGBA = NULL;
-   int srcRowStride, img;
+   void *tempImage = NULL;
+   int img;
    GLubyte *src, *dst;
-   uint32_t srcMesaFormat;
    uint8_t rebaseSwizzle[4];
-   bool needRebase;
    bool transferOpsDone = false;
 
    /* We have to handle MESA_FORMAT_YCBCR manually because it is a special case
@@ -725,7 +723,9 @@
        */
       GLint swapSize = _mesa_sizeof_packed_type(srcType);
       if (swapSize == 2 || swapSize == 4) {
-         int imageStride = _mesa_image_image_stride(srcPacking, srcWidth, srcHeight, srcFormat, srcType);
+         int imageStride = _mesa_image_image_stride(srcPacking, srcWidth,
+                                                    srcHeight, srcFormat,
+                                                    srcType);
          int bufferSize = imageStride * srcDepth;
          int layer;
          const uint8_t *src;
@@ -748,15 +748,18 @@
       }
    }
 
-   srcRowStride =
+   int srcRowStride =
       _mesa_image_row_stride(srcPacking, srcWidth, srcFormat, srcType);
 
-   srcMesaFormat = _mesa_format_from_format_and_type(srcFormat, srcType);
+   uint32_t srcMesaFormat =
+      _mesa_format_from_format_and_type(srcFormat, srcType);
+
    dstFormat = _mesa_get_srgb_format_linear(dstFormat);
 
    /* If we have transferOps then we need to convert to RGBA float first,
       then apply transferOps, then do the conversion to dst
     */
+   void *tempRGBA = NULL;
    if (!transferOpsDone &&
        _mesa_texstore_needs_transfer_ops(ctx, baseInternalFormat, dstFormat)) {
       /* Allocate RGBA float image */
@@ -764,7 +767,6 @@
       tempRGBA = malloc(4 * elementCount * sizeof(float));
       if (!tempRGBA) {
          free(tempImage);
-         free(tempRGBA);
          return GL_FALSE;
       }
 
@@ -798,6 +800,7 @@
       _mesa_image_address(dims, srcPacking, srcAddr, srcWidth, srcHeight,
                           srcFormat, srcType, 0, 0, 0);
 
+   bool needRebase;
    if (_mesa_get_format_base_format(dstFormat) != baseInternalFormat) {
       needRebase =
          _mesa_compute_rgba2base2rgba_component_mapping(baseInternalFormat,
@@ -1054,7 +1057,8 @@
                                                 format, type);
       break;
    default:
-      _mesa_warning(ctx, "Unexpected target 0x%x in store_texsubimage()", target);
+      _mesa_warning(ctx, "Unexpected target 0x%x in store_texsubimage()",
+                    target);
       return;
    }
 
@@ -1289,7 +1293,8 @@
             ((packing->RowLength + bw - 1) / bw);
       }
 
-      store->SkipBytes += packing->SkipPixels * packing->CompressedBlockSize / bw;
+      store->SkipBytes +=
+         packing->SkipPixels * packing->CompressedBlockSize / bw;
    }
 
    if (dims > 1 && packing->CompressedBlockHeight &&
@@ -1377,7 +1382,8 @@
          ctx->Driver.UnmapTextureImage(ctx, texImage, slice + zoffset);
 
          /* advance to next slice */
-         src += store.TotalBytesPerRow * (store.TotalRowsPerSlice - store.CopyRowsPerSlice);
+         src += store.TotalBytesPerRow * (store.TotalRowsPerSlice
+                                          - store.CopyRowsPerSlice);
       }
       else {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage%uD",
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index a46c9f9..8eccdc2 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -40,6 +40,7 @@
 #include "shaderapi.h"
 #include "shaderobj.h"
 
+#include "program/program.h"
 #include "program/prog_parameter.h"
 
 struct using_program_tuple
@@ -470,6 +471,7 @@
 
    if (obj->program != source) {
       ctx->NewDriverState |= ctx->DriverFlags.NewTransformFeedbackProg;
+      _mesa_reference_program_(ctx, &obj->program, source);
       obj->program = source;
    }
 
@@ -504,6 +506,7 @@
    assert(ctx->Driver.EndTransformFeedback);
    ctx->Driver.EndTransformFeedback(ctx, obj);
 
+   _mesa_reference_program_(ctx, &obj->program, NULL);
    ctx->TransformFeedback.CurrentObject->Active = GL_FALSE;
    ctx->TransformFeedback.CurrentObject->Paused = GL_FALSE;
    ctx->TransformFeedback.CurrentObject->EndedAnytime = GL_TRUE;
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index 5df38a1..a3e1aeb 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -159,7 +159,7 @@
 _mesa_vertex_attrib_binding(struct gl_context *ctx,
                             struct gl_vertex_array_object *vao,
                             gl_vert_attrib attribIndex,
-                            GLuint bindingIndex, bool flush_vertices)
+                            GLuint bindingIndex)
 {
    struct gl_array_attributes *array = &vao->VertexAttrib[attribIndex];
    assert(!vao->SharedAndImmutable);
@@ -172,16 +172,14 @@
       else
          vao->VertexAttribBufferMask &= ~array_bit;
 
-      if (flush_vertices) {
-         FLUSH_VERTICES(ctx, _NEW_ARRAY);
-      }
-
       vao->BufferBinding[array->BufferBindingIndex]._BoundArrays &= ~array_bit;
       vao->BufferBinding[bindingIndex]._BoundArrays |= array_bit;
 
       array->BufferBindingIndex = bindingIndex;
 
       vao->NewArrays |= vao->_Enabled & array_bit;
+      if (vao == ctx->Array.VAO)
+         ctx->NewState |= _NEW_ARRAY;
    }
 }
 
@@ -195,7 +193,7 @@
                          struct gl_vertex_array_object *vao,
                          GLuint index,
                          struct gl_buffer_object *vbo,
-                         GLintptr offset, GLsizei stride, bool flush_vertices)
+                         GLintptr offset, GLsizei stride)
 {
    assert(index < ARRAY_SIZE(vao->BufferBinding));
    assert(!vao->SharedAndImmutable);
@@ -204,9 +202,6 @@
    if (binding->BufferObj != vbo ||
        binding->Offset != offset ||
        binding->Stride != stride) {
-      if (flush_vertices) {
-         FLUSH_VERTICES(ctx, _NEW_ARRAY);
-      }
 
       _mesa_reference_buffer_object(ctx, &binding->BufferObj, vbo);
 
@@ -219,6 +214,8 @@
          vao->VertexAttribBufferMask |= binding->_BoundArrays;
 
       vao->NewArrays |= vao->_Enabled & binding->_BoundArrays;
+      if (vao == ctx->Array.VAO)
+         ctx->NewState |= _NEW_ARRAY;
    }
 }
 
@@ -238,9 +235,10 @@
    assert(!vao->SharedAndImmutable);
 
    if (binding->InstanceDivisor != divisor) {
-      FLUSH_VERTICES(ctx, _NEW_ARRAY);
       binding->InstanceDivisor = divisor;
       vao->NewArrays |= vao->_Enabled & binding->_BoundArrays;
+      if (vao == ctx->Array.VAO)
+         ctx->NewState |= _NEW_ARRAY;
    }
 }
 
@@ -322,8 +320,6 @@
  * \param doubles        Double values not reduced to floats
  * \param relativeOffset Offset of the first element relative to the binding
  *                       offset.
- * \param flush_verties  Should \c FLUSH_VERTICES be invoked before updating
- *                       state?
  */
 void
 _mesa_update_array_format(struct gl_context *ctx,
@@ -352,7 +348,8 @@
    array->_ElementSize = elementSize;
 
    vao->NewArrays |= vao->_Enabled & VERT_BIT(attrib);
-   ctx->NewState |= _NEW_ARRAY;
+   if (vao == ctx->Array.VAO)
+      ctx->NewState |= _NEW_ARRAY;
 }
 
 /**
@@ -599,18 +596,23 @@
                              normalized, integer, doubles, 0);
 
    /* Reset the vertex attrib binding */
-   _mesa_vertex_attrib_binding(ctx, vao, attrib, attrib, true);
+   _mesa_vertex_attrib_binding(ctx, vao, attrib, attrib);
 
    /* The Stride and Ptr fields are not set by update_array_format() */
    struct gl_array_attributes *array = &vao->VertexAttrib[attrib];
    array->Stride = stride;
+   /* For updating the pointer we would need to add the vao->NewArrays flag
+    * to the VAO. But that is done already unconditionally in
+    * _mesa_update_array_format called above.
+    */
+   assert((vao->NewArrays | ~vao->_Enabled) & VERT_BIT(attrib));
    array->Ptr = ptr;
 
    /* Update the vertex buffer binding */
    GLsizei effectiveStride = stride != 0 ? stride : array->_ElementSize;
    _mesa_bind_vertex_buffer(ctx, vao, attrib,
                             ctx->Array.ArrayBufferObj, (GLintptr) ptr,
-                            effectiveStride, true);
+                            effectiveStride);
 }
 
 void GLAPIENTRY
@@ -618,7 +620,6 @@
                              const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_POS, GL_RGBA, 4, size, type, stride,
                 GL_FALSE, GL_FALSE, GL_FALSE, ptr);
@@ -630,8 +631,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
@@ -655,7 +654,6 @@
 _mesa_NormalPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr )
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_NORMAL, GL_RGBA, 3, 3, type, stride, GL_TRUE,
                 GL_FALSE, GL_FALSE, ptr);
@@ -667,8 +665,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
@@ -693,7 +689,6 @@
                             const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    update_array(ctx, VERT_ATTRIB_COLOR0, format, BGRA_OR_4, size,
@@ -707,8 +702,6 @@
    GET_CURRENT_CONTEXT(ctx);
    const GLint sizeMin = (ctx->API == API_OPENGLES) ? 4 : 3;
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (UNSIGNED_BYTE_BIT | HALF_BIT | FLOAT_BIT | FIXED_ES_BIT)
@@ -735,7 +728,6 @@
 _mesa_FogCoordPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_FOG, GL_RGBA, 1, 1, type, stride, GL_FALSE,
                 GL_FALSE, GL_FALSE, ptr);
@@ -747,8 +739,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    const GLbitfield legalTypes = (HALF_BIT | FLOAT_BIT | DOUBLE_BIT);
 
@@ -767,7 +757,6 @@
 _mesa_IndexPointer_no_error(GLenum type, GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_COLOR_INDEX, GL_RGBA, 1, 1, type, stride,
                 GL_FALSE, GL_FALSE, GL_FALSE, ptr);
@@ -779,8 +768,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    const GLbitfield legalTypes = (UNSIGNED_BYTE_BIT | SHORT_BIT | INT_BIT |
                                      FLOAT_BIT | DOUBLE_BIT);
@@ -802,7 +789,6 @@
                                      GLsizei stride, const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    update_array(ctx, VERT_ATTRIB_COLOR1, format, BGRA_OR_4, size, type,
@@ -816,8 +802,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = get_array_format(ctx, BGRA_OR_4, &size);
    const GLbitfield legalTypes = (BYTE_BIT | UNSIGNED_BYTE_BIT |
                                   SHORT_BIT | UNSIGNED_SHORT_BIT |
@@ -844,7 +828,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    const GLuint unit = ctx->Array.ActiveTexture;
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_TEX(unit), GL_RGBA, 4, size, type,
                 stride, GL_FALSE, GL_FALSE, GL_FALSE, ptr);
@@ -859,8 +842,6 @@
    const GLint sizeMin = (ctx->API == API_OPENGLES) ? 2 : 1;
    const GLuint unit = ctx->Array.ActiveTexture;
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    const GLbitfield legalTypes = (ctx->API == API_OPENGLES)
       ? (BYTE_BIT | SHORT_BIT | FLOAT_BIT | FIXED_ES_BIT)
@@ -887,7 +868,6 @@
    /* this is the same type that glEdgeFlag uses */
    const GLboolean integer = GL_FALSE;
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_EDGEFLAG, GL_RGBA, 1, 1, GL_UNSIGNED_BYTE,
                 stride, GL_FALSE, integer, GL_FALSE, ptr);
@@ -901,8 +881,6 @@
    const GLboolean integer = GL_FALSE;
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    const GLbitfield legalTypes = UNSIGNED_BYTE_BIT;
 
@@ -923,7 +901,6 @@
                                    const GLvoid *ptr)
 {
    GET_CURRENT_CONTEXT(ctx);
-   FLUSH_VERTICES(ctx, 0);
 
    update_array(ctx, VERT_ATTRIB_POINT_SIZE, GL_RGBA, 1, 1, type, stride,
                 GL_FALSE, GL_FALSE, GL_FALSE, ptr);
@@ -935,8 +912,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    GLenum format = GL_RGBA;
    if (ctx->API != API_OPENGLES) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -1098,22 +1073,21 @@
 void
 _mesa_enable_vertex_array_attrib(struct gl_context *ctx,
                                  struct gl_vertex_array_object *vao,
-                                 gl_vert_attrib attrib, bool flush_vertices)
+                                 gl_vert_attrib attrib)
 {
    assert(attrib < ARRAY_SIZE(vao->VertexAttrib));
    assert(!vao->SharedAndImmutable);
 
    if (!vao->VertexAttrib[attrib].Enabled) {
       /* was disabled, now being enabled */
-      if (flush_vertices) {
-         FLUSH_VERTICES(ctx, _NEW_ARRAY);
-      }
-
       vao->VertexAttrib[attrib].Enabled = GL_TRUE;
       const GLbitfield array_bit = VERT_BIT(attrib);
       vao->_Enabled |= array_bit;
       vao->NewArrays |= array_bit;
 
+      if (vao == ctx->Array.VAO)
+         ctx->NewState |= _NEW_ARRAY;
+
       /* Update the map mode if needed */
       if (array_bit & (VERT_BIT_POS|VERT_BIT_GENERIC0))
          update_attribute_map_mode(ctx, vao);
@@ -1131,8 +1105,7 @@
       return;
    }
 
-   _mesa_enable_vertex_array_attrib(ctx, vao,
-                                    VERT_ATTRIB_GENERIC(index), true);
+   _mesa_enable_vertex_array_attrib(ctx, vao, VERT_ATTRIB_GENERIC(index));
 }
 
 
@@ -1150,7 +1123,7 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    _mesa_enable_vertex_array_attrib(ctx, ctx->Array.VAO,
-                                    VERT_ATTRIB_GENERIC(index), true);
+                                    VERT_ATTRIB_GENERIC(index));
 }
 
 
@@ -1180,30 +1153,28 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
-   _mesa_enable_vertex_array_attrib(ctx, vao,
-                                    VERT_ATTRIB_GENERIC(index), true);
+   _mesa_enable_vertex_array_attrib(ctx, vao, VERT_ATTRIB_GENERIC(index));
 }
 
 
 void
 _mesa_disable_vertex_array_attrib(struct gl_context *ctx,
                                   struct gl_vertex_array_object *vao,
-                                  gl_vert_attrib attrib, bool flush_vertices)
+                                  gl_vert_attrib attrib)
 {
    assert(attrib < ARRAY_SIZE(vao->VertexAttrib));
    assert(!vao->SharedAndImmutable);
 
    if (vao->VertexAttrib[attrib].Enabled) {
       /* was enabled, now being disabled */
-      if (flush_vertices) {
-         FLUSH_VERTICES(ctx, _NEW_ARRAY);
-      }
-
       vao->VertexAttrib[attrib].Enabled = GL_FALSE;
       const GLbitfield array_bit = VERT_BIT(attrib);
       vao->_Enabled &= ~array_bit;
       vao->NewArrays |= array_bit;
 
+      if (vao == ctx->Array.VAO)
+         ctx->NewState |= _NEW_ARRAY;
+
       /* Update the map mode if needed */
       if (array_bit & (VERT_BIT_POS|VERT_BIT_GENERIC0))
          update_attribute_map_mode(ctx, vao);
@@ -1222,7 +1193,7 @@
    }
 
    const gl_vert_attrib attrib = VERT_ATTRIB_GENERIC(index);
-   _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attrib, true);
+   _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attrib);
 }
 
 
@@ -1231,7 +1202,7 @@
 {
    GET_CURRENT_CONTEXT(ctx);
    const gl_vert_attrib attrib = VERT_ATTRIB_GENERIC(index);
-   _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attrib, true);
+   _mesa_disable_vertex_array_attrib(ctx, ctx->Array.VAO, attrib);
 }
 
 
@@ -1258,7 +1229,7 @@
    }
 
    const gl_vert_attrib attrib = VERT_ATTRIB_GENERIC(index);
-   _mesa_disable_vertex_array_attrib(ctx, vao, attrib, true);
+   _mesa_disable_vertex_array_attrib(ctx, vao, attrib);
 }
 
 
@@ -1268,7 +1239,7 @@
    GET_CURRENT_CONTEXT(ctx);
    struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
    const gl_vert_attrib attrib = VERT_ATTRIB_GENERIC(index);
-   _mesa_disable_vertex_array_attrib(ctx, vao, attrib, true);
+   _mesa_disable_vertex_array_attrib(ctx, vao, attrib);
 }
 
 
@@ -1723,8 +1694,6 @@
    GLint defstride;                /* default stride */
    GLint c, f;
 
-   FLUSH_VERTICES(ctx, 0);
-
    f = sizeof(GLfloat);
    c = f * ((4 * sizeof(GLubyte) + (f - 1)) / f);
 
@@ -1894,8 +1863,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glLockArrays %d %d\n", first, count);
 
@@ -1924,8 +1891,6 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   FLUSH_VERTICES(ctx, 0);
-
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glUnlockArrays\n");
 
@@ -1986,10 +1951,7 @@
 static void
 primitive_restart_index(struct gl_context *ctx, GLuint index)
 {
-   if (ctx->Array.RestartIndex != index) {
-      FLUSH_VERTICES(ctx, 0);
-      ctx->Array.RestartIndex = index;
-   }
+   ctx->Array.RestartIndex = index;
 }
 
 
@@ -2039,7 +2001,7 @@
     *       VertexAttribBinding(index, index);
     *       VertexBindingDivisor(index, divisor);"
     */
-   _mesa_vertex_attrib_binding(ctx, vao, genericIndex, genericIndex, true);
+   _mesa_vertex_attrib_binding(ctx, vao, genericIndex, genericIndex);
    vertex_binding_divisor(ctx, vao, genericIndex, divisor);
 }
 
@@ -2081,7 +2043,7 @@
     *       VertexAttribBinding(index, index);
     *       VertexBindingDivisor(index, divisor);"
     */
-   _mesa_vertex_attrib_binding(ctx, vao, genericIndex, genericIndex, true);
+   _mesa_vertex_attrib_binding(ctx, vao, genericIndex, genericIndex);
    vertex_binding_divisor(ctx, vao, genericIndex, divisor);
 }
 
@@ -2125,7 +2087,7 @@
    }
 
    _mesa_bind_vertex_buffer(ctx, vao, VERT_ATTRIB_GENERIC(bindingIndex),
-                            vbo, offset, stride, true);
+                            vbo, offset, stride);
 }
 
 
@@ -2277,7 +2239,7 @@
 
       for (i = 0; i < count; i++)
          _mesa_bind_vertex_buffer(ctx, vao, VERT_ATTRIB_GENERIC(first + i),
-                                  vbo, 0, 16, true);
+                                  vbo, 0, 16);
 
       return;
    }
@@ -2351,7 +2313,7 @@
       }
 
       _mesa_bind_vertex_buffer(ctx, vao, VERT_ATTRIB_GENERIC(first + i),
-                               vbo, offsets[i], strides[i], true);
+                               vbo, offsets[i], strides[i]);
    }
 
    _mesa_HashUnlockMutex(ctx->Shared->BufferObjects);
@@ -2516,8 +2478,6 @@
       }
    }
 
-   FLUSH_VERTICES(ctx, 0);
-
    _mesa_update_array_format(ctx, ctx->Array.VAO,
                              VERT_ATTRIB_GENERIC(attribIndex), size, type,
                              format, normalized, integer, doubles,
@@ -2606,8 +2566,6 @@
       }
    }
 
-   FLUSH_VERTICES(ctx, 0);
-
    _mesa_update_array_format(ctx, vao, VERT_ATTRIB_GENERIC(attribIndex), size,
                              type, format, normalized, integer, doubles,
                              relativeOffset);
@@ -2685,7 +2643,7 @@
 
    _mesa_vertex_attrib_binding(ctx, vao,
                                VERT_ATTRIB_GENERIC(attribIndex),
-                               VERT_ATTRIB_GENERIC(bindingIndex), true);
+                               VERT_ATTRIB_GENERIC(bindingIndex));
 }
 
 
@@ -2695,7 +2653,7 @@
    GET_CURRENT_CONTEXT(ctx);
    _mesa_vertex_attrib_binding(ctx, ctx->Array.VAO,
                                VERT_ATTRIB_GENERIC(attribIndex),
-                               VERT_ATTRIB_GENERIC(bindingIndex), true);
+                               VERT_ATTRIB_GENERIC(bindingIndex));
 }
 
 
@@ -2731,7 +2689,7 @@
    struct gl_vertex_array_object *vao = _mesa_lookup_vao(ctx, vaobj);
    _mesa_vertex_attrib_binding(ctx, vao,
                                VERT_ATTRIB_GENERIC(attribIndex),
-                               VERT_ATTRIB_GENERIC(bindingIndex), true);
+                               VERT_ATTRIB_GENERIC(bindingIndex));
 }
 
 
@@ -2868,6 +2826,8 @@
    dst->Ptr            = src->Ptr;
    dst->Enabled        = src->Enabled;
    dst->_ElementSize   = src->_ElementSize;
+   dst->_EffBufferBindingIndex = src->_EffBufferBindingIndex;
+   dst->_EffRelativeOffset = src->_EffRelativeOffset;
 }
 
 void
@@ -2879,6 +2839,8 @@
    dst->Stride          = src->Stride;
    dst->InstanceDivisor = src->InstanceDivisor;
    dst->_BoundArrays    = src->_BoundArrays;
+   dst->_EffBoundArrays = src->_EffBoundArrays;
+   dst->_EffOffset      = src->_EffOffset;
 
    _mesa_reference_buffer_object(ctx, &dst->BufferObj, src->BufferObj);
 }
diff --git a/src/mesa/main/varray.h b/src/mesa/main/varray.h
index 25d2a29..a901bf9 100644
--- a/src/mesa/main/varray.h
+++ b/src/mesa/main/varray.h
@@ -53,20 +53,6 @@
 }
 
 
-/**
- * This specifies the set of vertex arrays used by the driver for drawing.
- */
-static inline void
-_mesa_set_drawing_arrays(struct gl_context *ctx,
-                         const struct gl_vertex_array *arrays)
-{
-   if (ctx->Array._DrawArrays != arrays) {
-      ctx->Array._DrawArrays = arrays;
-      ctx->NewDriverState |= ctx->DriverFlags.NewArray;
-   }
-}
-
-
 extern void
 _mesa_update_array_format(struct gl_context *ctx,
                           struct gl_vertex_array_object *vao,
@@ -78,20 +64,20 @@
 extern void
 _mesa_enable_vertex_array_attrib(struct gl_context *ctx,
                                  struct gl_vertex_array_object *vao,
-                                 gl_vert_attrib attrib, bool flush_vertices);
+                                 gl_vert_attrib attrib);
 
 
 extern void
 _mesa_disable_vertex_array_attrib(struct gl_context *ctx,
                                   struct gl_vertex_array_object *vao,
-                                  gl_vert_attrib attrib, bool flush_vertices);
+                                  gl_vert_attrib attrib);
 
 
 extern void
 _mesa_vertex_attrib_binding(struct gl_context *ctx,
                             struct gl_vertex_array_object *vao,
                             gl_vert_attrib attribIndex,
-                            GLuint bindingIndex, bool flush_vertices);
+                            GLuint bindingIndex);
 
 
 extern void
@@ -99,7 +85,7 @@
                          struct gl_vertex_array_object *vao,
                          GLuint index,
                          struct gl_buffer_object *vbo,
-                         GLintptr offset, GLsizei stride, bool flush_vertices);
+                         GLintptr offset, GLsizei stride);
 
 extern void GLAPIENTRY
 _mesa_VertexPointer_no_error(GLint size, GLenum type, GLsizei stride,
@@ -463,18 +449,6 @@
                                 GLuint divisor);
 
 
-/**
- * Shallow copy one vertex array to another.
- */
-static inline void
-_mesa_copy_vertex_array(struct gl_vertex_array *dst,
-                        const struct gl_vertex_array *src)
-{
-   dst->VertexAttrib = src->VertexAttrib;
-   dst->BufferBinding = src->BufferBinding;
-}
-
-
 extern void
 _mesa_copy_vertex_attrib_array(struct gl_context *ctx,
                                struct gl_array_attributes *dst,
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 9c05f66..39fb134 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -127,7 +127,9 @@
 		     ,
 		     prefix,
 		     ctx->Version / 10, ctx->Version % 10,
-		     (ctx->API == API_OPENGL_CORE) ? " (Core Profile)" : ""
+		     (ctx->API == API_OPENGL_CORE) ? " (Core Profile)" :
+                     (ctx->API == API_OPENGL_COMPAT && ctx->Version >= 32) ?
+                        " (Compatibility Profile)" : ""
 		     );
    }
 }
@@ -260,15 +262,7 @@
                          extensions->ARB_fragment_shader &&
                          extensions->ARB_texture_non_power_of_two &&
                          extensions->EXT_blend_equation_separate &&
-
-                         /* Technically, 2.0 requires the functionality of the
-                          * EXT version.  Enable 2.0 if either extension is
-                          * available, and assume that a driver that only
-                          * exposes the ATI extension will fallback to
-                          * software when necessary.
-                          */
-                         (extensions->EXT_stencil_two_side
-                          || extensions->ATI_separate_stencil));
+                         extensions->EXT_stencil_two_side);
    const bool ver_2_1 = (ver_2_0 &&
                          extensions->EXT_pixel_buffer_object &&
                          extensions->EXT_texture_sRGB);
@@ -374,6 +368,7 @@
                          extensions->ARB_texture_view);
    const bool ver_4_4 = (ver_4_3 &&
                          consts->GLSLVersion >= 440 &&
+                         consts->MaxVertexAttribStride >= 2048 &&
                          extensions->ARB_buffer_storage &&
                          extensions->ARB_clear_texture &&
                          extensions->ARB_enhanced_layouts &&
@@ -534,6 +529,7 @@
    const bool es31_compute_shader =
       consts->MaxComputeWorkGroupInvocations >= 128;
    const bool ver_3_1 = (ver_3_0 &&
+                         consts->MaxVertexAttribStride >= 2048 &&
                          extensions->ARB_arrays_of_arrays &&
                          es31_compute_shader &&
                          extensions->ARB_draw_indirect &&
@@ -546,7 +542,8 @@
                          extensions->ARB_shading_language_packing &&
                          extensions->ARB_stencil_texturing &&
                          extensions->ARB_texture_multisample &&
-                         extensions->ARB_gpu_shader5 &&
+                         extensions->ARB_texture_gather &&
+                         extensions->MESA_shader_integer_functions &&
                          extensions->EXT_shader_integer_mix);
    const bool ver_3_2 = (ver_3_1 &&
                          extensions->EXT_draw_buffers2 &&
@@ -587,9 +584,7 @@
       /* Disable higher GLSL versions for legacy contexts.
        * This disallows creation of higher compatibility contexts. */
       if (!consts->AllowHigherCompatVersion) {
-         if (consts->GLSLVersion > 140) {
-            consts->GLSLVersion = 140;
-         }
+         consts->GLSLVersion = consts->GLSLVersionCompat;
       }
       /* fall through */
    case API_OPENGL_CORE:
@@ -621,6 +616,9 @@
     */
    if (_mesa_is_desktop_gl(ctx)) {
       switch (ctx->Version) {
+      case 21:
+         ctx->Const.GLSLVersion = 120;
+         break;
       case 30:
          ctx->Const.GLSLVersion = 130;
          break;
diff --git a/src/mesa/main/viewport.c b/src/mesa/main/viewport.c
index 398cc63..97d3285 100644
--- a/src/mesa/main/viewport.c
+++ b/src/mesa/main/viewport.c
@@ -489,6 +489,9 @@
       ctx->ViewportArray[i].Near = 0.0;
       ctx->ViewportArray[i].Far = 1.0;
    }
+
+   ctx->SubpixelPrecisionBias[0] = 0;
+   ctx->SubpixelPrecisionBias[1] = 0;
 }
 
 
@@ -599,3 +602,58 @@
       translate[2] = n;
    }
 }
+
+
+/* Store the glSubpixelPrecisionBiasNV bit counts in the context.  Vertices
+ * are flushed before the state is written, and the call is not logged here
+ * because both API entry points already log it before calling this helper. */
+static void
+subpixel_precision_bias(struct gl_context *ctx, GLuint xbits, GLuint ybits)
+{
+   FLUSH_VERTICES(ctx, 0);
+
+   ctx->SubpixelPrecisionBias[0] = xbits;
+   ctx->SubpixelPrecisionBias[1] = ybits;
+   ctx->NewDriverState |=
+      ctx->DriverFlags.NewNvConservativeRasterizationParams;
+}
+
+void GLAPIENTRY
+_mesa_SubpixelPrecisionBiasNV_no_error(GLuint xbits, GLuint ybits)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   /* No-error variant: xbits/ybits are not validated before being stored. */
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glSubpixelPrecisionBiasNV(%u, %u)\n", xbits, ybits);
+   /* Hand the values to the shared helper that updates the context. */
+   subpixel_precision_bias(ctx, xbits, ybits);
+}
+
+/**
+ * Error-checking entry point for glSubpixelPrecisionBiasNV.
+ *
+ * Records GL_INVALID_OPERATION when NV_conservative_raster is not enabled
+ * and GL_INVALID_VALUE when either bit count exceeds
+ * Const.MaxSubpixelPrecisionBiasBits; otherwise stores the new bias.
+ */
+void GLAPIENTRY
+_mesa_SubpixelPrecisionBiasNV(GLuint xbits, GLuint ybits)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glSubpixelPrecisionBiasNV(%u, %u)\n", xbits, ybits);
+
+   ASSERT_OUTSIDE_BEGIN_END(ctx);
+
+   if (!ctx->Extensions.NV_conservative_raster) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glSubpixelPrecisionBiasNV not supported");
+   } else if (xbits > ctx->Const.MaxSubpixelPrecisionBiasBits ||
+              ybits > ctx->Const.MaxSubpixelPrecisionBiasBits) {
+      /* Both axes share one limit and one error. */
+      _mesa_error(ctx, GL_INVALID_VALUE, "glSubpixelPrecisionBiasNV");
+   } else {
+      subpixel_precision_bias(ctx, xbits, ybits);
+   }
+}
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h
index 5860e4f..aca6296 100644
--- a/src/mesa/main/viewport.h
+++ b/src/mesa/main/viewport.h
@@ -104,4 +104,10 @@
 _mesa_get_viewport_xform(struct gl_context *ctx, unsigned i,
                          float scale[3], float translate[3]);
 
+extern void GLAPIENTRY
+_mesa_SubpixelPrecisionBiasNV_no_error(GLuint xbits, GLuint ybits);
+
+extern void GLAPIENTRY
+_mesa_SubpixelPrecisionBiasNV(GLuint xbits, GLuint ybits);
+
 #endif
diff --git a/src/mesa/main/vtxfmt.c b/src/mesa/main/vtxfmt.c
index 61629a4..3e96c7d 100644
--- a/src/mesa/main/vtxfmt.c
+++ b/src/mesa/main/vtxfmt.c
@@ -211,7 +211,7 @@
       SET_VertexAttribL1ui64vARB(tab, vfmt->VertexAttribL1ui64vARB);
    }
 
-   if (ctx->API == API_OPENGL_CORE) {
+   if (_mesa_is_desktop_gl(ctx)) {
       /* GL_ARB_vertex_attrib_64bit */
       SET_VertexAttribL1d(tab, vfmt->VertexAttribL1d);
       SET_VertexAttribL2d(tab, vfmt->VertexAttribL2d);
diff --git a/src/mesa/meson.build b/src/mesa/meson.build
index 3100dfc..2963369 100644
--- a/src/mesa/meson.build
+++ b/src/mesa/meson.build
@@ -63,8 +63,6 @@
   'main/api_exec.h',
   'main/api_loopback.c',
   'main/api_loopback.h',
-  'main/api_validate.c',
-  'main/api_validate.h',
   'main/arbprogram.c',
   'main/arbprogram.h',
   'main/arrayobj.c',
@@ -98,6 +96,8 @@
   'main/condrender.c',
   'main/condrender.h',
   'main/config.h',
+  'main/conservativeraster.c',
+  'main/conservativeraster.h',
   'main/context.c',
   'main/context.h',
   'main/convolve.c',
@@ -119,6 +119,8 @@
   'main/drawpix.h',
   'main/drawtex.c',
   'main/drawtex.h',
+  'main/draw_validate.c',
+  'main/draw_validate.h',
   'main/enable.c',
   'main/enable.h',
   'main/enums.h',
@@ -252,6 +254,8 @@
   'main/syncobj.c',
   'main/syncobj.h',
   'main/texcompress.c',
+  'main/texcompress_astc.cpp',
+  'main/texcompress_astc.h',
   'main/texcompress_bptc.c',
   'main/texcompress_bptc.h',
   'main/texcompress_cpal.c',
@@ -611,6 +615,8 @@
     capture : true,
   )
 endif
+
+inc_libmesa_asm = []
 if with_asm_arch == 'x86'
   files_libmesa_common += files(
     'x86/assyntax.h',
@@ -641,10 +647,18 @@
     'x86/sse_normal.S',
     'x86/read_rgba_span_x86.S',
   )
+  inc_libmesa_asm = include_directories('x86')
 elif with_asm_arch == 'x86_64'
   files_libmesa_common += files('x86-64/x86-64.h', 'x86-64/xform4.S')
+  inc_libmesa_asm = include_directories('x86-64')
+elif with_asm_arch == 'sparc'
+  files_libmesa_common += files(
+    'sparc/sparc_clip.S',
+    'sparc/norm.S',
+    'sparc/xform.S',
+  )
+  inc_libmesa_asm = include_directories('sparc')
 endif
-# TODO: sparc
 
 format_fallback_c = custom_target(
   'format_fallback.c',
@@ -708,7 +722,7 @@
   [files_libmesa_common, files_libmesa_classic],
   c_args : [c_vis_args, c_msvc_compat_args],
   cpp_args : [cpp_vis_args, cpp_msvc_compat_args],
-  include_directories : [inc_common, include_directories('main')],
+  include_directories : [inc_common, inc_libmesa_asm, include_directories('main')],
   link_with : [libglsl, libmesa_sse41],
   dependencies : idep_nir_headers,
   build_by_default : false,
@@ -719,7 +733,7 @@
   [files_libmesa_common, files_libmesa_gallium],
   c_args : [c_vis_args, c_msvc_compat_args],
   cpp_args : [cpp_vis_args, cpp_msvc_compat_args],
-  include_directories : [inc_common, include_directories('main')],
+  include_directories : [inc_common, inc_libmesa_asm, include_directories('main')],
   link_with : [libglsl, libmesa_sse41],
   dependencies : [idep_nir_headers, dep_vdpau],
   build_by_default : false,
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 60fb24b..2908819 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -3153,6 +3153,7 @@
                       "SPIR_V_BINARY_ARB state");
       }
    }
+   prog->data->spirv = spirv;
 
    if (prog->data->LinkStatus) {
       if (!spirv)
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index 4d7f388..3bbe451 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -571,7 +571,7 @@
       case STATE_FB_WPOS_Y_TRANSFORM:
          /* A driver may negate this conditional by using ZW swizzle
           * instead of XY (based on e.g. some other state). */
-         if (_mesa_is_user_fbo(ctx->DrawBuffer)) {
+         if (!ctx->DrawBuffer->FlipY) {
             /* Identity (XY) followed by flipping Y upside down (ZW). */
             value[0] = 1.0F;
             value[1] = 0.0F;
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index cd874e4..14e57b6 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -164,43 +164,14 @@
       case PROGRAM_STATE_VAR: {
          assert(c->parameters != NULL);
 
-         nir_intrinsic_instr *load =
-            nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
-         load->num_components = 4;
+         nir_deref_instr *deref = nir_build_deref_var(b, c->parameters);
 
-         load->variables[0] = nir_deref_var_create(load, c->parameters);
-         nir_deref_array *deref_arr =
-            nir_deref_array_create(load->variables[0]);
-         deref_arr->deref.type = glsl_vec4_type();
-         load->variables[0]->deref.child = &deref_arr->deref;
+         nir_ssa_def *index = nir_imm_int(b, prog_src->Index);
+         if (prog_src->RelAddr)
+            index = nir_iadd(b, index, nir_load_reg(b, c->addr_reg));
+         deref = nir_build_deref_array(b, deref, nir_channel(b, index, 0));
 
-         if (prog_src->RelAddr) {
-            deref_arr->deref_array_type = nir_deref_array_type_indirect;
-
-            nir_alu_src addr_src = { NIR_SRC_INIT };
-            addr_src.src = nir_src_for_reg(c->addr_reg);
-            nir_ssa_def *reladdr = nir_imov_alu(b, addr_src, 1);
-
-            if (prog_src->Index < 0) {
-               /* This is a negative offset which should be added to the address
-                * register's value.
-                */
-               reladdr = nir_iadd(b, reladdr, nir_imm_int(b, prog_src->Index));
-
-               deref_arr->base_offset = 0;
-            } else {
-               deref_arr->base_offset = prog_src->Index;
-            }
-            deref_arr->indirect = nir_src_for_ssa(reladdr);
-         } else {
-            deref_arr->deref_array_type = nir_deref_array_type_direct;
-            deref_arr->base_offset = prog_src->Index;
-         }
-
-         nir_builder_instr_insert(b, &load->instr);
-
-         src.src = nir_src_for_ssa(&load->dest.ssa);
+         src.src = nir_src_for_ssa(nir_load_deref(b, deref));
          break;
       }
       default:
diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c
index 45a4596..df1a94e 100644
--- a/src/mesa/state_tracker/st_atom.c
+++ b/src/mesa/state_tracker/st_atom.c
@@ -27,6 +27,7 @@
 
 
 #include <stdio.h>
+#include "main/arrayobj.h"
 #include "main/glheader.h"
 #include "main/context.h"
 
@@ -138,19 +139,15 @@
 
 static void check_attrib_edgeflag(struct st_context *st)
 {
-   const struct gl_vertex_array *arrays = st->ctx->Array._DrawArrays;
-   const struct gl_vertex_buffer_binding *binding;
    GLboolean vertdata_edgeflags, edgeflag_culls_prims, edgeflags_enabled;
    struct gl_program *vp = st->ctx->VertexProgram._Current;
 
-   if (!arrays)
-      return;
-
    edgeflags_enabled = st->ctx->Polygon.FrontMode != GL_FILL ||
                        st->ctx->Polygon.BackMode != GL_FILL;
 
-   binding = arrays[VERT_ATTRIB_EDGEFLAG].BufferBinding;
-   vertdata_edgeflags = edgeflags_enabled && binding->Stride != 0;
+   vertdata_edgeflags = edgeflags_enabled &&
+      _mesa_draw_edge_flag_array_enabled(st->ctx);
+
    if (vertdata_edgeflags != st->vertdata_edgeflags) {
       st->vertdata_edgeflags = vertdata_edgeflags;
       if (vp)
diff --git a/src/mesa/state_tracker/st_atom.h b/src/mesa/state_tracker/st_atom.h
index 2567ad3..96e128d 100644
--- a/src/mesa/state_tracker/st_atom.h
+++ b/src/mesa/state_tracker/st_atom.h
@@ -86,7 +86,7 @@
                                  ST_NEW_CS_SAMPLERS)
 
 #define ST_NEW_FRAMEBUFFER      (ST_NEW_FB_STATE | \
-                                 ST_NEW_SAMPLE_MASK | \
+                                 ST_NEW_SAMPLE_STATE | \
                                  ST_NEW_SAMPLE_SHADING)
 
 #define ST_NEW_VERTEX_PROGRAM(st, p) (p->affected_states | \
diff --git a/src/mesa/state_tracker/st_atom_array.c b/src/mesa/state_tracker/st_atom_array.c
index 2fd67e8..e4fc19e 100644
--- a/src/mesa/state_tracker/st_atom_array.c
+++ b/src/mesa/state_tracker/st_atom_array.c
@@ -48,6 +48,7 @@
 #include "main/bufferobj.h"
 #include "main/glformats.h"
 #include "main/varray.h"
+#include "main/arrayobj.h"
 
 /* vertex_formats[gltype - GL_BYTE][integer*2 + normalized][size - 1] */
 static const uint16_t vertex_formats[][4][4] = {
@@ -306,79 +307,6 @@
    return vertex_formats[type - GL_BYTE][index][size-1];
 }
 
-static const struct gl_vertex_array *
-get_client_array(const struct gl_vertex_array *arrays,
-                 unsigned mesaAttr)
-{
-   /* st_program uses 0xffffffff to denote a double placeholder attribute */
-   if (mesaAttr == ST_DOUBLE_ATTRIB_PLACEHOLDER)
-      return NULL;
-   return &arrays[mesaAttr];
-}
-
-/**
- * Examine the active arrays to determine if we have interleaved
- * vertex arrays all living in one VBO, or all living in user space.
- */
-static GLboolean
-is_interleaved_arrays(const struct st_vertex_program *vp,
-                      const struct gl_vertex_array *arrays,
-                      unsigned num_inputs)
-{
-   GLuint attr;
-   const struct gl_buffer_object *firstBufObj = NULL;
-   GLint firstStride = -1;
-   const GLubyte *firstPtr = NULL;
-   GLboolean userSpaceBuffer = GL_FALSE;
-
-   for (attr = 0; attr < num_inputs; attr++) {
-      const struct gl_vertex_array *array;
-      const struct gl_vertex_buffer_binding *binding;
-      const struct gl_array_attributes *attrib;
-      const GLubyte *ptr;
-      const struct gl_buffer_object *bufObj;
-      GLsizei stride;
-
-      array = get_client_array(arrays, vp->index_to_input[attr]);
-      if (!array)
-	 continue;
-
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-      stride = binding->Stride; /* in bytes */
-      ptr = _mesa_vertex_attrib_address(attrib, binding);
-
-      /* To keep things simple, don't allow interleaved zero-stride attribs. */
-      if (stride == 0)
-         return false;
-
-      bufObj = binding->BufferObj;
-      if (attr == 0) {
-         /* save info about the first array */
-         firstStride = stride;
-         firstPtr = ptr;
-         firstBufObj = bufObj;
-         userSpaceBuffer = !_mesa_is_bufferobj(bufObj);
-      }
-      else {
-         /* check if other arrays interleave with the first, in same buffer */
-         if (stride != firstStride)
-            return GL_FALSE; /* strides don't match */
-
-         if (bufObj != firstBufObj)
-            return GL_FALSE; /* arrays in different VBOs */
-
-         if (llabs(ptr - firstPtr) > firstStride)
-            return GL_FALSE; /* arrays start too far apart */
-
-         if ((!_mesa_is_bufferobj(bufObj)) != userSpaceBuffer)
-            return GL_FALSE; /* mix of VBO and user-space arrays */
-      }
-   }
-
-   return GL_TRUE;
-}
-
 static void init_velement(struct pipe_vertex_element *velement,
                           int src_offset, int format,
                           int instance_divisor, int vbo_index)
@@ -392,13 +320,13 @@
 
 static void init_velement_lowered(const struct st_vertex_program *vp,
                                   struct pipe_vertex_element *velements,
-                                  int src_offset, int format,
-                                  int instance_divisor, int vbo_index,
-                                  int nr_components, GLboolean doubles,
-                                  GLuint *attr_idx)
+                                  const struct gl_array_attributes *attrib,
+                                  int src_offset, int instance_divisor,
+                                  int vbo_index, int idx)
 {
-   int idx = *attr_idx;
-   if (doubles) {
+   const GLubyte nr_components = attrib->Size;
+
+   if (attrib->Doubles) {
       int lower_format;
 
       if (nr_components < 2)
@@ -427,15 +355,13 @@
             init_velement(&velements[idx], src_offset, PIPE_FORMAT_R32G32_UINT,
                           instance_divisor, vbo_index);
          }
-
-         idx++;
       }
    } else {
+      const unsigned format = st_pipe_vertex_format(attrib);
+
       init_velement(&velements[idx], src_offset,
                     format, instance_divisor, vbo_index);
-      idx++;
    }
-   *attr_idx = idx;
 }
 
 static void
@@ -457,274 +383,132 @@
    cso_set_vertex_elements(cso, num_velements, velements);
 }
 
-/**
- * Set up for drawing interleaved arrays that all live in one VBO
- * or all live in user space.
- * \param vbuffer  returns vertex buffer info
- * \param velements  returns vertex element info
- */
-static void
-setup_interleaved_attribs(struct st_context *st,
-                          const struct st_vertex_program *vp,
-                          const struct gl_vertex_array *arrays,
-                          unsigned num_inputs)
-{
-   struct pipe_vertex_buffer vbuffer;
-   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS] = {{0}};
-   GLuint attr;
-   const GLubyte *low_addr = NULL;
-   GLboolean usingVBO;      /* all arrays in a VBO? */
-   struct gl_buffer_object *bufobj;
-   GLsizei stride;
-
-   /* Find the lowest address of the arrays we're drawing,
-    * Init bufobj and stride.
-    */
-   if (num_inputs) {
-      const struct gl_vertex_array *array;
-      const struct gl_vertex_buffer_binding *binding;
-      const struct gl_array_attributes *attrib;
-
-      array = get_client_array(arrays, vp->index_to_input[0]);
-      assert(array);
-
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-
-      /* Since we're doing interleaved arrays, we know there'll be at most
-       * one buffer object and the stride will be the same for all arrays.
-       * Grab them now.
-       */
-      bufobj = binding->BufferObj;
-      stride = binding->Stride;
-
-      low_addr = _mesa_vertex_attrib_address(attrib, binding);
-
-      for (attr = 1; attr < num_inputs; attr++) {
-         const GLubyte *start;
-         array = get_client_array(arrays, vp->index_to_input[attr]);
-         if (!array)
-            continue;
-         binding = array->BufferBinding;
-         attrib = array->VertexAttrib;
-         start = _mesa_vertex_attrib_address(attrib, binding);
-         low_addr = MIN2(low_addr, start);
-      }
-   }
-   else {
-      /* not sure we'll ever have zero inputs, but play it safe */
-      bufobj = NULL;
-      stride = 0;
-      low_addr = 0;
-   }
-
-   /* are the arrays in user space? */
-   usingVBO = _mesa_is_bufferobj(bufobj);
-
-   for (attr = 0; attr < num_inputs;) {
-      const struct gl_vertex_array *array;
-      const struct gl_vertex_buffer_binding *binding;
-      const struct gl_array_attributes *attrib;
-      const GLubyte *ptr;
-      unsigned src_offset;
-      unsigned src_format;
-
-      array = get_client_array(arrays, vp->index_to_input[attr]);
-      assert(array);
-
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-      ptr = _mesa_vertex_attrib_address(attrib, binding);
-
-      src_offset = (unsigned) (ptr - low_addr);
-
-      src_format = st_pipe_vertex_format(attrib);
-
-      init_velement_lowered(vp, velements, src_offset, src_format,
-                            binding->InstanceDivisor, 0,
-                            attrib->Size, attrib->Doubles, &attr);
-   }
-
-   /*
-    * Return the vbuffer info and setup user-space attrib info, if needed.
-    */
-   if (num_inputs == 0) {
-      /* just defensive coding here */
-      vbuffer.buffer.resource = NULL;
-      vbuffer.is_user_buffer = false;
-      vbuffer.buffer_offset = 0;
-      vbuffer.stride = 0;
-   }
-   else if (usingVBO) {
-      /* all interleaved arrays in a VBO */
-      struct st_buffer_object *stobj = st_buffer_object(bufobj);
-
-      if (!stobj || !stobj->buffer) {
-         st->vertex_array_out_of_memory = true;
-         return; /* out-of-memory error probably */
-      }
-
-      vbuffer.buffer.resource = stobj->buffer;
-      vbuffer.is_user_buffer = false;
-      vbuffer.buffer_offset = pointer_to_offset(low_addr);
-      vbuffer.stride = stride;
-   }
-   else {
-      /* all interleaved arrays in user memory */
-      vbuffer.buffer.user = low_addr;
-      vbuffer.is_user_buffer = !!low_addr; /* if NULL, then unbind */
-      vbuffer.buffer_offset = 0;
-      vbuffer.stride = stride;
-
-      if (low_addr)
-         st->draw_needs_minmax_index = true;
-   }
-
-   set_vertex_attribs(st, &vbuffer, num_inputs ? 1 : 0,
-                      velements, num_inputs);
-}
-
-/**
- * Set up a separate pipe_vertex_buffer and pipe_vertex_element for each
- * vertex attribute.
- * \param vbuffer  returns vertex buffer info
- * \param velements  returns vertex element info
- */
-static void
-setup_non_interleaved_attribs(struct st_context *st,
-                              const struct st_vertex_program *vp,
-                              const struct gl_vertex_array *arrays,
-                              unsigned num_inputs)
+void
+st_update_array(struct st_context *st)
 {
    struct gl_context *ctx = st->ctx;
+   /* vertex program validation must be done before this */
+   const struct st_vertex_program *vp = st->vp;
+   /* _NEW_PROGRAM, ST_NEW_VS_STATE */
+   const GLbitfield inputs_read = st->vp_variant->vert_attrib_mask;
+   const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
+   const ubyte *input_to_index = vp->input_to_index;
+
    struct pipe_vertex_buffer vbuffer[PIPE_MAX_ATTRIBS];
-   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS] = {{0}};
+   struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
    unsigned num_vbuffers = 0;
-   unsigned unref_buffers = 0;
-   GLuint attr;
 
-   for (attr = 0; attr < num_inputs;) {
-      const unsigned mesaAttr = vp->index_to_input[attr];
-      const struct gl_vertex_array *array;
-      const struct gl_vertex_buffer_binding *binding;
-      const struct gl_array_attributes *attrib;
-      struct gl_buffer_object *bufobj;
-      GLsizei stride;
-      unsigned src_format;
-      unsigned bufidx;
+   st->vertex_array_out_of_memory = FALSE;
+   st->draw_needs_minmax_index = false;
 
-      array = get_client_array(arrays, mesaAttr);
-      assert(array);
+   /* _NEW_PROGRAM */
+   /* ST_NEW_VERTEX_ARRAYS alias ctx->DriverFlags.NewArray */
+   /* Process attribute array data. */
+   GLbitfield mask = inputs_read & _mesa_draw_array_bits(ctx);
+   while (mask) {
+      /* The attribute index to start pulling a binding */
+      const gl_vert_attrib i = ffs(mask) - 1;
+      const struct gl_vertex_buffer_binding *const binding
+         = _mesa_draw_buffer_binding(vao, i);
+      const unsigned bufidx = num_vbuffers++;
 
-      bufidx = num_vbuffers++;
-
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-      stride = binding->Stride;
-      bufobj = binding->BufferObj;
-
-      if (_mesa_is_bufferobj(bufobj)) {
-         /* Attribute data is in a VBO.
-          * Recall that for VBOs, the gl_vertex_array->Ptr field is
-          * really an offset from the start of the VBO, not a pointer.
-          */
-         struct st_buffer_object *stobj = st_buffer_object(bufobj);
-
+      if (_mesa_is_bufferobj(binding->BufferObj)) {
+         struct st_buffer_object *stobj = st_buffer_object(binding->BufferObj);
          if (!stobj || !stobj->buffer) {
             st->vertex_array_out_of_memory = true;
             return; /* out-of-memory error probably */
          }
 
+         /* Set the binding */
          vbuffer[bufidx].buffer.resource = stobj->buffer;
          vbuffer[bufidx].is_user_buffer = false;
-         vbuffer[bufidx].buffer_offset =
-            binding->Offset + attrib->RelativeOffset;
+         vbuffer[bufidx].buffer_offset = _mesa_draw_binding_offset(binding);
+      } else {
+         /* Set the binding */
+         const void *ptr = (const void *)_mesa_draw_binding_offset(binding);
+         vbuffer[bufidx].buffer.user = ptr;
+         vbuffer[bufidx].is_user_buffer = true;
+         vbuffer[bufidx].buffer_offset = 0;
+
+         if (!binding->InstanceDivisor)
+            st->draw_needs_minmax_index = true;
       }
-      else {
-         if (stride == 0) {
-            unsigned size = attrib->_ElementSize;
-            /* This is optimal for GPU cache line usage if the upload size
-             * is <= cache line size.
-             */
-            unsigned alignment = util_next_power_of_two(size);
+      vbuffer[bufidx].stride = binding->Stride; /* in bytes */
 
-            assert(attrib->Ptr);
-            vbuffer[bufidx].buffer.user = attrib->Ptr;
-            void *ptr = attrib->Ptr ? (void*)attrib->Ptr :
-                                      (void*)ctx->Current.Attrib[mesaAttr];
+      const GLbitfield boundmask = _mesa_draw_bound_attrib_bits(binding);
+      GLbitfield attrmask = mask & boundmask;
+      /* Mark those attributes as processed */
+      mask &= ~boundmask;
+      /* We can assume that we have an array for the binding */
+      assert(attrmask);
+      /* Walk attributes belonging to the binding */
+      while (attrmask) {
+         const gl_vert_attrib attr = u_bit_scan(&attrmask);
+         const struct gl_array_attributes *const attrib
+            = _mesa_draw_array_attrib(vao, attr);
+         const GLuint off = _mesa_draw_attributes_relative_offset(attrib);
+         init_velement_lowered(vp, velements, attrib, off,
+                               binding->InstanceDivisor, bufidx,
+                               input_to_index[attr]);
+      }
+   }
 
-            vbuffer[bufidx].is_user_buffer = false;
-            vbuffer[bufidx].buffer.resource = NULL;
+   const unsigned first_current_vbuffer = num_vbuffers;
+   /* _NEW_PROGRAM | _NEW_CURRENT_ATTRIB */
+   /* Process values that would have been better as uniforms in the application */
+   GLbitfield curmask = inputs_read & _mesa_draw_current_bits(ctx);
+   if (curmask) {
+      /* For each attribute, upload the maximum possible size. */
+      GLubyte data[VERT_ATTRIB_MAX * sizeof(GLdouble) * 4];
+      GLubyte *cursor = data;
+      const unsigned bufidx = num_vbuffers++;
+      unsigned max_alignment = 1;
 
-            /* Use const_uploader for zero-stride vertex attributes, because
-             * it may use a better memory placement than stream_uploader.
-             * The reason is that zero-stride attributes can be fetched many
-             * times (thousands of times), so a better placement is going to
-             * perform better.
-             *
-             * Upload the maximum possible size, which is 4x GLdouble = 32.
-             */
-            u_upload_data(st->can_bind_const_buffer_as_vertex ?
-                             st->pipe->const_uploader :
-                             st->pipe->stream_uploader,
-                          0, size, alignment, ptr,
-                          &vbuffer[bufidx].buffer_offset,
-                          &vbuffer[bufidx].buffer.resource);
-            unref_buffers |= 1u << bufidx;
-         } else {
-            assert(attrib->Ptr);
-            vbuffer[bufidx].buffer.user = attrib->Ptr;
-            vbuffer[bufidx].is_user_buffer = true;
-            vbuffer[bufidx].buffer_offset = 0;
+      while (curmask) {
+         const gl_vert_attrib attr = u_bit_scan(&curmask);
+         const struct gl_array_attributes *const attrib
+            = _mesa_draw_current_attrib(ctx, attr);
+         const unsigned size = attrib->_ElementSize;
+         const unsigned alignment = util_next_power_of_two(size);
+         max_alignment = MAX2(max_alignment, alignment);
+         memcpy(cursor, attrib->Ptr, size);
+         if (alignment != size)
+            memset(cursor + size, 0, alignment - size);
 
-            if (!binding->InstanceDivisor)
-               st->draw_needs_minmax_index = true;
-         }
+         init_velement_lowered(vp, velements, attrib, cursor - data, 0,
+                               bufidx, input_to_index[attr]);
+
+         cursor += alignment;
       }
 
-      /* common-case setup */
-      vbuffer[bufidx].stride = stride; /* in bytes */
+      vbuffer[bufidx].is_user_buffer = false;
+      vbuffer[bufidx].buffer.resource = NULL;
+      /* vbuffer[bufidx].buffer_offset is set below */
+      vbuffer[bufidx].stride = 0;
 
-      src_format = st_pipe_vertex_format(attrib);
-
-      init_velement_lowered(vp, velements, 0, src_format,
-                            binding->InstanceDivisor, bufidx,
-                            attrib->Size, attrib->Doubles, &attr);
+      /* Use const_uploader for zero-stride vertex attributes, because
+       * it may use a better memory placement than stream_uploader.
+       * The reason is that zero-stride attributes can be fetched many
+       * times (thousands of times), so a better placement is going to
+       * perform better.
+       */
+      u_upload_data(st->can_bind_const_buffer_as_vertex ?
+                    st->pipe->const_uploader :
+                    st->pipe->stream_uploader,
+                    0, cursor - data, max_alignment, data,
+                    &vbuffer[bufidx].buffer_offset,
+                    &vbuffer[bufidx].buffer.resource);
    }
 
    if (!ctx->Const.AllowMappedBuffersDuringExecution) {
       u_upload_unmap(st->pipe->stream_uploader);
    }
 
+   const unsigned num_inputs = st->vp_variant->num_inputs;
    set_vertex_attribs(st, vbuffer, num_vbuffers, velements, num_inputs);
 
    /* Unreference uploaded zero-stride vertex buffers. */
-   while (unref_buffers) {
-      unsigned i = u_bit_scan(&unref_buffers);
+   for (unsigned i = first_current_vbuffer; i < num_vbuffers; ++i) {
       pipe_resource_reference(&vbuffer[i].buffer.resource, NULL);
    }
 }
-
-void st_update_array(struct st_context *st)
-{
-   struct gl_context *ctx = st->ctx;
-   const struct gl_vertex_array *arrays = ctx->Array._DrawArrays;
-   const struct st_vertex_program *vp;
-   unsigned num_inputs;
-
-   st->vertex_array_out_of_memory = FALSE;
-   st->draw_needs_minmax_index = false;
-
-   /* No drawing has been done yet, so do nothing. */
-   if (!arrays)
-      return;
-
-   /* vertex program validation must be done before this */
-   vp = st->vp;
-   num_inputs = st->vp_variant->num_inputs;
-
-   if (is_interleaved_arrays(vp, arrays, num_inputs))
-      setup_interleaved_attribs(st, vp, arrays, num_inputs);
-   else
-      setup_non_interleaved_attribs(st, vp, arrays, num_inputs);
-}
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 6455e61..fa147b8 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -92,7 +92,7 @@
    /* update constants */
    if (params && params->NumParameters) {
       struct pipe_constant_buffer cb;
-      const uint paramBytes = params->NumParameters * sizeof(GLfloat) * 4;
+      const uint paramBytes = params->NumParameterValues * sizeof(GLfloat);
 
       /* Update the constants which come from fixed-function state, such as
        * transformation matrices, fog factors, etc.  The rest of the values in
diff --git a/src/mesa/state_tracker/st_atom_depth.c b/src/mesa/state_tracker/st_atom_depth.c
index 6ddb8f5..9e12361 100644
--- a/src/mesa/state_tracker/st_atom_depth.c
+++ b/src/mesa/state_tracker/st_atom_depth.c
@@ -107,8 +107,9 @@
    if (ctx->DrawBuffer->Visual.depthBits > 0) {
       if (ctx->Depth.Test) {
          dsa->depth.enabled = 1;
-         dsa->depth.writemask = ctx->Depth.Mask;
          dsa->depth.func = st_compare_func_to_pipe(ctx->Depth.Func);
+         if (dsa->depth.func != PIPE_FUNC_EQUAL)
+            dsa->depth.writemask = ctx->Depth.Mask;
       }
       if (ctx->Depth.BoundsTest) {
          dsa->depth.bounds_test = 1;
diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index 3ef3ff3..807c312 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -95,7 +95,7 @@
        * drivers callback must be adjusted for this.
        */
       if (screen->is_format_supported(screen, PIPE_FORMAT_NONE,
-                                      PIPE_TEXTURE_2D, msaa_mode,
+                                      PIPE_TEXTURE_2D, msaa_mode, msaa_mode,
                                       PIPE_BIND_RENDER_TARGET))
          quantized_samples = msaa_mode;
    }
diff --git a/src/mesa/state_tracker/st_atom_list.h b/src/mesa/state_tracker/st_atom_list.h
index 5391d47..e1aebc9 100644
--- a/src/mesa/state_tracker/st_atom_list.h
+++ b/src/mesa/state_tracker/st_atom_list.h
@@ -34,7 +34,7 @@
 ST_STATE(ST_NEW_FB_STATE, st_update_framebuffer_state) /* depends on update_*_texture and bind_*_images */
 ST_STATE(ST_NEW_BLEND, st_update_blend) /* depends on update_framebuffer_state */
 ST_STATE(ST_NEW_RASTERIZER, st_update_rasterizer) /* depends on update_framebuffer_state */
-ST_STATE(ST_NEW_SAMPLE_MASK, st_update_sample_mask) /* depends on update_framebuffer_state */
+ST_STATE(ST_NEW_SAMPLE_STATE, st_update_sample_state) /* depends on update_framebuffer_state */
 ST_STATE(ST_NEW_SAMPLE_SHADING, st_update_sample_shading)
 ST_STATE(ST_NEW_SCISSOR, st_update_scissor) /* depends on update_framebuffer_state */
 ST_STATE(ST_NEW_VIEWPORT, st_update_viewport) /* depends on update_framebuffer_state */
diff --git a/src/mesa/state_tracker/st_atom_msaa.c b/src/mesa/state_tracker/st_atom_msaa.c
index 556c7c5..c6affec 100644
--- a/src/mesa/state_tracker/st_atom_msaa.c
+++ b/src/mesa/state_tracker/st_atom_msaa.c
@@ -33,13 +33,84 @@
 #include "st_program.h"
 
 #include "cso_cache/cso_context.h"
+#include "util/u_framebuffer.h"
 #include "main/framebuffer.h"
 
 
-/* Update the sample mask for MSAA.
+/**
+ * Update the sample locations
+ */
+static void
+update_sample_locations(struct st_context *st)
+{
+   struct gl_framebuffer *fb = st->ctx->DrawBuffer;
+
+   if (!st->ctx->Extensions.ARB_sample_locations)
+      return;
+
+   if (fb->ProgrammableSampleLocations) {
+      unsigned grid_width, grid_height, size, pixel, sample_index;
+      unsigned samples = st->state.fb_num_samples;
+      bool sample_location_pixel_grid = fb->SampleLocationPixelGrid;
+      uint8_t locations[
+         PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE *
+         PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32];
+
+      st->pipe->screen->get_sample_pixel_grid(
+         st->pipe->screen, samples, &grid_width, &grid_height);
+      size = grid_width * grid_height * samples;
+
+      /**
+       * when a dimension is greater than MAX_SAMPLE_LOCATION_GRID_SIZE,
+       * ctx->Driver.GetProgrammableSampleCaps() returns 1 for both dimensions.
+       */
+      if (grid_width > MAX_SAMPLE_LOCATION_GRID_SIZE ||
+          grid_height > MAX_SAMPLE_LOCATION_GRID_SIZE)
+         sample_location_pixel_grid = false;
+
+      for (pixel = 0; pixel < grid_width * grid_height; pixel++) {
+         for (sample_index = 0; sample_index < samples; sample_index++) {
+            int table_index = sample_index;
+            float x = 0.5f, y = 0.5f;
+            uint8_t loc;
+            if (sample_location_pixel_grid)
+               table_index = pixel * samples + sample_index;
+            if (fb->SampleLocationTable) {
+               x = fb->SampleLocationTable[table_index*2];
+               y = fb->SampleLocationTable[table_index*2+1];
+            }
+            if (st->state.fb_orientation == Y_0_BOTTOM)
+               y = 1.0 - y;
+
+            loc = roundf(CLAMP(x * 16.0f, 0.0f, 15.0f));
+            loc |= (int)roundf(CLAMP(y * 16.0f, 0.0f, 15.0f)) << 4;
+            locations[pixel * samples + sample_index] = loc;
+         }
+      }
+
+      util_sample_locations_flip_y(
+         st->pipe->screen, st->state.fb_height, samples, locations);
+
+      if (!st->state.enable_sample_locations ||
+          st->state.sample_locations_samples != samples ||
+          memcmp(locations, st->state.sample_locations, size) != 0) {
+         st->pipe->set_sample_locations( st->pipe, size, locations);
+         
+         st->state.sample_locations_samples = samples;
+         memcpy(st->state.sample_locations, locations, size);
+      }
+   } else if (st->state.enable_sample_locations) {
+      st->pipe->set_sample_locations(st->pipe, 0, NULL);
+   }
+
+   st->state.enable_sample_locations = fb->ProgrammableSampleLocations;
+}
+
+
+/* Update the sample mask and locations for MSAA.
  */
 void
-st_update_sample_mask(struct st_context *st)
+st_update_sample_state(struct st_context *st)
 {
    unsigned sample_mask = 0xffffffff;
    unsigned sample_count = st->state.fb_num_samples;
@@ -64,6 +135,8 @@
    }
 
    cso_set_sample_mask(st->cso_context, sample_mask);
+
+   update_sample_locations(st);
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index 1be072e..0383b8a 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -298,5 +298,20 @@
    raster->clip_plane_enable = ctx->Transform.ClipPlanesEnabled;
    raster->clip_halfz = (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE);
 
+   /* ST_NEW_RASTERIZER */
+   if (ctx->ConservativeRasterization) {
+      if (ctx->ConservativeRasterMode == GL_CONSERVATIVE_RASTER_MODE_POST_SNAP_NV)
+         raster->conservative_raster_mode = PIPE_CONSERVATIVE_RASTER_POST_SNAP;
+      else
+         raster->conservative_raster_mode = PIPE_CONSERVATIVE_RASTER_PRE_SNAP;
+   } else {
+      raster->conservative_raster_mode = PIPE_CONSERVATIVE_RASTER_OFF;
+   }
+
+   raster->conservative_raster_dilate = ctx->ConservativeRasterDilate;
+
+   raster->subpixel_precision_x = ctx->SubpixelPrecisionBias[0];
+   raster->subpixel_precision_y = ctx->SubpixelPrecisionBias[1];
+
    cso_set_rasterizer(st->cso_context, raster);
 }
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index b25ae5f..babb001 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -564,17 +564,17 @@
 
    /* find a usable texture format */
    if (screen->is_format_supported(screen, PIPE_FORMAT_I8_UNORM,
-                                   st->internal_target, 0,
+                                   st->internal_target, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_I8_UNORM;
    }
    else if (screen->is_format_supported(screen, PIPE_FORMAT_A8_UNORM,
-                                        st->internal_target, 0,
+                                        st->internal_target, 0, 0,
                                         PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_A8_UNORM;
    }
    else if (screen->is_format_supported(screen, PIPE_FORMAT_L8_UNORM,
-                                        st->internal_target, 0,
+                                        st->internal_target, 0, 0,
                                         PIPE_BIND_SAMPLER_VIEW)) {
       st->bitmap.tex_format = PIPE_FORMAT_L8_UNORM;
    }
@@ -774,10 +774,7 @@
 
    u_upload_unmap(pipe->stream_uploader);
 
-   cso_set_vertex_buffers(st->cso_context,
-                          cso_get_aux_vertex_buffer_slot(st->cso_context),
-                          1, &vb);
-
+   cso_set_vertex_buffers(st->cso_context, 0, 1, &vb);
    cso_draw_arrays(st->cso_context, PIPE_PRIM_QUADS, 0, num_verts);
 
 out:
diff --git a/src/mesa/state_tracker/st_cb_copyimage.c b/src/mesa/state_tracker/st_cb_copyimage.c
index d160c8c..6d3eda0 100644
--- a/src/mesa/state_tracker/st_cb_copyimage.c
+++ b/src/mesa/state_tracker/st_cb_copyimage.c
@@ -360,7 +360,7 @@
 
 static struct pipe_resource *
 create_texture(struct pipe_screen *screen, enum pipe_format format,
-               unsigned nr_samples,
+               unsigned nr_samples, unsigned nr_storage_samples,
                unsigned width, unsigned height, unsigned depth)
 {
    struct pipe_resource templ;
@@ -372,6 +372,7 @@
    templ.depth0 = 1;
    templ.array_size = depth;
    templ.nr_samples = nr_samples;
+   templ.nr_storage_samples = nr_storage_samples;
    templ.usage = PIPE_USAGE_DEFAULT;
    templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
 
@@ -443,7 +444,7 @@
        * then proceed the generic swizzled_copy.
        */
       temp = create_texture(pipe->screen, canon_format, src->nr_samples,
-                            src_box->width,
+                            src->nr_storage_samples, src_box->width,
                             src_box->height, src_box->depth);
 
       u_box_3d(0, 0, 0, src_box->width, src_box->height, src_box->depth,
@@ -468,7 +469,7 @@
       /* Use the temporary texture. First, use the generic copy, but use
        * a canonical format in the destination. Then convert */
       temp = create_texture(pipe->screen, canon_format, dst->nr_samples,
-                            src_box->width,
+                            dst->nr_storage_samples, src_box->width,
                             src_box->height, src_box->depth);
 
       u_box_3d(0, 0, 0, src_box->width, src_box->height, src_box->depth,
@@ -532,7 +533,6 @@
                  src_box);
 }
 
-/* Note, the only allowable compressed format for this function is ETC */
 static void
 fallback_copy_image(struct st_context *st,
                     struct gl_texture_image *dst_image,
@@ -551,19 +551,25 @@
    bool dst_is_compressed = dst_image && _mesa_is_format_compressed(dst_image->TexFormat);
    bool src_is_compressed = src_image && _mesa_is_format_compressed(src_image->TexFormat);
 
+   unsigned dst_blk_w = 1, dst_blk_h = 1, src_blk_w = 1, src_blk_h = 1;
+   if (dst_image)
+      _mesa_get_format_block_size(dst_image->TexFormat, &dst_blk_w, &dst_blk_h);
+   if (src_image)
+      _mesa_get_format_block_size(src_image->TexFormat, &src_blk_w, &src_blk_h);
+
    unsigned dst_w = src_w;
    unsigned dst_h = src_h;
    unsigned lines = src_h;
 
    if (src_is_compressed && !dst_is_compressed) {
-      dst_w = DIV_ROUND_UP(dst_w, 4);
-      dst_h = DIV_ROUND_UP(dst_h, 4);
+      dst_w = DIV_ROUND_UP(dst_w, src_blk_w);
+      dst_h = DIV_ROUND_UP(dst_h, src_blk_h);
    } else if (!src_is_compressed && dst_is_compressed) {
-      dst_w *= 4;
-      dst_h *= 4;
+      dst_w *= dst_blk_w;
+      dst_h *= dst_blk_h;
    }
    if (src_is_compressed) {
-      lines = DIV_ROUND_UP(lines, 4);
+      lines = DIV_ROUND_UP(lines, src_blk_h);
    }
 
    if (src_image)
@@ -668,8 +674,8 @@
 
    u_box_2d_zslice(src_x, src_y, src_z, src_width, src_height, &box);
 
-   if ((src_image && st_etc_fallback(st, src_image)) ||
-       (dst_image && st_etc_fallback(st, dst_image))) {
+   if ((src_image && st_compressed_format_fallback(st, src_image->TexFormat)) ||
+       (dst_image && st_compressed_format_fallback(st, dst_image->TexFormat))) {
       fallback_copy_image(st, dst_image, dst_res, dst_x, dst_y, orig_dst_z,
                           src_image, src_res, src_x, src_y, orig_src_z,
                           src_width, src_height);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 2bffe42..a05e264 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -1479,10 +1479,12 @@
          if (screen->is_format_supported(screen, blit.src.format,
                                          blit.src.resource->target,
                                          blit.src.resource->nr_samples,
+                                         blit.src.resource->nr_storage_samples,
                                          PIPE_BIND_SAMPLER_VIEW) &&
              screen->is_format_supported(screen, blit.dst.format,
                                          blit.dst.resource->target,
                                          blit.dst.resource->nr_samples,
+                                         blit.dst.resource->nr_storage_samples,
                                          PIPE_BIND_RENDER_TARGET)) {
             pipe->blit(pipe, &blit);
             return GL_TRUE;
@@ -1582,7 +1584,7 @@
       (type == GL_COLOR ? PIPE_BIND_RENDER_TARGET : PIPE_BIND_DEPTH_STENCIL);
 
    if (!screen->is_format_supported(screen, srcFormat, st->internal_target, 0,
-                                    srcBind)) {
+                                    0, srcBind)) {
       /* srcFormat is non-renderable. Find a compatible renderable format. */
       if (type == GL_DEPTH) {
          srcFormat = st_choose_format(st, GL_DEPTH_COMPONENT, GL_NONE,
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index f18925e..b6bf71d 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -276,8 +276,7 @@
       cso_set_viewport(cso, &vp);
    }
 
-   util_draw_vertex_buffer(pipe, cso, vbuffer,
-			   cso_get_aux_vertex_buffer_slot(cso),
+   util_draw_vertex_buffer(pipe, cso, vbuffer, 0,
                            offset,  /* offset */
                            PIPE_PRIM_TRIANGLE_FAN,
                            4,  /* verts */
diff --git a/src/mesa/state_tracker/st_cb_eglimage.c b/src/mesa/state_tracker/st_cb_eglimage.c
index 0db2efc..d6b93c3 100644
--- a/src/mesa/state_tracker/st_cb_eglimage.c
+++ b/src/mesa/state_tracker/st_cb_eglimage.c
@@ -41,10 +41,12 @@
 
 static bool
 is_format_supported(struct pipe_screen *screen, enum pipe_format format,
-                    unsigned nr_samples, unsigned usage)
+                    unsigned nr_samples, unsigned nr_storage_samples,
+                    unsigned usage)
 {
    bool supported = screen->is_format_supported(screen, format, PIPE_TEXTURE_2D,
-                                                nr_samples, usage);
+                                                nr_samples, nr_storage_samples,
+                                                usage);
 
    /* for sampling, some formats can be emulated.. it doesn't matter that
     * the surface will have a format that the driver can't cope with because
@@ -55,14 +57,14 @@
       if (format == PIPE_FORMAT_IYUV) {
          supported = screen->is_format_supported(screen, PIPE_FORMAT_R8_UNORM,
                                                  PIPE_TEXTURE_2D, nr_samples,
-                                                 usage);
+                                                 nr_storage_samples, usage);
       } else if (format == PIPE_FORMAT_NV12) {
          supported = screen->is_format_supported(screen, PIPE_FORMAT_R8_UNORM,
                                                  PIPE_TEXTURE_2D, nr_samples,
-                                                 usage) &&
+                                                 nr_storage_samples, usage) &&
                      screen->is_format_supported(screen, PIPE_FORMAT_R8G8_UNORM,
                                                  PIPE_TEXTURE_2D, nr_samples,
-                                                 usage);
+                                                 nr_storage_samples, usage);
       }
    }
 
@@ -91,7 +93,8 @@
       return false;
    }
 
-   if (!is_format_supported(screen, out->format, out->texture->nr_samples, usage)) {
+   if (!is_format_supported(screen, out->format, out->texture->nr_samples,
+                            out->texture->nr_storage_samples, usage)) {
       /* unable to specify a texture object using the specified EGL image */
       pipe_resource_reference(&out->texture, NULL);
       _mesa_error(ctx, GL_INVALID_OPERATION, "%s(format not supported)", error);
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 696a08f..73414fd 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -204,6 +204,8 @@
    templ.depth0 = 1;
    templ.array_size = 1;
    templ.nr_samples = rb->NumSamples;
+   templ.nr_storage_samples = rb->NumSamples;
+
    if (util_format_is_depth_or_stencil(format)) {
       templ.bind = PIPE_BIND_DEPTH_STENCIL;
    }
@@ -614,8 +616,10 @@
    }
 
    valid = screen->is_format_supported(screen, format,
-                                      PIPE_TEXTURE_2D,
-                                      stObj->pt->nr_samples, bindings);
+                                       PIPE_TEXTURE_2D,
+                                       stObj->pt->nr_samples,
+                                       stObj->pt->nr_storage_samples,
+                                       bindings);
    if (!valid) {
       st_fbo_invalid("Invalid format");
    }
@@ -714,13 +718,11 @@
  * created FBOs.
  */
 static void
-st_DrawBuffer(struct gl_context *ctx, GLenum buffer)
+st_DrawBufferAllocate(struct gl_context *ctx)
 {
    struct st_context *st = st_context(ctx);
    struct gl_framebuffer *fb = ctx->DrawBuffer;
 
-   (void) buffer;
-
    if (_mesa_is_winsys_fbo(fb)) {
       GLuint i;
       /* add the renderbuffers on demand */
@@ -736,8 +738,8 @@
 
 
 /**
- * Called via glReadBuffer.  As with st_DrawBuffer, we use this function
- * to check if we need to allocate a renderbuffer on demand.
+ * Called via glReadBuffer.  As with st_DrawBufferAllocate, we use this
+ * function to check if we need to allocate a renderbuffer on demand.
  */
 static void
 st_ReadBuffer(struct gl_context *ctx, GLenum buffer)
@@ -772,7 +774,8 @@
                    struct gl_renderbuffer *rb,
                    GLuint x, GLuint y, GLuint w, GLuint h,
                    GLbitfield mode,
-                   GLubyte **mapOut, GLint *rowStrideOut)
+                   GLubyte **mapOut, GLint *rowStrideOut,
+                   bool flip_y)
 {
    struct st_context *st = st_context(ctx);
    struct st_renderbuffer *strb = st_renderbuffer(rb);
@@ -781,6 +784,9 @@
    GLuint y2;
    GLubyte *map;
 
+   /* driver does not support GL_FRAMEBUFFER_FLIP_Y_MESA */
+   assert((rb->Name == 0) == flip_y);
+
    if (strb->software) {
       /* software-allocated renderbuffer (probably an accum buffer) */
       if (strb->data) {
@@ -857,6 +863,19 @@
 }
 
 
+/**
+ * Called via ctx->Driver.EvaluateDepthValues.
+ */
+static void
+st_EvaluateDepthValues(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+
+   st_validate_state(st, ST_PIPELINE_UPDATE_FRAMEBUFFER);
+
+   st->pipe->evaluate_depth_buffer(st->pipe);
+}
+
 
 void
 st_init_fbo_functions(struct dd_function_table *functions)
@@ -868,9 +887,10 @@
    functions->FinishRenderTexture = st_finish_render_texture;
    functions->ValidateFramebuffer = st_validate_framebuffer;
 
-   functions->DrawBuffer = st_DrawBuffer;
+   functions->DrawBufferAllocate = st_DrawBufferAllocate;
    functions->ReadBuffer = st_ReadBuffer;
 
    functions->MapRenderbuffer = st_MapRenderbuffer;
    functions->UnmapRenderbuffer = st_UnmapRenderbuffer;
+   functions->EvaluateDepthValues = st_EvaluateDepthValues;
 }
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index b7a082f..6e48be6 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -273,34 +273,6 @@
 
 
 static void
-feedback_draw_vbo(struct gl_context *ctx,
-                  const struct _mesa_prim *prims,
-                  GLuint nr_prims,
-                  const struct _mesa_index_buffer *ib,
-                  GLboolean index_bounds_valid,
-                  GLuint min_index,
-                  GLuint max_index,
-                  struct gl_transform_feedback_object *tfb_vertcount,
-                  unsigned stream,
-                  struct gl_buffer_object *indirect)
-{
-   struct st_context *st = st_context(ctx);
-
-   /* The initial pushdown of the inputs array into the drivers */
-   _mesa_set_drawing_arrays(ctx, st->draw_arrays.inputs);
-   _vbo_update_inputs(ctx, &st->draw_arrays);
-
-   /* The above needs to happen outside of st_feedback_draw_vbo,
-    * since st_RasterPossets _DrawArrays and does not want that to be
-    * overwritten by _mesa_set_drawing_arrays.
-    */
-   st_feedback_draw_vbo(ctx, prims, nr_prims, ib, index_bounds_valid,
-                        min_index, max_index, tfb_vertcount,
-                        stream, indirect);
-}
-
-
-static void
 st_RenderMode(struct gl_context *ctx, GLenum newMode )
 {
    struct st_context *st = st_context(ctx);
@@ -318,7 +290,7 @@
          st->selection_stage = draw_glselect_stage(ctx, draw);
       draw_set_rasterize_stage(draw, st->selection_stage);
       /* Plug in new vbo draw function */
-      ctx->Driver.Draw = feedback_draw_vbo;
+      ctx->Driver.Draw = st_feedback_draw_vbo;
    }
    else {
       struct gl_program *vp = st->ctx->VertexProgram._Current;
@@ -327,7 +299,7 @@
          st->feedback_stage = draw_glfeedback_stage(ctx, draw);
       draw_set_rasterize_stage(draw, st->feedback_stage);
       /* Plug in new vbo draw function */
-      ctx->Driver.Draw = feedback_draw_vbo;
+      ctx->Driver.Draw = st_feedback_draw_vbo;
       /* need to generate/use a vertex program that emits pos/color/tex */
       if (vp)
          st->dirty |= ST_NEW_VERTEX_PROGRAM(st, st_vertex_program(vp));
diff --git a/src/mesa/state_tracker/st_cb_memoryobjects.c b/src/mesa/state_tracker/st_cb_memoryobjects.c
index 63a8c2a..39174bc 100644
--- a/src/mesa/state_tracker/st_cb_memoryobjects.c
+++ b/src/mesa/state_tracker/st_cb_memoryobjects.c
@@ -65,7 +65,7 @@
    struct pipe_screen *screen = pipe->screen;
    struct winsys_handle whandle;
 
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
    whandle.handle = fd;
    whandle.offset = 0;
    whandle.layer = 0;
diff --git a/src/mesa/state_tracker/st_cb_msaa.c b/src/mesa/state_tracker/st_cb_msaa.c
index 7f1b4fd..6c5dc1f 100644
--- a/src/mesa/state_tracker/st_cb_msaa.c
+++ b/src/mesa/state_tracker/st_cb_msaa.c
@@ -56,8 +56,35 @@
 }
 
 
+static void
+st_GetProgrammableSampleCaps(struct gl_context *ctx, const struct gl_framebuffer *fb,
+                             GLuint *outBits, GLuint *outWidth, GLuint *outHeight)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_screen *screen = st->pipe->screen;
+
+   st_validate_state(st, ST_PIPELINE_UPDATE_FRAMEBUFFER);
+
+   *outBits = 4;
+   *outWidth = 1;
+   *outHeight = 1;
+
+   if (ctx->Extensions.ARB_sample_locations)
+      screen->get_sample_pixel_grid(screen, st->state.fb_num_samples,
+                                    outWidth, outHeight);
+
+   /* We could handle this better in some circumstances,
+    * but it's not really an issue */
+   if (*outWidth > MAX_SAMPLE_LOCATION_GRID_SIZE ||
+       *outHeight > MAX_SAMPLE_LOCATION_GRID_SIZE) {
+      *outWidth = 1;
+      *outHeight = 1;
+   }
+}
+
 void
 st_init_msaa_functions(struct dd_function_table *functions)
 {
    functions->GetSamplePosition = st_GetSamplePosition;
+   functions->GetProgrammableSampleCaps = st_GetProgrammableSampleCaps;
 }
diff --git a/src/mesa/state_tracker/st_cb_rasterpos.c b/src/mesa/state_tracker/st_cb_rasterpos.c
index 4e5417b..13cc9a7 100644
--- a/src/mesa/state_tracker/st_cb_rasterpos.c
+++ b/src/mesa/state_tracker/st_cb_rasterpos.c
@@ -38,8 +38,11 @@
 
 #include "main/imports.h"
 #include "main/macros.h"
+#include "main/arrayobj.h"
 #include "main/feedback.h"
 #include "main/rastpos.h"
+#include "main/state.h"
+#include "main/varray.h"
 
 #include "st_context.h"
 #include "st_atom.h"
@@ -60,9 +63,7 @@
    struct gl_context *ctx;            /**< Rendering context */
 
    /* vertex attrib info we can setup once and re-use */
-   struct gl_vertex_buffer_binding binding;
-   struct gl_array_attributes attrib[VERT_ATTRIB_MAX];
-   struct gl_vertex_array array[VERT_ATTRIB_MAX];
+   struct gl_vertex_array_object *VAO;
    struct _mesa_prim prim;
 };
 
@@ -102,6 +103,8 @@
 static void
 rastpos_destroy(struct draw_stage *stage)
 {
+   struct rastpos_stage *rstage = (struct rastpos_stage*)stage;
+   _mesa_reference_vao(rstage->ctx, &rstage->VAO, NULL);
    free(stage);
 }
 
@@ -181,7 +184,6 @@
 new_draw_rastpos_stage(struct gl_context *ctx, struct draw_context *draw)
 {
    struct rastpos_stage *rs = ST_CALLOC_STRUCT(rastpos_stage);
-   GLuint i;
 
    rs->stage.draw = draw;
    rs->stage.next = NULL;
@@ -194,17 +196,11 @@
    rs->stage.destroy = rastpos_destroy;
    rs->ctx = ctx;
 
-   rs->binding.Stride = 0;
-   rs->binding.BufferObj = NULL;
-   for (i = 0; i < ARRAY_SIZE(rs->array); i++) {
-      rs->attrib[i].Size = 4;
-      rs->attrib[i].Type = GL_FLOAT;
-      rs->attrib[i].Format = GL_RGBA;
-      rs->attrib[i].Ptr = (GLubyte *) ctx->Current.Attrib[i];
-      rs->attrib[i].Normalized = GL_TRUE;
-      rs->array[i].BufferBinding = &rs->binding;
-      rs->array[i].VertexAttrib = &rs->attrib[i];
-   }
+   rs->VAO = _mesa_new_vao(ctx, ~((GLuint)0));
+   _mesa_vertex_attrib_binding(ctx, rs->VAO, VERT_ATTRIB_POS, 0);
+   _mesa_update_array_format(ctx, rs->VAO, VERT_ATTRIB_POS, 4, GL_FLOAT,
+                             GL_RGBA, GL_FALSE, GL_FALSE, GL_FALSE, 0);
+   _mesa_enable_vertex_array_attrib(ctx, rs->VAO, 0);
 
    rs->prim.mode = GL_POINTS;
    rs->prim.indexed = 0;
@@ -224,7 +220,6 @@
    struct st_context *st = st_context(ctx);
    struct draw_context *draw = st_get_draw_context(st);
    struct rastpos_stage *rs;
-   const struct gl_vertex_array *saved_arrays = ctx->Array._DrawArrays;
 
    if (!st->draw)
       return;
@@ -260,16 +255,13 @@
    /* All vertex attribs but position were previously initialized above.
     * Just plug in position pointer now.
     */
-   rs->attrib[0].Ptr = (GLubyte *) v;
+   rs->VAO->VertexAttrib[VERT_ATTRIB_POS].Ptr = (GLubyte *) v;
+   rs->VAO->NewArrays |= VERT_BIT_POS;
+   _mesa_set_draw_vao(ctx, rs->VAO, VERT_BIT_POS);
 
-   /* Draw the point.
-    *
-    * Don't set DriverFlags.NewArray.
-    * st_feedback_draw_vbo doesn't check for that flag. */
-   ctx->Array._DrawArrays = rs->array;
+   /* Draw the point. */
    st_feedback_draw_vbo(ctx, &rs->prim, 1, NULL, GL_TRUE, 0, 1,
                         NULL, 0, NULL);
-   ctx->Array._DrawArrays = saved_arrays;
 
    /* restore draw's rasterization stage depending on rendermode */
    if (ctx->RenderMode == GL_FEEDBACK) {
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 2918121..3e00890 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -112,7 +112,7 @@
    if (texture->nr_samples > 1)
       return false;
 
-   if (!screen->is_format_supported(screen, dst_format, PIPE_BUFFER, 0,
+   if (!screen->is_format_supported(screen, dst_format, PIPE_BUFFER, 0, 0,
                                     PIPE_BIND_SHADER_IMAGE))
       return false;
 
@@ -449,7 +449,7 @@
 
    if (!src_format ||
        !screen->is_format_supported(screen, src_format, src->target,
-                                    src->nr_samples,
+                                    src->nr_samples, src->nr_storage_samples,
                                     PIPE_BIND_SAMPLER_VIEW)) {
       goto fallback;
    }
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 40a1ce1..93b6b32 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -41,6 +41,7 @@
 #include "main/pbo.h"
 #include "main/pixeltransfer.h"
 #include "main/texcompress.h"
+#include "main/texcompress_astc.h"
 #include "main/texcompress_etc.h"
 #include "main/texgetimage.h"
 #include "main/teximage.h"
@@ -207,9 +208,9 @@
    stImage->transfer = NULL;
    stImage->num_transfers = 0;
 
-   if (stImage->etc_data) {
-      free(stImage->etc_data);
-      stImage->etc_data = NULL;
+   if (stImage->compressed_data) {
+      free(stImage->compressed_data);
+      stImage->compressed_data = NULL;
    }
 
    /* if the texture image is being deallocated, the structure of the
@@ -219,29 +220,38 @@
 }
 
 bool
-st_etc_fallback(struct st_context *st, struct gl_texture_image *texImage)
+st_compressed_format_fallback(struct st_context *st, mesa_format format)
 {
-   return (_mesa_is_format_etc2(texImage->TexFormat) && !st->has_etc2) ||
-          (texImage->TexFormat == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1);
+   if (format == MESA_FORMAT_ETC1_RGB8)
+      return !st->has_etc1;
+
+   if (_mesa_is_format_etc2(format))
+      return !st->has_etc2;
+
+   if (_mesa_is_format_astc_2d(format))
+      return !st->has_astc_2d_ldr;
+
+   return false;
 }
 
 static void
-etc_fallback_allocate(struct st_context *st, struct st_texture_image *stImage)
+compressed_tex_fallback_allocate(struct st_context *st,
+                                 struct st_texture_image *stImage)
 {
    struct gl_texture_image *texImage = &stImage->base;
 
-   if (!st_etc_fallback(st, texImage))
+   if (!st_compressed_format_fallback(st, texImage->TexFormat))
       return;
 
-   if (stImage->etc_data)
-      free(stImage->etc_data);
+   if (stImage->compressed_data)
+      free(stImage->compressed_data);
 
    unsigned data_size = _mesa_format_image_size(texImage->TexFormat,
                                                 texImage->Width2,
                                                 texImage->Height2,
                                                 texImage->Depth2);
 
-   stImage->etc_data =
+   stImage->compressed_data =
       malloc(data_size * _mesa_num_tex_faces(texImage->TexObject->Target));
 }
 
@@ -269,23 +279,29 @@
    map = st_texture_image_map(st, stImage, transfer_flags, x, y, slice, w, h, 1,
                               &transfer);
    if (map) {
-      if (st_etc_fallback(st, texImage)) {
-         /* ETC isn't supported by all gallium drivers, where it's represented
-          * by uncompressed formats. We store the compressed data (as it's
-          * needed for image copies in OES_copy_image), and decompress as
-          * necessary in Unmap.
+      if (st_compressed_format_fallback(st, texImage->TexFormat)) {
+         /* Some compressed formats don't have to be supported by drivers,
+          * and st/mesa transparently handles decompression on upload (Unmap),
+          * so that drivers don't see the compressed formats.
           *
-          * Note: all ETC1/ETC2 formats have 4x4 block sizes.
+          * We store the compressed data (it's needed for glGetCompressedTex-
+          * Image and image copies in OES_copy_image).
           */
          unsigned z = transfer->box.z;
          struct st_texture_image_transfer *itransfer = &stImage->transfer[z];
 
-         unsigned bytes = _mesa_get_format_bytes(texImage->TexFormat);
+         unsigned blk_w, blk_h;
+         _mesa_get_format_block_size(texImage->TexFormat, &blk_w, &blk_h);
+
+         unsigned y_blocks = DIV_ROUND_UP(texImage->Height2, blk_h);
          unsigned stride = *rowStrideOut = itransfer->temp_stride =
             _mesa_format_row_stride(texImage->TexFormat, texImage->Width2);
+         unsigned block_size = _mesa_get_format_bytes(texImage->TexFormat);
+
          *mapOut = itransfer->temp_data =
-            stImage->etc_data + ((x / 4) * bytes + (y / 4) * stride) +
-            z * stride * texImage->Height2 / 4;
+            stImage->compressed_data +
+            (z * y_blocks + (y / blk_h)) * stride +
+            (x / blk_w) * block_size;
          itransfer->map = map;
       }
       else {
@@ -310,8 +326,9 @@
    struct st_context *st = st_context(ctx);
    struct st_texture_image *stImage  = st_texture_image(texImage);
 
-   if (st_etc_fallback(st, texImage)) {
-      /* Decompress the ETC texture to the mapped one. */
+   if (st_compressed_format_fallback(st, texImage->TexFormat)) {
+      /* Decompress the compressed image on upload if the driver doesn't
+       * support the compressed format. */
       unsigned z = slice + stImage->base.Face;
       struct st_texture_image_transfer *itransfer = &stImage->transfer[z];
       struct pipe_transfer *transfer = itransfer->transfer;
@@ -324,12 +341,20 @@
                                        itransfer->temp_data,
                                        itransfer->temp_stride,
                                        transfer->box.width, transfer->box.height);
-         }
-         else {
+         } else if (_mesa_is_format_etc2(texImage->TexFormat)) {
+	    bool bgra = stImage->pt->format == PIPE_FORMAT_B8G8R8A8_SRGB;
             _mesa_unpack_etc2_format(itransfer->map, transfer->stride,
                                      itransfer->temp_data, itransfer->temp_stride,
                                      transfer->box.width, transfer->box.height,
+				     texImage->TexFormat,
+				     bgra);
+         } else if (_mesa_is_format_astc_2d(texImage->TexFormat)) {
+            _mesa_unpack_astc_2d_ldr(itransfer->map, transfer->stride,
+                                     itransfer->temp_data, itransfer->temp_stride,
+                                     transfer->box.width, transfer->box.height,
                                      texImage->TexFormat);
+         } else {
+            unreachable("unexpected format for a compressed format fallback");
          }
       }
 
@@ -357,13 +382,13 @@
    else
       bindings = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
 
-   if (screen->is_format_supported(screen, format, target, 0, bindings))
+   if (screen->is_format_supported(screen, format, target, 0, 0, bindings))
       return bindings;
    else {
       /* Try non-sRGB. */
       format = util_format_linear(format);
 
-      if (screen->is_format_supported(screen, format, target, 0, bindings))
+      if (screen->is_format_supported(screen, format, target, 0, 0, bindings))
          return bindings;
       else
          return PIPE_BIND_SAMPLER_VIEW;
@@ -623,7 +648,7 @@
 
    stObj->needs_validation = true;
 
-   etc_fallback_allocate(st, stImage);
+   compressed_tex_fallback_allocate(st, stImage);
 
    /* Look if the parent texture object has space for this image */
    if (stObj->pt &&
@@ -1318,13 +1343,13 @@
 
       if (dst_format != orig_dst_format &&
           !screen->is_format_supported(screen, dst_format, PIPE_TEXTURE_2D, 0,
-                                       PIPE_BIND_RENDER_TARGET)) {
+                                       0, PIPE_BIND_RENDER_TARGET)) {
          return false;
       }
    }
 
    if (!src_format ||
-       !screen->is_format_supported(screen, src_format, PIPE_BUFFER, 0,
+       !screen->is_format_supported(screen, src_format, PIPE_BUFFER, 0, 0,
                                     PIPE_BIND_SAMPLER_VIEW)) {
       return false;
    }
@@ -1400,6 +1425,7 @@
       dst_level = texImage->TexObject->MinLevel + texImage->Level;
 
    assert(!_mesa_is_format_etc2(texImage->TexFormat) &&
+          !_mesa_is_format_astc_2d(texImage->TexFormat) &&
           texImage->TexFormat != MESA_FORMAT_ETC1_RGB8);
 
    if (!dst)
@@ -1468,7 +1494,8 @@
 
    if (!dst_format ||
        !screen->is_format_supported(screen, dst_format, dst->target,
-                                    dst->nr_samples, bind)) {
+                                    dst->nr_samples, dst->nr_storage_samples,
+                                    bind)) {
       goto fallback;
    }
 
@@ -1684,10 +1711,8 @@
    if (!_mesa_is_bufferobj(ctx->Unpack.BufferObj))
       goto fallback;
 
-   if (st_etc_fallback(st, texImage)) {
-      /* ETC isn't supported and is represented by uncompressed formats. */
+   if (st_compressed_format_fallback(st, texImage->TexFormat))
       goto fallback;
-   }
 
    if (!dst) {
       goto fallback;
@@ -1714,13 +1739,14 @@
       goto fallback;
    }
 
-   if (!screen->is_format_supported(screen, copy_format, PIPE_BUFFER, 0,
+   if (!screen->is_format_supported(screen, copy_format, PIPE_BUFFER, 0, 0,
                                     PIPE_BIND_SAMPLER_VIEW)) {
       goto fallback;
    }
 
    if (!screen->is_format_supported(screen, copy_format, dst->target,
-                                    dst->nr_samples, PIPE_BIND_RENDER_TARGET)) {
+                                    dst->nr_samples, dst->nr_storage_samples,
+                                    PIPE_BIND_RENDER_TARGET)) {
       goto fallback;
    }
 
@@ -1862,6 +1888,7 @@
    boolean done = FALSE;
 
    assert(!_mesa_is_format_etc2(texImage->TexFormat) &&
+          !_mesa_is_format_astc_2d(texImage->TexFormat) &&
           texImage->TexFormat != MESA_FORMAT_ETC1_RGB8);
 
    st_flush_bitmap_cache(st);
@@ -1914,7 +1941,7 @@
 
    if (!src_format ||
        !screen->is_format_supported(screen, src_format, src->target,
-                                    src->nr_samples,
+                                    src->nr_samples, src->nr_storage_samples,
                                     PIPE_BIND_SAMPLER_VIEW)) {
       goto fallback;
    }
@@ -1954,6 +1981,23 @@
       case PIPE_FORMAT_RGTC1_UNORM:
       case PIPE_FORMAT_RGTC2_UNORM:
       case PIPE_FORMAT_ETC1_RGB8:
+      case PIPE_FORMAT_ETC2_RGB8:
+      case PIPE_FORMAT_ETC2_RGB8A1:
+      case PIPE_FORMAT_ETC2_RGBA8:
+      case PIPE_FORMAT_ASTC_4x4:
+      case PIPE_FORMAT_ASTC_5x4:
+      case PIPE_FORMAT_ASTC_5x5:
+      case PIPE_FORMAT_ASTC_6x5:
+      case PIPE_FORMAT_ASTC_6x6:
+      case PIPE_FORMAT_ASTC_8x5:
+      case PIPE_FORMAT_ASTC_8x6:
+      case PIPE_FORMAT_ASTC_8x8:
+      case PIPE_FORMAT_ASTC_10x5:
+      case PIPE_FORMAT_ASTC_10x6:
+      case PIPE_FORMAT_ASTC_10x8:
+      case PIPE_FORMAT_ASTC_10x10:
+      case PIPE_FORMAT_ASTC_12x10:
+      case PIPE_FORMAT_ASTC_12x12:
       case PIPE_FORMAT_BPTC_RGBA_UNORM:
          dst_glformat = GL_RGBA8;
          break;
@@ -1969,6 +2013,30 @@
             goto fallback;
          dst_glformat = GL_RGBA32F;
          break;
+      case PIPE_FORMAT_ETC2_R11_UNORM:
+         if (!screen->is_format_supported(screen, PIPE_FORMAT_R16_UNORM,
+                                          pipe_target, 0, 0, bind))
+            goto fallback;
+         dst_glformat = GL_R16;
+         break;
+      case PIPE_FORMAT_ETC2_R11_SNORM:
+         if (!screen->is_format_supported(screen, PIPE_FORMAT_R16_SNORM,
+                                          pipe_target, 0, 0, bind))
+            goto fallback;
+         dst_glformat = GL_R16_SNORM;
+         break;
+      case PIPE_FORMAT_ETC2_RG11_UNORM:
+         if (!screen->is_format_supported(screen, PIPE_FORMAT_R16G16_UNORM,
+                                          pipe_target, 0, 0, bind))
+            goto fallback;
+         dst_glformat = GL_RG16;
+         break;
+      case PIPE_FORMAT_ETC2_RG11_SNORM:
+         if (!screen->is_format_supported(screen, PIPE_FORMAT_R16G16_SNORM,
+                                          pipe_target, 0, 0, bind))
+            goto fallback;
+         dst_glformat = GL_RG16_SNORM;
+         break;
       default:
          assert(0);
          goto fallback;
@@ -2338,6 +2406,7 @@
    st_invalidate_readpix_cache(st);
 
    assert(!_mesa_is_format_etc2(texImage->TexFormat) &&
+          !_mesa_is_format_astc_2d(texImage->TexFormat) &&
           texImage->TexFormat != MESA_FORMAT_ETC1_RGB8);
 
    if (!strb || !strb->surface || !stImage->pt) {
@@ -2370,7 +2439,8 @@
 
    if (!dst_format ||
        !screen->is_format_supported(screen, dst_format, stImage->pt->target,
-                                    stImage->pt->nr_samples, bind)) {
+                                    stImage->pt->nr_samples,
+                                    stImage->pt->nr_storage_samples, bind)) {
       goto fallback;
    }
 
@@ -2708,7 +2778,7 @@
        (int) target, util_format_name(format), last_level);
 
    assert(format);
-   assert(screen->is_format_supported(screen, format, target, 0,
+   assert(screen->is_format_supported(screen, format, target, 0, 0,
                                       PIPE_BIND_SAMPLER_VIEW));
 
    memset(&pt, 0, sizeof(pt));
@@ -2724,6 +2794,7 @@
    /* only set this for OpenGL textures, not renderbuffers */
    pt.flags = PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY;
    pt.nr_samples = nr_samples;
+   pt.nr_storage_samples = nr_samples;
 
    newtex = screen->resource_from_memobj(screen, &pt, memObj->memory, offset);
 
@@ -2780,7 +2851,7 @@
 
       for (; num_samples <= ctx->Const.MaxSamples; num_samples++) {
          if (screen->is_format_supported(screen, fmt, ptarget,
-                                         num_samples,
+                                         num_samples, num_samples,
                                          PIPE_BIND_SAMPLER_VIEW)) {
             /* Update the sample count in gl_texture_image as well. */
             texImage->NumSamples = num_samples;
@@ -2834,7 +2905,7 @@
             st_texture_image(texObj->Image[face][level]);
          pipe_resource_reference(&stImage->pt, stObj->pt);
 
-         etc_fallback_allocate(st, stImage);
+         compressed_tex_fallback_allocate(st, stImage);
       }
    }
 
@@ -2889,6 +2960,7 @@
       pt.target = gl_target_to_pipe(target);
       pt.format = st_mesa_format_to_pipe_format(st, format);
       pt.nr_samples = numSamples;
+      pt.nr_storage_samples = numSamples;
 
       st_gl_texture_dims_to_pipe_dims(target,
                                       width, height, depth,
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index b0266be..34bfb84 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -172,7 +172,7 @@
    st->dirty |= ST_NEW_BLEND |
                 ST_NEW_DSA |
                 ST_NEW_FB_STATE |
-                ST_NEW_SAMPLE_MASK |
+                ST_NEW_SAMPLE_STATE |
                 ST_NEW_SAMPLE_SHADING |
                 ST_NEW_FS_STATE |
                 ST_NEW_POLY_STIPPLE |
@@ -323,9 +323,10 @@
    f->NewLogicOp = ST_NEW_BLEND;
    f->NewStencil = ST_NEW_DSA;
    f->NewMultisampleEnable = ST_NEW_BLEND | ST_NEW_RASTERIZER |
-                             ST_NEW_SAMPLE_MASK | ST_NEW_SAMPLE_SHADING;
+                             ST_NEW_SAMPLE_STATE | ST_NEW_SAMPLE_SHADING;
    f->NewSampleAlphaToXEnable = ST_NEW_BLEND;
-   f->NewSampleMask = ST_NEW_SAMPLE_MASK;
+   f->NewSampleMask = ST_NEW_SAMPLE_STATE;
+   f->NewSampleLocations = ST_NEW_SAMPLE_STATE;
    f->NewSampleShading = ST_NEW_SAMPLE_SHADING;
 
    /* This depends on what the gallium driver wants. */
@@ -344,6 +345,8 @@
    f->NewPolygonState = ST_NEW_RASTERIZER;
    f->NewPolygonStipple = ST_NEW_POLY_STIPPLE;
    f->NewViewport = ST_NEW_VIEWPORT;
+   f->NewNvConservativeRasterization = ST_NEW_RASTERIZER;
+   f->NewNvConservativeRasterizationParams = ST_NEW_RASTERIZER;
 }
 
 
@@ -393,26 +396,17 @@
    /* Setup vertex element info for 'struct st_util_vertex'.
     */
    {
-      const unsigned slot = cso_get_aux_vertex_buffer_slot(st->cso_context);
-
-      /* If this assertion ever fails all state tracker calls to
-       * cso_get_aux_vertex_buffer_slot() should be audited.  This
-       * particular call would have to be moved to just before each
-       * drawing call.
-       */
-      assert(slot == 0);
-
       STATIC_ASSERT(sizeof(struct st_util_vertex) == 9 * sizeof(float));
 
       memset(&st->util_velems, 0, sizeof(st->util_velems));
       st->util_velems[0].src_offset = 0;
-      st->util_velems[0].vertex_buffer_index = slot;
+      st->util_velems[0].vertex_buffer_index = 0;
       st->util_velems[0].src_format = PIPE_FORMAT_R32G32B32_FLOAT;
       st->util_velems[1].src_offset = 3 * sizeof(float);
-      st->util_velems[1].vertex_buffer_index = slot;
+      st->util_velems[1].vertex_buffer_index = 0;
       st->util_velems[1].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       st->util_velems[2].src_offset = 7 * sizeof(float);
-      st->util_velems[2].vertex_buffer_index = slot;
+      st->util_velems[2].vertex_buffer_index = 0;
       st->util_velems[2].src_format = PIPE_FORMAT_R32G32_FLOAT;
    }
 
@@ -439,11 +433,14 @@
       screen->get_param(screen, PIPE_CAP_SHADER_STENCIL_EXPORT);
    st->has_shader_model3 = screen->get_param(screen, PIPE_CAP_SM3);
    st->has_etc1 = screen->is_format_supported(screen, PIPE_FORMAT_ETC1_RGB8,
-                                              PIPE_TEXTURE_2D, 0,
+                                              PIPE_TEXTURE_2D, 0, 0,
                                               PIPE_BIND_SAMPLER_VIEW);
    st->has_etc2 = screen->is_format_supported(screen, PIPE_FORMAT_ETC2_RGB8,
-                                              PIPE_TEXTURE_2D, 0,
+                                              PIPE_TEXTURE_2D, 0, 0,
                                               PIPE_BIND_SAMPLER_VIEW);
+   st->has_astc_2d_ldr =
+      screen->is_format_supported(screen, PIPE_FORMAT_ASTC_4x4_SRGB,
+                                  PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW);
    st->prefer_blit_based_texture_transfer = screen->get_param(screen,
                               PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER);
    st->force_persample_in_shader =
@@ -550,9 +547,6 @@
    /* Initialize context's winsys buffers list */
    LIST_INITHEAD(&st->winsys_buffers);
 
-   /* Keep our list of gl_vertex_array inputs */
-   _vbo_init_inputs(&st->draw_arrays);
-
    return st;
 }
 
@@ -768,11 +762,15 @@
       screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
                                PIPE_SHADER_CAP_PREFERRED_IR);
    if (preferred_ir == PIPE_SHADER_IR_NIR) {
-      functions->ProgramBinarySerializeDriverBlob = st_serialise_nir_program;
+      functions->ShaderCacheSerializeDriverBlob =  st_serialise_nir_program;
+      functions->ProgramBinarySerializeDriverBlob =
+         st_serialise_nir_program_binary;
       functions->ProgramBinaryDeserializeDriverBlob =
          st_deserialise_nir_program;
    } else {
-      functions->ProgramBinarySerializeDriverBlob = st_serialise_tgsi_program;
+      functions->ShaderCacheSerializeDriverBlob =  st_serialise_tgsi_program;
+      functions->ProgramBinarySerializeDriverBlob =
+         st_serialise_tgsi_program_binary;
       functions->ProgramBinaryDeserializeDriverBlob =
          st_deserialise_tgsi_program;
    }
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 5125fc5..6b1b563 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -120,6 +120,7 @@
    boolean has_shader_model3;
    boolean has_etc1;
    boolean has_etc2;
+   boolean has_astc_2d_ldr;
    boolean prefer_blit_based_texture_transfer;
    boolean force_persample_in_shader;
    boolean has_shareable_shaders;
@@ -178,6 +179,12 @@
       GLuint poly_stipple[32];  /**< In OpenGL's bottom-to-top order */
 
       GLuint fb_orientation;
+
+      bool enable_sample_locations;
+      unsigned sample_locations_samples;
+      uint8_t sample_locations[
+         PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE *
+         PIPE_MAX_SAMPLE_LOCATION_GRID_SIZE * 32];
    } state;
 
    uint64_t dirty; /**< dirty states */
@@ -295,9 +302,6 @@
 
    /* Winsys buffers */
    struct list_head winsys_buffers;
-
-   /* For the initial pushdown, keep the list of vbo inputs. */
-   struct vbo_inputs draw_arrays;
 };
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 6243659..eb52d95 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -146,10 +146,6 @@
    unsigned i;
    unsigned start = 0;
 
-   /* The initial pushdown of the inputs array into the drivers */
-   _mesa_set_drawing_arrays(ctx, st->draw_arrays.inputs);
-   _vbo_update_inputs(ctx, &st->draw_arrays);
-
    prepare_draw(st, ctx);
 
    if (st->vertex_array_out_of_memory)
@@ -160,6 +156,7 @@
    info.vertices_per_patch = ctx->TessCtrlProgram.patch_vertices;
    info.indirect = NULL;
    info.count_from_stream_output = NULL;
+   info.restart_index = 0;
 
    if (ib) {
       struct gl_buffer_object *bufobj = ib->obj;
@@ -255,10 +252,6 @@
    struct pipe_draw_info info;
    struct pipe_draw_indirect_info indirect;
 
-   /* The initial pushdown of the inputs array into the drivers */
-   _mesa_set_drawing_arrays(ctx, st->draw_arrays.inputs);
-   _vbo_update_inputs(ctx, &st->draw_arrays);
-
    assert(stride);
    prepare_draw(st, ctx);
 
@@ -268,6 +261,7 @@
    memset(&indirect, 0, sizeof(indirect));
    util_draw_init_info(&info);
    info.start = 0; /* index offset / index size */
+   info.max_index = ~0u; /* so that u_vbuf can tell that it's unknown */
 
    if (ib) {
       struct gl_buffer_object *bufobj = ib->obj;
@@ -427,15 +421,7 @@
 
    u_upload_unmap(st->pipe->stream_uploader);
 
-   /* At the time of writing, cso_get_aux_vertex_buffer_slot() always returns
-    * zero.  If that ever changes we need to audit the calls to that function
-    * and make sure the slot number is used consistently everywhere.
-    */
-   assert(cso_get_aux_vertex_buffer_slot(st->cso_context) == 0);
-
-   cso_set_vertex_buffers(st->cso_context,
-                          cso_get_aux_vertex_buffer_slot(st->cso_context),
-                          1, &vb);
+   cso_set_vertex_buffers(st->cso_context, 0, 1, &vb);
 
    if (num_instances > 1) {
       cso_draw_arrays_instanced(st->cso_context, PIPE_PRIM_TRIANGLE_FAN, 0, 4,
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index c1ebcd9..5b897bd 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -38,7 +38,6 @@
 
 struct _mesa_index_buffer;
 struct _mesa_prim;
-struct gl_vertex_array;
 struct gl_context;
 struct st_context;
 
diff --git a/src/mesa/state_tracker/st_draw_feedback.c b/src/mesa/state_tracker/st_draw_feedback.c
index fa96b4e..eb05ac9 100644
--- a/src/mesa/state_tracker/st_draw_feedback.c
+++ b/src/mesa/state_tracker/st_draw_feedback.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 #include "main/imports.h"
+#include "main/arrayobj.h"
 #include "main/image.h"
 #include "main/macros.h"
 #include "main/varray.h"
@@ -131,9 +132,7 @@
    struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {NULL};
    struct pipe_transfer *ib_transfer = NULL;
-   const struct gl_vertex_array *arrays = ctx->Array._DrawArrays;
    GLuint attr, i;
-   const GLubyte *low_addr = NULL;
    const void *mapped_indices = NULL;
 
    if (!draw)
@@ -168,56 +167,28 @@
    draw_bind_vertex_shader(draw, st->vp_variant->draw_shader);
    set_feedback_vertex_format(ctx);
 
-   /* Find the lowest address of the arrays we're drawing */
-   if (vp->num_inputs) {
-      const struct gl_vertex_array *array;
-      const struct gl_vertex_buffer_binding *binding;
-      const struct gl_array_attributes *attrib;
-      array = &arrays[vp->index_to_input[0]];
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-
-      low_addr = _mesa_vertex_attrib_address(attrib, binding);
-
-      for (attr = 1; attr < vp->num_inputs; attr++) {
-         const GLubyte *start;
-         array = &arrays[vp->index_to_input[attr]];
-         binding = array->BufferBinding;
-         attrib = array->VertexAttrib;
-         start = _mesa_vertex_attrib_address(attrib, binding);
-         low_addr = MIN2(low_addr, start);
-      }
-   }
-
    /* loop over TGSI shader inputs to determine vertex buffer
     * and attribute info
     */
    for (attr = 0; attr < vp->num_inputs; attr++) {
       const GLuint mesaAttr = vp->index_to_input[attr];
-      const struct gl_vertex_array *array = &arrays[mesaAttr];
       const struct gl_vertex_buffer_binding *binding;
       const struct gl_array_attributes *attrib;
-      struct gl_buffer_object *bufobj;
       void *map;
 
-      binding = array->BufferBinding;
-      attrib = array->VertexAttrib;
-      bufobj = binding->BufferObj;
+      _mesa_draw_attrib_and_binding(ctx, mesaAttr, &attrib, &binding);
 
-      if (bufobj && bufobj->Name) {
-         /* Attribute data is in a VBO.
-          * Recall that for VBOs, the gl_vertex_array->Ptr field is
-          * really an offset from the start of the VBO, not a pointer.
-          */
-         struct st_buffer_object *stobj = st_buffer_object(bufobj);
+      if (_mesa_is_bufferobj(binding->BufferObj)) {
+         /* Attribute data is in a VBO. */
+         struct st_buffer_object *stobj = st_buffer_object(binding->BufferObj);
          assert(stobj->buffer);
 
          vbuffers[attr].buffer.resource = NULL;
          vbuffers[attr].is_user_buffer = false;
          pipe_resource_reference(&vbuffers[attr].buffer.resource, stobj->buffer);
-         vbuffers[attr].buffer_offset = pointer_to_offset(low_addr);
-         velements[attr].src_offset = binding->Offset
-            + attrib->RelativeOffset - pointer_to_offset(low_addr);
+         vbuffers[attr].buffer_offset = _mesa_draw_binding_offset(binding);
+         velements[attr].src_offset =
+            _mesa_draw_attributes_relative_offset(attrib);
 
          /* map the attrib buffer */
          map = pipe_buffer_map(pipe, vbuffers[attr].buffer.resource,
@@ -227,6 +198,7 @@
                                        vbuffers[attr].buffer.resource->width0);
       }
       else {
+         /* Attribute data is in a user space array. */
          vbuffers[attr].buffer.user = attrib->Ptr;
          vbuffers[attr].is_user_buffer = true;
          vbuffers[attr].buffer_offset = 0;
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0dc8adb..c6d9731 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -494,6 +494,16 @@
    c->UseSTD430AsDefaultPacking =
       screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF);
 
+   c->MaxSubpixelPrecisionBiasBits =
+      screen->get_param(screen, PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS);
+
+   c->ConservativeRasterDilateRange[0] =
+      screen->get_paramf(screen, PIPE_CAPF_MIN_CONSERVATIVE_RASTER_DILATE);
+   c->ConservativeRasterDilateRange[1] =
+      screen->get_paramf(screen, PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE);
+   c->ConservativeRasterDilateGranularity =
+      screen->get_paramf(screen, PIPE_CAPF_CONSERVATIVE_RASTER_DILATE_GRANULARITY);
+
    /* limit the max combined shader output resources to a driver limit */
    temp = screen->get_param(screen, PIPE_CAP_MAX_COMBINED_SHADER_OUTPUT_RESOURCES);
    if (temp > 0 && c->MaxCombinedShaderOutputResources > temp)
@@ -549,7 +559,7 @@
       /* Examine each format in the list. */
       for (j = 0; j < num_formats && mapping[i].format[j]; j++) {
          if (screen->is_format_supported(screen, mapping[i].format[j],
-                                         target, 0, bind_flags)) {
+                                         target, 0, 0, bind_flags)) {
             num_supported++;
          }
       }
@@ -582,7 +592,7 @@
    for (i = max_samples; i > 0; --i) {
       for (f = 0; f < num_formats; f++) {
          if (screen->is_format_supported(screen, formats[f],
-                                         PIPE_TEXTURE_2D, i, bind)) {
+                                         PIPE_TEXTURE_2D, i, i, bind)) {
             return i;
          }
       }
@@ -636,6 +646,7 @@
       { o(ARB_query_buffer_object),          PIPE_CAP_QUERY_BUFFER_OBJECT              },
       { o(ARB_robust_buffer_access_behavior), PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR   },
       { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
+      { o(ARB_sample_locations),             PIPE_CAP_PROGRAMMABLE_SAMPLE_LOCATIONS    },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
       { o(ARB_shader_ballot),                PIPE_CAP_TGSI_BALLOT                      },
       { o(ARB_shader_clock),                 PIPE_CAP_TGSI_CLOCK                       },
@@ -791,6 +802,11 @@
           PIPE_FORMAT_ASTC_12x10_SRGB,
           PIPE_FORMAT_ASTC_12x12_SRGB } },
 
+      /* ASTC software fallback support. */
+      { { o(KHR_texture_compression_astc_ldr) },
+        { PIPE_FORMAT_R8G8B8A8_UNORM,
+          PIPE_FORMAT_R8G8B8A8_SRGB } },
+
       { { o(EXT_texture_shared_exponent) },
         { PIPE_FORMAT_R9G9B9E5_FLOAT } },
 
@@ -800,7 +816,9 @@
       { { o(EXT_texture_sRGB),
           o(EXT_texture_sRGB_decode) },
         { PIPE_FORMAT_A8B8G8R8_SRGB,
-          PIPE_FORMAT_B8G8R8A8_SRGB },
+	  PIPE_FORMAT_B8G8R8A8_SRGB,
+	  PIPE_FORMAT_A8R8G8B8_SRGB,
+	  PIPE_FORMAT_R8G8B8A8_SRGB},
         GL_TRUE }, /* at least one format must be supported */
 
       { { o(EXT_texture_type_2_10_10_10_REV) },
@@ -890,7 +908,6 @@
    extensions->EXT_texture_env_dot3 = GL_TRUE;
 
    extensions->ATI_fragment_shader = GL_TRUE;
-   extensions->ATI_separate_stencil = GL_TRUE;
    extensions->ATI_texture_env_combine3 = GL_TRUE;
 
    extensions->MESA_pack_invert = GL_TRUE;
@@ -926,11 +943,17 @@
 
    /* Figure out GLSL support and set GLSLVersion to it. */
    consts->GLSLVersion = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
+   consts->GLSLVersionCompat =
+      screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY);
+
+   const unsigned GLSLVersion =
+      api == API_OPENGL_COMPAT ? consts->GLSLVersionCompat :
+                                 consts->GLSLVersion;
 
    _mesa_override_glsl_version(consts);
 
    if (options->force_glsl_version > 0 &&
-       options->force_glsl_version <= consts->GLSLVersion) {
+       options->force_glsl_version <= GLSLVersion) {
       consts->ForceGLSLVersion = options->force_glsl_version;
    }
 
@@ -944,24 +967,24 @@
 
    consts->AllowGLSLCrossStageInterpolationMismatch = options->allow_glsl_cross_stage_interpolation_mismatch;
 
-   if (consts->GLSLVersion >= 400)
+   if (GLSLVersion >= 400)
       extensions->ARB_gpu_shader5 = GL_TRUE;
-   if (consts->GLSLVersion >= 410)
+   if (GLSLVersion >= 410)
       extensions->ARB_shader_precision = GL_TRUE;
 
    /* This extension needs full OpenGL 3.2, but we don't know if that's
     * supported at this point. Only check the GLSL version. */
-   if (consts->GLSLVersion >= 150 &&
+   if (GLSLVersion >= 150 &&
        screen->get_param(screen, PIPE_CAP_TGSI_VS_LAYER_VIEWPORT)) {
       extensions->AMD_vertex_shader_layer = GL_TRUE;
    }
 
-   if (consts->GLSLVersion >= 140) {
+   if (GLSLVersion >= 140) {
       if (screen->get_param(screen, PIPE_CAP_TGSI_ARRAY_COMPONENTS))
          extensions->ARB_enhanced_layouts = GL_TRUE;
    }
 
-   if (consts->GLSLVersion >= 130) {
+   if (GLSLVersion >= 130) {
       consts->NativeIntegers = GL_TRUE;
       consts->MaxClipPlanes = 8;
 
@@ -1004,8 +1027,10 @@
 
    /* Below are the cases which cannot be moved into tables easily. */
 
+   /* The compatibility profile also requires GLSLVersionCompat >= 400. */
    if (screen->get_shader_param(screen, PIPE_SHADER_TESS_CTRL,
-                                PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
+                                PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0 &&
+       (api != API_OPENGL_COMPAT || consts->GLSLVersionCompat >= 400)) {
       extensions->ARB_tessellation_shader = GL_TRUE;
    }
 
@@ -1013,7 +1038,7 @@
     * invocations of a geometry shader. There is no separate cap for that, so
     * we check the GLSLVersion.
     */
-   if (consts->GLSLVersion >= 400 &&
+   if (GLSLVersion >= 400 &&
        screen->get_shader_param(screen, PIPE_SHADER_GEOMETRY,
                                 PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
       extensions->OES_geometry_shader = GL_TRUE;
@@ -1115,6 +1140,12 @@
    if (options->allow_glsl_extension_directive_midshader)
       consts->AllowGLSLExtensionDirectiveMidShader = GL_TRUE;
 
+   if (options->allow_glsl_builtin_const_expression)
+      consts->AllowGLSLBuiltinConstantExpression = GL_TRUE;
+
+   if (options->allow_glsl_relaxed_es)
+      consts->AllowGLSLRelaxedES = GL_TRUE;
+
    consts->MinMapBufferAlignment =
       screen->get_param(screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
 
@@ -1160,7 +1191,7 @@
 
    consts->MaxViewports = screen->get_param(screen, PIPE_CAP_MAX_VIEWPORTS);
    if (consts->MaxViewports >= 16) {
-      if (consts->GLSLVersion >= 400) {
+      if (GLSLVersion >= 400) {
          consts->ViewportBounds.Min = -32768.0;
          consts->ViewportBounds.Max = 32767.0;
       } else {
@@ -1186,29 +1217,32 @@
       extensions->ARB_framebuffer_no_attachments = GL_TRUE;
 
    /* GL_ARB_ES3_compatibility.
-    *
-    * Assume that ES3 is supported if GLSL 3.30 is supported.
-    * (OpenGL 3.3 is a requirement for that extension.)
+    * Check requirements for GLSL ES 3.00.
     */
-   if (consts->GLSLVersion >= 330 &&
+   if (GLSLVersion >= 130 &&
+       extensions->ARB_uniform_buffer_object &&
+       extensions->ARB_shader_bit_encoding &&
+       extensions->NV_primitive_restart &&
+       screen->get_shader_param(screen, PIPE_SHADER_VERTEX,
+                                PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS) >= 16 &&
        /* Requirements for ETC2 emulation. */
        screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_UNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW) &&
-       screen->is_format_supported(screen, PIPE_FORMAT_B8G8R8A8_SRGB,
-                                   PIPE_TEXTURE_2D, 0,
+       screen->is_format_supported(screen, PIPE_FORMAT_R8G8B8A8_SRGB,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW) &&
        screen->is_format_supported(screen, PIPE_FORMAT_R16_UNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW) &&
        screen->is_format_supported(screen, PIPE_FORMAT_R16G16_UNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW) &&
        screen->is_format_supported(screen, PIPE_FORMAT_R16_SNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW) &&
        screen->is_format_supported(screen, PIPE_FORMAT_R16G16_SNORM,
-                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_TEXTURE_2D, 0, 0,
                                    PIPE_BIND_SAMPLER_VIEW)) {
       extensions->ARB_ES3_compatibility = GL_TRUE;
    }
@@ -1364,4 +1398,28 @@
       extensions->ARB_texture_cube_map_array &&
       extensions->ARB_texture_stencil8 &&
       extensions->ARB_texture_multisample;
+
+   if (screen->get_param(screen, PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_TRIANGLES) &&
+       screen->get_param(screen, PIPE_CAP_CONSERVATIVE_RASTER_POST_SNAP_POINTS_LINES) &&
+       screen->get_param(screen, PIPE_CAP_CONSERVATIVE_RASTER_POST_DEPTH_COVERAGE)) {
+      float max_dilate;
+      bool pre_snap_triangles, pre_snap_points_lines;
+
+      max_dilate = screen->get_paramf(screen, PIPE_CAPF_MAX_CONSERVATIVE_RASTER_DILATE);
+
+      pre_snap_triangles =
+         screen->get_param(screen, PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_TRIANGLES);
+      pre_snap_points_lines =
+         screen->get_param(screen, PIPE_CAP_CONSERVATIVE_RASTER_PRE_SNAP_POINTS_LINES);
+
+      extensions->NV_conservative_raster =
+         screen->get_param(screen, PIPE_CAP_MAX_CONSERVATIVE_RASTER_SUBPIXEL_PRECISION_BIAS) > 1;
+
+      if (extensions->NV_conservative_raster) {
+         extensions->NV_conservative_raster_dilate = max_dilate >= 0.75;
+         extensions->NV_conservative_raster_pre_snap_triangles = pre_snap_triangles;
+         extensions->NV_conservative_raster_pre_snap =
+            pre_snap_triangles && pre_snap_points_lines;
+      }
+   }
 }
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 3db3c7e..c2535e8 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -52,6 +52,7 @@
 #include "st_cb_texture.h"
 #include "st_context.h"
 #include "st_format.h"
+#include "st_texture.h"
 
 
 /**
@@ -61,6 +62,12 @@
 st_mesa_format_to_pipe_format(const struct st_context *st,
                               mesa_format mesaFormat)
 {
+   struct pipe_screen *screen = st->pipe->screen;
+   bool has_bgra_srgb = screen->is_format_supported(screen,
+						    PIPE_FORMAT_B8G8R8A8_SRGB,
+						    PIPE_TEXTURE_2D, 0, 0,
+						    PIPE_BIND_SAMPLER_VIEW);
+
    switch (mesaFormat) {
    case MESA_FORMAT_A8B8G8R8_UNORM:
       return PIPE_FORMAT_ABGR8888_UNORM;
@@ -458,11 +465,13 @@
    case MESA_FORMAT_ETC2_RGB8:
       return st->has_etc2 ? PIPE_FORMAT_ETC2_RGB8 : PIPE_FORMAT_R8G8B8A8_UNORM;
    case MESA_FORMAT_ETC2_SRGB8:
-      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8 : PIPE_FORMAT_B8G8R8A8_SRGB;
+      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8 :
+	 has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB;
    case MESA_FORMAT_ETC2_RGBA8_EAC:
       return st->has_etc2 ? PIPE_FORMAT_ETC2_RGBA8 : PIPE_FORMAT_R8G8B8A8_UNORM;
    case MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC:
-      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGBA8 : PIPE_FORMAT_B8G8R8A8_SRGB;
+      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGBA8 :
+	 has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB;
    case MESA_FORMAT_ETC2_R11_EAC:
       return st->has_etc2 ? PIPE_FORMAT_ETC2_R11_UNORM : PIPE_FORMAT_R16_UNORM;
    case MESA_FORMAT_ETC2_RG11_EAC:
@@ -474,64 +483,121 @@
    case MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1:
       return st->has_etc2 ? PIPE_FORMAT_ETC2_RGB8A1 : PIPE_FORMAT_R8G8B8A8_UNORM;
    case MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1:
-      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8A1 : PIPE_FORMAT_B8G8R8A8_SRGB;
+      return st->has_etc2 ? PIPE_FORMAT_ETC2_SRGB8A1 :
+	 has_bgra_srgb ? PIPE_FORMAT_B8G8R8A8_SRGB : PIPE_FORMAT_R8G8B8A8_SRGB;
 
    case MESA_FORMAT_RGBA_ASTC_4x4:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_4x4;
    case MESA_FORMAT_RGBA_ASTC_5x4:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_5x4;
    case MESA_FORMAT_RGBA_ASTC_5x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_5x5;
    case MESA_FORMAT_RGBA_ASTC_6x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_6x5;
    case MESA_FORMAT_RGBA_ASTC_6x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_6x6;
    case MESA_FORMAT_RGBA_ASTC_8x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_8x5;
    case MESA_FORMAT_RGBA_ASTC_8x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_8x6;
    case MESA_FORMAT_RGBA_ASTC_8x8:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_8x8;
    case MESA_FORMAT_RGBA_ASTC_10x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_10x5;
    case MESA_FORMAT_RGBA_ASTC_10x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_10x6;
    case MESA_FORMAT_RGBA_ASTC_10x8:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_10x8;
    case MESA_FORMAT_RGBA_ASTC_10x10:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_10x10;
    case MESA_FORMAT_RGBA_ASTC_12x10:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_12x10;
    case MESA_FORMAT_RGBA_ASTC_12x12:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_UNORM;
       return PIPE_FORMAT_ASTC_12x12;
 
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_4x4_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_5x4_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_5x5_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_6x5_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_6x6_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_8x5_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_8x6_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_8x8_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_10x5_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_10x6_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_10x8_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_10x10_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_12x10_SRGB;
    case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
+      if (!st->has_astc_2d_ldr)
+         return PIPE_FORMAT_R8G8B8A8_SRGB;
       return PIPE_FORMAT_ASTC_12x12_SRGB;
 
    default:
@@ -1037,10 +1103,7 @@
    for (i = 1; i < MESA_FORMAT_COUNT; i++) {
       enum pipe_format pf;
 
-      /* ETC formats are translated differently, skip them. */
-      if (_mesa_is_format_etc2(i))
-         continue;
-      if (i == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1)
+      if (st_compressed_format_fallback(st, i))
          continue;
 
       pf = st_mesa_format_to_pipe_format(st, i);
@@ -1052,12 +1115,8 @@
 
    /* Test all Gallium formats */
    for (i = 1; i < PIPE_FORMAT_COUNT; i++) {
-      /* ETC formats are translated differently, skip them. */
-      if (i == PIPE_FORMAT_ETC1_RGB8 && !st->has_etc1)
-         continue;
-
       mesa_format mf = st_pipe_format_to_mesa_format(i);
-      if (_mesa_is_format_etc2(mf) && !st->has_etc2)
+      if (st_compressed_format_fallback(st, mf))
          continue;
 
       if (mf != MESA_FORMAT_NONE) {
@@ -2000,7 +2059,7 @@
    uint i;
    for (i = 0; formats[i]; i++) {
       if (screen->is_format_supported(screen, formats[i], target,
-                                      sample_count, bindings)) {
+                                      sample_count, sample_count, bindings)) {
          if (!allow_dxt && util_format_is_s3tc(formats[i])) {
             /* we can't return a dxt format, continue searching */
             continue;
@@ -2133,11 +2192,24 @@
    /* search for exact matches */
    pf = find_exact_format(internalFormat, format, type);
    if (pf != PIPE_FORMAT_NONE &&
-       screen->is_format_supported(screen, pf,
-                                   target, sample_count, bindings)) {
+       screen->is_format_supported(screen, pf, target, sample_count,
+                                   sample_count, bindings)) {
       goto success;
    }
 
+   /* For an unsized GL_RGB or GL_RGBA with a 2_10_10_10 type, try to pick
+    * one of the 2_10_10_10 formats.  This is important for
+    * GL_EXT_texture_type_2_10_10_10_REV support, which says that these
+    * formats are not color-renderable.  Mesa's check for making those
+    * non-color-renderable is based on our chosen format being 2101010.
+    */
+   if (type == GL_UNSIGNED_INT_2_10_10_10_REV) {
+      if (internalFormat == GL_RGB)
+         internalFormat = GL_RGB10;
+      else if (internalFormat == GL_RGBA)
+         internalFormat = GL_RGB10_A2;
+   }
+
    /* search table for internalFormat */
    for (i = 0; i < ARRAY_SIZE(format_map); i++) {
       const struct format_mapping *mapping = &format_map[i];
@@ -2218,8 +2290,8 @@
             st_mesa_format_to_pipe_format(st, mesa_format);
 
          if (format &&
-             screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0,
-                                         bind)) {
+             screen->is_format_supported(screen, format, PIPE_TEXTURE_2D,
+                                         0, 0, bind)) {
             return format;
          }
          /* It's unlikely to find 2 matching Mesa formats. */
@@ -2325,10 +2397,8 @@
    }
 
    if (pFormat == PIPE_FORMAT_NONE) {
-      /* lie about using etc1/etc2 natively if we do decoding tricks */
       mFormat = _mesa_glenum_to_compressed_format(internalFormat);
-      if ((mFormat == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1) ||
-          (_mesa_is_format_etc2(mFormat) && !st->has_etc2))
+      if (st_compressed_format_fallback(st, mFormat))
           return mFormat;
 
       /* no luck at all */
diff --git a/src/mesa/state_tracker/st_glsl_to_nir.cpp b/src/mesa/state_tracker/st_glsl_to_nir.cpp
index bcf6a7c..83620fb 100644
--- a/src/mesa/state_tracker/st_glsl_to_nir.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_nir.cpp
@@ -43,7 +43,9 @@
 #include "compiler/nir/nir.h"
 #include "compiler/glsl_types.h"
 #include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/glsl/gl_nir.h"
 #include "compiler/glsl/ir.h"
+#include "compiler/glsl/ir_optimization.h"
 #include "compiler/glsl/string_to_uint_map.h"
 
 
@@ -271,12 +273,15 @@
           uniform->interface_type != NULL)
          continue;
 
-      if (!uniform->data.bindless &&
-          (uniform->type->is_sampler() || uniform->type->is_image())) {
-         if (uniform->type->is_sampler())
-            loc = shaderidx++;
-         else
-            loc = imageidx++;
+      const struct glsl_type *type = glsl_without_array(uniform->type);
+      if (!uniform->data.bindless && (type->is_sampler() || type->is_image())) {
+         if (type->is_sampler()) {
+            loc = shaderidx;
+            shaderidx += type_size(uniform->type);
+         } else {
+            loc = imageidx;
+            imageidx += type_size(uniform->type);
+         }
       } else if (strncmp(uniform->name, "gl_", 3) == 0) {
          const gl_state_index16 *const stateTokens = uniform->state_slots[0].tokens;
          /* This state reference has already been setup by ir_to_mesa, but we'll
@@ -284,7 +289,6 @@
           */
 
          unsigned comps;
-         const struct glsl_type *type = glsl_without_array(uniform->type);
          if (glsl_type_is_struct(type)) {
             comps = 4;
          } else {
@@ -313,18 +317,22 @@
    *size = max;
 }
 
-static void
-st_nir_opts(nir_shader *nir)
+void
+st_nir_opts(nir_shader *nir, bool scalar)
 {
    bool progress;
    do {
       progress = false;
 
       NIR_PASS_V(nir, nir_lower_vars_to_ssa);
-      NIR_PASS_V(nir, nir_lower_alu_to_scalar);
-      NIR_PASS_V(nir, nir_lower_phis_to_scalar);
 
-      NIR_PASS_V(nir, nir_lower_64bit_pack);
+      if (scalar) {
+         NIR_PASS_V(nir, nir_lower_alu_to_scalar);
+         NIR_PASS_V(nir, nir_lower_phis_to_scalar);
+      }
+
+      NIR_PASS_V(nir, nir_lower_alu);
+      NIR_PASS_V(nir, nir_lower_pack);
       NIR_PASS(progress, nir, nir_copy_prop);
       NIR_PASS(progress, nir, nir_opt_remove_phis);
       NIR_PASS(progress, nir, nir_opt_dce);
@@ -360,6 +368,9 @@
 {
    const nir_shader_compiler_options *options =
       st->ctx->Const.ShaderCompilerOptions[prog->info.stage].NirOptions;
+   enum pipe_shader_type type = pipe_shader_type_from_mesa(stage);
+   struct pipe_screen *screen = st->pipe->screen;
+   bool is_scalar = screen->get_shader_param(screen, type, PIPE_SHADER_CAP_SCALAR_ISA);
    assert(options);
 
    if (prog->nir)
@@ -377,7 +388,7 @@
          ~prev_stages & shader_program->data->linked_stages;
 
       nir->info.next_stage = stages_mask ?
-         (gl_shader_stage) ffs(stages_mask) : MESA_SHADER_FRAGMENT;
+         (gl_shader_stage) u_bit_scan(&stages_mask) : MESA_SHADER_FRAGMENT;
    } else {
       nir->info.next_stage = MESA_SHADER_FRAGMENT;
    }
@@ -402,7 +413,7 @@
    NIR_PASS_V(nir, nir_split_var_copies);
    NIR_PASS_V(nir, nir_lower_var_copies);
 
-   st_nir_opts(nir);
+   st_nir_opts(nir, is_scalar);
 
    return nir;
 }
@@ -467,7 +478,7 @@
    st_set_prog_affected_state_flags(prog);
 
    NIR_PASS_V(nir, st_nir_lower_builtin);
-   NIR_PASS_V(nir, nir_lower_atomics, shader_program, true);
+   NIR_PASS_V(nir, gl_nir_lower_atomics, shader_program, true);
 
    if (st->ctx->_Shader->Flags & GLSL_DUMP) {
       _mesa_log("\n");
@@ -553,6 +564,7 @@
                         struct gl_linked_shader *shader)
 {
    struct st_context *st = st_context(ctx);
+   struct pipe_screen *pscreen = ctx->st->pipe->screen;
    struct gl_program *prog;
 
    validate_ir_tree(shader->ir);
@@ -565,6 +577,10 @@
    _mesa_generate_parameters_list_for_uniforms(ctx, shader_program, shader,
                                                prog->Parameters);
 
+   /* Remove reads from output registers. */
+   if (!pscreen->get_param(pscreen, PIPE_CAP_TGSI_CAN_READ_OUTPUTS))
+      lower_output_reads(shader->Stage, shader->ir);
+
    if (ctx->_Shader->Flags & GLSL_DUMP) {
       _mesa_log("\n");
       _mesa_log("GLSL IR for linked %s program %d:\n",
@@ -584,7 +600,7 @@
 }
 
 static void
-st_nir_link_shaders(nir_shader **producer, nir_shader **consumer)
+st_nir_link_shaders(nir_shader **producer, nir_shader **consumer, bool scalar)
 {
    nir_lower_io_arrays_to_elements(*producer, *consumer);
 
@@ -611,8 +627,8 @@
       NIR_PASS_V(*producer, nir_lower_indirect_derefs, indirect_mask);
       NIR_PASS_V(*consumer, nir_lower_indirect_derefs, indirect_mask);
 
-      st_nir_opts(*producer);
-      st_nir_opts(*consumer);
+      st_nir_opts(*producer, scalar);
+      st_nir_opts(*consumer, scalar);
    }
 }
 
@@ -623,6 +639,20 @@
             struct gl_shader_program *shader_program)
 {
    struct st_context *st = st_context(ctx);
+   struct pipe_screen *screen = st->pipe->screen;
+   bool is_scalar[MESA_SHADER_STAGES];
+
+   /* Determine scalar property of each shader stage */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_linked_shader *shader = shader_program->_LinkedShaders[i];
+      enum pipe_shader_type type;
+
+      if (shader == NULL)
+         continue;
+
+      type = pipe_shader_type_from_mesa(shader->Stage);
+      is_scalar[i] = screen->get_shader_param(screen, type, PIPE_SHADER_CAP_SCALAR_ISA);
+   }
 
    /* Determine first and last stage. */
    unsigned first = MESA_SHADER_STAGES;
@@ -651,7 +681,7 @@
 
       nir_shader *nir = shader->Program->nir;
       NIR_PASS_V(nir, nir_lower_io_to_scalar_early, mask);
-      st_nir_opts(nir);
+      st_nir_opts(nir, is_scalar[i]);
    }
 
    /* Linking the stages in the opposite order (from fragment to vertex)
@@ -666,7 +696,8 @@
          continue;
 
       st_nir_link_shaders(&shader->Program->nir,
-                          &shader_program->_LinkedShaders[next]->Program->nir);
+                          &shader_program->_LinkedShaders[next]->Program->nir,
+                          is_scalar[i]);
       next = i;
    }
 
@@ -714,7 +745,15 @@
       shader->Program->info = nir->info;
 
       if (prev != -1) {
-         nir_compact_varyings(shader_program->_LinkedShaders[prev]->Program->nir,
+         struct gl_program *prev_shader =
+            shader_program->_LinkedShaders[prev]->Program;
+
+         /* We can't use nir_compact_varyings with transform feedback, since
+          * the pipe_stream_output->output_register field is based on the
+          * pre-compacted driver_locations.
+          */
+         if (!prev_shader->sh.LinkedTransformFeedback)
+            nir_compact_varyings(shader_program->_LinkedShaders[prev]->Program->nir,
                               nir, ctx->API != API_OPENGL_COMPAT);
       }
       prev = i;
@@ -734,6 +773,8 @@
          _mesa_reference_program(ctx, &shader->Program, NULL);
          return false;
       }
+
+      nir_sweep(shader->Program->nir);
    }
 
    return true;
@@ -813,9 +854,9 @@
    }
 
    if (screen->get_param(screen, PIPE_CAP_NIR_SAMPLERS_AS_DEREF))
-      NIR_PASS_V(nir, nir_lower_samplers_as_deref, shader_program);
+      NIR_PASS_V(nir, gl_nir_lower_samplers_as_deref, shader_program);
    else
-      NIR_PASS_V(nir, nir_lower_samplers, shader_program);
+      NIR_PASS_V(nir, gl_nir_lower_samplers, shader_program);
 }
 
 } /* extern "C" */
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b321112..44a0890 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -123,6 +123,7 @@
    enum glsl_interp_mode interp;
    enum glsl_base_type base_type;
    ubyte usage_mask; /* GLSL-style usage-mask,  i.e. single bit per double */
+   bool invariant;
 };
 
 static struct inout_decl *
@@ -316,6 +317,7 @@
                           st_src_reg *indirect,
                           unsigned *location);
    st_src_reg canonicalize_gather_offset(st_src_reg offset);
+   bool handle_bound_deref(ir_dereference *ir);
 
    bool try_emit_mad(ir_expression *ir,
               int mul_operand);
@@ -1225,6 +1227,10 @@
    st_src_reg a, b, c;
    st_dst_reg result_dst;
 
+   // there is no TGSI opcode for a 64-bit integer multiply-add
+   if (ir->type->is_integer_64())
+      return false;
+
    ir_expression *expr = ir->operands[mul_operand]->as_expression();
    if (!expr || expr->operation != ir_binop_mul)
       return false;
@@ -2439,10 +2445,15 @@
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 {
-   variable_storage *entry = find_variable_storage(ir->var);
+   variable_storage *entry;
    ir_variable *var = ir->var;
    bool remove_array;
 
+   if (handle_bound_deref(ir->as_dereference()))
+      return;
+
+   entry = find_variable_storage(ir->var);
+
    if (!entry) {
       switch (var->data.mode) {
       case ir_var_uniform:
@@ -2508,6 +2519,8 @@
          unsigned num_components;
          num_outputs++;
 
+         decl->invariant = var->data.invariant;
+
          if (type_without_array->is_64bit())
             component = component / 2;
          if (type_without_array->vector_elements)
@@ -2669,6 +2682,9 @@
    bool is_2D = false;
    ir_variable *var = ir->variable_referenced();
 
+   if (handle_bound_deref(ir->as_dereference()))
+      return;
+
    /* We only need the logic provided by st_glsl_storage_type_size()
     * for arrays of structs. Indirect sampler and image indexing is handled
     * elsewhere.
@@ -2768,6 +2784,9 @@
    ir_variable *var = ir->record->variable_referenced();
    int offset = 0;
 
+   if (handle_bound_deref(ir->as_dereference()))
+      return;
+
    ir->record->accept(this);
 
    assert(ir->field_idx >= 0);
@@ -3104,7 +3123,7 @@
    GLdouble stack_vals[4] = { 0 };
    gl_constant_value *values = (gl_constant_value *) stack_vals;
    GLenum gl_type = GL_NONE;
-   unsigned int i;
+   unsigned int i, elements;
    static int in_array = 0;
    gl_register_file file = in_array ? PROGRAM_CONSTANT : PROGRAM_IMMEDIATE;
 
@@ -3226,6 +3245,7 @@
       return;
    }
 
+   elements = ir->type->vector_elements;
    switch (ir->type->base_type) {
    case GLSL_TYPE_FLOAT:
       gl_type = GL_FLOAT;
@@ -3275,14 +3295,21 @@
          values[i].u = ir->value.b[i] ? ctx->Const.UniformBooleanTrue : 0;
       }
       break;
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_IMAGE:
+      gl_type = GL_UNSIGNED_INT;
+      elements = 2;
+      values[0].u = ir->value.u64[0] & 0xffffffff;
+      values[1].u = ir->value.u64[0] >> 32;
+      break;
    default:
-      assert(!"Non-float/uint/int/bool constant");
+      assert(!"Non-float/uint/int/bool/sampler/image constant");
    }
 
    this->result = st_src_reg(file, -1, ir->type);
    this->result.index = add_constant(file,
                                      values,
-                                     ir->type->vector_elements,
+                                     elements,
                                      gl_type,
                                      &this->result.swizzle);
 }
@@ -3990,6 +4017,8 @@
    case ir_intrinsic_generic_atomic_max:
    case ir_intrinsic_generic_atomic_exchange:
    case ir_intrinsic_generic_atomic_comp_swap:
+   case ir_intrinsic_begin_invocation_interlock:
+   case ir_intrinsic_end_invocation_interlock:
       unreachable("Invalid intrinsic");
    }
 }
@@ -4110,6 +4139,45 @@
 
    return offset;
 }
+ 
+bool
+glsl_to_tgsi_visitor::handle_bound_deref(ir_dereference *ir)
+{
+   ir_variable *var = ir->variable_referenced();
+   /* Bail out unless ir derefs a non-bindless uniform of sampler/image type. */
+   if (!var || var->data.mode != ir_var_uniform || var->data.bindless ||
+       !(ir->type->is_image() || ir->type->is_sampler()))
+      return false;
+
+   /* Convert from bound sampler/image to bindless handle (SAMP2HND/IMG2HND). */
+   bool is_image = ir->type->is_image();
+   st_src_reg resource(is_image ? PROGRAM_IMAGE : PROGRAM_SAMPLER, 0, GLSL_TYPE_UINT);
+   uint16_t index = 0;
+   unsigned array_size = 1, base = 0;
+   st_src_reg reladdr;
+   get_deref_offsets(ir, &array_size, &base, &index, &reladdr, true);
+
+   resource.index = index;
+   if (reladdr.file != PROGRAM_UNDEFINED) {
+      resource.reladdr = ralloc(mem_ctx, st_src_reg);
+      *resource.reladdr = reladdr;
+      emit_arl(ir, sampler_reladdr, reladdr);
+   }
+   /* The handle goes into a fresh uvec2 temp (this->result), written as .xy. */
+   this->result = get_temp(glsl_type::uvec2_type);
+   st_dst_reg dst(this->result);
+   dst.writemask = WRITEMASK_XY;
+
+   glsl_to_tgsi_instruction *inst = emit_asm(
+      ir, is_image ? TGSI_OPCODE_IMG2HND : TGSI_OPCODE_SAMP2HND, dst);
+
+   inst->tex_target = ir->type->sampler_index();
+   inst->resource = resource;
+   inst->sampler_array_size = array_size;
+   inst->sampler_base = base;
+
+   return true;
+}
 
 void
 glsl_to_tgsi_visitor::visit(ir_texture *ir)
@@ -5904,6 +5972,7 @@
    case TGSI_OPCODE_TXL2:
    case TGSI_OPCODE_TG4:
    case TGSI_OPCODE_LODQ:
+   case TGSI_OPCODE_SAMP2HND:
       if (inst->resource.file == PROGRAM_SAMPLER) {
          src[num_src] = t->samplers[inst->resource.index];
       } else {
@@ -5942,6 +6011,7 @@
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_ATOMIMIN:
    case TGSI_OPCODE_ATOMIMAX:
+   case TGSI_OPCODE_IMG2HND:
       for (i = num_src - 1; i >= 0; i--)
          src[i + 1] = src[i];
       num_src++;
@@ -6006,6 +6076,34 @@
    }
 }
 
+/* Invert SamplePos.y when rendering to the default framebuffer. */
+static void
+emit_samplepos_adjustment(struct st_translate *t, int wpos_y_transform)
+{
+   struct ureg_program *ureg = t->ureg;
+   /* wpos_y_transform is the constant slot declared as trans_const below. */
+   assert(wpos_y_transform >= 0);
+   struct ureg_src trans_const = ureg_DECL_constant(ureg, wpos_y_transform);
+   struct ureg_src samplepos_sysval = t->systemValues[SYSTEM_VALUE_SAMPLE_POS];
+   struct ureg_dst samplepos_flipped = ureg_DECL_temporary(ureg);
+   struct ureg_dst is_fbo = ureg_DECL_temporary(ureg);
+   /* samplepos_flipped.y = 1 - samplepos.y */
+   ureg_ADD(ureg, ureg_writemask(samplepos_flipped, TGSI_WRITEMASK_Y),
+            ureg_imm1f(ureg, 1), ureg_negate(samplepos_sysval));
+
+   /* If trans.x == 1, use samplepos.y, else use 1 - samplepos.y. */
+   ureg_FSEQ(ureg, ureg_writemask(is_fbo, TGSI_WRITEMASK_Y),
+             ureg_scalar(trans_const, TGSI_SWIZZLE_X), ureg_imm1f(ureg, 1));
+   ureg_UCMP(ureg, ureg_writemask(samplepos_flipped, TGSI_WRITEMASK_Y),
+             ureg_src(is_fbo), samplepos_sysval, ureg_src(samplepos_flipped));
+   ureg_MOV(ureg, ureg_writemask(samplepos_flipped, TGSI_WRITEMASK_X),
+            samplepos_sysval);
+
+   /* Use the result in place of the system value. */
+   t->systemValues[SYSTEM_VALUE_SAMPLE_POS] = ureg_src(samplepos_flipped);
+}
+
+
 /**
  * Emit the TGSI instructions for inverting and adjusting WPOS.
  * This code is unavoidable because it also depends on whether
@@ -6441,14 +6539,15 @@
                      (enum tgsi_semantic) outputSemanticName[slot],
                      outputSemanticIndex[slot],
                      decl->gs_out_streams,
-                     slot, tgsi_usage_mask, decl->array_id, decl->size);
-
+                     slot, tgsi_usage_mask, decl->array_id, decl->size, decl->invariant);
+         dst.Invariant = decl->invariant;
          for (unsigned j = 0; j < decl->size; ++j) {
             if (t->outputs[slot + j].File != TGSI_FILE_OUTPUT) {
                /* The ArrayID is set up in dst_register */
                t->outputs[slot + j] = dst;
                t->outputs[slot + j].ArrayID = 0;
                t->outputs[slot + j].Index += j;
+               t->outputs[slot + j].Invariant = decl->invariant;
             }
          }
       }
@@ -6572,6 +6671,10 @@
                emit_wpos(st_context(ctx), t, proginfo, ureg,
                          program->wpos_transform_const);
 
+            if (procType == PIPE_SHADER_FRAGMENT &&
+                semName == TGSI_SEMANTIC_SAMPLEPOS)
+               emit_samplepos_adjustment(t, program->wpos_transform_const);
+
             sysInputs &= ~(1ull << i);
          }
       }
@@ -6873,7 +6976,8 @@
    /* This must be done before the uniform storage is associated. */
    if (shader->Stage == MESA_SHADER_FRAGMENT &&
        (prog->info.inputs_read & VARYING_BIT_POS ||
-        prog->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD))) {
+        prog->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD) ||
+        prog->info.system_values_read & (1ull << SYSTEM_VALUE_SAMPLE_POS))) {
       static const gl_state_index16 wposTransformState[STATE_LENGTH] = {
          STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM
       };
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
index c482828..fccb704 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi_private.h
@@ -179,6 +179,7 @@
    case TGSI_OPCODE_ATOMUMAX:
    case TGSI_OPCODE_ATOMIMIN:
    case TGSI_OPCODE_ATOMIMAX:
+   case TGSI_OPCODE_IMG2HND:
       return true;
    default:
       return false;
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index d6901c9..9ed316b 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -472,6 +472,7 @@
           st_pipe_format_to_mesa_format(srgb_format) != MESA_FORMAT_NONE &&
           screen->is_format_supported(screen, srgb_format,
                                       PIPE_TEXTURE_2D, stfbi->visual->samples,
+                                      stfbi->visual->samples,
                                       (PIPE_BIND_DISPLAY_TARGET |
                                        PIPE_BIND_RENDER_TARGET)))
          mode.sRGBCapable = GL_TRUE;
@@ -833,6 +834,7 @@
    struct st_context *shared_ctx = (struct st_context *) shared_stctxi;
    struct st_context *st;
    struct pipe_context *pipe;
+   struct gl_config* mode_ptr;
    struct gl_config mode;
    gl_api api;
    bool no_error = false;
@@ -892,7 +894,13 @@
    }
 
    st_visual_to_context_mode(&attribs->visual, &mode);
-   st = st_create_context(api, pipe, &mode, shared_ctx,
+
+   if (attribs->visual.no_config)
+      mode_ptr = NULL;
+   else
+      mode_ptr = &mode;
+
+   st = st_create_context(api, pipe, mode_ptr, shared_ctx,
                           &attribs->options, no_error);
    if (!st) {
       *error = ST_CONTEXT_ERROR_NO_MEMORY;
diff --git a/src/mesa/state_tracker/st_nir.h b/src/mesa/state_tracker/st_nir.h
index 1c2e32a..aa6e327 100644
--- a/src/mesa/state_tracker/st_nir.h
+++ b/src/mesa/state_tracker/st_nir.h
@@ -42,6 +42,8 @@
                      struct gl_shader_program *shader_program,
                      struct nir_shader *nir);
 
+void st_nir_opts(struct nir_shader *nir, bool is_scalar);
+
 bool
 st_link_nir(struct gl_context *ctx,
             struct gl_shader_program *shader_program);
diff --git a/src/mesa/state_tracker/st_nir_lower_builtin.c b/src/mesa/state_tracker/st_nir_lower_builtin.c
index 660fdf3..195dc40 100644
--- a/src/mesa/state_tracker/st_nir_lower_builtin.c
+++ b/src/mesa/state_tracker/st_nir_lower_builtin.c
@@ -58,6 +58,7 @@
 
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
+#include "compiler/nir/nir_deref.h"
 #include "st_nir.h"
 #include "compiler/glsl/ir.h"
 #include "uniforms.h"
@@ -70,44 +71,45 @@
 } lower_builtin_state;
 
 static const struct gl_builtin_uniform_element *
-get_element(const struct gl_builtin_uniform_desc *desc, nir_deref_var *deref)
+get_element(const struct gl_builtin_uniform_desc *desc, nir_deref_path *path)
 {
-   nir_deref *tail = &deref->deref;
+   int idx = 1;
+
+   assert(path->path[0]->deref_type == nir_deref_type_var);
 
    if ((desc->num_elements == 1) && (desc->elements[0].field == NULL))
       return NULL;
 
    /* we handle arrays in get_variable(): */
-   if (tail->child->deref_type == nir_deref_type_array)
-      tail = tail->child;
+   if (path->path[idx]->deref_type == nir_deref_type_array)
+      idx++;
 
    /* don't need to deal w/ non-struct or array of non-struct: */
-   if (!tail->child)
+   if (!path->path[idx])
       return NULL;
 
-   if (tail->child->deref_type != nir_deref_type_struct)
+   if (path->path[idx]->deref_type != nir_deref_type_struct)
       return NULL;
 
-   nir_deref_struct *deref_struct = nir_deref_as_struct(tail->child);
+   assert(path->path[idx]->strct.index < desc->num_elements);
 
-   assert(deref_struct->index < desc->num_elements);
-
-   return &desc->elements[deref_struct->index];
+   return &desc->elements[path->path[idx]->strct.index ];
 }
 
 static nir_variable *
-get_variable(lower_builtin_state *state, nir_deref_var *deref,
+get_variable(lower_builtin_state *state, nir_deref_path *path,
              const struct gl_builtin_uniform_element *element)
 {
    nir_shader *shader = state->shader;
    gl_state_index16 tokens[STATE_LENGTH];
+   int idx = 1;
 
    memcpy(tokens, element->tokens, sizeof(tokens));
 
-   if (deref->deref.child->deref_type == nir_deref_type_array) {
-      nir_deref_array *darr = nir_deref_as_array(deref->deref.child);
+   if (path->path[idx]->deref_type == nir_deref_type_array) {
+      nir_const_value *c = nir_src_as_const_value(path->path[idx]->arr.index);
 
-      assert(darr->deref_array_type == nir_deref_array_type_direct);
+      assert(c);
 
       /* we need to fixup the array index slot: */
       switch (tokens[0]) {
@@ -121,7 +123,7 @@
       case STATE_TEXGEN:
       case STATE_TEXENV_COLOR:
       case STATE_CLIPPLANE:
-         tokens[1] = darr->base_offset;
+         tokens[1] = c->u32[0];
          break;
       }
    }
@@ -153,6 +155,7 @@
 lower_builtin_block(lower_builtin_state *state, nir_block *block)
 {
    nir_builder *b = &state->builder;
+   bool progress = false;
 
    nir_foreach_instr_safe(instr, block) {
       if (instr->type != nir_instr_type_intrinsic)
@@ -160,10 +163,11 @@
 
       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
-      if (intrin->intrinsic != nir_intrinsic_load_var)
+      if (intrin->intrinsic != nir_intrinsic_load_deref)
          continue;
 
-      nir_variable *var = intrin->variables[0]->var;
+      nir_variable *var =
+         nir_deref_instr_get_variable(nir_src_as_deref(intrin->src[0]));
       if (var->data.mode != nir_var_uniform)
          continue;
 
@@ -178,12 +182,16 @@
       if (!desc)
          continue;
 
-      const struct gl_builtin_uniform_element *element =
-         get_element(desc, intrin->variables[0]);
+      nir_deref_path path;
+      nir_deref_path_init(&path, nir_src_as_deref(intrin->src[0]), NULL);
+
+      const struct gl_builtin_uniform_element *element = get_element(desc, &path);
 
       /* matrix elements (array_deref) do not need special handling: */
-      if (!element)
+      if (!element) {
+         nir_deref_path_finish(&path);
          continue;
+      }
 
       /* remove existing var from uniform list: */
       exec_node_remove(&var->node);
@@ -192,8 +200,8 @@
        */
       exec_node_self_link(&var->node);
 
-      nir_variable *new_var =
-         get_variable(state, intrin->variables[0], element);
+      nir_variable *new_var = get_variable(state, &path, element);
+      nir_deref_path_finish(&path);
 
       b->cursor = nir_before_instr(instr);
 
@@ -217,9 +225,11 @@
        * var since we don't want it to get uniform space allocated.
        */
       nir_instr_remove(&intrin->instr);
+
+      progress = true;
    }
 
-   return true;
+   return progress;
 }
 
 static void
@@ -228,10 +238,14 @@
    nir_builder_init(&state->builder, impl);
    state->mem_ctx = ralloc_parent(impl);
 
+   bool progress = false;
    nir_foreach_block(block, impl) {
-      lower_builtin_block(state, block);
+      progress |= lower_builtin_block(state, block);
    }
 
+   if (progress)
+      nir_remove_dead_derefs_impl(impl);
+
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
 }
diff --git a/src/mesa/state_tracker/st_pbo.c b/src/mesa/state_tracker/st_pbo.c
index 628e3ca..6c55e85 100644
--- a/src/mesa/state_tracker/st_pbo.c
+++ b/src/mesa/state_tracker/st_pbo.c
@@ -245,7 +245,7 @@
 
       velem.src_offset = 0;
       velem.instance_divisor = 0;
-      velem.vertex_buffer_index = cso_get_aux_vertex_buffer_slot(cso);
+      velem.vertex_buffer_index = 0;
       velem.src_format = PIPE_FORMAT_R32G32_FLOAT;
 
       cso_set_vertex_elements(cso, 1, &velem);
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index fe72dda..8117f4f 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -388,11 +388,11 @@
    enum pipe_error error;
    unsigned num_outputs = 0;
    unsigned attr;
-   ubyte input_to_index[VERT_ATTRIB_MAX] = {0};
    ubyte output_semantic_name[VARYING_SLOT_MAX] = {0};
    ubyte output_semantic_index[VARYING_SLOT_MAX] = {0};
 
    stvp->num_inputs = 0;
+   memset(stvp->input_to_index, ~0, sizeof(stvp->input_to_index));
 
    if (stvp->Base.arb.IsPositionInvariant)
       _mesa_insert_mvp_code(st->ctx, &stvp->Base);
@@ -403,7 +403,7 @@
     */
    for (attr = 0; attr < VERT_ATTRIB_MAX; attr++) {
       if ((stvp->Base.info.inputs_read & BITFIELD64_BIT(attr)) != 0) {
-         input_to_index[attr] = stvp->num_inputs;
+         stvp->input_to_index[attr] = stvp->num_inputs;
          stvp->index_to_input[stvp->num_inputs] = attr;
          stvp->num_inputs++;
          if ((stvp->Base.info.vs.double_inputs_read &
@@ -415,7 +415,7 @@
       }
    }
    /* bit of a hack, presetup potentially unused edgeflag input */
-   input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs;
+   stvp->input_to_index[VERT_ATTRIB_EDGEFLAG] = stvp->num_inputs;
    stvp->index_to_input[stvp->num_inputs] = VERT_ATTRIB_EDGEFLAG;
 
    /* Compute mapping of vertex program outputs to slots.
@@ -495,7 +495,7 @@
                                    &stvp->Base,
                                    /* inputs */
                                    stvp->num_inputs,
-                                   input_to_index,
+                                   stvp->input_to_index,
                                    NULL, /* inputSlotToAttr */
                                    NULL, /* input semantic name */
                                    NULL, /* input semantic index */
@@ -518,7 +518,7 @@
                                         &stvp->Base,
                                         /* inputs */
                                         stvp->num_inputs,
-                                        input_to_index,
+                                        stvp->input_to_index,
                                         NULL, /* input semantic name */
                                         NULL, /* input semantic index */
                                         NULL,
@@ -629,6 +629,13 @@
       /* create now */
       vpv = st_create_vp_variant(st, stvp, key);
       if (vpv) {
+          for (unsigned index = 0; index < vpv->num_inputs; ++index) {
+             unsigned attr = stvp->index_to_input[index];
+             if (attr == ST_DOUBLE_ATTRIB_PLACEHOLDER)
+                continue;
+             vpv->vert_attrib_mask |= 1u << attr;
+          }
+
          /* insert into list */
          vpv->next = stvp->variants;
          stvp->variants = vpv;
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index a520ffb..f67ea5e 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -196,6 +196,9 @@
 
    /** similar to that in st_vertex_program, but with edgeflags info too */
    GLuint num_inputs;
+
+   /** Bitfield of VERT_BIT_* bits of mesa vertex processing inputs */
+   GLbitfield vert_attrib_mask;
 };
 
 
@@ -215,6 +218,8 @@
    /** maps a TGSI input index back to a Mesa VERT_ATTRIB_x */
    ubyte index_to_input[PIPE_MAX_ATTRIBS];
    ubyte num_inputs;
+   /** maps a Mesa VERT_ATTRIB_x to its TGSI input index (0xff if unmapped) */
+   ubyte input_to_index[VERT_ATTRIB_MAX];
 
    /** Maps VARYING_SLOT_x to slot */
    ubyte result_to_output[VARYING_SLOT_MAX];
diff --git a/src/mesa/state_tracker/st_shader_cache.c b/src/mesa/state_tracker/st_shader_cache.c
index 3ca3fef..c82ce3e 100644
--- a/src/mesa/state_tracker/st_shader_cache.c
+++ b/src/mesa/state_tracker/st_shader_cache.c
@@ -74,6 +74,9 @@
 st_serialise_ir_program(struct gl_context *ctx, struct gl_program *prog,
                         bool nir)
 {
+   if (prog->driver_cache_blob)
+      return;
+
    struct blob blob;
    blob_init(&blob);
 
@@ -84,6 +87,8 @@
       blob_write_uint32(&blob, stvp->num_inputs);
       blob_write_bytes(&blob, stvp->index_to_input,
                        sizeof(stvp->index_to_input));
+      blob_write_bytes(&blob, stvp->input_to_index,
+                       sizeof(stvp->input_to_index));
       blob_write_bytes(&blob, stvp->result_to_output,
                        sizeof(stvp->result_to_output));
 
@@ -206,6 +211,8 @@
       stvp->num_inputs = blob_read_uint32(&blob_reader);
       blob_copy_bytes(&blob_reader, (uint8_t *) stvp->index_to_input,
                       sizeof(stvp->index_to_input));
+      blob_copy_bytes(&blob_reader, (uint8_t *) stvp->input_to_index,
+                      sizeof(stvp->input_to_index));
       blob_copy_bytes(&blob_reader, (uint8_t *) stvp->result_to_output,
                       sizeof(stvp->result_to_output));
 
@@ -408,6 +415,14 @@
 }
 
 void
+st_serialise_tgsi_program_binary(struct gl_context *ctx,
+                                 struct gl_shader_program *shProg,
+                                 struct gl_program *prog)
+{
+   st_serialise_ir_program(ctx, prog, false);
+}
+
+void
 st_deserialise_tgsi_program(struct gl_context *ctx,
                             struct gl_shader_program *shProg,
                             struct gl_program *prog)
@@ -422,6 +437,14 @@
 }
 
 void
+st_serialise_nir_program_binary(struct gl_context *ctx,
+                                struct gl_shader_program *shProg,
+                                struct gl_program *prog)
+{
+   st_serialise_ir_program(ctx, prog, true);
+}
+
+void
 st_deserialise_nir_program(struct gl_context *ctx,
                            struct gl_shader_program *shProg,
                            struct gl_program *prog)
diff --git a/src/mesa/state_tracker/st_shader_cache.h b/src/mesa/state_tracker/st_shader_cache.h
index 132dac0..5b0bff7 100644
--- a/src/mesa/state_tracker/st_shader_cache.h
+++ b/src/mesa/state_tracker/st_shader_cache.h
@@ -39,6 +39,11 @@
 st_serialise_tgsi_program(struct gl_context *ctx, struct gl_program *prog);
 
 void
+st_serialise_tgsi_program_binary(struct gl_context *ctx,
+                                 struct gl_shader_program *shProg,
+                                 struct gl_program *prog);
+
+void
 st_deserialise_tgsi_program(struct gl_context *ctx,
                             struct gl_shader_program *shProg,
                             struct gl_program *prog);
@@ -47,6 +52,11 @@
 st_serialise_nir_program(struct gl_context *ctx, struct gl_program *prog);
 
 void
+st_serialise_nir_program_binary(struct gl_context *ctx,
+                                struct gl_shader_program *shProg,
+                                struct gl_program *prog);
+
+void
 st_deserialise_nir_program(struct gl_context *ctx,
                            struct gl_shader_program *shProg,
                            struct gl_program *prog);
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index 7d83036..5da98bd 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -79,7 +79,7 @@
        (int) target, util_format_name(format), last_level);
 
    assert(format);
-   assert(screen->is_format_supported(screen, format, target, 0,
+   assert(screen->is_format_supported(screen, format, target, 0, 0,
                                       PIPE_BIND_SAMPLER_VIEW));
 
    memset(&pt, 0, sizeof(pt));
@@ -95,6 +95,7 @@
    /* only set this for OpenGL textures, not renderbuffers */
    pt.flags = PIPE_RESOURCE_FLAG_TEXTURING_MORE_LIKELY;
    pt.nr_samples = nr_samples;
+   pt.nr_storage_samples = nr_samples;
 
    newtex = screen->resource_create(screen, &pt);
 
diff --git a/src/mesa/state_tracker/st_texture.h b/src/mesa/state_tracker/st_texture.h
index c10a275..726ab78 100644
--- a/src/mesa/state_tracker/st_texture.h
+++ b/src/mesa/state_tracker/st_texture.h
@@ -42,9 +42,9 @@
 struct st_texture_image_transfer {
    struct pipe_transfer *transfer;
 
-   /* For ETC fallback. */
-   GLubyte *temp_data; /**< Temporary ETC texture storage. */
-   unsigned temp_stride; /**< Stride of the ETC texture storage. */
+   /* For compressed texture fallback. */
+   GLubyte *temp_data; /**< Temporary compressed texture storage. */
+   unsigned temp_stride; /**< Stride of the compressed texture storage. */
    GLubyte *map; /**< Saved map pointer of the uncompressed transfer. */
 };
 
@@ -90,10 +90,11 @@
    struct st_texture_image_transfer *transfer;
    unsigned num_transfers;
 
-   /* For ETC images, keep track of the original data. This is necessary for
-    * mapping/unmapping, as well as image copies.
+   /* For compressed images unsupported by the driver. Keep track of
+    * the original data. This is necessary for mapping/unmapping,
+    * as well as image copies.
     */
-   GLubyte *etc_data;
+   GLubyte *compressed_data;
 };
 
 
@@ -315,7 +316,7 @@
 st_destroy_bound_image_handles(struct st_context *st);
 
 bool
-st_etc_fallback(struct st_context *st, struct gl_texture_image *texImage);
+st_compressed_format_fallback(struct st_context *st, mesa_format format);
 
 void
 st_convert_image(const struct st_context *st, const struct gl_image_unit *u,
diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index eb61aef..d0b9cfb 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -127,7 +127,7 @@
    templ.usage = PIPE_USAGE_DEFAULT;
 
    memset(&whandle, 0, sizeof(whandle));
-   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.type = WINSYS_HANDLE_TYPE_FD;
    whandle.handle = desc->handle;
    whandle.offset = desc->offset;
    whandle.stride = desc->stride;
diff --git a/src/mesa/swrast/s_aalinetemp.h b/src/mesa/swrast/s_aalinetemp.h
index bebb131..64767a3 100644
--- a/src/mesa/swrast/s_aalinetemp.h
+++ b/src/mesa/swrast/s_aalinetemp.h
@@ -179,10 +179,12 @@
          if (attr >= VARYING_SLOT_TEX0 && attr < VARYING_SLOT_VAR0) {
             const GLuint u = attr - VARYING_SLOT_TEX0;
             const struct gl_texture_object *obj = ctx->Texture.Unit[u]._Current;
-            const struct gl_texture_image *texImage =
-               _mesa_base_tex_image(obj);
-            line.texWidth[attr]  = (GLfloat) texImage->Width;
-            line.texHeight[attr] = (GLfloat) texImage->Height;
+            if (obj) {
+               const struct gl_texture_image *texImage =
+                  _mesa_base_tex_image(obj);
+               line.texWidth[attr]  = (GLfloat) texImage->Width;
+               line.texHeight[attr] = (GLfloat) texImage->Height;
+            }
          }
       ATTRIB_LOOP_END
    }
diff --git a/src/mesa/swrast/s_blit.c b/src/mesa/swrast/s_blit.c
index 19fe848..107e413 100644
--- a/src/mesa/swrast/s_blit.c
+++ b/src/mesa/swrast/s_blit.c
@@ -253,7 +253,7 @@
          ctx->Driver.MapRenderbuffer(ctx, readRb, 0, 0,
                                      readRb->Width, readRb->Height,
                                      GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                                     &map, &rowStride);
+                                     &map, &rowStride, readFb->FlipY);
          if (!map) {
             goto fail_no_memory;
          }
@@ -280,14 +280,16 @@
          ctx->Driver.MapRenderbuffer(ctx, readRb,
                                      srcXpos, srcYpos,
                                      srcWidth, srcHeight,
-                                     GL_MAP_READ_BIT, &srcMap, &srcRowStride);
+                                     GL_MAP_READ_BIT, &srcMap, &srcRowStride,
+                                     readFb->FlipY);
          if (!srcMap) {
             goto fail_no_memory;
          }
          ctx->Driver.MapRenderbuffer(ctx, drawRb,
                                      dstXpos, dstYpos,
                                      dstWidth, dstHeight,
-                                     GL_MAP_WRITE_BIT, &dstMap, &dstRowStride);
+                                     GL_MAP_WRITE_BIT, &dstMap, &dstRowStride,
+                                     drawFb->FlipY);
          if (!dstMap) {
             ctx->Driver.UnmapRenderbuffer(ctx, readRb);
             goto fail_no_memory;
@@ -594,7 +596,8 @@
          ctx->Driver.MapRenderbuffer(ctx, readRb,
                                      0, 0, readRb->Width, readRb->Height,
                                      GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                                     &srcMap, &srcRowStride);
+                                     &srcMap, &srcRowStride,
+                                     readFb->FlipY);
          if (!srcMap) {
             goto fail_no_memory;
          }
@@ -609,13 +612,15 @@
           */
          ctx->Driver.MapRenderbuffer(ctx, readRb,
                                      0, 0, readRb->Width, readRb->Height,
-                                     GL_MAP_READ_BIT, &srcMap, &srcRowStride);
+                                     GL_MAP_READ_BIT, &srcMap, &srcRowStride,
+                                     readFb->FlipY);
          if (!srcMap) {
             goto fail_no_memory;
          }
          ctx->Driver.MapRenderbuffer(ctx, drawRb,
                                      0, 0, drawRb->Width, drawRb->Height,
-                                     GL_MAP_WRITE_BIT, &dstMap, &dstRowStride);
+                                     GL_MAP_WRITE_BIT, &dstMap, &dstRowStride,
+                                     drawFb->FlipY);
          if (!dstMap) {
             ctx->Driver.UnmapRenderbuffer(ctx, readRb);
             goto fail_no_memory;
diff --git a/src/mesa/swrast/s_clear.c b/src/mesa/swrast/s_clear.c
index ddafb67..ef0f6df 100644
--- a/src/mesa/swrast/s_clear.c
+++ b/src/mesa/swrast/s_clear.c
@@ -66,7 +66,8 @@
 
    /* map dest buffer */
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               mapMode, &map, &rowStride);
+                               mapMode, &map, &rowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glClear(color)");
       return;
diff --git a/src/mesa/swrast/s_copypix.c b/src/mesa/swrast/s_copypix.c
index 0dbccc0..d0703fa 100644
--- a/src/mesa/swrast/s_copypix.c
+++ b/src/mesa/swrast/s_copypix.c
@@ -503,7 +503,7 @@
       ctx->Driver.MapRenderbuffer(ctx, srcRb, 0, 0,
                                   srcRb->Width, srcRb->Height,
                                   GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                                  &map, &rowStride);
+                                  &map, &rowStride, srcFb->FlipY);
       if (!map) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyPixels");
          return GL_TRUE; /* don't retry with slow path */
@@ -530,14 +530,16 @@
       /* different src/dst buffers */
       ctx->Driver.MapRenderbuffer(ctx, srcRb, srcX, srcY,
                                   width, height,
-                                  GL_MAP_READ_BIT, &srcMap, &srcRowStride);
+                                  GL_MAP_READ_BIT, &srcMap, &srcRowStride,
+                                  srcFb->FlipY);
       if (!srcMap) {
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyPixels");
          return GL_TRUE; /* don't retry with slow path */
       }
       ctx->Driver.MapRenderbuffer(ctx, dstRb, dstX, dstY,
                                   width, height,
-                                  GL_MAP_WRITE_BIT, &dstMap, &dstRowStride);
+                                  GL_MAP_WRITE_BIT, &dstMap, &dstRowStride,
+                                  dstFb->FlipY);
       if (!dstMap) {
          ctx->Driver.UnmapRenderbuffer(ctx, srcRb);
          _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyPixels");
@@ -598,7 +600,8 @@
    ctx->Driver.MapRenderbuffer(ctx, rb,
                                0, 0, rb->Width, rb->Height,
                                GL_MAP_READ_BIT,
-                               &srb->Map, &srb->RowStride);
+                               &srb->Map, &srb->RowStride,
+                               fb->FlipY);
 
    return rb;
 }
diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index ffadc05..4b9640d 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -570,7 +570,8 @@
    }
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               mapMode, &map, &rowStride);
+                               mapMode, &map, &rowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glClear(depth)");
       return;
@@ -695,7 +696,8 @@
    }
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               mapMode, &map, &rowStride);
+                               mapMode, &map, &rowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glClear(depth+stencil)");
       return;
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index f05528d..7ee401b 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -55,7 +55,8 @@
                            GLint x, GLint y,
                            GLsizei width, GLsizei height,
                            const struct gl_pixelstore_attrib *unpack,
-                           const GLvoid *pixels)
+                           const GLvoid *pixels,
+                           bool flip_y)
 {
    const GLubyte *src = (const GLubyte *)
       _mesa_image_address2d(unpack, pixels, width,
@@ -67,7 +68,8 @@
    GLint dstRowStride;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               GL_MAP_WRITE_BIT, &dst, &dstRowStride);
+                               GL_MAP_WRITE_BIT, &dst, &dstRowStride,
+                               flip_y);
 
    if (!dst) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
@@ -102,7 +104,8 @@
                            GLint x, GLint y,
                            GLsizei width, GLsizei height,
                            const struct gl_pixelstore_attrib *unpack,
-                           const GLvoid *pixels)
+                           const GLvoid *pixels,
+                           bool flip_y)
 {
    const GLubyte *src = (const GLubyte *)
       _mesa_image_address2d(unpack, pixels, width,
@@ -114,7 +117,8 @@
    GLint dstRowStride;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               GL_MAP_WRITE_BIT, &dst, &dstRowStride);
+                               GL_MAP_WRITE_BIT, &dst, &dstRowStride,
+                               flip_y);
 
    if (!dst) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
@@ -151,7 +155,8 @@
                          GLsizei width, GLsizei height,
                          GLenum format, GLenum type,
                          const struct gl_pixelstore_attrib *unpack,
-                         const GLvoid *pixels)
+                         const GLvoid *pixels,
+                         bool flip_y)
 {
    const GLubyte *src = (const GLubyte *)
       _mesa_image_address2d(unpack, pixels, width,
@@ -164,7 +169,8 @@
    GLint dstRowStride;
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               GL_MAP_WRITE_BIT, &dst, &dstRowStride);
+                               GL_MAP_WRITE_BIT, &dst, &dstRowStride,
+                               flip_y);
 
    if (!dst) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glDrawPixels");
@@ -197,6 +203,7 @@
                       const struct gl_pixelstore_attrib *userUnpack,
                       const GLvoid *pixels)
 {
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
    SWcontext *swrast = SWRAST_CONTEXT(ctx);
    struct gl_pixelstore_attrib unpack;
@@ -228,7 +235,7 @@
        (rb->Format == MESA_FORMAT_B8G8R8X8_UNORM ||
         rb->Format == MESA_FORMAT_B8G8R8A8_UNORM)) {
       fast_draw_rgb_ubyte_pixels(ctx, rb, x, y, width, height,
-                                 &unpack, pixels);
+                                 &unpack, pixels, fb->FlipY);
       return GL_TRUE;
    }
 
@@ -237,14 +244,15 @@
        (rb->Format == MESA_FORMAT_B8G8R8X8_UNORM ||
         rb->Format == MESA_FORMAT_B8G8R8A8_UNORM)) {
       fast_draw_rgba_ubyte_pixels(ctx, rb, x, y, width, height,
-                                  &unpack, pixels);
+                                  &unpack, pixels, fb->FlipY);
       return GL_TRUE;
    }
 
    if (_mesa_format_matches_format_and_type(rb->Format, format, type,
                                             ctx->Unpack.SwapBytes, NULL)) {
       fast_draw_generic_pixels(ctx, rb, x, y, width, height,
-                               format, type, &unpack, pixels);
+                               format, type, &unpack, pixels,
+                               fb->FlipY);
       return GL_TRUE;
    }
 
diff --git a/src/mesa/swrast/s_renderbuffer.c b/src/mesa/swrast/s_renderbuffer.c
index f76489c..8c97e4e 100644
--- a/src/mesa/swrast/s_renderbuffer.c
+++ b/src/mesa/swrast/s_renderbuffer.c
@@ -180,7 +180,8 @@
                               GLuint x, GLuint y, GLuint w, GLuint h,
                               GLbitfield mode,
                               GLubyte **out_map,
-                              GLint *out_stride)
+                              GLint *out_stride,
+                              bool flip_y)
 {
    struct swrast_renderbuffer *srb = swrast_renderbuffer(rb);
    GLubyte *map = srb->Buffer;
@@ -578,7 +579,8 @@
       ctx->Driver.MapRenderbuffer(ctx, rb,
                                   0, 0, rb->Width, rb->Height,
                                   GL_MAP_READ_BIT | GL_MAP_WRITE_BIT,
-                                  &srb->Map, &srb->RowStride);
+                                  &srb->Map, &srb->RowStride,
+                                  fb->FlipY);
    }
 
    assert(srb->Map);
diff --git a/src/mesa/swrast/s_renderbuffer.h b/src/mesa/swrast/s_renderbuffer.h
index 2595d7c..9238d8a 100644
--- a/src/mesa/swrast/s_renderbuffer.h
+++ b/src/mesa/swrast/s_renderbuffer.h
@@ -43,7 +43,8 @@
                               GLuint x, GLuint y, GLuint w, GLuint h,
                               GLbitfield mode,
                               GLubyte **out_map,
-                              GLint *out_stride);
+                              GLint *out_stride,
+                              bool flip_y);
 
 extern void
 _swrast_unmap_soft_renderbuffer(struct gl_context *ctx,
diff --git a/src/mesa/swrast/s_stencil.c b/src/mesa/swrast/s_stencil.c
index 7a4dc45..8ccd5a1 100644
--- a/src/mesa/swrast/s_stencil.c
+++ b/src/mesa/swrast/s_stencil.c
@@ -579,7 +579,8 @@
    }
 
    ctx->Driver.MapRenderbuffer(ctx, rb, x, y, width, height,
-                               mapMode, &map, &rowStride);
+                               mapMode, &map, &rowStride,
+                               ctx->DrawBuffer->FlipY);
    if (!map) {
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "glClear(stencil)");
       return;
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index 345f0bf..3383b23 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -99,8 +99,8 @@
    _math_init_transformation();
    _math_init_translate();
 
-   /* Keep our list of gl_vertex_array inputs */
-   _vbo_init_inputs(&tnl->draw_arrays);
+   /* Keep our list of tnl_vertex_array inputs */
+   _tnl_init_inputs(&tnl->draw_arrays);
 
    return GL_TRUE;
 }
diff --git a/src/mesa/tnl/t_context.h b/src/mesa/tnl/t_context.h
index 4827480..eca9f66 100644
--- a/src/mesa/tnl/t_context.h
+++ b/src/mesa/tnl/t_context.h
@@ -57,6 +57,8 @@
 
 #include "vbo/vbo.h"
 
+#include "tnl.h"
+
 #define MAX_PIPELINE_STAGES     30
 
 /*
@@ -497,6 +499,41 @@
 
 
 /**
+ * Utility that tracks and updates the current array entries.
+ */
+struct tnl_inputs
+{
+   /**
+    * Array of inputs handed to the draw code by _tnl_bind_inputs().
+    * The array contains pointers into the _DrawVAO and to the vbo module's
+    * current values. The array of pointers is updated incrementally
+    * based on the current and vertex_processing_mode values below.
+    */
+   struct tnl_vertex_array inputs[VERT_ATTRIB_MAX];
+   /** VERT_BIT_x flags of the inputs entries that point to current values. */
+   GLbitfield current;
+   /** Store which aliasing current values - generics or materials - are set. */
+   gl_vertex_processing_mode vertex_processing_mode;
+};
+
+
+/**
+ * Initialize inputs.
+ */
+void
+_tnl_init_inputs(struct tnl_inputs *inputs);
+
+
+/**
+ * Update the tnl_vertex_array array inside the tnl_inputs structure
+ * from the current _VPMode, the context's Array._DrawVAO and
+ * Array._DrawVAOEnabledAttribs; all other slots get current values.
+ */
+void
+_tnl_update_inputs(struct gl_context *ctx, struct tnl_inputs *inputs);
+
+
+/**
  * Context state for T&L context.
  */
 typedef struct
@@ -537,8 +574,8 @@
    struct tnl_shine_tab *_ShineTabList;  /**< MRU list of inactive shine tables */
    /**@}*/
 
-   /* The list of gl_vertex_array inputs. */
-   struct vbo_inputs draw_arrays;
+   /* The list of tnl_vertex_array inputs. */
+   struct tnl_inputs draw_arrays;
 } TNLcontext;
 
 
diff --git a/src/mesa/tnl/t_draw.c b/src/mesa/tnl/t_draw.c
index a83b98e..1fe2d40 100644
--- a/src/mesa/tnl/t_draw.c
+++ b/src/mesa/tnl/t_draw.c
@@ -28,6 +28,7 @@
 #include <stdio.h>
 
 #include "main/glheader.h"
+#include "main/arrayobj.h"
 #include "main/bufferobj.h"
 #include "main/condrender.h"
 #include "main/context.h"
@@ -273,7 +274,7 @@
 
 
 static void bind_inputs( struct gl_context *ctx, 
-			 const struct gl_vertex_array *inputs,
+			 const struct tnl_vertex_array *inputs,
 			 GLint count,
 			 struct gl_buffer_object **bo,
 			 GLuint *nr_bo )
@@ -285,7 +286,7 @@
    /* Map all the VBOs
     */
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      const struct gl_vertex_array *array = &inputs[i];
+      const struct tnl_vertex_array *array = &inputs[i];
       const struct gl_vertex_buffer_binding *binding = array->BufferBinding;
       const struct gl_array_attributes *attrib = array->VertexAttrib;
       const void *ptr;
@@ -426,7 +427,7 @@
 /* This is the main workhorse doing all the rendering work.
  */
 void _tnl_draw_prims(struct gl_context *ctx,
-                     const struct gl_vertex_array *arrays,
+                     const struct tnl_vertex_array *arrays,
 			 const struct _mesa_prim *prim,
 			 GLuint nr_prims,
 			 const struct _mesa_index_buffer *ib,
@@ -538,11 +539,93 @@
 
 
 void
+_tnl_init_inputs(struct tnl_inputs *inputs)
+{
+   inputs->current = 0;
+   inputs->vertex_processing_mode = VP_MODE_FF;
+}
+
+
+/**
+ * Update the tnl_inputs's arrays to point to the vao->_VertexArray arrays
+ * according to the 'enable' bitmask.
+ * \param enable  bitfield of VERT_BIT_x flags.
+ */
+static inline void
+update_vao_inputs(struct gl_context *ctx,
+                  struct tnl_inputs *inputs, GLbitfield enable)
+{
+   const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
+
+   /* Make sure we process only arrays enabled in the VAO */
+   assert((enable & ~_mesa_get_vao_vp_inputs(vao)) == 0);
+
+   /* Fill in the client arrays from the VAO */
+   const struct gl_vertex_buffer_binding *bindings = &vao->BufferBinding[0];
+   while (enable) {
+      const int attr = u_bit_scan(&enable);
+      struct tnl_vertex_array *input = &inputs->inputs[attr];
+      const struct gl_array_attributes *attrib;
+      attrib = _mesa_draw_array_attrib(vao, attr);
+      input->VertexAttrib = attrib;
+      input->BufferBinding = &bindings[attrib->BufferBindingIndex];
+   }
+}
+
+
+/**
+ * Update the tnl_inputs's arrays to point to the vbo->currval arrays
+ * according to the 'current' bitmask.
+ * \param current  bitfield of VERT_BIT_x flags.
+ */
+static inline void
+update_current_inputs(struct gl_context *ctx,
+                      struct tnl_inputs *inputs, GLbitfield current)
+{
+   gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
+
+   /* All previously non current array pointers need update. */
+   GLbitfield mask = current & ~inputs->current;
+   /* On mode change, the slots aliasing with materials need update too */
+   if (mode != inputs->vertex_processing_mode)
+      mask |= current & VERT_BIT_MAT_ALL;
+
+   while (mask) {
+      const int attr = u_bit_scan(&mask);
+      struct tnl_vertex_array *input = &inputs->inputs[attr];
+      input->VertexAttrib = _vbo_current_attrib(ctx, attr);
+      input->BufferBinding = _vbo_current_binding(ctx);
+   }
+
+   inputs->current = current;
+   inputs->vertex_processing_mode = mode;
+}
+
+
+/**
+ * Update the tnl_inputs's arrays to point to the vao->_VertexArray and
+ * vbo->currval arrays according to Array._DrawVAO and
+ * Array._DrawVAOEnabledAttribs.
+ */
+void
+_tnl_update_inputs(struct gl_context *ctx, struct tnl_inputs *inputs)
+{
+   const GLbitfield enable = ctx->Array._DrawVAOEnabledAttribs;
+
+   /* Update array input pointers */
+   update_vao_inputs(ctx, inputs, enable);
+
+   /* The rest must be current inputs. */
+   update_current_inputs(ctx, inputs, ~enable & VERT_BIT_ALL);
+}
+
+
+const struct tnl_vertex_array*
 _tnl_bind_inputs( struct gl_context *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   _mesa_set_drawing_arrays(ctx, tnl->draw_arrays.inputs);
-   _vbo_update_inputs(ctx, &tnl->draw_arrays);
+   _tnl_update_inputs(ctx, &tnl->draw_arrays);
+   return tnl->draw_arrays.inputs;
 }
 
 
@@ -558,12 +641,11 @@
           struct gl_transform_feedback_object *tfb_vertcount,
           unsigned stream, struct gl_buffer_object *indirect)
 {
-   /* Update TNLcontext::draw_arrays and set that pointer
-    * into Array._DrawArrays.
+   /* Update TNLcontext::draw_arrays and return that pointer.
     */
-   _tnl_bind_inputs(ctx);
+   const struct tnl_vertex_array* arrays = _tnl_bind_inputs(ctx);
 
-   _tnl_draw_prims(ctx, ctx->Array._DrawArrays, prim, nr_prims, ib,
+   _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib,
                    index_bounds_valid, min_index, max_index,
                    tfb_vertcount, stream, indirect);
 }
diff --git a/src/mesa/tnl/t_rebase.c b/src/mesa/tnl/t_rebase.c
index 09a8a3d..b6950e0 100644
--- a/src/mesa/tnl/t_rebase.c
+++ b/src/mesa/tnl/t_rebase.c
@@ -104,7 +104,7 @@
  *      all or nothing.
  */
 void t_rebase_prims( struct gl_context *ctx,
-                     const struct gl_vertex_array *arrays,
+                     const struct tnl_vertex_array *arrays,
                      const struct _mesa_prim *prim,
                      GLuint nr_prims,
                      const struct _mesa_index_buffer *ib,
@@ -113,7 +113,7 @@
                      tnl_draw_func draw )
 {
    struct gl_array_attributes tmp_attribs[VERT_ATTRIB_MAX];
-   struct gl_vertex_array tmp_arrays[VERT_ATTRIB_MAX];
+   struct tnl_vertex_array tmp_arrays[VERT_ATTRIB_MAX];
 
    struct _mesa_index_buffer tmp_ib;
    struct _mesa_prim *tmp_prims = NULL;
diff --git a/src/mesa/tnl/t_rebase.h b/src/mesa/tnl/t_rebase.h
index ce2e8b0..d0aa9e1 100644
--- a/src/mesa/tnl/t_rebase.h
+++ b/src/mesa/tnl/t_rebase.h
@@ -28,7 +28,7 @@
 #include "tnl.h"
 
 void t_rebase_prims( struct gl_context *ctx,
-                     const struct gl_vertex_array *arrays,
+                     const struct tnl_vertex_array *arrays,
                      const struct _mesa_prim *prim,
                      GLuint nr_prims,
                      const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/tnl/t_split.c b/src/mesa/tnl/t_split.c
index b98bd40..d7aac10e 100644
--- a/src/mesa/tnl/t_split.c
+++ b/src/mesa/tnl/t_split.c
@@ -100,7 +100,7 @@
 
 void
 _tnl_split_prims(struct gl_context *ctx,
-                 const struct gl_vertex_array arrays[],
+                 const struct tnl_vertex_array arrays[],
                  const struct _mesa_prim *prim,
                  GLuint nr_prims,
                  const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/tnl/t_split.h b/src/mesa/tnl/t_split.h
index ced7d30..49017e5 100644
--- a/src/mesa/tnl/t_split.h
+++ b/src/mesa/tnl/t_split.h
@@ -51,7 +51,7 @@
 
 void
 _tnl_split_inplace(struct gl_context *ctx,
-                   const struct gl_vertex_array arrays[],
+                   const struct tnl_vertex_array arrays[],
                    const struct _mesa_prim *prim,
                    GLuint nr_prims,
                    const struct _mesa_index_buffer *ib,
@@ -64,7 +64,7 @@
  */
 void
 _tnl_split_copy(struct gl_context *ctx,
-                const struct gl_vertex_array arrays[],
+                const struct tnl_vertex_array arrays[],
                 const struct _mesa_prim *prim,
                 GLuint nr_prims,
                 const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/tnl/t_split_copy.c b/src/mesa/tnl/t_split_copy.c
index 30fe500..085ae9a 100644
--- a/src/mesa/tnl/t_split_copy.c
+++ b/src/mesa/tnl/t_split_copy.c
@@ -53,7 +53,7 @@
  */
 struct copy_context {
    struct gl_context *ctx;
-   const struct gl_vertex_array *array;
+   const struct tnl_vertex_array *array;
    const struct _mesa_prim *prim;
    GLuint nr_prims;
    const struct _mesa_index_buffer *ib;
@@ -64,7 +64,7 @@
    struct {
       GLuint attr;
       GLuint size;
-      const struct gl_vertex_array *array;
+      const struct tnl_vertex_array *array;
       const GLubyte *src_ptr;
 
       struct gl_vertex_buffer_binding dstbinding;
@@ -73,7 +73,7 @@
    } varying[VERT_ATTRIB_MAX];
    GLuint nr_varying;
 
-   struct gl_vertex_array dstarray[VERT_ATTRIB_MAX];
+   struct tnl_vertex_array dstarray[VERT_ATTRIB_MAX];
    struct _mesa_index_buffer dstib;
 
    GLuint *translated_elt_buf;
@@ -113,6 +113,18 @@
 
 
 /**
+ * Shallow copy one vertex array to another.
+ */
+static inline void
+copy_vertex_array(struct tnl_vertex_array *dst,
+                  const struct tnl_vertex_array *src)
+{
+   dst->VertexAttrib = src->VertexAttrib;
+   dst->BufferBinding = src->BufferBinding;
+}
+
+
+/**
  * Starts returning true slightly before the buffer fills, to ensure
  * that there is sufficient room for any remaining vertices to finish
  * off the prim:
@@ -142,7 +154,7 @@
  */
 static void
 dump_draw_info(struct gl_context *ctx,
-               const struct gl_vertex_array *arrays,
+               const struct tnl_vertex_array *arrays,
                const struct _mesa_prim *prims,
                GLuint nr_prims,
                const struct _mesa_index_buffer *ib,
@@ -157,7 +169,7 @@
       printf("  Prim mode 0x%x\n", prims[i].mode);
       printf("  IB: %p\n", (void*) ib);
       for (j = 0; j < VERT_ATTRIB_MAX; j++) {
-         const struct gl_vertex_array *array = &arrays[j];
+         const struct tnl_vertex_array *array = &arrays[j];
          const struct gl_vertex_buffer_binding *binding
             = array->BufferBinding;
          const struct gl_array_attributes *attrib = array->VertexAttrib;
@@ -254,7 +266,7 @@
       GLuint i;
 
       for (i = 0; i < copy->nr_varying; i++) {
-         const struct gl_vertex_array *srcarray = copy->varying[i].array;
+         const struct tnl_vertex_array *srcarray = copy->varying[i].array;
          const struct gl_vertex_buffer_binding* srcbinding
             = srcarray->BufferBinding;
          const GLubyte *srcptr
@@ -432,11 +444,11 @@
     */
    copy->vertex_size = 0;
    for (i = 0; i < VERT_ATTRIB_MAX; i++) {
-      const struct gl_vertex_array *array = &copy->array[i];
+      const struct tnl_vertex_array *array = &copy->array[i];
       const struct gl_vertex_buffer_binding *binding = array->BufferBinding;
 
       if (binding->Stride == 0) {
-         _mesa_copy_vertex_array(&copy->dstarray[i], array);
+         copy_vertex_array(&copy->dstarray[i], array);
       }
       else {
          const struct gl_array_attributes *attrib = array->VertexAttrib;
@@ -517,9 +529,9 @@
    /* Setup new vertex arrays to point into the output buffer:
     */
    for (offset = 0, i = 0; i < copy->nr_varying; i++) {
-      const struct gl_vertex_array *src = copy->varying[i].array;
+      const struct tnl_vertex_array *src = copy->varying[i].array;
       const struct gl_array_attributes *srcattr = src->VertexAttrib;
-      struct gl_vertex_array *dst = &copy->dstarray[copy->varying[i].attr];
+      struct tnl_vertex_array *dst = &copy->dstarray[copy->varying[i].attr];
       struct gl_vertex_buffer_binding *dstbind = &copy->varying[i].dstbinding;
       struct gl_array_attributes *dstattr = &copy->varying[i].dstattribs;
 
@@ -591,7 +603,7 @@
  */
 void
 _tnl_split_copy(struct gl_context *ctx,
-                const struct gl_vertex_array *arrays,
+                const struct tnl_vertex_array *arrays,
                 const struct _mesa_prim *prim,
                 GLuint nr_prims,
                 const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/tnl/t_split_inplace.c b/src/mesa/tnl/t_split_inplace.c
index 15a0986..8e9ecb7 100644
--- a/src/mesa/tnl/t_split_inplace.c
+++ b/src/mesa/tnl/t_split_inplace.c
@@ -43,7 +43,7 @@
  */
 struct split_context {
    struct gl_context *ctx;
-   const struct gl_vertex_array *array;
+   const struct tnl_vertex_array *array;
    const struct _mesa_prim *prim;
    GLuint nr_prims;
    const struct _mesa_index_buffer *ib;
@@ -265,7 +265,7 @@
 
 void
 _tnl_split_inplace(struct gl_context *ctx,
-                   const struct gl_vertex_array *arrays,
+                   const struct tnl_vertex_array *arrays,
                    const struct _mesa_prim *prim,
                    GLuint nr_prims,
                    const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/tnl/tnl.h b/src/mesa/tnl/tnl.h
index e506aee..5d84a1c 100644
--- a/src/mesa/tnl/tnl.h
+++ b/src/mesa/tnl/tnl.h
@@ -30,7 +30,6 @@
 
 #include "main/glheader.h"
 
-struct gl_vertex_array;
 struct gl_context;
 struct gl_program;
 struct gl_buffer_object;
@@ -66,7 +65,22 @@
 extern void
 _tnl_need_projected_coords( struct gl_context *ctx, GLboolean flag );
 
-extern void
+
+/**
+ * Vertex array information which is derived from gl_array_attributes
+ * and gl_vertex_buffer_binding information.  Used by the TNL module and
+ * device drivers.
+ */
+struct tnl_vertex_array
+{
+   /** Vertex attribute array */
+   const struct gl_array_attributes *VertexAttrib;
+   /** Vertex buffer binding */
+   const struct gl_vertex_buffer_binding *BufferBinding;
+};
+
+
+extern const struct tnl_vertex_array*
 _tnl_bind_inputs( struct gl_context *ctx );
 
 
@@ -86,7 +100,7 @@
 
 void
 _tnl_draw_prims(struct gl_context *ctx,
-                const struct gl_vertex_array *arrays,
+                const struct tnl_vertex_array *arrays,
 		     const struct _mesa_prim *prim,
 		     GLuint nr_prims,
 		     const struct _mesa_index_buffer *ib,
@@ -153,7 +167,7 @@
  *                  This may be deprecated in the future
  */
 typedef void (*tnl_draw_func)(struct gl_context *ctx,
-                              const struct gl_vertex_array* arrays,
+                              const struct tnl_vertex_array* arrays,
                               const struct _mesa_prim *prims,
                               GLuint nr_prims,
                               const struct _mesa_index_buffer *ib,
@@ -181,7 +195,7 @@
 
 void
 _tnl_split_prims(struct gl_context *ctx,
-                 const struct gl_vertex_array *arrays,
+                 const struct tnl_vertex_array *arrays,
                  const struct _mesa_prim *prim,
                  GLuint nr_prims,
                  const struct _mesa_index_buffer *ib,
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 9b15066..4e3f159 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -38,9 +38,7 @@
 extern "C" {
 #endif
 
-struct gl_vertex_array;
 struct gl_context;
-struct gl_transform_feedback_object;
 
 struct _mesa_prim
 {
@@ -151,39 +149,12 @@
                          struct gl_buffer_object *indirect);
 
 
-/**
- * Utility that tracks and updates the current array entries.
- */
-struct vbo_inputs
-{
-   /**
-    * Array of inputs to be set to the _DrawArrays pointer.
-    * The array contains pointers into the _DrawVAO and to the vbo modules
-    * current values. The array of pointers is updated incrementally
-    * based on the current and vertex_processing_mode values below.
-    */
-   struct gl_vertex_array inputs[VERT_ATTRIB_MAX];
-   /** Those VERT_BIT_'s where the inputs array point to current values. */
-   GLbitfield current;
-   /** Store which aliasing current values - generics or materials - are set. */
-   gl_vertex_processing_mode vertex_processing_mode;
-};
+const struct gl_array_attributes*
+_vbo_current_attrib(const struct gl_context *ctx, gl_vert_attrib attr);
 
 
-/**
- * Initialize inputs.
- */
-void
-_vbo_init_inputs(struct vbo_inputs *inputs);
-
-
-/**
- * Update the gl_vertex_array array inside the vbo_inputs structure
- * provided the current _VPMode, the provided vao and
- * the vao's enabled arrays filtered by the filter bitmask.
- */
-void
-_vbo_update_inputs(struct gl_context *ctx, struct vbo_inputs *inputs);
+const struct gl_vertex_buffer_binding*
+_vbo_current_binding(const struct gl_context *ctx);
 
 
 void GLAPIENTRY
diff --git a/src/mesa/vbo/vbo_context.c b/src/mesa/vbo/vbo_context.c
index ada78ff..cf9405d 100644
--- a/src/mesa/vbo/vbo_context.c
+++ b/src/mesa/vbo/vbo_context.c
@@ -157,7 +157,7 @@
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
 
-   if (ctx->NewState & (_NEW_PROGRAM | _NEW_ARRAY)) {
+   if (ctx->NewState & _NEW_ARRAY) {
       _ae_invalidate_state(ctx);
    }
    if (ctx->NewState & _NEW_EVAL)
@@ -202,7 +202,7 @@
    vbo->VAO = _mesa_new_vao(ctx, ~((GLuint)0));
    /* The exec VAO assumes to have all arributes bound to binding 0 */
    for (unsigned i = 0; i < VERT_ATTRIB_MAX; ++i)
-      _mesa_vertex_attrib_binding(ctx, vbo->VAO, i, 0, false);
+      _mesa_vertex_attrib_binding(ctx, vbo->VAO, i, 0);
 
    _math_init_eval();
 
@@ -234,6 +234,23 @@
 }
 
 
+const struct gl_array_attributes *
+_vbo_current_attrib(const struct gl_context *ctx, gl_vert_attrib attr)
+{
+   const struct vbo_context *vbo = vbo_context_const(ctx);
+   const gl_vertex_processing_mode vmp = ctx->VertexProgram._VPMode;
+   return &vbo->current[_vbo_attribute_alias_map[vmp][attr]];
+}
+
+
+const struct gl_vertex_buffer_binding *
+_vbo_current_binding(const struct gl_context *ctx)
+{
+   const struct vbo_context *vbo = vbo_context_const(ctx);
+   return &vbo->binding;
+}
+
+
 /*
  * Helper function for _vbo_draw_indirect below that additionally takes a zero
  * initialized array of _mesa_prim scratch space memory as the last argument.
diff --git a/src/mesa/vbo/vbo_exec.c b/src/mesa/vbo/vbo_exec.c
index 357ec1d..34dbc00 100644
--- a/src/mesa/vbo/vbo_exec.c
+++ b/src/mesa/vbo/vbo_exec.c
@@ -239,89 +239,3 @@
    p0->count += p1->count;
    p0->end = p1->end;
 }
-
-
-void
-_vbo_init_inputs(struct vbo_inputs *inputs)
-{
-   inputs->current = 0;
-   inputs->vertex_processing_mode = VP_MODE_FF;
-}
-
-
-/**
- * Update the vbo_inputs's arrays to point to the vao->_VertexArray arrays
- * according to the 'enable' bitmask.
- * \param enable  bitfield of VERT_BIT_x flags.
- */
-static inline void
-update_vao_inputs(struct gl_context *ctx,
-                  struct vbo_inputs *inputs, GLbitfield enable)
-{
-   const struct gl_vertex_array_object *vao = ctx->Array._DrawVAO;
-
-   /* Make sure we process only arrays enabled in the VAO */
-   assert((enable & ~_mesa_get_vao_vp_inputs(vao)) == 0);
-
-   /* Fill in the client arrays from the VAO */
-   const GLubyte *const map = _mesa_vao_attribute_map[vao->_AttributeMapMode];
-   const struct gl_array_attributes *attribs = &vao->VertexAttrib[0];
-   const struct gl_vertex_buffer_binding *bindings = &vao->BufferBinding[0];
-   while (enable) {
-      const int attr = u_bit_scan(&enable);
-      struct gl_vertex_array *input = &inputs->inputs[attr];
-      const struct gl_array_attributes *attrib = &attribs[map[attr]];
-      input->VertexAttrib = attrib;
-      input->BufferBinding = &bindings[attrib->BufferBindingIndex];
-   }
-}
-
-
-/**
- * Update the vbo_inputs's arrays to point to the vbo->currval arrays
- * according to the 'current' bitmask.
- * \param current  bitfield of VERT_BIT_x flags.
- */
-static inline void
-update_current_inputs(struct gl_context *ctx,
-                      struct vbo_inputs *inputs, GLbitfield current)
-{
-   gl_vertex_processing_mode mode = ctx->VertexProgram._VPMode;
-
-   /* All previously non current array pointers need update. */
-   GLbitfield mask = current & ~inputs->current;
-   /* On mode change, the slots aliasing with materials need update too */
-   if (mode != inputs->vertex_processing_mode)
-      mask |= current & VERT_BIT_MAT_ALL;
-
-   struct vbo_context *vbo = vbo_context(ctx);
-   const struct gl_array_attributes *const currval = &vbo->current[0];
-   const GLubyte *const map = _vbo_attribute_alias_map[mode];
-   while (mask) {
-      const int attr = u_bit_scan(&mask);
-      struct gl_vertex_array *input = &inputs->inputs[attr];
-      input->VertexAttrib = &currval[map[attr]];
-      input->BufferBinding = &vbo->binding;
-   }
-
-   inputs->current = current;
-   inputs->vertex_processing_mode = mode;
-}
-
-
-/**
- * Update the vbo_inputs's arrays to point to the vao->_VertexArray and
- * vbo->currval arrays according to Array._DrawVAO and
- * Array._DrawVAOEnableAttribs.
- */
-void
-_vbo_update_inputs(struct gl_context *ctx, struct vbo_inputs *inputs)
-{
-   const GLbitfield enable = ctx->Array._DrawVAOEnabledAttribs;
-
-   /* Update array input pointers */
-   update_vao_inputs(ctx, inputs, enable);
-
-   /* The rest must be current inputs. */
-   update_current_inputs(ctx, inputs, ~enable & VERT_BIT_ALL);
-}
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index b397171..20148ac 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -40,7 +40,7 @@
 #include "main/state.h"
 #include "main/light.h"
 #include "main/api_arrayelt.h"
-#include "main/api_validate.h"
+#include "main/draw_validate.h"
 #include "main/dispatch.h"
 #include "util/bitscan.h"
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index b3ce138..51c000e 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -31,7 +31,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/state.h"
-#include "main/api_validate.h"
+#include "main/draw_validate.h"
 #include "main/dispatch.h"
 #include "main/varray.h"
 #include "main/bufferobj.h"
@@ -39,6 +39,21 @@
 #include "main/macros.h"
 #include "main/transformfeedback.h"
 
+typedef struct {
+   GLuint count;
+   GLuint primCount;
+   GLuint first;
+   GLuint baseInstance;
+} DrawArraysIndirectCommand;
+
+typedef struct {
+   GLuint count;
+   GLuint primCount;
+   GLuint firstIndex;
+   GLint  baseVertex;
+   GLuint baseInstance;
+} DrawElementsIndirectCommand;
+
 
 /**
  * Check that element 'j' of the array has reasonable data.
@@ -530,9 +545,9 @@
       _mesa_debug(ctx, "glDrawArrays(%s, %d, %d)\n",
                   _mesa_enum_to_string(mode), start, count);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -568,10 +583,9 @@
       _mesa_debug(ctx, "glDrawArraysInstanced(%s, %d, %d, %d)\n",
                   _mesa_enum_to_string(mode), start, count, numInstances);
 
+   FLUSH_FOR_DRAW(ctx);
 
    if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
-
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -610,9 +624,9 @@
                   _mesa_enum_to_string(mode), first, count,
                   numInstances, baseInstance);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -650,9 +664,9 @@
                   "glMultiDrawArrays(%s, %p, %p, %d)\n",
                   _mesa_enum_to_string(mode), first, count, primcount);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -873,9 +887,9 @@
                   _mesa_enum_to_string(mode), start, end, count,
                   _mesa_enum_to_string(type), indices, basevertex);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -984,9 +998,9 @@
                   _mesa_enum_to_string(mode), count,
                   _mesa_enum_to_string(type), indices);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1017,9 +1031,9 @@
                   _mesa_enum_to_string(mode), count,
                   _mesa_enum_to_string(type), indices);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1050,9 +1064,9 @@
                   _mesa_enum_to_string(mode), count,
                   _mesa_enum_to_string(type), indices);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1089,9 +1103,9 @@
                   _mesa_enum_to_string(type), indices,
                   numInstances, basevertex);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1130,9 +1144,9 @@
                   _mesa_enum_to_string(type), indices,
                   numInstances, baseInstance);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1173,9 +1187,9 @@
                   _mesa_enum_to_string(type), indices,
                   numInstances, basevertex, baseInstance);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1337,6 +1351,8 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
+   FLUSH_FOR_DRAW(ctx);
+
    _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
    if (!_mesa_validate_MultiDrawElements(ctx, mode, count, type, indices,
@@ -1360,9 +1376,9 @@
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1398,9 +1414,9 @@
 {
    struct _mesa_prim prim;
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1615,9 +1631,26 @@
       _mesa_debug(ctx, "glDrawArraysIndirect(%s, %p)\n",
                   _mesa_enum_to_string(mode), indirect);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   /* From the ARB_draw_indirect spec:
+    *
+    *    "Initially zero is bound to DRAW_INDIRECT_BUFFER. In the
+    *    compatibility profile, this indicates that DrawArraysIndirect and
+    *    DrawElementsIndirect are to source their arguments directly from the
+    *    pointer passed as their <indirect> parameters."
+    */
+   if (ctx->API == API_OPENGL_COMPAT &&
+       !_mesa_is_bufferobj(ctx->DrawIndirectBuffer)) {
+      DrawArraysIndirectCommand *cmd = (DrawArraysIndirectCommand *) indirect;
 
+      vbo_exec_DrawArraysInstancedBaseInstance(mode, cmd->first, cmd->count,
+                                               cmd->primCount,
+                                               cmd->baseInstance);
+      return;
+   }
+
+   FLUSH_FOR_DRAW(ctx);
+
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1646,9 +1679,46 @@
                   _mesa_enum_to_string(mode),
                   _mesa_enum_to_string(type), indirect);
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   /* From the ARB_draw_indirect spec:
+    *
+    *    "Initially zero is bound to DRAW_INDIRECT_BUFFER. In the
+    *    compatibility profile, this indicates that DrawArraysIndirect and
+    *    DrawElementsIndirect are to source their arguments directly from the
+    *    pointer passed as their <indirect> parameters."
+    */
+   if (ctx->API == API_OPENGL_COMPAT &&
+       !_mesa_is_bufferobj(ctx->DrawIndirectBuffer)) {
+      /*
+       * Unlike regular DrawElementsInstancedBaseVertex commands, the indices
+       * may not come from a client array and must come from an index buffer.
+       * If no element array buffer is bound, an INVALID_OPERATION error is
+       * generated.
+       */
+      if (!_mesa_is_bufferobj(ctx->Array.VAO->IndexBufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glDrawElementsIndirect(no buffer bound "
+                     "to GL_ELEMENT_ARRAY_BUFFER)");
+      } else {
+         DrawElementsIndirectCommand *cmd =
+            (DrawElementsIndirectCommand *) indirect;
 
+         /* Convert offset to pointer */
+         void *offset = (void *)
+            ((cmd->firstIndex * _mesa_sizeof_type(type)) & 0xffffffffUL);
+
+         vbo_exec_DrawElementsInstancedBaseVertexBaseInstance(mode, cmd->count,
+                                                              type, offset,
+                                                              cmd->primCount,
+                                                              cmd->baseVertex,
+                                                              cmd->baseInstance);
+      }
+
+      return;
+   }
+
+   FLUSH_FOR_DRAW(ctx);
+
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1679,11 +1749,42 @@
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)
-      stride = 4 * sizeof(GLuint);      /* sizeof(DrawArraysIndirectCommand) */
+      stride = sizeof(DrawArraysIndirectCommand);
+
+   /* From the ARB_draw_indirect spec:
+    *
+    *    "Initially zero is bound to DRAW_INDIRECT_BUFFER. In the
+    *    compatibility profile, this indicates that DrawArraysIndirect and
+    *    DrawElementsIndirect are to source their arguments directly from the
+    *    pointer passed as their <indirect> parameters."
+    */
+   if (ctx->API == API_OPENGL_COMPAT &&
+       !_mesa_is_bufferobj(ctx->DrawIndirectBuffer)) {
+
+      if (!_mesa_valid_draw_indirect_multi(ctx, primcount, stride,
+                                           "glMultiDrawArraysIndirect"))
+         return;
+
+      const ubyte *ptr = (const ubyte *) indirect;
+      for (unsigned i = 0; i < primcount; i++) {
+         DrawArraysIndirectCommand *cmd = (DrawArraysIndirectCommand *) ptr;
+         vbo_exec_DrawArraysInstancedBaseInstance(mode, cmd->first,
+                                                  cmd->count, cmd->primCount,
+                                                  cmd->baseInstance);
+
+         if (stride == 0) {
+            ptr += sizeof(DrawArraysIndirectCommand);
+         } else {
+            ptr += stride;
+         }
+      }
+
+      return;
+   }
+
+   FLUSH_FOR_DRAW(ctx);
 
    if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
-
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1718,11 +1819,53 @@
 
    /* If <stride> is zero, the array elements are treated as tightly packed. */
    if (stride == 0)
-      stride = 5 * sizeof(GLuint);      /* sizeof(DrawElementsIndirectCommand) */
+      stride = sizeof(DrawElementsIndirectCommand);
+
+
+   /* From the ARB_draw_indirect spec:
+    *
+    *    "Initially zero is bound to DRAW_INDIRECT_BUFFER. In the
+    *    compatibility profile, this indicates that DrawArraysIndirect and
+    *    DrawElementsIndirect are to source their arguments directly from the
+    *    pointer passed as their <indirect> parameters."
+    */
+   if (ctx->API == API_OPENGL_COMPAT &&
+       !_mesa_is_bufferobj(ctx->DrawIndirectBuffer)) {
+      /*
+       * Unlike regular DrawElementsInstancedBaseVertex commands, the indices
+       * may not come from a client array and must come from an index buffer.
+       * If no element array buffer is bound, an INVALID_OPERATION error is
+       * generated.
+       */
+      if (!_mesa_is_bufferobj(ctx->Array.VAO->IndexBufferObj)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION,
+                     "glMultiDrawElementsIndirect(no buffer bound "
+                     "to GL_ELEMENT_ARRAY_BUFFER)");
+
+         return;
+      }
+
+      if (!_mesa_valid_draw_indirect_multi(ctx, primcount, stride,
+                                           "glMultiDrawElementsIndirect"))
+         return;
+
+      const ubyte *ptr = (const ubyte *) indirect;
+      for (unsigned i = 0; i < primcount; i++) {
+         vbo_exec_DrawElementsIndirect(mode, type, ptr);
+
+         if (stride == 0) {
+            ptr += sizeof(DrawElementsIndirectCommand);
+         } else {
+            ptr += stride;
+         }
+      }
+
+      return;
+   }
+
+   FLUSH_FOR_DRAW(ctx);
 
    if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
-
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1815,9 +1958,9 @@
    if (stride == 0)
       stride = 4 * sizeof(GLuint);      /* sizeof(DrawArraysIndirectCommand) */
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1860,9 +2003,9 @@
    if (stride == 0)
       stride = 5 * sizeof(GLuint);      /* sizeof(DrawElementsIndirectCommand) */
 
-   if (_mesa_is_no_error_enabled(ctx)) {
-      FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
+   if (_mesa_is_no_error_enabled(ctx)) {
       _mesa_set_draw_vao(ctx, ctx->Array.VAO, enabled_filter(ctx));
 
       if (ctx->NewState)
@@ -1932,20 +2075,11 @@
                                                       vbo_exec_DrawElementsInstancedBaseVertexBaseInstance);
    }
 
-   if (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx)) {
+   if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles31(ctx)) {
       SET_DrawArraysIndirect(exec, vbo_exec_DrawArraysIndirect);
       SET_DrawElementsIndirect(exec, vbo_exec_DrawElementsIndirect);
    }
 
-   if (ctx->API == API_OPENGL_CORE) {
-      SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
-      SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
-      SET_MultiDrawArraysIndirectCountARB(exec,
-                                          vbo_exec_MultiDrawArraysIndirectCount);
-      SET_MultiDrawElementsIndirectCountARB(exec,
-                                            vbo_exec_MultiDrawElementsIndirectCount);
-   }
-
    if (_mesa_is_desktop_gl(ctx) || _mesa_is_gles3(ctx)) {
       SET_DrawArraysInstancedARB(exec, vbo_exec_DrawArraysInstanced);
       SET_DrawElementsInstancedARB(exec, vbo_exec_DrawElementsInstanced);
@@ -1959,6 +2093,12 @@
                                          vbo_exec_DrawTransformFeedbackInstanced);
       SET_DrawTransformFeedbackStreamInstanced(exec,
                                                vbo_exec_DrawTransformFeedbackStreamInstanced);
+      SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
+      SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
+      SET_MultiDrawArraysIndirectCountARB(exec,
+                                          vbo_exec_MultiDrawArraysIndirectCount);
+      SET_MultiDrawElementsIndirectCountARB(exec,
+                                            vbo_exec_MultiDrawElementsIndirectCount);
    }
 }
 
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 31d7700..8d74725 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -195,15 +195,14 @@
    GLbitfield mask = vao->_Enabled & ~vao_enabled;
    while (mask) {
       const int vao_attr = u_bit_scan(&mask);
-      _mesa_disable_vertex_array_attrib(ctx, vao, vao_attr, false);
+      _mesa_disable_vertex_array_attrib(ctx, vao, vao_attr);
    }
    assert((~vao_enabled & vao->_Enabled) == 0);
 
    /* Bind the buffer object */
    const GLuint stride = exec->vtx.vertex_size*sizeof(GLfloat);
-   assert(stride <= ctx->Const.MaxVertexAttribStride);
    _mesa_bind_vertex_buffer(ctx, vao, 0, exec->vtx.bufferobj, buffer_offset,
-                            stride, false);
+                            stride);
 
    /* Retrieve the mapping from VBO_ATTRIB to VERT_ATTRIB space
     * Note that the position/generic0 aliasing is done in the VAO.
@@ -225,7 +224,7 @@
       _vbo_set_attrib_format(ctx, vao, vao_attr, buffer_offset,
                              size, type, offset);
       if ((vao->_Enabled & VERT_BIT(vao_attr)) == 0)
-         _mesa_enable_vertex_array_attrib(ctx, vao, vao_attr, false);
+         _mesa_enable_vertex_array_attrib(ctx, vao, vao_attr);
 
       /* The vao is initially created with all bindings set to 0. */
       assert(vao->VertexAttrib[vao_attr].BufferBindingIndex == 0);
diff --git a/src/mesa/vbo/vbo_private.h b/src/mesa/vbo/vbo_private.h
index 95d67d1..86f6b41 100644
--- a/src/mesa/vbo/vbo_private.h
+++ b/src/mesa/vbo/vbo_private.h
@@ -60,6 +60,13 @@
 }
 
 
+static inline const struct vbo_context *
+vbo_context_const(const struct gl_context *ctx)
+{
+   return ctx->vbo_context;
+}
+
+
 /**
  * Array to apply the fixed function material aliasing map to
  * an attribute value used in vbo processing inputs to an attribute
@@ -207,9 +214,17 @@
 {
    const GLboolean integer = vbo_attrtype_to_integer_flag(type);
    const GLboolean doubles = vbo_attrtype_to_double_flag(type);
+
+   if (doubles)
+      size /= 2;
    _mesa_update_array_format(ctx, vao, attr, size, type, GL_RGBA,
                              GL_FALSE, integer, doubles, offset);
-   /* Ptr for userspace arrays */
+   /* Ptr for userspace arrays.
+    * For updating the pointer we would need to add the vao->NewArrays flag
+    * to the VAO. But that is done already unconditionally in
+    * _mesa_update_array_format called above.
+    */
+   assert((vao->NewArrays | ~vao->_Enabled) & VERT_BIT(attr));
    vao->VertexAttrib[attr].Ptr = ADD_POINTERS(buffer_offset, offset);
 }
 
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 71304ed..d5b43d0 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -75,7 +75,7 @@
 #include "main/enums.h"
 #include "main/eval.h"
 #include "main/macros.h"
-#include "main/api_validate.h"
+#include "main/draw_validate.h"
 #include "main/api_arrayelt.h"
 #include "main/vtxfmt.h"
 #include "main/dispatch.h"
@@ -499,7 +499,7 @@
     */
 
    /* Bind the buffer object at binding point 0 */
-   _mesa_bind_vertex_buffer(ctx, *vao, 0, bo, buffer_offset, stride, false);
+   _mesa_bind_vertex_buffer(ctx, *vao, 0, bo, buffer_offset, stride);
 
    /* Retrieve the mapping from VBO_ATTRIB to VERT_ATTRIB space
     * Note that the position/generic0 aliasing is done in the VAO.
@@ -514,8 +514,8 @@
 
       _vbo_set_attrib_format(ctx, *vao, vao_attr, buffer_offset,
                              size[vbo_attr], type[vbo_attr], offset[vbo_attr]);
-      _mesa_vertex_attrib_binding(ctx, *vao, vao_attr, 0, false);
-      _mesa_enable_vertex_array_attrib(ctx, *vao, vao_attr, false);
+      _mesa_vertex_attrib_binding(ctx, *vao, vao_attr, 0);
+      _mesa_enable_vertex_array_attrib(ctx, *vao, vao_attr);
    }
    assert(vao_enabled == (*vao)->_Enabled);
    assert((vao_enabled & ~(*vao)->VertexAttribBufferMask) == 0);
@@ -791,9 +791,12 @@
       const int i = u_bit_scan64(&enabled);
       assert(save->attrsz[i]);
 
-      save->currentsz[i][0] = save->attrsz[i];
-      COPY_CLEAN_4V_TYPE_AS_UNION(save->current[i], save->attrsz[i],
-                                  save->attrptr[i], save->attrtype[i]);
+      if (save->attrtype[i] == GL_DOUBLE ||
+          save->attrtype[i] == GL_UNSIGNED_INT64_ARB)
+         memcpy(save->current[i], save->attrptr[i], save->attrsz[i] * sizeof(GLfloat));
+      else
+         COPY_CLEAN_4V_TYPE_AS_UNION(save->current[i], save->attrsz[i],
+                                     save->attrptr[i], save->attrtype[i]);
    }
 }
 
@@ -935,11 +938,13 @@
  * get a glTexCoord4f() or glTexCoord1f() call.
  */
 static void
-fixup_vertex(struct gl_context *ctx, GLuint attr, GLuint sz)
+fixup_vertex(struct gl_context *ctx, GLuint attr,
+             GLuint sz, GLenum newType)
 {
    struct vbo_save_context *save = &vbo_context(ctx)->save;
 
-   if (sz > save->attrsz[attr]) {
+   if (sz > save->attrsz[attr] ||
+       newType != save->attrtype[attr]) {
       /* New size is larger.  Need to flush existing vertices and get
        * an enlarged vertex format.
        */
@@ -994,9 +999,10 @@
 #define ATTR_UNION(A, N, T, C, V0, V1, V2, V3)			\
 do {								\
    struct vbo_save_context *save = &vbo_context(ctx)->save;	\
+   int sz = (sizeof(C) / sizeof(GLfloat));			\
 								\
    if (save->active_sz[A] != N)					\
-      fixup_vertex(ctx, A, N);					\
+      fixup_vertex(ctx, A, N * sz, T);				\
 								\
    {								\
       C *dest = (C *)save->attrptr[A];                          \
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index f4b2c80..409a353 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -54,16 +54,24 @@
       struct gl_array_attributes *currval = &vbo->current[shift + i];
       const GLubyte size = attrib->Size;
       const GLenum16 type = attrib->Type;
-      fi_type tmp[4];
+      fi_type tmp[8];
+      int dmul = 1;
 
-      COPY_CLEAN_4V_TYPE_AS_UNION(tmp, size, *data, type);
+      if (type == GL_DOUBLE ||
+          type == GL_UNSIGNED_INT64_ARB)
+         dmul = 2;
+
+      if (dmul == 2)
+         memcpy(tmp, *data, size * dmul * sizeof(GLfloat));
+      else
+         COPY_CLEAN_4V_TYPE_AS_UNION(tmp, size, *data, type);
 
       if (type != currval->Type ||
-          memcmp(currval->Ptr, tmp, 4 * sizeof(GLfloat)) != 0) {
-         memcpy((fi_type*)currval->Ptr, tmp, 4 * sizeof(GLfloat));
+          memcmp(currval->Ptr, tmp, 4 * sizeof(GLfloat) * dmul) != 0) {
+         memcpy((fi_type*)currval->Ptr, tmp, 4 * sizeof(GLfloat) * dmul);
 
          currval->Size = size;
-         currval->_ElementSize = size * sizeof(GLfloat);
+         currval->_ElementSize = size * sizeof(GLfloat) * dmul;
          currval->Type = type;
          currval->Integer = vbo_attrtype_to_integer_flag(type);
          currval->Doubles = vbo_attrtype_to_double_flag(type);
@@ -168,7 +176,7 @@
       remap_vertex_store = GL_TRUE;
    }
 
-   FLUSH_CURRENT(ctx, 0);
+   FLUSH_FOR_DRAW(ctx);
 
    if (node->prim_count > 0) {
 
diff --git a/src/meson.build b/src/meson.build
index c2566b7..5cfc4f9 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -51,12 +51,16 @@
 subdir('mapi')
 # TODO: opengl
 subdir('compiler')
-subdir('egl/wayland/wayland-drm')
-subdir('vulkan')
+if with_platform_wayland
+  subdir('egl/wayland/wayland-drm')
+endif
+if with_any_vk
+  subdir('vulkan')
+endif
 if with_gallium_radeonsi or with_amd_vk
   subdir('amd')
 endif
-if with_gallium_vc4
+if with_gallium_vc4 or with_gallium_v3d
   subdir('broadcom')
 endif
 if with_dri_i965 or with_intel_vk
diff --git a/src/util/BUILD.gn b/src/util/BUILD.gn
index 2bb187f..d4b33e9 100644
--- a/src/util/BUILD.gn
+++ b/src/util/BUILD.gn
@@ -53,6 +53,7 @@
     "texcompress_rgtc_tmp.h",
     "u_atomic.h",
     "u_vector.h",
+    "vma.h",
   ]
 }
 
@@ -86,6 +87,7 @@
     "sha1/sha1.c",
     "strtod.c",
     "u_vector.c",
+    "vma.c",
   ]
 }
 
diff --git a/src/util/Makefile.am b/src/util/Makefile.am
index 07bf052..1e14308 100644
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@@ -22,7 +22,12 @@
 SUBDIRS = . \
 	xmlpool \
 	tests/hash_table \
-	tests/string_buffer
+	tests/string_buffer \
+	tests/set
+
+if HAVE_STD_CXX11
+SUBDIRS += tests/vma
+endif
 
 include Makefile.sources
 
@@ -54,7 +59,8 @@
 	$(PTHREAD_LIBS) \
 	$(CLOCK_LIB) \
 	$(ZLIB_LIBS) \
-	$(LIBATOMIC_LIBS)
+	$(LIBATOMIC_LIBS) \
+	-lm
 
 libxmlconfig_la_SOURCES = $(XMLCONFIG_FILES)
 libxmlconfig_la_CFLAGS = \
diff --git a/src/util/Makefile.sources b/src/util/Makefile.sources
index 104ecae..531fd83 100644
--- a/src/util/Makefile.sources
+++ b/src/util/Makefile.sources
@@ -24,12 +24,16 @@
 	mesa-sha1.h \
 	os_time.c \
 	os_time.h \
+	u_process.c \
+	u_process.h \
 	sha1/sha1.c \
 	sha1/sha1.h \
 	ralloc.c \
 	ralloc.h \
 	rand_xor.c \
 	rand_xor.h \
+	rb_tree.c \
+	rb_tree.h \
 	register_allocate.c \
 	register_allocate.h \
 	rgtc.c \
@@ -56,7 +60,9 @@
 	u_string.h \
 	u_thread.h \
 	u_vector.c \
-	u_vector.h
+	u_vector.h \
+	vma.c \
+	vma.h
 
 MESA_UTIL_GENERATED_FILES = \
 	format_srgb.c
diff --git a/src/util/bitscan.h b/src/util/bitscan.h
index 5cc75f0..dc89ac9 100644
--- a/src/util/bitscan.h
+++ b/src/util/bitscan.h
@@ -123,6 +123,17 @@
    return (v & (v - 1)) == 0;
 }
 
+/* Determine if an uint64_t value is a power of two.
+ *
+ * \note
+ * Zero is treated as a power of two.
+ */
+static inline bool
+util_is_power_of_two_or_zero64(uint64_t v)
+{
+   return (v & (v - 1)) == 0;
+}
+
 /* Determine if an unsigned value is a power of two.
  *
  * \note
diff --git a/src/util/build_id.c b/src/util/build_id.c
index fb67d16..8b4f8f3 100644
--- a/src/util/build_id.c
+++ b/src/util/build_id.c
@@ -28,6 +28,7 @@
 #include <string.h>
 
 #include "build_id.h"
+#include "macros.h"
 
 #ifndef NT_GNU_BUILD_ID
 #define NT_GNU_BUILD_ID 3
@@ -37,8 +38,6 @@
 #define ElfW(type) Elf_##type
 #endif
 
-#define ALIGN(val, align)      (((val) + (align) - 1) & ~((align) - 1))
-
 struct build_id_note {
    ElfW(Nhdr) nhdr;
 
@@ -90,8 +89,8 @@
          }
 
          size_t offset = sizeof(ElfW(Nhdr)) +
-                         ALIGN(note->nhdr.n_namesz, 4) +
-                         ALIGN(note->nhdr.n_descsz, 4);
+                         ALIGN_POT(note->nhdr.n_namesz, 4) +
+                         ALIGN_POT(note->nhdr.n_descsz, 4);
          note = (struct build_id_note *)((char *)note + offset);
          len -= offset;
       }
diff --git a/src/util/disk_cache.c b/src/util/disk_cache.c
index 4a762ef..368ec41 100644
--- a/src/util/disk_cache.c
+++ b/src/util/disk_cache.c
@@ -189,7 +189,7 @@
 } while (0);
 
 struct disk_cache *
-disk_cache_create(const char *gpu_name, const char *timestamp,
+disk_cache_create(const char *gpu_name, const char *driver_id,
                   uint64_t driver_flags)
 {
    void *local;
@@ -376,7 +376,7 @@
     * The queue will resize automatically when it's full, so adding new jobs
     * doesn't stall.
     */
-   util_queue_init(&cache->cache_queue, "disk_cache", 32, 1,
+   util_queue_init(&cache->cache_queue, "disk$", 32, 1,
                    UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                    UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY);
 
@@ -387,9 +387,9 @@
    cache->driver_keys_blob_size = cv_size;
 
    /* Create driver id keys */
-   size_t ts_size = strlen(timestamp) + 1;
+   size_t id_size = strlen(driver_id) + 1;
    size_t gpu_name_size = strlen(gpu_name) + 1;
-   cache->driver_keys_blob_size += ts_size;
+   cache->driver_keys_blob_size += id_size;
    cache->driver_keys_blob_size += gpu_name_size;
 
    /* We sometimes store entire structs that contains a pointers in the cache,
@@ -409,7 +409,7 @@
 
    uint8_t *drv_key_blob = cache->driver_keys_blob;
    DRV_KEY_CPY(drv_key_blob, &cache_version, cv_size)
-   DRV_KEY_CPY(drv_key_blob, timestamp, ts_size)
+   DRV_KEY_CPY(drv_key_blob, driver_id, id_size)
    DRV_KEY_CPY(drv_key_blob, gpu_name, gpu_name_size)
    DRV_KEY_CPY(drv_key_blob, &ptr_size, ptr_size_size)
    DRV_KEY_CPY(drv_key_blob, &driver_flags, driver_flags_size)
diff --git a/src/util/disk_cache.h b/src/util/disk_cache.h
index 50bd9f4..2a147cb 100644
--- a/src/util/disk_cache.h
+++ b/src/util/disk_cache.h
@@ -26,11 +26,14 @@
 
 #ifdef HAVE_DLFCN_H
 #include <dlfcn.h>
+#include <stdio.h>
+#include "util/build_id.h"
 #endif
 #include <assert.h>
 #include <stdint.h>
 #include <stdbool.h>
 #include <sys/stat.h>
+#include "util/mesa-sha1.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -100,7 +103,33 @@
    if (stat(info.dli_fname, &st)) {
       return false;
    }
+
+   if (!st.st_mtime) {
+      fprintf(stderr, "Mesa: The provided filesystem timestamp for the cache "
+              "is bogus! Disabling On-disk cache.\n");
+      return false;
+   }
+
    *timestamp = st.st_mtime;
+
+   return true;
+}
+
+static inline bool
+disk_cache_get_function_identifier(void *ptr, struct mesa_sha1 *ctx)
+{
+   uint32_t timestamp;
+
+#ifdef HAVE_DL_ITERATE_PHDR
+   const struct build_id_note *note = NULL;
+   if ((note = build_id_find_nhdr_for_addr(ptr))) {
+      _mesa_sha1_update(ctx, build_id_data(note), build_id_length(note));
+   } else
+#endif
+   if (disk_cache_get_function_timestamp(ptr, &timestamp)) {
+      _mesa_sha1_update(ctx, &timestamp, sizeof(timestamp));
+   } else
+      return false;
    return true;
 }
 #endif
diff --git a/src/util/drirc b/src/util/drirc
index edf1439..0cd04bc 100644
--- a/src/util/drirc
+++ b/src/util/drirc
@@ -100,6 +100,14 @@
             <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
         </application>
 
+        <application name="RAGE (64-bit)" executable="Rage64.exe">
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
+        </application>
+
+        <application name="RAGE (32-bit)" executable="Rage.exe">
+            <option name="allow_glsl_builtin_variable_redeclaration" value="true" />
+        </application>
+
         <application name="Second Life" executable="do-not-directly-run-secondlife-bin">
             <option name="allow_glsl_extension_directive_midshader" value="true" />
         </application>
@@ -180,6 +188,23 @@
             <option name="allow_glsl_cross_stage_interpolation_mismatch" value="true"/>
         </application>
 
+        <application name="Google Earth VR" executable="Earth.exe">
+            <option name="allow_glsl_builtin_const_expression" value="true"/>
+            <option name="allow_glsl_relaxed_es" value="true"/>
+        </application>
+
+        <application name="No Mans Sky" executable="NMS.exe">
+            <option name="force_glsl_extensions_warn" value="true" />
+        </application>
+
+        <application name="ARMA 3" executable="arma3.x86_64">
+            <option name="glsl_correct_derivatives_after_discard" value="true"/>
+        </application>
+
+        <application name="Wolfenstein The Old Blood" executable="WolfOldBlood_x64.exe">
+            <option name="force_compat_profile" value="true" />
+        </application>
+
         <!-- The GL thread whitelist is below, workarounds are above.
              Keep it that way. -->
 
diff --git a/src/util/format_srgb.h b/src/util/format_srgb.h
index 34b50af..596af56 100644
--- a/src/util/format_srgb.h
+++ b/src/util/format_srgb.h
@@ -55,6 +55,20 @@
 
 
 static inline float
+util_format_srgb_to_linear_float(float cs)
+{
+   if (cs <= 0.0f)
+      return 0.0f;
+   else if (cs <= 0.04045f)
+      return cs / 12.92f;
+   else if (cs < 1.0f)
+      return powf((cs + 0.055) / 1.055f, 2.4f);
+   else
+      return 1.0f;
+}
+
+
+static inline float
 util_format_linear_to_srgb_float(float cl)
 {
    if (cl <= 0.0f)
diff --git a/src/util/format_srgb.py b/src/util/format_srgb.py
index 44b35a0..0b3b561 100644
--- a/src/util/format_srgb.py
+++ b/src/util/format_srgb.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 
 CopyRight = '''
 /**************************************************************************
@@ -57,33 +58,27 @@
 
 
 def generate_srgb_tables():
-    print 'const float'
-    print 'util_format_srgb_8unorm_to_linear_float_table[256] = {'
+    print('const float')
+    print('util_format_srgb_8unorm_to_linear_float_table[256] = {')
     for j in range(0, 256, 4):
-        print '   ',
-        for i in range(j, j + 4):
-            print '%.7e,' % (srgb_to_linear(i / 255.0),),
-        print
-    print '};'
-    print
-    print 'const uint8_t'
-    print 'util_format_srgb_to_linear_8unorm_table[256] = {'
+        print('   ', end=' ')
+        print(' '.join(['%.7e,' % srgb_to_linear(i / 255.0) for i in range(j, j + 4)]))
+    print('};')
+    print()
+    print('const uint8_t')
+    print('util_format_srgb_to_linear_8unorm_table[256] = {')
     for j in range(0, 256, 16):
-        print '   ',
-        for i in range(j, j + 16):
-            print '%3u,' % (int(srgb_to_linear(i / 255.0) * 255.0 + 0.5),),
-        print
-    print '};'
-    print
-    print 'const uint8_t'
-    print 'util_format_linear_to_srgb_8unorm_table[256] = {'
+        print('   ', end=' ')
+        print(' '.join(['%3u,' % int(srgb_to_linear(i / 255.0) * 255.0 + 0.5) for i in range(j, j + 16)]))
+    print('};')
+    print()
+    print('const uint8_t')
+    print('util_format_linear_to_srgb_8unorm_table[256] = {')
     for j in range(0, 256, 16):
-        print '   ',
-        for i in range(j, j + 16):
-            print '%3u,' % (int(linear_to_srgb(i / 255.0) * 255.0 + 0.5),),
-        print
-    print '};'
-    print
+        print('   ', end=' ')
+        print(' '.join(['%3u,' % int(linear_to_srgb(i / 255.0) * 255.0 + 0.5) for i in range(j, j + 16)]))
+    print('};')
+    print()
 
 # calculate the table interpolation values used in float linear to unorm8 srgb
     numexp = 13
@@ -128,25 +123,23 @@
 
         valtable.append((int_a << 16) + int_b)
 
-    print 'const unsigned'
-    print 'util_format_linear_to_srgb_helper_table[104] = {'
+    print('const unsigned')
+    print('util_format_linear_to_srgb_helper_table[104] = {')
 
     for j in range(0, nbuckets, 4):
-        print '   ',
-        for i in range(j, j + 4):
-            print '0x%08x,' % (valtable[i],),
-        print
-    print '};'
-    print
+        print('   ', end=' ')
+        print(' '.join(['0x%08x,' % valtable[i] for i in range(j, j + 4)]))
+    print('};')
+    print()
 
 def main():
-    print '/* This file is autogenerated by u_format_srgb.py. Do not edit directly. */'
-    print
+    print('/* This file is autogenerated by u_format_srgb.py. Do not edit directly. */')
+    print()
     # This will print the copyright message on the top of this file
-    print CopyRight.strip()
-    print
-    print '#include "format_srgb.h"'
-    print
+    print(CopyRight.strip())
+    print()
+    print('#include "format_srgb.h"')
+    print()
     generate_srgb_tables()
 
 
diff --git a/src/util/half_float.c b/src/util/half_float.c
index 4df64c2..2eff2c84 100644
--- a/src/util/half_float.c
+++ b/src/util/half_float.c
@@ -2,6 +2,8 @@
  * Mesa 3-D graphics library
  *
  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright 2015 Philip Taylor <philip@zaynar.co.uk>
+ * Copyright 2018 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -175,3 +177,70 @@
    result = fi.f;
    return result;
 }
+
+/**
+  * Convert 0.0 to 0x00, 1.0 to 0xff.
+  * Values outside the range [0.0, 1.0] will give undefined results.
+  */
+uint8_t _mesa_half_to_unorm8(uint16_t val)
+{
+   const int m = val & 0x3ff;
+   const int e = (val >> 10) & 0x1f;
+   const int s = (val >> 15) & 0x1;
+
+   /* v = round_to_nearest(1.mmmmmmmmmm * 2^(e-15) * 255)
+    *   = round_to_nearest((1.mmmmmmmmmm * 255) * 2^(e-15))
+    *   = round_to_nearest((1mmmmmmmmmm * 255) * 2^(e-25))
+    *   = round_to_zero((1mmmmmmmmmm * 255) * 2^(e-25) + 0.5)
+    *   = round_to_zero(((1mmmmmmmmmm * 255) * 2^(e-24) + 1) / 2)
+    *
+    * This happens to give the correct answer for zero/subnormals too
+    */
+   assert(s == 0 && val <= FP16_ONE); /* check 0 <= this <= 1 */
+   /* (implies e <= 15, which means the bit-shifts below are safe) */
+
+   uint32_t v = ((1 << 10) | m) * 255;
+   v = ((v >> (24 - e)) + 1) >> 1;
+   return v;
+}
+
+/**
+  * Takes a uint16_t, divides by 65536, converts the infinite-precision
+  * result to fp16 with round-to-zero. Used by the ASTC decoder.
+  */
+uint16_t _mesa_uint16_div_64k_to_half(uint16_t v)
+{
+   /* Zero or subnormal. Set the mantissa to (v << 8) and return. */
+   if (v < 4)
+      return v << 8;
+
+   /* Count the leading 0s in the uint16_t */
+#ifdef HAVE___BUILTIN_CLZ
+   int n = __builtin_clz(v) - 16;
+#else
+   int n = 16;
+   for (int i = 15; i >= 0; i--) {
+      if (v & (1 << i)) {
+         n = 15 - i;
+         break;
+      }
+   }
+#endif
+
+   /* Shift the mantissa up so bit 16 is the hidden 1 bit,
+    * mask it off, then shift back down to 10 bits
+    */
+   int m = ( ((uint32_t)v << (n + 1)) & 0xffff ) >> 6;
+
+   /*  (0{n} 1 X{15-n}) * 2^-16
+    * = 1.X * 2^(15-n-16)
+    * = 1.X * 2^(14-n - 15)
+    * which is the FP16 form with e = 14 - n
+    */
+   int e = 14 - n;
+
+   assert(e >= 1 && e <= 30);
+   assert(m >= 0 && m < 0x400);
+
+   return (e << 10) | m;
+}
diff --git a/src/util/half_float.h b/src/util/half_float.h
index b3bc3f6..0155742 100644
--- a/src/util/half_float.h
+++ b/src/util/half_float.h
@@ -32,8 +32,13 @@
 extern "C" {
 #endif
 
+#define FP16_ONE     0x3C00
+#define FP16_ZERO    0
+
 uint16_t _mesa_float_to_half(float val);
 float _mesa_half_to_float(uint16_t val);
+uint8_t _mesa_half_to_unorm8(uint16_t v);
+uint16_t _mesa_uint16_div_64k_to_half(uint16_t v);
 
 static inline bool
 _mesa_half_is_negative(uint16_t h)
diff --git a/src/util/hash_table.c b/src/util/hash_table.c
index f8d5d0f..7ee9e18 100644
--- a/src/util/hash_table.c
+++ b/src/util/hash_table.c
@@ -421,6 +421,15 @@
 }
 
 /**
+ * Removes the entry with the corresponding key, if it exists.
+ */
+void _mesa_hash_table_remove_key(struct hash_table *ht,
+                                 const void *key)
+{
+   _mesa_hash_table_remove(ht, _mesa_hash_table_search(ht, key));
+}
+
+/**
  * This function is an iterator over the hash table.
  *
  * Pass in NULL for the first entry, as in the start of a for loop.  Note that
diff --git a/src/util/hash_table.h b/src/util/hash_table.h
index 3846dad..40ff041 100644
--- a/src/util/hash_table.h
+++ b/src/util/hash_table.h
@@ -88,6 +88,8 @@
                                   const void *key);
 void _mesa_hash_table_remove(struct hash_table *ht,
                              struct hash_entry *entry);
+void _mesa_hash_table_remove_key(struct hash_table *ht,
+                                 const void *key);
 
 struct hash_entry *_mesa_hash_table_next_entry(struct hash_table *ht,
                                                struct hash_entry *entry);
diff --git a/src/util/list.h b/src/util/list.h
index 6edb750..09d1b4c 100644
--- a/src/util/list.h
+++ b/src/util/list.h
@@ -72,7 +72,7 @@
     list->prev = item;
 }
 
-static inline bool list_empty(struct list_head *list);
+static inline bool list_empty(const struct list_head *list);
 
 static inline void list_replace(struct list_head *from, struct list_head *to)
 {
@@ -101,7 +101,7 @@
     item->prev = item;
 }
 
-static inline bool list_empty(struct list_head *list)
+static inline bool list_empty(const struct list_head *list)
 {
    return list->next == list;
 }
@@ -114,7 +114,7 @@
    return list->next != NULL && list->next != list && list->next->next == list;
 }
 
-static inline unsigned list_length(struct list_head *list)
+static inline unsigned list_length(const struct list_head *list)
 {
    struct list_head *node;
    unsigned length = 0;
@@ -145,7 +145,7 @@
    dst->prev = src->prev;
 }
 
-static inline void list_validate(struct list_head *list)
+static inline void list_validate(const struct list_head *list)
 {
    struct list_head *node;
    assert(list->next->prev == list && list->prev->next == list);
diff --git a/src/util/macros.h b/src/util/macros.h
index 6d3df90..fb522ee 100644
--- a/src/util/macros.h
+++ b/src/util/macros.h
@@ -285,6 +285,9 @@
 #define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C))
 #define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C))
 
+/** Align a value to a power of two */
+#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1))
+
 /**
  * Macro for declaring an explicit conversion operator.  Defaults to an
  * implicit conversion if C++11 is not supported.
diff --git a/src/util/meson.build b/src/util/meson.build
index eece1ce..795b5fd 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -48,12 +48,16 @@
   'mesa-sha1.h',
   'os_time.c',
   'os_time.h',
+  'u_process.c',
+  'u_process.h',
   'sha1/sha1.c',
   'sha1/sha1.h',
   'ralloc.c',
   'ralloc.h',
   'rand_xor.c',
   'rand_xor.h',
+  'rb_tree.c',
+  'rb_tree.h',
   'register_allocate.c',
   'register_allocate.h',
   'rgtc.c',
@@ -81,6 +85,8 @@
   'u_thread.h',
   'u_vector.c',
   'u_vector.h',
+  'vma.c',
+  'vma.h',
 )
 
 install_data('drirc', install_dir : get_option('sysconfdir'))
@@ -102,7 +108,7 @@
   'mesa_util',
   [files_mesa_util, format_srgb],
   include_directories : inc_common,
-  dependencies : [dep_zlib, dep_clock, dep_thread, dep_atomic],
+  dependencies : [dep_zlib, dep_clock, dep_thread, dep_atomic, dep_m],
   c_args : [c_msvc_compat_args, c_vis_args],
   build_by_default : false
 )
@@ -111,6 +117,7 @@
   'xmlconfig',
   files_xmlconfig,
   include_directories : inc_common,
+  link_with : libmesa_util,
   dependencies : [dep_expat, dep_m],
   c_args : [
     c_msvc_compat_args, c_vis_args,
@@ -157,4 +164,6 @@
 
   subdir('tests/hash_table')
   subdir('tests/string_buffer')
+  subdir('tests/vma')
+  subdir('tests/set')
 endif
diff --git a/src/util/ralloc.c b/src/util/ralloc.c
index 42cfa2e..fc35661 100644
--- a/src/util/ralloc.c
+++ b/src/util/ralloc.c
@@ -61,7 +61,7 @@
 #endif
    ralloc_header
 {
-#ifdef DEBUG
+#ifndef NDEBUG
    /* A canary value used to determine whether a pointer is ralloc'd. */
    unsigned canary;
 #endif
@@ -88,9 +88,7 @@
 {
    ralloc_header *info = (ralloc_header *) (((char *) ptr) -
 					    sizeof(ralloc_header));
-#ifdef DEBUG
    assert(info->canary == CANARY);
-#endif
    return info;
 }
 
@@ -140,7 +138,7 @@
 
    add_child(parent, info);
 
-#ifdef DEBUG
+#ifndef NDEBUG
    info->canary = CANARY;
 #endif
 
@@ -553,14 +551,20 @@
  * other buffers.
  */
 
-#define ALIGN_POT(x, y) (((x) + (y) - 1) & ~((y) - 1))
-
 #define MIN_LINEAR_BUFSIZE 2048
-#define SUBALLOC_ALIGNMENT sizeof(uintptr_t)
+#define SUBALLOC_ALIGNMENT 8
 #define LMAGIC 0x87b9c7d3
 
-struct linear_header {
-#ifdef DEBUG
+struct
+#ifdef _MSC_VER
+ __declspec(align(8))
+#elif defined(__LP64__)
+ __attribute__((aligned(16)))
+#else
+ __attribute__((aligned(8)))
+#endif
+   linear_header {
+#ifndef NDEBUG
    unsigned magic;   /* for debugging */
 #endif
    unsigned offset;  /* points to the first unused byte in the buffer */
@@ -610,7 +614,7 @@
    if (unlikely(!node))
       return NULL;
 
-#ifdef DEBUG
+#ifndef NDEBUG
    node->magic = LMAGIC;
 #endif
    node->offset = 0;
@@ -630,9 +634,7 @@
    linear_size_chunk *ptr;
    unsigned full_size;
 
-#ifdef DEBUG
    assert(first->magic == LMAGIC);
-#endif
    assert(!latest->next);
 
    size = ALIGN_POT(size, SUBALLOC_ALIGNMENT);
@@ -653,6 +655,8 @@
    ptr = (linear_size_chunk *)((char*)&latest[1] + latest->offset);
    ptr->size = size;
    latest->offset += full_size;
+
+   assert((uintptr_t)&ptr[1] % SUBALLOC_ALIGNMENT == 0);
    return &ptr[1];
 }
 
@@ -704,9 +708,7 @@
       return;
 
    node = LINEAR_PARENT_TO_HEADER(ptr);
-#ifdef DEBUG
    assert(node->magic == LMAGIC);
-#endif
 
    while (node) {
       void *ptr = node;
@@ -725,9 +727,7 @@
       return;
 
    node = LINEAR_PARENT_TO_HEADER(ptr);
-#ifdef DEBUG
    assert(node->magic == LMAGIC);
-#endif
 
    while (node) {
       ralloc_steal(new_ralloc_ctx, node);
@@ -740,9 +740,7 @@
 ralloc_parent_of_linear_parent(void *ptr)
 {
    linear_header *node = LINEAR_PARENT_TO_HEADER(ptr);
-#ifdef DEBUG
    assert(node->magic == LMAGIC);
-#endif
    return node->ralloc_parent;
 }
 
diff --git a/src/util/rb_tree.c b/src/util/rb_tree.c
new file mode 100644
index 0000000..a86fa31
--- /dev/null
+++ b/src/util/rb_tree.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright © 2017 Jason Ekstrand
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "rb_tree.h"
+
+/** \file rb_tree.c
+ *
+ * An implementation of a red-black tree
+ *
+ * This file implements the guts of a red-black tree.  The implementation
+ * is mostly based on the one in "Introduction to Algorithms", third
+ * edition, by Cormen, Leiserson, Rivest, and Stein.  The primary
+ * divergence in our algorithms from those presented in CLRS is that we use
+ * NULL for the leaves instead of a sentinel.  This means we have to do a
+ * tiny bit more tracking in our implementation of delete but it makes the
+ * algorithms far more explicit than stashing stuff in the sentinel.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+static bool
+rb_node_is_black(struct rb_node *n)
+{
+    /* NULL nodes are leaves and therefore black */
+    return (n == NULL) || (n->parent & 1);
+}
+
+static bool
+rb_node_is_red(struct rb_node *n)
+{
+    return !rb_node_is_black(n);
+}
+
+static void
+rb_node_set_black(struct rb_node *n)
+{
+    n->parent |= 1;
+}
+
+static void
+rb_node_set_red(struct rb_node *n)
+{
+    n->parent &= ~1ull;
+}
+
+static void
+rb_node_copy_color(struct rb_node *dst, struct rb_node *src)
+{
+    dst->parent = (dst->parent & ~1ull) | (src->parent & 1);
+}
+
+static void
+rb_node_set_parent(struct rb_node *n, struct rb_node *p)
+{
+    n->parent = (n->parent & 1) | (uintptr_t)p;
+}
+
+static struct rb_node *
+rb_node_minimum(struct rb_node *node)
+{
+    while (node->left)
+        node = node->left;
+    return node;
+}
+
+static struct rb_node *
+rb_node_maximum(struct rb_node *node)
+{
+    while (node->right)
+        node = node->right;
+    return node;
+}
+
+void
+rb_tree_init(struct rb_tree *T)
+{
+    T->root = NULL;
+}
+
+/**
+ * Replace the subtree of T rooted at u with the subtree rooted at v
+ *
+ * This is called RB-transplant in CLRS.
+ *
+ * The node to be replaced is assumed to be a non-leaf.
+ */
+static void
+rb_tree_splice(struct rb_tree *T, struct rb_node *u, struct rb_node *v)
+{
+    assert(u);
+    struct rb_node *p = rb_node_parent(u);
+    if (p == NULL) {
+        assert(T->root == u);
+        T->root = v;
+    } else if (u == p->left) {
+        p->left = v;
+    } else {
+        assert(u == p->right);
+        p->right = v;
+    }
+    if (v)
+        rb_node_set_parent(v, p);
+}
+
+static void
+rb_tree_rotate_left(struct rb_tree *T, struct rb_node *x)
+{
+    assert(x && x->right);
+
+    struct rb_node *y = x->right;
+    x->right = y->left;
+    if (y->left)
+        rb_node_set_parent(y->left, x);
+    rb_tree_splice(T, x, y);
+    y->left = x;
+    rb_node_set_parent(x, y);
+}
+
+static void
+rb_tree_rotate_right(struct rb_tree *T, struct rb_node *y)
+{
+    assert(y && y->left);
+
+    struct rb_node *x = y->left;
+    y->left = x->right;
+    if (x->right)
+        rb_node_set_parent(x->right, y);
+    rb_tree_splice(T, y, x);
+    x->right = y;
+    rb_node_set_parent(y, x);
+}
+
+void
+rb_tree_insert_at(struct rb_tree *T, struct rb_node *parent,
+                  struct rb_node *node, bool insert_left)
+{
+    /* This sets null children, parent, and a color of red */
+    memset(node, 0, sizeof(*node));
+
+    if (parent == NULL) {
+        assert(T->root == NULL);
+        T->root = node;
+        rb_node_set_black(node);
+        return;
+    }
+
+    if (insert_left) {
+        assert(parent->left == NULL);
+        parent->left = node;
+    } else {
+        assert(parent->right == NULL);
+        parent->right = node;
+    }
+    rb_node_set_parent(node, parent);
+
+    /* Now we do the insertion fixup */
+    struct rb_node *z = node;
+    while (rb_node_is_red(rb_node_parent(z))) {
+        struct rb_node *z_p = rb_node_parent(z);
+        assert(z == z_p->left || z == z_p->right);
+        struct rb_node *z_p_p = rb_node_parent(z_p);
+        assert(z_p_p != NULL);
+        if (z_p == z_p_p->left) {
+            struct rb_node *y = z_p_p->right;
+            if (rb_node_is_red(y)) {
+                rb_node_set_black(z_p);
+                rb_node_set_black(y);
+                rb_node_set_red(z_p_p);
+                z = z_p_p;
+            } else {
+                if (z == z_p->right) {
+                    z = z_p;
+                    rb_tree_rotate_left(T, z);
+                    /* We changed z */
+                    z_p = rb_node_parent(z);
+                    assert(z == z_p->left || z == z_p->right);
+                    z_p_p = rb_node_parent(z_p);
+                }
+                rb_node_set_black(z_p);
+                rb_node_set_red(z_p_p);
+                rb_tree_rotate_right(T, z_p_p);
+            }
+        } else {
+            struct rb_node *y = z_p_p->left;
+            if (rb_node_is_red(y)) {
+                rb_node_set_black(z_p);
+                rb_node_set_black(y);
+                rb_node_set_red(z_p_p);
+                z = z_p_p;
+            } else {
+                if (z == z_p->left) {
+                    z = z_p;
+                    rb_tree_rotate_right(T, z);
+                    /* We changed z */
+                    z_p = rb_node_parent(z);
+                    assert(z == z_p->left || z == z_p->right);
+                    z_p_p = rb_node_parent(z_p);
+                }
+                rb_node_set_black(z_p);
+                rb_node_set_red(z_p_p);
+                rb_tree_rotate_left(T, z_p_p);
+            }
+        }
+    }
+    rb_node_set_black(T->root);
+}
+
+void
+rb_tree_remove(struct rb_tree *T, struct rb_node *z)
+{
+    /* x_p is always the parent node of x.  We have to track this
+     * separately because x may be NULL.
+     */
+    struct rb_node *x, *x_p;
+    struct rb_node *y = z;
+    bool y_was_black = rb_node_is_black(y);
+    if (z->left == NULL) {
+        x = z->right;
+        x_p = rb_node_parent(z);
+        rb_tree_splice(T, z, x);
+    } else if (z->right == NULL) {
+        x = z->left;
+        x_p = rb_node_parent(z);
+        rb_tree_splice(T, z, x);
+    } else {
+        /* Find the minimum sub-node of z->right */
+        y = rb_node_minimum(z->right);
+        y_was_black = rb_node_is_black(y);
+
+        x = y->right;
+        if (rb_node_parent(y) == z) {
+            x_p = y;
+        } else {
+            x_p = rb_node_parent(y);
+            rb_tree_splice(T, y, x);
+            y->right = z->right;
+            rb_node_set_parent(y->right, y);
+        }
+        assert(y->left == NULL);
+        rb_tree_splice(T, z, y);
+        y->left = z->left;
+        rb_node_set_parent(y->left, y);
+        rb_node_copy_color(y, z);
+    }
+
+    assert(x_p == NULL || x == x_p->left || x == x_p->right);
+
+    if (!y_was_black)
+        return;
+
+    /* Fixup RB tree after the delete */
+    while (x != T->root && rb_node_is_black(x)) {
+        if (x == x_p->left) {
+            struct rb_node *w = x_p->right;
+            if (rb_node_is_red(w)) {
+                rb_node_set_black(w);
+                rb_node_set_red(x_p);
+                rb_tree_rotate_left(T, x_p);
+                assert(x == x_p->left);
+                w = x_p->right;
+            }
+            if (rb_node_is_black(w->left) && rb_node_is_black(w->right)) {
+                rb_node_set_red(w);
+                x = x_p;
+            } else {
+                if (rb_node_is_black(w->right)) {
+                    rb_node_set_black(w->left);
+                    rb_node_set_red(w);
+                    rb_tree_rotate_right(T, w);
+                    w = x_p->right;
+                }
+                rb_node_copy_color(w, x_p);
+                rb_node_set_black(x_p);
+                rb_node_set_black(w->right);
+                rb_tree_rotate_left(T, x_p);
+                x = T->root;
+            }
+        } else {
+            struct rb_node *w = x_p->left;
+            if (rb_node_is_red(w)) {
+                rb_node_set_black(w);
+                rb_node_set_red(x_p);
+                rb_tree_rotate_right(T, x_p);
+                assert(x == x_p->right);
+                w = x_p->left;
+            }
+            if (rb_node_is_black(w->right) && rb_node_is_black(w->left)) {
+                rb_node_set_red(w);
+                x = x_p;
+            } else {
+                if (rb_node_is_black(w->left)) {
+                    rb_node_set_black(w->right);
+                    rb_node_set_red(w);
+                    rb_tree_rotate_left(T, w);
+                    w = x_p->left;
+                }
+                rb_node_copy_color(w, x_p);
+                rb_node_set_black(x_p);
+                rb_node_set_black(w->left);
+                rb_tree_rotate_right(T, x_p);
+                x = T->root;
+            }
+        }
+        x_p = rb_node_parent(x);
+    }
+    if (x)
+        rb_node_set_black(x);
+}
+
+struct rb_node *
+rb_tree_first(struct rb_tree *T)
+{
+    return T->root ? rb_node_minimum(T->root) : NULL;
+}
+
+struct rb_node *
+rb_tree_last(struct rb_tree *T)
+{
+    return T->root ? rb_node_maximum(T->root) : NULL;
+}
+
+struct rb_node *
+rb_node_next(struct rb_node *node)
+{
+    if (node->right) {
+        /* If we have a right child, then the next thing (compared to this
+         * node) is the left-most child of our right child.
+         */
+        return rb_node_minimum(node->right);
+    } else {
+        /* If node doesn't have a right child, crawl back up to the
+         * left until we hit a parent to the right.
+         */
+        struct rb_node *p = rb_node_parent(node);
+        while (p && node == p->right) {
+            node = p;
+            p = rb_node_parent(node);
+        }
+        assert(p == NULL || node == p->left);
+        return p;
+    }
+}
+
+struct rb_node *
+rb_node_prev(struct rb_node *node)
+{
+    if (node->left) {
+        /* If we have a left child, then the previous thing (compared to
+         * this node) is the right-most child of our left child.
+         */
+        return rb_node_maximum(node->left);
+    } else {
+        /* If node doesn't have a left child, crawl back up to the
+         * right until we hit a parent to the left.
+         */
+        struct rb_node *p = rb_node_parent(node);
+        while (p && node == p->left) {
+            node = p;
+            p = rb_node_parent(node);
+        }
+        assert(p == NULL || node == p->right);
+        return p;
+    }
+}
+
+static void
+validate_rb_node(struct rb_node *n, int black_depth)
+{
+    if (n == NULL) {
+        assert(black_depth == 0);
+        return;
+    }
+
+    if (rb_node_is_black(n)) {
+        black_depth--;
+    } else {
+        assert(rb_node_is_black(n->left));
+        assert(rb_node_is_black(n->right));
+    }
+
+    validate_rb_node(n->left, black_depth);
+    validate_rb_node(n->right, black_depth);
+}
+
+void
+rb_tree_validate(struct rb_tree *T)
+{
+    if (T->root == NULL)
+        return;
+
+    assert(rb_node_is_black(T->root));
+
+    unsigned black_depth = 0;
+    for (struct rb_node *n = T->root; n; n = n->left) {
+        if (rb_node_is_black(n))
+            black_depth++;
+    }
+
+    validate_rb_node(T->root, black_depth);
+}
diff --git a/src/util/rb_tree.h b/src/util/rb_tree.h
new file mode 100644
index 0000000..c77e925
--- /dev/null
+++ b/src/util/rb_tree.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2017 Jason Ekstrand
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef RB_TREE_H
+#define RB_TREE_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/** A red-black tree node
+ *
+ * This struct represents a node in the red-black tree.  This struct should
+ * be embedded as a member in whatever structure you wish to put in the
+ * tree.
+ */
+struct rb_node {
+    /** Parent and color of this node
+     *
+     * The least significant bit represents the color and is set to 1 for
+     * black and 0 for red.  The other bits are the pointer to the parent
+     * and that pointer can be retrieved by masking off the bottom bit and
+     * casting to a pointer.
+     */
+    uintptr_t parent;
+
+    /** Left child of this node, null for a leaf */
+    struct rb_node *left;
+
+    /** Right child of this node, null for a leaf */
+    struct rb_node *right;
+};
+
+/** Return the parent node of the given node or NULL if it is the root */
+static inline struct rb_node *
+rb_node_parent(struct rb_node *n)
+{
+    return (struct rb_node *)(n->parent & ~(uintptr_t)1);
+}
+
+/** A red-black tree
+ *
+ * This struct represents the red-black tree itself.  It is just a pointer
+ * to the root node with no other metadata.
+ */
+struct rb_tree {
+    struct rb_node *root;
+};
+
+/** Initialize a red-black tree */
+void rb_tree_init(struct rb_tree *T);
+
+/** Returns true if the red-black tree is empty */
+static inline bool
+rb_tree_is_empty(const struct rb_tree *T)
+{
+    return T->root == NULL;
+}
+
+/** Retrieve the data structure containing a node
+ *
+ * \param   type    The type of the containing data structure
+ *
+ * \param   node    A pointer to a rb_node
+ *
+ * \param   field   The rb_node field in the containing data structure
+ */
+#define rb_node_data(type, node, field) \
+    ((type *)(((char *)(node)) - offsetof(type, field)))
+
+/** Insert a node into a tree at a particular location
+ *
+ * This function should probably not be used directly as it relies on the
+ * caller to ensure that the parent node is correct.  Use rb_tree_insert
+ * instead.
+ *
+ * \param   T           The red-black tree into which to insert the new node
+ *
+ * \param   parent      The node in the tree that will be the parent of the
+ *                      newly inserted node
+ *
+ * \param   node        The node to insert
+ *
+ * \param   insert_left If true, the new node will be the left child of
+ *                      \p parent, otherwise it will be the right child
+ */
+void rb_tree_insert_at(struct rb_tree *T, struct rb_node *parent,
+                       struct rb_node *node, bool insert_left);
+
+/** Insert a node into a tree
+ *
+ * \param   T       The red-black tree into which to insert the new node
+ *
+ * \param   node    The node to insert
+ *
+ * \param   cmp     A comparison function to use to order the nodes.
+ */
+static inline void
+rb_tree_insert(struct rb_tree *T, struct rb_node *node,
+               int (*cmp)(const struct rb_node *, const struct rb_node *))
+{
+    /* This function is declared inline in the hopes that the compiler can
+     * optimize away the comparison function pointer call.
+     */
+    struct rb_node *y = NULL;
+    struct rb_node *x = T->root;
+    bool left = false;
+    while (x != NULL) {
+        y = x;
+        left = cmp(node, x) < 0;
+        if (left)
+            x = x->left;
+        else
+            x = x->right;
+    }
+
+    rb_tree_insert_at(T, y, node, left);
+}
+
+/** Remove a node from a tree
+ *
+ * \param   T       The red-black tree from which to remove the node
+ *
+ * \param   z       The node to remove
+ */
+void rb_tree_remove(struct rb_tree *T, struct rb_node *z);
+
+/** Search the tree for a node
+ *
+ * If a node with a matching key exists, the first matching node found will
+ * be returned.  If no matching node exists, NULL is returned.
+ *
+ * \param   T       The red-black tree to search
+ *
+ * \param   key     The key to search for
+ *
+ * \param   cmp     A comparison function to use to order the nodes
+ */
+static inline struct rb_node *
+rb_tree_search(struct rb_tree *T, const void *key,
+               int (*cmp)(const struct rb_node *, const void *))
+{
+    /* This function is declared inline in the hopes that the compiler can
+     * optimize away the comparison function pointer call.
+     */
+    struct rb_node *x = T->root;
+    while (x != NULL) {
+        int c = cmp(x, key);
+        if (c < 0)
+            x = x->right;
+        else if (c > 0)
+            x = x->left;
+        else
+            return x;
+    }
+
+    return x;
+}
+
+/** Sloppily search the tree for a node
+ *
+ * This function searches the tree for a given node.  If a node with a
+ * matching key exists, the first matching node found will be returned.
+ * If no node with an exactly matching key exists, the node returned will
+ * be either the right-most node comparing less than \p key or the
+ * left-most node comparing greater than \p key.  If the tree is empty,
+ * NULL is returned.
+ *
+ * \param   T       The red-black tree to search
+ *
+ * \param   key     The key to search for
+ *
+ * \param   cmp     A comparison function to use to order the nodes
+ */
+static inline struct rb_node *
+rb_tree_search_sloppy(struct rb_tree *T, const void *key,
+                      int (*cmp)(const struct rb_node *, const void *))
+{
+    /* This function is declared inline in the hopes that the compiler can
+     * optimize away the comparison function pointer call.
+     */
+    struct rb_node *y = NULL;
+    struct rb_node *x = T->root;
+    while (x != NULL) {
+        y = x;
+        int c = cmp(x, key);
+        if (c < 0)
+            x = x->right;
+        else if (c > 0)
+            x = x->left;
+        else
+            return x;
+    }
+
+    return y;
+}
+
+/** Get the first (left-most) node in the tree or NULL */
+struct rb_node *rb_tree_first(struct rb_tree *T);
+
+/** Get the last (right-most) node in the tree or NULL */
+struct rb_node *rb_tree_last(struct rb_tree *T);
+
+/** Get the next node (to the right) in the tree or NULL */
+struct rb_node *rb_node_next(struct rb_node *node);
+
+/** Get the previous node (to the left) in the tree or NULL */
+struct rb_node *rb_node_prev(struct rb_node *node);
+
+/** Iterate over the nodes in the tree
+ *
+ * \param   type    The type of the containing data structure
+ *
+ * \param   node    The variable name for current node in the iteration;
+ *                  this will be declared as a pointer to \p type
+ *
+ * \param   T       The red-black tree
+ *
+ * \param   field   The rb_node field in containing data structure
+ */
+#define rb_tree_foreach(type, node, T, field) \
+   for (type *node = rb_node_data(type, rb_tree_first(T), field); \
+        &node->field != NULL; \
+        node = rb_node_data(type, rb_node_next(&node->field), field))
+
+/** Iterate over the nodes in the tree in reverse
+ *
+ * \param   type    The type of the containing data structure
+ *
+ * \param   node    The variable name for current node in the iteration;
+ *                  this will be declared as a pointer to \p type
+ *
+ * \param   T       The red-black tree
+ *
+ * \param   field   The rb_node field in containing data structure
+ */
+#define rb_tree_foreach_rev(type, node, T, field) \
+   for (type *node = rb_node_data(type, rb_tree_last(T), field); \
+        &node->field != NULL; \
+        node = rb_node_data(type, rb_node_prev(&node->field), field))
+
+/** Validate a red-black tree
+ *
+ * This function walks the tree and validates that this is a valid red-
+ * black tree.  If anything is wrong, it will assert-fail.
+ */
+void rb_tree_validate(struct rb_tree *T);
+
+#endif /* RB_TREE_H */
diff --git a/src/util/set.c b/src/util/set.c
index d71f771..feef96d 100644
--- a/src/util/set.c
+++ b/src/util/set.c
@@ -34,6 +34,7 @@
 
 #include <stdlib.h>
 #include <assert.h>
+#include <string.h>
 
 #include "macros.h"
 #include "ralloc.h"
@@ -132,6 +133,28 @@
    return ht;
 }
 
+struct set *
+_mesa_set_clone(struct set *set, void *dst_mem_ctx)
+{
+   struct set *clone;
+
+   clone = ralloc(dst_mem_ctx, struct set);
+   if (clone == NULL)
+      return NULL;
+
+   memcpy(clone, set, sizeof(struct set));
+
+   clone->table = ralloc_array(clone, struct set_entry, clone->size);
+   if (clone->table == NULL) {
+      ralloc_free(clone);
+      return NULL;
+   }
+
+   memcpy(clone->table, set->table, clone->size * sizeof(struct set_entry));
+
+   return clone;
+}
+
 /**
  * Frees the given set.
  *
@@ -156,6 +179,29 @@
 }
 
 /**
+ * Clears all values from the given set.
+ *
+ * If delete_function is passed, it gets called on each entry present before
+ * the set is cleared.
+ */
+void
+_mesa_set_clear(struct set *set, void (*delete_function)(struct set_entry *entry))
+{
+   struct set_entry *entry;
+
+   if (!set)
+      return;
+
+   set_foreach (set, entry) {
+      if (delete_function)
+         delete_function(entry);
+      entry->key = deleted_key;
+   }
+
+   set->entries = set->deleted_entries = 0;
+}
+
+/**
  * Finds a set entry with the given key and hash of that key.
  *
  * Returns NULL if no entry is found.
@@ -338,6 +384,15 @@
 }
 
 /**
+ * Removes the entry with the corresponding key, if it exists.
+ */
+void
+_mesa_set_remove_key(struct set *set, const void *key)
+{
+   _mesa_set_remove(set, _mesa_set_search(set, key));
+}
+
+/**
  * This function is an iterator over the hash table.
  *
  * Pass in NULL for the first entry, as in the start of a for loop.  Note that
diff --git a/src/util/set.h b/src/util/set.h
index 9acd2c2..ffd19a7 100644
--- a/src/util/set.h
+++ b/src/util/set.h
@@ -58,9 +58,15 @@
                  uint32_t (*key_hash_function)(const void *key),
                  bool (*key_equals_function)(const void *a,
                                              const void *b));
+struct set *
+_mesa_set_clone(struct set *set, void *dst_mem_ctx);
+
 void
 _mesa_set_destroy(struct set *set,
                   void (*delete_function)(struct set_entry *entry));
+void
+_mesa_set_clear(struct set *set,
+                void (*delete_function)(struct set_entry *entry));
 
 struct set_entry *
 _mesa_set_add(struct set *set, const void *key);
@@ -75,6 +81,8 @@
 
 void
 _mesa_set_remove(struct set *set, struct set_entry *entry);
+void
+_mesa_set_remove_key(struct set *set, const void *key);
 
 struct set_entry *
 _mesa_set_next_entry(const struct set *set, struct set_entry *entry);
diff --git a/src/util/slab.c b/src/util/slab.c
index 4ce0e9a..5f04866 100644
--- a/src/util/slab.c
+++ b/src/util/slab.c
@@ -28,8 +28,6 @@
 #include <stdbool.h>
 #include <string.h>
 
-#define ALIGN(value, align) (((value) + (align) - 1) & ~((align) - 1))
-
 #define SLAB_MAGIC_ALLOCATED 0xcafe4321
 #define SLAB_MAGIC_FREE 0x7ee01234
 
@@ -109,8 +107,8 @@
                    unsigned num_items)
 {
    mtx_init(&parent->mutex, mtx_plain);
-   parent->element_size = ALIGN(sizeof(struct slab_element_header) + item_size,
-                                sizeof(intptr_t));
+   parent->element_size = ALIGN_POT(sizeof(struct slab_element_header) + item_size,
+                                    sizeof(intptr_t));
    parent->num_elements = num_items;
 }
 
diff --git a/src/util/tests/hash_table/.gitignore b/src/util/tests/hash_table/.gitignore
index a0d50ab..5f2315b 100644
--- a/src/util/tests/hash_table/.gitignore
+++ b/src/util/tests/hash_table/.gitignore
@@ -6,6 +6,7 @@
 insert_many
 null_destroy
 random_entry
+remove_key
 remove_null
 replacement
 clear
diff --git a/src/util/tests/hash_table/Makefile.am b/src/util/tests/hash_table/Makefile.am
index 6b9221f..526454c 100644
--- a/src/util/tests/hash_table/Makefile.am
+++ b/src/util/tests/hash_table/Makefile.am
@@ -38,6 +38,7 @@
 	insert_many \
 	null_destroy \
 	random_entry \
+	remove_key \
 	remove_null \
 	replacement \
 	$()
diff --git a/src/util/tests/hash_table/meson.build b/src/util/tests/hash_table/meson.build
index 4bbc510..c7b03f1 100644
--- a/src/util/tests/hash_table/meson.build
+++ b/src/util/tests/hash_table/meson.build
@@ -20,7 +20,8 @@
 
 foreach t : ['clear', 'collision', 'delete_and_lookup', 'delete_management',
              'destroy_callback', 'insert_and_lookup', 'insert_many',
-             'null_destroy', 'random_entry', 'remove_null', 'replacement']
+             'null_destroy', 'random_entry', 'remove_key', 'remove_null',
+             'replacement']
   test(
     t,
     executable(
diff --git a/src/util/tests/hash_table/remove_key.c b/src/util/tests/hash_table/remove_key.c
new file mode 100644
index 0000000..906de67
--- /dev/null
+++ b/src/util/tests/hash_table/remove_key.c
@@ -0,0 +1,63 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "hash_table.h"
+
+int
+main(int argc, char **argv)
+{
+   struct hash_table *ht;
+   const char *str1 = "test1";
+   const char *str2 = "test2";
+   struct hash_entry *entry;
+
+   (void) argc;
+   (void) argv;
+
+   ht = _mesa_hash_table_create(NULL, _mesa_key_hash_string, _mesa_key_string_equal);
+
+   _mesa_hash_table_insert(ht, str1, NULL);
+   _mesa_hash_table_insert(ht, str2, NULL);
+
+   entry = _mesa_hash_table_search(ht, str2);
+   assert(strcmp(entry->key, str2) == 0);
+
+   entry = _mesa_hash_table_search(ht, str1);
+   assert(strcmp(entry->key, str1) == 0);
+
+   _mesa_hash_table_remove_key(ht, str1);
+
+   entry = _mesa_hash_table_search(ht, str1);
+   assert(entry == NULL);
+
+   entry = _mesa_hash_table_search(ht, str2);
+   assert(strcmp(entry->key, str2) == 0);
+
+   _mesa_hash_table_destroy(ht, NULL);
+
+   return 0;
+}
diff --git a/src/util/tests/set/Makefile.am b/src/util/tests/set/Makefile.am
new file mode 100644
index 0000000..5529f4c
--- /dev/null
+++ b/src/util/tests/set/Makefile.am
@@ -0,0 +1,42 @@
+# Copyright © 2018 Intel
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice (including the next
+#  paragraph) shall be included in all copies or substantial portions of the
+#  Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+#  IN THE SOFTWARE.
+
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/src \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/gtest/include \
+	$(PTHREAD_CFLAGS) \
+	$(DEFINES)
+
+TESTS = set_test
+
+check_PROGRAMS = $(TESTS)
+
+set_test_SOURCES = \
+	set_test.cpp
+
+set_test_LDADD = \
+	$(top_builddir)/src/gtest/libgtest.la \
+	$(top_builddir)/src/util/libmesautil.la \
+	$(PTHREAD_LIBS) \
+	$(DLOPEN_LIBS)
+
+EXTRA_DIST = meson.build
diff --git a/src/gallium/winsys/vc5/drm/meson.build b/src/util/tests/set/meson.build
similarity index 80%
copy from src/gallium/winsys/vc5/drm/meson.build
copy to src/util/tests/set/meson.build
index d859301..add3fc5 100644
--- a/src/gallium/winsys/vc5/drm/meson.build
+++ b/src/util/tests/set/meson.build
@@ -1,15 +1,15 @@
-# Copyright © 2017 Broadcom
-#
+# Copyright © 2018 Intel Corporation
+
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-#
+
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -18,12 +18,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-libvc5winsys = static_library(
-  'vc5winsys',
-  files('vc5_drm_winsys.c'),
-  include_directories : [
-    inc_src, inc_include,
-    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
-  ],
-  c_args : [c_vis_args],
+test(
+  'set',
+  executable(
+    'set_test',
+    'set_test.cpp',
+    dependencies : [dep_thread, dep_dl, idep_gtest],
+    include_directories : inc_common,
+    link_with : [libmesa_util],
+  )
 )
diff --git a/src/util/tests/set/set_test.cpp b/src/util/tests/set/set_test.cpp
new file mode 100644
index 0000000..a1eef0b
--- /dev/null
+++ b/src/util/tests/set/set_test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "util/hash_table.h"
+#include "util/set.h"
+
+TEST(set, basic)
+{
+   struct set *s = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                    _mesa_key_pointer_equal);
+   struct set_entry *entry;
+   /* Exercises add, duplicate add, search and remove with pointer keys. */
+   const void *a = (const void *)10;
+   const void *b = (const void *)20;
+
+   _mesa_set_add(s, a);
+   _mesa_set_add(s, b);
+   EXPECT_EQ(s->entries, 2);
+   /* Adding a key that is already present must not grow the set. */
+   _mesa_set_add(s, a);
+   EXPECT_EQ(s->entries, 2);
+   /* Fatal check: entry is dereferenced and passed to remove just below. */
+   entry = _mesa_set_search(s, a);
+   ASSERT_TRUE(entry != NULL);
+   EXPECT_EQ(entry->key, a);
+
+   _mesa_set_remove(s, entry);
+   EXPECT_EQ(s->entries, 1);
+   /* The removed key must no longer be found. */
+   entry = _mesa_set_search(s, a);
+   EXPECT_FALSE(entry);
+
+   _mesa_set_destroy(s, NULL);
+}
+
+TEST(set, clone)
+{
+   struct set *s = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                    _mesa_key_pointer_equal);
+   struct set_entry *entry;
+   /* A clone taken after a removal must hold exactly the remaining keys. */
+   const void *a = (const void *)10;
+   const void *b = (const void *)20;
+   const void *c = (const void *)30;
+
+   _mesa_set_add(s, a);
+   _mesa_set_add(s, b);
+   _mesa_set_add(s, c);
+   /* Fatal check: entry is dereferenced and passed to remove just below. */
+   entry = _mesa_set_search(s, c);
+   ASSERT_TRUE(entry != NULL);
+   EXPECT_EQ(entry->key, c);
+
+   _mesa_set_remove(s, entry);
+   EXPECT_EQ(s->entries, 2);
+
+   struct set *clone = _mesa_set_clone(s, NULL);
+   ASSERT_TRUE(clone != NULL);
+   EXPECT_EQ(clone->entries, 2);
+   EXPECT_TRUE(_mesa_set_search(clone, a));
+   EXPECT_TRUE(_mesa_set_search(clone, b));
+   EXPECT_FALSE(_mesa_set_search(clone, c));
+
+   _mesa_set_destroy(s, NULL);
+   _mesa_set_destroy(clone, NULL);
+}
+
+TEST(set, remove_key)
+{
+   struct set *s = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                    _mesa_key_pointer_equal);
+
+   const void *a = (const void *)10;
+   const void *b = (const void *)20;
+   const void *c = (const void *)30;
+
+   _mesa_set_add(s, a);
+   _mesa_set_add(s, b);
+   EXPECT_EQ(s->entries, 2);
+
+   /* Remove existing key. */
+   _mesa_set_remove_key(s, a);
+   EXPECT_EQ(s->entries, 1);
+   EXPECT_FALSE(_mesa_set_search(s, a));
+   EXPECT_TRUE(_mesa_set_search(s, b));
+
+   /* Remove non-existing key. */
+   _mesa_set_remove_key(s, c);
+   EXPECT_EQ(s->entries, 1);
+   EXPECT_FALSE(_mesa_set_search(s, a));
+   EXPECT_TRUE(_mesa_set_search(s, b));
+
+   _mesa_set_destroy(s, NULL);
+}
diff --git a/src/util/tests/string_buffer/string_buffer_test.cpp b/src/util/tests/string_buffer/string_buffer_test.cpp
index 545f607..afb6dfb 100644
--- a/src/util/tests/string_buffer/string_buffer_test.cpp
+++ b/src/util/tests/string_buffer/string_buffer_test.cpp
@@ -95,15 +95,15 @@
    EXPECT_TRUE(strlen(buf->buf) == 0);
 
    /* Test a string with some formatting */
-   sprintf(str4, "Testing formatting %d, %f", 100, 1.0);
+   snprintf(str4, sizeof(str4), "Testing formatting %d, %f", 100, 1.0);
    EXPECT_TRUE(_mesa_string_buffer_printf(buf, "Testing formatting %d, %f", 100, 1.0));
    EXPECT_TRUE(strcmp(buf->buf, str4) == 0);
 
    /* Compile a string with some other formatting */
-   sprintf(str5, "Testing formatting %d, %x", 100, 0xDEADBEAF);
+   snprintf(str5, sizeof(str5), "Testing formatting %d, %x", 100, 0xDEADBEAF);
 
    /* Concatenate str5 to str4 */
-   strcat(str4, str5);
+   strncat(str4, str5, sizeof(str5));
 
    /* Now use the formatted append function again */
    EXPECT_TRUE(_mesa_string_buffer_printf(buf, "Testing formatting %d, %x", 100, 0xDEADBEAF));
diff --git a/src/util/tests/vma/Makefile.am b/src/util/tests/vma/Makefile.am
new file mode 100644
index 0000000..b9ca8f5
--- /dev/null
+++ b/src/util/tests/vma/Makefile.am
@@ -0,0 +1,39 @@
+# Copyright © 2018 Intel Corporation
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice (including the next
+#  paragraph) shall be included in all copies or substantial portions of the
+#  Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+#  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+#  IN THE SOFTWARE.
+
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/include \
+	-I$(top_srcdir)/src/util \
+	$(DEFINES)
+
+TESTS = vma_random_test
+
+check_PROGRAMS = $(TESTS)
+
+vma_random_test_SOURCES = \
+	vma_random_test.cpp
+
+vma_random_test_LDADD = \
+	$(top_builddir)/src/util/libmesautil.la
+
+vma_random_test_CXXFLAGS = $(CXX11_CXXFLAGS)
+
+EXTRA_DIST = meson.build
diff --git a/src/gallium/winsys/vc5/drm/meson.build b/src/util/tests/vma/meson.build
similarity index 80%
copy from src/gallium/winsys/vc5/drm/meson.build
copy to src/util/tests/vma/meson.build
index d859301..53562db 100644
--- a/src/gallium/winsys/vc5/drm/meson.build
+++ b/src/util/tests/vma/meson.build
@@ -1,15 +1,15 @@
-# Copyright © 2017 Broadcom
-#
+# Copyright © 2018 Intel Corporation
+
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is
 # furnished to do so, subject to the following conditions:
-#
+
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
-#
+
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -18,12 +18,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-libvc5winsys = static_library(
-  'vc5winsys',
-  files('vc5_drm_winsys.c'),
-  include_directories : [
-    inc_src, inc_include,
-    inc_gallium, inc_gallium_aux, inc_gallium_drivers,
-  ],
-  c_args : [c_vis_args],
+test(
+  'vma_random',
+  executable(
+    'vma_random_test',
+    'vma_random_test.cpp',
+    include_directories : [inc_include, inc_util],
+    link_with : [libmesa_util],
+  )
 )
diff --git a/src/util/tests/vma/vma_random_test.cpp b/src/util/tests/vma/vma_random_test.cpp
new file mode 100644
index 0000000..1f194fc
--- /dev/null
+++ b/src/util/tests/vma/vma_random_test.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/* it is a test after all */
+#undef NDEBUG
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+#include <set>
+#include <vector>
+
+#include <err.h>
+
+#include "vma.h"
+
+namespace {
+
+static const uint64_t MEM_PAGE_SIZE = 4096;
+
+struct allocation {
+   uint64_t start_page;
+   uint64_t num_pages;
+};
+
+struct allocation_less {
+   bool operator()(const allocation& lhs, const allocation& rhs) const
+   {
+      assert(lhs.start_page + lhs.num_pages > lhs.start_page);
+      return lhs.start_page + lhs.num_pages <= rhs.start_page;
+   }
+};
+
+constexpr uint64_t allocation_end_page(const allocation& a) {
+   return a.start_page + a.num_pages;
+}
+
+struct random_test {
+   static const uint64_t MEM_START_PAGE = 1;
+   static const uint64_t MEM_SIZE = 0xfffffffffffff000;
+   static const uint64_t MEM_PAGES = MEM_SIZE / MEM_PAGE_SIZE;
+
+   random_test(uint_fast32_t seed)
+      : heap_holes{allocation{MEM_START_PAGE, MEM_PAGES}}, rand{seed}
+   {
+      util_vma_heap_init(&heap, MEM_START_PAGE * MEM_PAGE_SIZE, MEM_SIZE);
+   }
+
+   void test(unsigned long count)
+   {
+      std::uniform_int_distribution<> one_to_thousand(1, 1000);
+      while (count-- > 0) {
+         int action = one_to_thousand(rand);
+         if (action == 1)          fill();
+         else if (action == 2)     empty();
+         else if (action < 374)    dealloc();
+         else                      alloc();
+      }
+   }
+
+   bool alloc(uint64_t size_order=52, uint64_t align_order=52)
+   {
+      std::geometric_distribution<> dist;
+
+      if (align_order > 51)
+         align_order = std::min(dist(rand), 51);
+      uint64_t align_pages = 1ULL << align_order;
+      uint64_t align = align_pages * MEM_PAGE_SIZE;
+
+      if (size_order > 51)
+         size_order = std::min(dist(rand), 51);
+      uint64_t size_pages = 1ULL << size_order;
+      uint64_t size = size_pages * MEM_PAGE_SIZE;
+
+      uint64_t addr = util_vma_heap_alloc(&heap, size, align);
+
+      if (addr == 0) {
+         /* assert no gaps are present in the tracker that could satisfy this
+          * allocation.
+          */
+         for (const auto& hole : heap_holes) {
+            uint64_t hole_alignment_pages =
+               (align_pages - (hole.start_page % align_pages)) % align_pages;
+            assert(hole.num_pages < size_pages + hole_alignment_pages);
+         }
+         return false;
+      } else {
+         assert(addr % align == 0);
+         uint64_t addr_page = addr / MEM_PAGE_SIZE;
+         allocation a{addr_page, size_pages};
+         auto i = heap_holes.find(a);
+         assert(i != end(heap_holes));
+         allocation hole = *i;
+
+         assert(hole.start_page <= addr_page);
+         assert(hole.num_pages >= size_pages + addr_page - hole.start_page);
+
+         heap_holes.erase(i);
+         if (hole.start_page < a.start_page) {
+            heap_holes.emplace(allocation{hole.start_page,
+                     a.start_page - hole.start_page});
+         }
+         if (allocation_end_page(hole) > allocation_end_page(a)) {
+            heap_holes.emplace(allocation{allocation_end_page(a),
+                     allocation_end_page(hole) - allocation_end_page(a)});
+         }
+
+         allocations.push_back(a);
+         return true;
+      }
+   }
+
+   void dealloc()
+   {
+      if (allocations.size() == 0)
+         return;
+
+      std::uniform_int_distribution<> dist(0, allocations.size() - 1);
+      int to_dealloc = dist(rand);
+
+      std::swap(allocations.at(to_dealloc), allocations.back());
+      allocation a = allocations.back();
+      allocations.pop_back();
+
+      util_vma_heap_free(&heap, a.start_page * MEM_PAGE_SIZE,
+                         a.num_pages * MEM_PAGE_SIZE);
+
+      assert(heap_holes.find(a) == end(heap_holes));
+      auto next = heap_holes.upper_bound(a);
+      if (next != end(heap_holes)) {
+         if (next->start_page == allocation_end_page(a)) {
+            allocation x {a.start_page, a.num_pages + next->num_pages};
+            next = heap_holes.erase(next);
+            next = heap_holes.insert(next, x);
+
+            if (next != begin(heap_holes)) {
+               auto prev = next;
+               prev--;
+               if (allocation_end_page(*prev) == next->start_page) {
+                  allocation x {prev->start_page,
+                        prev->num_pages + next->num_pages};
+
+                  heap_holes.erase(prev);
+                  next = heap_holes.erase(next);
+                  heap_holes.insert(next, x);
+               }
+            }
+
+            return;
+         }
+      }
+
+      if (next != begin(heap_holes)) {
+         auto prev = next;
+         prev--;
+         if (allocation_end_page(*prev) == a.start_page) {
+            allocation x {prev->start_page, prev->num_pages + a.num_pages};
+            next = heap_holes.erase(prev);
+            heap_holes.insert(next, x);
+
+            return;
+         }
+      }
+
+      heap_holes.emplace(a);
+   }
+
+   void fill()
+   {
+      for (int sz = 51; sz >= 0; sz--) {
+         while (alloc(sz, 0))
+            ;
+      }
+      assert(heap_holes.empty());
+   }
+
+   void empty()
+   {
+      while (allocations.size() != 0)
+         dealloc();
+      assert(heap_holes.size() == 1);
+      auto& hole = *begin(heap_holes);
+      assert(hole.start_page == MEM_START_PAGE && hole.num_pages == MEM_PAGES);
+   }
+
+   struct util_vma_heap heap;
+   std::set<allocation, allocation_less> heap_holes;
+   std::default_random_engine rand;
+   std::vector<allocation> allocations;
+};
+
+}
+
+/* Usage: vma_random_test [seed iter_count]; with no arguments use defaults. */
+int main(int argc, char **argv)
+{
+   unsigned long seed, count;
+   if (argc == 3) {
+      char *arg_end = NULL;
+      seed = strtoul(argv[1], &arg_end, 0);
+      if (arg_end == argv[1] || *arg_end || seed == ULONG_MAX)
+         errx(1, "invalid seed \"%s\"", argv[1]);
+
+      count = strtoul(argv[2], &arg_end, 0);
+      if (arg_end == argv[2] || *arg_end || count == ULONG_MAX)
+         errx(1, "invalid count \"%s\"", argv[2]);
+   } else if (argc == 1) {
+      /* importantly chosen prime numbers. */
+      seed = 8675309;
+      count = 2459;
+   } else {
+      errx(1, "USAGE: %s seed iter_count", argv[0]);
+   }
+
+   random_test r{(uint_fast32_t)seed};
+   r.test(count);
+
+   printf("ok\n");
+   return 0;
+}
diff --git a/src/util/u_process.c b/src/util/u_process.c
new file mode 100644
index 0000000..5e59276
--- /dev/null
+++ b/src/util/u_process.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2003 Felix Kuehling
+ * Copyright © 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#include "u_process.h"
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#undef GET_PROGRAM_NAME
+
+#if (defined(__GNU_LIBRARY__) || defined(__GLIBC__)) && !defined(__UCLIBC__)
+#    if !defined(__GLIBC__) || (__GLIBC__ < 2)
+/* These aren't declared in any libc5 header */
+extern char *program_invocation_name, *program_invocation_short_name;
+#    endif
+/* Return the part of program_invocation_name after the last '/', or after
+ * the last '\\' when there is no '/' at all; otherwise the whole name. */
+static const char *
+__getProgramName(void)
+{
+   const char *arg = strrchr(program_invocation_name, '/');
+   if (arg)
+      return arg + 1;
+
+   /* No '/' at all: likely a Windows-like path from a wine application. */
+   arg = strrchr(program_invocation_name, '\\');
+   if (arg)
+      return arg + 1;
+
+   return program_invocation_name;
+}
+#    define GET_PROGRAM_NAME() __getProgramName()
+#elif defined(__CYGWIN__)
+#    define GET_PROGRAM_NAME() program_invocation_short_name
+#elif defined(__FreeBSD__) && (__FreeBSD__ >= 2)
+#    include <osreldate.h>
+#    if (__FreeBSD_version >= 440000)
+#        define GET_PROGRAM_NAME() getprogname()
+#    endif
+#elif defined(__NetBSD__) && defined(__NetBSD_Version__) && (__NetBSD_Version__ >= 106000100)
+#    define GET_PROGRAM_NAME() getprogname()
+#elif defined(__DragonFly__)
+#    define GET_PROGRAM_NAME() getprogname()
+#elif defined(__APPLE__)
+#    define GET_PROGRAM_NAME() getprogname()
+#elif defined(ANDROID)
+#    define GET_PROGRAM_NAME() getprogname()
+#elif defined(__sun)
+/* Solaris has getexecname() which returns the full path - return just
+   the basename to match BSD getprogname() */
+#    include <libgen.h>
+
+static const char *
+__getProgramName(void)
+{
+    /* Cached across calls; stays NULL (and is retried) if lookup fails. */
+    static const char *progname;
+    if (progname == NULL) {
+        const char *e = getexecname();
+        if (e != NULL) {
+            /* Work on a copy: getexecname can return a readonly string, but
+               basename may modify its arg.  The copy is not freed here. */
+            char *n = strdup(e);
+            if (n != NULL) {
+                progname = basename(n);
+            }
+        }
+    }
+    return progname;
+}
+
+#    define GET_PROGRAM_NAME() __getProgramName()
+#endif
+
+#if !defined(GET_PROGRAM_NAME)
+#    if defined(__OpenBSD__) || defined(NetBSD) || defined(__UCLIBC__) || defined(ANDROID)
+/* Last-resort hack: return the basename of libc's undocumented __progname.
+ * Said to work on OpenBSD, NetBSD and GNU; Rogelio M.Serrano Jr. reported it
+ * also works with UCLIBC.  Used only when no documented facility exists. */
+static const char *
+__getProgramName(void)
+{
+    extern const char *__progname;
+    const char *arg = strrchr(__progname, '/');
+    if (arg)
+        return arg+1;
+    else
+        return __progname;
+}
+#        define GET_PROGRAM_NAME() __getProgramName()
+#    else
+#        define GET_PROGRAM_NAME() ""
+#        pragma message ( "Warning: Per application configuration won't work with your OS version." )
+#    endif
+#endif
+
+const char *
+util_get_process_name(void)
+{
+   return GET_PROGRAM_NAME();
+}
diff --git a/src/util/u_process.h b/src/util/u_process.h
new file mode 100644
index 0000000..77f7cb1
--- /dev/null
+++ b/src/util/u_process.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright © 2003 Felix Kuehling
+ * Copyright © 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+
+#ifndef PROCESS_H
+#define PROCESS_H
+
+const char *
+util_get_process_name(void);
+
+#endif
diff --git a/src/util/u_queue.c b/src/util/u_queue.c
index da513fd..22d2cdd 100644
--- a/src/util/u_queue.c
+++ b/src/util/u_queue.c
@@ -31,6 +31,7 @@
 #include "util/os_time.h"
 #include "util/u_string.h"
 #include "util/u_thread.h"
+#include "u_process.h"
 
 static void util_queue_killall_and_wait(struct util_queue *queue);
 
@@ -238,9 +239,9 @@
 
    free(input);
 
-   if (queue->name) {
+   if (strlen(queue->name) > 0) {
       char name[16];
-      util_snprintf(name, sizeof(name), "%s:%i", queue->name, thread_index);
+      util_snprintf(name, sizeof(name), "%s%i", queue->name, thread_index);
       u_thread_setname(name);
    }
 
@@ -299,8 +300,34 @@
 {
    unsigned i;
 
+   /* Form the thread name from process_name and name, limited to 13
+    * characters. Characters 14-15 are reserved for the thread number.
+    * Character 16 should be 0. Final form: "process:name12"
+    *
+    * If name is too long, it's truncated. If any space is left, the process
+    * name fills it.
+    */
+   const char *process_name = util_get_process_name();
+   int process_len = process_name ? strlen(process_name) : 0;
+   int name_len = strlen(name);
+   const int max_chars = sizeof(queue->name) - 1;
+
+   name_len = MIN2(name_len, max_chars);
+
+   /* See if there is any space left for the process name, reserve 1 for
+    * the colon. */
+   process_len = MIN2(process_len, max_chars - name_len - 1);
+   process_len = MAX2(process_len, 0);
+
    memset(queue, 0, sizeof(*queue));
-   queue->name = name;
+
+   if (process_len) {
+      util_snprintf(queue->name, sizeof(queue->name), "%.*s:%s",
+                    process_len, process_name, name);
+   } else {
+      util_snprintf(queue->name, sizeof(queue->name), "%s", name);
+   }
+
    queue->flags = flags;
    queue->num_threads = num_threads;
    queue->max_jobs = max_jobs;
diff --git a/src/util/u_queue.h b/src/util/u_queue.h
index d702c4b..714d924 100644
--- a/src/util/u_queue.h
+++ b/src/util/u_queue.h
@@ -199,7 +199,7 @@
 
 /* Put this into your context. */
 struct util_queue {
-   const char *name;
+   char name[14]; /* 13 characters = the thread name without the index */
    mtx_t finish_lock; /* only for util_queue_finish */
    mtx_t lock;
    cnd_t has_queued_cond;
diff --git a/src/util/vma.c b/src/util/vma.c
new file mode 100644
index 0000000..c8f5503
--- /dev/null
+++ b/src/util/vma.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+
+#include "util/u_math.h"
+#include "util/vma.h"
+
+struct util_vma_hole {
+   struct list_head link;
+   uint64_t offset;
+   uint64_t size;
+};
+
+#define util_vma_foreach_hole(_hole, _heap) \
+   list_for_each_entry(struct util_vma_hole, _hole, &(_heap)->holes, link)
+
+#define util_vma_foreach_hole_safe(_hole, _heap) \
+   list_for_each_entry_safe(struct util_vma_hole, _hole, &(_heap)->holes, link)
+
+void
+util_vma_heap_init(struct util_vma_heap *heap,
+                   uint64_t start, uint64_t size)
+{
+   list_inithead(&heap->holes);
+   util_vma_heap_free(heap, start, size);
+}
+
+void
+util_vma_heap_finish(struct util_vma_heap *heap)
+{
+   util_vma_foreach_hole_safe(hole, heap)
+      free(hole);
+}
+
+#ifndef NDEBUG
+static void
+util_vma_heap_validate(struct util_vma_heap *heap)
+{
+   uint64_t prev_offset = 0;
+   util_vma_foreach_hole(hole, heap) {
+      assert(hole->offset > 0);
+      assert(hole->size > 0);
+
+      if (&hole->link == heap->holes.next) {
+         /* This must be the top-most hole.  Assert that, if it overflows, it
+          * overflows to 0, i.e. 2^64.
+          */
+         assert(hole->size + hole->offset == 0 ||
+                hole->size + hole->offset > hole->offset);
+      } else {
+         /* This is not the top-most hole so it must not overflow and, in
+          * fact, must be strictly lower than the top-most hole.  If
+          * hole->size + hole->offset == prev_offset, then we failed to join
+          * holes during a util_vma_heap_free.
+          */
+         assert(hole->size + hole->offset > hole->offset &&
+                hole->size + hole->offset < prev_offset);
+      }
+      prev_offset = hole->offset;
+   }
+}
+#else
+#define util_vma_heap_validate(heap)
+#endif
+
+/* Allocate `size` bytes aligned to `alignment`, carving from the top of the
+ * first (highest) hole that can hold it.  Returns the address, or 0 on failure.
+ */
+uint64_t
+util_vma_heap_alloc(struct util_vma_heap *heap,
+                    uint64_t size, uint64_t alignment)
+{
+   /* The caller is expected to reject zero-size allocations */
+   assert(size > 0);
+   assert(alignment > 0);
+
+   util_vma_heap_validate(heap);
+
+   util_vma_foreach_hole_safe(hole, heap) {
+      if (size > hole->size)
+         continue;
+
+      /* Compute the offset as the highest address where a chunk of the given
+       * size can be without going over the top of the hole.
+       *
+       * This calculation is known to not overflow because we know that
+       * hole->size + hole->offset can only overflow to 0 and size > 0.
+       */
+      uint64_t offset = (hole->size - size) + hole->offset;
+
+      /* Align the offset.  We align down and not up because we are allocating
+       * from the top of the hole and not the bottom.
+       */
+      offset = (offset / alignment) * alignment;
+
+      if (offset < hole->offset)
+         continue;
+
+      if (offset == hole->offset && size == hole->size) {
+         /* Just get rid of the hole. */
+         list_del(&hole->link);
+         free(hole);
+         util_vma_heap_validate(heap);
+         return offset;
+      }
+
+      assert(offset - hole->offset <= hole->size - size);
+      uint64_t waste = (hole->size - size) - (offset - hole->offset);
+      if (waste == 0) {
+         /* We allocated at the top.  Shrink the hole down. */
+         hole->size -= size;
+         util_vma_heap_validate(heap);
+         return offset;
+      }
+
+      if (offset == hole->offset) {
+         /* We allocated at the bottom. Shrink the hole up. */
+         hole->offset += size;
+         hole->size -= size;
+         util_vma_heap_validate(heap);
+         return offset;
+      }
+
+      /* We allocated in the middle: split the old hole into a high and a low
+       * hole.  If the bookkeeping node cannot be allocated, fail with the heap
+       * still untouched rather than dereferencing NULL.
+       */
+      struct util_vma_hole *high_hole = calloc(1, sizeof(*high_hole));
+      if (!high_hole)
+         return 0;
+      high_hole->offset = offset + size;
+      high_hole->size = waste;
+
+      /* Shrink the old hole to the space left at the bottom of the original. */
+      hole->size = offset - hole->offset;
+
+      /* Insert the new hole before the old one to keep high-to-low order. */
+      list_addtail(&high_hole->link, &hole->link);
+      util_vma_heap_validate(heap);
+      return offset;
+   }
+
+   /* Failed to allocate */
+   return 0;
+}
+
+void
+util_vma_heap_free(struct util_vma_heap *heap,
+                   uint64_t offset, uint64_t size)
+{
+   /* An offset of 0 is reserved for allocation failure.  It is not a valid
+    * address and cannot be freed.
+    */
+   assert(offset > 0);
+
+   /* Freeing something with a size of 0 is also not valid. */
+   assert(size > 0);
+
+   /* It's possible for offset + size to wrap around if we touch the top of
+    * the 64-bit address space, but we cannot go any higher than 2^64.
+    */
+   assert(offset + size == 0 || offset + size > offset);
+
+   util_vma_heap_validate(heap);
+
+   /* Find immediately higher and lower holes if they exist. */
+   struct util_vma_hole *high_hole = NULL, *low_hole = NULL;
+   util_vma_foreach_hole(hole, heap) {
+      if (hole->offset <= offset) {
+         low_hole = hole;
+         break;
+      }
+      high_hole = hole;
+   }
+
+   if (high_hole)
+      assert(offset + size <= high_hole->offset);
+   bool high_adjacent = high_hole && offset + size == high_hole->offset;
+
+   if (low_hole) {
+      assert(low_hole->offset + low_hole->size > low_hole->offset);
+      assert(low_hole->offset + low_hole->size <= offset);
+   }
+   bool low_adjacent = low_hole && low_hole->offset + low_hole->size == offset;
+
+   if (low_adjacent && high_adjacent) {
+      /* Merge the two holes */
+      low_hole->size += size + high_hole->size;
+      list_del(&high_hole->link);
+      free(high_hole);
+   } else if (low_adjacent) {
+      /* Merge into the low hole */
+      low_hole->size += size;
+   } else if (high_adjacent) {
+      /* Merge into the high hole */
+      high_hole->offset = offset;
+      high_hole->size += size;
+   } else {
+      /* Neither hole is adjacent; make a new one */
+      struct util_vma_hole *hole = calloc(1, sizeof(*hole));
+
+      hole->offset = offset;
+      hole->size = size;
+
+      /* Add it after the high hole so we maintain high-to-low ordering */
+      if (high_hole)
+         list_add(&hole->link, &high_hole->link);
+      else
+         list_add(&hole->link, &heap->holes);
+   }
+
+   util_vma_heap_validate(heap);
+}
diff --git a/src/intel/tools/gen_disasm.h b/src/util/vma.h
similarity index 65%
copy from src/intel/tools/gen_disasm.h
copy to src/util/vma.h
index c8c18b2..ed69914 100644
--- a/src/intel/tools/gen_disasm.h
+++ b/src/util/vma.h
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2014 Intel Corporation
+ * Copyright © 2018 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -21,25 +21,33 @@
  * IN THE SOFTWARE.
  */
 
-#ifndef GEN_DISASM_H
-#define GEN_DISASM_H
+#ifndef _UTIL_VMA_H
+#define _UTIL_VMA_H
 
-#include "intel/dev/gen_device_info.h"
+#include <stdint.h>
+
+#include "list.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-struct gen_disasm;
+struct util_vma_heap {
+   struct list_head holes;
+};
 
-struct gen_disasm *gen_disasm_create(const struct gen_device_info *devinfo);
-void gen_disasm_disassemble(struct gen_disasm *disasm,
-                            void *assembly, int start, FILE *out);
+void util_vma_heap_init(struct util_vma_heap *heap,
+                        uint64_t start, uint64_t size);
+void util_vma_heap_finish(struct util_vma_heap *heap);
 
-void gen_disasm_destroy(struct gen_disasm *disasm);
+uint64_t util_vma_heap_alloc(struct util_vma_heap *heap,
+                             uint64_t size, uint64_t alignment);
+
+void util_vma_heap_free(struct util_vma_heap *heap,
+                        uint64_t offset, uint64_t size);
 
 #ifdef __cplusplus
-}
+} /* extern C */
 #endif
 
-#endif /* GEN_DISASM_H */
+#endif /* _UTIL_VMA_H */
diff --git a/src/util/xmlconfig.c b/src/util/xmlconfig.c
index 60a6331..ba65729 100644
--- a/src/util/xmlconfig.c
+++ b/src/util/xmlconfig.c
@@ -37,81 +37,8 @@
 #include <unistd.h>
 #include <errno.h>
 #include "xmlconfig.h"
+#include "u_process.h"
 
-#undef GET_PROGRAM_NAME
-
-#if (defined(__GNU_LIBRARY__) || defined(__GLIBC__)) && !defined(__UCLIBC__)
-#    if !defined(__GLIBC__) || (__GLIBC__ < 2)
-/* These aren't declared in any libc5 header */
-extern char *program_invocation_name, *program_invocation_short_name;
-#    endif
-#    define GET_PROGRAM_NAME() program_invocation_short_name
-#elif defined(__CYGWIN__)
-#    define GET_PROGRAM_NAME() program_invocation_short_name
-#elif defined(__FreeBSD__) && (__FreeBSD__ >= 2)
-#    include <osreldate.h>
-#    if (__FreeBSD_version >= 440000)
-#        include <stdlib.h>
-#        define GET_PROGRAM_NAME() getprogname()
-#    endif
-#elif defined(__NetBSD__) && defined(__NetBSD_Version__) && (__NetBSD_Version__ >= 106000100)
-#    include <stdlib.h>
-#    define GET_PROGRAM_NAME() getprogname()
-#elif defined(__DragonFly__)
-#    include <stdlib.h>
-#    define GET_PROGRAM_NAME() getprogname()
-#elif defined(__APPLE__)
-#    include <stdlib.h>
-#    define GET_PROGRAM_NAME() getprogname()
-#elif defined(__sun)
-/* Solaris has getexecname() which returns the full path - return just
-   the basename to match BSD getprogname() */
-#    include <stdlib.h>
-#    include <libgen.h>
-
-static const char *
-__getProgramName()
-{
-    static const char *progname;
-
-    if (progname == NULL) {
-        const char *e = getexecname();
-        if (e != NULL) {
-            /* Have to make a copy since getexecname can return a readonly
-               string, but basename expects to be able to modify its arg. */
-            char *n = strdup(e);
-            if (n != NULL) {
-                progname = basename(n);
-            }
-        }
-    }
-    return progname;
-}
-
-#    define GET_PROGRAM_NAME() __getProgramName()
-#endif
-
-#if !defined(GET_PROGRAM_NAME)
-#    if defined(__OpenBSD__) || defined(NetBSD) || defined(__UCLIBC__) || defined(ANDROID)
-/* This is a hack. It's said to work on OpenBSD, NetBSD and GNU.
- * Rogelio M.Serrano Jr. reported it's also working with UCLIBC. It's
- * used as a last resort, if there is no documented facility available. */
-static const char *
-__getProgramName()
-{
-    extern const char *__progname;
-    char * arg = strrchr(__progname, '/');
-    if (arg)
-        return arg+1;
-    else
-        return __progname;
-}
-#        define GET_PROGRAM_NAME() __getProgramName()
-#    else
-#        define GET_PROGRAM_NAME() ""
-#        warning "Per application configuration won't work with your OS version."
-#    endif
-#endif
 
 /** \brief Find an option in an option cache with the name as key */
 static uint32_t
@@ -998,7 +925,7 @@
     userData.cache = cache;
     userData.screenNum = screenNum;
     userData.driverName = driverName;
-    userData.execName = GET_PROGRAM_NAME();
+    userData.execName = util_get_process_name();
 
     if ((home = getenv ("HOME"))) {
         uint32_t len = strlen (home);
diff --git a/src/util/xmlpool/gen_xmlpool.py b/src/util/xmlpool/gen_xmlpool.py
index eb68a65..b0db183 100644
--- a/src/util/xmlpool/gen_xmlpool.py
+++ b/src/util/xmlpool/gen_xmlpool.py
@@ -7,6 +7,8 @@
 # `{localedir}/{language}/LC_MESSAGES/options.mo`.
 #
 
+from __future__ import print_function
+
 import sys
 import gettext
 import re
@@ -40,7 +42,7 @@
                 # open quote
                 q = u'\u201d'
             r = r + q
-        elif escapeSeqs.has_key(s[i]):
+        elif s[i] in escapeSeqs:
             r = r + escapeSeqs[s[i]]
         else:
             r = r + s[i]
@@ -88,7 +90,7 @@
                 escape = False
                 r = r + chr(num)
         else:
-            if escapeSeqs.has_key(s[i]):
+            if s[i] in escapeSeqs:
                 r = r + escapeSeqs[s[i]]
                 escape = False
             elif s[i] >= '0' and s[i] <= '7':
@@ -130,16 +132,16 @@
         # non-ascii unicode chars in the original English descriptions.
         text = escapeCString (trans.ugettext (unicode (expandCString (
             matches[0].expand (r'\5')), "utf-8"))).encode("utf-8")
-        print matches[0].expand (r'\1' + lang + r'\3"' + text + r'"\7') + suffix
+        print(matches[0].expand (r'\1' + lang + r'\3"' + text + r'"\7') + suffix)
         # Expand any subsequent enum lines
         for match in matches[1:]:
             text = escapeCString (trans.ugettext (unicode (expandCString (
                 match.expand (r'\3')), "utf-8"))).encode("utf-8")
-            print match.expand (r'\1"' + text + r'"\5')
+            print(match.expand (r'\1"' + text + r'"\5'))
 
         # Expand description end
         if end:
-            print end,
+            print(end, end='')
 
 # Compile a list of translation classes to all supported languages.
 # The first translation is always a NullTranslations.
@@ -160,14 +162,13 @@
 reDESC_END   = re.compile (r'\s*DRI_CONF_DESC_END')
 
 # Print a header
-print \
-"/***********************************************************************\n" \
+print("/***********************************************************************\n" \
 " ***        THIS FILE IS GENERATED AUTOMATICALLY. DON'T EDIT!        ***\n" \
-" ***********************************************************************/"
+" ***********************************************************************/")
 
 # Process the options template and generate options.h with all
 # translations.
-template = file (template_header_path, "r")
+template = open (template_header_path, "r")
 descMatches = []
 for line in template:
     if len(descMatches) > 0:
@@ -185,7 +186,7 @@
         continue
     if reLibintl_h.search (line):
         # Ignore (comment out) #include <libintl.h>
-        print "/* %s * commented out by gen_xmlpool.py */" % line
+        print("/* %s * commented out by gen_xmlpool.py */" % line)
         continue
     matchDESC       = reDESC      .match (line)
     matchDESC_BEGIN = reDESC_BEGIN.match (line)
@@ -196,7 +197,9 @@
         assert len(descMatches) == 0
         descMatches = [matchDESC_BEGIN]
     else:
-        print line,
+        print(line, end='')
+
+template.close()
 
 if len(descMatches) > 0:
     sys.stderr.write ("Warning: unterminated description at end of file.\n")
diff --git a/src/util/xmlpool/t_options.h b/src/util/xmlpool/t_options.h
index 3ada813..7d21750 100644
--- a/src/util/xmlpool/t_options.h
+++ b/src/util/xmlpool/t_options.h
@@ -115,6 +115,16 @@
         DRI_CONF_DESC(en,gettext("Allow GLSL #extension directives in the middle of shaders")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_ALLOW_GLSL_BUILTIN_CONST_EXPRESSION(def) \
+DRI_CONF_OPT_BEGIN_B(allow_glsl_builtin_const_expression, def) \
+        DRI_CONF_DESC(en,gettext("Allow builtins as part of constant expressions")) \
+DRI_CONF_OPT_END
+
+#define DRI_CONF_ALLOW_GLSL_RELAXED_ES(def) \
+DRI_CONF_OPT_BEGIN_B(allow_glsl_relaxed_es, def) \
+        DRI_CONF_DESC(en,gettext("Allow some relaxation of GLSL ES shader restrictions")) \
+DRI_CONF_OPT_END
+
 #define DRI_CONF_ALLOW_GLSL_BUILTIN_VARIABLE_REDECLARATION(def) \
 DRI_CONF_OPT_BEGIN_B(allow_glsl_builtin_variable_redeclaration, def) \
         DRI_CONF_DESC(en,gettext("Allow GLSL built-in variables to be redeclared verbatim")) \
@@ -140,6 +150,11 @@
         DRI_CONF_DESC(en,gettext("Allow interpolation qualifier mismatch across shader stages")) \
 DRI_CONF_OPT_END
 
+#define DRI_CONF_FORCE_COMPAT_PROFILE(def) \
+DRI_CONF_OPT_BEGIN_B(force_compat_profile, def) \
+        DRI_CONF_DESC(en,gettext("Force an OpenGL compatibility context")) \
+DRI_CONF_OPT_END
+
 /**
  * \brief Image quality-related options
  */
diff --git a/src/vulkan/Makefile.am b/src/vulkan/Makefile.am
index 3da03ed..ce1a79d 100644
--- a/src/vulkan/Makefile.am
+++ b/src/vulkan/Makefile.am
@@ -54,6 +54,20 @@
 VULKAN_WSI_SOURCES += $(VULKAN_WSI_X11_FILES)
 endif
 
+if HAVE_PLATFORM_DRM
+AM_CPPFLAGS += \
+	-DVK_USE_PLATFORM_DISPLAY_KHR
+
+VULKAN_WSI_SOURCES += $(VULKAN_WSI_DISPLAY_FILES)
+endif
+
+if HAVE_XLIB_LEASE
+AM_CPPFLAGS += \
+	$(XCB_RANDR_CFLAGS) \
+	$(XLIB_RANDR_CFLAGS) \
+	-DVK_USE_PLATFORM_XLIB_XRANDR_EXT
+endif
+
 CLEANFILES = \
 	$(VULKAN_UTIL_GENERATED_FILES) \
 	$(VULKAN_WSI_WAYLAND_GENERATED_FILES)
diff --git a/src/vulkan/Makefile.sources b/src/vulkan/Makefile.sources
index 101a943..f0f6bcd 100644
--- a/src/vulkan/Makefile.sources
+++ b/src/vulkan/Makefile.sources
@@ -19,6 +19,10 @@
 	wsi/wsi_common_x11.c \
 	wsi/wsi_common_x11.h
 
+VULKAN_WSI_DISPLAY_FILES := \
+	wsi/wsi_common_display.c \
+	wsi/wsi_common_display.h
+
 VULKAN_UTIL_FILES := \
 	util/vk_alloc.h \
 	util/vk_debug_report.c \
diff --git a/src/vulkan/util/gen_enum_to_str.py b/src/vulkan/util/gen_enum_to_str.py
index fa47099..6d36ef5 100755
--- a/src/vulkan/util/gen_enum_to_str.py
+++ b/src/vulkan/util/gen_enum_to_str.py
@@ -177,7 +177,7 @@
     def add_value_from_xml(self, elem, extension=None):
         if 'value' in elem.attrib:
             self.add_value(elem.attrib['name'],
-                           value=int(elem.attrib['value'], 0))
+                           value=int(elem.attrib['value'], base=0))
         elif 'alias' in elem.attrib:
             self.add_value(elem.attrib['name'],
                            value=self.name_to_value[elem.attrib['alias']])
diff --git a/src/vulkan/util/vk_alloc.h b/src/vulkan/util/vk_alloc.h
index f58a806..69cead7 100644
--- a/src/vulkan/util/vk_alloc.h
+++ b/src/vulkan/util/vk_alloc.h
@@ -67,6 +67,23 @@
    alloc->pfnFree(alloc->pUserData, data);
 }
 
+/* Copy NUL-terminated `s` using `alloc`; NULL if `s` is NULL or on OOM. */
+static inline char *
+vk_strdup(const VkAllocationCallbacks *alloc, const char *s,
+          VkSystemAllocationScope scope)
+{
+   if (!s)
+      return NULL;
+
+   /* The byte count includes the terminating NUL. */
+   const size_t len_with_nul = strlen(s) + 1;
+   char *dup = (char*) vk_alloc(alloc, len_with_nul, 1, scope);
+   if (dup)
+      memcpy(dup, s, len_with_nul);
+
+   return dup;
+}
+
 static inline void *
 vk_alloc2(const VkAllocationCallbacks *parent_alloc,
           const VkAllocationCallbacks *alloc,
diff --git a/src/vulkan/wsi/meson.build b/src/vulkan/wsi/meson.build
index 223c8ca..d073b23 100644
--- a/src/vulkan/wsi/meson.build
+++ b/src/vulkan/wsi/meson.build
@@ -59,6 +59,19 @@
   ]
 endif
 
+if with_platform_drm
+  vulkan_wsi_args += '-DVK_USE_PLATFORM_DISPLAY_KHR'
+  files_vulkan_wsi += files(
+    'wsi_common_display.c',
+    'wsi_common_display.h',
+  )
+endif
+
+if with_xlib_lease
+  vulkan_wsi_deps += [dep_xcb_xrandr, dep_xlib_xrandr]
+  vulkan_wsi_args += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
+endif
+
 libvulkan_wsi = static_library(
   'vulkan_wsi',
   files_vulkan_wsi,
diff --git a/src/vulkan/wsi/wsi_common.c b/src/vulkan/wsi/wsi_common.c
index 87e508d..3416fef 100644
--- a/src/vulkan/wsi/wsi_common.c
+++ b/src/vulkan/wsi/wsi_common.c
@@ -32,7 +32,8 @@
 wsi_device_init(struct wsi_device *wsi,
                 VkPhysicalDevice pdevice,
                 WSI_FN_GetPhysicalDeviceProcAddr proc_addr,
-                const VkAllocationCallbacks *alloc)
+                const VkAllocationCallbacks *alloc,
+                int display_fd)
 {
    VkResult result;
 
@@ -91,6 +92,12 @@
       goto fail;
 #endif
 
+#ifdef VK_USE_PLATFORM_DISPLAY_KHR
+   result = wsi_display_init_wsi(wsi, alloc, display_fd);
+   if (result != VK_SUCCESS)
+      goto fail;
+#endif
+
    return VK_SUCCESS;
 
 fail:
@@ -102,6 +109,9 @@
 wsi_device_finish(struct wsi_device *wsi,
                   const VkAllocationCallbacks *alloc)
 {
+#ifdef VK_USE_PLATFORM_DISPLAY_KHR
+   wsi_display_finish_wsi(wsi, alloc);
+#endif
 #ifdef VK_USE_PLATFORM_WAYLAND_KHR
    wsi_wl_finish_wsi(wsi, alloc);
 #endif
@@ -685,7 +695,16 @@
    ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
    struct wsi_interface *iface = wsi_device->wsi[surface->platform];
 
-   return iface->get_capabilities(surface, pSurfaceCapabilities);
+   VkSurfaceCapabilities2KHR caps2 = {
+      .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
+   };
+
+   VkResult result = iface->get_capabilities2(surface, NULL, &caps2);
+
+   if (result == VK_SUCCESS)
+      *pSurfaceCapabilities = caps2.surfaceCapabilities;
+
+   return result;
 }
 
 VkResult
@@ -701,6 +720,51 @@
 }
 
 VkResult
+wsi_common_get_surface_capabilities2ext(
+   struct wsi_device *wsi_device,
+   VkSurfaceKHR _surface,
+   VkSurfaceCapabilities2EXT *pSurfaceCapabilities)
+{
+   ICD_FROM_HANDLE(VkIcdSurfaceBase, surface, _surface);
+   struct wsi_interface *iface = wsi_device->wsi[surface->platform];
+   /* Answers the EXT query through the platform's get_capabilities2 hook. */
+   assert(pSurfaceCapabilities->sType ==
+          VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_EXT);
+   /* Chained ahead of the caller's pNext; counter flags start out as 0. */
+   struct wsi_surface_supported_counters counters = {
+      .sType = VK_STRUCTURE_TYPE_WSI_SURFACE_SUPPORTED_COUNTERS_MESA,
+      .pNext = pSurfaceCapabilities->pNext,
+      .supported_surface_counters = 0,
+   };
+
+   VkSurfaceCapabilities2KHR caps2 = {
+      .sType = VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR,
+      .pNext = &counters,
+   };
+
+   VkResult result = iface->get_capabilities2(surface, NULL, &caps2);
+   /* On failure the caller's struct is left unmodified. */
+   if (result == VK_SUCCESS) {
+      VkSurfaceCapabilities2EXT *ext_caps = pSurfaceCapabilities;
+      VkSurfaceCapabilitiesKHR khr_caps = caps2.surfaceCapabilities;
+
+      ext_caps->minImageCount = khr_caps.minImageCount;
+      ext_caps->maxImageCount = khr_caps.maxImageCount;
+      ext_caps->currentExtent = khr_caps.currentExtent;
+      ext_caps->minImageExtent = khr_caps.minImageExtent;
+      ext_caps->maxImageExtent = khr_caps.maxImageExtent;
+      ext_caps->maxImageArrayLayers = khr_caps.maxImageArrayLayers;
+      ext_caps->supportedTransforms = khr_caps.supportedTransforms;
+      ext_caps->currentTransform = khr_caps.currentTransform;
+      ext_caps->supportedCompositeAlpha = khr_caps.supportedCompositeAlpha;
+      ext_caps->supportedUsageFlags = khr_caps.supportedUsageFlags;
+      ext_caps->supportedSurfaceCounters = counters.supported_surface_counters;
+   }
+
+   return result;
+}
+
+VkResult
 wsi_common_get_surface_formats(struct wsi_device *wsi_device,
                                VkSurfaceKHR _surface,
                                uint32_t *pSurfaceFormatCount,
@@ -792,17 +856,14 @@
 }
 
 VkResult
-wsi_common_acquire_next_image(const struct wsi_device *wsi,
-                              VkDevice device,
-                              VkSwapchainKHR _swapchain,
-                              uint64_t timeout,
-                              VkSemaphore semaphore,
-                              uint32_t *pImageIndex)
+wsi_common_acquire_next_image2(const struct wsi_device *wsi,
+                               VkDevice device,
+                               const VkAcquireNextImageInfoKHR *pAcquireInfo,
+                               uint32_t *pImageIndex)
 {
-   WSI_FROM_HANDLE(wsi_swapchain, swapchain, _swapchain);
+   WSI_FROM_HANDLE(wsi_swapchain, swapchain, pAcquireInfo->swapchain);
 
-   return swapchain->acquire_next_image(swapchain, timeout,
-                                        semaphore, pImageIndex);
+   return swapchain->acquire_next_image(swapchain, pAcquireInfo, pImageIndex);
 }
 
 VkResult
diff --git a/src/vulkan/wsi/wsi_common.h b/src/vulkan/wsi/wsi_common.h
index 6cf729b..14f6509 100644
--- a/src/vulkan/wsi/wsi_common.h
+++ b/src/vulkan/wsi/wsi_common.h
@@ -36,6 +36,7 @@
 #define VK_STRUCTURE_TYPE_WSI_IMAGE_CREATE_INFO_MESA (VkStructureType)1000001002
 #define VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA (VkStructureType)1000001003
 #define VK_STRUCTURE_TYPE_WSI_FORMAT_MODIFIER_PROPERTIES_LIST_MESA (VkStructureType)1000001004
+#define VK_STRUCTURE_TYPE_WSI_SURFACE_SUPPORTED_COUNTERS_MESA (VkStructureType)1000001005
 
 struct wsi_image_create_info {
     VkStructureType sType;
@@ -66,9 +67,27 @@
    struct wsi_format_modifier_properties *modifier_properties;
 };
 
+/* To be chained into VkSurfaceCapabilities2KHR */
+struct wsi_surface_supported_counters {
+   VkStructureType sType; /* ..._WSI_SURFACE_SUPPORTED_COUNTERS_MESA */
+   const void *pNext;
+   /* Presumably written by the platform's get_capabilities2 - confirm. */
+   VkSurfaceCounterFlagsEXT supported_surface_counters;
+
+};
+
+struct wsi_fence {
+   VkDevice                     device;
+   const struct wsi_device      *wsi_device;
+   VkDisplayKHR                 display;
+   const VkAllocationCallbacks  *alloc;
+   VkResult                     (*wait)(struct wsi_fence *fence, uint64_t abs_timeout);
+   void                         (*destroy)(struct wsi_fence *fence);
+};
+
 struct wsi_interface;
 
-#define VK_ICD_WSI_PLATFORM_MAX 5
+#define VK_ICD_WSI_PLATFORM_MAX (VK_ICD_WSI_PLATFORM_DISPLAY + 1)
 
 struct wsi_device {
    VkPhysicalDevice pdevice;
@@ -116,7 +135,8 @@
 wsi_device_init(struct wsi_device *wsi,
                 VkPhysicalDevice pdevice,
                 WSI_FN_GetPhysicalDeviceProcAddr proc_addr,
-                const VkAllocationCallbacks *alloc);
+                const VkAllocationCallbacks *alloc,
+                int display_fd);
 
 void
 wsi_device_finish(struct wsi_device *wsi,
@@ -178,17 +198,21 @@
                                      VkPresentModeKHR *pPresentModes);
 
 VkResult
+wsi_common_get_surface_capabilities2ext(
+   struct wsi_device *wsi_device,
+   VkSurfaceKHR surface,
+   VkSurfaceCapabilities2EXT *pSurfaceCapabilities);
+
+VkResult
 wsi_common_get_images(VkSwapchainKHR _swapchain,
                       uint32_t *pSwapchainImageCount,
                       VkImage *pSwapchainImages);
 
 VkResult
-wsi_common_acquire_next_image(const struct wsi_device *wsi,
-                              VkDevice device,
-                              VkSwapchainKHR swapchain,
-                              uint64_t timeout,
-                              VkSemaphore semaphore,
-                              uint32_t *pImageIndex);
+wsi_common_acquire_next_image2(const struct wsi_device *wsi,
+                               VkDevice device,
+                               const VkAcquireNextImageInfoKHR *pAcquireInfo,
+                               uint32_t *pImageIndex);
 
 VkResult
 wsi_common_create_swapchain(struct wsi_device *wsi,
diff --git a/src/vulkan/wsi/wsi_common_display.c b/src/vulkan/wsi/wsi_common_display.c
new file mode 100644
index 0000000..bc87ce4
--- /dev/null
+++ b/src/vulkan/wsi/wsi_common_display.c
@@ -0,0 +1,2468 @@
+/*
+ * Copyright © 2017 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include "util/macros.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <math.h>
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+#include <drm_fourcc.h>
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+#include <xcb/randr.h>
+#include <X11/Xlib-xcb.h>
+#endif
+#include "util/hash_table.h"
+#include "util/list.h"
+
+#include "vk_util.h"
+#include "wsi_common_private.h"
+#include "wsi_common_display.h"
+#include "wsi_common_queue.h"
+
+#if 0
+#define wsi_display_debug(...) fprintf(stderr, __VA_ARGS__)
+#define wsi_display_debug_code(...)     __VA_ARGS__
+#else
+#define wsi_display_debug(...)
+#define wsi_display_debug_code(...)
+#endif
+
+/* These have lifetime equal to the instance, so they effectively
+ * never go away. This means we must keep track of them separately
+ * from all other resources.
+ */
+typedef struct wsi_display_mode {
+   struct list_head             list; /* link in connector->display_modes */
+   struct wsi_display_connector *connector; /* connector it was found on */
+   bool                         valid; /* was found in most recent poll */
+   bool                         preferred; /* DRM_MODE_TYPE_PREFERRED set */
+   uint32_t                     clock; /* in kHz */
+   uint16_t                     hdisplay, hsync_start, hsync_end, htotal, hskew;
+   uint16_t                     vdisplay, vsync_start, vsync_end, vtotal, vscan;
+   uint32_t                     flags; /* copied from drmModeModeInfo flags */
+} wsi_display_mode;
+
+typedef struct wsi_display_connector {
+   struct list_head             list; /* link in wsi_display connectors */
+   struct wsi_display           *wsi; /* backend that allocated this */
+   uint32_t                     id; /* DRM connector id */
+   uint32_t                     crtc_id;
+   char                         *name; /* currently a string literal */
+   bool                         connected; /* != DRM_MODE_DISCONNECTED */
+   bool                         active;
+   struct list_head             display_modes; /* wsi_display_mode list */
+   wsi_display_mode             *current_mode;
+   drmModeModeInfo              current_drm_mode;
+   uint32_t                     dpms_property; /* "DPMS" id; 0 = not found */
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+   xcb_randr_output_t           output;
+#endif
+} wsi_display_connector;
+
+struct wsi_display {
+   struct wsi_interface         base;
+
+   const VkAllocationCallbacks  *alloc; /* used for modes and connectors */
+
+   int                          fd; /* DRM fd; < 0 disables DRM queries */
+
+   pthread_mutex_t              wait_mutex;
+   pthread_cond_t               wait_cond;
+   pthread_t                    wait_thread;
+
+   struct list_head             connectors; /* wsi_display_connector list */
+};
+
+#define wsi_for_each_display_mode(_mode, _conn)                 \
+   list_for_each_entry_safe(struct wsi_display_mode, _mode,     \
+                            &(_conn)->display_modes, list)
+
+#define wsi_for_each_connector(_conn, _dev)                             \
+   list_for_each_entry_safe(struct wsi_display_connector, _conn,        \
+                            &(_dev)->connectors, list)
+
+enum wsi_image_state {
+   WSI_IMAGE_IDLE,
+   WSI_IMAGE_DRAWING,
+   WSI_IMAGE_QUEUED,
+   WSI_IMAGE_FLIPPING,
+   WSI_IMAGE_DISPLAYING
+};
+
+struct wsi_display_image {
+   struct wsi_image             base;
+   struct wsi_display_swapchain *chain;
+   enum wsi_image_state         state;
+   uint32_t                     fb_id;
+   uint32_t                     buffer[4];
+   uint64_t                     flip_sequence;
+};
+
+struct wsi_display_swapchain {
+   struct wsi_swapchain         base;
+   struct wsi_display           *wsi;
+   VkIcdSurfaceDisplay          *surface;
+   uint64_t                     flip_sequence;
+   VkResult                     status;
+   struct wsi_display_image     images[]; /* C99 flexible array member */
+};
+
+struct wsi_display_fence {
+   struct wsi_fence             base;
+   bool                         event_received;
+   bool                         destroyed;
+   uint64_t                     sequence;
+};
+
+static uint64_t fence_sequence;
+
+ICD_DEFINE_NONDISP_HANDLE_CASTS(wsi_display_mode, VkDisplayModeKHR)
+ICD_DEFINE_NONDISP_HANDLE_CASTS(wsi_display_connector, VkDisplayKHR)
+
+static bool
+wsi_display_mode_matches_drm(wsi_display_mode *wsi,
+                             drmModeModeInfoPtr drm)
+{
+   /* Timings must agree field by field; a vscan of 0 is treated as 1. */
+   const bool horiz_same = wsi->hdisplay == drm->hdisplay &&
+      wsi->hsync_start == drm->hsync_start &&
+      wsi->hsync_end == drm->hsync_end &&
+      wsi->htotal == drm->htotal && wsi->hskew == drm->hskew;
+   const bool vert_same = wsi->vdisplay == drm->vdisplay &&
+      wsi->vsync_start == drm->vsync_start &&
+      wsi->vsync_end == drm->vsync_end &&
+      wsi->vtotal == drm->vtotal &&
+      MAX2(wsi->vscan, 1) == MAX2(drm->vscan, 1);
+   return wsi->clock == drm->clock && wsi->flags == drm->flags &&
+      horiz_same && vert_same;
+}
+
+static double
+wsi_display_mode_refresh(struct wsi_display_mode *wsi)
+{
+   /* clock is in kHz: scale by 1000, then divide by htotal*vtotal*vscan */
+   const double pixel_hz = (double) wsi->clock * 1000.0;
+   const double vscan = (double) MAX2(wsi->vscan, 1);
+   return pixel_hz / ((double) wsi->htotal * (double) wsi->vtotal * vscan);
+}
+
+static uint64_t wsi_get_current_monotonic(void)
+{
+   /* CLOCK_MONOTONIC reading folded into a single nanosecond count */
+   struct timespec now;
+   clock_gettime(CLOCK_MONOTONIC, &now);
+   return now.tv_sec*1000000000ull + now.tv_nsec;
+}
+
+static uint64_t wsi_rel_to_abs_time(uint64_t rel_time)
+{
+   /* Absolute deadline = monotonic now + rel_time, saturating at
+    * UINT64_MAX instead of wrapping around.
+    */
+   const uint64_t now = wsi_get_current_monotonic();
+   const uint64_t headroom = UINT64_MAX - now;
+
+   return rel_time > headroom ? UINT64_MAX : now + rel_time;
+}
+
+static struct wsi_display_mode *
+wsi_display_find_drm_mode(struct wsi_device *wsi_device,
+                          struct wsi_display_connector *connector,
+                          drmModeModeInfoPtr mode)
+{
+   /* First recorded mode of this connector with identical timings, or NULL */
+   wsi_for_each_display_mode(candidate, connector)
+      if (wsi_display_mode_matches_drm(candidate, mode))
+         return candidate;
+   return NULL;
+}
+
+static void
+wsi_display_invalidate_connector_modes(struct wsi_device *wsi_device,
+                                       struct wsi_display_connector *connector)
+{
+   /* Clear every mode's valid flag; registering a mode sets it again. */
+   wsi_for_each_display_mode(stale, connector)
+      stale->valid = false;
+}
+
+static VkResult
+wsi_display_register_drm_mode(struct wsi_device *wsi_device,
+                              struct wsi_display_connector *connector,
+                              drmModeModeInfoPtr drm_mode)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_mode *display_mode =
+      wsi_display_find_drm_mode(wsi_device, connector, drm_mode);
+   /* Known mode: just mark it valid again. Otherwise record a new one. */
+   if (display_mode) {
+      display_mode->valid = true;
+      return VK_SUCCESS;
+   }
+
+   display_mode = vk_zalloc(wsi->alloc, sizeof (struct wsi_display_mode),
+                            8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!display_mode)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   /* Copy the DRM timings so later polls can be matched against them. */
+   display_mode->connector = connector;
+   display_mode->valid = true;
+   display_mode->preferred = (drm_mode->type & DRM_MODE_TYPE_PREFERRED) != 0;
+   display_mode->clock = drm_mode->clock; /* kHz */
+   display_mode->hdisplay = drm_mode->hdisplay;
+   display_mode->hsync_start = drm_mode->hsync_start;
+   display_mode->hsync_end = drm_mode->hsync_end;
+   display_mode->htotal = drm_mode->htotal;
+   display_mode->hskew = drm_mode->hskew;
+   display_mode->vdisplay = drm_mode->vdisplay;
+   display_mode->vsync_start = drm_mode->vsync_start;
+   display_mode->vsync_end = drm_mode->vsync_end;
+   display_mode->vtotal = drm_mode->vtotal;
+   display_mode->vscan = drm_mode->vscan;
+   display_mode->flags = drm_mode->flags;
+   /* New modes go at the tail of the connector's mode list. */
+   list_addtail(&display_mode->list, &connector->display_modes);
+   return VK_SUCCESS;
+}
+
+/*
+ * Look up an already-recorded connector by its DRM connector id.
+ * Returns NULL when no connector with that id is on the list.
+ */
+static struct wsi_display_connector *
+wsi_display_find_connector(struct wsi_device *wsi_device,
+                          uint32_t connector_id)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   wsi_for_each_connector(connector, wsi) {
+      if (connector->id == connector_id)
+         return connector;
+   }
+
+   return NULL;
+}
+
+static struct wsi_display_connector *
+wsi_display_alloc_connector(struct wsi_display *wsi, uint32_t connector_id)
+{
+   struct wsi_display_connector *connector =
+      vk_zalloc(wsi->alloc, sizeof (struct wsi_display_connector),
+                8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (!connector)
+      return NULL; /* out of memory; wsi_display_get_connector checks this */
+   connector->id = connector_id;
+   connector->wsi = wsi;
+   connector->active = false;
+   /* XXX use EDID name */
+   connector->name = "monitor";
+   list_inithead(&connector->display_modes);
+   return connector;
+}
+
+static struct wsi_display_connector *
+wsi_display_get_connector(struct wsi_device *wsi_device,
+                          uint32_t connector_id)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   /* Query DRM for this connector and refresh (or create) our record. */
+   if (wsi->fd < 0)
+      return NULL;
+
+   drmModeConnectorPtr drm_connector =
+      drmModeGetConnector(wsi->fd, connector_id);
+
+   if (!drm_connector)
+      return NULL;
+
+   struct wsi_display_connector *connector =
+      wsi_display_find_connector(wsi_device, connector_id);
+   /* First time this id is seen: allocate a record and put it on the list. */
+   if (!connector) {
+      connector = wsi_display_alloc_connector(wsi, connector_id);
+      if (!connector) {
+         drmModeFreeConnector(drm_connector);
+         return NULL;
+      }
+      list_addtail(&connector->list, &wsi->connectors);
+   }
+
+   connector->connected = drm_connector->connection != DRM_MODE_DISCONNECTED;
+
+   /* Look for a DPMS property if we haven't already found one */
+   for (int p = 0; connector->dpms_property == 0 &&
+           p < drm_connector->count_props; p++)
+   {
+      drmModePropertyPtr prop = drmModeGetProperty(wsi->fd,
+                                                   drm_connector->props[p]);
+      if (!prop)
+         continue;
+      if (prop->flags & DRM_MODE_PROP_ENUM) {
+         if (!strcmp(prop->name, "DPMS"))
+            connector->dpms_property = drm_connector->props[p];
+      }
+      drmModeFreeProperty(prop);
+   }
+
+   /* Mark all connector modes as invalid */
+   wsi_display_invalidate_connector_modes(wsi_device, connector);
+
+   /*
+    * List current modes, adding new ones and marking existing ones as
+    * valid
+    */
+   for (int m = 0; m < drm_connector->count_modes; m++) {
+      VkResult result = wsi_display_register_drm_mode(wsi_device,
+                                                      connector,
+                                                      &drm_connector->modes[m]);
+      if (result != VK_SUCCESS) { /* registration only fails on OOM */
+         drmModeFreeConnector(drm_connector);
+         return NULL;
+      }
+   }
+
+   drmModeFreeConnector(drm_connector);
+   /* The returned record is also linked on wsi->connectors. */
+   return connector;
+}
+
+#define MM_PER_PIXEL     (1.0/96.0 * 25.4)
+
+static uint32_t mode_size(struct wsi_display_mode *mode)
+{
+   /* Pixel area; the product of two uint16_t values always fits uint32_t. */
+   const uint32_t width = mode->hdisplay, height = mode->vdisplay;
+   return width * height;
+}
+
+static void
+wsi_display_fill_in_display_properties(struct wsi_device *wsi_device,
+                                       struct wsi_display_connector *connector,
+                                       VkDisplayProperties2KHR *properties2)
+{
+   assert(properties2->sType == VK_STRUCTURE_TYPE_DISPLAY_PROPERTIES_2_KHR);
+   VkDisplayPropertiesKHR *properties = &properties2->displayProperties;
+   /* Everything below fills the struct embedded in properties2. */
+   properties->display = wsi_display_connector_to_handle(connector);
+   properties->displayName = connector->name;
+
+   /* Find the first preferred mode and assume that's the physical
+    * resolution. If there isn't a preferred mode, find the largest mode and
+    * use that. Modes not marked valid are skipped; on equal sizes the
+    * earlier mode in the list is kept.
+    */
+   struct wsi_display_mode *preferred_mode = NULL, *largest_mode = NULL;
+   wsi_for_each_display_mode(display_mode, connector) {
+      if (!display_mode->valid)
+         continue;
+      if (display_mode->preferred) {
+         preferred_mode = display_mode;
+         break;
+      }
+      if (largest_mode == NULL ||
+          mode_size(display_mode) > mode_size(largest_mode))
+      {
+         largest_mode = display_mode;
+      }
+   }
+
+   if (preferred_mode) {
+      properties->physicalResolution.width = preferred_mode->hdisplay;
+      properties->physicalResolution.height = preferred_mode->vdisplay;
+   } else if (largest_mode) {
+      properties->physicalResolution.width = largest_mode->hdisplay;
+      properties->physicalResolution.height = largest_mode->vdisplay;
+   } else { /* no valid mode at all: fixed fallback resolution */
+      properties->physicalResolution.width = 1024;
+      properties->physicalResolution.height = 768;
+   }
+
+   /* Make up physical size in mm based on 96dpi, rounded to nearest */
+   properties->physicalDimensions.width =
+      floor(properties->physicalResolution.width * MM_PER_PIXEL + 0.5);
+   properties->physicalDimensions.height =
+      floor(properties->physicalResolution.height * MM_PER_PIXEL + 0.5);
+
+   properties->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   properties->planeReorderPossible = VK_FALSE;
+   properties->persistentContent = VK_FALSE;
+}
+
+/*
+ * Implement vkGetPhysicalDeviceDisplayPropertiesKHR (VK_KHR_display)
+ *
+ * Adapter over wsi_display_get_physical_device_display_properties2().
+ * With properties == NULL only the count is queried. Otherwise up to
+ * *property_count entries are written and *property_count is updated by the
+ * properties2 call; its result is returned unchanged. Returns
+ * VK_ERROR_OUT_OF_HOST_MEMORY if the temporary array cannot be allocated.
+ */
+VkResult
+wsi_display_get_physical_device_display_properties(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPropertiesKHR *properties)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   if (properties == NULL) {
+      return wsi_display_get_physical_device_display_properties2(
+            physical_device, wsi_device, property_count, NULL);
+   } else {
+      /* If we're actually returning properties, allocate a temporary array of
+       * VkDisplayProperties2KHR structs, call properties2 to fill them out,
+       * and then copy them to the client.  This seems a bit expensive but
+       * wsi_display_get_physical_device_display_properties2() calls
+       * drmModeGetResources() which does an ioctl and then a bunch of
+       * allocations so this should get lost in the noise.
+       */
+      const uint32_t capacity = *property_count;
+
+      /* A capacity of zero is a legal request. Never ask the allocator for
+       * zero bytes: it may answer NULL, which would be misreported as an
+       * out-of-memory error below.
+       */
+      VkDisplayProperties2KHR *props2 =
+         vk_zalloc(wsi->alloc,
+                   sizeof(*props2) * (capacity ? capacity : 1), 8,
+                   VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+      if (props2 == NULL)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+      /* the properties2 path asserts on sType for every element it fills */
+      for (uint32_t i = 0; i < capacity; i++)
+         props2[i].sType = VK_STRUCTURE_TYPE_DISPLAY_PROPERTIES_2_KHR;
+
+      VkResult result = wsi_display_get_physical_device_display_properties2(
+            physical_device, wsi_device, property_count, props2);
+
+      /* *property_count now holds the number of entries actually written */
+      if (result == VK_SUCCESS || result == VK_INCOMPLETE) {
+         for (uint32_t i = 0; i < *property_count; i++)
+            properties[i] = props2[i].displayProperties;
+      }
+
+      vk_free(wsi->alloc, props2);
+
+      return result;
+   }
+}
+
+/*
+ * VkDisplayProperties2KHR flavour of the display property query: reports
+ * one entry per connector that is currently connected.
+ *
+ * Reports zero displays with VK_SUCCESS when there is no DRM fd or the
+ * resource query fails, and VK_ERROR_OUT_OF_HOST_MEMORY when a connector
+ * cannot be looked up.
+ */
+VkResult
+wsi_display_get_physical_device_display_properties2(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayProperties2KHR *properties)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   drmModeResPtr mode_res = NULL;
+   if (wsi->fd >= 0)
+      mode_res = drmModeGetResources(wsi->fd);
+
+   if (mode_res == NULL) {
+      *property_count = 0;
+      return VK_SUCCESS;
+   }
+
+   VK_OUTARRAY_MAKE(out, properties, property_count);
+
+   /* Refresh every connector, then report the connected ones */
+   for (int c = 0; c < mode_res->count_connectors; c++) {
+      struct wsi_display_connector *connector =
+         wsi_display_get_connector(wsi_device, mode_res->connectors[c]);
+
+      if (connector == NULL) {
+         drmModeFreeResources(mode_res);
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+
+      if (!connector->connected)
+         continue;
+
+      vk_outarray_append(&out, prop) {
+         wsi_display_fill_in_display_properties(wsi_device, connector, prop);
+      }
+   }
+
+   const VkResult status = vk_outarray_status(&out);
+
+   drmModeFreeResources(mode_res);
+
+   return status;
+}
+
+/*
+ * Implement vkGetPhysicalDeviceDisplayPlanePropertiesKHR (VK_KHR_display)
+ */
+/*
+ * Fill the plane properties for the plane paired with a connector: the
+ * connector is reported as the current display only while it is active, and
+ * the stack index is always 0.
+ */
+static void
+wsi_display_fill_in_display_plane_properties(
+   struct wsi_device *wsi_device,
+   struct wsi_display_connector *connector,
+   VkDisplayPlaneProperties2KHR *properties)
+{
+   assert(properties->sType == VK_STRUCTURE_TYPE_DISPLAY_PLANE_PROPERTIES_2_KHR);
+   VkDisplayPlanePropertiesKHR *plane = &properties->displayPlaneProperties;
+
+   /* start out detached from any display ... */
+   plane->currentStackIndex = 0;
+   plane->currentDisplay = VK_NULL_HANDLE;
+
+   /* ... and attach only when there is an active connector */
+   if (connector != NULL && connector->active)
+      plane->currentDisplay = wsi_display_connector_to_handle(connector);
+}
+
+/*
+ * Report one plane per connector known to wsi, in connector-list order,
+ * using the VkDisplayPlanePropertiesKHR layout.
+ */
+VkResult
+wsi_display_get_physical_device_display_plane_properties(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPlanePropertiesKHR *properties)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   VK_OUTARRAY_MAKE(out, properties, property_count);
+
+   wsi_for_each_connector(connector, wsi) {
+      vk_outarray_append(&out, plane) {
+         /* fill a temporary 2KHR wrapper, then copy out the embedded struct */
+         VkDisplayPlaneProperties2KHR wrapped = {
+            .sType = VK_STRUCTURE_TYPE_DISPLAY_PLANE_PROPERTIES_2_KHR,
+         };
+
+         wsi_display_fill_in_display_plane_properties(wsi_device, connector,
+                                                      &wrapped);
+         *plane = wrapped.displayPlaneProperties;
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+/*
+ * Report one plane per connector known to wsi, in connector-list order,
+ * writing directly into the caller's VkDisplayPlaneProperties2KHR array.
+ */
+VkResult
+wsi_display_get_physical_device_display_plane_properties2(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPlaneProperties2KHR *properties)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   VK_OUTARRAY_MAKE(out, properties, property_count);
+
+   wsi_for_each_connector(connector, wsi) {
+      vk_outarray_append(&out, plane) {
+         wsi_display_fill_in_display_plane_properties(wsi_device, connector,
+                                                      plane);
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+/*
+ * Implement vkGetDisplayPlaneSupportedDisplaysKHR (VK_KHR_display)
+ *
+ * Plane N is paired with the N-th connector in wsi's connector list, so a
+ * plane supports at most one display: that connector, and only while it is
+ * connected. A plane_index past the end of the list yields no displays.
+ */
+
+VkResult
+wsi_display_get_display_plane_supported_displays(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t plane_index,
+   uint32_t *display_count,
+   VkDisplayKHR *displays)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   VK_OUTARRAY_MAKE(conn, displays, display_count);
+
+   /* same type as plane_index, so the comparison is not signed/unsigned */
+   uint32_t index = 0;
+
+   wsi_for_each_connector(connector, wsi) {
+      if (index == plane_index) {
+         if (connector->connected) {
+            vk_outarray_append(&conn, display) {
+               *display = wsi_display_connector_to_handle(connector);
+            }
+         }
+         /* only one connector can carry this index; stop walking */
+         break;
+      }
+      index++;
+   }
+   return vk_outarray_status(&conn);
+}
+
+/*
+ * Implement vkGetDisplayModePropertiesKHR (VK_KHR_display)
+ */
+
+/*
+ * Describe one display mode to the client: its handle, visible size and
+ * refresh rate.
+ */
+static void
+wsi_display_fill_in_display_mode_properties(
+   struct wsi_device *wsi_device,
+   struct wsi_display_mode *display_mode,
+   VkDisplayModeProperties2KHR *properties)
+{
+   assert(properties->sType == VK_STRUCTURE_TYPE_DISPLAY_MODE_PROPERTIES_2_KHR);
+   VkDisplayModePropertiesKHR *out = &properties->displayModeProperties;
+   VkDisplayModeParametersKHR *params = &out->parameters;
+
+   out->displayMode = wsi_display_mode_to_handle(display_mode);
+
+   params->visibleRegion.width = display_mode->hdisplay;
+   params->visibleRegion.height = display_mode->vdisplay;
+
+   /* refresh value scaled by 1000, rounded to the nearest integer */
+   params->refreshRate =
+      (uint32_t) (wsi_display_mode_refresh(display_mode) * 1000 + 0.5);
+}
+
+/*
+ * List the valid modes of a display using the VkDisplayModePropertiesKHR
+ * layout; modes marked invalid are skipped.
+ */
+VkResult
+wsi_display_get_display_mode_properties(VkPhysicalDevice physical_device,
+                                        struct wsi_device *wsi_device,
+                                        VkDisplayKHR display,
+                                        uint32_t *property_count,
+                                        VkDisplayModePropertiesKHR *properties)
+{
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   VK_OUTARRAY_MAKE(out, properties, property_count);
+
+   wsi_for_each_display_mode(candidate, connector) {
+      if (candidate->valid) {
+         vk_outarray_append(&out, entry) {
+            /* fill a temporary 2KHR wrapper, then copy out the payload */
+            VkDisplayModeProperties2KHR wrapped = {
+               .sType = VK_STRUCTURE_TYPE_DISPLAY_MODE_PROPERTIES_2_KHR,
+            };
+
+            wsi_display_fill_in_display_mode_properties(wsi_device,
+                                                        candidate, &wrapped);
+            *entry = wrapped.displayModeProperties;
+         }
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+/*
+ * List the valid modes of a display, writing directly into the caller's
+ * VkDisplayModeProperties2KHR array; modes marked invalid are skipped.
+ */
+VkResult
+wsi_display_get_display_mode_properties2(VkPhysicalDevice physical_device,
+                                         struct wsi_device *wsi_device,
+                                         VkDisplayKHR display,
+                                         uint32_t *property_count,
+                                         VkDisplayModeProperties2KHR *properties)
+{
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   VK_OUTARRAY_MAKE(out, properties, property_count);
+
+   wsi_for_each_display_mode(candidate, connector) {
+      if (candidate->valid) {
+         vk_outarray_append(&out, entry) {
+            wsi_display_fill_in_display_mode_properties(wsi_device,
+                                                        candidate, entry);
+         }
+      }
+   }
+
+   return vk_outarray_status(&out);
+}
+
+/*
+ * Does an existing mode satisfy the requested Vulkan mode parameters?
+ * Width and height must match exactly; the refresh value (scaled by 1000)
+ * must be within 10 of the requested refreshRate.
+ */
+static bool
+wsi_display_mode_matches_vk(wsi_display_mode *wsi,
+                            const VkDisplayModeParametersKHR *vk)
+{
+   if (vk->visibleRegion.width != wsi->hdisplay)
+      return false;
+
+   if (vk->visibleRegion.height != wsi->vdisplay)
+      return false;
+
+   return fabs(wsi_display_mode_refresh(wsi) * 1000.0 - vk->refreshRate) < 10;
+}
+
+/*
+ * Implement vkCreateDisplayModeKHR (VK_KHR_display)
+ *
+ * No new mode is ever synthesized: the request succeeds only when it
+ * matches a valid mode the connector already has. Non-zero flags and
+ * unmatched requests both yield VK_ERROR_INITIALIZATION_FAILED.
+ */
+VkResult
+wsi_display_create_display_mode(VkPhysicalDevice physical_device,
+                                struct wsi_device *wsi_device,
+                                VkDisplayKHR display,
+                                const VkDisplayModeCreateInfoKHR *create_info,
+                                const VkAllocationCallbacks *allocator,
+                                VkDisplayModeKHR *mode)
+{
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   if (create_info->flags != 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   /* Returning an existing mode that happens to match keeps the
+    * conformance suite happy. Anything more would mean embedding the CVT
+    * function into the driver, which seems excessive.
+    */
+   wsi_for_each_display_mode(candidate, connector) {
+      if (!candidate->valid)
+         continue;
+
+      if (!wsi_display_mode_matches_vk(candidate, &create_info->parameters))
+         continue;
+
+      *mode = wsi_display_mode_to_handle(candidate);
+      return VK_SUCCESS;
+   }
+
+   return VK_ERROR_INITIALIZATION_FAILED;
+}
+
+/*
+ * Implement vkGetDisplayPlaneCapabilities
+ *
+ * The reported capabilities are fixed by the mode alone: opaque alpha only,
+ * source and destination pinned at (0, 0), and every extent exactly the
+ * mode's visible size. plane_index is not consulted.
+ */
+VkResult
+wsi_get_display_plane_capabilities(VkPhysicalDevice physical_device,
+                                   struct wsi_device *wsi_device,
+                                   VkDisplayModeKHR mode_khr,
+                                   uint32_t plane_index,
+                                   VkDisplayPlaneCapabilitiesKHR *capabilities)
+{
+   struct wsi_display_mode *mode = wsi_display_mode_from_handle(mode_khr);
+
+   const VkOffset2D origin = { .x = 0, .y = 0 };
+   const VkExtent2D mode_extent = {
+      .width = mode->hdisplay,
+      .height = mode->vdisplay,
+   };
+
+   /* XXX use actual values */
+   capabilities->supportedAlpha = VK_DISPLAY_PLANE_ALPHA_OPAQUE_BIT_KHR;
+
+   capabilities->minSrcPosition = origin;
+   capabilities->maxSrcPosition = origin;
+   capabilities->minSrcExtent = mode_extent;
+   capabilities->maxSrcExtent = mode_extent;
+
+   capabilities->minDstPosition = origin;
+   capabilities->maxDstPosition = origin;
+   capabilities->minDstExtent = mode_extent;
+   capabilities->maxDstExtent = mode_extent;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * VkDisplayPlaneCapabilities2KHR flavour of the plane capability query:
+ * unpacks the info struct and forwards to
+ * wsi_get_display_plane_capabilities().
+ */
+VkResult
+wsi_get_display_plane_capabilities2(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo,
+   VkDisplayPlaneCapabilities2KHR *capabilities)
+{
+   assert(capabilities->sType ==
+          VK_STRUCTURE_TYPE_DISPLAY_PLANE_CAPABILITIES_2_KHR);
+
+   VkDisplayPlaneCapabilitiesKHR *inner = &capabilities->capabilities;
+   const VkDisplayModeKHR mode = pDisplayPlaneInfo->mode;
+   const uint32_t plane_index = pDisplayPlaneInfo->planeIndex;
+
+   return wsi_get_display_plane_capabilities(physical_device, wsi_device,
+                                             mode, plane_index, inner);
+}
+
+/*
+ * Allocate a display surface and record the creation parameters in it.
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the allocation fails.
+ */
+VkResult
+wsi_create_display_surface(VkInstance instance,
+                           const VkAllocationCallbacks *allocator,
+                           const VkDisplaySurfaceCreateInfoKHR *create_info,
+                           VkSurfaceKHR *surface_khr)
+{
+   VkIcdSurfaceDisplay *surface =
+      vk_zalloc(allocator, sizeof(*surface), 8,
+                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!surface)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   surface->base.platform = VK_ICD_WSI_PLATFORM_DISPLAY;
+
+   /* which mode and plane the surface is bound to */
+   surface->displayMode = create_info->displayMode;
+   surface->planeIndex = create_info->planeIndex;
+   surface->planeStackIndex = create_info->planeStackIndex;
+
+   /* how the image is to be presented on that plane */
+   surface->transform = create_info->transform;
+   surface->globalAlpha = create_info->globalAlpha;
+   surface->alphaMode = create_info->alphaMode;
+   surface->imageExtent = create_info->imageExtent;
+
+   *surface_khr = VkIcdSurfaceBase_to_handle(&surface->base);
+
+   return VK_SUCCESS;
+}
+
+
+/*
+ * Presentation support query for display surfaces: always answers VK_TRUE,
+ * without looking at the queue family or the fd.
+ */
+static VkResult
+wsi_display_surface_get_support(VkIcdSurfaceBase *surface,
+                                struct wsi_device *wsi_device,
+                                const VkAllocationCallbacks *allocator,
+                                uint32_t queueFamilyIndex,
+                                int local_fd,
+                                VkBool32* pSupported)
+{
+   const VkBool32 supported = VK_TRUE;
+
+   *pSupported = supported;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Surface capabilities for a display surface. All three extents are the
+ * visible size of the surface's display mode; only opaque composition and
+ * the identity transform are offered.
+ */
+static VkResult
+wsi_display_surface_get_capabilities(VkIcdSurfaceBase *surface_base,
+                                     VkSurfaceCapabilitiesKHR* caps)
+{
+   VkIcdSurfaceDisplay *surface = (VkIcdSurfaceDisplay *) surface_base;
+   wsi_display_mode *mode = wsi_display_mode_from_handle(surface->displayMode);
+
+   const VkExtent2D mode_extent = {
+      .width = mode->hdisplay,
+      .height = mode->vdisplay,
+   };
+
+   /* XXX Figure out extents based on driver capabilities */
+   caps->currentExtent = mode_extent;
+   caps->minImageExtent = mode_extent;
+   caps->maxImageExtent = mode_extent;
+
+   caps->minImageCount = 2;
+   caps->maxImageCount = 0;
+   caps->maxImageArrayLayers = 1;
+
+   caps->supportedCompositeAlpha = VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR;
+   caps->supportedTransforms = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+   caps->currentTransform = VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR;
+
+   caps->supportedUsageFlags = VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
+                               VK_IMAGE_USAGE_SAMPLED_BIT |
+                               VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+                               VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Surface counters offered by display surfaces: the vblank counter only.
+ */
+static VkResult
+wsi_display_surface_get_surface_counters(
+   VkIcdSurfaceBase *surface_base,
+   VkSurfaceCounterFlagsEXT *counters)
+{
+   const VkSurfaceCounterFlagsEXT supported = VK_SURFACE_COUNTER_VBLANK_EXT;
+
+   *counters = supported;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * VkSurfaceCapabilities2KHR flavour of the capability query. Fills the
+ * embedded capabilities and, when the caller chained a
+ * wsi_surface_supported_counters struct, the supported counters as well.
+ */
+static VkResult
+wsi_display_surface_get_capabilities2(VkIcdSurfaceBase *icd_surface,
+                                      const void *info_next,
+                                      VkSurfaceCapabilities2KHR *caps)
+{
+   assert(caps->sType == VK_STRUCTURE_TYPE_SURFACE_CAPABILITIES_2_KHR);
+
+   const VkResult base_result =
+      wsi_display_surface_get_capabilities(icd_surface,
+                                           &caps->surfaceCapabilities);
+   if (base_result != VK_SUCCESS)
+      return base_result;
+
+   struct wsi_surface_supported_counters *counters =
+      vk_find_struct( caps->pNext, WSI_SURFACE_SUPPORTED_COUNTERS_MESA);
+
+   /* nothing chained: the plain capabilities are the whole answer */
+   if (!counters)
+      return base_result;
+
+   return wsi_display_surface_get_surface_counters(
+      icd_surface,
+      &counters->supported_surface_counters);
+}
+
+/*
+ * Surface formats offered for display surfaces, each paired with the DRM
+ * format code used when a framebuffer is created for an image of that
+ * format. Both entries use DRM_FORMAT_XRGB8888.
+ */
+static const struct {
+   VkFormat     format;
+   uint32_t     drm_format;
+} available_surface_formats[] = {
+   { .format = VK_FORMAT_B8G8R8A8_SRGB, .drm_format = DRM_FORMAT_XRGB8888 },
+   { .format = VK_FORMAT_B8G8R8A8_UNORM, .drm_format = DRM_FORMAT_XRGB8888 },
+};
+
+/*
+ * List every entry of available_surface_formats, each with the sRGB
+ * non-linear colour space.
+ */
+static VkResult
+wsi_display_surface_get_formats(VkIcdSurfaceBase *icd_surface,
+                                struct wsi_device *wsi_device,
+                                uint32_t *surface_format_count,
+                                VkSurfaceFormatKHR *surface_formats)
+{
+   VK_OUTARRAY_MAKE(formats, surface_formats, surface_format_count);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(available_surface_formats); i++) {
+      const VkFormat vk_format = available_surface_formats[i].format;
+
+      vk_outarray_append(&formats, entry) {
+         entry->colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+         entry->format = vk_format;
+      }
+   }
+
+   return vk_outarray_status(&formats);
+}
+
+/*
+ * VkSurfaceFormat2KHR flavour of the format query: lists every entry of
+ * available_surface_formats, each with the sRGB non-linear colour space.
+ */
+static VkResult
+wsi_display_surface_get_formats2(VkIcdSurfaceBase *surface,
+                                 struct wsi_device *wsi_device,
+                                 const void *info_next,
+                                 uint32_t *surface_format_count,
+                                 VkSurfaceFormat2KHR *surface_formats)
+{
+   VK_OUTARRAY_MAKE(formats, surface_formats, surface_format_count);
+
+   for (unsigned i = 0; i < ARRAY_SIZE(available_surface_formats); i++) {
+      const VkFormat vk_format = available_surface_formats[i].format;
+
+      vk_outarray_append(&formats, entry) {
+         assert(entry->sType == VK_STRUCTURE_TYPE_SURFACE_FORMAT_2_KHR);
+         entry->surfaceFormat.colorSpace = VK_COLORSPACE_SRGB_NONLINEAR_KHR;
+         entry->surfaceFormat.format = vk_format;
+      }
+   }
+
+   return vk_outarray_status(&formats);
+}
+
+/*
+ * Present modes offered by display surfaces: FIFO only.
+ */
+static VkResult
+wsi_display_surface_get_present_modes(VkIcdSurfaceBase *surface,
+                                      uint32_t *present_mode_count,
+                                      VkPresentModeKHR *present_modes)
+{
+   VK_OUTARRAY_MAKE(modes, present_modes, present_mode_count);
+
+   vk_outarray_append(&modes, entry) {
+      *entry = VK_PRESENT_MODE_FIFO_KHR;
+   }
+
+   return vk_outarray_status(&modes);
+}
+
+/*
+ * Release a buffer handle held on wsi->fd.
+ *
+ * The handles passed here are produced by drmPrimeFDToHandle() in
+ * wsi_display_image_init(), i.e. they are imported GEM handles, not dumb
+ * buffers, so they are released with DRM_IOCTL_GEM_CLOSE rather than the
+ * dumb-buffer destroy ioctl. The ioctl result is ignored, as before.
+ */
+static void
+wsi_display_destroy_buffer(struct wsi_display *wsi,
+                           uint32_t buffer)
+{
+   struct drm_gem_close close_args = { .handle = buffer };
+
+   (void) drmIoctl(wsi->fd, DRM_IOCTL_GEM_CLOSE, &close_args);
+}
+
+/*
+ * Set up one swapchain image for scanout: create the native image, turn
+ * each plane's prime fd into a buffer handle on wsi->fd, and register a DRM
+ * framebuffer for it.
+ *
+ * Returns VK_SUCCESS with image->fb_id set; VK_ERROR_DEVICE_LOST when
+ * create_info->imageFormat is not in available_surface_formats; the error
+ * from wsi_create_native_image() if that fails; and
+ * VK_ERROR_OUT_OF_HOST_MEMORY when a handle import or the framebuffer
+ * creation fails (after releasing whatever was acquired up to that point).
+ */
+static VkResult
+wsi_display_image_init(VkDevice device_h,
+                       struct wsi_swapchain *drv_chain,
+                       const VkSwapchainCreateInfoKHR *create_info,
+                       const VkAllocationCallbacks *allocator,
+                       struct wsi_display_image *image)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+   struct wsi_display *wsi = chain->wsi;
+   uint32_t drm_format = 0;
+
+   /* translate the Vulkan format into the DRM format used for the fb */
+   for (unsigned i = 0; i < ARRAY_SIZE(available_surface_formats); i++) {
+      if (create_info->imageFormat == available_surface_formats[i].format) {
+         drm_format = available_surface_formats[i].drm_format;
+         break;
+      }
+   }
+
+   /* the application provided an invalid format, bail */
+   if (drm_format == 0)
+      return VK_ERROR_DEVICE_LOST;
+
+   VkResult result = wsi_create_native_image(&chain->base, create_info,
+                                             0, NULL, NULL,
+                                             &image->base);
+   if (result != VK_SUCCESS)
+      return result;
+
+   /* a zero handle marks "nothing to destroy" for the cleanup path below */
+   memset(image->buffer, 0, sizeof (image->buffer));
+
+   for (unsigned int i = 0; i < image->base.num_planes; i++) {
+      int ret = drmPrimeFDToHandle(wsi->fd, image->base.fds[i],
+                                   &image->buffer[i]);
+
+      /* the fd is closed whether or not the import worked; -1 records
+       * that so the cleanup path does not close it a second time
+       */
+      close(image->base.fds[i]);
+      image->base.fds[i] = -1;
+      if (ret < 0)
+         goto fail_handle;
+   }
+
+   image->chain = chain;
+   image->state = WSI_IMAGE_IDLE;
+   image->fb_id = 0;
+
+   int ret = drmModeAddFB2(wsi->fd,
+                           create_info->imageExtent.width,
+                           create_info->imageExtent.height,
+                           drm_format,
+                           image->buffer,
+                           image->base.row_pitches,
+                           image->base.offsets,
+                           &image->fb_id, 0);
+
+   if (ret)
+      goto fail_fb;
+
+   return VK_SUCCESS;
+
+   /* Both failure labels share one cleanup: destroy every handle that was
+    * imported, close every plane fd that is still open, then destroy the
+    * native image.
+    */
+fail_fb:
+fail_handle:
+   for (unsigned int i = 0; i < image->base.num_planes; i++) {
+      if (image->buffer[i])
+         wsi_display_destroy_buffer(wsi, image->buffer[i]);
+      if (image->base.fds[i] != -1) {
+         close(image->base.fds[i]);
+         image->base.fds[i] = -1;
+      }
+   }
+
+   wsi_destroy_image(&chain->base, &image->base);
+
+   return VK_ERROR_OUT_OF_HOST_MEMORY;
+}
+
+/*
+ * Tear down one swapchain image: remove its DRM framebuffer, destroy the
+ * buffer handle of every plane, then destroy the native image.
+ */
+static void
+wsi_display_image_finish(struct wsi_swapchain *drv_chain,
+                         const VkAllocationCallbacks *allocator,
+                         struct wsi_display_image *image)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+   struct wsi_display *wsi = chain->wsi;
+
+   drmModeRmFB(wsi->fd, image->fb_id);
+
+   unsigned int plane = 0;
+   while (plane < image->base.num_planes) {
+      wsi_display_destroy_buffer(wsi, image->buffer[plane]);
+      plane++;
+   }
+
+   wsi_destroy_image(&chain->base, &image->base);
+}
+
+/*
+ * Destroy a display swapchain: finish each of its images, then free the
+ * swapchain itself with the given allocator. Always returns VK_SUCCESS.
+ */
+static VkResult
+wsi_display_swapchain_destroy(struct wsi_swapchain *drv_chain,
+                              const VkAllocationCallbacks *allocator)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+
+   for (uint32_t index = 0; index < chain->base.image_count; index++) {
+      struct wsi_display_image *image = &chain->images[index];
+
+      wsi_display_image_finish(drv_chain, allocator, image);
+   }
+
+   vk_free(allocator, chain);
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Return the generic wsi_image embedded in the display image at
+ * image_index. The index is not range-checked.
+ */
+static struct wsi_image *
+wsi_display_get_wsi_image(struct wsi_swapchain *drv_chain,
+                          uint32_t image_index)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+   struct wsi_display_image *image = &chain->images[image_index];
+
+   return &image->base;
+}
+
+/*
+ * Mark every image of active_image's swapchain that is still in the
+ * DISPLAYING state, other than active_image itself, as IDLE.
+ */
+static void
+wsi_display_idle_old_displaying(struct wsi_display_image *active_image)
+{
+   struct wsi_display_swapchain *chain = active_image->chain;
+
+   /* a pointer difference is a ptrdiff_t, which is printed with %td */
+   wsi_display_debug("idle everyone but %td\n",
+                     active_image - &(chain->images[0]));
+   for (uint32_t i = 0; i < chain->base.image_count; i++) {
+      if (chain->images[i].state == WSI_IMAGE_DISPLAYING &&
+          &chain->images[i] != active_image)
+      {
+         /* i is unsigned, hence %u */
+         wsi_display_debug("idle %u\n", i);
+         chain->images[i].state = WSI_IMAGE_IDLE;
+      }
+   }
+}
+
+static VkResult
+_wsi_display_queue_next(struct wsi_swapchain *drv_chain);
+
+/*
+ * Page-flip completion callback. data is the wsi_display_image that was
+ * flipped to: it becomes DISPLAYING, any other image still marked
+ * DISPLAYING goes back to IDLE, and the next queued image (if any) is
+ * submitted. A failure to queue is recorded in chain->status.
+ */
+static void
+wsi_display_page_flip_handler2(int fd,
+                               unsigned int frame,
+                               unsigned int sec,
+                               unsigned int usec,
+                               uint32_t crtc_id,
+                               void *data)
+{
+   struct wsi_display_image *image = data;
+   struct wsi_display_swapchain *chain = image->chain;
+
+   /* %td for the ptrdiff_t image index, %u for the unsigned frame count */
+   wsi_display_debug("image %td displayed at %u\n",
+                     image - &(image->chain->images[0]), frame);
+   image->state = WSI_IMAGE_DISPLAYING;
+   wsi_display_idle_old_displaying(image);
+   VkResult result = _wsi_display_queue_next(&(chain->base));
+   if (result != VK_SUCCESS)
+      chain->status = result;
+}
+
+static void wsi_display_fence_event_handler(struct wsi_display_fence *fence);
+
+/*
+ * Page-flip callback without a crtc id: forwards to
+ * wsi_display_page_flip_handler2() with a crtc id of 0.
+ */
+static void wsi_display_page_flip_handler(int fd,
+                                          unsigned int frame,
+                                          unsigned int sec,
+                                          unsigned int usec,
+                                          void *data)
+{
+   const uint32_t no_crtc = 0;
+
+   wsi_display_page_flip_handler2(fd, frame, sec, usec, no_crtc, data);
+}
+
+/*
+ * Vblank callback: data is the wsi_display_fence waiting on this event,
+ * which is handed to the fence event handler.
+ */
+static void wsi_display_vblank_handler(int fd, unsigned int frame,
+                                       unsigned int sec, unsigned int usec,
+                                       void *data)
+{
+   wsi_display_fence_event_handler((struct wsi_display_fence *) data);
+}
+
+/*
+ * Sequence callback: user_data carries a wsi_display_fence pointer as a
+ * 64-bit integer; it is converted back and handed to the fence event
+ * handler.
+ */
+static void wsi_display_sequence_handler(int fd, uint64_t frame,
+                                         uint64_t nsec, uint64_t user_data)
+{
+   const uintptr_t fence_bits = (uintptr_t) user_data;
+
+   wsi_display_fence_event_handler((struct wsi_display_fence *) fence_bits);
+}
+
+/*
+ * Callback table handed to drmHandleEvent() by the wait thread.
+ * page_flip_handler2 is only filled in when the libdrm headers define
+ * DRM_EVENT_CONTEXT_VERSION >= 3.
+ *
+ * NOTE(review): sequence_handler is set without a version guard; presumably
+ * the build already requires libdrm headers that declare it -- confirm.
+ */
+static drmEventContext event_context = {
+   .version = DRM_EVENT_CONTEXT_VERSION,
+   .page_flip_handler = wsi_display_page_flip_handler,
+#if DRM_EVENT_CONTEXT_VERSION >= 3
+   .page_flip_handler2 = wsi_display_page_flip_handler2,
+#endif
+   .vblank_handler = wsi_display_vblank_handler,
+   .sequence_handler = wsi_display_sequence_handler,
+};
+
+/*
+ * Body of the event thread. data is the wsi_display.
+ *
+ * Blocks in poll() on wsi->fd; whenever poll reports activity, DRM events
+ * are dispatched through event_context with wait_mutex held, and wait_cond
+ * is then broadcast so waiters can re-check their state. The loop never
+ * exits on its own; the thread makes itself asynchronously cancellable.
+ */
+static void *
+wsi_display_wait_thread(void *data)
+{
+   struct wsi_display *wsi = data;
+   struct pollfd pollfd = {
+      .fd = wsi->fd,
+      .events = POLLIN
+   };
+
+   pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+   for (;;) {
+      /* wait indefinitely; results <= 0 are ignored and poll() is retried */
+      int ret = poll(&pollfd, 1, -1);
+      if (ret > 0) {
+         /* the handlers in event_context run under wait_mutex */
+         pthread_mutex_lock(&wsi->wait_mutex);
+         (void) drmHandleEvent(wsi->fd, &event_context);
+         pthread_mutex_unlock(&wsi->wait_mutex);
+         pthread_cond_broadcast(&wsi->wait_cond);
+      }
+   }
+   return NULL;
+}
+
+/*
+ * Start the event thread if wsi->wait_thread is not set yet. Returns 0 on
+ * success (or when already started), otherwise the pthread_create() error.
+ */
+static int
+wsi_display_start_wait_thread(struct wsi_display *wsi)
+{
+   /* nothing to do once the thread handle has been filled in */
+   if (wsi->wait_thread)
+      return 0;
+
+   return pthread_create(&wsi->wait_thread, NULL,
+                         wsi_display_wait_thread, wsi);
+}
+
+/*
+ * Wait for at least one event from the kernel to be processed.
+ * Call with wait_mutex held
+ *
+ * timeout_ns is split into seconds and nanoseconds and used directly as the
+ * absolute deadline for pthread_cond_timedwait(). Returns 0 when woken,
+ * the pthread_cond_timedwait() error (e.g. ETIMEDOUT) otherwise, or the
+ * error from starting the wait thread.
+ */
+static int
+wsi_display_wait_for_event(struct wsi_display *wsi,
+                           uint64_t timeout_ns)
+{
+   /* the event thread is started lazily, on the first wait */
+   int status = wsi_display_start_wait_thread(wsi);
+   if (status != 0)
+      return status;
+
+   struct timespec deadline = {
+      .tv_sec = timeout_ns / 1000000000ULL,
+      .tv_nsec = timeout_ns % 1000000000ULL,
+   };
+
+   status = pthread_cond_timedwait(&wsi->wait_cond, &wsi->wait_mutex,
+                                   &deadline);
+
+   wsi_display_debug("%9ld done waiting for event %d\n", pthread_self(),
+                     status);
+
+   return status;
+}
+
+/*
+ * Hand out the first idle image of the swapchain, marking it
+ * WSI_IMAGE_DRAWING and storing its index in *image_index; waits for DRM
+ * events while no image is idle.
+ *
+ * Returns chain->status if the chain is already in error on entry, and also
+ * after a successful acquire (so an error recorded while waiting is
+ * reported); VK_TIMEOUT if the wait timed out with no idle image; and
+ * VK_ERROR_SURFACE_LOST_KHR for any other wait failure.
+ */
+static VkResult
+wsi_display_acquire_next_image(struct wsi_swapchain *drv_chain,
+                               const VkAcquireNextImageInfoKHR *info,
+                               uint32_t *image_index)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *)drv_chain;
+   struct wsi_display *wsi = chain->wsi;
+   int ret = 0;
+   VkResult result = VK_SUCCESS;
+
+   /* Bail early if the swapchain is broken */
+   if (chain->status != VK_SUCCESS)
+      return chain->status;
+
+   /* a finite, non-zero timeout is relative and gets converted to an
+    * absolute time; 0 and UINT64_MAX are passed through unchanged
+    */
+   uint64_t timeout = info->timeout;
+   if (timeout != 0 && timeout != UINT64_MAX)
+      timeout = wsi_rel_to_abs_time(timeout);
+
+   /* image states are only examined and changed with wait_mutex held */
+   pthread_mutex_lock(&wsi->wait_mutex);
+   for (;;) {
+      for (uint32_t i = 0; i < chain->base.image_count; i++) {
+         if (chain->images[i].state == WSI_IMAGE_IDLE) {
+            *image_index = i;
+            wsi_display_debug("image %d available\n", i);
+            chain->images[i].state = WSI_IMAGE_DRAWING;
+            result = VK_SUCCESS;
+            goto done;
+         }
+         wsi_display_debug("image %d state %d\n", i, chain->images[i].state);
+      }
+
+      /* ret is from the previous iteration's wait: after a timed-out wait
+       * the images were scanned once more above before giving up
+       */
+      if (ret == ETIMEDOUT) {
+         result = VK_TIMEOUT;
+         goto done;
+      }
+
+      ret = wsi_display_wait_for_event(wsi, timeout);
+
+      if (ret && ret != ETIMEDOUT) {
+         result = VK_ERROR_SURFACE_LOST_KHR;
+         goto done;
+      }
+   }
+done:
+   pthread_mutex_unlock(&wsi->wait_mutex);
+
+   if (result != VK_SUCCESS)
+      return result;
+
+   return chain->status;
+}
+
+/*
+ * Check whether there are any other connectors driven by this crtc
+ *
+ * Returns false as soon as another connector is found using the same
+ * encoder as `connector`, or another encoder is found attached to crtc_id;
+ * true otherwise. Connectors and encoders that cannot be queried are
+ * skipped.
+ */
+static bool
+wsi_display_crtc_solo(struct wsi_display *wsi,
+                      drmModeResPtr mode_res,
+                      drmModeConnectorPtr connector,
+                      uint32_t crtc_id)
+{
+   /* Pass 1: does any other connector share our encoder? */
+   for (int c = 0; c < mode_res->count_connectors; c++) {
+      if (mode_res->connectors[c] == connector->connector_id)
+         continue;
+
+      drmModeConnectorPtr other =
+         drmModeGetConnector(wsi->fd, mode_res->connectors[c]);
+      if (!other)
+         continue;
+
+      const bool shares_encoder =
+         (other->encoder_id == connector->encoder_id);
+      drmModeFreeConnector(other);
+
+      if (shares_encoder)
+         return false;
+   }
+
+   /* Pass 2: does any other encoder feed from the same crtc? */
+   for (int e = 0; e < mode_res->count_encoders; e++) {
+      if (mode_res->encoders[e] == connector->encoder_id)
+         continue;
+
+      drmModeEncoderPtr other =
+         drmModeGetEncoder(wsi->fd, mode_res->encoders[e]);
+      if (!other)
+         continue;
+
+      const bool shares_crtc = (other->crtc_id == crtc_id);
+      drmModeFreeEncoder(other);
+
+      if (shares_crtc)
+         return false;
+   }
+
+   return true;
+}
+
+/*
+ * Pick a suitable CRTC to drive this connector. Prefer a CRTC which is
+ * currently driving this connector and not any others. Settle for a CRTC
+ * which is currently idle.
+ *
+ * Returns the chosen crtc id, or 0 when neither kind of CRTC is found.
+ */
+static uint32_t
+wsi_display_select_crtc(struct wsi_display_connector *connector,
+                        drmModeResPtr mode_res,
+                        drmModeConnectorPtr drm_connector)
+{
+   struct wsi_display *wsi = connector->wsi;
+
+   /* First choice: the CRTC already behind this connector's encoder,
+    * provided nothing else is using it.
+    */
+   if (drm_connector->encoder_id) {
+      drmModeEncoderPtr encoder =
+         drmModeGetEncoder(wsi->fd, drm_connector->encoder_id);
+
+      if (encoder) {
+         const uint32_t current_crtc = encoder->crtc_id;
+         drmModeFreeEncoder(encoder);
+
+         if (current_crtc &&
+             wsi_display_crtc_solo(wsi, mode_res, drm_connector,
+                                   current_crtc))
+            return current_crtc;
+      }
+   }
+
+   /* Second choice: the first CRTC with no buffer attached */
+   for (int c = 0; c < mode_res->count_crtcs; c++) {
+      drmModeCrtcPtr crtc = drmModeGetCrtc(wsi->fd, mode_res->crtcs[c]);
+      uint32_t idle_crtc = 0;
+
+      if (crtc && crtc->buffer_id == 0)
+         idle_crtc = crtc->crtc_id;
+      drmModeFreeCrtc(crtc);
+
+      if (idle_crtc != 0)
+         return idle_crtc;
+   }
+
+   return 0;
+}
+
+/*
+ * Make sure the connector has a CRTC and that its cached DRM mode matches
+ * display_mode.
+ *
+ * Returns VK_SUCCESS immediately when display_mode is already current and a
+ * CRTC is assigned. Otherwise the DRM resources and connector are queried,
+ * a CRTC is selected if none is set, and the DRM mode corresponding to
+ * display_mode is copied into connector->current_drm_mode.
+ *
+ * Failures map to VK_ERROR_OUT_OF_HOST_MEMORY when a DRM query fails with
+ * errno == ENOMEM, and to VK_ERROR_SURFACE_LOST_KHR for every other failure
+ * (query error, no usable CRTC, no matching mode).
+ */
+static VkResult
+wsi_display_setup_connector(wsi_display_connector *connector,
+                            wsi_display_mode *display_mode)
+{
+   struct wsi_display *wsi = connector->wsi;
+   VkResult result = VK_SUCCESS;
+
+   /* Already configured for this mode: nothing to do */
+   if (connector->crtc_id && connector->current_mode == display_mode)
+      return VK_SUCCESS;
+
+   drmModeResPtr mode_res = drmModeGetResources(wsi->fd);
+   if (mode_res == NULL) {
+      result = (errno == ENOMEM) ? VK_ERROR_OUT_OF_HOST_MEMORY
+                                 : VK_ERROR_SURFACE_LOST_KHR;
+      goto bail;
+   }
+
+   drmModeConnectorPtr drm_connector =
+      drmModeGetConnectorCurrent(wsi->fd, connector->id);
+   if (drm_connector == NULL) {
+      result = (errno == ENOMEM) ? VK_ERROR_OUT_OF_HOST_MEMORY
+                                 : VK_ERROR_SURFACE_LOST_KHR;
+      goto bail_mode_res;
+   }
+
+   /* Assign a CRTC the first time through */
+   if (connector->crtc_id == 0) {
+      connector->crtc_id =
+         wsi_display_select_crtc(connector, mode_res, drm_connector);
+      if (connector->crtc_id == 0) {
+         result = VK_ERROR_SURFACE_LOST_KHR;
+         goto bail_connector;
+      }
+   }
+
+   if (connector->current_mode != display_mode) {
+      /* Locate the kernel mode that corresponds to the VkDisplayMode */
+      drmModeModeInfoPtr drm_mode = NULL;
+
+      for (int m = 0; !drm_mode && m < drm_connector->count_modes; m++) {
+         drmModeModeInfoPtr candidate = &drm_connector->modes[m];
+
+         if (wsi_display_mode_matches_drm(display_mode, candidate))
+            drm_mode = candidate;
+      }
+
+      if (drm_mode == NULL) {
+         result = VK_ERROR_SURFACE_LOST_KHR;
+         goto bail_connector;
+      }
+
+      /* Copy the mode by value; drm_connector is freed below */
+      connector->current_drm_mode = *drm_mode;
+      connector->current_mode = display_mode;
+   }
+
+bail_connector:
+   drmModeFreeConnector(drm_connector);
+bail_mode_res:
+   drmModeFreeResources(mode_res);
+bail:
+   return result;
+}
+
+/*
+ * Wait callback installed on display fences.
+ *
+ * Holds wsi->wait_mutex while repeatedly calling
+ * wsi_display_wait_for_event() with the given timeout until one of:
+ *   - fence->event_received is set            -> VK_SUCCESS
+ *   - the previous wait reported ETIMEDOUT    -> VK_TIMEOUT
+ *   - the wait failed with any other error    -> VK_ERROR_DEVICE_LOST
+ *
+ * The fence is tested before the timeout status, so an event that shows up
+ * during a wait that also timed out is still reported as success.
+ */
+static VkResult
+wsi_display_fence_wait(struct wsi_fence *fence_wsi, uint64_t timeout)
+{
+   struct wsi_display_fence *fence = (struct wsi_display_fence *) fence_wsi;
+   struct wsi_display *wsi = (struct wsi_display *)
+      fence_wsi->wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   VkResult result;
+   int wait_ret = 0;
+
+   wsi_display_debug("%9lu wait fence %lu %ld\n",
+                     pthread_self(), fence->sequence,
+                     (int64_t) (timeout - wsi_get_current_monotonic()));
+   wsi_display_debug_code(uint64_t start_ns = wsi_get_current_monotonic());
+
+   pthread_mutex_lock(&wsi->wait_mutex);
+   while (true) {
+      if (fence->event_received) {
+         wsi_display_debug("%9lu fence %lu passed\n",
+                           pthread_self(), fence->sequence);
+         result = VK_SUCCESS;
+         break;
+      }
+
+      /* Only give up after re-checking the fence above */
+      if (wait_ret == ETIMEDOUT) {
+         wsi_display_debug("%9lu fence %lu timeout\n",
+                           pthread_self(), fence->sequence);
+         result = VK_TIMEOUT;
+         break;
+      }
+
+      wait_ret = wsi_display_wait_for_event(wsi, timeout);
+
+      if (wait_ret != 0 && wait_ret != ETIMEDOUT) {
+         wsi_display_debug("%9lu fence %lu error\n",
+                           pthread_self(), fence->sequence);
+         result = VK_ERROR_DEVICE_LOST;
+         break;
+      }
+   }
+   pthread_mutex_unlock(&wsi->wait_mutex);
+
+   wsi_display_debug("%9lu fence wait %f ms\n",
+                     pthread_self(),
+                     ((int64_t) (wsi_get_current_monotonic() - start_ns)) /
+                     1.0e6);
+   return result;
+}
+
+/*
+ * Free the fence once both of these hold: its event has been received and
+ * it has been marked destroyed. Otherwise leave it alone.
+ */
+static void
+wsi_display_fence_check_free(struct wsi_display_fence *fence)
+{
+   if (!fence->event_received || !fence->destroyed)
+      return;
+
+   vk_free(fence->base.alloc, fence);
+}
+
+/*
+ * Mark the fence's event as received, then free the fence if it had
+ * already been destroyed.
+ */
+static void
+wsi_display_fence_event_handler(struct wsi_display_fence *fence)
+{
+   fence->event_received = true;
+
+   wsi_display_fence_check_free(fence);
+}
+
+/*
+ * Destroy callback installed on display fences. Marks the fence destroyed
+ * and frees it right away if its event was already received; otherwise the
+ * memory stays until wsi_display_fence_check_free() is called again.
+ */
+static void
+wsi_display_fence_destroy(struct wsi_fence *fence_wsi)
+{
+   struct wsi_display_fence *display_fence =
+      (struct wsi_display_fence *) fence_wsi;
+
+   /* A fence must not be destroyed twice */
+   assert(!display_fence->destroyed);
+
+   display_fence->destroyed = true;
+   wsi_display_fence_check_free(display_fence);
+}
+
+/*
+ * Allocate and initialise a display fence.
+ *
+ * Memory comes from vk_zalloc2() using the display WSI allocator and the
+ * caller's allocator; fence->base.alloc records the caller's allocator when
+ * one was given and wsi->alloc otherwise. Each fence takes the next value
+ * of the file-level fence_sequence counter.
+ *
+ * Returns NULL when the allocation fails.
+ */
+static struct wsi_display_fence *
+wsi_display_fence_alloc(VkDevice device,
+                        const struct wsi_device *wsi_device,
+                        VkDisplayKHR display,
+                        const VkAllocationCallbacks *allocator)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_fence *new_fence;
+
+   new_fence = vk_zalloc2(wsi->alloc, allocator, sizeof (*new_fence),
+                          8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (new_fence == NULL)
+      return NULL;
+
+   /* Generic wsi_fence part */
+   new_fence->base.device = device;
+   new_fence->base.display = display;
+   new_fence->base.wsi_device = wsi_device;
+   new_fence->base.alloc = allocator ? allocator : wsi->alloc;
+   new_fence->base.wait = wsi_display_fence_wait;
+   new_fence->base.destroy = wsi_display_fence_destroy;
+
+   /* Display-specific state */
+   new_fence->event_received = false;
+   new_fence->destroyed = false;
+   new_fence->sequence = ++fence_sequence;
+
+   return new_fence;
+}
+
+/*
+ * Queue a CRTC sequence event on the CRTC of the given display, using the
+ * fence pointer as the event's user data.
+ *
+ * flags, frame_requested and frame_queued are forwarded unchanged to
+ * drmCrtcQueueSequence().
+ *
+ * Returns:
+ *   VK_SUCCESS                      the event was queued
+ *   VK_ERROR_INITIALIZATION_FAILED  no DRM fd is open (wsi->fd < 0)
+ *   VK_ERROR_OUT_OF_HOST_MEMORY     the kernel rejected the request, or
+ *                                   waiting for queue space failed
+ *
+ * When the kernel reports ENOMEM the call waits up to 100ms for pending
+ * events to be processed and then retries.
+ */
+static VkResult
+wsi_register_vblank_event(struct wsi_display_fence *fence,
+                          const struct wsi_device *wsi_device,
+                          VkDisplayKHR display,
+                          uint32_t flags,
+                          uint64_t frame_requested,
+                          uint64_t *frame_queued)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   if (wsi->fd < 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   for (;;) {
+      /* Convert the pointer through uintptr_t: casting a pointer directly
+       * to uint64_t is a size-mismatched cast on 32-bit targets. The
+       * widening to the uint64_t user_data parameter is then implicit.
+       */
+      int ret = drmCrtcQueueSequence(wsi->fd, connector->crtc_id,
+                                     flags,
+                                     frame_requested,
+                                     frame_queued,
+                                     (uintptr_t) fence);
+
+      if (!ret)
+         return VK_SUCCESS;
+
+      if (errno != ENOMEM) {
+
+         /* Something unexpected happened. Pause for a moment before
+          * returning the failure so that an application which simply
+          * retries does not spin.
+          */
+
+         wsi_display_debug("queue vblank event %lu failed\n", fence->sequence);
+         struct timespec delay = {
+            .tv_sec = 0,
+            .tv_nsec = 100000000ull,
+         };
+         nanosleep(&delay, NULL);
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+
+      /* The kernel event queue is full. Wait for some events to be
+       * processed and try again
+       */
+
+      pthread_mutex_lock(&wsi->wait_mutex);
+      ret = wsi_display_wait_for_event(wsi, wsi_rel_to_abs_time(100000000ull));
+      pthread_mutex_unlock(&wsi->wait_mutex);
+
+      if (ret) {
+         wsi_display_debug("vblank queue full, event wait failed\n");
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+   }
+}
+
+/*
+ * If no flip is outstanding and an image is queued, hand the oldest queued
+ * image to the kernel.
+ *
+ * A page flip is used while the connector is active. When the connector is
+ * inactive, or the flip fails with -EINVAL, the connector is set up again
+ * and the image is shown through a full mode set instead. -EACCES makes
+ * the function sleep one second and start over; any other error idles the
+ * image and returns VK_ERROR_SURFACE_LOST_KHR.
+ *
+ * Returns VK_SUCCESS when nothing had to be done or the image was handed
+ * over, VK_ERROR_SURFACE_LOST_KHR when there is no DRM fd, or the result of
+ * wsi_display_setup_connector() when that fails.
+ */
+static VkResult
+_wsi_display_queue_next(struct wsi_swapchain *drv_chain)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+   struct wsi_display *wsi = chain->wsi;
+   wsi_display_mode *display_mode =
+      wsi_display_mode_from_handle(chain->surface->displayMode);
+   wsi_display_connector *connector = display_mode->connector;
+
+   if (wsi->fd < 0)
+      return VK_ERROR_SURFACE_LOST_KHR;
+
+   /* A different mode is wanted, so force the mode-set path below */
+   if (connector->current_mode != display_mode)
+      connector->active = false;
+
+   for (;;) {
+      struct wsi_display_image *next = NULL;
+
+      /* Scan the swapchain: stop if a flip is in flight, otherwise pick
+       * the queued image with the lowest flip sequence */
+      for (uint32_t i = 0; i < chain->base.image_count; i++) {
+         struct wsi_display_image *candidate = &chain->images[i];
+
+         /* already flipping, don't send another to the kernel yet */
+         if (candidate->state == WSI_IMAGE_FLIPPING)
+            return VK_SUCCESS;
+
+         if (candidate->state != WSI_IMAGE_QUEUED)
+            continue;
+
+         if (next == NULL || candidate->flip_sequence < next->flip_sequence)
+            next = candidate;
+      }
+
+      if (next == NULL)
+         return VK_SUCCESS;
+
+      /* Treat an inactive connector like a flip that failed with -EINVAL */
+      int ret = -EINVAL;
+
+      if (connector->active) {
+         ret = drmModePageFlip(wsi->fd, connector->crtc_id, next->fb_id,
+                               DRM_MODE_PAGE_FLIP_EVENT, next);
+         if (ret == 0) {
+            next->state = WSI_IMAGE_FLIPPING;
+            return VK_SUCCESS;
+         }
+         wsi_display_debug("page flip err %d %s\n", ret, strerror(-ret));
+      }
+
+      if (ret == -EINVAL) {
+         VkResult setup = wsi_display_setup_connector(connector, display_mode);
+
+         if (setup != VK_SUCCESS) {
+            next->state = WSI_IMAGE_IDLE;
+            return setup;
+         }
+
+         /* XXX allow setting of position */
+         ret = drmModeSetCrtc(wsi->fd, connector->crtc_id,
+                              next->fb_id, 0, 0,
+                              &connector->id, 1,
+                              &connector->current_drm_mode);
+         if (ret == 0) {
+            /* The mode set is assumed to be synchronous, so whatever was
+             * displayed before is treated as idle from here on.
+             */
+            next->state = WSI_IMAGE_DISPLAYING;
+            wsi_display_idle_old_displaying(next);
+            connector->active = true;
+            return VK_SUCCESS;
+         }
+      }
+
+      if (ret != -EACCES) {
+         connector->active = false;
+         next->state = WSI_IMAGE_IDLE;
+         return VK_ERROR_SURFACE_LOST_KHR;
+      }
+
+      /* Some other VT is currently active. Poll once a second until our
+       * VT becomes active again.
+       */
+      usleep(1000 * 1000);
+      connector->active = false;
+   }
+}
+
+/*
+ * queue_present callback of the display swapchain.
+ *
+ * Marks the image at image_index as queued, stamps it with the next flip
+ * sequence and tries to hand the next image to the kernel, all under
+ * wsi->wait_mutex. A failure is stored in chain->status, which makes every
+ * later call return that status immediately. The damage argument is not
+ * used.
+ */
+static VkResult
+wsi_display_queue_present(struct wsi_swapchain *drv_chain,
+                          uint32_t image_index,
+                          const VkPresentRegionKHR *damage)
+{
+   struct wsi_display_swapchain *chain =
+      (struct wsi_display_swapchain *) drv_chain;
+   struct wsi_display *wsi = chain->wsi;
+   struct wsi_display_image *presented = &chain->images[image_index];
+
+   /* Bail early if the swapchain is broken */
+   if (chain->status != VK_SUCCESS)
+      return chain->status;
+
+   assert(presented->state == WSI_IMAGE_DRAWING);
+   wsi_display_debug("present %d\n", image_index);
+
+   pthread_mutex_lock(&wsi->wait_mutex);
+
+   presented->flip_sequence = ++chain->flip_sequence;
+   presented->state = WSI_IMAGE_QUEUED;
+
+   const VkResult queue_result = _wsi_display_queue_next(drv_chain);
+   if (queue_result != VK_SUCCESS)
+      chain->status = queue_result;
+
+   pthread_mutex_unlock(&wsi->wait_mutex);
+
+   return (queue_result != VK_SUCCESS) ? queue_result : chain->status;
+}
+
+/*
+ * create_swapchain callback of the display WSI.
+ *
+ * Allocates a swapchain with create_info->minImageCount images, initialises
+ * the common swapchain state, installs the display callbacks and creates
+ * every image. If an image fails to initialise, the images created so far
+ * are finished in reverse order, the swapchain memory is freed and the
+ * failing result is returned. The local_fd argument is not used.
+ *
+ * On success *swapchain_out points at the new swapchain's base.
+ */
+static VkResult
+wsi_display_surface_create_swapchain(
+   VkIcdSurfaceBase *icd_surface,
+   VkDevice device,
+   struct wsi_device *wsi_device,
+   int local_fd,
+   const VkSwapchainCreateInfoKHR *create_info,
+   const VkAllocationCallbacks *allocator,
+   struct wsi_swapchain **swapchain_out)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   assert(create_info->sType == VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR);
+
+   /* The image array is stored directly behind the swapchain struct */
+   const unsigned num_images = create_info->minImageCount;
+   struct wsi_display_swapchain *chain;
+   const size_t chain_size =
+      sizeof(*chain) + num_images * sizeof(chain->images[0]);
+
+   chain = vk_zalloc(allocator, chain_size,
+                     8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
+   if (!chain)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   VkResult result = wsi_swapchain_init(wsi_device, &chain->base, device,
+                                        create_info, allocator);
+   if (result != VK_SUCCESS) {
+      vk_free(allocator, chain);
+      return result;
+   }
+
+   chain->base.destroy = wsi_display_swapchain_destroy;
+   chain->base.get_wsi_image = wsi_display_get_wsi_image;
+   chain->base.acquire_next_image = wsi_display_acquire_next_image;
+   chain->base.queue_present = wsi_display_queue_present;
+   chain->base.present_mode = create_info->presentMode;
+   chain->base.image_count = num_images;
+
+   chain->wsi = wsi;
+   chain->status = VK_SUCCESS;
+   chain->surface = (VkIcdSurfaceDisplay *) icd_surface;
+
+   for (uint32_t i = 0; i < chain->base.image_count; i++) {
+      result = wsi_display_image_init(device, &chain->base,
+                                      create_info, allocator,
+                                      &chain->images[i]);
+      if (result == VK_SUCCESS)
+         continue;
+
+      /* Undo the images created so far, newest first */
+      for (uint32_t done = i; done-- > 0; ) {
+         wsi_display_image_finish(&chain->base, allocator,
+                                  &chain->images[done]);
+      }
+      vk_free(allocator, chain);
+      return result;
+   }
+
+   *swapchain_out = &chain->base;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Initialise a condition variable whose timed waits use CLOCK_MONOTONIC.
+ *
+ * Returns true on success and false if creating the attribute object,
+ * selecting the clock, or initialising the condition variable fails. The
+ * temporary attribute object is destroyed whenever it was created.
+ */
+static bool
+wsi_init_pthread_cond_monotonic(pthread_cond_t *cond)
+{
+   pthread_condattr_t attr;
+
+   if (pthread_condattr_init(&attr) != 0)
+      return false;
+
+   /* Short-circuit: the condvar is only initialised if setclock worked */
+   const bool ok =
+      pthread_condattr_setclock(&attr, CLOCK_MONOTONIC) == 0 &&
+      pthread_cond_init(cond, &attr) == 0;
+
+   pthread_condattr_destroy(&attr);
+   return ok;
+}
+
+/*
+ * Create the display WSI state and register it in
+ * wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY].
+ *
+ * display_fd is stored as the DRM fd and alloc as the allocator used for
+ * later allocations. Returns VK_ERROR_OUT_OF_HOST_MEMORY if the state
+ * cannot be allocated or the mutex / condition variable cannot be
+ * initialised; everything set up before the failure is released again.
+ */
+VkResult
+wsi_display_init_wsi(struct wsi_device *wsi_device,
+                     const VkAllocationCallbacks *alloc,
+                     int display_fd)
+{
+   struct wsi_display *wsi = vk_zalloc(alloc, sizeof(*wsi), 8,
+                                       VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (wsi == NULL)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   wsi->fd = display_fd;
+   wsi->alloc = alloc;
+
+   list_inithead(&wsi->connectors);
+
+   if (pthread_mutex_init(&wsi->wait_mutex, NULL) != 0) {
+      vk_free(alloc, wsi);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   if (!wsi_init_pthread_cond_monotonic(&wsi->wait_cond)) {
+      pthread_mutex_destroy(&wsi->wait_mutex);
+      vk_free(alloc, wsi);
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+   }
+
+   /* Surface callbacks of the display platform */
+   wsi->base.get_support = wsi_display_surface_get_support;
+   wsi->base.get_capabilities2 = wsi_display_surface_get_capabilities2;
+   wsi->base.get_formats = wsi_display_surface_get_formats;
+   wsi->base.get_formats2 = wsi_display_surface_get_formats2;
+   wsi->base.get_present_modes = wsi_display_surface_get_present_modes;
+   wsi->base.create_swapchain = wsi_display_surface_create_swapchain;
+
+   wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY] = &wsi->base;
+
+   return VK_SUCCESS;
+}
+
+/*
+ * Tear down the display WSI state, if there is any.
+ *
+ * Frees every connector and its display modes with wsi->alloc, cancels and
+ * joins the wait thread when one exists, destroys the mutex and condition
+ * variable, and finally frees the state itself with alloc.
+ */
+void
+wsi_display_finish_wsi(struct wsi_device *wsi_device,
+                       const VkAllocationCallbacks *alloc)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   if (wsi == NULL)
+      return;
+
+   /* Release each connector along with the modes hanging off it */
+   wsi_for_each_connector(connector, wsi) {
+      wsi_for_each_display_mode(mode, connector) {
+         vk_free(wsi->alloc, mode);
+      }
+      vk_free(wsi->alloc, connector);
+   }
+
+   /* Stop the wait thread while holding the mutex */
+   pthread_mutex_lock(&wsi->wait_mutex);
+   if (wsi->wait_thread) {
+      pthread_cancel(wsi->wait_thread);
+      pthread_join(wsi->wait_thread, NULL);
+   }
+   pthread_mutex_unlock(&wsi->wait_mutex);
+
+   pthread_mutex_destroy(&wsi->wait_mutex);
+   pthread_cond_destroy(&wsi->wait_cond);
+
+   vk_free(alloc, wsi);
+}
+
+/*
+ * Implement vkReleaseDisplay
+ *
+ * Closes the DRM fd if one is open and marks it as closed (-1). When built
+ * with RandR support, the connector's X output is reset to None as well.
+ * Always returns VK_SUCCESS; physical_device is not used.
+ */
+VkResult
+wsi_release_display(VkPhysicalDevice physical_device,
+                    struct wsi_device *wsi_device,
+                    VkDisplayKHR display)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   const bool fd_open = wsi->fd >= 0;
+
+   if (fd_open) {
+      close(wsi->fd);
+      wsi->fd = -1;
+   }
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   connector->output = None;
+#endif
+
+   return VK_SUCCESS;
+}
+
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+
+/*
+ * Look through the known connectors for the one bound to the given RandR
+ * output. Returns NULL if no connector refers to that output.
+ */
+static struct wsi_display_connector *
+wsi_display_find_output(struct wsi_device *wsi_device,
+                        xcb_randr_output_t output)
+{
+   struct wsi_display *display_wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   wsi_for_each_connector(candidate, display_wsi) {
+      if (candidate->output != output)
+         ;
+      else
+         return candidate;
+   }
+
+   return NULL;
+}
+
+/*
+ * Map a RandR output to the kernel connector id published by the X server
+ * in the output's CONNECTOR_ID property.
+ *
+ * *connector_id_atom_p caches the CONNECTOR_ID atom between calls: pass 0
+ * the first time and it is filled in once the atom has been looked up.
+ *
+ * Returns 0 when the atom does not exist, the property cannot be read, or
+ * the property is not a single 32-bit item.
+ */
+
+static uint32_t
+wsi_display_output_to_connector_id(xcb_connection_t *connection,
+                                   xcb_atom_t *connector_id_atom_p,
+                                   xcb_randr_output_t output)
+{
+   xcb_atom_t atom = *connector_id_atom_p;
+
+   if (atom == 0) {
+      /* Atom not cached yet: ask the server, without creating it */
+      xcb_intern_atom_cookie_t atom_cookie =
+         xcb_intern_atom(connection, true, 12, "CONNECTOR_ID");
+      xcb_intern_atom_reply_t *atom_reply =
+         xcb_intern_atom_reply(connection, atom_cookie, NULL);
+
+      if (atom_reply) {
+         atom = atom_reply->atom;
+         *connector_id_atom_p = atom;
+         free(atom_reply);
+      }
+   }
+
+   /* Without a CONNECTOR_ID atom in the server there cannot be a
+    * CONNECTOR_ID property either, so there is nothing to query.
+    */
+   if (atom == 0)
+      return 0;
+
+   /* Send both requests before collecting either reply */
+   xcb_randr_query_version_cookie_t version_cookie =
+      xcb_randr_query_version(connection, 1, 6);
+   xcb_randr_get_output_property_cookie_t prop_cookie =
+      xcb_randr_get_output_property(connection,
+                                    output,
+                                    atom,
+                                    0,
+                                    0,
+                                    0xffffffffUL,
+                                    0,
+                                    0);
+
+   /* The version reply is only fetched and discarded */
+   xcb_randr_query_version_reply_t *version_reply =
+      xcb_randr_query_version_reply(connection, version_cookie, NULL);
+   free(version_reply);
+
+   xcb_randr_get_output_property_reply_t *prop_reply =
+      xcb_randr_get_output_property_reply(connection, prop_cookie, NULL);
+   if (prop_reply == NULL)
+      return 0;
+
+   uint32_t connector_id = 0;
+   if (prop_reply->num_items == 1 && prop_reply->format == 32)
+      memcpy(&connector_id, xcb_randr_get_output_property_data(prop_reply), 4);
+   free(prop_reply);
+
+   return connector_id;
+}
+
+/*
+ * Ask the X server for its RandR version and report whether it is at least
+ * 1.6. Returns false when the version query gets no reply.
+ */
+static bool
+wsi_display_check_randr_version(xcb_connection_t *connection)
+{
+   xcb_randr_query_version_cookie_t cookie =
+      xcb_randr_query_version(connection, 1, 6);
+   xcb_randr_query_version_reply_t *reply =
+      xcb_randr_query_version_reply(connection, cookie, NULL);
+
+   if (reply == NULL)
+      return false;
+
+   /* Accept any 2.x or later, or 1.x with x >= 6 */
+   bool new_enough;
+   if (reply->major_version > 1)
+      new_enough = true;
+   else
+      new_enough = reply->major_version == 1 && reply->minor_version >= 6;
+
+   free(reply);
+   return new_enough;
+}
+
+/*
+ * Find the RandR output whose CONNECTOR_ID property equals the given kernel
+ * connector id.
+ *
+ * Every screen of the connection is searched in turn. Returns 0 when RandR
+ * is older than 1.6, when the screen resources of a screen cannot be
+ * fetched, or when no output matches.
+ */
+
+static xcb_randr_output_t
+wsi_display_connector_id_to_output(xcb_connection_t *connection,
+                                   uint32_t connector_id)
+{
+   if (!wsi_display_check_randr_version(connection))
+      return 0;
+
+   /* Looked up on first use and then reused for every output */
+   xcb_atom_t connector_id_atom = 0;
+   xcb_randr_output_t found = 0;
+
+   xcb_screen_iterator_t screen =
+      xcb_setup_roots_iterator(xcb_get_setup(connection));
+
+   while (found == 0 && screen.rem) {
+      xcb_randr_get_screen_resources_cookie_t res_cookie =
+         xcb_randr_get_screen_resources(connection, screen.data->root);
+      xcb_randr_get_screen_resources_reply_t *res =
+         xcb_randr_get_screen_resources_reply(connection, res_cookie, NULL);
+
+      if (res == NULL)
+         return 0;
+
+      xcb_randr_output_t *outputs =
+         xcb_randr_get_screen_resources_outputs(res);
+
+      for (int o = 0; o < res->num_outputs; o++) {
+         uint32_t id = wsi_display_output_to_connector_id(connection,
+                                                          &connector_id_atom,
+                                                          outputs[o]);
+         if (id == connector_id) {
+            found = outputs[o];
+            break;
+         }
+      }
+      free(res);
+
+      xcb_screen_next(&screen);
+   }
+   return found;
+}
+
+/*
+ * Find the root window of the screen that lists the given RandR output
+ * among its screen resources.
+ *
+ * Returns 0 when RandR is older than 1.6, when the resources of a screen
+ * cannot be fetched, or when no screen lists the output.
+ */
+static xcb_window_t
+wsi_display_output_to_root(xcb_connection_t *connection,
+                           xcb_randr_output_t output)
+{
+   if (!wsi_display_check_randr_version(connection))
+      return 0;
+
+   xcb_window_t found_root = 0;
+   xcb_screen_iterator_t screen =
+      xcb_setup_roots_iterator(xcb_get_setup(connection));
+
+   /* Walk the screens until one of them owns the output */
+   while (found_root == 0 && screen.rem) {
+      xcb_randr_get_screen_resources_cookie_t res_cookie =
+         xcb_randr_get_screen_resources(connection, screen.data->root);
+      xcb_randr_get_screen_resources_reply_t *res =
+         xcb_randr_get_screen_resources_reply(connection, res_cookie, NULL);
+
+      if (res == NULL)
+         return 0;
+
+      xcb_randr_output_t *outputs =
+         xcb_randr_get_screen_resources_outputs(res);
+
+      for (int o = 0; o < res->num_outputs; o++) {
+         if (outputs[o] != output)
+            continue;
+
+         found_root = screen.data->root;
+         break;
+      }
+      free(res);
+
+      xcb_screen_next(&screen);
+   }
+   return found_root;
+}
+
+/*
+ * Compare a WSI display mode with a RandR mode description.
+ *
+ * The X dot clock is rounded to the nearest multiple of 1000 and divided
+ * down before it is compared with the WSI clock. All horizontal and
+ * vertical timings and the flags must be equal, and the WSI mode's vscan
+ * must be 0 or 1.
+ */
+static bool
+wsi_display_mode_matches_x(struct wsi_display_mode *wsi,
+                           xcb_randr_mode_info_t *xcb)
+{
+   const bool clock_ok = wsi->clock == (xcb->dot_clock + 500) / 1000;
+
+   const bool horizontal_ok =
+      wsi->hdisplay == xcb->width &&
+      wsi->hsync_start == xcb->hsync_start &&
+      wsi->hsync_end == xcb->hsync_end &&
+      wsi->htotal == xcb->htotal &&
+      wsi->hskew == xcb->hskew;
+
+   const bool vertical_ok =
+      wsi->vdisplay == xcb->height &&
+      wsi->vsync_start == xcb->vsync_start &&
+      wsi->vsync_end == xcb->vsync_end &&
+      wsi->vtotal == xcb->vtotal &&
+      wsi->vscan <= 1;
+
+   const bool flags_ok = wsi->flags == xcb->mode_flags;
+
+   return clock_ok && horizontal_ok && vertical_ok && flags_ok;
+}
+
+/*
+ * Return the connector's display mode that matches the given RandR mode
+ * according to wsi_display_mode_matches_x(), or NULL if there is none.
+ * The wsi_device argument is not used.
+ */
+static struct wsi_display_mode *
+wsi_display_find_x_mode(struct wsi_device *wsi_device,
+                        struct wsi_display_connector *connector,
+                        xcb_randr_mode_info_t *mode)
+{
+   wsi_for_each_display_mode(known_mode, connector) {
+      if (!wsi_display_mode_matches_x(known_mode, mode))
+         ;
+      else
+         return known_mode;
+   }
+
+   return NULL;
+}
+
+/*
+ * Make sure the connector has a display mode for the given RandR mode.
+ *
+ * A mode that is already known is only marked valid again; its preferred
+ * flag is left untouched. Otherwise a new mode is allocated, filled in from
+ * x_mode and appended to connector->display_modes.
+ *
+ * Returns VK_ERROR_OUT_OF_HOST_MEMORY if the allocation fails.
+ */
+static VkResult
+wsi_display_register_x_mode(struct wsi_device *wsi_device,
+                            struct wsi_display_connector *connector,
+                            xcb_randr_mode_info_t *x_mode,
+                            bool preferred)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   struct wsi_display_mode *existing =
+      wsi_display_find_x_mode(wsi_device, connector, x_mode);
+   if (existing != NULL) {
+      existing->valid = true;
+      return VK_SUCCESS;
+   }
+
+   struct wsi_display_mode *new_mode =
+      vk_zalloc(wsi->alloc, sizeof (*new_mode),
+                8, VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
+   if (new_mode == NULL)
+      return VK_ERROR_OUT_OF_HOST_MEMORY;
+
+   new_mode->connector = connector;
+   new_mode->valid = true;
+   new_mode->preferred = preferred;
+   new_mode->clock = (x_mode->dot_clock + 500) / 1000; /* kHz */
+
+   /* Horizontal timings */
+   new_mode->hdisplay = x_mode->width;
+   new_mode->hsync_start = x_mode->hsync_start;
+   new_mode->hsync_end = x_mode->hsync_end;
+   new_mode->htotal = x_mode->htotal;
+   new_mode->hskew = x_mode->hskew;
+
+   /* Vertical timings; the RandR mode carries no vscan value */
+   new_mode->vdisplay = x_mode->height;
+   new_mode->vsync_start = x_mode->vsync_start;
+   new_mode->vsync_end = x_mode->vsync_end;
+   new_mode->vtotal = x_mode->vtotal;
+   new_mode->vscan = 0;
+
+   new_mode->flags = x_mode->mode_flags;
+
+   list_addtail(&new_mode->list, &connector->display_modes);
+   return VK_SUCCESS;
+}
+
+/*
+ * Return the connector that corresponds to a RandR output, creating it if
+ * necessary, and refresh its connection state and mode list from the X
+ * server.
+ *
+ * Returns NULL when the output belongs to no screen, when it has no kernel
+ * connector id, or when allocating the connector or one of its modes
+ * fails. If the output info or screen resources cannot be fetched, the
+ * connector is returned without being refreshed.
+ */
+static struct wsi_display_connector *
+wsi_display_get_output(struct wsi_device *wsi_device,
+                       xcb_connection_t *connection,
+                       xcb_randr_output_t output)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+
+   xcb_window_t root = wsi_display_output_to_root(connection, output);
+   if (!root)
+      return NULL;
+
+   /* See if we already have a connector for this output */
+   struct wsi_display_connector *connector =
+      wsi_display_find_output(wsi_device, output);
+
+   if (connector == NULL) {
+      xcb_atom_t connector_id_atom = 0;
+
+      /* Translate the X output into a kernel connector id */
+      uint32_t connector_id =
+         wsi_display_output_to_connector_id(connection,
+                                            &connector_id_atom,
+                                            output);
+
+      /* Any X server with lease support will have this atom */
+      if (connector_id == 0)
+         return NULL;
+
+      /* Reuse a connector already known by id, else make a new one */
+      connector = wsi_display_find_connector(wsi_device, connector_id);
+      if (connector == NULL) {
+         connector = wsi_display_alloc_connector(wsi, connector_id);
+         if (connector == NULL)
+            return NULL;
+         list_addtail(&connector->list, &wsi->connectors);
+      }
+      connector->output = output;
+   }
+
+   /* Issue both requests before waiting for either reply */
+   xcb_randr_get_screen_resources_cookie_t res_cookie =
+      xcb_randr_get_screen_resources(connection, root);
+   xcb_randr_get_output_info_cookie_t info_cookie =
+      xcb_randr_get_output_info(connection, output, XCB_CURRENT_TIME);
+   xcb_randr_get_screen_resources_reply_t *res =
+      xcb_randr_get_screen_resources_reply(connection, res_cookie, NULL);
+   xcb_randr_get_output_info_reply_t *info =
+      xcb_randr_get_output_info_reply(connection, info_cookie, NULL);
+
+   if (info == NULL || res == NULL) {
+      free(info);
+      free(res);
+      return connector;
+   }
+
+   connector->connected =
+      info->connection != XCB_RANDR_CONNECTION_DISCONNECTED;
+
+   wsi_display_invalidate_connector_modes(wsi_device, connector);
+
+   /* For each mode id of the output, find its description among the
+    * screen's modes and register it with the connector.
+    */
+   xcb_randr_mode_t *mode_ids = xcb_randr_get_output_info_modes(info);
+   for (int m = 0; m < info->num_modes; m++) {
+      xcb_randr_mode_info_iterator_t it =
+         xcb_randr_get_screen_resources_modes_iterator(res);
+
+      for (; it.rem; xcb_randr_mode_info_next(&it)) {
+         xcb_randr_mode_info_t *mode_info = it.data;
+
+         if (mode_info->id == mode_ids[m]) {
+            VkResult result = wsi_display_register_x_mode(
+               wsi_device, connector, mode_info, m < info->num_preferred);
+            if (result != VK_SUCCESS) {
+               free(info);
+               free(res);
+               return NULL;
+            }
+            break;
+         }
+      }
+   }
+
+   free(info);
+   free(res);
+   return connector;
+}
+
+/*
+ * Pick a RandR CRTC for the given output on the screen rooted at root.
+ *
+ * A CRTC that has a mode set and drives exactly this one output is
+ * preferred. Otherwise the first CRTC without a mode that lists the output
+ * among its possible outputs is returned. Returns 0 if neither exists or
+ * the screen resources cannot be fetched.
+ */
+static xcb_randr_crtc_t
+wsi_display_find_crtc_for_output(xcb_connection_t *connection,
+                                 xcb_window_t root,
+                                 xcb_randr_output_t output)
+{
+   xcb_randr_get_screen_resources_cookie_t res_cookie =
+      xcb_randr_get_screen_resources(connection, root);
+   xcb_randr_get_screen_resources_reply_t *res =
+      xcb_randr_get_screen_resources_reply(connection, res_cookie, NULL);
+
+   if (res == NULL)
+      return 0;
+
+   xcb_randr_crtc_t *crtcs = xcb_randr_get_screen_resources_crtcs(res);
+   xcb_randr_crtc_t active_crtc = 0;
+   xcb_randr_crtc_t idle_crtc = 0;
+
+   /* Stop as soon as a CRTC already driving the output turns up */
+   for (int c = 0; active_crtc == 0 && c < res->num_crtcs; c++) {
+      xcb_randr_get_crtc_info_cookie_t info_cookie =
+         xcb_randr_get_crtc_info(connection, crtcs[c], res->config_timestamp);
+      xcb_randr_get_crtc_info_reply_t *info =
+         xcb_randr_get_crtc_info_reply(connection, info_cookie, NULL);
+
+      if (info == NULL)
+         continue;
+
+      if (info->mode) {
+         /* In use: only good if this output is its sole output */
+         xcb_randr_output_t *driven = xcb_randr_get_crtc_info_outputs(info);
+
+         if (xcb_randr_get_crtc_info_outputs_length(info) == 1 &&
+             driven[0] == output)
+            active_crtc = crtcs[c];
+
+      } else if (idle_crtc == 0) {
+         /* Idle: remember the first one that could drive the output */
+         int num_possible = xcb_randr_get_crtc_info_possible_length(info);
+         xcb_randr_output_t *possible =
+            xcb_randr_get_crtc_info_possible(info);
+
+         for (int p = 0; p < num_possible; p++) {
+            if (possible[p] == output) {
+               idle_crtc = crtcs[c];
+               break;
+            }
+         }
+      }
+      free(info);
+   }
+   free(res);
+
+   return active_crtc ? active_crtc : idle_crtc;
+}
+
+/*
+ * Acquire the given display from the X server behind dpy.
+ *
+ * Resolves the connector's RandR output if it is not known yet, then its
+ * root window and a CRTC for it. When built with HAVE_DRI3_MODIFIERS, a
+ * RandR lease for that CRTC and output is created and the fd returned with
+ * it becomes wsi->fd.
+ *
+ * Returns VK_ERROR_INITIALIZATION_FAILED if a DRM fd is already open or if
+ * any of those steps fails, VK_SUCCESS otherwise. physical_device is not
+ * used.
+ */
+VkResult
+wsi_acquire_xlib_display(VkPhysicalDevice physical_device,
+                         struct wsi_device *wsi_device,
+                         Display *dpy,
+                         VkDisplayKHR display)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   xcb_connection_t *connection = XGetXCBConnection(dpy);
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+
+   /* XXX no support for multiple leases yet */
+   if (wsi->fd >= 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   if (!connector->output) {
+      /* Output not known yet: look it up through the connector id */
+      connector->output =
+         wsi_display_connector_id_to_output(connection, connector->id);
+
+      if (!connector->output)
+         return VK_ERROR_INITIALIZATION_FAILED;
+   }
+
+   xcb_window_t root =
+      wsi_display_output_to_root(connection, connector->output);
+   if (!root)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   xcb_randr_crtc_t crtc =
+      wsi_display_find_crtc_for_output(connection, root, connector->output);
+   if (!crtc)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+#ifdef HAVE_DRI3_MODIFIERS
+   /* Lease one CRTC and one output from the X server */
+   xcb_randr_lease_t lease = xcb_generate_id(connection);
+   xcb_randr_create_lease_cookie_t lease_cookie =
+      xcb_randr_create_lease(connection, root, lease, 1, 1,
+                             &crtc, &connector->output);
+   xcb_randr_create_lease_reply_t *lease_reply =
+      xcb_randr_create_lease_reply(connection, lease_cookie, NULL);
+   if (lease_reply == NULL)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   /* Take the first fd that came with the reply, if any */
+   int lease_fd = -1;
+   if (lease_reply->nfd > 0)
+      lease_fd = xcb_randr_create_lease_reply_fds(connection, lease_reply)[0];
+   free(lease_reply);
+
+   if (lease_fd < 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   wsi->fd = lease_fd;
+#endif
+
+   return VK_SUCCESS;
+}
+
+VkResult
+wsi_get_randr_output_display(VkPhysicalDevice physical_device,
+                             struct wsi_device *wsi_device,
+                             Display *dpy,
+                             RROutput output,
+                             VkDisplayKHR *display)
+{
+   xcb_connection_t *connection = XGetXCBConnection(dpy);
+   struct wsi_display_connector *connector =
+      wsi_display_get_output(wsi_device, connection, (xcb_randr_output_t) output);
+   /* Maps the RandR output to its VkDisplayKHR; always returns VK_SUCCESS. */
+   if (connector)
+      *display = wsi_display_connector_to_handle(connector);
+   else
+      *display = VK_NULL_HANDLE; /* handle is not a pointer on every ABI */
+   return VK_SUCCESS;
+}
+
+#endif
+
+/* VK_EXT_display_control */
+VkResult
+wsi_display_power_control(VkDevice device,
+                          struct wsi_device *wsi_device,
+                          VkDisplayKHR display,
+                          const VkDisplayPowerInfoEXT *display_power_info)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_connector *connector =
+      wsi_display_connector_from_handle(display);
+   int mode;
+   /* Translates the Vulkan power state to DPMS and sets it on the connector. */
+   if (wsi->fd < 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+
+   switch (display_power_info->powerState) {
+   case VK_DISPLAY_POWER_STATE_OFF_EXT:
+      mode = DRM_MODE_DPMS_OFF;
+      break;
+   case VK_DISPLAY_POWER_STATE_SUSPEND_EXT:
+      mode = DRM_MODE_DPMS_SUSPEND;
+      break;
+   default: /* every other power state turns the display on */
+      mode = DRM_MODE_DPMS_ON;
+      break;
+   }
+   drmModeConnectorSetProperty(wsi->fd,
+                               connector->id,
+                               connector->dpms_property,
+                               mode); /* result ignored: always VK_SUCCESS */
+   return VK_SUCCESS;
+}
+
+VkResult
+wsi_register_device_event(VkDevice device,
+                          struct wsi_device *wsi_device,
+                          const VkDeviceEventInfoEXT *device_event_info,
+                          const VkAllocationCallbacks *allocator,
+                          struct wsi_fence **fence_p)
+{
+   return VK_ERROR_FEATURE_NOT_PRESENT; /* stub: *fence_p is never written */
+}
+
+VkResult
+wsi_register_display_event(VkDevice device,
+                           struct wsi_device *wsi_device,
+                           VkDisplayKHR display,
+                           const VkDisplayEventInfoEXT *display_event_info,
+                           const VkAllocationCallbacks *allocator,
+                           struct wsi_fence **fence_p)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_fence *fence;
+   VkResult ret;
+   /* Only FIRST_PIXEL_OUT is handled; others get VK_ERROR_FEATURE_NOT_PRESENT. */
+   switch (display_event_info->displayEvent) {
+   case VK_DISPLAY_EVENT_TYPE_FIRST_PIXEL_OUT_EXT:
+
+      fence = wsi_display_fence_alloc(device, wsi_device, display, allocator);
+
+      if (!fence)
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      /* Relative sequence 1: presumably the next vblank -- TODO confirm. */
+      ret = wsi_register_vblank_event(fence, wsi_device, display,
+                                      DRM_CRTC_SEQUENCE_RELATIVE, 1, NULL);
+      /* On failure the new fence is freed and *fence_p is left unwritten. */
+      if (ret == VK_SUCCESS)
+         *fence_p = &fence->base;
+      else if (fence != NULL)
+         vk_free2(wsi->alloc, allocator, fence);
+
+      break;
+   default:
+      ret = VK_ERROR_FEATURE_NOT_PRESENT;
+      break;
+   }
+
+   return ret;
+}
+
+/* Reports the DRM CRTC sequence counter of the swapchain's display in *value. */
+VkResult
+wsi_get_swapchain_counter(VkDevice device,
+                          struct wsi_device *wsi_device,
+                          VkSwapchainKHR _swapchain,
+                          VkSurfaceCounterFlagBitsEXT flag_bits,
+                          uint64_t *value)
+{
+   struct wsi_display *wsi =
+      (struct wsi_display *) wsi_device->wsi[VK_ICD_WSI_PLATFORM_DISPLAY];
+   struct wsi_display_swapchain *swapchain =
+      (struct wsi_display_swapchain *) wsi_swapchain_from_handle(_swapchain);
+   struct wsi_display_connector *connector =
+      wsi_display_mode_from_handle(swapchain->surface->displayMode)->connector;
+
+   if (wsi->fd < 0)
+      return VK_ERROR_INITIALIZATION_FAILED;
+   /* An inactive connector reports a count of 0 rather than an error. */
+   if (!connector->active) {
+      *value = 0;
+      return VK_SUCCESS;
+   }
+   /* A failed query likewise yields 0; the call still returns VK_SUCCESS. */
+   int ret = drmCrtcGetSequence(wsi->fd, connector->crtc_id, value, NULL);
+   if (ret)
+      *value = 0;
+
+   return VK_SUCCESS;
+}
+
diff --git a/src/vulkan/wsi/wsi_common_display.h b/src/vulkan/wsi/wsi_common_display.h
new file mode 100644
index 0000000..50d7f83
--- /dev/null
+++ b/src/vulkan/wsi/wsi_common_display.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright © 2017 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#ifndef WSI_COMMON_DISPLAY_H
+#define WSI_COMMON_DISPLAY_H
+
+#include "wsi_common.h"
+#include <xf86drm.h>
+#include <xf86drmMode.h>
+/* memcpy() of count elements; *src and *dest must have the same size. */
+#define typed_memcpy(dest, src, count) ({ \
+   STATIC_ASSERT(sizeof(*src) == sizeof(*dest)); \
+   memcpy((dest), (src), (count) * sizeof(*(src))); \
+})
+
+VkResult
+wsi_display_get_physical_device_display_properties(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPropertiesKHR *properties);
+
+VkResult
+wsi_display_get_physical_device_display_properties2(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *pPropertyCount,
+   VkDisplayProperties2KHR *pProperties);
+
+VkResult
+wsi_display_get_physical_device_display_plane_properties(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPlanePropertiesKHR *properties);
+
+VkResult
+wsi_display_get_physical_device_display_plane_properties2(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t *property_count,
+   VkDisplayPlaneProperties2KHR *properties);
+
+VkResult
+wsi_display_get_display_plane_supported_displays(
+   VkPhysicalDevice physical_device,
+   struct wsi_device *wsi_device,
+   uint32_t plane_index,
+   uint32_t *display_count,
+   VkDisplayKHR *displays);
+
+VkResult
+wsi_display_get_display_mode_properties(VkPhysicalDevice physical_device,
+                                        struct wsi_device *wsi_device,
+                                        VkDisplayKHR display,
+                                        uint32_t *property_count,
+                                        VkDisplayModePropertiesKHR *properties);
+
+VkResult
+wsi_display_get_display_mode_properties2(VkPhysicalDevice physical_device,
+                                         struct wsi_device *wsi_device,
+                                         VkDisplayKHR display,
+                                         uint32_t *property_count,
+                                         VkDisplayModeProperties2KHR *properties);
+
+VkResult
+wsi_display_create_display_mode(VkPhysicalDevice physical_device,
+                                struct wsi_device *wsi_device,
+                                VkDisplayKHR display,
+                                const VkDisplayModeCreateInfoKHR *create_info,
+                                const VkAllocationCallbacks *allocator,
+                                VkDisplayModeKHR *mode);
+
+VkResult
+wsi_get_display_plane_capabilities(VkPhysicalDevice physical_device,
+                                   struct wsi_device *wsi_device,
+                                   VkDisplayModeKHR mode_khr,
+                                   uint32_t plane_index,
+                                   VkDisplayPlaneCapabilitiesKHR *capabilities);
+
+VkResult
+wsi_get_display_plane_capabilities2(VkPhysicalDevice physical_device,
+                                    struct wsi_device *wsi_device,
+                                    const VkDisplayPlaneInfo2KHR *pDisplayPlaneInfo,
+                                    VkDisplayPlaneCapabilities2KHR *capabilities);
+
+VkResult
+wsi_create_display_surface(VkInstance instance,
+                           const VkAllocationCallbacks *pAllocator,
+                           const VkDisplaySurfaceCreateInfoKHR *pCreateInfo,
+                           VkSurfaceKHR *pSurface);
+
+VkResult
+wsi_release_display(VkPhysicalDevice            physical_device,
+                    struct wsi_device           *wsi_device,
+                    VkDisplayKHR                display);
+
+/* Guarded with #ifdef, as vulkan.h does, so a valueless define also works. */
+#ifdef VK_USE_PLATFORM_XLIB_XRANDR_EXT
+VkResult
+wsi_acquire_xlib_display(VkPhysicalDevice       physical_device,
+                         struct wsi_device      *wsi_device,
+                         Display                *dpy,
+                         VkDisplayKHR           display);
+
+VkResult
+wsi_get_randr_output_display(VkPhysicalDevice   physical_device,
+                             struct wsi_device  *wsi_device,
+                             Display            *dpy,
+                             RROutput           output,
+                             VkDisplayKHR       *display);
+
+#endif /* VK_USE_PLATFORM_XLIB_XRANDR_EXT */
+
+/* VK_EXT_display_control */
+VkResult
+wsi_display_power_control(VkDevice                      device,
+                          struct wsi_device             *wsi_device,
+                          VkDisplayKHR                  display,
+                          const VkDisplayPowerInfoEXT   *display_power_info);
+
+VkResult
+wsi_register_device_event(VkDevice                      device,
+                          struct wsi_device             *wsi_device,
+                          const VkDeviceEventInfoEXT    *device_event_info,
+                          const VkAllocationCallbacks   *allocator,
+                          struct wsi_fence              **fence);
+
+VkResult
+wsi_register_display_event(VkDevice                     device,
+                           struct wsi_device            *wsi_device,
+                           VkDisplayKHR                 display,
+                           const VkDisplayEventInfoEXT  *display_event_info,
+                           const VkAllocationCallbacks  *allocator,
+                           struct wsi_fence             **fence);
+
+VkResult
+wsi_get_swapchain_counter(VkDevice                      device,
+                          struct wsi_device             *wsi_device,
+                          VkSwapchainKHR                swapchain,
+                          VkSurfaceCounterFlagBitsEXT   flag_bits,
+                          uint64_t                      *value);
+
+#endif
diff --git a/src/vulkan/wsi/wsi_common_private.h b/src/vulkan/wsi/wsi_common_private.h
index 90941c8..ee7ae75 100644
--- a/src/vulkan/wsi/wsi_common_private.h
+++ b/src/vulkan/wsi/wsi_common_private.h
@@ -62,7 +62,7 @@
    struct wsi_image *(*get_wsi_image)(struct wsi_swapchain *swapchain,
                                       uint32_t image_index);
    VkResult (*acquire_next_image)(struct wsi_swapchain *swap_chain,
-                                  uint64_t timeout, VkSemaphore semaphore,
+                                  const VkAcquireNextImageInfoKHR *info,
                                   uint32_t *image_index);
    VkResult (*queue_present)(struct wsi_swapchain *swap_chain,
                              uint32_t image_index,
@@ -104,8 +104,6 @@
                            uint32_t queueFamilyIndex,
                            int local_fd,
                            VkBool32* pSupported);
-   VkResult (*get_capabilities)(VkIcdSurfaceBase *surface,
-                                VkSurfaceCapabilitiesKHR* pSurfaceCapabilities);
    VkResult (*get_capabilities2)(VkIcdSurfaceBase *surface,
                                  const void *info_next,
                                  VkSurfaceCapabilities2KHR* pSurfaceCapabilities);
@@ -141,6 +139,15 @@
                        const VkAllocationCallbacks *alloc);
 
 
+VkResult
+wsi_display_init_wsi(struct wsi_device *wsi_device,
+                     const VkAllocationCallbacks *alloc,
+                     int display_fd);
+
+void
+wsi_display_finish_wsi(struct wsi_device *wsi_device,
+                       const VkAllocationCallbacks *alloc);
+
 #define WSI_DEFINE_NONDISP_HANDLE_CASTS(__wsi_type, __VkType)              \
                                                                            \
    static inline struct __wsi_type *                                       \
diff --git a/src/vulkan/wsi/wsi_common_wayland.c b/src/vulkan/wsi/wsi_common_wayland.c
index ec38a4e..aeff823 100644
--- a/src/vulkan/wsi/wsi_common_wayland.c
+++ b/src/vulkan/wsi/wsi_common_wayland.c
@@ -455,10 +455,11 @@
       (struct wsi_wayland *)wsi_device->wsi[VK_ICD_WSI_PLATFORM_WAYLAND];
 
    struct wsi_wl_display display;
-   int ret = wsi_wl_display_init(wsi, &display, wl_display, false);
-   wsi_wl_display_finish(&display);
+   VkResult ret = wsi_wl_display_init(wsi, &display, wl_display, false);
+   if (ret == VK_SUCCESS)
+      wsi_wl_display_finish(&display);
 
-   return ret == 0;
+   return ret == VK_SUCCESS;
 }
 
 static VkResult
@@ -658,8 +659,7 @@
 
 static VkResult
 wsi_wl_swapchain_acquire_next_image(struct wsi_swapchain *wsi_chain,
-                                    uint64_t timeout,
-                                    VkSemaphore semaphore,
+                                    const VkAcquireNextImageInfoKHR *info,
                                     uint32_t *image_index)
 {
    struct wsi_wl_swapchain *chain = (struct wsi_wl_swapchain *)wsi_chain;
@@ -1010,7 +1010,6 @@
    wsi->wsi = wsi_device;
 
    wsi->base.get_support = wsi_wl_surface_get_support;
-   wsi->base.get_capabilities = wsi_wl_surface_get_capabilities;
    wsi->base.get_capabilities2 = wsi_wl_surface_get_capabilities2;
    wsi->base.get_formats = wsi_wl_surface_get_formats;
    wsi->base.get_formats2 = wsi_wl_surface_get_formats2;
diff --git a/src/vulkan/wsi/wsi_common_x11.c b/src/vulkan/wsi/wsi_common_x11.c
index 20d7cf5..164f760 100644
--- a/src/vulkan/wsi/wsi_common_x11.c
+++ b/src/vulkan/wsi/wsi_common_x11.c
@@ -948,11 +948,11 @@
 
 static VkResult
 x11_acquire_next_image(struct wsi_swapchain *anv_chain,
-                       uint64_t timeout,
-                       VkSemaphore semaphore,
+                       const VkAcquireNextImageInfoKHR *info,
                        uint32_t *image_index)
 {
    struct x11_swapchain *chain = (struct x11_swapchain *)anv_chain;
+   uint64_t timeout = info->timeout;
 
    if (chain->threaded) {
       return x11_acquire_next_image_from_queue(chain, image_index, timeout);
@@ -1469,7 +1469,6 @@
    }
 
    wsi->base.get_support = x11_surface_get_support;
-   wsi->base.get_capabilities = x11_surface_get_capabilities;
    wsi->base.get_capabilities2 = x11_surface_get_capabilities2;
    wsi->base.get_formats = x11_surface_get_formats;
    wsi->base.get_formats2 = x11_surface_get_formats2;